NAMD
ComputeNonbondedCUDA.C
1 
2 #include "common.h"
3 #include "charm++.h"
4 #include "HipDefines.h"
5 
6 #ifdef NAMD_CUDA
7 #include <cuda_runtime.h>
8 #include <cuda.h>
9 #endif
10 #ifdef NAMD_HIP
11 #include <hip/hip_runtime.h>
12 #endif
13 
14 #include "WorkDistrib.h"
15 #include "ComputeMgr.h"
16 #include "ProxyMgr.h"
18 #include "ComputeNonbondedCUDA.h"
19 #include "LJTable.h"
20 #include "ObjectArena.h"
21 #include "SortAtoms.h"
22 #include "Priorities.h"
23 #include <algorithm>
24 
25 #include "NamdTypes.h"
26 #include "DeviceCUDA.h"
27 #include "CudaUtils.h"
28 
29 //#define PRINT_GBIS
30 #undef PRINT_GBIS
31 
32 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
33 
34 #ifdef WIN32
35 #define __thread __declspec(thread)
36 #endif
37 
38 extern __thread int max_grid_size;
39 
40 extern __thread cudaStream_t stream;
41 extern __thread cudaStream_t stream2;
42 
43 extern __thread DeviceCUDA *deviceCUDA;
44 
45 void cuda_errcheck(const char *msg) {
46  cudaError_t err;
47  if ((err = cudaGetLastError()) != cudaSuccess) {
48  char host[128];
49  gethostname(host, 128); host[127] = 0;
50  char devstr[128] = "";
51  int devnum;
52  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
53  sprintf(devstr, " device %d", devnum);
54  }
55  cudaDeviceProp deviceProp;
56  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
57  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
58  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
59  }
60  char errmsg[1024];
61  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
62  NAMD_die(errmsg);
63  }
64 }
65 
66 static inline bool sortop_bitreverse(int a, int b) {
67  if ( a == b ) return 0;
68  for ( int bit = 1; bit; bit *= 2 ) {
69  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
70  }
71  return 0;
72 }
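// sortop_bitreverse orders integers by their bit-reversed value: it compares
// the lowest bit in which the two arguments differ.  For example, sorting
// {0,1,2,3} with this predicate yields 0,2,1,3, interleaving values so that
// consecutive entries are spread apart (it is later applied to Charm++ ranks).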
73 
74 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
75 static __thread ComputeMgr *computeMgr = 0;
76 
79 }
80 
82  if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
85 }
86 
88 
90  const int dim = ljTable->get_table_dim();
91 
92  // round dim up to odd multiple of 16
93  int tsize = (((dim+16+31)/32)*32)-16;
94  if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
95 
96  float2 *t = new float2[tsize*tsize];
97  float2 *row = t;
98  for ( int i=0; i<dim; ++i, row += tsize ) {
99  for ( int j=0; j<dim; ++j ) {
100  const LJTable::TableEntry *e = ljTable->table_val(i,j);
101  row[j].x = e->A * scaling;
102  row[j].y = e->B * scaling;
103  }
104  }
105 
106  cuda_bind_lj_table(t,tsize);
107  delete [] t;
108 
109  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
110  CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
111  }
112 }
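// Note on the table size above: tsize = (((dim+16+31)/32)*32)-16 rounds dim
// up to the next odd multiple of 16, e.g. dim = 23 gives ((70/32)*32)-16 =
// 48 = 3*16 and dim = 50 gives 80 = 5*16, so each row of the float2 table
// stays 16-entry aligned without using a power-of-two stride.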
113 
115 
116  float4 t[FORCE_TABLE_SIZE];
117  float4 et[FORCE_TABLE_SIZE]; // energy table
118 
121  // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
122  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
123 
124  double r2list[FORCE_TABLE_SIZE]; // double to match cpu code
125  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
126  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
127  r2list[i] = r*r + r2_delta;
128  }
129 
130  union { double f; int32 i[2]; } byte_order_test;
131  byte_order_test.f = 1.0; // should occupy high-order bits only
132  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
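 // The union test above detects host byte order at run time: for 1.0 the
 // IEEE-754 exponent bits sit in the high-order 32-bit half of the double,
 // so i[0] is nonzero only on big-endian hosts.  r2iilist is offset so that
 // r2iilist[2*i] always addresses the exponent-carrying half of r2list[i];
 // shifting it right by 14 below yields 64*exponent plus a few mantissa
 // bits, which is why r2_delta_expc is defined as 64*(r2_delta_exp - 1023).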
133 
134  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
135  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
136  int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0
137 
138  if ( r > cutoff ) {
139  t[i].x = 0.;
140  t[i].y = 0.;
141  t[i].z = 0.;
142  t[i].w = 0.;
143  et[i].x = 0.;
144  et[i].y = 0.;
145  et[i].z = 0.;
146  et[i].w = 0.;
147  continue;
148  }
149 
150  BigReal diffa = r2list[i] - r2_table[table_i];
151 
152  // coulomb 1/r or fast force
153  // t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
154  {
155  BigReal table_a = fast_table[4*table_i];
156  BigReal table_b = fast_table[4*table_i+1];
157  BigReal table_c = fast_table[4*table_i+2];
158  BigReal table_d = fast_table[4*table_i+3];
159  BigReal grad =
160  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
161  t[i].x = 2. * grad;
162  BigReal ener = table_a + diffa *
163  ( ( table_d * diffa + table_c ) * diffa + table_b);
164  et[i].x = ener;
165  }
166 
167 
168  // pme correction for slow force
169  // t[i].w = 0.;
170  {
171  BigReal table_a = scor_table[4*table_i];
172  BigReal table_b = scor_table[4*table_i+1];
173  BigReal table_c = scor_table[4*table_i+2];
174  BigReal table_d = scor_table[4*table_i+3];
175  BigReal grad =
176  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
177  t[i].w = 2. * grad;
178  BigReal ener = table_a + diffa *
179  ( ( table_d * diffa + table_c ) * diffa + table_b);
180  et[i].w = ener;
181  }
182 
183 
184  // vdw 1/r^6
185  // t[i].y = 6. / (r8); // -1/r * d/dr r^-6
186  {
187  BigReal table_a = vdwb_table[4*table_i];
188  BigReal table_b = vdwb_table[4*table_i+1];
189  BigReal table_c = vdwb_table[4*table_i+2];
190  BigReal table_d = vdwb_table[4*table_i+3];
191  BigReal grad =
192  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
193  t[i].y = 2. * -1. * grad;
194  BigReal ener = table_a + diffa *
195  ( ( table_d * diffa + table_c ) * diffa + table_b);
196  et[i].y = -1. * ener;
197  }
198 
199 
200  // vdw 1/r^12
 201  // t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
202  {
203  BigReal table_a = vdwa_table[4*table_i];
204  BigReal table_b = vdwa_table[4*table_i+1];
205  BigReal table_c = vdwa_table[4*table_i+2];
206  BigReal table_d = vdwa_table[4*table_i+3];
207  BigReal grad =
208  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
209  t[i].z = 2. * grad;
210  BigReal ener = table_a + diffa *
211  ( ( table_d * diffa + table_c ) * diffa + table_b);
212  et[i].z = ener;
213  }
214 
215  // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
216  // t[i].x, t[i].y, t[i].z, t[i].w);
217 
218 /*
219  double r2 = r * r;
220  double r4 = r2 * r2;
221  double r8 = r4 * r4;
222 
223  t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
224  t[i].y = 6. / (r8); // -1/r * d/dr r^-6
225  t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
226  t[i].w = 0.;
227 */
228  }
229 
230  t[0].x = 0.f;
231  t[0].y = 0.f;
232  t[0].z = 0.f;
233  t[0].w = 0.f;
234  et[0].x = et[1].x;
235  et[0].y = et[1].y;
236  et[0].z = et[1].z;
237  et[0].w = et[1].w;
238 
239  cuda_bind_force_table(t,et);
240 
241  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
242  CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
243  }
244 }
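// Note on the interpolation above: each table segment evaluates a cubic
// spline in Horner form with x = diffa = r2 - r2_table[table_i]:
//   ener = a + x*(b + x*(c + x*d))
//   grad = b + x*(2*c + 3*x*d) = d(ener)/d(r2)
// The stored force factor is 2*grad, which converts the derivative with
// respect to r2 into the (1/r)*d/dr form used by the kernels, since
// d/dr = 2r * d/d(r2).  The 1/r^6 (vdwb) entries are negated, carrying the
// attractive sign of the Lennard-Jones B term.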
245 
247  bool operator() (int32 *li, int32 *lj) {
248  return ( li[1] < lj[1] );
249  }
250 };
251 
253  if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
255 }
256 
257 static __thread int2 *exclusionsByAtom;
258 
261 
262 #ifdef MEM_OPT_VERSION
263  int natoms = mol->exclSigPoolSize;
264 #else
265  int natoms = mol->numAtoms;
266 #endif
267 
268  delete [] exclusionsByAtom;
269  exclusionsByAtom = new int2[natoms];
270 
271  // create unique sorted lists
272 
273  ObjectArena<int32> listArena;
274  ResizeArray<int32*> unique_lists;
275  int32 **listsByAtom = new int32*[natoms];
277  for ( int i=0; i<natoms; ++i ) {
278  curList.resize(0);
279  curList.add(0); // always excluded from self
280 #ifdef MEM_OPT_VERSION
281  const ExclusionSignature *sig = mol->exclSigPool + i;
282  int n = sig->fullExclCnt;
283  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
284  n += 1;
285 #else
286  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
287  int n = mol_list[0] + 1;
288  for ( int j=1; j<n; ++j ) {
289  curList.add(mol_list[j] - i);
290  }
291 #endif
292  curList.sort();
293 
294  int j;
295  for ( j=0; j<unique_lists.size(); ++j ) {
296  if ( n != unique_lists[j][0] ) continue; // no match
297  int k;
298  for ( k=0; k<n; ++k ) {
299  if ( unique_lists[j][k+3] != curList[k] ) break;
300  }
301  if ( k == n ) break; // found match
302  }
303  if ( j == unique_lists.size() ) { // no match
304  int32 *list = listArena.getNewArray(n+3);
305  list[0] = n;
306  int maxdiff = 0;
307  maxdiff = -1 * curList[0];
308  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
309  list[1] = maxdiff;
310  for ( int k=0; k<n; ++k ) {
311  list[k+3] = curList[k];
312  }
313  unique_lists.add(list);
314  }
315  listsByAtom[i] = unique_lists[j];
316  }
317  // sort lists by maxdiff
318  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
319  long int totalbits = 0;
320  int nlists = unique_lists.size();
321  for ( int j=0; j<nlists; ++j ) {
322  int32 *list = unique_lists[j];
323  int maxdiff = list[1];
324  list[2] = totalbits + maxdiff;
325  totalbits += 2*maxdiff + 1;
326  }
327  for ( int i=0; i<natoms; ++i ) {
328  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
329  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
330  }
331  delete [] listsByAtom;
332 
333  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
334 
335  {
336  long int bytesneeded = totalbits / 8;
337  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
338  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
339  unique_lists.size(), bytesneeded);
340  }
341 
342  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
343  if ( bytesneeded > bytesavail ) {
344  char errmsg[512];
345  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
346  "but only %ld bytes can be addressed with 32-bit int.",
347  unique_lists.size(), bytesneeded, bytesavail);
348  NAMD_die(errmsg);
349  }
350  }
351 
352 #define SET_EXCL(EXCL,BASE,DIFF) \
353  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
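// SET_EXCL packs one bit per excluded pair into a flat array of 32-bit
// words.  BASE is the per-list start offset stored in list[2] and DIFF the
// signed index difference between the two atoms; for example BASE=70,
// DIFF=-3 selects absolute bit 67, i.e. word 67>>5 = 2, bit 67&31 = 3.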
354 
355  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
356  memset(exclusion_bits, 0, totalbits/8);
357 
358  long int base = 0;
359  for ( int i=0; i<unique_lists.size(); ++i ) {
360  base += unique_lists[i][1];
361  if ( unique_lists[i][2] != (int32)base ) {
362  NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
363  }
364  int n = unique_lists[i][0];
365  for ( int j=0; j<n; ++j ) {
366  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
367  }
368  base += unique_lists[i][1] + 1;
369  }
370 
371  cuda_bind_exclusions(exclusion_bits, totalbits/32);
372 
373  delete [] exclusion_bits;
374 }
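// A device-side exclusion test would use the {maxdiff, start} pair uploaded
// above roughly as follows (illustrative sketch only; the identifiers below
// are placeholders, not actual NAMD kernel names):
//
//   int2 e = excl_by_atom[id_i];                 // x = maxdiff, y = start
//   int diff = id_j - id_i;
//   bool excluded = false;
//   if ( diff >= -e.x && diff <= e.x ) {
//     int bit = e.y + diff;                      // absolute bit index
//     excluded = (excl_bits[bit >> 5] >> (bit & 31)) & 1;
//   }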
375 
376 
378 
379  if ( ! cudaCompute ) NAMD_bug("register_self called early");
380 
382 
384  cr.c = c;
385  cr.pid[0] = pid; cr.pid[1] = pid;
386  cr.offset = 0.;
387  if ( cudaCompute->patchRecords[pid].isLocal ) {
389  } else {
391  }
392 }
393 
395 
396  if ( ! cudaCompute ) NAMD_bug("register_pair called early");
397 
398  cudaCompute->requirePatch(pid[0]);
399  cudaCompute->requirePatch(pid[1]);
400 
402  cr.c = c;
403  cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
404 
405  int t1 = t[0];
406  int t2 = t[1];
407  Vector offset = cudaCompute->patchMap->center(pid[0])
408  - cudaCompute->patchMap->center(pid[1]);
409  offset.x += (t1%3-1) - (t2%3-1);
410  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
411  offset.z += (t1/9-1) - (t2/9-1);
412  cr.offset = offset;
413 
414  if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
416  } else {
418  }
419 }
420 
422 
423  NAMD_bug("unregister_compute unimplemented");
424 
425 }
426 
427 // static __thread cudaEvent_t start_upload;
428 static __thread cudaEvent_t start_calc;
429 static __thread cudaEvent_t end_remote_download;
430 static __thread cudaEvent_t end_local_download;
431 
434 
435 void init_arrays();
436 
438  ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
439 #ifdef PRINT_GBIS
440  CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
441 #endif
442 
443  if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
444  if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
445  if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
446 
447  // CkPrintf("create ComputeNonbondedCUDA\n");
448  master = m ? m : this;
449  cudaCompute = this;
450  computeMgr = mgr;
453  reduction = 0;
454 
456  if (params->pressureProfileOn) {
457  NAMD_die("pressure profile not supported in CUDA");
458  }
459 
460  atomsChanged = 1;
461  computesChanged = 1;
463 
464  pairlistsValid = 0;
465  pairlistTolerance = 0.;
466  usePairlists = 0;
467  savePairlists = 0;
468  plcutoff2 = 0.;
469 
470  workStarted = 0;
473 
474  // Zero array sizes and pointers
475  init_arrays();
476 
477  atoms_size = 0;
478  atoms = NULL;
479 
480  forces_size = 0;
481  forces = NULL;
482 
483  slow_forces_size = 0;
484  slow_forces = NULL;
485 
486  psiSumH_size = 0;
487  psiSumH = NULL;
488 
489  dEdaSumH_size = 0;
490  dEdaSumH = NULL;
491 
492  if ( master != this ) { // I am slave
494  master->slaves[slaveIndex] = this;
495  if ( master->slavePes[slaveIndex] != CkMyPe() ) {
496  NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
497  }
499  registerPatches();
500  return;
501  }
502  masterPe = CkMyPe();
503 
504  const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
505  if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
507 
508  // Sanity checks for New CUDA kernel
509  if (params->GBISOn) {
510  // GBIS
511  if (deviceCUDA->getNoMergeGrids()) {
512  NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
513  }
514  // Set mergegrids ON as long as user hasn't defined +nomergegrids
516  // Final sanity check
517  if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
518  } else {
519  // non-GBIS
521  NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
522  }
523  }
524 
525 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
526  int leastPriority, greatestPriority;
527  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
528  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
529  if ( leastPriority != greatestPriority ) {
530  if ( CkMyNode() == 0 ) {
531  int dev = deviceCUDA->getDeviceID();
532  CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
533  }
534  if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
535  greatestPriority = leastPriority;
536  }
537  if (params->usePMECUDA) greatestPriority = leastPriority;
538  cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
539  cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
540  } else
541 #endif
542  {
543  cudaStreamCreate(&stream);
544  cuda_errcheck("cudaStreamCreate");
545  int dev = deviceCUDA->getDeviceID();
546  cudaDeviceProp deviceProp;
547  cudaGetDeviceProperties(&deviceProp, dev);
548  cuda_errcheck("cudaGetDeviceProperties");
549  if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
550  if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
551  cudaStreamCreate(&stream2);
552  } else {
553  stream2 = stream;
554  }
555  }
556  cuda_errcheck("cudaStreamCreate");
557 
558  // Get GPU device ID
560 
561  cuda_init();
562  if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
564  // cudaEventCreate(&start_upload);
565  cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
566  cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
567  cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
568 
569  patchRecords.resize(patchMap->numPatches());
572 
573  if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
574 }
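// The constructor leaves the device with two streams: "stream" carries the
// remote (off-node) half of the work and "stream2" the local half.  When the
// device reports a stream priority range, the remote stream gets the
// numerically smallest (highest) priority so its results can be downloaded
// ahead of local work; if PME runs on the same device (PMEOffload or
// usePMECUDA) both streams are dropped to the same low priority, presumably
// leaving the high-priority slot to the PME work.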
575 
576 
578 
580 
581  computesChanged = 1;
582  patch_record &pr = patchRecords[pid];
583  if ( pr.refCount == 0 ) {
584  pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
585  pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
586  if ( deviceCUDA->getMergeGrids() ) {
587  pr.isLocal = 0;
588  } else if ( CkNumNodes() < 2 ) {
589  pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
590  patchMap->index_b(pid) ^ patchMap->index_c(pid) );
591  } else {
592  pr.isLocal = pr.isSameNode;
593  }
594  if ( pr.isLocal ) {
595  localActivePatches.add(pid);
596  } else {
598  }
599  activePatches.add(pid);
600  pr.patchID = pid;
601  pr.hostPe = -1;
602  pr.x = NULL;
603  pr.xExt = NULL;
604  pr.r = NULL;
605  pr.f = NULL;
606  pr.intRad = NULL;
607  pr.psiSum = NULL;
608  pr.bornRad = NULL;
609  pr.dEdaSum = NULL;
610  pr.dHdrPrefix = NULL;
611  }
612  pr.refCount += 1;
613 }
614 
616 
618  int npatches = master->activePatches.size();
619  int *pids = master->activePatches.begin();
620  patch_record *recs = master->patchRecords.begin();
621  for ( int i=0; i<npatches; ++i ) {
622  int pid = pids[i];
623  patch_record &pr = recs[pid];
624  if ( pr.hostPe == CkMyPe() ) {
625  pr.slave = this;
626  pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
627  hostedPatches.add(pid);
628  if ( pr.isLocal ) {
629  localHostedPatches.add(pid);
630  } else {
632  }
634  pr.p = patchMap->patch(pid);
635  pr.positionBox = pr.p->registerPositionPickup(this);
636  pr.forceBox = pr.p->registerForceDeposit(this);
637  if (simParams->GBISOn) {
638  pr.intRadBox = pr.p->registerIntRadPickup(this);
639  pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
640  pr.bornRadBox = pr.p->registerBornRadPickup(this);
641  pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
643  }
644  }
645  }
646  if ( master == this ) setNumPatches(activePatches.size());
648  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
649  CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
650 }
651 
653  bool operator() (int pidj, int pidi) { // i and j reversed
654  int ppi = PATCH_PRIORITY(pidi);
655  int ppj = PATCH_PRIORITY(pidj);
656  if ( ppi != ppj ) return ppi < ppj;
657  return pidi < pidj;
658  }
659 };
660 
662 
663  int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
664  int numPesOnNodeSharingDevice = 0;
665  int masterIndex = -1;
666  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
667  int pe = deviceCUDA->getPesSharingDevice(i);
668  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
669  if ( CkNodeOf(pe) == CkMyNode() ) {
670  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
671  }
672  }
673 
674  int npatches = activePatches.size();
675 
676  if ( npatches ) {
678  }
679 
680  // calculate priority rank of local home patch within pe
681  {
682  ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
683  for ( int i=0; i<npatches; ++i ) {
684  int pid = activePatches[i];
685  int homePe = patchMap->node(pid);
686  if ( CkNodeOf(homePe) == CkMyNode() ) {
687  homePatchByRank[CkRankOf(homePe)].add(pid);
688  }
689  }
690  for ( int i=0; i<CkMyNodeSize(); ++i ) {
692  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
693  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
694  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
695  int pid = homePatchByRank[i][j];
696  patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
697  }
698  }
699  }
700 
701  int *count = new int[npatches];
702  memset(count, 0, sizeof(int)*npatches);
703  int *pcount = new int[numPesOnNodeSharingDevice];
704  memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
705  int *rankpcount = new int[CkMyNodeSize()];
706  memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
707  char *table = new char[npatches*numPesOnNodeSharingDevice];
708  memset(table, 0, npatches*numPesOnNodeSharingDevice);
709 
710  int unassignedpatches = npatches;
711 
712  if ( 0 ) { // assign all to device pe
713  for ( int i=0; i<npatches; ++i ) {
714  int pid = activePatches[i];
715  patch_record &pr = patchRecords[pid];
716  pr.hostPe = CkMyPe();
717  }
718  unassignedpatches = 0;
719  pcount[masterIndex] = npatches;
720  } else
721 
722  // assign if home pe and build table of natural proxies
723  for ( int i=0; i<npatches; ++i ) {
724  int pid = activePatches[i];
725  patch_record &pr = patchRecords[pid];
726  int homePe = patchMap->node(pid);
727  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
728  int pe = pesOnNodeSharingDevice[j];
729  if ( pe == homePe ) {
730  pr.hostPe = pe; --unassignedpatches;
731  pcount[j] += 1;
732  }
733  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
734  table[i*numPesOnNodeSharingDevice+j] = 1;
735  }
736  }
737  if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
738  pr.hostPe = homePe; --unassignedpatches;
739  rankpcount[CkRankOf(homePe)] += 1;
740  }
741  }
742  // assign if only one pe has a required proxy
743  int assignj = 0;
744  for ( int i=0; i<npatches; ++i ) {
745  int pid = activePatches[i];
746  patch_record &pr = patchRecords[pid];
747  if ( pr.hostPe != -1 ) continue;
748  int c = 0;
749  int lastj;
750  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
751  if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
752  }
753  count[i] = c;
754  if ( c == 1 ) {
755  pr.hostPe = pesOnNodeSharingDevice[lastj];
756  --unassignedpatches;
757  pcount[lastj] += 1;
758  }
759  }
760  while ( unassignedpatches ) {
761  int i;
762  for ( i=0; i<npatches; ++i ) {
763  if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
764  int pid = activePatches[i];
765  patch_record &pr = patchRecords[pid];
766  if ( pr.hostPe != -1 ) continue;
767  pr.hostPe = pesOnNodeSharingDevice[assignj];
768  --unassignedpatches;
769  pcount[assignj] += 1;
770  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
771  break;
772  }
773  if ( i<npatches ) continue; // start search again
774  for ( i=0; i<npatches; ++i ) {
775  int pid = activePatches[i];
776  patch_record &pr = patchRecords[pid];
777  if ( pr.hostPe != -1 ) continue;
778  if ( count[i] ) continue;
779  pr.hostPe = pesOnNodeSharingDevice[assignj];
780  --unassignedpatches;
781  pcount[assignj] += 1;
782  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
783  break;
784  }
785  if ( i<npatches ) continue; // start search again
786  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
787  }
788 
789  for ( int i=0; i<npatches; ++i ) {
790  int pid = activePatches[i];
791  patch_record &pr = patchRecords[pid];
792  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
793  }
794 
795  slavePes = new int[CkMyNodeSize()];
796  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
797  numSlaves = 0;
798  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
799  int pe = pesOnNodeSharingDevice[j];
800  int rank = pe - CkNodeFirst(CkMyNode());
801  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
802  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
803  if ( pe == CkMyPe() ) continue;
804  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
805  rankpcount[rank] = 0; // skip in rank loop below
806  slavePes[numSlaves] = pe;
808  ++numSlaves;
809  }
810  for ( int j=0; j<CkMyNodeSize(); ++j ) {
811  int pe = CkNodeFirst(CkMyNode()) + j;
812  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
813  // CkMyPe(),j,pe,rankpcount[j]);
814  if ( ! rankpcount[j] ) continue;
815  if ( pe == CkMyPe() ) continue;
816  slavePes[numSlaves] = pe;
818  ++numSlaves;
819  }
820  registerPatches();
821 
822  delete [] pesOnNodeSharingDevice;
823  delete [] count;
824  delete [] pcount;
825  delete [] rankpcount;
826  delete [] table;
827 }
828 
829 static __thread int atom_params_size;
830 static __thread atom_param* atom_params;
831 
832 static __thread int vdw_types_size;
833 static __thread int* vdw_types;
834 
835 static __thread int dummy_size;
836 static __thread float* dummy_dev;
837 
838 static __thread int force_ready_queue_size;
839 static __thread int *force_ready_queue;
840 static __thread int force_ready_queue_len;
841 static __thread int force_ready_queue_next;
842 
843 static __thread int block_order_size;
844 static __thread int *block_order;
845 
846 static __thread int num_atoms;
847 static __thread int num_local_atoms;
848 static __thread int num_remote_atoms;
849 
850 static __thread int virials_size;
851 static __thread float *virials;
852 static __thread int num_virials;
853 // NOTE: slow_virials is a computed pointer from "virials" -do not deallocate
854 static __thread float *slow_virials;
855 
856 static __thread int energy_gbis_size;
857 static __thread float *energy_gbis;
858 
859 //GBIS host pointers
860 static __thread int intRad0H_size;
861 static __thread float *intRad0H;
862 static __thread int intRadSH_size;
863 static __thread float *intRadSH;
864 static __thread int bornRadH_size;
865 static __thread float *bornRadH;
866 static __thread int dHdrPrefixH_size;
867 static __thread float *dHdrPrefixH;
868 
869 static __thread int cuda_timer_count;
870 static __thread double cuda_timer_total;
871 static __thread double kernel_time;
872 static __thread double remote_submit_time;
873 static __thread double local_submit_time;
874 
875 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
876 
877 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime); // fix Charm++
878 
879 #ifdef PRINT_GBIS
880 #define GBISP(...) CkPrintf(__VA_ARGS__);
881 #else
882 #define GBISP(...)
883 #endif
884 
885 #define count_limit 1000000
886 static __thread int check_count;
887 static __thread int check_remote_count;
888 static __thread int check_local_count;
889 
890 void init_arrays() {
891 
892  atom_params_size = 0;
893  atom_params = NULL;
894 
895  vdw_types_size = 0;
896  vdw_types = NULL;
897 
898  dummy_size = 0;
899  dummy_dev = NULL;
900 
902  force_ready_queue = NULL;
905 
906  block_order_size = 0;
907  block_order = NULL;
908 
909  num_atoms = 0;
910  num_local_atoms = 0;
911  num_remote_atoms = 0;
912 
913  virials_size = 0;
914  virials = NULL;
915  num_virials = 0;
916 
917  energy_gbis_size = 0;
918  energy_gbis = NULL;
919 
920  intRad0H_size = 0;
921  intRad0H = NULL;
922  intRadSH_size = 0;
923  intRadSH = NULL;
924  bornRadH_size = 0;
925  bornRadH = NULL;
926  dHdrPrefixH_size = 0;
927  dHdrPrefixH = NULL;
928 
929 }
930 
931 void cuda_check_progress(void *arg, double walltime) {
933 
934  int flindex;
935  int poll_again = 1;
936  while ( -1 != (flindex = force_ready_queue[force_ready_queue_next]) ) {
937  // CkPrintf("Pe %d forces ready %d is index %d at %lf\n",
938  // CkMyPe(), force_ready_queue_next, flindex, walltime);
941  check_count = 0;
942  if ( force_ready_queue_next == force_ready_queue_len ) {
943  poll_again = 0;
944  CUDA_TRACE_LOCAL(kernel_time,walltime);
945  kernel_time = walltime - kernel_time;
946  // need to guarantee this finishes before the last patch message!
947  ((ComputeNonbondedCUDA *) arg)->workStarted = 0;
948  ((ComputeNonbondedCUDA *) arg)->finishReductions();
949  }
950  ((ComputeNonbondedCUDA *) arg)->messageFinishPatch(flindex);
951  if ( force_ready_queue_next == force_ready_queue_len ) break;
952  }
953  if ( ++check_count >= count_limit ) {
954  char errmsg[256];
955  sprintf(errmsg,"cuda_check_progress polled %d times over %f s on step %d",
956  check_count, walltime - remote_submit_time,
957  ((ComputeNonbondedCUDA *) arg)->step);
958  cudaDie(errmsg,cudaSuccess);
959  }
960  if ( poll_again ) {
961  CcdCallBacksReset(0,walltime); // fix Charm++
963  }
964 }
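// cuda_check_progress is the host-side polling loop used instead of blocking
// on the GPU: the kernels write finished patch indices into the host-mapped
// force_ready_queue, and this callback drains the queue, sends one
// finishPatch message per completed patch, and re-arms itself via CUDA_POLL
// (CcdCallFnAfter) until force_ready_queue_len entries have arrived, at
// which point finishReductions() is called.  count_limit caps how many times
// the poll may spin before the run is aborted with an error.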
965 
966 void cuda_check_remote_progress(void *arg, double walltime) {
967 
969  cudaError_t err = cudaEventQuery(end_remote_download);
970  if ( err == cudaSuccess ) {
971  local_submit_time = walltime;
973  if ( deviceCUDA->getMergeGrids() ) { // no local
975  }
976  check_remote_count = 0;
977  cuda_errcheck("at cuda remote stream completed");
979  } else if ( err != cudaErrorNotReady ) {
980  char errmsg[256];
981  sprintf(errmsg,"in cuda_check_remote_progress after polling %d times over %f s on step %d",
983  ((ComputeNonbondedCUDA *) arg)->step);
984  cudaDie(errmsg,err);
985  } else if ( ++check_remote_count >= count_limit ) {
986  char errmsg[256];
987  sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
989  ((ComputeNonbondedCUDA *) arg)->step);
990  cudaDie(errmsg,err);
991  } else if ( check_local_count ) {
992  NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
993  } else {
994  CcdCallBacksReset(0,walltime); // fix Charm++
996  }
997 }
998 
999 void cuda_check_local_progress(void *arg, double walltime) {
1000 
1002  cudaError_t err = cudaEventQuery(end_local_download);
1003  if ( err == cudaSuccess ) {
1005  kernel_time = walltime - kernel_time;
1006  check_local_count = 0;
1007  cuda_errcheck("at cuda local stream completed");
1009  } else if ( err != cudaErrorNotReady ) {
1010  char errmsg[256];
1011  sprintf(errmsg,"in cuda_check_local_progress after polling %d times over %f s on step %d",
1013  ((ComputeNonbondedCUDA *) arg)->step);
1014  cudaDie(errmsg,err);
1015  } else if ( ++check_local_count >= count_limit ) {
1016  char errmsg[256];
1017  sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
1019  ((ComputeNonbondedCUDA *) arg)->step);
1020  cudaDie(errmsg,err);
1021  } else if ( check_remote_count ) {
1022  NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
1023  } else {
1024  CcdCallBacksReset(0,walltime); // fix Charm++
1026  }
1027 }
1028 
1029 #if 0
1030 // don't use this one unless timer is part of stream, above is better
1031 void cuda_check_progress(void *arg, double walltime) {
1032  if ( cuda_stream_finished() ) {
1033  kernel_time = walltime - kernel_time;
1034  CcdCallBacksReset(0,walltime); // fix Charm++
1035  CUDA_POLL(ccd_index);
1036  // ((ComputeNonbondedCUDA *) arg)->finishWork();
1038  }
1039 }
1040 #endif
1041 
1043  //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
1044  atomsChanged = 1;
1045 }
1046 
1047 static __thread int kernel_launch_state = 0;
1048 
1050  const Lattice &l;
1051  cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
1054  Vector a = l.a();
1055  Vector b = l.b();
1056  Vector c = l.c();
1057  BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
1058  BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
1059  return ( ri < rj );
1060  }
1061 };
1062 
1067  const ComputeNonbondedCUDA::patch_record *patchrecs) : distop(sod), pr(patchrecs) { }
1068  bool pid_compare_priority(int pidi, int pidj) {
1069  const ComputeNonbondedCUDA::patch_record &pri = pr[pidi];
1070  const ComputeNonbondedCUDA::patch_record &prj = pr[pidj];
1071  if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
1072  if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
1073  if ( pri.isSameNode && ! prj.isSameNode ) return 0;
1074  if ( prj.isSameNode && ! pri.isSameNode ) return 1;
1075  if ( pri.isSameNode ) { // and prj.isSameNode
1076  int rpri = pri.reversePriorityRankInPe;
1077  int rprj = prj.reversePriorityRankInPe;
1078  if ( rpri != rprj ) return rpri > rprj;
1079  return sortop_bitreverse(CkRankOf(pri.hostPe),CkRankOf(prj.hostPe));
1080  }
1081  int ppi = PATCH_PRIORITY(pidi);
1082  int ppj = PATCH_PRIORITY(pidj);
1083  if ( ppi != ppj ) return ppi < ppj;
1084  return pidi < pidj;
1085  }
1087  ComputeNonbondedCUDA::compute_record i) { // i and j reversed
1088  int pidi = pid_compare_priority(i.pid[0],i.pid[1]) ? i.pid[0] : i.pid[1];
1089  int pidj = pid_compare_priority(j.pid[0],j.pid[1]) ? j.pid[0] : j.pid[1];
1090  if ( pidi != pidj ) return pid_compare_priority(pidi, pidj);
1091  return distop(i,j);
1092  }
1093 };
1094 
1096  //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
1098  for ( int i=0; i<hostedPatches.size(); ++i ) {
1100  pr.positionBox->skip();
1101  pr.forceBox->skip();
1102  if (simParams->GBISOn) {
1103  pr.intRadBox->skip();
1104  pr.psiSumBox->skip();
1105  pr.bornRadBox->skip();
1106  pr.dEdaSumBox->skip();
1107  pr.dHdrPrefixBox->skip();
1108  }
1109  }
1110 }
1111 
1113 
1115  Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
1116  lattice = flags.lattice;
1117  doSlow = flags.doFullElectrostatics;
1118  doEnergy = flags.doEnergy;
1119  step = flags.step;
1120 
1121  if ( ! flags.doNonbonded ) {
1122  GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1123  if ( master != this ) {
1126  } else {
1127  for ( int i = 0; i < numSlaves; ++i ) {
1129  }
1130  skip();
1131  }
1132  if ( reduction ) {
1133  reduction->submit();
1134  }
1135 
1136  return 1;
1137  }
1138 
1139  for ( int i=0; i<hostedPatches.size(); ++i ) {
1141  if (!simParams->GBISOn || gbisPhase == 1) {
1142  GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1143  pr.x = pr.positionBox->open();
1144  pr.xExt = pr.p->getCompAtomExtInfo();
1145  }
1146 
1147  if (simParams->GBISOn) {
1148  if (gbisPhase == 1) {
1149  GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
1150  pr.intRad = pr.intRadBox->open();
1151  pr.psiSum = pr.psiSumBox->open();
1152  } else if (gbisPhase == 2) {
1153  GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
1154  pr.bornRad = pr.bornRadBox->open();
1155  pr.dEdaSum = pr.dEdaSumBox->open();
1156  } else if (gbisPhase == 3) {
1157  GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
1158  pr.dHdrPrefix = pr.dHdrPrefixBox->open();
1159  }
1160  GBISP("opened GBIS boxes");
1161  }
1162  }
1163 
1164  if ( master == this ) return 0; //work to do, enqueue as usual
1165 
1166  // message masterPe
1169  atomsChanged = 0;
1170 
1171  workStarted = 1;
1172 
1173  return 1;
1174 }
1175 
1177 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
1178 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
1179 
1180  // Set GPU device ID
1181  cudaCheck(cudaSetDevice(deviceID));
1182 
1184  ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
1185 
1186  if ( workStarted ) { //if work already started, check if finished
1187  if ( finishWork() ) { // finished
1188  workStarted = 0;
1189  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
1190  } else { // need to call again
1191  workStarted = 2;
1192  basePriority = PROXY_RESULTS_PRIORITY; // lower for local
1193  if ( master == this && kernel_launch_state > 2 ) {
1194  cuda_check_local_progress(this,0.); // launches polling
1195  }
1196  }
1197  return;
1198  }
1199 
1200  workStarted = 1;
1202 
1204  Parameters *params = Node::Object()->parameters;
1206 
1207  //execute only during GBIS phase 1, or if not using GBIS
1208  if (!simParams->GBISOn || gbisPhase == 1) {
1209 
1210  //bind new patches to GPU
1211  if ( atomsChanged || computesChanged ) {
1212  int npatches = activePatches.size();
1213 
1214  pairlistsValid = 0;
1215  pairlistTolerance = 0.;
1216 
1217  if ( computesChanged ) {
1218  computesChanged = 0;
1219 
1220  if ( ! dummy_size ) {
1221  dummy_size = 1024*1024;
1222  cudaMalloc((void**)&dummy_dev,dummy_size);
1223  cuda_errcheck("in cudaMalloc dummy_dev");
1224  }
1225 
1226  bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
1227  1.2f, cudaHostAllocMapped);
1228  if (did_realloc) {
1229  for (int k=0; k < force_ready_queue_size; ++k)
1230  force_ready_queue[k] = -1;
1231  }
1232  force_ready_queue_len = npatches;
1233  reallocate_host<int>(&block_order, &block_order_size,
1234  2*(localComputeRecords.size()+remoteComputeRecords.size()),
1235  1.2f, cudaHostAllocMapped);
1236 
1237  num_virials = npatches;
1238  reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
1239  1.2f, cudaHostAllocMapped);
1240  slow_virials = virials + 16*num_virials;
1241 
1242  reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
1243  1.2f, cudaHostAllocMapped);
1244  for (int i = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
1245 
1246  int *ap = activePatches.begin();
1247  for ( int i=0; i<localActivePatches.size(); ++i ) {
1248  *(ap++) = localActivePatches[i];
1249  }
1250  for ( int i=0; i<remoteActivePatches.size(); ++i ) {
1251  *(ap++) = remoteActivePatches[i];
1252  }
1253 
1254  // sort computes by distance between patches
1256  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
1257  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
1258 
1259  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1260 
1261  if ( streaming ) {
1262  // iout << "Reverse-priority sorting...\n" << endi;
1263  cr_sortop_reverse_priority sorp(so,patchRecords.begin());
1264  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
1265  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
1266  patchPairsReordered = 0;
1267  //patchPairsReordered = 1;
1268  // int len = remoteComputeRecords.size();
1269  // for ( int i=0; i<len; ++i ) {
1270  // iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
1271  // }
1272  // int len2 = localComputeRecords.size();
1273  // for ( int i=0; i<len2; ++i ) {
1274  // iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
1275  // }
1276  // iout << endi;
1277  } else {
1278  patchPairsReordered = 1;
1279  }
1280 
1281  int nlc = localComputeRecords.size();
1282  int nrc = remoteComputeRecords.size();
1283  computeRecords.resize(nlc+nrc);
1284  compute_record *cr = computeRecords.begin();
1285  for ( int i=0; i<nrc; ++i ) {
1286  *(cr++) = remoteComputeRecords[i];
1287  }
1288  for ( int i=0; i<nlc; ++i ) {
1289  *(cr++) = localComputeRecords[i];
1290  }
1291 
1292  // patch_pair_num[i] = number of patch pairs that involve patch i
1293  patch_pair_num.resize(npatches);
1294  for ( int i=0; i<npatches; ++i ) {
1295  patchRecords[activePatches[i]].localIndex = i;
1296  patch_pair_num[i] = 0;
1297  }
1298 
1299  int ncomputes = computeRecords.size();
1300  patch_pairs.resize(ncomputes);
1301  for ( int i=0; i<ncomputes; ++i ) {
1303  int lp1 = patchRecords[cr.pid[0]].localIndex;
1304  int lp2 = patchRecords[cr.pid[1]].localIndex;
1305  patch_pair_num[lp1]++;
1306  if (lp1 != lp2) patch_pair_num[lp2]++;
1307  patch_pair &pp = patch_pairs[i];
1308  pp.offset.x = cr.offset.x;
1309  pp.offset.y = cr.offset.y;
1310  pp.offset.z = cr.offset.z;
1311  }
1312 
1313  for ( int i=0; i<ncomputes; ++i ) {
1315  int lp1 = patchRecords[cr.pid[0]].localIndex;
1316  int lp2 = patchRecords[cr.pid[1]].localIndex;
1317  patch_pair &pp = patch_pairs[i];
1318  pp.patch1_ind = lp1;
1319  pp.patch2_ind = lp2;
1320  pp.patch1_num_pairs = patch_pair_num[lp1];
1321  pp.patch2_num_pairs = patch_pair_num[lp2];
1322  }
1323 
1324  if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
1325  CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1327  localComputeRecords.size(), remoteComputeRecords.size());
1328  }
1329  } // computesChanged
1330  else if ( ! patchPairsReordered ) {
1331  patchPairsReordered = 1;
1332  int len = patch_pairs.size();
1333  int nlc = localComputeRecords.size();
1334  int nrc = remoteComputeRecords.size();
1335  if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
1336  ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
1337  ResizeArray<patch_pair> new_patch_pairs(len);
1338  int irc=nrc;
1339  int ilc=len;
1340  for ( int i=0; i<len; ++i ) {
1341  int boi = block_order[i];
1342  int dest;
1343  if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
1344  new_computeRecords[dest] = computeRecords[boi];
1345  new_patch_pairs[dest] = patch_pairs[boi];
1346  }
1347  if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
1348  computeRecords.swap(new_computeRecords);
1349  patch_pairs.swap(new_patch_pairs);
1350  }
1351 
1352  int istart = 0;
1353  int i;
1354  for ( i=0; i<npatches; ++i ) {
1355  if ( i == localActivePatches.size() ) {
1356  num_local_atoms = istart;
1357  }
1359  pr.localStart = istart;
1360  int natoms = pr.p->getNumAtoms();
1361  int nfreeatoms = natoms;
1362  if ( fixedAtomsOn ) {
1363  const CompAtomExt *aExt = pr.xExt;
1364  for ( int j=0; j<natoms; ++j ) {
1365  if ( aExt[j].atomFixed ) --nfreeatoms;
1366  }
1367  }
1368  pr.numAtoms = natoms;
1369  pr.numFreeAtoms = nfreeatoms;
1370  istart += natoms;
1371  istart += 16 - (natoms & 15);
1372  }
1373  if ( i == localActivePatches.size() ) {
1374  num_local_atoms = istart;
1375  }
1376  num_atoms = istart;
1378  reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
1379  reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
1380  reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
1381  reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1382  reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1383  if (simParams->GBISOn) {
1384  reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
1385  reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
1386  reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1387  reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
1388  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1389  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
1390  }
1391 
1392  int bfstart = 0;
1393  int exclmask_start = 0;
1394  int ncomputes = computeRecords.size();
1395  for ( int i=0; i<ncomputes; ++i ) {
1397  int p1 = cr.pid[0];
1398  int p2 = cr.pid[1];
1399  patch_pair &pp = patch_pairs[i];
1400  pp.patch1_start = patchRecords[p1].localStart;
1401  pp.patch1_size = patchRecords[p1].numAtoms;
1402  pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
1403  pp.patch2_start = patchRecords[p2].localStart;
1404  pp.patch2_size = patchRecords[p2].numAtoms;
1405  pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
1406  pp.plist_start = bfstart;
 1407  // size1*size2 = number of 32-atom warp-tile pairs for this patch pair
1408  int size1 = (pp.patch1_size-1)/WARPSIZE+1;
1409  int size2 = (pp.patch2_size-1)/WARPSIZE+1;
1410  pp.plist_size = (size1*size2-1)/32+1;
1411  bfstart += pp.plist_size;
1412  pp.exclmask_start = exclmask_start;
1413  exclmask_start += size1*size2;
1414  } //for ncomputes
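 // Example of the pairlist sizing above (assuming WARPSIZE == 32): each
 // patch is tiled into ceil(size/32) warps, so patch1_size = 150 and
 // patch2_size = 100 give size1 = 5 and size2 = 4, hence 20 warp-tile pairs.
 // The pairlist holds one bit per tile pair, so plist_size = (20-1)/32+1 = 1
 // word, while exclmask_start advances by the full 20 exclusion masks.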
1415 
1416  cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
1417  activePatches.size(), num_atoms, bfstart,
1418  exclmask_start);
1419 
1420  } // atomsChanged || computesChanged
1421 
1422  double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
1423 
1424  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1425  float maxAtomMovement = 0.;
1426  float maxPatchTolerance = 0.;
1427 
1428  for ( int i=0; i<activePatches.size(); ++i ) {
1430 
1431  float maxMove = pr.p->flags.maxAtomMovement;
1432  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1433 
1434  float maxTol = pr.p->flags.pairlistTolerance;
1435  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1436 
1437  int start = pr.localStart;
1438  int n = pr.numAtoms;
1439  const CompAtom *a = pr.x;
1440  const CompAtomExt *aExt = pr.xExt;
1441  if ( atomsChanged ) {
1442 
1443  atom_param *ap = atom_params + start;
1444  for ( int k=0; k<n; ++k ) {
1445  int j = aExt[k].sortOrder;
1446  ap[k].vdw_type = a[j].vdwType;
1447  vdw_types[start + k] = a[j].vdwType;
1448  ap[k].index = aExt[j].id;
1449 #ifdef MEM_OPT_VERSION
1450  ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1451  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1452 #else // ! MEM_OPT_VERSION
1453  ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
1454  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
1455 #endif // MEM_OPT_VERSION
1456  }
1457  }
1458  {
1459 #if 1
1460  const CudaAtom *ac = pr.p->getCudaAtomList();
1461  CudaAtom *ap = atoms + start;
1462  memcpy(ap, ac, sizeof(atom)*n);
1463 #else
1464  Vector center =
1466  atom *ap = atoms + start;
1467  for ( int k=0; k<n; ++k ) {
1468  int j = aExt[k].sortOrder;
1469  ap[k].position.x = a[j].position.x - center.x;
1470  ap[k].position.y = a[j].position.y - center.y;
1471  ap[k].position.z = a[j].position.z - center.z;
1472  ap[k].charge = charge_scaling * a[j].charge;
1473  }
1474 #endif
1475  }
1476  }
1477 
1478  savePairlists = 0;
1479  usePairlists = 0;
1480  if ( flags.savePairlists ) {
1481  savePairlists = 1;
1482  usePairlists = 1;
1483  } else if ( flags.usePairlists ) {
1484  if ( ! pairlistsValid ||
1485  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1487  } else {
1488  usePairlists = 1;
1489  }
1490  }
1491  if ( ! usePairlists ) {
1492  pairlistsValid = 0;
1493  }
1494  float plcutoff = cutoff;
1495  if ( savePairlists ) {
1496  pairlistsValid = 1;
1497  pairlistTolerance = 2. * maxPatchTolerance;
1498  plcutoff += pairlistTolerance;
1499  }
1500  plcutoff2 = plcutoff * plcutoff;
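 // Pairlist margin logic: when pairlists are saved they are built with an
 // enlarged cutoff, plcutoff = cutoff + 2*maxPatchTolerance, and on later
 // steps they are reused only while 2*maxAtomMovement stays within that
 // stored pairlistTolerance; otherwise the saved lists are treated as stale
 // and pairlistsValid is cleared.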
1501 
1502  //CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1503  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1504 
1505 #if 0
1506  // calculate warp divergence
1507  if ( 1 ) {
1508  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1509  Lattice &lattice = flags.lattice;
1510  float3 lata, latb, latc;
1511  lata.x = lattice.a().x;
1512  lata.y = lattice.a().y;
1513  lata.z = lattice.a().z;
1514  latb.x = lattice.b().x;
1515  latb.y = lattice.b().y;
1516  latb.z = lattice.b().z;
1517  latc.x = lattice.c().x;
1518  latc.y = lattice.c().y;
1519  latc.z = lattice.c().z;
1520 
1521  int ncomputes = computeRecords.size();
1522  for ( int ic=0; ic<ncomputes; ++ic ) {
1523  patch_pair &pp = patch_pairs[ic];
1524  atom *a1 = atoms + pp.patch1_atom_start;
1525  int n1 = pp.patch1_size;
1526  atom *a2 = atoms + pp.patch2_atom_start;
1527  int n2 = pp.patch2_size;
1528  float offx = pp.offset.x * lata.x
1529  + pp.offset.y * latb.x
1530  + pp.offset.z * latc.x;
1531  float offy = pp.offset.x * lata.y
1532  + pp.offset.y * latb.y
1533  + pp.offset.z * latc.y;
1534  float offz = pp.offset.x * lata.z
1535  + pp.offset.y * latb.z
1536  + pp.offset.z * latc.z;
1537  // CkPrintf("%f %f %f\n", offx, offy, offz);
1538  int atoms_tried = 0;
1539  int blocks_tried = 0;
1540  int atoms_used = 0;
1541  int blocks_used = 0;
1542  for ( int ii=0; ii<n1; ii+=32 ) { // warps
1543  for ( int jj=0; jj<n2; jj+=16 ) { // shared atom loads
1544  int block_used = 0;
1545  for ( int j=jj; j<jj+16 && j<n2; ++j ) { // shared atoms
1546  int atom_used = 0;
1547  for ( int i=ii; i<ii+32 && i<n1; ++i ) { // threads
1548  float dx = offx + a1[i].position.x - a2[j].position.x;
1549  float dy = offy + a1[i].position.y - a2[j].position.y;
1550  float dz = offz + a1[i].position.z - a2[j].position.z;
1551  float r2 = dx*dx + dy*dy + dz*dz;
1552  if ( r2 < cutoff2 ) atom_used = 1;
1553  }
1554  ++atoms_tried;
1555  if ( atom_used ) { block_used = 1; ++atoms_used; }
1556  }
1557  ++blocks_tried;
1558  if ( block_used ) { ++blocks_used; }
1559  }
1560  }
1561  CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
1562  blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
1563  atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
1564  }
1565  }
1566 #endif
1567 
1568  } // !GBISOn || gbisPhase == 1
1569 
1570  //Do GBIS
1571  if (simParams->GBISOn) {
1572  //open GBIS boxes depending on phase
1573  for ( int i=0; i<activePatches.size(); ++i ) {
1575  GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
1576  if (gbisPhase == 1) {
1577  //Copy GBIS intRadius to Host
1578  if (atomsChanged) {
1579  float *intRad0 = intRad0H + pr.localStart;
1580  float *intRadS = intRadSH + pr.localStart;
1581  for ( int k=0; k<pr.numAtoms; ++k ) {
1582  int j = pr.xExt[k].sortOrder;
1583  intRad0[k] = pr.intRad[2*j+0];
1584  intRadS[k] = pr.intRad[2*j+1];
1585  }
1586  }
1587  } else if (gbisPhase == 2) {
1588  float *bornRad = bornRadH + pr.localStart;
1589  for ( int k=0; k<pr.numAtoms; ++k ) {
1590  int j = pr.xExt[k].sortOrder;
1591  bornRad[k] = pr.bornRad[j];
1592  }
1593  } else if (gbisPhase == 3) {
1594  float *dHdrPrefix = dHdrPrefixH + pr.localStart;
1595  for ( int k=0; k<pr.numAtoms; ++k ) {
1596  int j = pr.xExt[k].sortOrder;
1597  dHdrPrefix[k] = pr.dHdrPrefix[j];
1598  }
1599  } // end phases
1600  } // end for patches
1601  } // if GBISOn
1602 
1603  kernel_time = CkWallTimer();
1604  kernel_launch_state = 1;
1605  //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
1606  if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
1607 }
1608 
1609 void cuda_check_remote_calc(void *arg, double walltime) {
1610  // in theory we only need end_remote_calc, but overlap isn't reliable
1611  // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
1612  if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
1613 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
1615 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
1616  } else {
1617  CcdCallBacksReset(0,walltime); // fix Charm++
1619  }
1620 }
1621 
1622 void cuda_check_local_calc(void *arg, double walltime) {
1623  // in theory we only need end_local_calc, but overlap isn't reliable
1624  // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
1625  if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
1626 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
1628 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
1629  } else {
1630  CcdCallBacksReset(0,walltime); // fix Charm++
1632  }
1633 }
1634 
1635 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
1636 
1638 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
1639 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
1640 workStarted, gbisPhase, kernel_launch_state, pe)
1641 
1642  float3 lata, latb, latc;
1643  lata.x = lattice.a().x;
1644  lata.y = lattice.a().y;
1645  lata.z = lattice.a().z;
1646  latb.x = lattice.b().x;
1647  latb.y = lattice.b().y;
1648  latb.z = lattice.b().z;
1649  latc.x = lattice.c().x;
1650  latc.y = lattice.c().y;
1651  latc.z = lattice.c().z;
1653 
1654  // Set GPU device ID
1655  cudaSetDevice(deviceID);
1656 
1657  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1658 
1659  double walltime;
1660  if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
1661  walltime = CkWallTimer();
1662  CcdCallBacksReset(0,walltime); // fix Charm++
1663  }
1664 
1665  switch ( kernel_launch_state ) {
1667 // Remote
1668  case 1:
1669 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
1671  //gpu_is_mine = 0;
1673  remote_submit_time = walltime;
1674 
1675  if (!simParams->GBISOn || gbisPhase == 1) {
1676  // cudaEventRecord(start_upload, stream);
1677  if ( atomsChanged ) {
1680  }
1681  if ( simParams->GBISOn) {
1683  if ( atomsChanged ) {
1685  }
1686  }
1687  atomsChanged = 0;
1688  cuda_bind_atoms((const atom *)atoms);
1691  if ( simParams->GBISOn) {
1693  }
1694  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1695  //call CUDA Kernels
1696 
1697  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1698  0,remoteComputeRecords.size(),
1701  if (simParams->GBISOn) {
1702  cuda_GBIS_P1(
1703  0,remoteComputeRecords.size(),
1705  simParams->alpha_cutoff-simParams->fsMax,
1706  simParams->coulomb_radius_offset,
1707  lata, latb, latc, stream);
1708  }
1709  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1710  // num_local_atom_records,num_remote_atom_records);
1711  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1712  cudaEventRecord(end_remote_download, stream);
1713  }
1714  if ( streaming ) {
1717  } else {
1719  }
1720  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1722  break;
1723  }
1724  } // !GBIS or gbisPhase==1
1725  if (simParams->GBISOn) {
1726  if (gbisPhase == 1) {
1727  //GBIS P1 Kernel launched in previous code block
1728  } else if (gbisPhase == 2) {
1729 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
1730  // cudaEventRecord(start_upload, stream);
1733  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1734  cuda_GBIS_P2(
1735  0,remoteComputeRecords.size(),
1737  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1738  simParams->nonbondedScaling, simParams->kappa,
1739  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1740  simParams->dielectric, simParams->solvent_dielectric,
1741  lata, latb, latc,
1743  );
1744  cudaEventRecord(end_remote_download, stream);
1746  } else if (gbisPhase == 3) {
1747 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
1748  // cudaEventRecord(start_upload, stream);
1750  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1751  if (doSlow)
1752  cuda_GBIS_P3(
1753  0,remoteComputeRecords.size(),
1755  (simParams->alpha_cutoff-simParams->fsMax),
1756  simParams->coulomb_radius_offset,
1757  simParams->nonbondedScaling,
1758  lata, latb, latc, stream
1759  );
1760  cudaEventRecord(end_remote_download, stream);
1762  }
1763  }
1764 
1766 // Local
1767  case 2:
1768 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
1770  //gpu_is_mine = 0;
1772 
1773  if ( stream2 != stream ) {
1774  // needed to ensure that upload is finished
1775  cudaStreamWaitEvent(stream2, start_calc, 0);
1776  // priorities do not prevent local from launching ahead
1777  // of remote, so delay local stream with a small memset
1778  cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
1779  }
1780 
1781  if (!simParams->GBISOn || gbisPhase == 1) {
1782 
1783  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1787  if (simParams->GBISOn) {
1788  cuda_GBIS_P1(
1791  simParams->alpha_cutoff-simParams->fsMax,
1792  simParams->coulomb_radius_offset,
1793  lata, latb, latc, stream2 );
1794  }
1795  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1796  // 0,num_local_atom_records);
1797  //cuda_load_virials(virials, doSlow); // slow_virials follows virials
1798  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1799  cudaEventRecord(end_local_download, stream2);
1800  }
1801  if ( ! streaming && workStarted == 2 ) {
1802  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1803 cuda_check_local_progress\n", CkMyPe())
1805  }
1806  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1807  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1808 cuda_check_local_calc\n", CkMyPe())
1810  break;
1811  }
1812 
1813  } // !GBIS or gbisPhase==1
1814  if (simParams->GBISOn) {
1815  if (gbisPhase == 1) {
1816  //GBIS P1 Kernel launched in previous code block
1817  } else if (gbisPhase == 2) {
1818 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
1819  cuda_GBIS_P2(
1822  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1823  simParams->nonbondedScaling, simParams->kappa,
1824  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1825  simParams->dielectric, simParams->solvent_dielectric,
1826  lata, latb, latc,
1828  );
1829  cudaEventRecord(end_local_download, stream2);
1830  if ( workStarted == 2 ) {
1832  }
1833  } else if (gbisPhase == 3) {
1834 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
1835  if (doSlow)
1836  cuda_GBIS_P3(
1839  (simParams->alpha_cutoff-simParams->fsMax),
1840  simParams->coulomb_radius_offset,
1841  simParams->nonbondedScaling,
1842  lata, latb, latc, stream2
1843  );
1844  cudaEventRecord(end_local_download, stream2);
1845  if ( workStarted == 2 ) {
1847  }
1848  } // phases
1849  } // GBISOn
1850  if ( simParams->PMEOn && simParams->PMEOffload && !simParams->usePMECUDA) break;
1851 
1852  default:
1853 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
1854  //gpu_is_mine = 1;
1856  break;
1857  } // switch
1858 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
1859 }
1860 
1862  int pid = activePatches[flindex];
1863  patch_record &pr = patchRecords[pid];
1864  //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
1866 }
1867 
1869  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
1871  pr.r = pr.forceBox->open();
1872  finishPatch(pr);
1873  pr.positionBox->close(&(pr.x));
1874  pr.forceBox->close(&(pr.r));
1875 }
1876 
1877 void ComputeNonbondedCUDA::finishPatch(patch_record &pr) {
1878  int start = pr.localStart;
1879  const CompAtomExt *aExt = pr.xExt;
1880  int nfree = pr.numAtoms;
1881  pr.f = pr.r->f[Results::nbond];
1882  Force *f = pr.f;
1883  Force *f_slow = pr.r->f[Results::slow];
1884  const CompAtom *a = pr.x;
1885  // int *ao = atom_order + start;
1886  float4 *af = master->forces + start;
1887  float4 *af_slow = master->slow_forces + start;
1888  // only free atoms return forces
1889  for ( int k=0; k<nfree; ++k ) {
1890  int j = aExt[k].sortOrder;
1891  f[j].x += af[k].x;
1892  f[j].y += af[k].y;
1893  f[j].z += af[k].z;
1894  // wcount += af[k].w;
1895  // virial += af[k].w;
1896  if ( doSlow ) {
1897  f_slow[j].x += af_slow[k].x;
1898  f_slow[j].y += af_slow[k].y;
1899  f_slow[j].z += af_slow[k].z;
1900  // virial_slow += af_slow[k].w;
1901  }
1902  }
1903 }
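// Illustrative sketch, not part of this file: the loop above is a scatter from
// the GPU's sorted atom order back to the patch's native order, keyed by
// CompAtomExt::sortOrder.  In isolation it amounts to the following, where the
// function and parameter names are placeholders (types from NamdTypes.h):
static void scatterForces(const float4 *gpuForce,      // forces in sorted order
                          const CompAtomExt *atomExt,  // sortOrder per sorted atom
                          Force *patchForce,           // output in patch order
                          int numAtoms) {
  for (int k = 0; k < numAtoms; ++k) {
    int j = atomExt[k].sortOrder;      // destination slot in the patch arrays
    patchForce[j].x += gpuForce[k].x;
    patchForce[j].y += gpuForce[k].y;
    patchForce[j].z += gpuForce[k].z;
    // gpuForce[k].w is bookkeeping (see the commented-out wcount/virial lines
    // above); it is not added to the force.
  }
}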
1904 
1905 //dtanner
1907  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
1908 
1909  if ( master == this ) {
1910  for ( int i = 0; i < numSlaves; ++i ) {
1912  }
1913  }
1914 
1915 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
1916 CkMyPe(), workStarted, gbisPhase)
1917 
1920 
1921  ResizeArray<int> &patches( workStarted == 1 ?
1923 
1924  // long long int wcount = 0;
1925  // double virial = 0.;
1926  // double virial_slow = 0.;
1927  for ( int i=0; i<patches.size(); ++i ) {
1928  // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
1929  patch_record &pr = master->patchRecords[patches[i]];
1930 
1931  if ( !simParams->GBISOn || gbisPhase == 1 ) {
1932  patch_record &pr = master->patchRecords[patches[i]];
1933 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
1934  pr.r = pr.forceBox->open();
1935  } // !GBISOn || gbisPhase==1
1936 
1937  int start = pr.localStart;
1938  const CompAtomExt *aExt = pr.xExt;
1939  if ( !simParams->GBISOn || gbisPhase == 3 ) {
1940  finishPatch(pr);
1941  } // !GBISOn || gbisPhase == 3
1942 
1943 #if 0
1944  if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
1945  CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
1946  j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
1947  af[j].w);
1948  }
1949 #endif
1950 
1951  //Close Boxes depending on Phase
1952  if (simParams->GBISOn) {
1953  if (gbisPhase == 1) {
1954  //Copy psiSum from Host to Patch Box
1955  GBReal *psiSumMaster = master->psiSumH + start;
1956  for ( int k=0; k<pr.numAtoms; ++k ) {
1957  int j = aExt[k].sortOrder;
1958  pr.psiSum[j] += psiSumMaster[k];
1959  }
1960 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
1961  pr.psiSumBox->close(&(pr.psiSum));
1962 
1963  } else if (gbisPhase == 2) {
1964  //Copy dEdaSum from Host to Patch Box
1965  GBReal *dEdaSumMaster = master->dEdaSumH + start;
1966  for ( int k=0; k<pr.numAtoms; ++k ) {
1967  int j = aExt[k].sortOrder;
1968  pr.dEdaSum[j] += dEdaSumMaster[k];
1969  }
1970 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
1971  pr.dEdaSumBox->close(&(pr.dEdaSum));
1972 
1973  } else if (gbisPhase == 3) {
1974 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
1975  pr.intRadBox->close(&(pr.intRad)); //box 6
1976  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1977  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1978  pr.positionBox->close(&(pr.x)); //box 0
1979  pr.forceBox->close(&(pr.r));
1980  } //end phases
1981  } else { //not GBIS
1982 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
1983  pr.positionBox->close(&(pr.x));
1984  pr.forceBox->close(&(pr.r));
1985  }
1986  }// end for
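// Box bookkeeping in the loop above: without GBIS the position and force boxes
// are closed as soon as a patch is finished.  With GBIS, phase 1 closes only
// psiSumBox, phase 2 closes only dEdaSumBox, and phase 3 closes the intRad,
// bornRad, dHdrPrefix, position and force boxes, completing the patch for this
// step.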
1987 
1988 #if 0
1989  virial *= (-1./6.);
1990  reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
1991  reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
1992  reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
1993  if ( doSlow ) {
1994  virial_slow *= (-1./6.);
1995  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
1996  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
1997  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
1998  }
1999 #endif
2000 
2001  if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
2002  ( localHostedPatches.size() || master == this ) ) {
2003  GBISP("not finished, call again\n");
2004  return 0; // not finished, call again
2005  }
2006 
2007  if ( master != this ) { // finished
2008  GBISP("finished\n");
2009  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2010  atomsChanged = 0;
2011  return 1;
2012  }
2013 
2015 
2016  if ( !simParams->GBISOn || gbisPhase == 3 ) {
2017 
2018  atomsChanged = 0;
2019  finishReductions();
2020 
2021  } // !GBISOn || gbisPhase==3
2022 
2023  // Next GBIS Phase
2024 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
2025  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2026 
2027  GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
2028  return 1; // finished and ready for next step
2029 }
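// The phase counter advances as gbisPhase = 1 + (gbisPhase % 3), i.e. 1 -> 2,
// 2 -> 3 and 3 -> 1, so with GBIS enabled three passes through finishWork()
// make up one full step; without GBIS the single pass above completes it.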
2030 
2031 
2033  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
2034 
2035  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
2036 
2038 
2039  if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
2040  if ( patchPairsReordered++ == 2) {
2041  int patch_len = patchRecords.size();
2042  ResizeArray<int> plast(patch_len);
2043  for ( int i=0; i<patch_len; ++i ) {
2044  plast[i] = -1;
2045  }
2046  int order_len = localComputeRecords.size()+remoteComputeRecords.size();
2047  for ( int i=0; i<order_len; ++i ) {
2048  plast[computeRecords[block_order[i]].pid[0]] = i;
2049  iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
2050  }
2051  iout << endi;
2052  for ( int i=0; i<patch_len; ++i ) {
2053  iout << "patch_last " << i << " " << plast[i] << "\n";
2054  }
2055  iout << endi;
2056  }
2057  }
2058 
2059  {
2060  Tensor virial_tensor;
2061  BigReal energyv = 0.;
2062  BigReal energye = 0.;
2063  BigReal energys = 0.;
2064  int nexcluded = 0;
2065  for ( int i = 0; i < num_virials; ++i ) {
2066  virial_tensor.xx += virials[16*i];
2067  virial_tensor.xy += virials[16*i+1];
2068  virial_tensor.xz += virials[16*i+2];
2069  virial_tensor.yx += virials[16*i+3];
2070  virial_tensor.yy += virials[16*i+4];
2071  virial_tensor.yz += virials[16*i+5];
2072  virial_tensor.zx += virials[16*i+6];
2073  virial_tensor.zy += virials[16*i+7];
2074  virial_tensor.zz += virials[16*i+8];
2075  energyv += virials[16*i+9];
2076  energye += virials[16*i+10];
2077  energys += virials[16*i+11];
2078  nexcluded += ((int *)virials)[16*i+12];
2079  if (simParams->GBISOn) {
2080  energye += energy_gbis[i];
2081  }
2082  }
2084  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
2085  if ( doEnergy ) {
2086  reduction->item(REDUCTION_LJ_ENERGY) += energyv;
2087  reduction->item(REDUCTION_ELECT_ENERGY) += energye;
2089  }
2090  }
2091  if ( doSlow ) {
2092  Tensor virial_slow_tensor;
2093  for ( int i = 0; i < num_virials; ++i ) {
2094  virial_slow_tensor.xx += slow_virials[16*i];
2095  virial_slow_tensor.xy += slow_virials[16*i+1];
2096  virial_slow_tensor.xz += slow_virials[16*i+2];
2097  virial_slow_tensor.yx += slow_virials[16*i+3];
2098  virial_slow_tensor.yy += slow_virials[16*i+4];
2099  virial_slow_tensor.yz += slow_virials[16*i+5];
2100  virial_slow_tensor.zx += slow_virials[16*i+6];
2101  virial_slow_tensor.zy += slow_virials[16*i+7];
2102  virial_slow_tensor.zz += slow_virials[16*i+8];
2103  }
2104  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
2105  }
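// Illustrative sketch, not part of this file: as read back above, each
// per-block record in the virials array spans 16 floats:
//   [0..8]   virial tensor, row major (xx xy xz yx yy yz zx zy zz)
//   [9]      van der Waals energy       [10] electrostatic energy
//   [11]     a third energy term, accumulated into energys (its reduction
//            line is elided from this listing)
//   [12]     exclusion count, stored as an int
//   [13..15] apparently padding to a 16-float stride
// A matching host-side view, with a placeholder name, could look like:
struct VirialRecordView {
  float vxx, vxy, vxz, vyx, vyy, vyz, vzx, vzy, vzz;  // indices 0..8
  float energyVdw, energyElect, energySlow;           // indices 9..11
  int   exclusionCount;                               // index 12
  float pad[3];                                       // indices 13..15
};
static_assert(sizeof(VirialRecordView) == 16*sizeof(float),
              "one record per 16-float stride");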
2106 
2107  reduction->submit();
2108 
2109  cuda_timer_count++;
2110  if ( simParams->outputCudaTiming &&
2111  step % simParams->outputCudaTiming == 0 ) {
2112 
2113  // int natoms = mol->numAtoms;
2114  // double wpa = wcount; wpa /= natoms;
2115 
2116  // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
2117  // kernel_time * 1.e3, time * 1.e3, wpa);
2118 
2119 #if 0
2120  float upload_ms, remote_calc_ms;
2121  float local_calc_ms, total_ms;
2122  cuda_errcheck("before event timers");
2123  cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
2124  cuda_errcheck("in event timer 1");
2125  cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
2126  cuda_errcheck("in event timer 2");
2127  cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
2128  cuda_errcheck("in event timer 3");
2129  cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
2130  cuda_errcheck("in event timer 4");
2131  cuda_errcheck("in event timers");
2132 
2133  CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
2134  CkMyPe(), upload_ms, remote_calc_ms,
2135  local_calc_ms, total_ms);
2136 #endif
2137 
2138  if ( cuda_timer_count >= simParams->outputCudaTiming ) {
2140  CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
2141  step, cuda_timer_total * 1.e3, CkMyPe());
2142  }
2143  cuda_timer_count = 0;
2144  cuda_timer_total = 0;
2145  }
2146 
2147 }
2148 
2149 
2150 #endif // NAMD_CUDA
2151 