NAMD
CudaComputeNonbonded.C
1 #include <algorithm>
2 #include <map>
3 #include <vector>
4 #include "CudaUtils.h"
5 #include "CudaRecord.h"
6 #include "NamdTypes.h"
7 #include "Patch.h"
8 #include "PatchMap.h"
9 #include "ProxyMgr.h"
10 #include "Node.h"
11 #include "ObjectArena.h"
12 // #include "ComputeCUDAMgr.h"
13 #include "ReductionMgr.h"
14 #include "CudaComputeNonbonded.h"
15 #include "WorkDistrib.h"
16 #include "HomePatch.h"
17 #include "Priorities.h"
18 #include "ComputePmeCUDAMgr.h"
19 #include "ComputeNonbondedUtil.h"
20 #include "PatchData.h"
21 //#include "CudaUtils.h"
22 
23 #include "NamdEventsProfiling.h"
24 
25 #include "DeviceCUDA.h"
26 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
27 #ifdef WIN32
28 #define __thread __declspec(thread)
29 #endif
30 extern __thread DeviceCUDA *deviceCUDA;
31 #endif
32 
33 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
34 
35 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
36 //
37 // Class constructor
38 //
39 CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
40  CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
41 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
42 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
43 
44  cudaCheck(cudaSetDevice(deviceID));
45 
46  exclusionsByAtom = NULL;
47 
48  vdwTypes = NULL;
49  vdwTypesSize = 0;
50 
51  exclIndexMaxDiff = NULL;
52  exclIndexMaxDiffSize = 0;
53 
54  atomIndex = NULL;
55  atomIndexSize = 0;
56 
57  atomStorageSize = 0;
58 
59  // Atom and charge storage
60  atoms = NULL;
61  atomsSize = 0;
62  part = NULL;
63  partSize = 0;
64  doAlch = false;
65  lambdaWindowUpdated = false;
66 
67  // Force storage
68  h_forces = NULL;
69  h_forcesSize = 0;
70  h_forcesSlow = NULL;
71  h_forcesSlowSize = 0;
72 
73  d_forces = NULL;
74  d_forcesSize = 0;
75  d_forcesSlow = NULL;
76  d_forcesSlowSize = 0;
77 
78  // GBIS
79  intRad0H = NULL;
80  intRad0HSize = 0;
81  intRadSH = NULL;
82  intRadSHSize = 0;
83  psiSumH = NULL;
84  psiSumHSize = 0;
85  bornRadH = NULL;
86  bornRadHSize = 0;
87  dEdaSumH = NULL;
88  dEdaSumHSize = 0;
89  dHdrPrefixH = NULL;
90  dHdrPrefixHSize = 0;
91  maxShmemPerBlock = 0;
92  cudaPatches = NULL;
93 
94  atomsChangedIn = true;
95  atomsChanged = true;
96  computesChanged = true;
97 
98  forceDoneEventRecord = false;
99 
100  doNbThole = false;
101  isDrude = nullptr;
102  isDrudeSize = 0;
103  drudeAtomAlpha = nullptr;
104  drudeAtomAlphaSize = 0;
105 
106  SimParameters *simParams = Node::Object()->simParameters;
107  if (simParams->pressureProfileOn) {
108  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
109  }
110 
111  if (simParams->GBISOn) gbisPhase = 3;
112 
113  doSkip = false;
114 }
115 
116 //
117 // Class destructor
118 //
119 CudaComputeNonbonded::~CudaComputeNonbonded() {
120  // fprintf(stderr, "Pe %d calling destructor ", CkMyPe());
121  cudaCheck(cudaSetDevice(deviceID));
122  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
123  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
124  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
125  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
126  if (part != NULL) deallocate_host<char>(&part);
127  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
128  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
129  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
130  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
131 
132  // GBIS
133  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
134  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
135  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
136  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
137  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
138  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
139 
140  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
141 
142  // Drude/NbThole
143  if (isDrude != nullptr) deallocate_host(&isDrude);
144  if (drudeAtomAlpha != nullptr) deallocate_host(&drudeAtomAlpha);
145 
146  if (patches.size() > 0) {
147  deallocate_host<VirialEnergy>(&h_virialEnergy);
148  deallocate_device<VirialEnergy>(&d_virialEnergy);
149  cudaCheck(cudaStreamDestroy(stream));
150  cudaCheck(cudaEventDestroy(forceDoneEvent));
151  CmiDestroyLock(lock);
152  if (reductionGpuOffload) {
153  delete reductionGpuOffload;
154  }
155  if (reductionGpuResident) {
156  delete reductionGpuResident;
157  }
158  }
159 
160  // NOTE: unregistering happens in the [sync] entry method
161  // fprintf(stderr, "unregistering patches on pe %d\n", CkMyPe());
162  computeMgr->sendUnregisterBoxesOnPe(pes, this);
163 
164 }
165 
166 void CudaComputeNonbonded::unregisterBox(int i) {
167  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
168  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
169  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
170  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
171  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
172  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
173  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
174 }
175 
176 void CudaComputeNonbonded::unregisterBoxesOnPe() {
177  if (rankPatches[CkMyRank()].size() == 0)
178  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
179  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
180  unregisterBox(rankPatches[CkMyRank()][i]);
181  }
182 }
183 
184 //
185 // Register single-patch (self) compute.
186 // Only serialized calls allowed
187 //
188 void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {
189  computesChanged = true;
190  addPatch(pid);
191  addCompute(cid, pid, pid, 0.);
192 }
193 
194 //
195 // Register pair-patch compute.
196 // Only serialized calls allowed
197 //
198 void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID* pid, int* trans) {
199  computesChanged = true;
200  addPatch(pid[0]);
201  addPatch(pid[1]);
202  PatchMap* patchMap = PatchMap::Object();
203  int t1 = trans[0];
204  int t2 = trans[1];
205  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
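 // The transform index t encodes the periodic image of a patch as
 // t = (ix+1) + 3*(iy+1) + 9*(iz+1) with ix,iy,iz in {-1,0,1}; the three
 // lines below recover the per-axis image shifts and add their difference
 // to the center-to-center offset of the two patches.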
206  offset.x += (t1%3-1) - (t2%3-1);
207  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
208  offset.z += (t1/9-1) - (t2/9-1);
209  addCompute(cid, pid[0], pid[1], offset);
210 }
211 
212 //
213 // Add patch
214 //
215 void CudaComputeNonbonded::addPatch(PatchID pid) {
216  patches.push_back(PatchRecord(pid));
217 }
218 
219 //
220 // Add compute
221 //
222 void CudaComputeNonbonded::addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset) {
223  ComputeRecord cr;
224  cr.cid = cid;
225  cr.pid[0] = pid1;
226  cr.pid[1] = pid2;
227  cr.offset = offset;
228  computes.push_back(cr);
229 }
230 
231 //
232 // Update numAtoms and numFreeAtoms on a patch
233 //
234 void CudaComputeNonbonded::updatePatch(int i) {
235  int numAtoms = patches[i].patch->getNumAtoms();
236  int numFreeAtoms = numAtoms;
237  if ( fixedAtomsOn ) {
238  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
239  for ( int j=0; j< numAtoms; ++j ) {
240  if ( aExt[j].atomFixed ) --numFreeAtoms;
241  }
242  }
243  patches[i].numAtoms = numAtoms;
244  patches[i].numFreeAtoms = numFreeAtoms;
245  cudaPatches[i].numAtoms = numAtoms;
246  cudaPatches[i].numFreeAtoms = numFreeAtoms;
247 #ifdef NODEGROUP_FORCE_REGISTER
248  cudaPatches[i].patchID = patches[i].patchID;
249 #endif
250 }
251 
252 int CudaComputeNonbonded::findPid(PatchID pid) {
253  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
254  int j = rankPatches[CkMyRank()][i];
255  if (patches[j].patchID == pid) return j;
256  }
257  return -1;
258 }
259 
260 void CudaComputeNonbonded::patchReady(PatchID pid, int doneMigration, int seq) {
261  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate();
262  // however, that is already called in Sequencer::runComputeObjects_CUDA.
263  // The functionality of updatePatch() was moved into updatePatches()
264  if (!(params->CUDASOAintegrate && params->useDeviceMigration)) {
265  if (doneMigration) {
266  int i = findPid(pid);
267  if (i == -1)
268  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
269  updatePatch(i);
270  }
271  CmiLock(lock);
272  Compute::patchReady(pid, doneMigration, seq);
273  CmiUnlock(lock);
274  }
275 }
276 
277 void CudaComputeNonbonded::gbisP2PatchReady(PatchID pid, int seq) {
278  CmiLock(lock);
279  Compute::gbisP2PatchReady(pid, seq);
280  CmiUnlock(lock);
281 }
282 
283 void CudaComputeNonbonded::gbisP3PatchReady(PatchID pid, int seq) {
284  CmiLock(lock);
285  Compute::gbisP3PatchReady(pid, seq);
286  CmiUnlock(lock);
287 }
288 
289 void CudaComputeNonbonded::assignPatch(int i) {
290 
291  PatchMap* patchMap = PatchMap::Object();
292  PatchID pid = patches[i].patchID;
293  Patch* patch = patchMap->patch(pid);
294  if (patch == NULL) {
295  // Create ProxyPatch if none exists
296  ProxyMgr::Object()->createProxy(pid);
297  patch = patchMap->patch(pid);
298  }
299  patches[i].patch = patch;
300  if (patches[i].patch == NULL) {
301  NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
302  }
303  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
304  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);
305  SimParameters *simParams = Node::Object()->simParameters;
306  if (simParams->GBISOn) {
307  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
308  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
309  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
310  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
311  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
312  }
313  // Store Pe where this patch was registered
314 #if 1
315  if (patches[i].pe != CkMyPe()) {
316  NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
317  }
318 #else
319  patches[i].pe = CkMyPe();
320 #endif
321  //
322  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
323  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
324 }
325 
326 struct pid_sortop_reverse_priority {
327  bool operator() (int2 pidj, int2 pidi) { // i and j reversed
328  int ppi = PATCH_PRIORITY(pidi.x);
329  int ppj = PATCH_PRIORITY(pidj.x);
330  if ( ppi != ppj ) return ppi < ppj;
331  return pidi.x < pidj.x;
332  }
333 };
334 
335 void CudaComputeNonbonded::assignPatchesOnPe() {
336  if (rankPatches[CkMyRank()].size() == 0)
337  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
338 
339  // calculate priority rank of local home patch within pe
340  {
341  PatchMap* patchMap = PatchMap::Object();
342  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
343  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
344  int i = rankPatches[CkMyRank()][k];
345  int pid = patches[i].patchID;
346  int homePe = patchMap->node(pid);
347  if ( CkNodeOf(homePe) == CkMyNode() ) {
348  int2 pid_index;
349  pid_index.x = pid;
350  pid_index.y = i;
351  homePatchByRank[CkRankOf(homePe)].add(pid_index);
352  }
353  }
354  for ( int i=0; i<CkMyNodeSize(); ++i ) {
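 // Sort this rank's home patches by descending PATCH_PRIORITY (hence the
 // "reverse" comparator); patches homed on the rank executing this code get
 // an extra masterBoost offset added to their reverse-priority rank below.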
355  pid_sortop_reverse_priority so;
356  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
357  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
358  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
359  int index = homePatchByRank[i][j].y;
360  patches[index].reversePriorityRankInPe = j + masterBoost;
361  }
362  }
363  }
364 
365  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
366  assignPatch(rankPatches[CkMyRank()][i]);
367  }
368 }
369 
370 //
371 // Returns the Pe of Patch ID "pid", or -1 if not found
372 //
373 // int findHomePatchPe(std::vector<PatchIDList>& rankPatchIDs, PatchID pid) {
374 int findHomePatchPe(PatchIDList* rankPatchIDs, PatchID pid) {
375  // for (int i=0;i < rankPatchIDs.size();i++) {
376  for (int i=0;i < CkMyNodeSize();i++) {
377  if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
378  }
379  return -1;
380 }
381 
382 //
383 // Find all PEs that have Patch
384 //
385 void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
386  proxyPatchPes.clear();
387  for (int i=0;i < CkMyNodeSize();i++) {
388  int pe = CkNodeFirst(CkMyNode()) + i;
389  if (PatchMap::ObjectOnPe(pe)->patch(pid) != NULL)
390  proxyPatchPes.push_back(pe);
391  }
392 }
393 
394 //
395 // Called after all computes have been registered
396 //
397 void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
398  // Remove duplicate patches
399  std::sort(patches.begin(), patches.end());
400  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
401  patches.erase(last, patches.end());
402  // Set number of patches
403  setNumPatches(patches.size());
404  masterPe = CkMyPe();
405  computeMgr = computeMgrIn;
406  // Start patch counter
407  patchesCounter = getNumPatches();
408  // Patch ID map
409  std::map<PatchID, int> pidMap;
410 #if 1
411  //-------------------------------------------------------
412  // Copied in from ComputeNonbondedCUDA::assignPatches()
413  //-------------------------------------------------------
414 
415  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
416  int numPesOnNodeSharingDevice = 0;
417  int masterIndex = -1;
418  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
419  int pe = deviceCUDA->getPesSharingDevice(i);
420  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
421  if ( CkNodeOf(pe) == CkMyNode() ) {
422  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
423  }
424  }
425 
426  std::vector<int> count(patches.size(), 0);
427  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
428  std::vector<int> rankpcount(CkMyNodeSize(), 0);
429  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
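 // table[i*numPesOnNodeSharingDevice + j] is set below when PE j already
 // holds patch i (as a home patch or an existing proxy), i.e. it marks the
 // "natural" proxies that can be reused when assigning patches to PEs.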
430 
431  PatchMap* patchMap = PatchMap::Object();
432 
433  int unassignedpatches = patches.size();
434 
435  for (int i=0;i < patches.size(); ++i) {
436  patches[i].pe = -1;
437  }
438 
439  // assign if home pe and build table of natural proxies
440  for (int i=0;i < patches.size(); ++i) {
441  int pid = patches[i].patchID;
442  // homePe = PE where the patch currently resides
443  int homePe = patchMap->node(pid);
444  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
445  int pe = pesOnNodeSharingDevice[j];
446  // If homePe is sharing this device, assign this patch to homePe
447  if ( pe == homePe ) {
448  patches[i].pe = pe;
449  --unassignedpatches;
450  pcount[j] += 1;
451  }
452  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
453  table[i*numPesOnNodeSharingDevice + j] = 1;
454  }
455  }
456  // Assign this patch to homePe, if it resides on the same node
457  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
458  patches[i].pe = homePe;
459  --unassignedpatches;
460  rankpcount[CkRankOf(homePe)] += 1;
461  }
462  }
463  // assign if only one pe has a required proxy
464  for (int i=0; i < patches.size(); ++i) {
465  int pid = patches[i].patchID;
466  if ( patches[i].pe != -1 ) continue;
467  int c = 0;
468  int lastj;
469  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
470  if ( table[i*numPesOnNodeSharingDevice + j] ) {
471  ++c;
472  lastj = j;
473  }
474  }
475  count[i] = c;
476  if ( c == 1 ) {
477  patches[i].pe = pesOnNodeSharingDevice[lastj];
478  --unassignedpatches;
479  pcount[lastj] += 1;
480  }
481  }
482  int assignj = 0;
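 // Round-robin over the device's PEs for the patches that are still
 // unassigned: first take patches that already have a proxy on the current
 // candidate PE (table entry set), then fall back to patches with no proxy
 // anywhere on this node (count[i] == 0).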
483  while ( unassignedpatches ) {
484  int i;
485  for (i=0;i < patches.size(); ++i) {
486  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
487  int pid = patches[i].patchID;
488  // patch_record &pr = patchRecords[pid];
489  if ( patches[i].pe != -1 ) continue;
490  patches[i].pe = pesOnNodeSharingDevice[assignj];
491  --unassignedpatches;
492  pcount[assignj] += 1;
493  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
494  break;
495  }
496  if (i < patches.size() ) continue; // start search again
497  for ( i=0;i < patches.size(); ++i ) {
498  int pid = patches[i].patchID;
499  // patch_record &pr = patchRecords[pid];
500  if ( patches[i].pe != -1 ) continue;
501  if ( count[i] ) continue;
502  patches[i].pe = pesOnNodeSharingDevice[assignj];
503  --unassignedpatches;
504  pcount[assignj] += 1;
505  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
506  break;
507  }
508  if ( i < patches.size() ) continue; // start search again
509  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
510  }
511 
512  // For each rank, list of patches
513  rankPatches.resize(CkMyNodeSize());
514  for (int i=0; i < patches.size(); ++i) {
515  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
516  pidMap[patches[i].patchID] = i;
517  }
518 
519  // for ( int i=0; i < patches.size(); ++i ) {
520  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
521  // }
522 #else
523  // For each rank, list of patches
524  rankPatches.resize(CkMyNodeSize());
525  // For each rank, list of home patch IDs
526  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
527  for (int i=0;i < CkMyNodeSize();i++) {
528  int pe = CkNodeFirst(CkMyNode()) + i;
529  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
530  }
531  std::vector<int> proxyPatchPes;
532  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
533  //--------------------------------------------------------
534  // Build a list of PEs to avoid
535  std::vector<int> pesToAvoid;
536 #if 0
537  // Avoid other GPUs' master PEs
538  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
539  int pe = deviceCUDA->getMasterPeForDeviceID(i);
540  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
541  }
542  // Avoid PEs that are involved in PME
543  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
544  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
545  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
546  }
547  // Set counters of avoidable PEs to high numbers
548  for (int i=0;i < pesToAvoid.size();i++) {
549  int pe = pesToAvoid[i];
550  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
551  }
552 #endif
553  // Avoid master Pe somewhat
554  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
555  //--------------------------------------------------------
556  for (int i=0;i < patches.size();i++) {
557  //if I had this datastructure "patches" on the GPU, I could use it
558  PatchID pid = patches[i].patchID;
559  int pe = findHomePatchPe(rankHomePatchIDs, pid);
560  if (pe == -1) {
561  // Patch not present on this node => try finding a ProxyPatch
562  findProxyPatchPes(proxyPatchPes, pid);
563  if (proxyPatchPes.size() == 0) {
564  // No ProxyPatch => create one on rank that has the least ProxyPatches
565  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
566  pe = CkNodeFirst(CkMyNode()) + rank;
567  peProxyPatchCounter[rank]++;
568  } else {
569  // Choose a ProxyPatch, trying to avoid masterPe (current Pe) and PEs that already have a ProxyPatch;
570  // this is done by finding the entry with the minimum peProxyPatchCounter value.
571  // Find the minimum among proxyPatchPes, i.e., find the minimum among
572  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
573  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
574  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
575  // - proxyPatchPes.begin();
576  // pe = proxyPatchPes[pppi];
577  int minCounter = (1 << 30);
578  for (int j=0;j < proxyPatchPes.size();j++) {
579  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
580  pe = proxyPatchPes[j];
581  minCounter = peProxyPatchCounter[CkRankOf(pe)];
582  }
583  }
584  if (pe == -1)
585  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
586  peProxyPatchCounter[CkRankOf(pe)]++;
587  }
588  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
589  // Found home patch on this node, but it's on PE that should be avoided => find a new one
590  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
591  pe = CkNodeFirst(CkMyNode()) + rank;
592  peProxyPatchCounter[rank]++;
593  }
594  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
595  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
596  rankPatches[CkRankOf(pe)].push_back(i);
597  pidMap[pid] = i;
598  }
599 
600  delete [] rankHomePatchIDs;
601 #endif
602  // Setup computes using pidMap
603  for (int i=0;i < computes.size();i++) {
604  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
605  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
606  }
607  for (int i=0;i < CkMyNodeSize();i++) {
608  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
609  }
610  computeMgr->sendAssignPatchesOnPe(pes, this);
611 }
612 
613 void CudaComputeNonbonded::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
614  // DMC This vector of CudaLocalRecords doesn't have the correct number of peer records
615  std::map<int, int> pidMap;
616  for (int i=0; i < data.size(); ++i) {
617  pidMap[data[i].patchID] = i;
618  }
619 
620  std::vector<PatchRecord> copy = patches;
621 
622  for (int i=0; i < copy.size(); i++) {
623  const int new_idx = pidMap[copy[i].patchID];
624  patches[new_idx] = copy[i];
625  }
626 
627  for (int i=0; i < rankPatches.size(); i++) {
628  rankPatches[i].clear();
629  }
630  for (int i=0; i < patches.size(); ++i) {
631  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
632  }
633 
634  // Setup computes using pidMap
635  for (int i=0;i < computes.size();i++) {
636  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
637  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
638  }
639  // TODO do we need to call sendAssignPatchesOnPe with the new order?
640 }
641 
642 void CudaComputeNonbonded::initialize() {
643  if (patches.size() > 0) {
644  npairlists = 0;
645  // Allocate CUDA version of patches
646  cudaCheck(cudaSetDevice(deviceID));
647  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
648 
649  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
650  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
651 
652  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
653  */
654  cudaDeviceProp props;
655  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); // Gets properties of device 'deviceID'
656  maxShmemPerBlock = props.sharedMemPerBlock;
657 
658 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
659  int leastPriority, greatestPriority;
660  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
661  int priority = (doStreaming) ? leastPriority : greatestPriority;
662  // int priority = greatestPriority;
663  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
664 #else
665  cudaCheck(cudaStreamCreate(&stream));
666 #endif
667  cudaCheck(cudaEventCreate(&forceDoneEvent));
668 
669  buildExclusions();
670 
671  lock = CmiCreateLock();
672  params = Node::Object()->simParameters;
673  if (params->CUDASOAintegrateMode) {
674  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
675  }
676  reductionGpuOffload = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
677  doNbThole = params->drudeOn && (params->drudeNbtholeCut > 0.0);
678 
679 #ifdef NODEGROUP_FORCE_REGISTER
680  int devInd = deviceCUDA->getDeviceIndex();
681  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
682  PatchData *patchData = cpdata.ckLocalBranch();
683  patchData->devData[devInd].nbond_stream = stream;
684  // Fill auxiliary arrays for merging forces here
685  PatchMap* map = PatchMap::Object();
686  int nGlobalPatches = map->numPatches();
687  allocate_host<bool>( &(patchData->devData[devInd].h_hasPatches), nGlobalPatches);
688  memset(patchData->devData[devInd].h_hasPatches, 0, sizeof(bool)*nGlobalPatches);
689 
690  for(int i = 0; i < patches.size(); i++){
691  patchData->devData[devInd].h_hasPatches[patches[i].patchID] = true;
692  }
693  allocate_device<bool>( &(patchData->devData[devInd].d_hasPatches), nGlobalPatches);
694  copy_HtoD_sync<bool>( patchData->devData[devInd].h_hasPatches, patchData->devData[devInd].d_hasPatches, nGlobalPatches);
695 #endif
696  }
697 }
698 
699 //
700 // atomUpdate() can be called by any Pe
701 //
702 void CudaComputeNonbonded::atomUpdate() {
703  atomsChangedIn = true;
704 }
705 
706 //
707 // Compute patches[].atomStart, patches[].numAtoms, patches[].numFreeAtoms, and atomStorageSize
708 //
709 void CudaComputeNonbonded::updatePatches() {
710  if(params->CUDASOAintegrate && params->useDeviceMigration) {
711 #ifdef NODEGROUP_FORCE_REGISTER
712  const int deviceIndex = deviceCUDA->getDeviceIndex();
713  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
714  PatchData *patchData = cpdata.ckLocalBranch();
715  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
716  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
717 
718  // Maximum number of tiles per tile list
719  maxTileListLen = 0;
720  int atomStart = 0;
721  for (int i=0;i < numPatchesHomeAndProxy; i++) {
722  patches[i].numAtoms = localPatches[i].numAtoms;
723  patches[i].numFreeAtoms = localPatches[i].numAtoms;
724  patches[i].atomStart = localPatches[i].bufferOffsetNBPad;
725  cudaPatches[i].numAtoms = localPatches[i].numAtoms;
726  cudaPatches[i].numFreeAtoms = localPatches[i].numAtoms;
727  cudaPatches[i].atomStart = localPatches[i].bufferOffsetNBPad;
728  cudaPatches[i].patchID = localPatches[i].patchID;
729  // Haochuan: count the number of fixed atoms per patch
730  if (fixedAtomsOn) {
731  Patch* patch = NULL;
732  // Search the patch map to determine the number of free atoms of this patch
733  for (int j = 0; j < deviceCUDA->getNumPesSharingDevice(); j++){
734  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(j));
735  patch = pm->patch(localPatches[i].patchID);
736  if (patch != NULL) break;
737  }
738  if (patch == NULL) NAMD_bug("CudaComputeNonbonded::updatePatches cannot find patch.\n");
739  if (patch->getNumAtoms() != localPatches[i].numAtoms) {
740  NAMD_bug("CudaComputeNonbonded::updatePatches numAtoms mismatches!\n");
741  }
742  const CompAtomExt *aExt = patch->getCompAtomExtInfo();
743  for (int j = 0; j < localPatches[i].numAtoms; ++j) {
744  if (aExt[j].atomFixed) {
745  --patches[i].numFreeAtoms;
746  --cudaPatches[i].numFreeAtoms;
747  }
748  }
749  }
750  int numAtoms = patches[i].numAtoms;
751 #if defined(NAMD_CUDA)
752  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, WARPSIZE);
753  maxTileListLen = std::max(maxTileListLen, numTiles);
754  // computeAtomPad will recompute the number of tiles. Recomputing for clarity
755  atomStart += CudaComputeNonbondedKernel::computeAtomPad(numAtoms, WARPSIZE);
756 #else
757  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, BOUNDINGBOXSIZE);
758  maxTileListLen = std::max(maxTileListLen, numTiles);
759  // computeAtomPad will recompute the number of tiles. Recomputing for clarity
760  atomStart += CudaComputeNonbondedKernel::computeAtomPad(numAtoms, BOUNDINGBOXSIZE);
761 #endif
762  }
763  atomStorageSize = atomStart;
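 // atomStorageSize is the total padded atom count: each patch's range is
 // rounded up to a full tile (WARPSIZE atoms for CUDA, BOUNDINGBOXSIZE for
 // HIP); in the host code path copyAtomsSubset() later fills the padding
 // slots with copies of the patch's last atom.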
764 
765  if (maxTileListLen >= 65536) {
766  NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
767  }
768 #endif
769  } else {
770 
771  // Maximum number of tiles per tile list
772  maxTileListLen = 0;
773  int atomStart = 0;
774  for (int i=0;i < patches.size();i++) {
775  patches[i].atomStart = atomStart;
776  cudaPatches[i].atomStart = atomStart;
777 #ifdef NODEGROUP_FORCE_REGISTER
778  cudaPatches[i].patchID = patches[i].patchID;
779 #endif
780  int numAtoms = patches[i].numAtoms;
781 #ifdef NAMD_HIP
782  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, BOUNDINGBOXSIZE);
783  maxTileListLen = std::max(maxTileListLen, numTiles);
784  atomStart += CudaComputeNonbondedKernel::computeAtomPad(numAtoms, BOUNDINGBOXSIZE);
785 #else
786  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, WARPSIZE);
787  maxTileListLen = std::max(maxTileListLen, numTiles);
788  atomStart += CudaComputeNonbondedKernel::computeAtomPad(numAtoms, WARPSIZE);
789 #endif
790  }
791  atomStorageSize = atomStart;
792 
793  if (maxTileListLen >= 65536) {
794  NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
795  }
796  }
797 }
798 
799 void CudaComputeNonbonded::skipPatch(int i) {
800  if (CkMyPe() != patches[i].pe)
801  NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
802  Flags &flags = patches[i].patch->flags;
803  patches[i].positionBox->skip();
804  patches[i].forceBox->skip();
805  if (flags.doGBIS) {
806  patches[i].psiSumBox->skip();
807  patches[i].intRadBox->skip();
808  patches[i].bornRadBox->skip();
809  patches[i].dEdaSumBox->skip();
810  patches[i].dHdrPrefixBox->skip();
811  }
812 }
813 
814 void CudaComputeNonbonded::skipPatchesOnPe() {
815  if (rankPatches[CkMyRank()].size() == 0)
816  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
817  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
818  skipPatch(rankPatches[CkMyRank()][i]);
819  }
820  bool done = false;
821  CmiLock(lock);
822  patchesCounter -= rankPatches[CkMyRank()].size();
823  if (patchesCounter == 0) {
824  patchesCounter = getNumPatches();
825  done = true;
826  }
827  CmiUnlock(lock);
828  if (done) {
829  // Reduction must be done on masterPe
830  computeMgr->sendFinishReductions(masterPe, this);
831  }
832 }
833 
834 void CudaComputeNonbonded::skip() {
835  if (CkMyPe() != masterPe)
836  NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
837 
838  if (patches.size() == 0) return;
839 
840  SimParameters *simParams = Node::Object()->simParameters;
841  doSkip = true;
842  if (!simParams->CUDASOAintegrate) {
843  computeMgr->sendSkipPatchesOnPe(pes, this);
844  }
845 }
846 
847 void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
848  if (CkMyPe() != masterPe)
849  NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
850 
851  for (int i=0;i < patches.size();++i) {
852  PatchRecord &pr = patches[i];
853 
854  float maxMove = pr.patch->flags.maxAtomMovement;
855  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
856 
857  float maxTol = pr.patch->flags.pairlistTolerance;
858  //if(pr.patch->getPatchID() == 0) fprintf(stderr,
859  // "\n\nP0: Maximum mov/tol during CudaComputeNonbonded: %lf %lf\n", maxMove, maxTol);
860  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
861  }
862 }
863 
864 inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
865  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
866  c->updateVdwTypesExclSubset(first, last);
867 }
868 
869 void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
870  Molecule* mol;
871  if (doNbThole) {
872  mol = Node::Object()->molecule;
873  }
874  for (int i=first;i <= last;i++) {
875  PatchRecord &pr = patches[i];
876  int start = pr.atomStart;
877  int numAtoms = pr.numAtoms;
878  const CompAtom *compAtom = pr.compAtom;
879  const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
880  // Atoms have changed, re-do exclusions and vdw types
881  int2* exclp = exclIndexMaxDiff + start;
882  int* aip = atomIndex + start;
883  char* pst;
884  if(doAlch) pst = part + start;
885  // Drude/NbThole
886  int* isDrudeStart;
887  float* drudeAtomAlphaStart;
888  if (doNbThole) {
889  isDrudeStart = isDrude + start;
890  drudeAtomAlphaStart = drudeAtomAlpha + start;
891  }
892  for ( int k=0;k < numAtoms; ++k ) {
893  int j = compAtomExt[k].sortOrder;
894  vdwTypes[start + k] = compAtom[j].vdwType;
895  aip[k] = compAtomExt[j].id;
896  if(doAlch) pst[k] = compAtom[j].partition;
897 #ifdef MEM_OPT_VERSION
898  exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
899  exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
900 #else // ! MEM_OPT_VERSION
901  exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
902  exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
903 #endif // MEM_OPT_VERSION
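 // Drude/NbThole bookkeeping: for a Drude particle, isDrude stores the
 // global index of its parent (mother) atom, otherwise -1; drudeAtomAlpha
 // stores cbrt(alpha) of the parent (or of the atom itself). The global
 // indices stored here are remapped to nonbonded-array indices in
 // updateVdwTypesExcl() via atomIndexToNBindex.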
904  if (doNbThole) {
905  isDrudeStart[k] = mol->is_drude(aip[k]) ? int(mol->get_mother_atom(aip[k])) : -1;
906  drudeAtomAlphaStart[k] = std::cbrt(mol->is_drude(aip[k]) ? mol->GetAtomAlpha(mol->get_mother_atom(aip[k])) : mol->GetAtomAlpha(aip[k]));
907  atomIndexToNBindex[aip[k]] = start + k;
908  }
909  }
910  }
911 }
912 
913 //
914 // Called every time the atoms change
915 //
916 void CudaComputeNonbonded::updateVdwTypesExcl() {
917  // Re-allocate (VdwTypes, exclIndexMaxDiff) as needed
918  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
919  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
920  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
921  if (doAlch) reallocate_host<char>(&part, &partSize, atomStorageSize, 1.4f);
922  if (doNbThole) {
923  reallocate_host(&isDrude, &isDrudeSize, atomStorageSize, 1.4f);
924  std::fill(isDrude, isDrude + atomStorageSize, -1);
925  reallocate_host(&drudeAtomAlpha, &drudeAtomAlphaSize, atomStorageSize, 1.4f);
926  std::fill(drudeAtomAlpha, drudeAtomAlpha + atomStorageSize, 0.0);
927  const Molecule* mol = Node::Object()->molecule;
928  atomIndexToNBindex.resize(mol->numAtoms);
929  }
930 
931  if (!(params->CUDASOAintegrate && params->useDeviceMigration)) {
932 #if CMK_SMP && USE_CKLOOP
933  int useCkLoop = Node::Object()->simParameters->useCkLoop;
934  if (useCkLoop >= 1) {
935  CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
936  } else
937 #endif
938  {
939  updateVdwTypesExclSubset(0, patches.size()-1);
940  }
941 
942  if (doNbThole) {
943  auto updateIsDrudeNBIndexWorker = [&](int start, int end, void* unused){
944  for (int i = start;i <= end;i++) {
945  if (isDrude[i] > -1) {
946  isDrude[i] = atomIndexToNBindex[isDrude[i]];
947  }
948  }
949  };
950  const Molecule* mol = Node::Object()->molecule;
951 #if CMK_SMP && USE_CKLOOP
952  const int useCkLoop = Node::Object()->simParameters->useCkLoop;
953  if (useCkLoop >= 1) {
954  const int numChunks = CkMyNodeSize();
955  const int lowerRange = 0;
956  const int upperRange = atomStorageSize - 1;
957  CkLoop_Parallelize(numChunks, lowerRange, upperRange, updateIsDrudeNBIndexWorker, NULL, CKLOOP_NONE, NULL);
958  } else
959 #endif
960  {
961  updateIsDrudeNBIndexWorker(0, atomStorageSize - 1, nullptr);
962  }
963 #if defined (NAMD_CUDA) || defined (NAMD_HIP)
964  nonbondedKernel.updateDrudeData(atomIndexSize, drudeAtomAlpha, isDrude, stream);
965  cudaCheck(cudaStreamSynchronize(stream));
966 #endif
967  }
968 
969  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
970  } else {
971 #ifdef NODEGROUP_FORCE_REGISTER
972  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
973  PatchData *patchData = cpdata.ckLocalBranch();
974  const int deviceIndex = deviceCUDA->getDeviceIndex();
975  nonbondedKernel.updateVdwTypesExclOnGPU(tileListKernel,
976  patchData->devData[deviceIndex].numPatchesHomeAndProxy,
977  atomStorageSize, params->alchOn,
978  patchData->devData[deviceIndex].d_localPatches,
979  patchData->h_soa_vdwType[deviceIndex],
980  patchData->h_soa_id[deviceIndex],
981  patchData->h_soa_sortOrder[deviceIndex],
982  patchData->h_soa_partition[deviceIndex],
983  stream
984  );
985 #endif // NODEGROUP_FORCE_REGISTER
986  if (doNbThole) {
987  // TODO
988  NAMD_bug("Unimplemented feature combination: Nbthole and GPU atom migration.\n");
989  }
990  }
991 }
992 
993 inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
994  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
995  c->copyAtomsSubset(first, last);
996 }
997 
998 void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
999  for (int i=first;i <= last;++i) {
1000  PatchRecord &pr = patches[i];
1001  int numAtoms = pr.numAtoms;
1002  if (numAtoms > 0) {
1003  int start = pr.atomStart;
1004  const CudaAtom *src = pr.patch->getCudaAtomList();
1005  CudaAtom *dst = atoms + start;
1006  memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
1007  // Fill the rest with the copy of the last atom
1008 #ifdef NAMD_HIP
1009  int numAtomsAlign = ((numAtoms-1)/BOUNDINGBOXSIZE+1)*BOUNDINGBOXSIZE;
1010 #else
1011  int numAtomsAlign = ((numAtoms-1)/WARPSIZE+1)*WARPSIZE;
1012 #endif
1013  CudaAtom lastAtom = src[numAtoms-1];
1014  for (int j=numAtoms;j < numAtomsAlign;j++) {
1015  dst[j] = lastAtom;
1016  }
1017 #if 0
1018  fprintf(stderr, " printing patch %d\n", pr.patch->getPatchID());
1019  for(int k = 0; k < numAtoms; k++){
1020  fprintf(stderr, "%lf %lf %lf\n", dst[k].x, dst[k].y, dst[k].z);
1021  }
1022 #endif
1023  }
1024  }
1025 }
1026 
1027 void CudaComputeNonbonded::copyGBISphase(int i) {
1028  if (CkMyPe() != patches[i].pe)
1029  NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
1030  PatchRecord &pr = patches[i];
1031  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1032  if (gbisPhase == 1) {
1033  //Copy GBIS intRadius to Host
1034  if (atomsChanged) {
1035  float *intRad0 = intRad0H + pr.atomStart;
1036  float *intRadS = intRadSH + pr.atomStart;
1037  for (int k=0;k < pr.numAtoms;++k) {
1038  int j = aExt[k].sortOrder;
1039  intRad0[k] = pr.intRad[2*j+0];
1040  intRadS[k] = pr.intRad[2*j+1];
1041  }
1042  }
1043  } else if (gbisPhase == 2) {
1044  float *bornRad = bornRadH + pr.atomStart;
1045  for ( int k=0; k < pr.numAtoms; ++k ) {
1046  int j = aExt[k].sortOrder;
1047  bornRad[k] = pr.bornRad[j];
1048  }
1049  } else if (gbisPhase == 3) {
1050  float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
1051  for ( int k=0; k < pr.numAtoms; ++k ) {
1052  int j = aExt[k].sortOrder;
1053  dHdrPrefix[k] = pr.dHdrPrefix[j];
1054  }
1055  } // end phases
1056 }
1057 
1058 void CudaComputeNonbonded::openBox(int i) {
1059  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1060 
1061  if (CkMyPe() != patches[i].pe)
1062  NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
1063  SimParameters *simParams = Node::Object()->simParameters;
1064  if (!simParams->GBISOn || gbisPhase == 1) {
1065  // what is positionBox????
1066  patches[i].compAtom = patches[i].positionBox->open();
1067  // the compAtom datastructure is null for PEs
1068  //fprintf(stderr, "opening box at patches[%d] = %p\n", i, patches[i].compAtom);
1069  // JM: This is not necessary if we already have the positions from integration
1070  // This is only necessary in the first iteration
1071  // XXX TODO: Find out if we really need to open the position box or if we
1072  // can skip this step entirely
1073 #ifdef NODEGROUP_FORCE_REGISTER
1074  if(simParams->CUDASOAintegrate){
1075  if(atomsChanged && !simParams->useDeviceMigration) copyAtomsSubset(i, i);
1076  }else copyAtomsSubset(i, i);
1077 #else
1078  copyAtomsSubset(i, i);
1079 #endif
1080  }
1081  if (simParams->GBISOn) {
1082  if (gbisPhase == 1) {
1083  patches[i].intRad = patches[i].intRadBox->open();
1084  patches[i].psiSum = patches[i].psiSumBox->open();
1085  } else if (gbisPhase == 2) {
1086  patches[i].bornRad = patches[i].bornRadBox->open();
1087  patches[i].dEdaSum = patches[i].dEdaSumBox->open();
1088  } else if (gbisPhase == 3) {
1089  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
1090  }
1091  copyGBISphase(i);
1092  }
1093 
1094  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1095 }
1096 
1097 void CudaComputeNonbonded::messageEnqueueWork() {
1098  if (masterPe != CkMyPe())
1099  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
1100  WorkDistrib::messageEnqueueWork(this);
1101 }
1102 
1103 void CudaComputeNonbonded::openBoxesOnPe() {
1104  if (rankPatches[CkMyRank()].size() == 0)
1105  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
1106 
1107  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1108 
1109 #ifdef NODEGROUP_FORCE_REGISTER
1110  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
1111  // opens boxes to make sure NAMD won't complain
1112  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1113  int j = rankPatches[CkMyRank()][i];
1114  patches[j].positionBox->open();
1115  }
1116  if(masterPe == CkMyPe()) {
1117  // we need to open boxes here...
1118  if(params->CUDASOAintegrate){
1119  if(!atomsChanged) this->launchWork();
1120  }
1121  else computeMgr->sendLaunchWork(masterPe, this);
1122  }
1123  }
1124  else{
1125 #endif
1126  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1127  openBox(rankPatches[CkMyRank()][i]);
1128  }
1129  bool done = false;
1130  CmiLock(lock);
1131  patchesCounter -= rankPatches[CkMyRank()].size();
1132  if (patchesCounter == 0) {
1133  patchesCounter = getNumPatches();
1134  done = true;
1135  }
1136  CmiUnlock(lock);
1137  if (done) {
1138  if(params->CUDASOAintegrate){
1139  if(!atomsChanged) this->launchWork();
1140  }
1141  else computeMgr->sendLaunchWork(masterPe, this);
1142  }
1143 #ifdef NODEGROUP_FORCE_REGISTER
1144  }
1145 #endif
1146  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1147 }
1148 
1149 int CudaComputeNonbonded::noWork() {
1150  // Simply enqueue doWork on masterPe and return "no work"
1151  computeMgr->sendMessageEnqueueWork(masterPe, this);
1152  return 1;
1153 }
1154 
1155 void CudaComputeNonbonded::reallocateArrays() {
1156  cudaCheck(cudaSetDevice(deviceID));
1157  SimParameters *simParams = Node::Object()->simParameters;
1158 
1159  // Re-allocate atoms
1160  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
1161 
1162  // Re-allocate forces
1163  if (doStreaming) {
1164  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
1165  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
1166  } else {
1167  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
1168  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
1169  }
1170  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
1171  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);
1172  nonbondedKernel.reallocate_forceSOA(atomStorageSize);
1173 
1174  if (simParams->GBISOn) {
1175  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
1176  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
1177  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
1178  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
1179  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
1180  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
1181  }
1182 }
1183 
1184 void CudaComputeNonbonded::doWork() {
1185  if (CkMyPe() != masterPe)
1186  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
1187 
1188  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
1189  // atomsChangedIn can be set to true by any Pe
1190  // atomsChanged can only be set by masterPe
1191  // This use of two variables makes sure we don't have a race condition;
1192  // it seems like it's important to have the masterPe call doWork() first
1193  atomsChanged = atomsChangedIn;
1194  atomsChangedIn = false;
1195 
1196  SimParameters *simParams = Node::Object()->simParameters;
1197 
1198  if (patches.size() == 0) return; // No work to do
1199 
1200  // Take the flags from the first patch on this Pe
1201  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
1202  // these flags are probably wrong.
1203  Flags &flags = patches[0].patch->flags;
1204 
1205  doSlow = flags.doFullElectrostatics;
1206  doEnergy = flags.doEnergy;
1207  doVirial = flags.doVirial;
1208  doAlch = simParams->alchOn;
1209  doMinimize = flags.doMinimize;
1210 
1211  if (flags.doNonbonded) {
1212 
1213  if (simParams->GBISOn) {
1214  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
1215  }
1216 
1217  if (!simParams->GBISOn || gbisPhase == 1) {
1218  if ( computesChanged ) {
1219  updateComputes();
1220  }
1221  if (atomsChanged) {
1222  // Re-calculate patch atom numbers and storage
1223  updatePatches();
1224  reSortDone = false;
1225  }
1226  reallocateArrays();
1227 #ifdef NODEGROUP_FORCE_REGISTER
1228  if (simParams->CUDASOAintegrate && simParams->useDeviceMigration && atomsChanged) {
1229  tileListKernel.prepareBuffers(atomStorageSize, patches.size(), cudaPatches, stream);
1230  updatePatchRecord();
1231  }
1232 #endif // NODEGROUP_FORCE_REGISTER
1233  }
1234 
1235  // Open boxes on Pes and launch work to masterPe
1236  if(params->CUDASOAintegrate){
1237  if(!atomsChanged) this->openBoxesOnPe();
1238  }
1239  else computeMgr->sendOpenBoxesOnPe(pes, this);
1240 
1241  } else {
1242  // No work to do, skip
1243  skip();
1244  }
1245 
1246 }
1247 
1248 void CudaComputeNonbonded::launchWork() {
1249  if (CkMyPe() != masterPe)
1250  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
1251 
1252  beforeForceCompute = CkWallTimer();
1253  cudaCheck(cudaSetDevice(deviceID));
1254  SimParameters *simParams = Node::Object()->simParameters;
1255 
1256  // So, it seems like PEs are invoking the same object; however, patches[i] is borked on the masterPe
1257 
1258  // When I get here, it seems like compAtoms are not set for all Pes? How can that be?
1259 
1260  //execute only during GBIS phase 1, or if not using GBIS
1261  if (!simParams->GBISOn || gbisPhase == 1) {
1262 
1263  if ( atomsChanged || computesChanged ) {
1264  // Invalidate pair lists
1265  pairlistsValid = false;
1266  pairlistTolerance = 0.0f;
1267  }
1268 
1269  // Get maximum atom movement and patch tolerance
1270  float maxAtomMovement = 0.0f;
1271  float maxPatchTolerance = 0.0f;
1272  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
1273  // Update pair-list cutoff
1274  Flags &flags = patches[0].patch->flags;
1275  savePairlists = false;
1276  usePairlists = false;
1277  if ( flags.savePairlists ) {
1278  savePairlists = true;
1279  usePairlists = true;
1280  } else if ( flags.usePairlists ) {
1281  if ( ! pairlistsValid || ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1282  SubmitReduction *reduction = getCurrentReduction();
1283  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1284  } else {
1285  usePairlists = true;
1286  }
1287  }
1288  if ( ! usePairlists ) {
1289  pairlistsValid = false;
1290  }
1291  float plcutoff = cutoff;
1292  if ( savePairlists ) {
1293  pairlistsValid = true;
1294  pairlistTolerance = 2. * maxPatchTolerance;
1295  plcutoff += pairlistTolerance;
1296  }
1297  plcutoff2 = plcutoff * plcutoff;
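 // Pairlist bookkeeping: when the lists are (re)built, the list cutoff is
 // extended by pairlistTolerance = 2 * maxPatchTolerance. On later steps the
 // saved lists remain valid as long as 2 * maxAtomMovement <= pairlistTolerance
 // (checked above); otherwise a pairlist warning is counted and the lists
 // are not reused.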
1298 
1299  // fprintf(stderr, "STEP[%d] plcutoff = %f listTolerance = %f save = %d maxPatchTolerance = %f maxAtomMovement = %f plvalid = %d flags.use = %d use = %d\n",
1300  // flags.step, plcutoff, pairlistTolerance, savePairlists, maxPatchTolerance, maxAtomMovement, pairlistsValid, flags.usePairlists, usePairlists);
1301  if(savePairlists || !usePairlists){
1302  reSortDone = false; // Ensures pairlist resorting if doPairlist
1303  }
1304 
1305  // if (atomsChanged)
1306  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1307  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1308 
1309  } // if (!simParams->GBISOn || gbisPhase == 1)
1310 
1311  // Calculate PME & VdW forces
1312  if (!simParams->GBISOn || gbisPhase == 1) {
1313  doForce();
1314  if (doStreaming) {
1315  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1316  patchReadyQueueLen = tileListKernel.getNumPatches();
1317  patchReadyQueueNext = 0;
1318  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1319  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1320  int* emptyPatches = tileListKernel.getEmptyPatches();
1321  for (int i=0;i < numEmptyPatches;i++) {
1322  PatchRecord &pr = patches[emptyPatches[i]];
1323  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1324  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1325  patchReadyQueue[i] = emptyPatches[i];
1326  }
1327  if (patchReadyQueueLen != patches.size())
1328  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1329  }
1330  }
1331 
1332  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1333  // before calling doGBISphase1().
1334  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1335  // In this code path doGBISphase1() is called in forceDone()
1336  forceDoneSetCallback();
1337  return;
1338  }
1339 
1340  // GBIS Phases
1341  if (simParams->GBISOn) {
1342  if (gbisPhase == 1) {
1343  doGBISphase1();
1344  } else if (gbisPhase == 2) {
1345  doGBISphase2();
1346  } else if (gbisPhase == 3) {
1347  doGBISphase3();
1348  }
1349  }
1350 
1351  // Copy forces to host
1352  if (!simParams->GBISOn || gbisPhase == 3) {
1353  if (!doStreaming) {
1354 #ifdef NODEGROUP_FORCE_REGISTER
1355  if(!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration)){
1356  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1357  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1358  }
1359 #else
1360  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1361  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1362 #endif
1363 
1364  }
1365  }
1366 
1367  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1368 
1369  NAMD_EVENT_START(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1370  // For GBIS, energies are ready after phase 2
1371  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1372  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1373  d_forces, d_forcesSlow, d_virialEnergy, stream);
1374  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1375 
1376  NAMD_EVENT_STOP(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1377 
1378  }
1379 
1380  if(simParams->CUDASOAintegrate && ((savePairlists || !usePairlists)) && !atomsChanged) reSortTileLists();
1381 
1382  // Setup call back
1383  forceDoneSetCallback();
1384 
1385 #if 0
1386  cudaCheck(cudaStreamSynchronize(stream));
1387  PatchMap *map = PatchMap::Object();
1388  HomePatchElem *elem;
1389  for(elem = map->homePatchList()->begin(); elem != map->homePatchList()->end(); elem++){
1390  if(elem->patch->getPatchID() == 7) break;
1391  }
1392  if(elem->patch->flags.step == 11){
1393  // it would be good to know from which patch these atoms are...
1394  fprintf(stderr, "CudaNonbonded data\n");
1395  for(int i = 0 ; i < atomStorageSize; i++){
1396  fprintf(stderr, "pos[%d] = %lf, %lf, %lf, %lf | (%f %f %f) (%f %f %f) \n",
1397  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q,
1398  // for some reason, we needed to set the positions
1399  h_forces[i].x, h_forces[i].y, h_forces[i].z,
1400  h_forcesSlow[i].x, h_forcesSlow[i].y, h_forcesSlow[i].z);
1401  }
1402  }
1403 #endif
1404 
1405 }
1406 
1407 //
1408 // GBIS Phase 1
1409 //
1410 void CudaComputeNonbonded::doGBISphase1() {
1411  cudaCheck(cudaSetDevice(deviceID));
1412 
1413  if (atomsChanged) {
1414  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
1415  }
1416 
1417  SimParameters *simParams = Node::Object()->simParameters;
1418  Lattice lattice = patches[0].patch->flags.lattice;
1419 
1420  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1421  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1422  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1423 
1424  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
1425  lata, latb, latc,
1426  simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream);
1427 }
1428 
1429 //
1430 // GBIS Phase 2
1431 //
1432 void CudaComputeNonbonded::doGBISphase2() {
1433  cudaCheck(cudaSetDevice(deviceID));
1434 
1435  SimParameters *simParams = Node::Object()->simParameters;
1436  Lattice lattice = patches[0].patch->flags.lattice;
1437 
1438  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1439  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1440  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1441 
1442  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
1443 
1444  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
1445  doEnergy, doSlow,
1446  lata, latb, latc,
1447  simParams->cutoff, simParams->nonbondedScaling, simParams->kappa,
1448  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1449  simParams->dielectric, simParams->solvent_dielectric,
1450  d_forces, dEdaSumH, stream);
1451 }
1452 
1453 //
1454 // GBIS Phase 3
1455 //
1456 void CudaComputeNonbonded::doGBISphase3() {
1457  cudaCheck(cudaSetDevice(deviceID));
1458  SimParameters *simParams = Node::Object()->simParameters;
1459  Lattice lattice = patches[0].patch->flags.lattice;
1460 
1461  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1462  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1463  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1464 
1465  if (doSlow) {
1466  GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream);
1467 
1468  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
1469  lata, latb, latc,
1470  simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream);
1471  }
1472 }
1473 
1474 //
1475 // Calculate electrostatic & VdW forces
1476 //
1477 void CudaComputeNonbonded::doForce() {
1478  cudaCheck(cudaSetDevice(deviceID));
1479  SimParameters *simParams = Node::Object()->simParameters;
1480  // XXX TODO: This will not work if the patch flags are not correctly set
1481  Lattice lattice = patches[0].patch->flags.lattice;
1482  bool CUDASOAintegrator = simParams->CUDASOAintegrate;
1483  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1484  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1485  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1486  bool doPairlist = savePairlists || (!usePairlists);
1487  bool doFEP=false, doTI=false, doAlchVdwForceSwitching=false;
1488  if(doAlch){
1489  static thread_local bool firsttime = true;
1490  doTI = simParams->alchThermIntOn;
1491  doFEP = simParams->alchFepOn;
1492  doAlchVdwForceSwitching = simParams->vdwForceSwitching;
1493  // Update the alchemical flags on the first call; otherwise, only when the lambda window changes.
1494  // getCurrentLambda and getCurrentLambda2 are assumed to have no side effects.
1495  // use float here to match the type of CudaAlchLambdas
1496  const decltype(alchFlags.lambdaUp) currentLambda = simParams->getCurrentLambda(patches[0].patch->flags.step);
1497  const decltype(alchFlags.lambda2Up) currentLambda2 = simParams->getCurrentLambda2(patches[0].patch->flags.step);
1498  if (firsttime) {
1499  // Update the alchemical flags if this is the first time
1500  firsttime = false;
1501  lambdaWindowUpdated = true;
1502  } else {
1503  // Compare the above parameters with respect to the saved parameters.
1504  if (alchFlags.lambdaUp != currentLambda ||
1505  alchFlags.lambda2Up != currentLambda2 ||
1506  // Could the following parameters also be changed?
1507  // I am not quite sure, but checking them on the CPU is not computationally expensive anyway.
1508  alchFlags.cutoff2 != ComputeNonbondedUtil::cutoff2 ||
1512  alchFlags.scaling != ComputeNonbondedUtil::scaling) {
1513  lambdaWindowUpdated = true;
1514  } else {
1515  lambdaWindowUpdated = false;
1516  }
1517  }
1518  if (lambdaWindowUpdated) {
1519  // Flags that are independent of the number of steps
1525  const double factor = alchFlags.cutoff2 - alchFlags.switchdist2;
1526  // When switching is off, cutoff is the same as switchdist,
1527  // so we need to check it to avoid passing inf for the computation of switchmul and switchmul2
1528  alchFlags.switchfactor = simParams->switchingActive ? 1.0/(factor*factor*factor) : 0;
1529  // Step-dependent parameters (lambdas)
1530  // alchFlags.alchLambda is redundant because we have lambdaUp already.
1531  // alchFlags.alchLambda = currentLambda;
1532  alchFlags.lambdaUp = currentLambda;
1533  alchFlags.lambdaDown = 1.0 - alchFlags.lambdaUp;
1534  alchFlags.elecLambdaUp = simParams->getElecLambda(alchFlags.lambdaUp);
1535  alchFlags.elecLambdaDown = simParams->getElecLambda(alchFlags.lambdaDown);
1536  alchFlags.vdwLambdaUp = simParams->getVdwLambda(alchFlags.lambdaUp);
1537  alchFlags.vdwLambdaDown = simParams->getVdwLambda(alchFlags.lambdaDown);
1538  alchFlags.lambda2Up = currentLambda2;
1539  alchFlags.lambda2Down = 1.0 - alchFlags.lambda2Up;
1540  alchFlags.elecLambda2Up = simParams->getElecLambda(alchFlags.lambda2Up);
1541  alchFlags.elecLambda2Down = simParams->getElecLambda(alchFlags.lambda2Down);
1542  alchFlags.vdwLambda2Up = simParams->getVdwLambda(alchFlags.lambda2Up);
1543  alchFlags.vdwLambda2Down = simParams->getVdwLambda(alchFlags.lambda2Down);
1544  alchFlags.vdwShiftUp = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambdaUp);
1545  alchFlags.vdwShift2Up = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambda2Up);
1546  alchFlags.vdwShiftDown = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambdaDown);
1547  alchFlags.vdwShift2Down = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambda2Down);
1548  }
1549  }
1550 
1551  if (doPairlist) {
1552  int numTileLists = calcNumTileLists();
1553 
1554  // Build initial tile lists and sort
1555  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
1556  maxTileListLen, lata, latb, latc,
1557  cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream, atomsChanged, doAlch, CUDASOAintegrator, simParams->useDeviceMigration);
1558  // Prepare tile list for atom-based refinement
1559  tileListKernel.prepareTileList(stream);
1560  tileListKernel.clearTileListStat(stream);
1561  }
1562 
1563  if (atomsChanged) {
1564  // Update Vdw types and exclusion index & maxdiff
1565  updateVdwTypesExcl();
1566  }
1567 
1568  beforeForceCompute = CkWallTimer();
1569 
1570  // Calculate forces (and refine tile list if atomsChanged=true)
1571 #if 0
1572  if(atomsChanged){
1573  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1574  PatchData *patchData = cpdata.ckLocalBranch();
1575 
1576  CmiLock(patchData->printlock);
1577  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
1578  for (int p = 0; p < patches.size(); p++) {
1579  fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
1580  for (int i = 0; i < patches[p].numAtoms; i++) {
1581  const int ai = i + patches[p].atomStart;
1582  fprintf(stderr, "POS[%d,%d,%d] = %lf %lf %lf %lf. Type %d\n", i, ai, atomIndex[ai],
1583  atoms[ai].x, atoms[ai].y, atoms[ai].z, atoms[ai].q, vdwTypes[ai]);
1584  }
1585  }
1586  CmiUnlock(patchData->printlock);
1587  }
1588 #endif
1589 
1590  const bool doTable = CudaComputeNonbonded::getDoTable(params, doSlow, doVirial);
 1591  const CudaNBConstants c = getNonbondedCoef(params);
 1592 
1593 #ifdef DEBUG_MINIMIZE
1594  printf("%s, line %d:\n", __FILE__, __LINE__);
1595  printf(" atomsChanged = %d\n", atomsChanged);
1596  printf(" doMinimize = %d\n", doMinimize);
1597  printf(" doPairlist = %d\n", doPairlist);
1598  printf(" doEnergy = %d\n", doEnergy);
1599  printf(" doVirial = %d\n", doVirial);
1600  printf(" doSlow = %d\n", doSlow);
1601  printf("\n");
1602 #endif
1603 
1604  float drudeNbTholeCut2;
1605  if (doNbThole) {
1606  drudeNbTholeCut2 = simParams->drudeNbtholeCut * simParams->drudeNbtholeCut;
1607  }
1608 
1609  nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize,
1610  atomsChanged, doMinimize, doPairlist, doEnergy, doVirial,
1611  doSlow, doAlch, doAlchVdwForceSwitching, doFEP, doTI,
1612  doNbThole, doTable, lata, latb, latc,
1613  (const float4*)atoms, cutoff2, c,
1614  d_forces, d_forcesSlow, h_forces, h_forcesSlow,
1615  &alchFlags, lambdaWindowUpdated, part,
1616  CUDASOAintegrator, params->useDeviceMigration,
1617  drudeNbTholeCut2,
1618  stream);
1619 
1620  if (doPairlist) {
1621  tileListKernel.finishTileList(stream);
1622  }
1623 
1624 // TODO remove once GPU migration has been merged
1625 #ifdef NODEGROUP_FORCE_REGISTER
1626 
1627  updatePatchRecord();
1628 
1629 #endif
1630 
1631 
1632 #if 0
1633  if(atomsChanged){
1634 
1635  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1636  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1637  cudaStreamSynchronize(stream);
1638  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1639  PatchData *patchData = cpdata.ckLocalBranch();
1640 
1641  CmiLock(patchData->printlock);
1642  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
1643  for (int p = 0; p < patches.size(); p++) {
1644  fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
1645  for (int i = 0; i < patches[p].numAtoms; i++) {
1646  const int ai = i + patches[p].atomStart;
1647  fprintf(stderr, "POS[%d,%d,%d] = Type %d (%lf %lf %lf) (%lf %lf %lf)\n", i, ai, atomIndex[ai],
1648  vdwTypes[ai],
1649  h_forces[ai].x, h_forces[ai].y, h_forces[ai].z,
1650  h_forcesSlow[ai].x, h_forcesSlow[ai].y, h_forcesSlow[ai].z);
1651  }
1652  }
1653  CmiUnlock(patchData->printlock);
1654  }
1655 #endif
1656 
1657 
1658 
1659 
1660  traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
1661 }
1662 
1663 #ifdef NODEGROUP_FORCE_REGISTER
1664 void CudaComputeNonbonded::updatePatchRecord() {
1665  // register device pointers inside nodegroup for later integration
 1666  // these could be moved inside the atomsChanged branch later
1667  int devInd = deviceCUDA->getDeviceIndex();
1668  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1669  PatchData *patchData = cpdata.ckLocalBranch();
1670  PatchMap* patchMap = PatchMap::Object();
1671  patchData->devData[devInd].f_nbond = d_forces;
1672  patchData->devData[devInd].f_nbond_slow = d_forcesSlow;
1673  patchData->devData[devInd].f_nbond_size = atomStorageSize;
1674  // device pointer to CudaPatchRecord
1675  patchData->devData[devInd].nbond_precord = tileListKernel.getCudaPatches();
1676  patchData->devData[devInd].nb_precord_size = tileListKernel.getCudaPatchesSize();
1677  patchData->devData[devInd].nb_datoms = tileListKernel.get_xyzq();
1678  patchData->devData[devInd].nbond_tkernel = &tileListKernel;
1679  patchData->devData[devInd].size_nb_datoms = atomStorageSize;
1680 }
1681 #endif
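// The pointers registered in updatePatchRecord() are how the GPU-resident
// (CUDASOAintegrate) integration path finds this device's nonbonded forces and
// sorted atom buffer through PatchData::devData later in the step, matching the
// "register device pointers inside nodegroup for later integration" note above.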
1682 
1683 //
 1684 // Compute an upper estimate of the number of tile lists
1685 //
1686 int CudaComputeNonbonded::calcNumTileLists() {
1687  int numTileLists = 0;
1688  for (int i=0;i < computes.size();i++) {
1689  int pi1 = computes[i].patchInd[0];
1690  int numAtoms1 = patches[pi1].numAtoms;
1691 #ifdef NAMD_HIP
1693 #else
1694  int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(numAtoms1, WARPSIZE);
1695 #endif
1696  numTileLists += numTiles1;
1697  }
1698  return numTileLists;
1699 }
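// computeNumTiles() is effectively ceil(numAtoms / WARPSIZE), so with
// WARPSIZE = 32 a compute whose first patch holds 100 atoms contributes
// ceil(100/32) = 4 tile lists to the estimate. The total is an upper bound;
// tiles that turn out to be empty or beyond the cutoff are culled when the
// lists are refined on the GPU.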
1700 
1701 //
1702 // Finish & submit reductions
1703 //
 1704 void CudaComputeNonbonded::finishReductions() {
 1705  if (CkMyPe() != masterPe)
1706  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1707 
1708  SubmitReduction *reduction = getCurrentReduction();
1709 
1710  // fprintf(stderr, "PE[%d]: Nbond finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1711  if (!doSkip) {
1712 
1713  if (doStreaming && (doVirial || doEnergy)) {
1714  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1715  if (!forceDoneEventRecord)
1716  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1717  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1718  forceDoneEventRecord = false;
1719  }
1720 
1721  if (doVirial) {
1722  // if(params->CUDASOAintegrate) cudaCheck(cudaStreamSynchronize(stream));
1723  Tensor virialTensor;
1724  virialTensor.xx = h_virialEnergy->virial[0];
1725  virialTensor.xy = h_virialEnergy->virial[1];
1726  virialTensor.xz = h_virialEnergy->virial[2];
1727  virialTensor.yx = h_virialEnergy->virial[3];
1728  virialTensor.yy = h_virialEnergy->virial[4];
1729  virialTensor.yz = h_virialEnergy->virial[5];
1730  virialTensor.zx = h_virialEnergy->virial[6];
1731  virialTensor.zy = h_virialEnergy->virial[7];
1732  virialTensor.zz = h_virialEnergy->virial[8];
1733  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1734  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1735  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1736  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1737  if (doSlow) {
1738  Tensor virialTensor;
1739  virialTensor.xx = h_virialEnergy->virialSlow[0];
1740  virialTensor.xy = h_virialEnergy->virialSlow[1];
1741  virialTensor.xz = h_virialEnergy->virialSlow[2];
1742  virialTensor.yx = h_virialEnergy->virialSlow[3];
1743  virialTensor.yy = h_virialEnergy->virialSlow[4];
1744  virialTensor.yz = h_virialEnergy->virialSlow[5];
1745  virialTensor.zx = h_virialEnergy->virialSlow[6];
1746  virialTensor.zy = h_virialEnergy->virialSlow[7];
1747  virialTensor.zz = h_virialEnergy->virialSlow[8];
1748  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1749  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1750  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1751  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1752  }
1753  }
1754  if (doEnergy) {
1755  // if (doSlow)
1756 
1757  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1758  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1759  reduction->item(REDUCTION_LJ_ENERGY_F) += h_virialEnergy->energyVdw_s;
1760  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((params->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1761  reduction->item(REDUCTION_ELECT_ENERGY_F) += h_virialEnergy->energyElec_s;
1762 
1763  //Reduce values for TI
1764  reduction->item(REDUCTION_LJ_ENERGY_TI_1) += h_virialEnergy->energyVdw_ti_1;
1765  reduction->item(REDUCTION_LJ_ENERGY_TI_2) += h_virialEnergy->energyVdw_ti_2;
1766  reduction->item(REDUCTION_ELECT_ENERGY_TI_1) += h_virialEnergy->energyElec_ti_1;
1767  reduction->item(REDUCTION_ELECT_ENERGY_TI_2) += h_virialEnergy->energyElec_ti_2;
1768 
1769  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1770  if (doSlow){
1771  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1772  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += h_virialEnergy->energySlow_s;
1773  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += h_virialEnergy->energySlow_ti_1;
1774  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += h_virialEnergy->energySlow_ti_2;
1775  //fprintf(stderr, "NB h_virialEnergy->energySlow %lf\n", h_virialEnergy->energySlow);
1776  }
1777  }
1778 
1779  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1780  }
1781  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1782 
1783  reduction->submit();
1784  // Reset flags
1785  doSkip = false;
1786  computesChanged = false;
1787 }
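// The 9-element virial arrays above are laid out row-major
// (xx xy xz, yx yy yz, zx zy zz). The repeated unpacking could be factored
// into a small helper along these lines (sketch only, not existing code):
//
//   static Tensor tensorFromVirial(const double v[9]) {
//     Tensor t;
//     t.xx = v[0]; t.xy = v[1]; t.xz = v[2];
//     t.yx = v[3]; t.yy = v[4]; t.yz = v[5];
//     t.zx = v[6]; t.zy = v[7]; t.zz = v[8];
//     return t;
//   }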
1788 
1789 //
1790 // Finish a single patch
1791 //
1792 void CudaComputeNonbonded::finishPatch(int i) {
1793  if (CkMyPe() != patches[i].pe)
1794  NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
1795 
1796  PatchMap *map;
1797  PatchRecord &pr = patches[i];
1798  pr.results = pr.forceBox->open();
1800 
1801  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1802  int atomStart = pr.atomStart;
1803  int numAtoms = pr.numAtoms;
1804 #ifdef NODEGROUP_FORCE_REGISTER
1805  if (numAtoms > 0 && (!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration))) {
1806  Force *f = pr.results->f[Results::nbond];
1807  Force *f_slow = pr.results->f[Results::slow];
1808  float4 *af = h_forces + atomStart;
1809  float4 *af_slow = h_forcesSlow + atomStart;
1810  // float maxf = 0.0f;
1811  // int maxf_k;
1812  for ( int k=0; k<numAtoms; ++k ) {
1813  int j = aExt[k].sortOrder;
1814 #ifdef DEBUG_MINIMIZE
1815  if (j == 0) {
1816  printf("%s, line %d\n", __FILE__, __LINE__);
1817  printf(" before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1818  }
1819 #endif
1820  f[j].x += af[k].x;
1821  f[j].y += af[k].y;
1822  f[j].z += af[k].z;
1823 #ifdef DEBUG_MINIMIZE
1824  if (j == 0) {
1825  printf(" after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1826  }
1827 #endif
1828  // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
1829  // maxf = std::max(maxf, fabsf(af[k].x));
1830  // maxf = std::max(maxf, fabsf(af[k].y));
1831  // maxf = std::max(maxf, fabsf(af[k].z));
1832  // maxf_k = k;
1833  // }
1834  if ( doSlow ) {
1835  f_slow[j].x += af_slow[k].x;
1836  f_slow[j].y += af_slow[k].y;
1837  f_slow[j].z += af_slow[k].z;
1838  }
1839  }
1840  // if (maxf > 10000.0f) {
1841  // fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
1842  // cudaCheck(cudaStreamSynchronize(stream));
1843  // NAMD_die("maxf!");
1844  // }
1845  }
1846 #else
1847  if (numAtoms > 0) {
1848  Force *f = pr.results->f[Results::nbond];
1849  Force *f_slow = pr.results->f[Results::slow];
1850  float4 *af = h_forces + atomStart;
1851  float4 *af_slow = h_forcesSlow + atomStart;
1852  // float maxf = 0.0f;
1853  // int maxf_k;
1854  for ( int k=0; k<numAtoms; ++k ) {
1855  int j = aExt[k].sortOrder;
1856 #ifdef DEBUG_MINIMIZE
1857  if (j == 0) {
1858  printf("%s, line %d\n", __FILE__, __LINE__);
1859  printf(" before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1860  }
1861 #endif
1862  f[j].x += af[k].x;
1863  f[j].y += af[k].y;
1864  f[j].z += af[k].z;
1865 #ifdef DEBUG_MINIMIZE
1866  if (j == 0) {
1867  printf(" after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1868  }
1869 #endif
1870  // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
1871  // maxf = std::max(maxf, fabsf(af[k].x));
1872  // maxf = std::max(maxf, fabsf(af[k].y));
1873  // maxf = std::max(maxf, fabsf(af[k].z));
1874  // maxf_k = k;
1875  // }
1876  if ( doSlow ) {
1877  f_slow[j].x += af_slow[k].x;
1878  f_slow[j].y += af_slow[k].y;
1879  f_slow[j].z += af_slow[k].z;
1880  }
1881  }
1882  // if (maxf > 10000.0f) {
1883  // fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
1884  // cudaCheck(cudaStreamSynchronize(stream));
1885  // NAMD_die("maxf!");
1886  // }
1887  }
1888 #endif
1889  // should I skip the close()?
1890  // do I need to close it even if there's a migration?
1891  if(!simParams->CUDASOAintegrate || atomsChanged){
1892  pr.positionBox->close(&(pr.compAtom));
1893  pr.forceBox->close(&(pr.results));
1894  }
1895 }
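// Note: the GPU stores forces in its own sorted atom order, so af[k] and
// af_slow[k] above are scattered back into the patch's CompAtom order through
// aExt[k].sortOrder before the force box is closed.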
1896 
1897 //
1898 // Finish a set of patches on this pe
1899 //
1900 void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
1901  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
1902  if (patchSet.size() == 0)
1903  NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
1905  // Save value of gbisPhase here because it can change after the last finishGBISPhase() or finishPatch() is called
1906  int gbisPhaseSave = gbisPhase;
1907  // Close Boxes depending on Phase
1908  if (simParams->GBISOn) {
1909  for (int i=0;i < patchSet.size();i++) {
1910  finishGBISPhase(patchSet[i]);
1911  }
1912  }
1913  // Finish patches
1914  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1915  for (int i=0;i < patchSet.size();i++) {
1916  finishPatch(patchSet[i]);
1917  }
1918  }
1919  bool done = false;
1920  CmiLock(lock);
1921  patchesCounter -= patchSet.size();
1922  if(params->CUDASOAintegrate && !atomsChanged){
 1923  // masterPe is executing this, so we can go ahead and do the
 1924  // reductions directly. For migration steps, however, we still need to
 1925  // follow the usual code path because of the box setup.
1926  patchesCounter = 0;
1927  }
1928  if (patchesCounter == 0) {
1929  patchesCounter = getNumPatches();
1930  done = true;
1931  }
1932  CmiUnlock(lock);
1933  if (done) {
1934  // Do reductions
1935  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1936  // Reduction must be done on masterPe
1937  if(params->CUDASOAintegrate ){
1938  if(!atomsChanged) this->finishReductions();
1939  }
1940  else computeMgr->sendFinishReductions(masterPe, this);
1941  }
1942  }
1943 
1944  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
1945 
1946 }
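// The lock-protected patchesCounter above acts as a countdown over all patches
// of this compute: each call subtracts the patches it just finished, and the
// caller that drives the counter to zero re-arms it and triggers the reduction
// submission. In GPU-resident mode the counter is simply forced to zero since
// masterPe handles everything in one pass.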
1947 
1948 //
1949 // Finish all patches that are on this pe
1950 //
 1951 void CudaComputeNonbonded::finishPatchesOnPe() {
 1952  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1953 }
1954 
1955 //
1956 // Finish single patch on this pe
1957 //
 1958 void CudaComputeNonbonded::finishPatchOnPe(int i) {
 1959  std::vector<int> v(1, i);
1960  finishSetOfPatchesOnPe(v);
1961 }
1962 
 1963 void CudaComputeNonbonded::finishPatches() {
 1964  if(params->CUDASOAintegrate){
1965  if (atomsChanged || doEnergy || doVirial) cudaCheck(cudaStreamSynchronize(stream));
1966  this->finishPatchesOnPe();
1967  }
1968  else {
1969  computeMgr->sendFinishPatchesOnPe(pes, this);
1970  }
1971 }
1972 
1973 void CudaComputeNonbonded::finishGBISPhase(int i) {
1974  if (CkMyPe() != patches[i].pe)
1975  NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
1976  PatchRecord &pr = patches[i];
1977  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1978  int atomStart = pr.atomStart;
1979  if (gbisPhase == 1) {
1980  GBReal *psiSumMaster = psiSumH + atomStart;
1981  for ( int k=0; k<pr.numAtoms; ++k ) {
1982  int j = aExt[k].sortOrder;
1983  pr.psiSum[j] += psiSumMaster[k];
1984  }
1985  pr.psiSumBox->close(&(pr.psiSum));
1986  } else if (gbisPhase == 2) {
1987  GBReal *dEdaSumMaster = dEdaSumH + atomStart;
1988  for ( int k=0; k<pr.numAtoms; ++k ) {
1989  int j = aExt[k].sortOrder;
1990  pr.dEdaSum[j] += dEdaSumMaster[k];
1991  }
1992  pr.dEdaSumBox->close(&(pr.dEdaSum));
1993  } else if (gbisPhase == 3) {
1994  pr.intRadBox->close(&(pr.intRad)); //box 6
1995  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1996  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1997  } //end phases
1998 }
1999 
2000 void CudaComputeNonbonded::finishTimers() {
 2001  SimParameters *simParams = Node::Object()->simParameters;
 2002 
2003  if (simParams->GBISOn) {
2004  if (gbisPhase == 1)
2005  traceUserBracketEvent(CUDA_GBIS1_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
2006  if (gbisPhase == 2)
2007  traceUserBracketEvent(CUDA_GBIS2_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
2008  if (gbisPhase == 3)
2009  traceUserBracketEvent(CUDA_GBIS3_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
2010  } else {
2011  traceUserBracketEvent(CUDA_NONBONDED_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
2012  }
2013 }
2014 
2015 //
 2016 // Re-sort tile lists if necessary
2017 //
 2018 void CudaComputeNonbonded::reSortTileLists() {
 2019  // Re-sort tile lists
 2020  SimParameters *simParams = Node::Object()->simParameters;
 2021  cudaCheck(cudaSetDevice(deviceID));
2022 #ifdef NAMD_HIP
2023  tileListKernel.reSortTileLists(simParams->GBISOn, simParams->CUDASOAintegrateMode, stream);
2024 #else
2025  tileListKernel.reSortTileLists(simParams->GBISOn, stream);
2026 #endif
2027 }
2028 
2029 void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
 2030  CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;
 2031 
2032  if (CkMyPe() != c->masterPe)
2033  NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
2034 
2036  cudaCheck(cudaSetDevice(c->deviceID));
2037 
2038  if (c->doStreaming) {
2039  int patchInd;
2040  while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
2041  c->patchReadyQueue[c->patchReadyQueueNext] = -1;
2042  c->patchReadyQueueNext++;
2043  c->checkCount = 0;
2044 
2045  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
2046  c->finishTimers();
2047  // TODO: Figure out why I cannot check c->atomsChanged in case of GBIS
2048  if (!simParams->GBISOn) {
2049  if (c->atomsChanged && !c->reSortDone) {
2050  c->reSortTileLists();
2051  c->reSortDone = true;
2052  }
2053  } else {
2054  if ( (c->savePairlists || !(c->usePairlists)) && (c->gbisPhase == 1) && !c->reSortDone) {
2055  c->reSortTileLists();
2056  c->reSortDone = true;
2057  if (c->gbisPhase == 1) {
2058  // We must do GBIS Phase 1
2059  c->doGBISphase1();
2060  c->forceDoneSetCallback();
2061  return;
2062  }
2063  }
2064  }
2065  }
2066 
2067  // Finish patch
2068  int pe = c->patches[patchInd].pe;
2069  PatchID patchID = c->patches[patchInd].patchID; // for priority
2070  //c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
2071  if(c->params->CUDASOAintegrate) c->finishPatchOnPe(patchInd);
2072  else c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
2073 
2074  // Last patch, return
2075  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
2076 
2077  }
2078  } else {
2079  if (!c->forceDoneEventRecord)
2080  NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
2081  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
2082  if (err == cudaSuccess) {
2083  // Event has occurred
2084  c->forceDoneEventRecord = false;
2085  c->checkCount = 0;
2086  c->finishTimers();
2087  // TODO: Figure out why I cannot check c->atomsChanged in case of GBIS
2088  if (!simParams->GBISOn) {
2089  if (c->atomsChanged && !c->reSortDone) {
2090  c->reSortTileLists();
2091  c->reSortDone = true;
2092  }
2093  } else {
2094  if ( (c->savePairlists || !(c->usePairlists)) && (c->gbisPhase == 1) && !c->reSortDone) {
2095  c->reSortTileLists();
2096  c->reSortDone = true;
2097  if (c->gbisPhase == 1) {
2098  // We must do GBIS Phase 1
2099  c->doGBISphase1();
2100  c->forceDoneSetCallback();
2101  return;
2102  }
2103  }
2104  }
2105  c->finishPatches();
2106  return;
2107  } else if (err != cudaErrorNotReady) {
2108  // Anything else is an error
2109  char errmsg[256];
2110  sprintf(errmsg,"in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
2111  c->checkCount, walltime - c->beforeForceCompute);
2112  cudaDie(errmsg,err);
2113  }
2114  }
2115 
2116  // if (c->checkCount % 1000 == 0)
2117  // fprintf(stderr, "c->patchReadyQueueNext %d\n", c->patchReadyQueueNext);
2118 
2119  // Event has not occurred
2120  c->checkCount++;
2121  if (c->checkCount >= 1000000) {
2122  char errmsg[256];
2123  sprintf(errmsg,"CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
2124  c->checkCount, walltime - c->beforeForceCompute);
2125  cudaDie(errmsg,cudaSuccess);
2126  }
2127 
2128  // Call again
2129  CcdCallBacksReset(0, walltime);
 2130  // this probably only needs to be done for the first timestep
2131 
2132  if(!c->params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, arg, 0.1);
2133 }
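// forceDoneCheck() is the CPU-side completion poll. In streaming mode it drains
// patchReadyQueue and finishes patches as their forces arrive; otherwise it
// queries forceDoneEvent and finishes all patches at once. Unless the
// GPU-resident integrator is active, it re-arms itself every 0.1 ms via
// CcdCallFnAfter() and gives up with an error after 1000000 polls.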
2134 
2135 //
2136 // Set call back for all the work in the stream at this point
2137 //
2138 void CudaComputeNonbonded::forceDoneSetCallback() {
2139  if (CkMyPe() != masterPe)
2140  NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
2141  beforeForceCompute = CkWallTimer();
2142  cudaCheck(cudaSetDevice(deviceID));
2143  if (!doStreaming || doVirial || doEnergy) {
2144  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
2145  forceDoneEventRecord = true;
2146  }
2147  checkCount = 0;
2148  CcdCallBacksReset(0, CmiWallTimer());
2149  // Set the call back at 0.1ms
2150  if(!params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, this, 0.1);
2151 }
2152 
 2153 struct cr_sortop_distance {
 2154  const Lattice &l;
2155  cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
 2156  bool operator() (CudaComputeNonbonded::ComputeRecord i,
 2157  CudaComputeNonbonded::ComputeRecord j) {
 2158  Vector a = l.a();
2159  Vector b = l.b();
2160  Vector c = l.c();
2161  BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
2162  BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
2163  return ( ri < rj );
2164  }
2165 };
2166 
2167 static inline bool sortop_bitreverse(int a, int b) {
2168  if ( a == b ) return 0;
2169  for ( int bit = 1; bit; bit *= 2 ) {
2170  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
2171  }
2172  return 0;
2173 }
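// sortop_bitreverse() orders integers by their bit-reversed value: the lowest
// differing bit decides, so for 3-bit values the order is 0,4,2,6,1,5,3,7.
// Below it is used as a tie-breaker on the PE rank of same-node patches.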
2174 
 2175 struct cr_sortop_reverse_priority {
 2176  cr_sortop_distance &distop;
 2177  const CudaComputeNonbonded::PatchRecord *pr;
 2178  cr_sortop_reverse_priority(cr_sortop_distance &sod,
 2179  const CudaComputeNonbonded::PatchRecord *patchrecs) : distop(sod), pr(patchrecs) { }
2180  bool pid_compare_priority(int2 pidi, int2 pidj) {
2181  const CudaComputeNonbonded::PatchRecord &pri = pr[pidi.y];
2182  const CudaComputeNonbonded::PatchRecord &prj = pr[pidj.y];
2183  if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
2184  if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
2185  if ( pri.isSameNode && ! prj.isSameNode ) return 0;
2186  if ( prj.isSameNode && ! pri.isSameNode ) return 1;
2187  if ( pri.isSameNode ) { // and prj.isSameNode
2188  int rpri = pri.reversePriorityRankInPe;
2189  int rprj = prj.reversePriorityRankInPe;
2190  if ( rpri != rprj ) return rpri > rprj;
2191  return sortop_bitreverse(CkRankOf(pri.pe),CkRankOf(prj.pe));
2192  }
2193  int ppi = PATCH_PRIORITY(pidi.x);
2194  int ppj = PATCH_PRIORITY(pidj.x);
2195  if ( ppi != ppj ) return ppi < ppj;
2196  return pidi.x < pidj.x;
2197  }
 2198  bool operator() (CudaComputeNonbonded::ComputeRecord j,
 2199  CudaComputeNonbonded::ComputeRecord i) { // i and j reversed
2200  // Choose patch i (= patch with greater priority)
2201  int2 pidi = pid_compare_priority(make_int2(i.pid[0], i.patchInd[0]), make_int2(i.pid[1], i.patchInd[1])) ? make_int2(i.pid[0], i.patchInd[0]) : make_int2(i.pid[1], i.patchInd[1]);
2202  // Choose patch j
2203  int2 pidj = pid_compare_priority(make_int2(j.pid[0], j.patchInd[0]), make_int2(j.pid[1], j.patchInd[1])) ? make_int2(j.pid[0], j.patchInd[0]) : make_int2(j.pid[1], j.patchInd[1]);
2204  if ( pidi.x != pidj.x ) return pid_compare_priority(pidi, pidj);
2205  return distop(i,j);
2206  }
2207 };
2208 
2209 //
2210 // Setup computes. This is only done at the beginning and at load balancing, hence the lack of
2211 // consideration for performance in the CPU->GPU memory copy.
2212 //
2213 void CudaComputeNonbonded::updateComputes() {
2214  cudaCheck(cudaSetDevice(deviceID));
2215 
2216  Lattice lattice = patches[0].patch->flags.lattice;
2217  cr_sortop_distance so(lattice);
2218  std::stable_sort(computes.begin(), computes.end(), so);
2219 
2220  if (doStreaming) {
2221  cr_sortop_reverse_priority sorp(so, patches.data());
2222  std::stable_sort(computes.begin(), computes.end(), sorp);
2223  }
2224 
2225  CudaComputeRecord* cudaComputes = new CudaComputeRecord[computes.size()];
2226 
2227  for (int i=0;i < computes.size();i++) {
2228  cudaComputes[i].patchInd.x = computes[i].patchInd[0];
2229  cudaComputes[i].patchInd.y = computes[i].patchInd[1];
2230  cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
2231  cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
2232  cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
2233  }
2234 
2235  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
2236  cudaCheck(cudaStreamSynchronize(stream));
2237 
2238  delete [] cudaComputes;
2239 }
2240 
 2241 struct exlist_sortop {
 2242  bool operator() (int32 *li, int32 *lj) {
2243  return ( li[1] < lj[1] );
2244  }
2245 };
2246 
2247 //
2248 // Builds the exclusions table. Swiped from ComputeNonbondedCUDA.C
2249 //
2250 void CudaComputeNonbonded::buildExclusions() {
2251  cudaCheck(cudaSetDevice(deviceID));
2252 
2254 
2255 #ifdef MEM_OPT_VERSION
2256  int natoms = mol->exclSigPoolSize;
2257 #else
2258  int natoms = mol->numAtoms;
2259 #endif
2260 
2261  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
2262  exclusionsByAtom = new int2[natoms];
2263 
2264  // create unique sorted lists
2265 
2266  ObjectArena<int32> listArena;
2267  ResizeArray<int32*> unique_lists;
2268  int32 **listsByAtom = new int32*[natoms];
2270  for ( int i=0; i<natoms; ++i ) {
2271  curList.resize(0);
2272  curList.add(0); // always excluded from self
2273 #ifdef MEM_OPT_VERSION
2274  const ExclusionSignature *sig = mol->exclSigPool + i;
2275  int n = sig->fullExclCnt;
2276  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
2277  n += 1;
2278 #else
2279  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
2280  int n = mol_list[0] + 1;
2281  for ( int j=1; j<n; ++j ) {
2282  curList.add(mol_list[j] - i);
2283  }
2284 #endif
2285  curList.sort();
2286 
2287  int j;
2288  for ( j=0; j<unique_lists.size(); ++j ) {
2289  if ( n != unique_lists[j][0] ) continue; // no match
2290  int k;
2291  for ( k=0; k<n; ++k ) {
2292  if ( unique_lists[j][k+3] != curList[k] ) break;
2293  }
2294  if ( k == n ) break; // found match
2295  }
2296  if ( j == unique_lists.size() ) { // no match
2297  int32 *list = listArena.getNewArray(n+3);
2298  list[0] = n;
2299  int maxdiff = 0;
2300  maxdiff = -1 * curList[0];
2301  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
2302  list[1] = maxdiff;
2303  for ( int k=0; k<n; ++k ) {
2304  list[k+3] = curList[k];
2305  }
2306  unique_lists.add(list);
2307  }
2308  listsByAtom[i] = unique_lists[j];
2309  }
2310  // sort lists by maxdiff
2311  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
2312  long int totalbits = 0;
2313  int nlists = unique_lists.size();
2314  for ( int j=0; j<nlists; ++j ) {
2315  int32 *list = unique_lists[j];
2316  int maxdiff = list[1];
2317  list[2] = totalbits + maxdiff;
2318  totalbits += 2*maxdiff + 1;
2319  }
2320  for ( int i=0; i<natoms; ++i ) {
2321  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
2322  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
2323  }
2324  delete [] listsByAtom;
2325 
2326  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
2327 
2328  {
2329  long int bytesneeded = totalbits / 8;
2330  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
2331  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
2332  unique_lists.size(), bytesneeded);
2333  }
2334 
2335  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
2336  if ( bytesneeded > bytesavail ) {
2337  char errmsg[512];
2338  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
2339  "but only %ld bytes can be addressed with 32-bit int.",
2340  unique_lists.size(), bytesneeded, bytesavail);
2341  NAMD_die(errmsg);
2342  }
2343  }
2344 
2345 #define SET_EXCL(EXCL,BASE,DIFF) \
2346  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
2347 
2348  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
2349  memset(exclusion_bits, 0, totalbits/8);
2350 
2351  long int base = 0;
2352  for ( int i=0; i<unique_lists.size(); ++i ) {
2353  base += unique_lists[i][1];
2354  if ( unique_lists[i][2] != (int32)base ) {
2355  NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
2356  }
2357  int n = unique_lists[i][0];
2358  for ( int j=0; j<n; ++j ) {
2359  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
2360  }
2361  base += unique_lists[i][1] + 1;
2362  }
2363 
2364  int numExclusions = totalbits/32;
2365 
2366  nonbondedKernel.bindExclusions(numExclusions, exclusion_bits);
2367 
2368 
2370  if(simParams->CUDASOAintegrate && simParams->useDeviceMigration){
2371  nonbondedKernel.setExclusionsByAtom(exclusionsByAtom, natoms);
2372  }
2373 
2374  delete [] exclusion_bits;
2375 }
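// Each unique exclusion list occupies a window of 2*maxdiff+1 bits in one
// shared bit array; exclusionsByAtom[i] records the list's maxdiff (.x) and the
// bit index corresponding to diff == 0 (.y). A pair with diff = index_j - index_i
// is then tested in the nonbonded kernel roughly as (sketch only):
//   if (abs(diff) <= exclusionsByAtom[i].x &&
//       ((exclusion_bits[(exclusionsByAtom[i].y + diff) >> 5]
//          >> ((exclusionsByAtom[i].y + diff) & 31)) & 1)) { /* excluded */ }
// where exclusion_bits is the table bound via bindExclusions() above.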
2376 
 2377 CudaNBConstants CudaComputeNonbonded::getNonbondedCoef(SimParameters *simParams) {
 2378  const float cutoff = ComputeNonbondedUtil::cutoff;
2379  const float cutoff2 = ComputeNonbondedUtil::cutoff2;
2380  const float cutoffInv = 1.0f / cutoff;
2381  const float cutoff2Inv = 1.0f / cutoff2;
2382  const float scutoff = ComputeNonbondedUtil::switchOn;
2383  const float scutoff2 = ComputeNonbondedUtil::switchOn2;
2384  const float scutoff2Inv = 1.0f / scutoff2;
2385  const float scutoff_denom = ComputeNonbondedUtil::c1;
 2386  const float ewaldcof = ComputeNonbondedUtil::ewaldcof;
 2387  const float pi_ewaldcof = ComputeNonbondedUtil::pi_ewaldcof;
 2388  const float slowScale = ((float) simParams->fullElectFrequency) / simParams->nonbondedFrequency;
2389 
2390  CudaNBConstants c;
2391  c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
2392  c.lj_1 = scutoff_denom * 2.0f;
2393  c.lj_2 = scutoff_denom * -12.0f;
2394  c.lj_3 = 12.0f * scutoff_denom * scutoff2;
2395  c.lj_4 = cutoff2;
2396  c.lj_5 = scutoff2;
2397  c.e_0 = cutoff2Inv * cutoffInv;
2398  c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);
2399  c.e_1 = cutoff2Inv;
2400  c.e_2 = cutoffInv;
2401  c.ewald_0 = ewaldcof;
2402  c.ewald_1 = pi_ewaldcof;
2403  c.ewald_2 = ewaldcof * ewaldcof;
2404  c.ewald_3_slow = ewaldcof * ewaldcof * ewaldcof * slowScale;
2405  c.slowScale = slowScale;
2406 
2407  return c;
2408 }
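// These constants let the kernel evaluate the C1-switched Lennard-Jones and the
// Ewald-corrected electrostatics with direct arithmetic instead of the
// interpolation tables. scutoff_denom (ComputeNonbondedUtil::c1) is essentially
// the 1/(cutoff2 - switchOn2)^3 normalization of the switching polynomial, and
// slowScale folds the fullElectFrequency/nonbondedFrequency ratio into the
// slow-path coefficients (e_0_slow, ewald_3_slow).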
2409 
2410 bool CudaComputeNonbonded::getDoTable(SimParameters *simParams, const bool doSlow, const bool doVirial) {
2411  // There is additional logic in SimParameters.C which guards against unsupported force fields
2412  // This should only be used for performance heuristics
2413  bool doTable = simParams->useCUDANonbondedForceTable;
2414 
 2415  // DMC: I found the doSlow case is faster with force tables, so override the setting here
2416  // TODO This should be reevaluated for future architectures
2417  doTable = doTable || doSlow;
 2418  // Direct math does not support the virial+slow combination.
 2419  // This check is redundant with the doSlow case above, but kept explicit for correctness.
2420  doTable = doTable || (doSlow && doVirial);
2421 
2422  return doTable;
2423 }
2424 
 2425 SubmitReduction* CudaComputeNonbonded::getCurrentReduction() {
 2426  SimParameters *simParams = Node::Object()->simParameters;
 2427  return (simParams->CUDASOAintegrate) ? reductionGpuResident :
2428  reductionGpuOffload;
2429 }
2430 
2431 #endif // NAMD_CUDA
2432 
2433 