NAMD
CudaComputeNonbonded.C
1 #include <algorithm>
2 #include <map>
3 #include <vector>
4 #include "CudaUtils.h"
5 #include "CudaRecord.h"
6 #include "NamdTypes.h"
7 #include "charm++.h"
8 #include "Patch.h"
9 #include "PatchMap.h"
10 #include "ProxyMgr.h"
11 #include "LJTable.h"
12 #include "Node.h"
13 #include "ObjectArena.h"
14 // #include "ComputeCUDAMgr.h"
15 #include "ReductionMgr.h"
16 #include "CudaComputeNonbonded.h"
17 #include "WorkDistrib.h"
18 #include "HomePatch.h"
19 #include "Priorities.h"
20 #include "ComputePmeCUDAMgr.h"
21 #include "ComputeNonbondedUtil.h"
22 #include "PatchData.h"
23 //#include "CudaUtils.h"
24 
25 #include "NamdEventsProfiling.h"
26 
27 #include "DeviceCUDA.h"
28 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
29 #ifdef WIN32
30 #define __thread __declspec(thread)
31 #endif
32 extern __thread DeviceCUDA *deviceCUDA;
33 #endif
34 
35 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
36 
37 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
38 //
39 // Class constructor
40 //
41 CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
42  CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
43 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
44 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
45 
46  cudaCheck(cudaSetDevice(deviceID));
47 
48  exclusionsByAtom = NULL;
49 
50  vdwTypes = NULL;
51  vdwTypesSize = 0;
52 
53  exclIndexMaxDiff = NULL;
54  exclIndexMaxDiffSize = 0;
55 
56  atomIndex = NULL;
57  atomIndexSize = 0;
58 
59  atomStorageSize = 0;
60 
61  // Atom and charge storage
62  atoms = NULL;
63  atomsSize = 0;
64  part = NULL;
65  partSize = 0;
66  doAlch = false;
67  lambdaWindowUpdated = false;
68 
69  // Force storage
70  h_forces = NULL;
71  h_forcesSize = 0;
72  h_forcesSlow = NULL;
73  h_forcesSlowSize = 0;
74 
75  d_forces = NULL;
76  d_forcesSize = 0;
77  d_forcesSlow = NULL;
78  d_forcesSlowSize = 0;
79 
80  // GBIS
81  intRad0H = NULL;
82  intRad0HSize = 0;
83  intRadSH = NULL;
84  intRadSHSize = 0;
85  psiSumH = NULL;
86  psiSumHSize = 0;
87  bornRadH = NULL;
88  bornRadHSize = 0;
89  dEdaSumH = NULL;
90  dEdaSumHSize = 0;
91  dHdrPrefixH = NULL;
92  dHdrPrefixHSize = 0;
93  maxShmemPerBlock = 0;
94  cudaPatches = NULL;
95 
96  atomsChangedIn = true;
97  atomsChanged = true;
98  computesChanged = true;
99 
100  forceDoneEventRecord = false;
101 
102  SimParameters *simParams = Node::Object()->simParameters;
103  if (simParams->pressureProfileOn) {
104  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
105  }
106 
107  if (simParams->GBISOn) gbisPhase = 3;
108 
109  doSkip = false;
110 }
111 
112 //
113 // Class destructor
114 //
115 CudaComputeNonbonded::~CudaComputeNonbonded() {
116  // fprintf(stderr, "Pe %d calling destructor ", CkMyPe());
117  cudaCheck(cudaSetDevice(deviceID));
118  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
119  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
120  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
121  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
122  if (part != NULL) deallocate_host<char>(&part);
123  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
124  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
125  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
126  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
127 
128  // GBIS
129  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
130  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
131  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
132  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
133  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
134  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
135 
136  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
137 
138  if (patches.size() > 0) {
139  deallocate_host<VirialEnergy>(&h_virialEnergy);
140  deallocate_device<VirialEnergy>(&d_virialEnergy);
141  cudaCheck(cudaStreamDestroy(stream));
142  cudaCheck(cudaEventDestroy(forceDoneEvent));
143  CmiDestroyLock(lock);
144  delete reduction;
145  }
146 
147  // NOTE: unregistering happens in [sync] -entry method
148  // fprintf(stderr, "unregistering patches on pe %d\n", CkMyPe());
149  computeMgr->sendUnregisterBoxesOnPe(pes, this);
150 
151 }
152 
153 void CudaComputeNonbonded::unregisterBox(int i) {
154  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
155  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
156  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
157  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
158  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
159  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
160  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
161 }
162 
163 void CudaComputeNonbonded::unregisterBoxesOnPe() {
164  if (rankPatches[CkMyRank()].size() == 0)
165  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
166  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
167  unregisterBox(rankPatches[CkMyRank()][i]);
168  }
169 }
170 
171 //
172 // Register intra-patch (self) compute.
173 // Only serialized calls allowed
174 //
175 void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {
176  computesChanged = true;
177  addPatch(pid);
178  addCompute(cid, pid, pid, 0.);
179 }
180 
181 //
182 // Register pair-patch compute.
183 // Only serialized calls allowed
184 //
185 void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID* pid, int* trans) {
186  computesChanged = true;
187  addPatch(pid[0]);
188  addPatch(pid[1]);
189  PatchMap* patchMap = PatchMap::Object();
190  int t1 = trans[0];
191  int t2 = trans[1];
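 // trans[0] and trans[1] index the 27 periodic image translations of the two patches:
 // t%3, (t/3)%3, and t/9 decode the x, y, and z image shifts in {-1,0,1} after
 // subtracting 1 (trans==13 is the identity image). The offset below combines the
 // patch-center difference with the relative image shift of pid[0] with respect to pid[1].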
192  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
193  offset.x += (t1%3-1) - (t2%3-1);
194  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
195  offset.z += (t1/9-1) - (t2/9-1);
196  addCompute(cid, pid[0], pid[1], offset);
197 }
198 
199 //
200 // Add patch
201 //
202 void CudaComputeNonbonded::addPatch(PatchID pid) {
203  patches.push_back(PatchRecord(pid));
204 }
205 
206 //
207 // Add compute
208 //
209 void CudaComputeNonbonded::addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset) {
210  ComputeRecord cr;
211  cr.cid = cid;
212  cr.pid[0] = pid1;
213  cr.pid[1] = pid2;
214  cr.offset = offset;
215  computes.push_back(cr);
216 }
217 
218 //
219 // Update numAtoms and numFreeAtoms on a patch
220 //
221 void CudaComputeNonbonded::updatePatch(int i) {
222  int numAtoms = patches[i].patch->getNumAtoms();
223  int numFreeAtoms = numAtoms;
224  if ( fixedAtomsOn ) {
225  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
226  for ( int j=0; j< numAtoms; ++j ) {
227  if ( aExt[j].atomFixed ) --numFreeAtoms;
228  }
229  }
230  patches[i].numAtoms = numAtoms;
231  patches[i].numFreeAtoms = numFreeAtoms;
232  cudaPatches[i].numAtoms = numAtoms;
233  cudaPatches[i].numFreeAtoms = numFreeAtoms;
234 #ifdef NODEGROUP_FORCE_REGISTER
235  cudaPatches[i].patchID = patches[i].patchID;
236 #endif
237 }
238 
239 int CudaComputeNonbonded::findPid(PatchID pid) {
240  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
241  int j = rankPatches[CkMyRank()][i];
242  if (patches[j].patchID == pid) return j;
243  }
244  return -1;
245 }
246 
247 void CudaComputeNonbonded::patchReady(PatchID pid, int doneMigration, int seq) {
248  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate(),
249  // however that is already called in Sequencer::runComputeObjects_CUDA
250  // The functionality of updatePatch() was moved into updatePatches()
251  if (!(params->CUDASOAintegrate && params->useDeviceMigration)) {
252  if (doneMigration) {
253  int i = findPid(pid);
254  if (i == -1)
255  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
256  updatePatch(i);
257  }
258  CmiLock(lock);
259  Compute::patchReady(pid, doneMigration, seq);
260  CmiUnlock(lock);
261  }
262 }
263 
264 void CudaComputeNonbonded::gbisP2PatchReady(PatchID pid, int seq) {
265  CmiLock(lock);
266  Compute::gbisP2PatchReady(pid, seq);
267  CmiUnlock(lock);
268 }
269 
270 void CudaComputeNonbonded::gbisP3PatchReady(PatchID pid, int seq) {
271  CmiLock(lock);
272  Compute::gbisP3PatchReady(pid, seq);
273  CmiUnlock(lock);
274 }
275 
276 void CudaComputeNonbonded::assignPatch(int i) {
277 
278  PatchMap* patchMap = PatchMap::Object();
279  PatchID pid = patches[i].patchID;
280  Patch* patch = patchMap->patch(pid);
281  if (patch == NULL) {
282  // Create ProxyPatch if none exists
283  ProxyMgr::Object()->createProxy(pid);
284  patch = patchMap->patch(pid);
285  }
286  patches[i].patch = patch;
287  if (patches[i].patch == NULL) {
288  NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
289  }
290  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
291  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);
292  SimParameters *simParams = Node::Object()->simParameters;
293  if (simParams->GBISOn) {
294  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
295  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
296  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
297  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
298  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
299  }
300  // Store Pe where this patch was registered
301 #if 1
302  if (patches[i].pe != CkMyPe()) {
303  NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
304  }
305 #else
306  patches[i].pe = CkMyPe();
307 #endif
308  //
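 // Record whether the patch's home Pe lives on the same physical node and on the
 // same Charm++ (logical) node as this Pe.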
309  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
310  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
311 }
312 
313 struct pid_sortop_reverse_priority {
314  bool operator() (int2 pidj, int2 pidi) { // i and j reversed
315  int ppi = PATCH_PRIORITY(pidi.x);
316  int ppj = PATCH_PRIORITY(pidj.x);
317  if ( ppi != ppj ) return ppi < ppj;
318  return pidi.x < pidj.x;
319  }
320 };
321 
322 void CudaComputeNonbonded::assignPatchesOnPe() {
323  if (rankPatches[CkMyRank()].size() == 0)
324  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
325 
326  // calculate priority rank of local home patch within pe
327  {
328  PatchMap* patchMap = PatchMap::Object();
329  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
330  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
331  int i = rankPatches[CkMyRank()][k];
332  int pid = patches[i].patchID;
333  int homePe = patchMap->node(pid);
334  if ( CkNodeOf(homePe) == CkMyNode() ) {
335  int2 pid_index;
336  pid_index.x = pid;
337  pid_index.y = i;
338  homePatchByRank[CkRankOf(homePe)].add(pid_index);
339  }
340  }
341  for ( int i=0; i<CkMyNodeSize(); ++i ) {
342  pid_sortop_reverse_priority so;
343  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
344  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
345  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
346  int index = homePatchByRank[i][j].y;
347  patches[index].reversePriorityRankInPe = j + masterBoost;
348  }
349  }
350  }
351 
352  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
353  assignPatch(rankPatches[CkMyRank()][i]);
354  }
355 }
356 
357 //
358 // Returns the Pe on this node that holds a home patch for Patch ID "pid", or -1 if none
359 //
360 // int findHomePatchPe(std::vector<PatchIDList>& rankPatchIDs, PatchID pid) {
361 int findHomePatchPe(PatchIDList* rankPatchIDs, PatchID pid) {
362  // for (int i=0;i < rankPatchIDs.size();i++) {
363  for (int i=0;i < CkMyNodeSize();i++) {
364  if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
365  }
366  return -1;
367 }
368 
369 //
370 // Find all PEs that have Patch
371 //
372 void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
373  proxyPatchPes.clear();
374  for (int i=0;i < CkMyNodeSize();i++) {
375  int pe = CkNodeFirst(CkMyNode()) + i;
376  if (PatchMap::ObjectOnPe(pe)->patch(pid) != NULL)
377  proxyPatchPes.push_back(pe);
378  }
379 }
380 
381 //
382 // Called after all computes have been registered
383 //
384 void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
385  // Remove duplicate patches
386  std::sort(patches.begin(), patches.end());
387  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
388  patches.erase(last, patches.end());
389  // Set number of patches
390  setNumPatches(patches.size());
391  masterPe = CkMyPe();
392  computeMgr = computeMgrIn;
393  // Start patch counter
394  patchesCounter = getNumPatches();
395  // Patch ID map
396  std::map<PatchID, int> pidMap;
397 #if 1
398  //-------------------------------------------------------
399  // Copied in from ComputeNonbondedCUDA::assignPatches()
400  //-------------------------------------------------------
401 
402  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
403  int numPesOnNodeSharingDevice = 0;
404  int masterIndex = -1;
405  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
406  int pe = deviceCUDA->getPesSharingDevice(i);
407  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
408  if ( CkNodeOf(pe) == CkMyNode() ) {
409  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
410  }
411  }
412 
413  std::vector<int> count(patches.size(), 0);
414  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
415  std::vector<int> rankpcount(CkMyNodeSize(), 0);
416  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
417 
418  PatchMap* patchMap = PatchMap::Object();
419 
420  int unassignedpatches = patches.size();
421 
422  for (int i=0;i < patches.size(); ++i) {
423  patches[i].pe = -1;
424  }
425 
426  // assign if home pe and build table of natural proxies
427  for (int i=0;i < patches.size(); ++i) {
428  int pid = patches[i].patchID;
429  // homePe = PE where the patch currently resides
430  int homePe = patchMap->node(pid);
431  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
432  int pe = pesOnNodeSharingDevice[j];
433  // If homePe is sharing this device, assign this patch to homePe
434  if ( pe == homePe ) {
435  patches[i].pe = pe;
436  --unassignedpatches;
437  pcount[j] += 1;
438  }
439  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
440  table[i*numPesOnNodeSharingDevice + j] = 1;
441  }
442  }
443  // Assign this patch to homePe, if it resides on the same node
444  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
445  patches[i].pe = homePe;
446  --unassignedpatches;
447  rankpcount[CkRankOf(homePe)] += 1;
448  }
449  }
450  // assign if only one pe has a required proxy
451  for (int i=0; i < patches.size(); ++i) {
452  int pid = patches[i].patchID;
453  if ( patches[i].pe != -1 ) continue;
454  int c = 0;
455  int lastj;
456  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
457  if ( table[i*numPesOnNodeSharingDevice + j] ) {
458  ++c;
459  lastj = j;
460  }
461  }
462  count[i] = c;
463  if ( c == 1 ) {
464  patches[i].pe = pesOnNodeSharingDevice[lastj];
465  --unassignedpatches;
466  pcount[lastj] += 1;
467  }
468  }
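 // Round-robin any still-unassigned patches over the Pes sharing this device:
 // the first pass prefers Pes that already hold a copy of the patch (table entry set),
 // the second pass places patches that have no copy on any of these Pes.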
469  int assignj = 0;
470  while ( unassignedpatches ) {
471  int i;
472  for (i=0;i < patches.size(); ++i) {
473  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
474  int pid = patches[i].patchID;
475  // patch_record &pr = patchRecords[pid];
476  if ( patches[i].pe != -1 ) continue;
477  patches[i].pe = pesOnNodeSharingDevice[assignj];
478  --unassignedpatches;
479  pcount[assignj] += 1;
480  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
481  break;
482  }
483  if (i < patches.size() ) continue; // start search again
484  for ( i=0;i < patches.size(); ++i ) {
485  int pid = patches[i].patchID;
486  // patch_record &pr = patchRecords[pid];
487  if ( patches[i].pe != -1 ) continue;
488  if ( count[i] ) continue;
489  patches[i].pe = pesOnNodeSharingDevice[assignj];
490  --unassignedpatches;
491  pcount[assignj] += 1;
492  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
493  break;
494  }
495  if ( i < patches.size() ) continue; // start search again
496  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
497  }
498 
499  // For each rank, list of patches
500  rankPatches.resize(CkMyNodeSize());
501  for (int i=0; i < patches.size(); ++i) {
502  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
503  pidMap[patches[i].patchID] = i;
504  }
505 
506  // for ( int i=0; i < patches.size(); ++i ) {
507  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
508  // }
509 #else
510  // For each rank, list of patches
511  rankPatches.resize(CkMyNodeSize());
512  // For each rank, list of home patch IDs
513  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
514  for (int i=0;i < CkMyNodeSize();i++) {
515  int pe = CkNodeFirst(CkMyNode()) + i;
516  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
517  }
518  std::vector<int> proxyPatchPes;
519  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
520  //--------------------------------------------------------
521  // Build a list of PEs to avoid
522  std::vector<int> pesToAvoid;
523 #if 0
524  // Avoid other GPUs' master PEs
525  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
526  int pe = deviceCUDA->getMasterPeForDeviceID(i);
527  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
528  }
529  // Avoid PEs that are involved in PME
530  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
531  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
532  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
533  }
534  // Set counters of avoidable PEs to high numbers
535  for (int i=0;i < pesToAvoid.size();i++) {
536  int pe = pesToAvoid[i];
537  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
538  }
539 #endif
540  // Avoid master Pe somewhat
541  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
542  //--------------------------------------------------------
543  for (int i=0;i < patches.size();i++) {
544  //if I had this datastructure "patches" on the GPU, I could use it
545  PatchID pid = patches[i].patchID;
546  int pe = findHomePatchPe(rankHomePatchIDs, pid);
547  if (pe == -1) {
548  // Patch not present on this node => try finding a ProxyPatch
549  findProxyPatchPes(proxyPatchPes, pid);
550  if (proxyPatchPes.size() == 0) {
551  // No ProxyPatch => create one on rank that has the least ProxyPatches
552  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
553  pe = CkNodeFirst(CkMyNode()) + rank;
554  peProxyPatchCounter[rank]++;
555  } else {
556  // Choose a ProxyPatch, trying to avoid masterPe (current Pe) and Pes that already have a ProxyPatch.
557  // This is done by finding the entry with the minimum peProxyPatchCounter value,
558  // i.e., the minimum among
559  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
560  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
561  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
562  // - proxyPatchPes.begin();
563  // pe = proxyPatchPes[pppi];
564  int minCounter = (1 << 30);
565  for (int j=0;j < proxyPatchPes.size();j++) {
566  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
567  pe = proxyPatchPes[j];
568  minCounter = peProxyPatchCounter[CkRankOf(pe)];
569  }
570  }
571  if (pe == -1)
572  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
573  peProxyPatchCounter[CkRankOf(pe)]++;
574  }
575  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
576  // Found home patch on this node, but it's on PE that should be avoided => find a new one
577  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
578  pe = CkNodeFirst(CkMyNode()) + rank;
579  peProxyPatchCounter[rank]++;
580  }
581  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
582  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
583  rankPatches[CkRankOf(pe)].push_back(i);
584  pidMap[pid] = i;
585  }
586 
587  delete [] rankHomePatchIDs;
588 #endif
589  // Setup computes using pidMap
590  for (int i=0;i < computes.size();i++) {
591  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
592  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
593  }
594  for (int i=0;i < CkMyNodeSize();i++) {
595  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
596  }
597  computeMgr->sendAssignPatchesOnPe(pes, this);
598 }
599 
600 void CudaComputeNonbonded::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
601  // DMC This vector of CudaLocalRecords doesn't have the correct number of peer records
602  std::map<int, int> pidMap;
603  for (int i=0; i < data.size(); ++i) {
604  pidMap[data[i].patchID] = i;
605  }
606 
607  std::vector<PatchRecord> copy = patches;
608 
609  for (int i=0; i < copy.size(); i++) {
610  const int new_idx = pidMap[copy[i].patchID];
611  patches[new_idx] = copy[i];
612  }
613 
614  for (int i=0; i < rankPatches.size(); i++) {
615  rankPatches[i].clear();
616  }
617  for (int i=0; i < patches.size(); ++i) {
618  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
619  }
620 
621  // Setup computes using pidMap
622  for (int i=0;i < computes.size();i++) {
623  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
624  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
625  }
626  // TODO do we need to call sendAssignPatchesOnPe with the new order?
627 }
628 
629 void CudaComputeNonbonded::initialize() {
630  if (patches.size() > 0) {
631  npairlists = 0;
632  // Allocate CUDA version of patches
633  cudaCheck(cudaSetDevice(deviceID));
634  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
635 
636  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
637  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
638 
639  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
640  */
641  cudaDeviceProp props;
642  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); // Gets properties of device 'deviceID'
643  maxShmemPerBlock = props.sharedMemPerBlock;
644 
645 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
646  int leastPriority, greatestPriority;
647  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
648  int priority = (doStreaming) ? leastPriority : greatestPriority;
649  // int priority = greatestPriority;
650  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
651 #else
652  cudaCheck(cudaStreamCreate(&stream));
653 #endif
654  cudaCheck(cudaEventCreate(&forceDoneEvent));
655 
656  buildExclusions();
657 
658  lock = CmiCreateLock();
659  params = Node::Object()->simParameters;
660  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
661 
662 #ifdef NODEGROUP_FORCE_REGISTER
663  int devInd = deviceCUDA->getDeviceIndex();
664  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
665  PatchData *patchData = cpdata.ckLocalBranch();
666  nodeReduction = patchData->reduction;
667  patchData->devData[devInd].nbond_stream = stream;
668  // Fill auxiliary arrays for merging forces here
669  PatchMap* map = PatchMap::Object();
670  int nGlobalPatches = map->numPatches();
671  allocate_host<bool>( &(patchData->devData[devInd].h_hasPatches), nGlobalPatches);
672  memset(patchData->devData[devInd].h_hasPatches, 0, sizeof(bool)*nGlobalPatches);
673 
674  for(int i = 0; i < patches.size(); i++){
675  patchData->devData[devInd].h_hasPatches[patches[i].patchID] = true;
676  }
677  allocate_device<bool>( &(patchData->devData[devInd].d_hasPatches), nGlobalPatches);
678  copy_HtoD_sync<bool>( patchData->devData[devInd].h_hasPatches, patchData->devData[devInd].d_hasPatches, nGlobalPatches);
679 #endif
680  }
681 }
682 
683 //
684 // atomUpdate() can be called by any Pe
685 //
686 void CudaComputeNonbonded::atomUpdate() {
687  atomsChangedIn = true;
688 }
689 
690 //
691 // Compute patches[].atomStart, patches[].numAtoms, patches[].numFreeAtoms, and atomStorageSize
692 //
693 void CudaComputeNonbonded::updatePatches() {
694  if(params->CUDASOAintegrate && params->useDeviceMigration) {
695 #ifdef NODEGROUP_FORCE_REGISTER
696  const int deviceIndex = deviceCUDA->getDeviceIndex();
697  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
698  PatchData *patchData = cpdata.ckLocalBranch();
699  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
700  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
701 
702  // Maximum number of tiles per tile list
703  maxTileListLen = 0;
704  int atomStart = 0;
705  for (int i=0;i < numPatchesHomeAndProxy; i++) {
706  patches[i].numAtoms = localPatches[i].numAtoms;
707  patches[i].numFreeAtoms = localPatches[i].numAtoms;
708  patches[i].atomStart = localPatches[i].bufferOffsetNBPad;
709  cudaPatches[i].numAtoms = localPatches[i].numAtoms;
710  cudaPatches[i].numFreeAtoms = localPatches[i].numAtoms;
711  cudaPatches[i].atomStart = localPatches[i].bufferOffsetNBPad;
712  cudaPatches[i].patchID = localPatches[i].patchID;
713  // Haochuan: count the number of fixed atoms per patch
714  if (fixedAtomsOn) {
715  Patch* patch = NULL;
716  // Search the patch map to determine the number of free atoms of this patch
717  for (int j = 0; j < deviceCUDA->getNumPesSharingDevice(); j++){
718  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(j));
719  patch = pm->patch(localPatches[i].patchID);
720  if (patch != NULL) break;
721  }
722  if (patch == NULL) NAMD_bug("CudaComputeNonbonded::updatePatches cannot find patch.\n");
723  if (patch->getNumAtoms() != localPatches[i].numAtoms) {
724  NAMD_bug("CudaComputeNonbonded::updatePatches numAtoms mismatches!\n");
725  }
726  const CompAtomExt *aExt = patch->getCompAtomExtInfo();
727  for (int j = 0; j < localPatches[i].numAtoms; ++j) {
728  if (aExt[j].atomFixed) {
729  --patches[i].numFreeAtoms;
730  --cudaPatches[i].numFreeAtoms;
731  }
732  }
733  }
734  int numAtoms = patches[i].numAtoms;
735 #if defined(NAMD_CUDA)
736  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, WARPSIZE);
737  maxTileListLen = std::max(maxTileListLen, numTiles);
738  // computeAtomPad will recompute the number of tiles. Recomputing for clarity
739  atomStart += CudaComputeNonbondedKernel::computeAtomPad(numAtoms, WARPSIZE);
740 #else
741  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, BOUNDINGBOXSIZE);
742  maxTileListLen = std::max(maxTileListLen, numTiles);
743  // computeAtomPad will recompute the number of tiles. Recomputing for clarity
744  atomStart += CudaComputeNonbondedKernel::computeAtomPad(numAtoms, BOUNDINGBOXSIZE);
745 #endif
746  }
747  atomStorageSize = atomStart;
748 
749  if (maxTileListLen >= 65536) {
750  NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
751  }
752 #endif
753  } else {
754 
755  // Maximum number of tiles per tile list
756  maxTileListLen = 0;
757  int atomStart = 0;
758  for (int i=0;i < patches.size();i++) {
759  patches[i].atomStart = atomStart;
760  cudaPatches[i].atomStart = atomStart;
761 #ifdef NODEGROUP_FORCE_REGISTER
762  cudaPatches[i].patchID = patches[i].patchID;
763 #endif
764  int numAtoms = patches[i].numAtoms;
765 #ifdef NAMD_HIP
767  maxTileListLen = std::max(maxTileListLen, numTiles);
769 #else
770  int numTiles = CudaComputeNonbondedKernel::computeNumTiles(numAtoms, WARPSIZE);
771  maxTileListLen = std::max(maxTileListLen, numTiles);
773 #endif
774  }
775  atomStorageSize = atomStart;
776 
777  if (maxTileListLen >= 65536) {
778  NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
779  }
780  }
781 }
782 
783 void CudaComputeNonbonded::skipPatch(int i) {
784  if (CkMyPe() != patches[i].pe)
785  NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
786  Flags &flags = patches[i].patch->flags;
787  patches[i].positionBox->skip();
788  patches[i].forceBox->skip();
789  if (flags.doGBIS) {
790  patches[i].psiSumBox->skip();
791  patches[i].intRadBox->skip();
792  patches[i].bornRadBox->skip();
793  patches[i].dEdaSumBox->skip();
794  patches[i].dHdrPrefixBox->skip();
795  }
796 }
797 
799  if (rankPatches[CkMyRank()].size() == 0)
800  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
801  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
802  skipPatch(rankPatches[CkMyRank()][i]);
803  }
804  bool done = false;
805  CmiLock(lock);
806  patchesCounter -= rankPatches[CkMyRank()].size();
807  if (patchesCounter == 0) {
808  patchesCounter = getNumPatches();
809  done = true;
810  }
811  CmiUnlock(lock);
812  if (done) {
813  // Reduction must be done on masterPe
814  computeMgr->sendFinishReductions(masterPe, this);
815  }
816 }
817 
818 void CudaComputeNonbonded::skip() {
819  if (CkMyPe() != masterPe)
820  NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
821 
822  if (patches.size() == 0) return;
823 
824  doSkip = true;
825 
826  computeMgr->sendSkipPatchesOnPe(pes, this);
827 }
828 
829 void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
830  if (CkMyPe() != masterPe)
831  NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
832 
833  for (int i=0;i < patches.size();++i) {
834  PatchRecord &pr = patches[i];
835 
836  float maxMove = pr.patch->flags.maxAtomMovement;
837  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
838 
839  float maxTol = pr.patch->flags.pairlistTolerance;
840  //if(pr.patch->getPatchID() == 0) fprintf(stderr,
841  // "\n\nP0: Maximum mov/tol during CudaComputeNonbonded: %lf %lf\n", maxMove, maxTol);
842  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
843  }
844 }
845 
846 inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
847  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
848  c->updateVdwTypesExclSubset(first, last);
849 }
850 
851 void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
852  for (int i=first;i <= last;i++) {
853  PatchRecord &pr = patches[i];
854  int start = pr.atomStart;
855  int numAtoms = pr.numAtoms;
856  const CompAtom *compAtom = pr.compAtom;
857  const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
858  // Atoms have changed, re-do exclusions and vdw types
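 // The per-atom exclusion data are repacked in sorted-atom order below: each
 // exclIndexMaxDiff entry carries the exclusion-pattern index and the maximum
 // atom-index difference prepared by buildExclusions() (note the x/y swap
 // relative to exclusionsByAtom).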
859  int2* exclp = exclIndexMaxDiff + start;
860  int* aip = atomIndex + start;
861  char* pst;
862  if(doAlch) pst = part + start;
863  for ( int k=0;k < numAtoms; ++k ) {
864  int j = compAtomExt[k].sortOrder;
865  vdwTypes[start + k] = compAtom[j].vdwType;
866  aip[k] = compAtomExt[j].id;
867  if(doAlch) pst[k] = compAtom[j].partition;
868 #ifdef MEM_OPT_VERSION
869  exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
870  exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
871 #else // ! MEM_OPT_VERSION
872  exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
873  exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
874 #endif // MEM_OPT_VERSION
875  }
876  }
877 }
878 
879 //
880 // Called every time atoms changed
881 //
882 void CudaComputeNonbonded::updateVdwTypesExcl() {
883  // Re-allocate (VdwTypes, exclIndexMaxDiff) as needed
884  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
885  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
886  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
887  if (doAlch) reallocate_host<char>(&part, &partSize, atomStorageSize, 1.4f);
888 
889 
890  if (!(params->CUDASOAintegrate && params->useDeviceMigration)) {
891 #if CMK_SMP && USE_CKLOOP
892  int useCkLoop = Node::Object()->simParameters->useCkLoop;
893  if (useCkLoop >= 1) {
894  CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
895  } else
896 #endif
897  {
898  updateVdwTypesExclSubset(0, patches.size()-1);
899  }
900 
901  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
902  } else {
903 #ifdef NODEGROUP_FORCE_REGISTER
904  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
905  PatchData *patchData = cpdata.ckLocalBranch();
906  const int deviceIndex = deviceCUDA->getDeviceIndex();
907  nonbondedKernel.updateVdwTypesExclOnGPU(tileListKernel,
908  patchData->devData[deviceIndex].numPatchesHomeAndProxy,
909  atomStorageSize, params->alchOn,
910  patchData->devData[deviceIndex].d_localPatches,
911  patchData->h_soa_vdwType[deviceIndex],
912  patchData->h_soa_id[deviceIndex],
913  patchData->h_soa_sortOrder[deviceIndex],
914  patchData->h_soa_partition[deviceIndex],
915  stream
916  );
917 #endif // NODEGROUP_FORCE_REGISTER
918  }
919 }
920 
921 inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
922  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
923  c->copyAtomsSubset(first, last);
924 }
925 
926 void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
927  for (int i=first;i <= last;++i) {
928  PatchRecord &pr = patches[i];
929  int numAtoms = pr.numAtoms;
930  if (numAtoms > 0) {
931  int start = pr.atomStart;
932  const CudaAtom *src = pr.patch->getCudaAtomList();
933  CudaAtom *dst = atoms + start;
934  memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
935  // Fill the rest with the copy of the last atom
936 #ifdef NAMD_HIP
937  int numAtomsAlign = ((numAtoms-1)/BOUNDINGBOXSIZE+1)*BOUNDINGBOXSIZE;
938 #else
939  int numAtomsAlign = ((numAtoms-1)/WARPSIZE+1)*WARPSIZE;
940 #endif
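 // Padding each patch's atom count up to a multiple of WARPSIZE (CUDA) or
 // BOUNDINGBOXSIZE (HIP) with copies of the last real atom lets the kernels
 // process whole tiles without bounds checks while still reading valid data.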
941  CudaAtom lastAtom = src[numAtoms-1];
942  for (int j=numAtoms;j < numAtomsAlign;j++) {
943  dst[j] = lastAtom;
944  }
945 #if 0
946  fprintf(stderr, " printing patch %d\n", pr.patch->getPatchID());
947  for(int k = 0; k < numAtoms; k++){
948  fprintf(stderr, "%lf %lf %lf\n", dst[k].x, dst[k].y, dst[k].z);
949  }
950 #endif
951  }
952  }
953 }
954 
955 void CudaComputeNonbonded::copyGBISphase(int i) {
956  if (CkMyPe() != patches[i].pe)
957  NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
958  PatchRecord &pr = patches[i];
959  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
960  if (gbisPhase == 1) {
961  //Copy GBIS intRadius to Host
962  if (atomsChanged) {
963  float *intRad0 = intRad0H + pr.atomStart;
964  float *intRadS = intRadSH + pr.atomStart;
965  for (int k=0;k < pr.numAtoms;++k) {
966  int j = aExt[k].sortOrder;
967  intRad0[k] = pr.intRad[2*j+0];
968  intRadS[k] = pr.intRad[2*j+1];
969  }
970  }
971  } else if (gbisPhase == 2) {
972  float *bornRad = bornRadH + pr.atomStart;
973  for ( int k=0; k < pr.numAtoms; ++k ) {
974  int j = aExt[k].sortOrder;
975  bornRad[k] = pr.bornRad[j];
976  }
977  } else if (gbisPhase == 3) {
978  float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
979  for ( int k=0; k < pr.numAtoms; ++k ) {
980  int j = aExt[k].sortOrder;
981  dHdrPrefix[k] = pr.dHdrPrefix[j];
982  }
983  } // end phases
984 }
985 
986 void CudaComputeNonbonded::openBox(int i) {
987  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
988 
989  if (CkMyPe() != patches[i].pe)
990  NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
991  SimParameters *simParams = Node::Object()->simParameters;
992  if (!simParams->GBISOn || gbisPhase == 1) {
993  // positionBox is the Box through which the patch publishes its atom positions to this compute
994  patches[i].compAtom = patches[i].positionBox->open();
995  // the compAtom datastructure is null for PEs
996  //fprintf(stderr, "opening box at patches[%d] = %p\n", i, patches[i].compAtom);
997  // JM: This is not necessary if we already have the positions from integration
998  // This is only necessary in the first iteration
999  // XXX TODO: Find out if we really need to open the position box or if we
1000  // can skip this step entirely
1001 #ifdef NODEGROUP_FORCE_REGISTER
1002  if(simParams->CUDASOAintegrate){
1003  if(atomsChanged && !simParams->useDeviceMigration) copyAtomsSubset(i, i);
1004  }else copyAtomsSubset(i, i);
1005 #else
1006  copyAtomsSubset(i, i);
1007 #endif
1008  }
1009  if (simParams->GBISOn) {
1010  if (gbisPhase == 1) {
1011  patches[i].intRad = patches[i].intRadBox->open();
1012  patches[i].psiSum = patches[i].psiSumBox->open();
1013  } else if (gbisPhase == 2) {
1014  patches[i].bornRad = patches[i].bornRadBox->open();
1015  patches[i].dEdaSum = patches[i].dEdaSumBox->open();
1016  } else if (gbisPhase == 3) {
1017  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
1018  }
1019  copyGBISphase(i);
1020  }
1021 
1022  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1023 }
1024 
1025 void CudaComputeNonbonded::messageEnqueueWork() {
1026  if (masterPe != CkMyPe())
1027  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
1028  WorkDistrib::messageEnqueueWork(this);
1029 }
1030 
1031 void CudaComputeNonbonded::openBoxesOnPe() {
1032  if (rankPatches[CkMyRank()].size() == 0)
1033  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
1034 
1035  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1036 
1037 #ifdef NODEGROUP_FORCE_REGISTER
1038  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
1039  // opens boxes to make sure NAMD won't complain
1040  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1041  int j = rankPatches[CkMyRank()][i];
1042  patches[j].positionBox->open();
1043  }
1044  if(masterPe == CkMyPe()) {
1045  // we need to open boxes here...
1046  if(params->CUDASOAintegrate){
1047  if(!atomsChanged) this->launchWork();
1048  }
1049  else computeMgr->sendLaunchWork(masterPe, this);
1050  }
1051  }
1052  else{
1053 #endif
1054  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1055  openBox(rankPatches[CkMyRank()][i]);
1056  }
1057  bool done = false;
1058  CmiLock(lock);
1059  patchesCounter -= rankPatches[CkMyRank()].size();
1060  if (patchesCounter == 0) {
1061  patchesCounter = getNumPatches();
1062  done = true;
1063  }
1064  CmiUnlock(lock);
1065  if (done) {
1066  if(params->CUDASOAintegrate){
1067  if(!atomsChanged) this->launchWork();
1068  }
1069  else computeMgr->sendLaunchWork(masterPe, this);
1070  }
1071 #ifdef NODEGROUP_FORCE_REGISTER
1072  }
1073 #endif
1074  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1075 }
1076 
1077 int CudaComputeNonbonded::noWork() {
1078  // Simply enqueue doWork on masterPe and return "no work"
1079  computeMgr->sendMessageEnqueueWork(masterPe, this);
1080  return 1;
1081 }
1082 
1083 void CudaComputeNonbonded::reallocateArrays() {
1084  cudaCheck(cudaSetDevice(deviceID));
1085  SimParameters *simParams = Node::Object()->simParameters;
1086 
1087  // Re-allocate atoms
1088  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
1089 
1090  // Re-allocate forces
1091  if (doStreaming) {
1092  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
1093  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
1094  } else {
1095  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
1096  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
1097  }
1098  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
1099  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);
1100  nonbondedKernel.reallocate_forceSOA(atomStorageSize);
1101 
1102  if (simParams->GBISOn) {
1103  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
1104  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
1105  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
1106  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
1107  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
1108  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
1109  }
1110 }
1111 
1112 void CudaComputeNonbonded::doWork() {
1113  if (CkMyPe() != masterPe)
1114  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
1115 
1116  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
1117  // atomsChangedIn can be set to true by any Pe
1118  // atomsChanged can only be set by masterPe
1119  // This use of two variables makes sure we don't have a race condition;
1120  // it seems like it's important to have the masterPe call doWork() first
1121  atomsChanged = atomsChangedIn;
1122  atomsChangedIn = false;
1123 
1124  SimParameters *simParams = Node::Object()->simParameters;
1125 
1126  if (patches.size() == 0) return; // No work to do
1127 
1128  // Take the flags from the first patch on this device
1129  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
1130  // these flags are probably wrong.
1131  Flags &flags = patches[0].patch->flags;
1132 
1133  doSlow = flags.doFullElectrostatics;
1134  doEnergy = flags.doEnergy;
1135  doVirial = flags.doVirial;
1136  doAlch = simParams->alchOn;
1137  doMinimize = flags.doMinimize;
1138 
1139  if (flags.doNonbonded) {
1140 
1141  if (simParams->GBISOn) {
1142  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
1143  }
1144 
1145  if (!simParams->GBISOn || gbisPhase == 1) {
1146  if ( computesChanged ) {
1147  updateComputes();
1148  }
1149  if (atomsChanged) {
1150  // Re-calculate patch atom numbers and storage
1151  updatePatches();
1152  reSortDone = false;
1153  }
1154  reallocateArrays();
1155 #ifdef NODEGROUP_FORCE_REGISTER
1156  if (simParams->CUDASOAintegrate && simParams->useDeviceMigration && atomsChanged) {
1157  tileListKernel.prepareBuffers(atomStorageSize, patches.size(), cudaPatches, stream);
1158  updatePatchRecord();
1159  }
1160 #endif // NODEGROUP_FORCE_REGISTER
1161  }
1162 
1163  // Open boxes on Pes and launch work to masterPe
1164  if(params->CUDASOAintegrate){
1165  if(!atomsChanged) this->openBoxesOnPe();
1166  }
1167  else computeMgr->sendOpenBoxesOnPe(pes, this);
1168 
1169  } else {
1170  // No work to do, skip
1171  skip();
1172  }
1173 
1174 }
1175 
1176 void CudaComputeNonbonded::launchWork() {
1177  if (CkMyPe() != masterPe)
1178  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
1179 
1180  beforeForceCompute = CkWallTimer();
1181  cudaCheck(cudaSetDevice(deviceID));
1182  SimParameters *simParams = Node::Object()->simParameters;
1183 
1184  // So, it seems like PEs are invoking the same object; however, patches[i] is borked on the masterPe
1185 
1186  // When I get here, it seems like compAtoms are not set for all Pes? How can that be?
1187 
1188  //execute only during GBIS phase 1, or if not using GBIS
1189  if (!simParams->GBISOn || gbisPhase == 1) {
1190 
1191  if ( atomsChanged || computesChanged ) {
1192  // Invalidate pair lists
1193  pairlistsValid = false;
1194  pairlistTolerance = 0.0f;
1195  }
1196 
1197  // Get maximum atom movement and patch tolerance
1198  float maxAtomMovement = 0.0f;
1199  float maxPatchTolerance = 0.0f;
1200  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
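 // Pairlist bookkeeping: when lists are saved, the pairlist cutoff is extended by
 // 2*maxPatchTolerance; on later steps the saved lists remain usable only while
 // 2*maxAtomMovement stays within that tolerance, otherwise a pairlist warning is
 // recorded and the lists are rebuilt.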
1201  // Update pair-list cutoff
1202  Flags &flags = patches[0].patch->flags;
1203  savePairlists = false;
1204  usePairlists = false;
1205  if ( flags.savePairlists ) {
1206  savePairlists = true;
1207  usePairlists = true;
1208  } else if ( flags.usePairlists ) {
1209  if ( ! pairlistsValid || ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1210  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1211 #ifdef NODEGROUP_FORCE_REGISTER
1212  nodeReduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1213 #endif
1214  } else {
1215  usePairlists = true;
1216  }
1217  }
1218  if ( ! usePairlists ) {
1219  pairlistsValid = false;
1220  }
1221  float plcutoff = cutoff;
1222  if ( savePairlists ) {
1223  pairlistsValid = true;
1224  pairlistTolerance = 2. * maxPatchTolerance;
1225  plcutoff += pairlistTolerance;
1226  }
1227  plcutoff2 = plcutoff * plcutoff;
1228 
1229  // fprintf(stderr, "STEP[%d] plcutoff = %f listTolerance = %f save = %d maxPatchTolerance = %f maxAtomMovement = %f plvalid = %d flags.use = %d use = %d\n",
1230  // flags.step, plcutoff, pairlistTolerance, savePairlists, maxPatchTolerance, maxAtomMovement, pairlistsValid, flags.usePairlists, usePairlists);
1231  if(savePairlists || !usePairlists){
1232  reSortDone = false; // Ensures pairlist resorting if doPairlist
1233  }
1234 
1235  // if (atomsChanged)
1236  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1237  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1238 
1239  } // if (!simParams->GBISOn || gbisPhase == 1)
1240 
1241  // Calculate PME & VdW forces
1242  if (!simParams->GBISOn || gbisPhase == 1) {
1243  doForce();
1244  if (doStreaming) {
1245  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1246  patchReadyQueueLen = tileListKernel.getNumPatches();
1247  patchReadyQueueNext = 0;
1248  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1249  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1250  int* emptyPatches = tileListKernel.getEmptyPatches();
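 // Patches with no tiles never have forces written by the kernel, so zero their
 // host force buffers here and mark them ready immediately at the front of the queue.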
1251  for (int i=0;i < numEmptyPatches;i++) {
1252  PatchRecord &pr = patches[emptyPatches[i]];
1253  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1254  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1255  patchReadyQueue[i] = emptyPatches[i];
1256  }
1257  if (patchReadyQueueLen != patches.size())
1258  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1259  }
1260  }
1261 
1262  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1263  // before calling doGBISphase1().
1264  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1265  // In this code path doGBISphase1() is called in forceDone()
1266  forceDoneSetCallback();
1267  return;
1268  }
1269 
1270  // GBIS Phases
1271  if (simParams->GBISOn) {
1272  if (gbisPhase == 1) {
1273  doGBISphase1();
1274  } else if (gbisPhase == 2) {
1275  doGBISphase2();
1276  } else if (gbisPhase == 3) {
1277  doGBISphase3();
1278  }
1279  }
1280 
1281  // Copy forces to host
1282  if (!simParams->GBISOn || gbisPhase == 3) {
1283  if (!doStreaming) {
1284 #ifdef NODEGROUP_FORCE_REGISTER
1285  if(!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration)){
1286  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1287  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1288  }
1289 #else
1290  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1291  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1292 #endif
1293 
1294  }
1295  }
1296 
1297  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1298 
1299  NAMD_EVENT_START(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1300  // For GBIS, energies are ready after phase 2
1301  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1302  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1303  d_forces, d_forcesSlow, d_virialEnergy, stream);
1304  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1305 
1306  NAMD_EVENT_STOP(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1307 
1308  }
1309 
1310  if(simParams->CUDASOAintegrate && ((savePairlists || !usePairlists)) && !atomsChanged) reSortTileLists();
1311 
1312  // Setup call back
1313  forceDoneSetCallback();
1314 
1315 #if 0
1316  cudaCheck(cudaStreamSynchronize(stream));
1317  PatchMap *map = PatchMap::Object();
1318  HomePatchElem *elem;
1319  for(elem = map->homePatchList()->begin(); elem != map->homePatchList()->end(); elem++){
1320  if(elem->patch->getPatchID() == 7) break;
1321  }
1322  if(elem->patch->flags.step == 11){
1323  // it would be good to know from which patch these atoms are...
1324  fprintf(stderr, "CudaNonbonded data\n");
1325  for(int i = 0 ; i < atomStorageSize; i++){
1326  fprintf(stderr, "pos[%d] = %lf, %lf, %lf, %lf | (%f %f %f) (%f %f %f) \n",
1327  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q,
1328  // for some reason, we needed to set the positions
1329  h_forces[i].x, h_forces[i].y, h_forces[i].z,
1330  h_forcesSlow[i].x, h_forcesSlow[i].y, h_forcesSlow[i].z);
1331  }
1332  }
1333 #endif
1334 
1335 }
1336 
1337 //
1338 // GBIS Phase 1
1339 //
1340 void CudaComputeNonbonded::doGBISphase1() {
1341  cudaCheck(cudaSetDevice(deviceID));
1342 
1343  if (atomsChanged) {
1344  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
1345  }
1346 
1347  SimParameters *simParams = Node::Object()->simParameters;
1348  Lattice lattice = patches[0].patch->flags.lattice;
1349 
1350  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1351  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1352  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1353 
1354  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
1355  lata, latb, latc,
1356  simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream);
1357 }
1358 
1359 //
1360 // GBIS Phase 2
1361 //
1362 void CudaComputeNonbonded::doGBISphase2() {
1363  cudaCheck(cudaSetDevice(deviceID));
1364 
1365  SimParameters *simParams = Node::Object()->simParameters;
1366  Lattice lattice = patches[0].patch->flags.lattice;
1367 
1368  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1369  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1370  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1371 
1372  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
1373 
1374  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
1375  doEnergy, doSlow,
1376  lata, latb, latc,
1377  simParams->cutoff, simParams->nonbondedScaling, simParams->kappa,
1378  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1379  simParams->dielectric, simParams->solvent_dielectric,
1380  d_forces, dEdaSumH, stream);
1381 }
1382 
1383 //
1384 // GBIS Phase 3
1385 //
1386 void CudaComputeNonbonded::doGBISphase3() {
1387  cudaCheck(cudaSetDevice(deviceID));
1388  SimParameters *simParams = Node::Object()->simParameters;
1389  Lattice lattice = patches[0].patch->flags.lattice;
1390 
1391  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1392  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1393  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1394 
1395  if (doSlow) {
1396  GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream);
1397 
1398  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
1399  lata, latb, latc,
1400  simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream);
1401  }
1402 }
1403 
1404 //
1405 // Calculate electrostatic & VdW forces
1406 //
1407 void CudaComputeNonbonded::doForce() {
1408  cudaCheck(cudaSetDevice(deviceID));
1409  SimParameters *simParams = Node::Object()->simParameters;
1410  // XXX TODO: This will not work if the patch flags are not correctly set
1411  Lattice lattice = patches[0].patch->flags.lattice;
1412  bool CUDASOAintegrator = simParams->CUDASOAintegrate;
1413  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1414  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1415  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1416  bool doPairlist = savePairlists || (!usePairlists);
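 // Pairlists are (re)built whenever the flags request saving them or the previous
 // lists cannot be reused this step.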
1417  bool doFEP=false, doTI=false, doAlchVdwForceSwitching=false;
1418  if(doAlch){
1419  static thread_local bool firsttime = true;
1420  doTI = simParams->alchThermIntOn;
1421  doFEP = simParams->alchFepOn;
1422  doAlchVdwForceSwitching = simParams->vdwForceSwitching;
1423  // The alchemical flags below are updated only when the lambda window changes.
1424  // getCurrentLambda and getCurrentLambda2 are assumed to have no side effects.
1425  // use float here to match the type of CudaAlchLambdas
1426  const decltype(alchFlags.lambdaUp) currentLambda = simParams->getCurrentLambda(patches[0].patch->flags.step);
1427  const decltype(alchFlags.lambda2Up) currentLambda2 = simParams->getCurrentLambda2(patches[0].patch->flags.step);
1428  if (firsttime) {
1429  // Update the alchemical flags if this is the first time
1430  firsttime = false;
1431  lambdaWindowUpdated = true;
1432  } else {
1433  // Compare the above parameters with respect to the saved parameters.
1434  if (alchFlags.lambdaUp != currentLambda ||
1435  alchFlags.lambda2Up != currentLambda2 ||
1436  // Could the following parameters also be changed?
1437  // I am not quite sure, but checking them by CPU code is not computationally expensive anyway.
1438  alchFlags.cutoff2 != ComputeNonbondedUtil::cutoff2 ||
1442  alchFlags.scaling != ComputeNonbondedUtil::scaling) {
1443  lambdaWindowUpdated = true;
1444  } else {
1445  lambdaWindowUpdated = false;
1446  }
1447  }
1448  if (lambdaWindowUpdated) {
1449  // Flags that are independent of the number of steps
1455  const double factor = alchFlags.cutoff2 - alchFlags.switchdist2;
1456  // When switching is off, cutoff is the same as switchdist,
1457  // so we need to check it to avoid passing inf for the computation of switchmul and switchmul2
1458  alchFlags.switchfactor = simParams->switchingActive ? 1.0/(factor*factor*factor) : 0;
1459  // Step-dependent parameters (lambdas)
1460  // alchFlags.alchLambda is redundant because we have lambdaUp already.
1461  // alchFlags.alchLambda = currentLambda;
1462  alchFlags.lambdaUp = currentLambda;
1463  alchFlags.lambdaDown = 1.0 - alchFlags.lambdaUp;
1464  alchFlags.elecLambdaUp = simParams->getElecLambda(alchFlags.lambdaUp);
1465  alchFlags.elecLambdaDown = simParams->getElecLambda(alchFlags.lambdaDown);
1466  alchFlags.vdwLambdaUp = simParams->getVdwLambda(alchFlags.lambdaUp);
1467  alchFlags.vdwLambdaDown = simParams->getVdwLambda(alchFlags.lambdaDown);
1468  alchFlags.lambda2Up = currentLambda2;
1469  alchFlags.lambda2Down = 1.0 - alchFlags.lambda2Up;
1470  alchFlags.elecLambda2Up = simParams->getElecLambda(alchFlags.lambda2Up);
1471  alchFlags.elecLambda2Down = simParams->getElecLambda(alchFlags.lambda2Down);
1472  alchFlags.vdwLambda2Up = simParams->getVdwLambda(alchFlags.lambda2Up);
1473  alchFlags.vdwLambda2Down = simParams->getVdwLambda(alchFlags.lambda2Down);
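 // The vdwShift* values implement the alchemical soft-core shift,
 // alchVdwShiftCoeff*(1 - vdwLambda), evaluated for both lambda windows and both
 // alchemical directions.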
1474  alchFlags.vdwShiftUp = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambdaUp);
1475  alchFlags.vdwShift2Up = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambda2Up);
1476  alchFlags.vdwShiftDown = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambdaDown);
1477  alchFlags.vdwShift2Down = alchFlags.alchVdwShiftCoeff*(1 - alchFlags.vdwLambda2Down);
1478  }
1479  }
1480 
1481  if (doPairlist) {
1482  int numTileLists = calcNumTileLists();
1483 
1484  // Build initial tile lists and sort
1485  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
1486  maxTileListLen, lata, latb, latc,
1487  cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream, atomsChanged, doAlch,
1488  CUDASOAintegrator, simParams->useDeviceMigration);
1489  // Prepare tile list for atom-based refinement
1490  tileListKernel.prepareTileList(stream);
1491  tileListKernel.clearTileListStat(stream);
1492  }
1493 
1494  if (atomsChanged) {
1495  // Update Vdw types and exclusion index & maxdiff
1496  updateVdwTypesExcl();
1497  }
1498 
1499  beforeForceCompute = CkWallTimer();
1500 
1501  // Calculate forces (and refine tile list if atomsChanged=true)
1502 #if 0
1503  if(atomsChanged){
1504  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1505  PatchData *patchData = cpdata.ckLocalBranch();
1506 
1507  CmiLock(patchData->printlock);
1508  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
1509  for (int p = 0; p < patches.size(); p++) {
1510  fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
1511  for (int i = 0; i < patches[p].numAtoms; i++) {
1512  const int ai = i + patches[p].atomStart;
1513  fprintf(stderr, "POS[%d,%d,%d] = %lf %lf %lf %lf. Type %d\n", i, ai, atomIndex[ai],
1514  atoms[ai].x, atoms[ai].y, atoms[ai].z, atoms[ai].q, vdwTypes[ai]);
1515  }
1516  }
1517  CmiUnlock(patchData->printlock);
1518  }
1519 #endif
1520 
1521  const bool doTable = CudaComputeNonbonded::getDoTable(params, doSlow, doVirial);
1523 
1524 #ifdef DEBUG_MINIMIZE
1525  printf("%s, line %d:\n", __FILE__, __LINE__);
1526  printf(" atomsChanged = %d\n", atomsChanged);
1527  printf(" doMinimize = %d\n", doMinimize);
1528  printf(" doPairlist = %d\n", doPairlist);
1529  printf(" doEnergy = %d\n", doEnergy);
1530  printf(" doVirial = %d\n", doVirial);
1531  printf(" doSlow = %d\n", doSlow);
1532  printf("\n");
1533 #endif
1534 
1535 
1536  nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize,
1537  atomsChanged, doMinimize, doPairlist, doEnergy, doVirial,
1538  doSlow, doAlch, doAlchVdwForceSwitching, doFEP, doTI,
1539  doTable, lata, latb, latc,
1540  (const float4*)atoms, cutoff2, c,
1541  d_forces, d_forcesSlow, h_forces, h_forcesSlow,
1542  &alchFlags, lambdaWindowUpdated, part, CUDASOAintegrator,
1543  params->useDeviceMigration, stream);
1544 
1545  if (doPairlist) {
1546  tileListKernel.finishTileList(stream);
1547  }
1548 
1549 // TODO remove once GPU migration has been merged
1550 #ifdef NODEGROUP_FORCE_REGISTER
1551 
1552  updatePatchRecord();
1553 
1554 #endif
1555 
1556 
1557 #if 0
1558  if(atomsChanged){
1559 
1560  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1561  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1562  cudaStreamSynchronize(stream);
1563  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1564  PatchData *patchData = cpdata.ckLocalBranch();
1565 
1566  CmiLock(patchData->printlock);
1567  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
1568  for (int p = 0; p < patches.size(); p++) {
1569  fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
1570  for (int i = 0; i < patches[p].numAtoms; i++) {
1571  const int ai = i + patches[p].atomStart;
1572  fprintf(stderr, "POS[%d,%d,%d] = Type %d (%lf %lf %lf) (%lf %lf %lf)\n", i, ai, atomIndex[ai],
1573  vdwTypes[ai],
1574  h_forces[ai].x, h_forces[ai].y, h_forces[ai].z,
1575  h_forcesSlow[ai].x, h_forcesSlow[ai].y, h_forcesSlow[ai].z);
1576  }
1577  }
1578  CmiUnlock(patchData->printlock);
1579  }
1580 #endif
1581 
1582 
1583 
1584 
1585  traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
1586 }
1587 
1588 #ifdef NODEGROUP_FORCE_REGISTER
1589 void CudaComputeNonbonded::updatePatchRecord() {
1590  // register device pointers inside nodegroup for later integration
1591  // these can be moved inside atomsChanged later
1592  int devInd = deviceCUDA->getDeviceIndex();
1593  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1594  PatchData *patchData = cpdata.ckLocalBranch();
1595  PatchMap* patchMap = PatchMap::Object();
1596  patchData->devData[devInd].f_nbond = d_forces;
1597  patchData->devData[devInd].f_nbond_slow = d_forcesSlow;
1598  patchData->devData[devInd].f_nbond_size = atomStorageSize;
1599  // device pointer to CudaPatchRecord
1600  patchData->devData[devInd].nbond_precord = tileListKernel.getCudaPatches();
1601  patchData->devData[devInd].nb_precord_size = tileListKernel.getCudaPatchesSize();
1602  patchData->devData[devInd].nb_datoms = tileListKernel.get_xyzq();
1603  patchData->devData[devInd].nbond_tkernel = &tileListKernel;
1604  patchData->devData[devInd].size_nb_datoms = atomStorageSize;
1605 }
1606 #endif
1607 
1608 //
1609 // Count an upper estimate for the number of tile lists
1610 //
1611 int CudaComputeNonbonded::calcNumTileLists() {
1612  int numTileLists = 0;
1613  for (int i=0;i < computes.size();i++) {
1614  int pi1 = computes[i].patchInd[0];
1615  int numAtoms1 = patches[pi1].numAtoms;
1616 #ifdef NAMD_HIP
1617  int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(numAtoms1, BOUNDINGBOXSIZE);
1618 #else
1619  int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(numAtoms1, WARPSIZE);
1620 #endif
1621  numTileLists += numTiles1;
1622  }
1623  return numTileLists;
1624 }
1625 
1626 //
1627 // Finish & submit reductions
1628 //
1629 void CudaComputeNonbonded::finishReductions() {
1630  if (CkMyPe() != masterPe)
1631  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1632 
1633  // fprintf(stderr, "PE[%d]: Nbond finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1634  if (!doSkip) {
1635 
1636  if (doStreaming && (doVirial || doEnergy)) {
1637  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1638  if (!forceDoneEventRecord)
1639  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1640  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1641  forceDoneEventRecord = false;
1642  }
1643 
1644  if (doVirial) {
1645  // if(params->CUDASOAintegrate) cudaCheck(cudaStreamSynchronize(stream));
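 // h_virialEnergy->virial holds the 3x3 virial in row-major order; unpack it into a Tensor for the reduction.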
1646  Tensor virialTensor;
1647  virialTensor.xx = h_virialEnergy->virial[0];
1648  virialTensor.xy = h_virialEnergy->virial[1];
1649  virialTensor.xz = h_virialEnergy->virial[2];
1650  virialTensor.yx = h_virialEnergy->virial[3];
1651  virialTensor.yy = h_virialEnergy->virial[4];
1652  virialTensor.yz = h_virialEnergy->virial[5];
1653  virialTensor.zx = h_virialEnergy->virial[6];
1654  virialTensor.zy = h_virialEnergy->virial[7];
1655  virialTensor.zz = h_virialEnergy->virial[8];
1656  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1657  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1658  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1659 #ifdef NODEGROUP_FORCE_REGISTER
1660  if (params->CUDASOAintegrate) {
1661  ADD_TENSOR_OBJECT(nodeReduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1662  } else
1663 #endif
1664  {
1665  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1666  }
1667  if (doSlow) {
1668  Tensor virialTensor;
1669  virialTensor.xx = h_virialEnergy->virialSlow[0];
1670  virialTensor.xy = h_virialEnergy->virialSlow[1];
1671  virialTensor.xz = h_virialEnergy->virialSlow[2];
1672  virialTensor.yx = h_virialEnergy->virialSlow[3];
1673  virialTensor.yy = h_virialEnergy->virialSlow[4];
1674  virialTensor.yz = h_virialEnergy->virialSlow[5];
1675  virialTensor.zx = h_virialEnergy->virialSlow[6];
1676  virialTensor.zy = h_virialEnergy->virialSlow[7];
1677  virialTensor.zz = h_virialEnergy->virialSlow[8];
1678  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1679  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1680  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1681 #ifdef NODEGROUP_FORCE_REGISTER
1682  if (params->CUDASOAintegrate) {
1683  ADD_TENSOR_OBJECT(nodeReduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1684  } else
1685 #endif
1686  {
1687  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1688  }
1689  }
1690  }
1691  if (doEnergy) {
1692  // if (doSlow)
1693 
1694 #ifdef NODEGROUP_FORCE_REGISTER
1695  if (params->CUDASOAintegrate) {
1696  nodeReduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1697  nodeReduction->item(REDUCTION_LJ_ENERGY_F) += h_virialEnergy->energyVdw_s;
1698  nodeReduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((params->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1699  nodeReduction->item(REDUCTION_ELECT_ENERGY_F) += h_virialEnergy->energyElec_s;
1700 
1701  //Reduce values for TI
1702  nodeReduction->item(REDUCTION_LJ_ENERGY_TI_1) += h_virialEnergy->energyVdw_ti_1;
1703  nodeReduction->item(REDUCTION_LJ_ENERGY_TI_2) += h_virialEnergy->energyVdw_ti_2;
1704  nodeReduction->item(REDUCTION_ELECT_ENERGY_TI_1) += h_virialEnergy->energyElec_ti_1;
1705  nodeReduction->item(REDUCTION_ELECT_ENERGY_TI_2) += h_virialEnergy->energyElec_ti_2;
1706  } else
1707 #endif
1708  {
1709  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1710  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1711  reduction->item(REDUCTION_LJ_ENERGY_F) += h_virialEnergy->energyVdw_s;
1712  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((params->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1713  reduction->item(REDUCTION_ELECT_ENERGY_F) += h_virialEnergy->energyElec_s;
1714 
1715  //Reduce values for TI
1716  reduction->item(REDUCTION_LJ_ENERGY_TI_1) += h_virialEnergy->energyVdw_ti_1;
1717  reduction->item(REDUCTION_LJ_ENERGY_TI_2) += h_virialEnergy->energyVdw_ti_2;
1718  reduction->item(REDUCTION_ELECT_ENERGY_TI_1) += h_virialEnergy->energyElec_ti_1;
1719  reduction->item(REDUCTION_ELECT_ENERGY_TI_2) += h_virialEnergy->energyElec_ti_2;
1720  }
1721 
1722  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1723  if (doSlow){
1724 #ifdef NODEGROUP_FORCE_REGISTER
1725  if (params->CUDASOAintegrate) {
1726  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1727  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += h_virialEnergy->energySlow_s;
1728  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += h_virialEnergy->energySlow_ti_1;
1729  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += h_virialEnergy->energySlow_ti_2;
1730  //fprintf(stderr, "NB h_virialEnergy->energySlow %lf\n", h_virialEnergy->energySlow);
1731  } else
1732 #endif
1733  {
1734  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1735  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += h_virialEnergy->energySlow_s;
1736  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += h_virialEnergy->energySlow_ti_1;
1737  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += h_virialEnergy->energySlow_ti_2;
1738  //fprintf(stderr, "NB h_virialEnergy->energySlow %lf\n", h_virialEnergy->energySlow);
1739  }
1740  }
1741  }
1742 
1743 #ifdef NODEGROUP_FORCE_REGISTER
1744  if (params->CUDASOAintegrate) {
1745  nodeReduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1746  } else
1747 #endif
1748  {
1749  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1750  }
1751  }
1752 #ifdef NODEGROUP_FORCE_REGISTER
1753  if (params->CUDASOAintegrate) {
1754  nodeReduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1755  } else
1756 #endif
1757  {
1758  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1759  }
1760 
1761 
1762  // TODO: avoid submitting this reduction every timestep.
1763  if(!params->CUDASOAintegrate ) reduction->submit();
1764  // Reset flags
1765  doSkip = false;
1766  computesChanged = false;
1767 }
1768 
1769 //
1770 // Finish a single patch
1771 //
1772 void CudaComputeNonbonded::finishPatch(int i) {
1773  if (CkMyPe() != patches[i].pe)
1774  NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
1775 
1776  PatchMap *map;
1777  PatchRecord &pr = patches[i];
1778  pr.results = pr.forceBox->open();
1779  SimParameters *simParams = Node::Object()->simParameters;
1780 
1781  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1782  int atomStart = pr.atomStart;
1783  int numAtoms = pr.numAtoms;
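 // Accumulate GPU forces into the patch's force arrays; aExt[k].sortOrder maps the
 // device atom ordering back to the patch atom ordering.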
1784 #ifdef NODEGROUP_FORCE_REGISTER
1785  if (numAtoms > 0 && (!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration))) {
1786  Force *f = pr.results->f[Results::nbond];
1787  Force *f_slow = pr.results->f[Results::slow];
1788  float4 *af = h_forces + atomStart;
1789  float4 *af_slow = h_forcesSlow + atomStart;
1790  // float maxf = 0.0f;
1791  // int maxf_k;
1792  for ( int k=0; k<numAtoms; ++k ) {
1793  int j = aExt[k].sortOrder;
1794 #ifdef DEBUG_MINIMIZE
1795  if (j == 0) {
1796  printf("%s, line %d\n", __FILE__, __LINE__);
1797  printf(" before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1798  }
1799 #endif
1800  f[j].x += af[k].x;
1801  f[j].y += af[k].y;
1802  f[j].z += af[k].z;
1803 #ifdef DEBUG_MINIMIZE
1804  if (j == 0) {
1805  printf(" after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1806  }
1807 #endif
1808  // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
1809  // maxf = std::max(maxf, fabsf(af[k].x));
1810  // maxf = std::max(maxf, fabsf(af[k].y));
1811  // maxf = std::max(maxf, fabsf(af[k].z));
1812  // maxf_k = k;
1813  // }
1814  if ( doSlow ) {
1815  f_slow[j].x += af_slow[k].x;
1816  f_slow[j].y += af_slow[k].y;
1817  f_slow[j].z += af_slow[k].z;
1818  }
1819  }
1820  // if (maxf > 10000.0f) {
1821  // fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
1822  // cudaCheck(cudaStreamSynchronize(stream));
1823  // NAMD_die("maxf!");
1824  // }
1825  }
1826 #else
1827  if (numAtoms > 0) {
1828  Force *f = pr.results->f[Results::nbond];
1829  Force *f_slow = pr.results->f[Results::slow];
1830  float4 *af = h_forces + atomStart;
1831  float4 *af_slow = h_forcesSlow + atomStart;
1832  // float maxf = 0.0f;
1833  // int maxf_k;
1834  for ( int k=0; k<numAtoms; ++k ) {
1835  int j = aExt[k].sortOrder;
1836 #ifdef DEBUG_MINIMIZE
1837  if (j == 0) {
1838  printf("%s, line %d\n", __FILE__, __LINE__);
1839  printf(" before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1840  }
1841 #endif
1842  f[j].x += af[k].x;
1843  f[j].y += af[k].y;
1844  f[j].z += af[k].z;
1845 #ifdef DEBUG_MINIMIZE
1846  if (j == 0) {
1847  printf(" after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
1848  }
1849 #endif
1850  // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
1851  // maxf = std::max(maxf, fabsf(af[k].x));
1852  // maxf = std::max(maxf, fabsf(af[k].y));
1853  // maxf = std::max(maxf, fabsf(af[k].z));
1854  // maxf_k = k;
1855  // }
1856  if ( doSlow ) {
1857  f_slow[j].x += af_slow[k].x;
1858  f_slow[j].y += af_slow[k].y;
1859  f_slow[j].z += af_slow[k].z;
1860  }
1861  }
1862  // if (maxf > 10000.0f) {
1863  // fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
1864  // cudaCheck(cudaStreamSynchronize(stream));
1865  // NAMD_die("maxf!");
1866  // }
1867  }
1868 #endif
1869  // should I skip the close()?
1870  // do I need to close it even if there's a migration?
1871  if(!simParams->CUDASOAintegrate || atomsChanged){
1872  pr.positionBox->close(&(pr.compAtom));
1873  pr.forceBox->close(&(pr.results));
1874  }
1875 }
1876 
1877 //
1878 // Finish a set of patches on this pe
1879 //
1880 void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
1881  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
1882  if (patchSet.size() == 0)
1883  NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
1884  SimParameters *simParams = Node::Object()->simParameters;
1885  // Save value of gbisPhase here because it can change after the last finishGBISPhase() or finishPatch() is called
1886  int gbisPhaseSave = gbisPhase;
1887  // Close Boxes depending on Phase
1888  if (simParams->GBISOn) {
1889  for (int i=0;i < patchSet.size();i++) {
1890  finishGBISPhase(patchSet[i]);
1891  }
1892  }
1893  // Finish patches
1894  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1895  for (int i=0;i < patchSet.size();i++) {
1896  finishPatch(patchSet[i]);
1897  }
1898  }
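 // Decrement the shared patch counter under the lock; the last PE to finish triggers the reductions below.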
1899  bool done = false;
1900  CmiLock(lock);
1901  patchesCounter -= patchSet.size();
1902  if(params->CUDASOAintegrate && !atomsChanged){
1903  // masterPe is executing this, so we can go ahead and do the
1904  // reductions here. However, for migrations we still need to
1905  // follow the usual code path because of the box setup
1906  patchesCounter = 0;
1907  }
1908  if (patchesCounter == 0) {
1909  patchesCounter = getNumPatches();
1910  done = true;
1911  }
1912  CmiUnlock(lock);
1913  if (done) {
1914  // Do reductions
1915  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1916  // Reduction must be done on masterPe
1917  if(params->CUDASOAintegrate ){
1918  if(!atomsChanged) this->finishReductions();
1919  }
1920  else computeMgr->sendFinishReductions(masterPe, this);
1921  }
1922  }
1923 
1924  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
1925 
1926 }
1927 
1928 //
1929 // Finish all patches that are on this pe
1930 //
1931 void CudaComputeNonbonded::finishPatchesOnPe() {
1932  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1933 }
1934 
1935 //
1936 // Finish single patch on this pe
1937 //
1938 void CudaComputeNonbonded::finishPatchOnPe(int i) {
1939  std::vector<int> v(1, i);
1940  finishSetOfPatchesOnPe(v);
1941 }
1942 
1943 void CudaComputeNonbonded::finishPatches() {
1944  if(params->CUDASOAintegrate){
1945  if (atomsChanged || doEnergy || doVirial) cudaCheck(cudaStreamSynchronize(stream));
1946  this->finishPatchesOnPe();
1947  }
1948  else {
1949  computeMgr->sendFinishPatchesOnPe(pes, this);
1950  }
1951 }
1952 
1953 void CudaComputeNonbonded::finishGBISPhase(int i) {
1954  if (CkMyPe() != patches[i].pe)
1955  NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
1956  PatchRecord &pr = patches[i];
1957  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1958  int atomStart = pr.atomStart;
1959  if (gbisPhase == 1) {
1960  GBReal *psiSumMaster = psiSumH + atomStart;
1961  for ( int k=0; k<pr.numAtoms; ++k ) {
1962  int j = aExt[k].sortOrder;
1963  pr.psiSum[j] += psiSumMaster[k];
1964  }
1965  pr.psiSumBox->close(&(pr.psiSum));
1966  } else if (gbisPhase == 2) {
1967  GBReal *dEdaSumMaster = dEdaSumH + atomStart;
1968  for ( int k=0; k<pr.numAtoms; ++k ) {
1969  int j = aExt[k].sortOrder;
1970  pr.dEdaSum[j] += dEdaSumMaster[k];
1971  }
1972  pr.dEdaSumBox->close(&(pr.dEdaSum));
1973  } else if (gbisPhase == 3) {
1974  pr.intRadBox->close(&(pr.intRad)); //box 6
1975  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1976  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1977  } //end phases
1978 }
1979 
1980 void CudaComputeNonbonded::finishTimers() {
1981  SimParameters *simParams = Node::Object()->simParameters;
1982 
1983  if (simParams->GBISOn) {
1984  if (gbisPhase == 1)
1985  traceUserBracketEvent(CUDA_GBIS1_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1986  if (gbisPhase == 2)
1987  traceUserBracketEvent(CUDA_GBIS2_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1988  if (gbisPhase == 3)
1989  traceUserBracketEvent(CUDA_GBIS3_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1990  } else {
1991  traceUserBracketEvent(CUDA_NONBONDED_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1992  }
1993 }
1994 
1995 //
1996 // Re-sort tile lists if necessary
1997 //
1998 void CudaComputeNonbonded::reSortTileLists() {
1999  // Re-sort tile lists
2000  SimParameters *simParams = Node::Object()->simParameters;
2001  cudaCheck(cudaSetDevice(deviceID));
2002 #ifdef NAMD_HIP
2003  tileListKernel.reSortTileLists(simParams->GBISOn, simParams->CUDASOAintegrateMode, stream);
2004 #else
2005  tileListKernel.reSortTileLists(simParams->GBISOn, stream);
2006 #endif
2007 }
2008 
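// Polling callback run on masterPe: with streaming, drain patchReadyQueue and finish patches
// as they become ready; otherwise poll forceDoneEvent and finish all patches once it fires.
// Re-armed via CcdCallFnAfter while work is still outstanding.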
2009 void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
2010  CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;
2011 
2012  if (CkMyPe() != c->masterPe)
2013  NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
2014 
2015  SimParameters *simParams = Node::Object()->simParameters;
2016  cudaCheck(cudaSetDevice(c->deviceID));
2017 
2018  if (c->doStreaming) {
2019  int patchInd;
2020  while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
2021  c->patchReadyQueue[c->patchReadyQueueNext] = -1;
2022  c->patchReadyQueueNext++;
2023  c->checkCount = 0;
2024 
2025  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
2026  c->finishTimers();
2027  if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
2028  c->reSortTileLists();
2029  c->reSortDone = true;
2030  if (simParams->GBISOn && c->gbisPhase == 1) {
2031  // We must do GBIS Phase 1
2032  c->doGBISphase1();
2033  c->forceDoneSetCallback();
2034  return;
2035  }
2036  }
2037  }
2038 
2039  // Finish patch
2040  int pe = c->patches[patchInd].pe;
2041  PatchID patchID = c->patches[patchInd].patchID; // for priority
2042  //c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
2043  if(c->params->CUDASOAintegrate) c->finishPatchOnPe(patchInd);
2044  else c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
2045 
2046  // Last patch, return
2047  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
2048 
2049  }
2050  } else {
2051  if (!c->forceDoneEventRecord)
2052  NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
2053  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
2054  if (err == cudaSuccess) {
2055  // Event has occurred
2056  c->forceDoneEventRecord = false;
2057  c->checkCount = 0;
2058  c->finishTimers();
2059  if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
2060  c->reSortTileLists();
2061  c->reSortDone = true;
2062  if (simParams->GBISOn && c->gbisPhase == 1) {
2063  // We must do GBIS Phase 1
2064  c->doGBISphase1();
2065  c->forceDoneSetCallback();
2066  return;
2067  }
2068  }
2069  c->finishPatches();
2070  return;
2071  } else if (err != cudaErrorNotReady) {
2072  // Anything else is an error
2073  char errmsg[256];
2074  sprintf(errmsg,"in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
2075  c->checkCount, walltime - c->beforeForceCompute);
2076  cudaDie(errmsg,err);
2077  }
2078  }
2079 
2080  // if (c->checkCount % 1000 == 0)
2081  // fprintf(stderr, "c->patchReadyQueueNext %d\n", c->patchReadyQueueNext);
2082 
2083  // Event has not occurred
2084  c->checkCount++;
2085  if (c->checkCount >= 1000000) {
2086  char errmsg[256];
2087  sprintf(errmsg,"CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
2088  c->checkCount, walltime - c->beforeForceCompute);
2089  cudaDie(errmsg,cudaSuccess);
2090  }
2091 
2092  // Call again
2093  CcdCallBacksReset(0, walltime);
2094  // we need to do this only for the first timestep I guess?
2095 
2096  if(!c->params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, arg, 0.1);
2097 }
2098 
2099 //
2100 // Set call back for all the work in the stream at this point
2101 //
2102 void CudaComputeNonbonded::forceDoneSetCallback() {
2103  if (CkMyPe() != masterPe)
2104  NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
2105  beforeForceCompute = CkWallTimer();
2106  cudaCheck(cudaSetDevice(deviceID));
2107  if (!doStreaming || doVirial || doEnergy) {
2108  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
2109  forceDoneEventRecord = true;
2110  }
2111  checkCount = 0;
2112  CcdCallBacksReset(0, CmiWallTimer());
2113  // Set the call back at 0.1ms
2114  if(!params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, this, 0.1);
2115 }
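// Comparator used by updateComputes(): orders compute pairs by the squared length of their
// periodic-image offset so the closest images come first.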
2116 
2117 struct cr_sortop_distance {
2118  const Lattice &l;
2119  cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
2120  bool operator() (CudaComputeNonbonded::ComputeRecord i,
2121  CudaComputeNonbonded::ComputeRecord j) {
2122  Vector a = l.a();
2123  Vector b = l.b();
2124  Vector c = l.c();
2125  BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
2126  BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
2127  return ( ri < rj );
2128  }
2129 };
2130 
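// Compare a and b by their bit-reversed values (least-significant bit first); used below to
// break rank ties in the streaming priority sort.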
2131 static inline bool sortop_bitreverse(int a, int b) {
2132  if ( a == b ) return 0;
2133  for ( int bit = 1; bit; bit *= 2 ) {
2134  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
2135  }
2136  return 0;
2137 }
2138 
2139 struct cr_sortop_reverse_priority {
2140  cr_sortop_distance &distop;
2141  const CudaComputeNonbonded::PatchRecord *pr;
2142  cr_sortop_reverse_priority(cr_sortop_distance &sod,
2143  const CudaComputeNonbonded::PatchRecord *patchrecs) : distop(sod), pr(patchrecs) { }
2144  bool pid_compare_priority(int2 pidi, int2 pidj) {
2145  const CudaComputeNonbonded::PatchRecord &pri = pr[pidi.y];
2146  const CudaComputeNonbonded::PatchRecord &prj = pr[pidj.y];
2147  if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
2148  if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
2149  if ( pri.isSameNode && ! prj.isSameNode ) return 0;
2150  if ( prj.isSameNode && ! pri.isSameNode ) return 1;
2151  if ( pri.isSameNode ) { // and prj.isSameNode
2152  int rpri = pri.reversePriorityRankInPe;
2153  int rprj = prj.reversePriorityRankInPe;
2154  if ( rpri != rprj ) return rpri > rprj;
2155  return sortop_bitreverse(CkRankOf(pri.pe),CkRankOf(prj.pe));
2156  }
2157  int ppi = PATCH_PRIORITY(pidi.x);
2158  int ppj = PATCH_PRIORITY(pidj.x);
2159  if ( ppi != ppj ) return ppi < ppj;
2160  return pidi.x < pidj.x;
2161  }
2162  bool operator() (CudaComputeNonbonded::ComputeRecord j,
2163  CudaComputeNonbonded::ComputeRecord i) { // i and j reversed
2164  // Choose patch i (= patch with greater priority)
2165  int2 pidi = pid_compare_priority(make_int2(i.pid[0], i.patchInd[0]), make_int2(i.pid[1], i.patchInd[1])) ? make_int2(i.pid[0], i.patchInd[0]) : make_int2(i.pid[1], i.patchInd[1]);
2166  // Choose patch j
2167  int2 pidj = pid_compare_priority(make_int2(j.pid[0], j.patchInd[0]), make_int2(j.pid[1], j.patchInd[1])) ? make_int2(j.pid[0], j.patchInd[0]) : make_int2(j.pid[1], j.patchInd[1]);
2168  if ( pidi.x != pidj.x ) return pid_compare_priority(pidi, pidj);
2169  return distop(i,j);
2170  }
2171 };
2172 
2173 //
2174 // Setup computes. This is only done at the beginning and at load balancing, hence the lack of
2175 // consideration for performance in the CPU->GPU memory copy.
2176 //
2177 void CudaComputeNonbonded::updateComputes() {
2178  cudaCheck(cudaSetDevice(deviceID));
2179 
2180  Lattice lattice = patches[0].patch->flags.lattice;
2181  cr_sortop_distance so(lattice);
2182  std::stable_sort(computes.begin(), computes.end(), so);
2183 
2184  if (doStreaming) {
2185  cr_sortop_reverse_priority sorp(so, patches.data());
2186  std::stable_sort(computes.begin(), computes.end(), sorp);
2187  }
2188 
2189  CudaComputeRecord* cudaComputes = new CudaComputeRecord[computes.size()];
2190 
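 // Pack each compute's patch indices and periodic offset into a CudaComputeRecord
 // and upload the array to the device.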
2191  for (int i=0;i < computes.size();i++) {
2192  cudaComputes[i].patchInd.x = computes[i].patchInd[0];
2193  cudaComputes[i].patchInd.y = computes[i].patchInd[1];
2194  cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
2195  cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
2196  cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
2197  }
2198 
2199  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
2200  cudaCheck(cudaStreamSynchronize(stream));
2201 
2202  delete [] cudaComputes;
2203 }
2204 
2205 struct exlist_sortop {
2206  bool operator() (int32 *li, int32 *lj) {
2207  return ( li[1] < lj[1] );
2208  }
2209 };
2210 
2211 //
2212 // Builds the exclusions table. Swiped from ComputeNonbondedCUDA.C
2213 //
2214 void CudaComputeNonbonded::buildExclusions() {
2215  cudaCheck(cudaSetDevice(deviceID));
2216 
2217  Molecule *mol = Node::Object()->molecule;
2218 
2219 #ifdef MEM_OPT_VERSION
2220  int natoms = mol->exclSigPoolSize;
2221 #else
2222  int natoms = mol->numAtoms;
2223 #endif
2224 
2225  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
2226  exclusionsByAtom = new int2[natoms];
2227 
2228  // create unique sorted lists
2229 
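 // Each unique exclusion list is laid out as: [0] = length, [1] = maxdiff (largest absolute
 // atom-index offset), [2] = starting bit position in the packed exclusion bit array,
 // [3..] = sorted signed offsets relative to the atom itself.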
2230  ObjectArena<int32> listArena;
2231  ResizeArray<int32*> unique_lists;
2232  int32 **listsByAtom = new int32*[natoms];
2233  SortableResizeArray<int32> curList;
2234  for ( int i=0; i<natoms; ++i ) {
2235  curList.resize(0);
2236  curList.add(0); // always excluded from self
2237 #ifdef MEM_OPT_VERSION
2238  const ExclusionSignature *sig = mol->exclSigPool + i;
2239  int n = sig->fullExclCnt;
2240  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
2241  n += 1;
2242 #else
2243  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
2244  int n = mol_list[0] + 1;
2245  for ( int j=1; j<n; ++j ) {
2246  curList.add(mol_list[j] - i);
2247  }
2248 #endif
2249  curList.sort();
2250 
2251  int j;
2252  for ( j=0; j<unique_lists.size(); ++j ) {
2253  if ( n != unique_lists[j][0] ) continue; // no match
2254  int k;
2255  for ( k=0; k<n; ++k ) {
2256  if ( unique_lists[j][k+3] != curList[k] ) break;
2257  }
2258  if ( k == n ) break; // found match
2259  }
2260  if ( j == unique_lists.size() ) { // no match
2261  int32 *list = listArena.getNewArray(n+3);
2262  list[0] = n;
2263  int maxdiff = 0;
2264  maxdiff = -1 * curList[0];
2265  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
2266  list[1] = maxdiff;
2267  for ( int k=0; k<n; ++k ) {
2268  list[k+3] = curList[k];
2269  }
2270  unique_lists.add(list);
2271  }
2272  listsByAtom[i] = unique_lists[j];
2273  }
2274  // sort lists by maxdiff
2275  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
2276  long int totalbits = 0;
2277  int nlists = unique_lists.size();
2278  for ( int j=0; j<nlists; ++j ) {
2279  int32 *list = unique_lists[j];
2280  int maxdiff = list[1];
2281  list[2] = totalbits + maxdiff;
2282  totalbits += 2*maxdiff + 1;
2283  }
2284  for ( int i=0; i<natoms; ++i ) {
2285  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
2286  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
2287  }
2288  delete [] listsByAtom;
2289 
2290  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
2291 
2292  {
2293  long int bytesneeded = totalbits / 8;
2294  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
2295  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
2296  unique_lists.size(), bytesneeded);
2297  }
2298 
2299  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
2300  if ( bytesneeded > bytesavail ) {
2301  char errmsg[512];
2302  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
2303  "but only %ld bytes can be addressed with 32-bit int.",
2304  unique_lists.size(), bytesneeded, bytesavail);
2305  NAMD_die(errmsg);
2306  }
2307  }
2308 
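// Set bit (BASE+DIFF) in the packed 32-bit exclusion bit array.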
2309 #define SET_EXCL(EXCL,BASE,DIFF) \
2310  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
2311 
2312  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
2313  memset(exclusion_bits, 0, totalbits/8);
2314 
2315  long int base = 0;
2316  for ( int i=0; i<unique_lists.size(); ++i ) {
2317  base += unique_lists[i][1];
2318  if ( unique_lists[i][2] != (int32)base ) {
2319  NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
2320  }
2321  int n = unique_lists[i][0];
2322  for ( int j=0; j<n; ++j ) {
2323  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
2324  }
2325  base += unique_lists[i][1] + 1;
2326  }
2327 
2328  int numExclusions = totalbits/32;
2329 
2330  nonbondedKernel.bindExclusions(numExclusions, exclusion_bits);
2331 
2332 
2333  SimParameters *simParams = Node::Object()->simParameters;
2334  if(simParams->CUDASOAintegrate && simParams->useDeviceMigration){
2335  nonbondedKernel.setExclusionsByAtom(exclusionsByAtom, natoms);
2336  }
2337 
2338  delete [] exclusion_bits;
2339 }
2340 
2341 CudaNBConstants CudaComputeNonbonded::getNonbondedCoef(SimParameters *params) {
2342  const float cutoff = ComputeNonbondedUtil::cutoff;
2343  const float cutoff2 = ComputeNonbondedUtil::cutoff2;
2344  const float cutoffInv = 1.0f / cutoff;
2345  const float cutoff2Inv = 1.0f / cutoff2;
2346  const float scutoff = ComputeNonbondedUtil::switchOn;
2347  const float scutoff2 = ComputeNonbondedUtil::switchOn2;
2348  const float scutoff2Inv = 1.0f / scutoff2;
2349  const float scutoff_denom = ComputeNonbondedUtil::c1;
2350  const float ewaldcof = ComputeNonbondedUtil::ewaldcof;
2351  const float pi_ewaldcof = ComputeNonbondedUtil::pi_ewaldcof;
2352  const float slowScale = ((float) simParams->fullElectFrequency) / simParams->nonbondedFrequency;
2353 
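 // Pack the cutoff, van der Waals switching, Ewald, and slow-force scaling constants
 // consumed by the direct-math (table-free) kernel path.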
2354  CudaNBConstants c;
2355  c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
2356  c.lj_1 = scutoff_denom * 2.0f;
2357  c.lj_2 = scutoff_denom * -12.0f;
2358  c.lj_3 = 12.0f * scutoff_denom * scutoff2;
2359  c.lj_4 = cutoff2;
2360  c.lj_5 = scutoff2;
2361  c.e_0 = cutoff2Inv * cutoffInv;
2362  c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);
2363  c.e_1 = cutoff2Inv;
2364  c.e_2 = cutoffInv;
2365  c.ewald_0 = ewaldcof;
2366  c.ewald_1 = pi_ewaldcof;
2367  c.ewald_2 = ewaldcof * ewaldcof;
2368  c.ewald_3_slow = ewaldcof * ewaldcof * ewaldcof * slowScale;
2369  c.slowScale = slowScale;
2370 
2371  return c;
2372 }
2373 
2374 bool CudaComputeNonbonded::getDoTable(SimParameters *simParams, const bool doSlow, const bool doVirial) {
2375  // There is additional logic in SimParameters.C which guards against unsupported force fields
2376  // This should only be used for performance heuristics
2377  bool doTable = simParams->useCUDANonbondedForceTable;
2378 
2379  // DMC: I found the doSlow case is faster with force tables, so overriding the setting
2380  // TODO This should be reevaluated for future architectures
2381  doTable = doTable || doSlow;
2382  // Direct math does not support virial+slow
2383  // Redundant but necessary for correctness so doing it explicitly
2384  doTable = doTable || (doSlow && doVirial);
2385 
2386  return doTable;
2387 }
2388 
2389 #endif // NAMD_CUDA
2390 
2391 