NAMD
CudaComputeNonbonded.C
1 #include <algorithm>
2 #include <map>
3 #include <vector>
4 #include "NamdTypes.h"
5 #include "charm++.h"
6 #include "Patch.h"
7 #include "PatchMap.h"
8 #include "ProxyMgr.h"
9 #include "LJTable.h"
10 #include "Node.h"
11 #include "ObjectArena.h"
12 // #include "ComputeCUDAMgr.h"
13 #include "ReductionMgr.h"
14 #include "CudaComputeNonbonded.h"
15 #include "WorkDistrib.h"
16 #include "HomePatch.h"
17 #include "Priorities.h"
18 #include "ComputePmeCUDAMgr.h"
19 #include "NamdEventsProfiling.h"
20 //#include "CudaUtils.h"
21 
22 #include "DeviceCUDA.h"
23 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
24 #ifdef WIN32
25 #define __thread __declspec(thread)
26 #endif
27 extern __thread DeviceCUDA *deviceCUDA;
28 #endif
29 
30 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
31 
32 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
33 
34 //
35 // Class constructor
36 //
37 CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
38  CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
39 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
40 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
41 
42  cudaCheck(cudaSetDevice(deviceID));
43 
44  exclusionsByAtom = NULL;
45 
46  vdwTypes = NULL;
47  vdwTypesSize = 0;
48 
49  exclIndexMaxDiff = NULL;
50  exclIndexMaxDiffSize = 0;
51 
52  atomIndex = NULL;
53  atomIndexSize = 0;
54 
55  atomStorageSize = 0;
56 
57  // Atom and charge storage
58  atoms = NULL;
59  atomsSize = 0;
60 
61  // Force storage
62  h_forces = NULL;
63  h_forcesSize = 0;
64  h_forcesSlow = NULL;
65  h_forcesSlowSize = 0;
66 
67  d_forces = NULL;
68  d_forcesSize = 0;
69  d_forcesSlow = NULL;
70  d_forcesSlowSize = 0;
71 
72  // GBIS
73  intRad0H = NULL;
74  intRad0HSize = 0;
75  intRadSH = NULL;
76  intRadSHSize = 0;
77  psiSumH = NULL;
78  psiSumHSize = 0;
79  bornRadH = NULL;
80  bornRadHSize = 0;
81  dEdaSumH = NULL;
82  dEdaSumHSize = 0;
83  dHdrPrefixH = NULL;
84  dHdrPrefixHSize = 0;
85  maxShmemPerBlock = 0;
86  cudaPatches = NULL;
87 
88  atomsChangedIn = true;
89  atomsChanged = true;
90  computesChanged = true;
91 
92  forceDoneEventRecord = false;
93 
94  SimParameters *simParams = Node::Object()->simParameters;
95  if (simParams->pressureProfileOn) {
96  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
97  }
98 
99  if (simParams->GBISOn) gbisPhase = 3;
100 
101  doSkip = false;
102 }
103 
104 //
105 // Class destructor
106 //
107 CudaComputeNonbonded::~CudaComputeNonbonded() {
108  cudaCheck(cudaSetDevice(deviceID));
109  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
110  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
111  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
112  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
113  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
114  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
115  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
116  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
117 
118  // GBIS
119  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
120  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
121  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
122  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
123  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
124  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
125 
126  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
127 
128  if (patches.size() > 0) {
129  deallocate_host<VirialEnergy>(&h_virialEnergy);
130  deallocate_device<VirialEnergy>(&d_virialEnergy);
131  cudaCheck(cudaStreamDestroy(stream));
132  cudaCheck(cudaEventDestroy(forceDoneEvent));
133  CmiDestroyLock(lock);
134  delete reduction;
135  }
136 
137  // NOTE: unregistering happens in [sync] -entry method
138  computeMgr->sendUnregisterBoxesOnPe(pes, this);
139 
140 }
141 
142 void CudaComputeNonbonded::unregisterBox(int i) {
143  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
144  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
145  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
146  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
147  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
148  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
149  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
150 }
151 
152 void CudaComputeNonbonded::unregisterBoxesOnPe() {
153  if (rankPatches[CkMyRank()].size() == 0)
154  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
155  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
156  unregisterBox(rankPatches[CkMyRank()][i]);
157  }
158 }
159 
160 //
161 // Register inter-patch (self) compute.
162 // Only serialized calls allowed
163 //
164 void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {
165  computesChanged = true;
166  addPatch(pid);
167  addCompute(cid, pid, pid, 0.);
168 }
169 
170 //
171 // Register pair-patch compute.
172 // Only serialized calls allowed
173 //
174 void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID* pid, int* trans) {
175  computesChanged = true;
176  addPatch(pid[0]);
177  addPatch(pid[1]);
178  PatchMap* patchMap = PatchMap::Object();
179  int t1 = trans[0];
180  int t2 = trans[1];
181  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
182  offset.x += (t1%3-1) - (t2%3-1);
183  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
184  offset.z += (t1/9-1) - (t2/9-1);
185  addCompute(cid, pid[0], pid[1], offset);
186 }
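// A minimal sketch (illustrative, not part of this file) of the transform
// decoding used above: a transform index t in [0,26] encodes a periodic image
// as a base-3 triplet, so its offset in lattice units is
//
//   int3 imageOffset(int t) {
//     return make_int3(t % 3 - 1, (t / 3) % 3 - 1, t / 9 - 1);
//   }
//
// e.g. t = 13 decodes to (0,0,0), the home image, and t = 14 to (+1,0,0).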
187 
188 //
189 // Add patch
190 //
191 void CudaComputeNonbonded::addPatch(PatchID pid) {
192  patches.push_back(PatchRecord(pid));
193 }
194 
195 //
196 // Add compute
197 //
198 void CudaComputeNonbonded::addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset) {
199  ComputeRecord cr;
200  cr.cid = cid;
201  cr.pid[0] = pid1;
202  cr.pid[1] = pid2;
203  cr.offset = offset;
204  computes.push_back(cr);
205 }
206 
207 //
208 // Update numAtoms and numFreeAtoms on a patch
209 //
210 void CudaComputeNonbonded::updatePatch(int i) {
211  int numAtoms = patches[i].patch->getNumAtoms();
212  int numFreeAtoms = numAtoms;
213  if ( fixedAtomsOn ) {
214  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
215  for ( int j=0; j< numAtoms; ++j ) {
216  if ( aExt[j].atomFixed ) --numFreeAtoms;
217  }
218  }
219  patches[i].numAtoms = numAtoms;
220  patches[i].numFreeAtoms = numFreeAtoms;
221  cudaPatches[i].numAtoms = numAtoms;
222  cudaPatches[i].numFreeAtoms = numFreeAtoms;
223 }
224 
225 int CudaComputeNonbonded::findPid(PatchID pid) {
226  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
227  int j = rankPatches[CkMyRank()][i];
228  if (patches[j].patchID == pid) return j;
229  }
230  return -1;
231 }
232 
233 void CudaComputeNonbonded::patchReady(PatchID pid, int doneMigration, int seq) {
234  if (doneMigration) {
235  int i = findPid(pid);
236  if (i == -1)
237  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
238  updatePatch(i);
239  }
240  CmiLock(lock);
241  Compute::patchReady(pid, doneMigration, seq);
242  CmiUnlock(lock);
243 }
244 
245 void CudaComputeNonbonded::gbisP2PatchReady(PatchID pid, int seq) {
246  CmiLock(lock);
247  Compute::gbisP2PatchReady(pid, seq);
248  CmiUnlock(lock);
249 }
250 
251 void CudaComputeNonbonded::gbisP3PatchReady(PatchID pid, int seq) {
252  CmiLock(lock);
253  Compute::gbisP3PatchReady(pid, seq);
254  CmiUnlock(lock);
255 }
256 
257 void CudaComputeNonbonded::assignPatch(int i) {
258  PatchMap* patchMap = PatchMap::Object();
259  PatchID pid = patches[i].patchID;
260  Patch* patch = patchMap->patch(pid);
261  if (patch == NULL) {
262  // Create ProxyPatch if none exists
263  ProxyMgr::Object()->createProxy(pid);
264  patch = patchMap->patch(pid);
265  }
266  patches[i].patch = patch;
267  if (patches[i].patch == NULL) {
268  NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
269  }
270  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
271  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);
272  SimParameters *simParams = Node::Object()->simParameters;
273  if (simParams->GBISOn) {
274  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
275  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
276  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
277  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
278  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
279  }
280  // Store Pe where this patch was registered
281 #if 1
282  if (patches[i].pe != CkMyPe()) {
283  NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
284  }
285 #else
286  patches[i].pe = CkMyPe();
287 #endif
288  //
289  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
290  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
291 }
292 
293 struct pid_sortop_reverse_priority {
294  bool operator() (int2 pidj, int2 pidi) { // i and j reversed
295  int ppi = PATCH_PRIORITY(pidi.x);
296  int ppj = PATCH_PRIORITY(pidj.x);
297  if ( ppi != ppj ) return ppi < ppj;
298  return pidi.x < pidj.x;
299  }
300 };
301 
302 void CudaComputeNonbonded::assignPatchesOnPe() {
303  if (rankPatches[CkMyRank()].size() == 0)
304  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
305 
306  // calculate priority rank of local home patch within pe
307  {
308  PatchMap* patchMap = PatchMap::Object();
309  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
310  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
311  int i = rankPatches[CkMyRank()][k];
312  int pid = patches[i].patchID;
313  int homePe = patchMap->node(pid);
314  if ( CkNodeOf(homePe) == CkMyNode() ) {
315  int2 pid_index;
316  pid_index.x = pid;
317  pid_index.y = i;
318  homePatchByRank[CkRankOf(homePe)].add(pid_index);
319  }
320  }
321  for ( int i=0; i<CkMyNodeSize(); ++i ) {
322  pid_sortop_reverse_priority so;
323  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
324  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
325  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
326  int index = homePatchByRank[i][j].y;
327  patches[index].reversePriorityRankInPe = j + masterBoost;
328  }
329  }
330  }
331 
332  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
333  assignPatch(rankPatches[CkMyRank()][i]);
334  }
335 }
336 
337 //
338 // Returns Pe of Patch ID "pid", -1 otherwise
339 //
340 // int findHomePatchPe(std::vector<PatchIDList>& rankPatchIDs, PatchID pid) {
341 int findHomePatchPe(PatchIDList* rankPatchIDs, PatchID pid) {
342  // for (int i=0;i < rankPatchIDs.size();i++) {
343  for (int i=0;i < CkMyNodeSize();i++) {
344  if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
345  }
346  return -1;
347 }
348 
349 //
350 // Find all PEs that have Patch
351 //
352 void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
353  proxyPatchPes.clear();
354  for (int i=0;i < CkMyNodeSize();i++) {
355  int pe = CkNodeFirst(CkMyNode()) + i;
356  if (PatchMap::ObjectOnPe(pe)->patch(pid) != NULL)
357  proxyPatchPes.push_back(pe);
358  }
359 }
360 
361 //
362 // Called after all computes have been registered
363 //
364 void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
365  // Remove duplicate patches
366  std::sort(patches.begin(), patches.end());
367  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
368  patches.erase(last, patches.end());
369  // Set number of patches
370  setNumPatches(patches.size());
371  masterPe = CkMyPe();
372  computeMgr = computeMgrIn;
373  // Start patch counter
374  patchesCounter = getNumPatches();
375  // Patch ID map
376  std::map<PatchID, int> pidMap;
377 #if 1
378  //-------------------------------------------------------
379  // Copied in from ComputeNonbondedCUDA::assignPatches()
380  //-------------------------------------------------------
381 
382  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
383  int numPesOnNodeSharingDevice = 0;
384  int masterIndex = -1;
385  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
386  int pe = deviceCUDA->getPesSharingDevice(i);
387  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
388  if ( CkNodeOf(pe) == CkMyNode() ) {
389  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
390  }
391  }
392 
393  std::vector<int> count(patches.size(), 0);
394  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
395  std::vector<int> rankpcount(CkMyNodeSize(), 0);
396  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
397 
398  PatchMap* patchMap = PatchMap::Object();
399 
400  int unassignedpatches = patches.size();
401 
402  for (int i=0;i < patches.size(); ++i) {
403  patches[i].pe = -1;
404  }
405 
406  // assign if home pe and build table of natural proxies
407  for (int i=0;i < patches.size(); ++i) {
408  int pid = patches[i].patchID;
409  // homePe = PE where the patch currently resides
410  int homePe = patchMap->node(pid);
411  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
412  int pe = pesOnNodeSharingDevice[j];
413  // If homePe is sharing this device, assign this patch to homePe
414  if ( pe == homePe ) {
415  patches[i].pe = pe;
416  --unassignedpatches;
417  pcount[j] += 1;
418  }
419  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
420  table[i*numPesOnNodeSharingDevice + j] = 1;
421  }
422  }
423  // Assign this patch to homePe, if it resides on the same node
424  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
425  patches[i].pe = homePe;
426  --unassignedpatches;
427  rankpcount[CkRankOf(homePe)] += 1;
428  }
429  }
430  // assign if only one pe has a required proxy
431  for (int i=0; i < patches.size(); ++i) {
432  int pid = patches[i].patchID;
433  if ( patches[i].pe != -1 ) continue;
434  int c = 0;
435  int lastj;
436  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
437  if ( table[i*numPesOnNodeSharingDevice + j] ) {
438  ++c;
439  lastj = j;
440  }
441  }
442  count[i] = c;
443  if ( c == 1 ) {
444  patches[i].pe = pesOnNodeSharingDevice[lastj];
445  --unassignedpatches;
446  pcount[lastj] += 1;
447  }
448  }
449  int assignj = 0;
450  while ( unassignedpatches ) {
451  int i;
452  for (i=0;i < patches.size(); ++i) {
453  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
454  int pid = patches[i].patchID;
455  // patch_record &pr = patchRecords[pid];
456  if ( patches[i].pe != -1 ) continue;
457  patches[i].pe = pesOnNodeSharingDevice[assignj];
458  --unassignedpatches;
459  pcount[assignj] += 1;
460  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
461  break;
462  }
463  if (i < patches.size() ) continue; // start search again
464  for ( i=0;i < patches.size(); ++i ) {
465  int pid = patches[i].patchID;
466  // patch_record &pr = patchRecords[pid];
467  if ( patches[i].pe != -1 ) continue;
468  if ( count[i] ) continue;
469  patches[i].pe = pesOnNodeSharingDevice[assignj];
470  --unassignedpatches;
471  pcount[assignj] += 1;
472  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
473  break;
474  }
475  if ( i < patches.size() ) continue; // start search again
476  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
477  }
478 
479  // For each rank, list of patches
480  rankPatches.resize(CkMyNodeSize());
481  for (int i=0; i < patches.size(); ++i) {
482  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
483  pidMap[patches[i].patchID] = i;
484  }
485 
486  // for ( int i=0; i < patches.size(); ++i ) {
487  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
488  // }
489 
490 /*
491  slavePes = new int[CkMyNodeSize()];
492  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
493  numSlaves = 0;
494  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
495  int pe = pesOnNodeSharingDevice[j];
496  int rank = pe - CkNodeFirst(CkMyNode());
497  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
498  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
499  if ( pe == CkMyPe() ) continue;
500  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
501  rankpcount[rank] = 0; // skip in rank loop below
502  slavePes[numSlaves] = pe;
503  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
504  ++numSlaves;
505  }
506  for ( int j=0; j<CkMyNodeSize(); ++j ) {
507  int pe = CkNodeFirst(CkMyNode()) + j;
508  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
509  // CkMyPe(),j,pe,rankpcount[j]);
510  if ( ! rankpcount[j] ) continue;
511  if ( pe == CkMyPe() ) continue;
512  slavePes[numSlaves] = pe;
513  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
514  ++numSlaves;
515  }
516 */
517 
518 #else
519  // For each rank, list of patches
520  rankPatches.resize(CkMyNodeSize());
521  // For each rank, list of home patch IDs
522  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
523  for (int i=0;i < CkMyNodeSize();i++) {
524  int pe = CkNodeFirst(CkMyNode()) + i;
525  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
526  }
527  std::vector<int> proxyPatchPes;
528  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
529  //--------------------------------------------------------
530  // Build a list of PEs to avoid
531  std::vector<int> pesToAvoid;
532 #if 0
533  // Avoid other GPUs' master PEs
534  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
535  int pe = deviceCUDA->getMasterPeForDeviceID(i);
536  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
537  }
538  // Avoid PEs that are involved in PME
539  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
540  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
541  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
542  }
543  // Set counters of avoidable PEs to high numbers
544  for (int i=0;i < pesToAvoid.size();i++) {
545  int pe = pesToAvoid[i];
546  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
547  }
548 #endif
549  // Avoid master Pe somewhat
550  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
551  //--------------------------------------------------------
552  for (int i=0;i < patches.size();i++) {
553  PatchID pid = patches[i].patchID;
554  int pe = findHomePatchPe(rankHomePatchIDs, pid);
555  if (pe == -1) {
556  // Patch not present on this node => try finding a ProxyPatch
557  findProxyPatchPes(proxyPatchPes, pid);
558  if (proxyPatchPes.size() == 0) {
559  // No ProxyPatch => create one on rank that has the least ProxyPatches
560  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
561  pe = CkNodeFirst(CkMyNode()) + rank;
562  peProxyPatchCounter[rank]++;
563  } else {
564  // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
565  // this is done by finding the entry with the minimum peProxyPatchCounter value
566  // Find minimum among proxyPatchPes, i.e., find the minimum among
567  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
568  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
569  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
570  // - proxyPatchPes.begin();
571  // pe = proxyPatchPes[pppi];
572  int minCounter = (1 << 30);
573  for (int j=0;j < proxyPatchPes.size();j++) {
574  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
575  pe = proxyPatchPes[j];
576  minCounter = peProxyPatchCounter[CkRankOf(pe)];
577  }
578  }
579  if (pe == -1)
580  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
581  peProxyPatchCounter[CkRankOf(pe)]++;
582  }
583  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
584  // Found home patch on this node, but it's on PE that should be avoided => find a new one
585  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
586  pe = CkNodeFirst(CkMyNode()) + rank;
587  peProxyPatchCounter[rank]++;
588  }
589  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
590  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
591  rankPatches[CkRankOf(pe)].push_back(i);
592  pidMap[pid] = i;
593  }
594 
595  delete [] rankHomePatchIDs;
596 #endif
597  // Setup computes using pidMap
598  for (int i=0;i < computes.size();i++) {
599  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
600  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
601  }
602  for (int i=0;i < CkMyNodeSize();i++) {
603  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
604  }
605  computeMgr->sendAssignPatchesOnPe(pes, this);
606 }
607 
608 void CudaComputeNonbonded::initialize() {
609  if (patches.size() > 0) {
610  // Allocate CUDA version of patches
611  cudaCheck(cudaSetDevice(deviceID));
612  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
613 
614  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
615  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
616 
617  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
618  */
619  cudaDeviceProp props;
620  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); //Gets properties of 'deviceID device'
621  maxShmemPerBlock = props.sharedMemPerBlock;
622 
623 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
624  int leastPriority, greatestPriority;
625  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
626  int priority = (doStreaming) ? leastPriority : greatestPriority;
627  // int priority = greatestPriority;
628  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
629 #else
630  cudaCheck(cudaStreamCreate(&stream));
631 #endif
632  cudaCheck(cudaEventCreate(&forceDoneEvent));
633 
634  buildExclusions();
635 
636  lock = CmiCreateLock();
637 
638  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
639  }
640 }
641 
642 //
643 // atomUpdate() can be called by any Pe
644 //
645 void CudaComputeNonbonded::atomUpdate() {
646  atomsChangedIn = true;
647 }
648 
649 //
650 // Compute patches[].atomStart, patches[].numAtoms, patches[].numFreeAtoms, and atomStorageSize
651 //
652 void CudaComputeNonbonded::updatePatches() {
653 
654  // Maximum number of tiles per tile list
655  maxTileListLen = 0;
656  int atomStart = 0;
657  for (int i=0;i < patches.size();i++) {
658  patches[i].atomStart = atomStart;
659  cudaPatches[i].atomStart = atomStart;
660  int numAtoms = patches[i].numAtoms;
661  int numTiles = ((numAtoms-1)/WARPSIZE+1);
662  maxTileListLen = std::max(maxTileListLen, numTiles);
663  atomStart += numTiles*WARPSIZE;
664  }
665  atomStorageSize = atomStart;
666 
667  if (maxTileListLen >= 65536) {
668  NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
669  }
670 }
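// A minimal sketch (illustrative, not part of this file) of the padding rule
// applied above: each patch is rounded up to a whole number of WARPSIZE-atom
// tiles so that every atomStart stays warp-aligned.
//
//   int paddedNumAtoms(int numAtoms) {
//     return ((numAtoms - 1) / WARPSIZE + 1) * WARPSIZE;  // e.g. 33 -> 64 for WARPSIZE == 32
//   }
//
// atomStorageSize is then the sum of the padded patch sizes.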
671 
672 void CudaComputeNonbonded::skipPatch(int i) {
673  if (CkMyPe() != patches[i].pe)
674  NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
675  Flags &flags = patches[i].patch->flags;
676  patches[i].positionBox->skip();
677  patches[i].forceBox->skip();
678  if (flags.doGBIS) {
679  patches[i].psiSumBox->skip();
680  patches[i].intRadBox->skip();
681  patches[i].bornRadBox->skip();
682  patches[i].dEdaSumBox->skip();
683  patches[i].dHdrPrefixBox->skip();
684  }
685 }
686 
687 void CudaComputeNonbonded::skipPatchesOnPe() {
688  if (rankPatches[CkMyRank()].size() == 0)
689  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
690  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
691  skipPatch(rankPatches[CkMyRank()][i]);
692  }
693  bool done = false;
694  CmiLock(lock);
695  patchesCounter -= rankPatches[CkMyRank()].size();
696  if (patchesCounter == 0) {
697  patchesCounter = getNumPatches();
698  done = true;
699  }
700  CmiUnlock(lock);
701  if (done) {
702  // Reduction must be done on masterPe
703  computeMgr->sendFinishReductions(masterPe, this);
704  }
705 }
706 
707 void CudaComputeNonbonded::skip() {
708  if (CkMyPe() != masterPe)
709  NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
710 
711  if (patches.size() == 0) return;
712 
713  doSkip = true;
714 
715  computeMgr->sendSkipPatchesOnPe(pes, this);
716 }
717 
718 void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
719  if (CkMyPe() != masterPe)
720  NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
721 
722  for (int i=0;i < patches.size();++i) {
723  PatchRecord &pr = patches[i];
724 
725  float maxMove = pr.patch->flags.maxAtomMovement;
726  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
727 
728  float maxTol = pr.patch->flags.pairlistTolerance;
729  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
730  }
731 }
732 
733 inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
734  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
735  c->updateVdwTypesExclSubset(first, last);
736 }
737 
738 void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
739  for (int i=first;i <= last;i++) {
740  PatchRecord &pr = patches[i];
741  int start = pr.atomStart;
742  int numAtoms = pr.numAtoms;
743  const CompAtom *compAtom = pr.compAtom;
744  const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
745  // Atoms have changed, re-do exclusions and vdw types
746  int2* exclp = exclIndexMaxDiff + start;
747  int* aip = atomIndex + start;
748  for ( int k=0;k < numAtoms; ++k ) {
749  int j = compAtomExt[k].sortOrder;
750  vdwTypes[start + k] = compAtom[j].vdwType;
751  aip[k] = compAtomExt[j].id;
752 #ifdef MEM_OPT_VERSION
753  exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
754  exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
755 #else // ! MEM_OPT_VERSION
756  exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
757  exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
758 #endif // MEM_OPT_VERSION
759  }
760  }
761 }
762 
763 //
764 // Called every time atoms changed
765 //
766 void CudaComputeNonbonded::updateVdwTypesExcl() {
767  // Re-allocate (VdwTypes, exclIndexMaxDiff) as needed
768  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
769  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
770  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
771 
772 #if CMK_SMP && USE_CKLOOP
773  int useCkLoop = Node::Object()->simParameters->useCkLoop;
774  if (useCkLoop >= 1) {
775  CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
776  } else
777 #endif
778  {
779  updateVdwTypesExclSubset(0, patches.size()-1);
780  }
781 
782  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
783 }
784 
785 inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
786  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
787  c->copyAtomsSubset(first, last);
788 }
789 
790 void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
791  for (int i=first;i <= last;++i) {
792  PatchRecord &pr = patches[i];
793  int numAtoms = pr.numAtoms;
794  if (numAtoms > 0) {
795  int start = pr.atomStart;
796  const CudaAtom *src = pr.patch->getCudaAtomList();
797  CudaAtom *dst = atoms + start;
798  memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
799  // Fill the rest with the copy of the last atom
800  int numAtomsAlign = ((numAtoms-1)/WARPSIZE+1)*WARPSIZE;
801  CudaAtom lastAtom = src[numAtoms-1];
802  for (int j=numAtoms;j < numAtomsAlign;j++) {
803  dst[j] = lastAtom;
804  }
805  }
806  }
807 }
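// Note on the padding above (an inference from the surrounding code): copying
// the last real atom into the padded tail keeps every WARPSIZE-wide load
// reading valid coordinates instead of garbage; results for the padded entries
// are never copied back, since finishPatch() reads only the first numAtoms
// forces of each patch.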
808 
809 void CudaComputeNonbonded::copyGBISphase(int i) {
810  if (CkMyPe() != patches[i].pe)
811  NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
812  PatchRecord &pr = patches[i];
813  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
814  if (gbisPhase == 1) {
815  //Copy GBIS intRadius to Host
816  if (atomsChanged) {
817  float *intRad0 = intRad0H + pr.atomStart;
818  float *intRadS = intRadSH + pr.atomStart;
819  for (int k=0;k < pr.numAtoms;++k) {
820  int j = aExt[k].sortOrder;
821  intRad0[k] = pr.intRad[2*j+0];
822  intRadS[k] = pr.intRad[2*j+1];
823  }
824  }
825  } else if (gbisPhase == 2) {
826  float *bornRad = bornRadH + pr.atomStart;
827  for ( int k=0; k < pr.numAtoms; ++k ) {
828  int j = aExt[k].sortOrder;
829  bornRad[k] = pr.bornRad[j];
830  }
831  } else if (gbisPhase == 3) {
832  float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
833  for ( int k=0; k < pr.numAtoms; ++k ) {
834  int j = aExt[k].sortOrder;
835  dHdrPrefix[k] = pr.dHdrPrefix[j];
836  }
837  } // end phases
838 }
839 
840 void CudaComputeNonbonded::openBox(int i) {
841  if (CkMyPe() != patches[i].pe)
842  NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
843  SimParameters *simParams = Node::Object()->simParameters;
844  if (!simParams->GBISOn || gbisPhase == 1) {
845  patches[i].compAtom = patches[i].positionBox->open();
846  copyAtomsSubset(i, i);
847  }
848  if (simParams->GBISOn) {
849  if (gbisPhase == 1) {
850  patches[i].intRad = patches[i].intRadBox->open();
851  patches[i].psiSum = patches[i].psiSumBox->open();
852  } else if (gbisPhase == 2) {
853  patches[i].bornRad = patches[i].bornRadBox->open();
854  patches[i].dEdaSum = patches[i].dEdaSumBox->open();
855  } else if (gbisPhase == 3) {
856  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
857  }
858  copyGBISphase(i);
859  }
860 }
861 
862 void CudaComputeNonbonded::messageEnqueueWork() {
863  if (masterPe != CkMyPe())
864  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
865  WorkDistrib::messageEnqueueWork(this);
866 }
867 
868 void CudaComputeNonbonded::openBoxesOnPe() {
869  if (rankPatches[CkMyRank()].size() == 0)
870  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
871  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
872  openBox(rankPatches[CkMyRank()][i]);
873  }
874  bool done = false;
875  CmiLock(lock);
876  patchesCounter -= rankPatches[CkMyRank()].size();
877  if (patchesCounter == 0) {
878  patchesCounter = getNumPatches();
879  done = true;
880  }
881  CmiUnlock(lock);
882  if (done) {
883  computeMgr->sendLaunchWork(masterPe, this);
884  }
885 }
886 
887 int CudaComputeNonbonded::noWork() {
888  // Simply enqueue doWork on masterPe and return "no work"
889  computeMgr->sendMessageEnqueueWork(masterPe, this);
890  return 1;
891 }
892 
893 void CudaComputeNonbonded::reallocateArrays() {
894  cudaCheck(cudaSetDevice(deviceID));
895  SimParameters *simParams = Node::Object()->simParameters;
896 
897  // Re-allocate atoms
898  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
899 
900  // Re-allocate forces
901  if (doStreaming) {
902  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
903  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
904  } else {
905  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
906  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
907  }
908  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
909  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);
910  nonbondedKernel.reallocate_forceSOA(atomStorageSize);
911 
912  if (simParams->GBISOn) {
913  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
914  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
915  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
916  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
917  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
918  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
919  }
920 }
921 
922 void CudaComputeNonbonded::doWork() {
923  if (CkMyPe() != masterPe)
924  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
925 
926  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
927  // atomsChangedIn can be set to true by any Pe
928  // atomsChanged can only be set by masterPe
929  // This use of two variables makes sure we don't have a race condition
930  atomsChanged = atomsChangedIn;
931  atomsChangedIn = false;
932 
933  SimParameters *simParams = Node::Object()->simParameters;
934 
935  if (patches.size() == 0) return; // No work to do
936 
937  // Take the flags from the first patch on this Pe
938  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
939  Flags &flags = patches[0].patch->flags;
940 
941  doSlow = flags.doFullElectrostatics;
942  doEnergy = flags.doEnergy;
943  doVirial = flags.doVirial;
944 
945  if (flags.doNonbonded) {
946 
947  if (simParams->GBISOn) {
948  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
949  }
950 
951  if (!simParams->GBISOn || gbisPhase == 1) {
952  if ( computesChanged ) {
953  updateComputes();
954  }
955  if (atomsChanged) {
956  // Re-calculate patch atom numbers and storage
957  updatePatches();
958  reSortDone = false;
959  }
960  reallocateArrays();
961  }
962 
963  // Open boxes on Pes and launch work to masterPe
964  computeMgr->sendOpenBoxesOnPe(pes, this);
965 
966  } else {
967  // No work to do, skip
968  skip();
969  }
970 
971 }
972 
973 void CudaComputeNonbonded::launchWork() {
974  if (CkMyPe() != masterPe)
975  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
976 
977  beforeForceCompute = CkWallTimer();
978 
979  cudaCheck(cudaSetDevice(deviceID));
980  SimParameters *simParams = Node::Object()->simParameters;
981 
982  //execute only during GBIS phase 1, or if not using GBIS
983  if (!simParams->GBISOn || gbisPhase == 1) {
984 
985  if ( atomsChanged || computesChanged ) {
986  // Invalidate pair lists
987  pairlistsValid = false;
988  pairlistTolerance = 0.0f;
989  }
990 
991  // Get maximum atom movement and patch tolerance
992  float maxAtomMovement = 0.0f;
993  float maxPatchTolerance = 0.0f;
994  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
995  // Update pair-list cutoff
996  Flags &flags = patches[0].patch->flags;
997  savePairlists = false;
998  usePairlists = false;
999  if ( flags.savePairlists ) {
1000  savePairlists = true;
1001  usePairlists = true;
1002  } else if ( flags.usePairlists ) {
1003  if ( ! pairlistsValid ||
1004  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1005  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1006  } else {
1007  usePairlists = true;
1008  }
1009  }
1010  if ( ! usePairlists ) {
1011  pairlistsValid = false;
1012  }
1013  float plcutoff = cutoff;
1014  if ( savePairlists ) {
1015  pairlistsValid = true;
1016  pairlistTolerance = 2. * maxPatchTolerance;
1017  plcutoff += pairlistTolerance;
1018  }
1019  plcutoff2 = plcutoff * plcutoff;
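 // In short: saved pairlists stay usable while twice the largest per-atom
 // displacement remains within the tolerance (2 * maxPatchTolerance) recorded
 // when the lists were built; the list cutoff is padded by that same tolerance
 // so that no interacting pair can be missed between rebuilds.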
1020 
1021  // if (atomsChanged)
1022  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1023  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1024 
1025  } // if (!simParams->GBISOn || gbisPhase == 1)
1026 
1027  // Calculate PME & VdW forces
1028  if (!simParams->GBISOn || gbisPhase == 1) {
1029  doForce();
1030  if (doStreaming) {
1031  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1032  patchReadyQueueLen = tileListKernel.getNumPatches();
1033  patchReadyQueueNext = 0;
1034  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1035  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1036  int* emptyPatches = tileListKernel.getEmptyPatches();
1037  for (int i=0;i < numEmptyPatches;i++) {
1038  PatchRecord &pr = patches[emptyPatches[i]];
1039  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1040  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1041  patchReadyQueue[i] = emptyPatches[i];
1042  }
1043  if (patchReadyQueueLen != patches.size())
1044  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1045  }
1046  }
1047 
1048  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1049  // before calling doGBISphase1().
1050  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1051  // In this code path doGBISphase1() is called in forceDone()
1052  forceDoneSetCallback();
1053  return;
1054  }
1055 
1056  // GBIS Phases
1057  if (simParams->GBISOn) {
1058  if (gbisPhase == 1) {
1059  doGBISphase1();
1060  } else if (gbisPhase == 2) {
1061  doGBISphase2();
1062  } else if (gbisPhase == 3) {
1063  doGBISphase3();
1064  }
1065  }
1066 
1067  // Copy forces to host
1068  if (!simParams->GBISOn || gbisPhase == 3) {
1069  if (!doStreaming) {
1070  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1071  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1072  }
1073  }
1074 
1075  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1076  // For GBIS, energies are ready after phase 2
1077  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1078  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1079  d_forces, d_forcesSlow, d_virialEnergy, stream);
1080  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1081  }
1082 
1083  // Setup call back
1084  forceDoneSetCallback();
1085 }
1086 
1087 //
1088 // GBIS Phase 1
1089 //
1090 void CudaComputeNonbonded::doGBISphase1() {
1091  cudaCheck(cudaSetDevice(deviceID));
1092 
1093  if (atomsChanged) {
1094  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
1095  }
1096 
1097  SimParameters *simParams = Node::Object()->simParameters;
1098  Lattice lattice = patches[0].patch->flags.lattice;
1099 
1100  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1101  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1102  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1103 
1104  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
1105  lata, latb, latc,
1106  simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream);
1107 }
1108 
1109 //
1110 // GBIS Phase 2
1111 //
1112 void CudaComputeNonbonded::doGBISphase2() {
1113  cudaCheck(cudaSetDevice(deviceID));
1114 
1115  SimParameters *simParams = Node::Object()->simParameters;
1116  Lattice lattice = patches[0].patch->flags.lattice;
1117 
1118  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1119  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1120  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1121 
1122  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
1123 
1124  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
1125  doEnergy, doSlow,
1126  lata, latb, latc,
1127  simParams->cutoff, simParams->nonbondedScaling, simParams->kappa,
1128  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1129  simParams->dielectric, simParams->solvent_dielectric,
1130  d_forces, dEdaSumH, stream);
1131 }
1132 
1133 //
1134 // GBIS Phase 3
1135 //
1136 void CudaComputeNonbonded::doGBISphase3() {
1137  cudaCheck(cudaSetDevice(deviceID));
1138  SimParameters *simParams = Node::Object()->simParameters;
1139  Lattice lattice = patches[0].patch->flags.lattice;
1140 
1141  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1142  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1143  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1144 
1145  if (doSlow) {
1146  GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream);
1147 
1148  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
1149  lata, latb, latc,
1150  simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream);
1151  }
1152 }
1153 
1154 //
1155 // Calculate electrostatic & VdW forces
1156 //
1157 void CudaComputeNonbonded::doForce() {
1158  cudaCheck(cudaSetDevice(deviceID));
1159 
1160  Lattice lattice = patches[0].patch->flags.lattice;
1161  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
1162  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
1163  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
1164  bool doPairlist = (savePairlists || !usePairlists);
1165 
1166  if (doPairlist) {
1167  int numTileLists = calcNumTileLists();
1168  // Build initial tile lists and sort
1169  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
1170  maxTileListLen, lata, latb, latc,
1171  cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream);
1172  // Prepare tile list for atom-based refinement
1173  tileListKernel.prepareTileList(stream);
1174  }
1175 
1176  if (atomsChanged) {
1177  // Update Vdw types and exclusion index & maxdiff
1178  updateVdwTypesExcl();
1179  }
1180 
1181  beforeForceCompute = CkWallTimer();
1182 
1183  // Calculate forces (and refine tile list if atomsChanged=true)
1184  nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, doPairlist,
1185  doEnergy, doVirial, doSlow, lata, latb, latc,
1186  (const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow,
1187  stream);
1188 
1189  if (doPairlist) {
1190  tileListKernel.finishTileList(stream);
1191  }
1192 
1193  traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
1194 }
1195 
1196 //
1197 // Count an upper estimate for the number of tile lists
1198 //
1199 int CudaComputeNonbonded::calcNumTileLists() {
1200  int numTileLists = 0;
1201  for (int i=0;i < computes.size();i++) {
1202  int pi1 = computes[i].patchInd[0];
1203  int numAtoms1 = patches[pi1].numAtoms;
1204  int numTiles1 = (numAtoms1-1)/WARPSIZE+1;
1205  numTileLists += numTiles1;
1206  }
1207  return numTileLists;
1208 }
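// This is only an upper bound: one tile list is counted per WARPSIZE-atom tile
// of the first patch of each compute, before any distance-based refinement of
// the lists on the GPU.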
1209 
1210 //
1211 // Finish & submit reductions
1212 //
1213 void CudaComputeNonbonded::finishReductions() {
1214 
1215  if (CkMyPe() != masterPe)
1216  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1217 
1218  // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1219 
1220  if (!doSkip) {
1221 
1222  if (doStreaming && (doVirial || doEnergy)) {
1223  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1224  if (!forceDoneEventRecord)
1225  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1226  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1227  forceDoneEventRecord = false;
1228  }
1229 
1230  if (doVirial) {
1231  Tensor virialTensor;
1232  virialTensor.xx = h_virialEnergy->virial[0];
1233  virialTensor.xy = h_virialEnergy->virial[1];
1234  virialTensor.xz = h_virialEnergy->virial[2];
1235  virialTensor.yx = h_virialEnergy->virial[3];
1236  virialTensor.yy = h_virialEnergy->virial[4];
1237  virialTensor.yz = h_virialEnergy->virial[5];
1238  virialTensor.zx = h_virialEnergy->virial[6];
1239  virialTensor.zy = h_virialEnergy->virial[7];
1240  virialTensor.zz = h_virialEnergy->virial[8];
1241  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1242  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1243  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1244  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1245  if (doSlow) {
1246  Tensor virialTensor;
1247  virialTensor.xx = h_virialEnergy->virialSlow[0];
1248  virialTensor.xy = h_virialEnergy->virialSlow[1];
1249  virialTensor.xz = h_virialEnergy->virialSlow[2];
1250  virialTensor.yx = h_virialEnergy->virialSlow[3];
1251  virialTensor.yy = h_virialEnergy->virialSlow[4];
1252  virialTensor.yz = h_virialEnergy->virialSlow[5];
1253  virialTensor.zx = h_virialEnergy->virialSlow[6];
1254  virialTensor.zy = h_virialEnergy->virialSlow[7];
1255  virialTensor.zz = h_virialEnergy->virialSlow[8];
1256  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1257  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1258  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1259  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1260  }
1261  }
1262  if (doEnergy) {
1263  // if (doSlow)
1264  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1265  SimParameters *simParams = Node::Object()->simParameters;
1266  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1267  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1268  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1269  if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1270  // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
1271  }
1272 
1273  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1274  }
1275  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1276  reduction->submit();
1277 
1278  // Reset flags
1279  doSkip = false;
1280  computesChanged = false;
1281 }
1282 
1283 //
1284 // Finish a single patch
1285 //
1286 void CudaComputeNonbonded::finishPatch(int i) {
1287  if (CkMyPe() != patches[i].pe)
1288  NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
1289 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
1290  char buf[64];
1291  sprintf(buf, "%s: %d", NamdProfileEventStr[NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES], i);
1292  NAMD_EVENT_START_EX(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES, buf);
1293 #endif
1294  PatchRecord &pr = patches[i];
1295  pr.results = pr.forceBox->open();
1296 
1297  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1298  int atomStart = pr.atomStart;
1299  int numAtoms = pr.numAtoms;
1300  if (numAtoms > 0) {
1301  Force *f = pr.results->f[Results::nbond];
1302  Force *f_slow = pr.results->f[Results::slow];
1303  float4 *af = h_forces + atomStart;
1304  float4 *af_slow = h_forcesSlow + atomStart;
1305  // float maxf = 0.0f;
1306  // int maxf_k;
1307  for ( int k=0; k<numAtoms; ++k ) {
1308  int j = aExt[k].sortOrder;
1309  f[j].x += af[k].x;
1310  f[j].y += af[k].y;
1311  f[j].z += af[k].z;
1312  // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
1313  // maxf = std::max(maxf, fabsf(af[k].x));
1314  // maxf = std::max(maxf, fabsf(af[k].y));
1315  // maxf = std::max(maxf, fabsf(af[k].z));
1316  // maxf_k = k;
1317  // }
1318  if ( doSlow ) {
1319  f_slow[j].x += af_slow[k].x;
1320  f_slow[j].y += af_slow[k].y;
1321  f_slow[j].z += af_slow[k].z;
1322  }
1323  }
1324  // if (maxf > 10000.0f) {
1325  // fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
1326  // cudaCheck(cudaStreamSynchronize(stream));
1327  // NAMD_die("maxf!");
1328  // }
1329  }
1330 
1331  pr.positionBox->close(&(pr.compAtom));
1332  pr.forceBox->close(&(pr.results));
1333 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
1334  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
1335 #endif
1336 }
1337 
1338 //
1339 // Finish a set of patches on this pe
1340 //
1341 void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
1342  if (patchSet.size() == 0)
1343  NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
1344  SimParameters *simParams = Node::Object()->simParameters;
1345  // Save value of gbisPhase here because it can change after the last finishGBISPhase() or finishPatch() is called
1346  int gbisPhaseSave = gbisPhase;
1347  // Close Boxes depending on Phase
1348  if (simParams->GBISOn) {
1349  for (int i=0;i < patchSet.size();i++) {
1350  finishGBISPhase(patchSet[i]);
1351  }
1352  }
1353  // Finish patches
1354  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1355  for (int i=0;i < patchSet.size();i++) {
1356  finishPatch(patchSet[i]);
1357  }
1358  }
1359  bool done = false;
1360  CmiLock(lock);
1361  patchesCounter -= patchSet.size();
1362  if (patchesCounter == 0) {
1363  patchesCounter = getNumPatches();
1364  done = true;
1365  }
1366  CmiUnlock(lock);
1367  if (done) {
1368  // Do reductions
1369  if (!simParams->GBISOn || gbisPhaseSave == 3) {
1370  // Reduction must be done on masterPe
1371  computeMgr->sendFinishReductions(masterPe, this);
1372  }
1373  }
1374 }
1375 
1376 //
1377 // Finish all patches that are on this pe
1378 //
1379 void CudaComputeNonbonded::finishPatchesOnPe() {
1380  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1381 }
1382 
1383 //
1384 // Finish single patch on this pe
1385 //
1386 void CudaComputeNonbonded::finishPatchOnPe(int i) {
1387  std::vector<int> v(1, i);
1388  finishSetOfPatchesOnPe(v);
1389 }
1390 
1391 void CudaComputeNonbonded::finishPatches() {
1392  computeMgr->sendFinishPatchesOnPe(pes, this);
1393 }
1394 
1395 void CudaComputeNonbonded::finishGBISPhase(int i) {
1396  if (CkMyPe() != patches[i].pe)
1397  NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
1398  PatchRecord &pr = patches[i];
1399  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
1400  int atomStart = pr.atomStart;
1401  if (gbisPhase == 1) {
1402  GBReal *psiSumMaster = psiSumH + atomStart;
1403  for ( int k=0; k<pr.numAtoms; ++k ) {
1404  int j = aExt[k].sortOrder;
1405  pr.psiSum[j] += psiSumMaster[k];
1406  }
1407  pr.psiSumBox->close(&(pr.psiSum));
1408  } else if (gbisPhase == 2) {
1409  GBReal *dEdaSumMaster = dEdaSumH + atomStart;
1410  for ( int k=0; k<pr.numAtoms; ++k ) {
1411  int j = aExt[k].sortOrder;
1412  pr.dEdaSum[j] += dEdaSumMaster[k];
1413  }
1414  pr.dEdaSumBox->close(&(pr.dEdaSum));
1415  } else if (gbisPhase == 3) {
1416  pr.intRadBox->close(&(pr.intRad)); //box 6
1417  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1418  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1419  } //end phases
1420 }
1421 
1422 void CudaComputeNonbonded::finishTimers() {
1423  SimParameters *simParams = Node::Object()->simParameters;
1424 
1425  if (simParams->GBISOn) {
1426  if (gbisPhase == 1)
1427  traceUserBracketEvent(CUDA_GBIS1_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1428  if (gbisPhase == 2)
1429  traceUserBracketEvent(CUDA_GBIS2_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1430  if (gbisPhase == 3)
1431  traceUserBracketEvent(CUDA_GBIS3_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1432  } else {
1433  traceUserBracketEvent(CUDA_NONBONDED_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
1434  }
1435 }
1436 
1437 //
1438 // Re-sort tile lists if necessary
1439 //
1440 void CudaComputeNonbonded::reSortTileLists() {
1441  // Re-sort tile lists
1442  SimParameters *simParams = Node::Object()->simParameters;
1443  cudaCheck(cudaSetDevice(deviceID));
1444  tileListKernel.reSortTileLists(simParams->GBISOn, stream);
1445 }
1446 
1447 void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
1448  CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;
1449 
1450  if (CkMyPe() != c->masterPe)
1451  NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
1452 
1453  SimParameters *simParams = Node::Object()->simParameters;
1454  cudaCheck(cudaSetDevice(c->deviceID));
1455 
1456  if (c->doStreaming) {
1457  int patchInd;
1458  while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
1459  c->patchReadyQueue[c->patchReadyQueueNext] = -1;
1460  c->patchReadyQueueNext++;
1461  c->checkCount = 0;
1462 
1463  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
1464  c->finishTimers();
1465  if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
1466  c->reSortTileLists();
1467  c->reSortDone = true;
1468  if (simParams->GBISOn && c->gbisPhase == 1) {
1469  // We must do GBIS Phase 1
1470  c->doGBISphase1();
1471  c->forceDoneSetCallback();
1472  return;
1473  }
1474  }
1475  }
1476 
1477  // Finish patch
1478  int pe = c->patches[patchInd].pe;
1479  PatchID patchID = c->patches[patchInd].patchID; // for priority
1480  c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
1481 
1482  // Last patch, return
1483  if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
1484 
1485  }
1486  } else {
1487  if (!c->forceDoneEventRecord)
1488  NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
1489  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
1490  if (err == cudaSuccess) {
1491  // Event has occurred
1492  c->forceDoneEventRecord = false;
1493  c->checkCount = 0;
1494  c->finishTimers();
1495  if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
1496  c->reSortTileLists();
1497  c->reSortDone = true;
1498  if (simParams->GBISOn && c->gbisPhase == 1) {
1499  // We must do GBIS Phase 1
1500  c->doGBISphase1();
1501  c->forceDoneSetCallback();
1502  return;
1503  }
1504  }
1505  c->finishPatches();
1506  return;
1507  } else if (err != cudaErrorNotReady) {
1508  // Anything else is an error
1509  char errmsg[256];
1510  sprintf(errmsg,"in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
1511  c->checkCount, walltime - c->beforeForceCompute);
1512  cudaDie(errmsg,err);
1513  }
1514  }
1515 
1516  // if (c->checkCount % 1000 == 0)
1517  // fprintf(stderr, "c->patchReadyQueueNext %d\n", c->patchReadyQueueNext);
1518 
1519  // Event has not occurred
1520  c->checkCount++;
1521  if (c->checkCount >= 1000000) {
1522  char errmsg[256];
1523  sprintf(errmsg,"CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
1524  c->checkCount, walltime - c->beforeForceCompute);
1525  cudaDie(errmsg,cudaSuccess);
1526  }
1527 
1528  // Call again
1529  CcdCallBacksReset(0, walltime);
1530  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
1531 }
1532 
1533 //
1534 // Set call back for all the work in the stream at this point
1535 //
1536 void CudaComputeNonbonded::forceDoneSetCallback() {
1537  if (CkMyPe() != masterPe)
1538  NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
1539  beforeForceCompute = CkWallTimer();
1540  cudaCheck(cudaSetDevice(deviceID));
1541  if (!doStreaming || doVirial || doEnergy) {
1542  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
1543  forceDoneEventRecord = true;
1544  }
1545  checkCount = 0;
1546  CcdCallBacksReset(0, CmiWallTimer());
1547  // Set the call back at 0.1ms
1548  CcdCallFnAfter(forceDoneCheck, this, 0.1);
1549 }
1550 
1551 struct cr_sortop_distance {
1552  const Lattice &l;
1553  cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
1554  bool operator() (CudaComputeNonbonded::ComputeRecord j,
1555  CudaComputeNonbonded::ComputeRecord i) { // i and j reversed
1556  Vector a = l.a();
1557  Vector b = l.b();
1558  Vector c = l.c();
1559  BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
1560  BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
1561  return ( ri < rj );
1562  }
1563 };
1564 
1565 static inline bool sortop_bitreverse(int a, int b) {
1566  if ( a == b ) return 0;
1567  for ( int bit = 1; bit; bit *= 2 ) {
1568  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
1569  }
1570  return 0;
1571 }
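// Worked example for the comparator above: 4 (binary 100) orders before
// 2 (binary 010) because, scanning from the least-significant bit, they first
// differ at bit 1, where 4 has a 0 and 2 has a 1; i.e. integers are compared by
// their bit-reversed values.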
1572 
1573 struct cr_sortop_reverse_priority {
1574  cr_sortop_distance &distop;
1575  const CudaComputeNonbonded::PatchRecord *pr;
1576  cr_sortop_reverse_priority(cr_sortop_distance& sod,
1577  const CudaComputeNonbonded::PatchRecord *patchrecs) : distop(sod), pr(patchrecs) { }
1578  bool pid_compare_priority(int2 pidi, int2 pidj) {
1579  const CudaComputeNonbonded::PatchRecord &pri = pr[pidi.y];
1580  const CudaComputeNonbonded::PatchRecord &prj = pr[pidj.y];
1581  if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
1582  if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
1583  if ( pri.isSameNode && ! prj.isSameNode ) return 0;
1584  if ( prj.isSameNode && ! pri.isSameNode ) return 1;
1585  if ( pri.isSameNode ) { // and prj.isSameNode
1586  int rpri = pri.reversePriorityRankInPe;
1587  int rprj = prj.reversePriorityRankInPe;
1588  if ( rpri != rprj ) return rpri > rprj;
1589  return sortop_bitreverse(CkRankOf(pri.pe),CkRankOf(prj.pe));
1590  }
1591  int ppi = PATCH_PRIORITY(pidi.x);
1592  int ppj = PATCH_PRIORITY(pidj.x);
1593  if ( ppi != ppj ) return ppi < ppj;
1594  return pidi.x < pidj.x;
1595  }
1596  bool operator() (CudaComputeNonbonded::ComputeRecord j,
1597  CudaComputeNonbonded::ComputeRecord i) { // i and j reversed
1598  // Choose patch i (= patch with greater priority)
1599  int2 pidi = pid_compare_priority(make_int2(i.pid[0], i.patchInd[0]), make_int2(i.pid[1], i.patchInd[1])) ? make_int2(i.pid[0], i.patchInd[0]) : make_int2(i.pid[1], i.patchInd[1]);
1600  // Choose patch j
1601  int2 pidj = pid_compare_priority(make_int2(j.pid[0], j.patchInd[0]), make_int2(j.pid[1], j.patchInd[1])) ? make_int2(j.pid[0], j.patchInd[0]) : make_int2(j.pid[1], j.patchInd[1]);
1602  if ( pidi.x != pidj.x ) return pid_compare_priority(pidi, pidj);
1603  return distop(i,j);
1604  }
1605 };
1606 
1607 //
1608 // Setup computes. This is only done at the beginning and at load balancing, hence the lack of
1609 // consideration for performance in the CPU->GPU memory copy.
1610 //
1611 void CudaComputeNonbonded::updateComputes() {
1612  cudaCheck(cudaSetDevice(deviceID));
1613 
1614  Lattice lattice = patches[0].patch->flags.lattice;
1615  cr_sortop_distance so(lattice);
1616  std::stable_sort(computes.begin(), computes.end(), so);
1617 
1618  if (doStreaming) {
1619  cr_sortop_reverse_priority sorp(so, patches.data());
1620  std::stable_sort(computes.begin(), computes.end(), sorp);
1621  }
1622 
1623  CudaComputeRecord* cudaComputes = new CudaComputeRecord[computes.size()];
1624 
1625  for (int i=0;i < computes.size();i++) {
1626  cudaComputes[i].patchInd.x = computes[i].patchInd[0];
1627  cudaComputes[i].patchInd.y = computes[i].patchInd[1];
1628  cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
1629  cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
1630  cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
1631  }
1632 
1633  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
1634  cudaCheck(cudaStreamSynchronize(stream));
1635 
1636  delete [] cudaComputes;
1637 }
1638 
1639 struct exlist_sortop {
1640  bool operator() (int32 *li, int32 *lj) {
1641  return ( li[1] < lj[1] );
1642  }
1643 };
1644 
1645 //
1646 // Builds the exclusions table. Swiped from ComputeNonbondedCUDA.C
1647 //
1648 void CudaComputeNonbonded::buildExclusions() {
1649  cudaCheck(cudaSetDevice(deviceID));
1650 
1651  Molecule *mol = Node::Object()->molecule;
1652 
1653 #ifdef MEM_OPT_VERSION
1654  int natoms = mol->exclSigPoolSize;
1655 #else
1656  int natoms = mol->numAtoms;
1657 #endif
1658 
1659  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
1660  exclusionsByAtom = new int2[natoms];
1661 
1662  // create unique sorted lists
1663 
1664  ObjectArena<int32> listArena;
1665  ResizeArray<int32*> unique_lists;
1666  int32 **listsByAtom = new int32*[natoms];
1667  SortableResizeArray<int32> curList;
1668  for ( int i=0; i<natoms; ++i ) {
1669  curList.resize(0);
1670  curList.add(0); // always excluded from self
1671 #ifdef MEM_OPT_VERSION
1672  const ExclusionSignature *sig = mol->exclSigPool + i;
1673  int n = sig->fullExclCnt;
1674  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
1675  n += 1;
1676 #else
1677  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
1678  int n = mol_list[0] + 1;
1679  for ( int j=1; j<n; ++j ) {
1680  curList.add(mol_list[j] - i);
1681  }
1682 #endif
1683  curList.sort();
1684 
1685  int j;
1686  for ( j=0; j<unique_lists.size(); ++j ) {
1687  if ( n != unique_lists[j][0] ) continue; // no match
1688  int k;
1689  for ( k=0; k<n; ++k ) {
1690  if ( unique_lists[j][k+3] != curList[k] ) break;
1691  }
1692  if ( k == n ) break; // found match
1693  }
1694  if ( j == unique_lists.size() ) { // no match
1695  int32 *list = listArena.getNewArray(n+3);
1696  list[0] = n;
1697  int maxdiff = 0;
1698  maxdiff = -1 * curList[0];
1699  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
1700  list[1] = maxdiff;
1701  for ( int k=0; k<n; ++k ) {
1702  list[k+3] = curList[k];
1703  }
1704  unique_lists.add(list);
1705  }
1706  listsByAtom[i] = unique_lists[j];
1707  }
1708  // sort lists by maxdiff
1709  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
1710  long int totalbits = 0;
1711  int nlists = unique_lists.size();
1712  for ( int j=0; j<nlists; ++j ) {
1713  int32 *list = unique_lists[j];
1714  int maxdiff = list[1];
1715  list[2] = totalbits + maxdiff;
1716  totalbits += 2*maxdiff + 1;
1717  }
1718  for ( int i=0; i<natoms; ++i ) {
1719  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
1720  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
1721  }
1722  delete [] listsByAtom;
1723 
1724  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
1725 
1726  {
1727  long int bytesneeded = totalbits / 8;
1728  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
1729  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
1730  unique_lists.size(), bytesneeded);
1731  }
1732 
1733  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
1734  if ( bytesneeded > bytesavail ) {
1735  char errmsg[512];
1736  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
1737  "but only %ld bytes can be addressed with 32-bit int.",
1738  unique_lists.size(), bytesneeded, bytesavail);
1739  NAMD_die(errmsg);
1740  }
1741  }
1742 
1743 #define SET_EXCL(EXCL,BASE,DIFF) \
1744  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
1745 
1746  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
1747  memset(exclusion_bits, 0, totalbits/8);
1748 
1749  long int base = 0;
1750  for ( int i=0; i<unique_lists.size(); ++i ) {
1751  base += unique_lists[i][1];
1752  if ( unique_lists[i][2] != (int32)base ) {
1753  NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
1754  }
1755  int n = unique_lists[i][0];
1756  for ( int j=0; j<n; ++j ) {
1757  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
1758  }
1759  base += unique_lists[i][1] + 1;
1760  }
1761 
1762  int numExclusions = totalbits/32;
1763 
1764  nonbondedKernel.bindExclusions(numExclusions, exclusion_bits);
1765 
1766  delete [] exclusion_bits;
1767 }
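To make the resulting layout concrete, the host-side sketch below shows one way such a table can be consulted. This is an illustrative assumption, not a copy of the CUDA kernel: atom i (indexing by atom here for simplicity; the MEM_OPT_VERSION path indexes by exclusion signature) owns a window of 2*maxdiff+1 bits centered at bit index start, with (maxdiff, start) taken from exclusionsByAtom[i], and a pair (i, j) is excluded exactly when |j - i| fits inside that window and the bit at start + (j - i) is set.

// exclusion_lookup_sketch.cpp : illustrative only
#include <cstdlib>   // std::abs

struct AtomExclWindow { int maxdiff; int start; };   // mirrors exclusionsByAtom[i].x / .y

// Returns true if the pair (i, j) is marked as excluded in the packed bit table.
static bool isExcluded(const unsigned int *exclusion_bits,
                       const AtomExclWindow *exclusionsByAtom,
                       int i, int j) {
  const int diff = j - i;                          // signed index difference
  const AtomExclWindow &w = exclusionsByAtom[i];
  if (std::abs(diff) > w.maxdiff) return false;    // outside atom i's window: never excluded
  const long pos = (long)w.start + diff;           // global bit index, same addressing as SET_EXCL
  return (exclusion_bits[pos >> 5] >> (pos & 31)) & 1u;
}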
1768 
1769 #endif // defined(NAMD_CUDA) || defined(NAMD_HIP)