NAMD
ComputeBondedCUDA.C
1 #include <algorithm> // std::find
2 
3 #include "charm++.h"
4 #include "NamdTypes.h"
5 #include "ComputeMgr.h"
6 #include "WorkDistrib.h"
7 #include "ProxyMgr.h"
8 #include "CudaUtils.h"
9 #include "DeviceCUDA.h"
10 #include "ComputeBonds.h"
11 #include "ComputeAngles.h"
12 #include "ComputeDihedrals.h"
13 #include "ComputeImpropers.h"
14 #include "ComputeCrossterms.h"
16 #include "ComputeBondedCUDA.h"
17 #include "PatchData.h"
18 #include "HipDefines.h"
19 #include "NamdEventsProfiling.h"
20 #include "CudaRecord.h"
21 
22 #include "TestArray.h"
23 
24 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
25 #ifdef WIN32
26 #define __thread __declspec(thread)
27 #endif
28 extern __thread DeviceCUDA *deviceCUDA;
29 
30 #ifdef BONDED_CUDA
31 
32 
33 const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {
34  sizeof(CudaBond), // Bonds
35  sizeof(CudaAngle), // Angles
36  sizeof(CudaDihedral), // Dihedrals
37  sizeof(CudaDihedral), // Impropers
38  sizeof(CudaExclusion), // Exclusions
39  sizeof(CudaCrossterm) // Crossterms
40 };
41 
42 const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {
43  sizeof(CudaBondStage), // Bonds
44  sizeof(CudaAngleStage), // Angles
45  sizeof(CudaDihedralStage), // Dihedrals
46  sizeof(CudaDihedralStage), // Impropers
47  sizeof(CudaExclusionStage), // Exclusions
48  sizeof(CudaCrossterm) // Crossterms
49 };
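// These two tables map each Tuples::* type index to the per-tuple byte size used when
// packing the host-side tupleData buffer: CudaTupleTypeSize for the regular copy path,
// and CudaTupleTypeSizeStage for the larger staging records consumed by the GPU-resident
// tuple-migration path (NODEGROUP_FORCE_REGISTER builds).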
50 
51 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
52 
53 //
54 // Class constructor
55 //
56 ComputeBondedCUDA::ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID,
57  CudaNonbondedTables& cudaNonbondedTables) :
58 Compute(c), computeMgr(computeMgr), deviceID(deviceID), masterPe(CkMyPe()),
59 bondedKernel(deviceID, cudaNonbondedTables)
60 {
61 
62  computes.resize(CkMyNodeSize());
63  patchIDsPerRank.resize(CkMyNodeSize());
64  numExclPerRank.resize(CkMyNodeSize());
65  for (int i=0;i < numExclPerRank.size();i++) {
66  numExclPerRank[i].numModifiedExclusions = 0;
67  numExclPerRank[i].numExclusions = 0;
68  }
69 
70  atomMap.allocateMap(Node::Object()->molecule->numAtoms);
71 
72  flags = NULL;
73  step = 0;
74 
75  tupleData = NULL;
76  tupleDataSize = 0;
77 
78  atoms = NULL;
79  atomsSize = 0;
80 
81  forces = NULL;
82  forcesSize = 0;
83 
84  energies_virials = NULL;
85 
86  initializeCalled = false;
87 
88  params = Node::Object()->simParameters;
89  accelMDdoDihe = false;
90  if (params->accelMDOn) {
91  if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;
92  }
93  /*pswitchTable = {0, 1, 2,
94  1, 1, 99,
95  2, 99, 2};
96  */
97  pswitchTable[0] = 0; pswitchTable[1] = 1; pswitchTable[2] = 2;
98  pswitchTable[3] = 1; pswitchTable[4] = 1; pswitchTable[5] = 99;
99  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;
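// pswitchTable is a 3x3 lookup indexed as pswitchTable[p1 + 3*p2], where p1 and p2 are the
// alchemical partitions (0, 1, or 2) of the two excluded atoms; see copyExclusionData() below.
// The value 99 appears to flag partition-1/partition-2 pairs, which should never interact
// directly in an FEP calculation.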
100 
101  h_patchRecord = NULL;
102  d_patchRecord = NULL;
103 
104  h_patchMapCenter = NULL;
105  d_patchMapCenter = NULL;
106 
107 }
108 
109 //
110 // Class destructor
111 //
112 ComputeBondedCUDA::~ComputeBondedCUDA() {
113  cudaCheck(cudaSetDevice(deviceID));
114 
115  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
116  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
117  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
118  if (tupleData != NULL) deallocate_host<char>(&tupleData);
119 
120  if (initializeCalled) {
121  cudaCheck(cudaStreamDestroy(stream));
122  cudaCheck(cudaEventDestroy(forceDoneEvent));
123  CmiDestroyLock(lock);
124  CmiDestroyLock(printLock);
125  delete reduction;
126  }
127 
128  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
129  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);
130 
131  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
132  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);
133 
134  // NOTE: unregistering happens in the [sync] entry method
135  computeMgr->sendUnregisterBoxesOnPe(pes, this);
136 }
137 
138 void ComputeBondedCUDA::unregisterBoxesOnPe() {
139  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
140  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
141  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
142  if (tpe == NULL || tpe->p == NULL) {
143  NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
144  }
145  Patch* patch = tpe->p;
146  if (tpe->positionBox != NULL) patch->unregisterPositionPickup(this, &tpe->positionBox);
147  if (tpe->avgPositionBox != NULL) patch->unregisterAvgPositionPickup(this, &tpe->avgPositionBox);
148  if (tpe->forceBox != NULL) patch->unregisterForceDeposit(this, &tpe->forceBox);
149  }
150 }
151 
152 //
153 // Register compute for a given PE. pids is a list of patches the PE has
154 // Called by master PE
155 //
156 void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {
157 
158  if (CkMyPe() != masterPe)
159  NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");
160 
161  int rank = CkRankOf(pe);
162 
163  HomeCompute& homeCompute = computes[rank].homeCompute;
164  if (homeCompute.patchIDs.size() == 0) {
165  homeCompute.isBasePatch.resize(PatchMap::Object()->numPatches(), 0);
166  homeCompute.patchIDs.resize(pids.size());
167  for (int i=0;i < pids.size();i++) {
168  homeCompute.patchIDs[i] = pids[i];
169  homeCompute.isBasePatch[pids[i]] = 1;
170  }
171  } else {
172  if (homeCompute.patchIDs.size() != pids.size()) {
173  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
174  }
175  for (int i=0;i < pids.size();i++) {
176  if (homeCompute.patchIDs[i] != pids[i]) {
177  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
178  }
179  }
180  }
181 
182  switch(type) {
183  case computeBondsType:
184  homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
185  break;
186 
187  case computeAnglesType:
188  homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
189  break;
190 
191  case computeDihedralsType:
192  homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
193  break;
194 
195  case computeImpropersType:
196  homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
197  break;
198 
199  case computeExclsType:
200  homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
201  break;
202 
203  case computeCrosstermsType:
204  homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
205  break;
206 
207  default:
208  NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
209  break;
210  }
211 
212 }
213 
214 //
215 // Register self compute for a given PE
216 // Called by master PE
217 //
218 void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {
219 
220  if (CkMyPe() != masterPe)
221  NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");
222 
223  int rank = CkRankOf(pe);
224 
225  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
226  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
227  if (it == selfComputes.end()) {
228  // Type not found, add new one
229  selfComputes.push_back(SelfCompute(type));
230  it = selfComputes.begin() + (selfComputes.size() - 1);
231 
232  switch(type) {
233  case computeSelfBondsType:
234  it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
235  break;
236 
237  case computeSelfAnglesType:
238  it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
239  break;
240 
241  case computeSelfDihedralsType:
242  it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
243  break;
244 
245  case computeSelfImpropersType:
246  it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
247  break;
248 
249  case computeSelfExclsType:
250  it->tuples = new SelfTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION);
251  break;
252 
253  case computeSelfCrosstermsType:
254  it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
255  break;
256 
257  default:
258  NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
259  break;
260  }
261 
262  }
263 
264  // Add patch ID for this type
265  it->patchIDs.push_back(pid);
266 }
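// registerSelfCompute() uses a find-or-create idiom: selfComputes is searched for an entry with
// the requested compute type, a new SelfCompute (and its SelfTuples list) is appended if none
// exists, and the patch ID is then recorded on that entry. The iterator is rebuilt from begin()
// after push_back() because the vector may have reallocated.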
267 
268 void ComputeBondedCUDA::assignPatchesOnPe() {
269 
270  PatchMap* patchMap = PatchMap::Object();
271  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
272  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
273  ProxyMgr::Object()->createProxy(patchID);
274  Patch* patch = patchMap->patch(patchID);
275  if (patch == NULL)
276  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
277  if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
278  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
279  if (tpe == NULL) {
280  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
281  }
282  if (tpe->p != NULL) {
283  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
284  }
285  // Assign patch and register coordinates and forces manually
286  tpe->p = patch;
287  tpe->positionBox = patch->registerPositionPickup(this);
288  tpe->avgPositionBox = patch->registerAvgPositionPickup(this);
289  tpe->forceBox = patch->registerForceDeposit(this);
290  }
291 }
292 
293 //
294 // atomUpdate() can be called by any Pe
295 //
296 void ComputeBondedCUDA::atomUpdate() {
297  atomsChangedIn = true;
298 }
299 
300 //
301 // Enqueue doWork on masterPe and return "no work"
302 // Can be called by any Pe
303 //
304 int ComputeBondedCUDA::noWork() {
305  computeMgr->sendMessageEnqueueWork(masterPe, this);
306  return 1;
307 }
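// noWork() always reports "no work" (returns 1) to the local scheduler; the actual work is
// funneled to the master PE instead, via sendMessageEnqueueWork(), so that a single PE per
// device drives the bonded kernel.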
308 
309 void ComputeBondedCUDA::messageEnqueueWork() {
310  if (masterPe != CkMyPe())
311  NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
312  WorkDistrib::messageEnqueueWork(this);
313 }
314 
315 //
316 // Sends open-box commands to PEs
317 // Called on master PE
318 //
319 void ComputeBondedCUDA::doWork() {
320  if (CkMyPe() != masterPe)
321  NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");
322 
323  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
324  // atomsChangedIn can be set to true by any Pe
325  // atomsChanged can only be set by masterPe
326  // Using two separate variables makes sure we don't have a race condition
327  atomsChanged = atomsChangedIn;
328  atomsChangedIn = false;
329 
330  if (getNumPatches() == 0) {
331  return; // No work to do
332  }
333 
334  if (flags == NULL)
335  NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");
336 
337  // Read flags
338  // flags describes what is needed this step (lattice, energy, virial, full electrostatics, Molly)
339  lattice = flags->lattice;
340  doEnergy = flags->doEnergy;
341  doVirial = flags->doVirial;
342  doSlow = flags->doFullElectrostatics;
343  doMolly = flags->doMolly;
344  step = flags->step;
345 
346  if (hostAlchFlags.alchOn) {
347  updateHostCudaAlchLambdas();
348  updateKernelCudaAlchLambdas();
349  const int& alchOutFreq = Node::Object()->simParameters->alchOutFreq;
350  if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {
351  doEnergy = true;
352  }
353  }
354 
355  if (atomsChanged) {
356  // Re-calculate patch atom numbers and storage
357  updatePatches();
358  }
359  // Open boxes on Pes and launch work to masterPe
360  if(params->CUDASOAintegrate) {
361  if (!atomsChanged) this->openBoxesOnPe();
362  }
363  else computeMgr->sendOpenBoxesOnPe(pes, this);
364 }
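// Summary of doWork(): the master PE snapshots atomsChangedIn, reads the per-step flags
// (lattice, energy/virial, full electrostatics, Molly, step), forces an energy evaluation on
// alchemical output steps, recomputes patch storage if atoms migrated, and then either opens
// the boxes directly (CUDASOAintegrate) or broadcasts sendOpenBoxesOnPe() to the other PEs.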
365 
366 //
367 // This gets called when patch finishes on a PE
368 //
369 void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
370  if (doneMigration) {
371  // auto it = patchIndex.find(pid);
372  // if (it == patchIndex.end())
373  // NAMD_bug("ComputeBondedCUDA::patchReady, Patch ID not found");
374  patches[patchIndex[pid]].numAtoms = PatchMap::Object()->patch(pid)->getNumAtoms();
375 #ifdef NODEGROUP_FORCE_REGISTER
376  patches[patchIndex[pid]].patchID = pid;
377 #endif
378  }
379  // XXX first sum locally, then sum globally
380  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate(),
381  // which is already called in Sequencer::runComputeObjects_CUDA
382  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
383  CmiLock(lock);
384  Compute::patchReady(pid, doneMigration, seq);
385  CmiUnlock(lock);
386  }
387 }
388 
389 //
390 //
391 //
392 void ComputeBondedCUDA::updatePatches() {
393  if (!Node::Object()->simParameters->CUDASOAintegrate) {
394  int atomStart = 0;
395  for (int i=0;i < patches.size();i++) {
396  patches[i].atomStart = atomStart;
397  atomStart += patches[i].numAtoms;
398  }
399  atomStorageSize = atomStart;
400 
401  // Re-allocate atoms
402  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
403 
404  } else {
405 #ifdef NODEGROUP_FORCE_REGISTER
406  const int deviceIndex = deviceCUDA->getDeviceIndex();
407  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
408  PatchData *patchData = cpdata.ckLocalBranch();
409 
410  std::vector<CudaLocalRecord>& localPatches =
411  patchData->devData[deviceIndex].h_localPatches;
412  const int numPatchesHomeAndProxy =
413  patchData->devData[deviceIndex].numPatchesHomeAndProxy;
414 
415  int atomStart = 0;
416  for (int i=0;i < numPatchesHomeAndProxy; i++) {
417  patches[i].numAtoms = localPatches[i].numAtoms;
418  patches[i].atomStart = localPatches[i].bufferOffset;
419  atomStart += patches[i].numAtoms;
420  }
421 
422  atomStorageSize = atomStart;
423  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
424 
425  if (params->CUDASOAintegrate && params->useDeviceMigration) {
426  bondedKernel.updateAtomBuffer(atomStorageSize, stream);
427  updatePatchRecords();
428  }
429 #endif // NODEGROUP_FORCE_REGISTER
430  }
431 }
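// Host-side layout produced above: patch i occupies atoms[atomStart_i .. atomStart_i + numAtoms_i),
// with atomStart computed as a running prefix sum (or taken from the device buffer offsets in the
// CUDASOAintegrate path). atomStorageSize is the total count, and atoms[] is grown with a 1.4x
// overallocation factor to limit reallocations.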
432 
433 //
434 // Map atoms GPU-wide
435 //
436 void ComputeBondedCUDA::mapAtoms() {
437  for (int i=0;i < getNumPatches();i++) {
438  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
439  atomMappers[i]->registerIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
440  }
441 
442 }
443 
444 //
445 // Unmap atoms GPU-wide
446 //
447 void ComputeBondedCUDA::unmapAtoms() {
448  for (int i=0;i < getNumPatches();i++) {
449  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
450  atomMappers[i]->unregisterIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
451  }
452 }
453 
454 //
455 // Open all patches that have been assigned to this Pe
456 //
457 void ComputeBondedCUDA::openBoxesOnPe(int startup) {
458  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_OPEN_BOXES);
459  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
460 #if 0
461  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);
462 #endif
463 #ifdef NODEGROUP_FORCE_REGISTER
464  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
465  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
466  PatchID patchID = *it;
467  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
468  tpe->x = tpe->positionBox->open();
469  tpe->r = tpe->forceBox->open();
470  //tpe->f = tpe->r->f[Results::normal];
471  }
472  this->launchWork();
473  }
474  else{
475 #endif
476  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
477  PatchID patchID = *it;
478  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
479  tpe->x = tpe->positionBox->open();
480  tpe->xExt = tpe->p->getCompAtomExtInfo();
481  //fprintf(stderr, "::openBoxesOnPe(%p) PE[%d] step %d PID[%d] = atomExt = %p\n", this, CkMyPe(), tpe->p->flags.step, tpe->p->getPatchID(), tpe->xExt);
482  if ( doMolly ) tpe->x_avg = tpe->avgPositionBox->open();
483  tpe->r = tpe->forceBox->open();
484  tpe->f = tpe->r->f[Results::normal];
485  if (accelMDdoDihe) tpe->af = tpe->r->f[Results::amdf]; // for dihedral-only or dual-boost accelMD
486  // Copy atoms
487  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
488  int pi = patchIndex[patchID];
489  int atomStart = patches[pi].atomStart;
490  int numAtoms = patches[pi].numAtoms;
491  CompAtom* compAtom = tpe->x;
492  const CompAtomExt *aExt = tpe->p->getCompAtomExtInfo();
493  const CudaAtom *src = tpe->p->getCudaAtomList();
494  for (int i=0;i < numAtoms;i++) {
495  int j = aExt[i].sortOrder;
496  // atomStart locates this patch's block in atoms[]; j is the sortOrder, so this unsorts the atoms
497  atoms[atomStart + j] = src[i];
498  }
499  }
500  }
501  bool done = false;
502  // NOTE: this whole scheme of counting down the patches will help
503  // when there are multiple masterPes
504  CmiLock(lock);
505  patchesCounter -= patchIDs.size();
506  if (patchesCounter == 0) {
507  patchesCounter = getNumPatches();
508  done = true;
509  }
510  CmiUnlock(lock);
511  if (done) {
512  if (atomsChanged) {
513  if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {
514  mapAtoms();
515  }
516  //computeMgr->sendLoadTuplesOnPe(pes, this);
517  if(!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);
518  } else {
519  //computeMgr->sendLaunchWork(masterPe, this);
520  if(params->CUDASOAintegrate){
521  if(!atomsChanged) this->launchWork();
522  }
523  else computeMgr->sendLaunchWork(masterPe, this);
524  }
525  }
526 #ifdef NODEGROUP_FORCE_REGISTER
527  }
528 #endif
529  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_OPEN_BOXES);
530 
531 #if 0
532  // CmiNodeBarrier();
533  // Patches are using different
534  CmiLock(printLock);
535  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
536  for(int i = 0 ; i < tuplePatchList.size(); i++){
537  // debug print of each TuplePatchElem in tuplePatchList
538  TuplePatchElem *tpe = tuplePatchList.find(TuplePatchElem(i));
539  if(tpe == NULL) break;
540  fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n",CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);
541  }
542  CmiUnlock(printLock);
543  // CmiNodeBarrier(); // Let's see
544 #endif
545 }
546 
547 void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
548  numModifiedExclusions = 0;
549  int ntuples = tuples->getNumTuples();
550  ExclElem* src = (ExclElem *)(tuples->getTupleList());
551  for (int ituple=0;ituple < ntuples;ituple++) {
552  if (src[ituple].modified) numModifiedExclusions++;
553  }
554  numExclusions = ntuples - numModifiedExclusions;
555 }
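// countNumExclusions() splits the exclusion tuples into "modified" exclusions (1-4 pairs that are
// still evaluated with modified/scaled nonbonded parameters) and plain exclusions (pairs that are
// removed entirely); the two classes are packed into separate lists for the GPU.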
556 
557 //
558 // Load tuples on PE. Note: this can only run after boxes on all PEs have been opened
559 //
560 void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {
561  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_LOAD_TUPLES);
562 
563  int numModifiedExclusions = 0;
564  int numExclusions = 0;
565  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
566  std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
567  // Loop over self compute types
568  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
569  // clears tupleList and reloads the tuples
570  it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
571  // For exclusions, we must count the number of modified and non-modified exclusions
572  if (it->tuples->getType() == Tuples::EXCLUSION) {
573  int tmp1, tmp2;
574  countNumExclusions(it->tuples, tmp1, tmp2);
575  numModifiedExclusions += tmp1;
576  numExclusions += tmp2;
577  }
578  }
579 
580  HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
581  for (int i=0;i < homeCompute.tuples.size();i++) {
582  homeCompute.tuples[i]->loadTuples(tuplePatchList,
583  homeCompute.isBasePatch.data(), &atomMap,
584  homeCompute.patchIDs);
585  // For exclusions, we must count the number of modified and non-modified exclusions
586  if (homeCompute.tuples[i]->getType() == Tuples::EXCLUSION) {
587  int tmp1, tmp2;
588  countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
589  numModifiedExclusions += tmp1;
590  numExclusions += tmp2;
591  }
592  }
593  } else {
594  numModifiedExclusions = modifiedExclusionTupleData.size();
595  numExclusions = exclusionTupleData.size();
596  }
597 
598  // Store number of exclusions
599  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
600  numExclPerRank[CkMyRank()].numExclusions = numExclusions;
601 
602 
603  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
604  bool done = false;
605 
606  // TODO: Swap that for a more efficient atomic operation
607  // Think of a lock-free design
608  CmiLock(lock);
609  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
610  if (patchesCounter == 0) {
611  patchesCounter = getNumPatches();
612  done = true;
613  }
614  CmiUnlock(lock);
615  if (done) {
616  //computeMgr->sendLaunchWork(masterPe, this);
617  //if(params->CUDASOAintegrate && !first) this->launchWork();
618  //else computeMgr->sendLaunchWork(masterPe, this);
619  if(!params->CUDASOAintegrate)computeMgr->sendLaunchWork(masterPe, this);
620  }
621  }
622 
623  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_LOAD_TUPLES);
624 }
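// The TODO above asks for a lock-free replacement of the lock-protected countdown used here and in
// openBoxesOnPe(). A minimal sketch of that idea with std::atomic (illustrative only; it assumes
// patchesCounter becomes std::atomic<int> and nPatchesThisRank stands for
// patchIDsPerRank[CkMyRank()].size(), neither of which is what the current code does):
//
//   int remaining = patchesCounter.fetch_sub(nPatchesThisRank, std::memory_order_acq_rel)
//                   - nPatchesThisRank;
//   if (remaining == 0) {
//     patchesCounter.store(getNumPatches(), std::memory_order_relaxed);
//     // last rank to arrive: trigger sendLaunchWork()/launchWork()
//   }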
625 
626 void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
627  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
628 
629  PatchMap* patchMap = PatchMap::Object();
630  for (int ituple=0;ituple < ntuples;ituple++) {
631  CudaBond dstval;
632  auto p0 = src[ituple].p[0];
633  auto p1 = src[ituple].p[1];
634  int pi0 = patchIndex[p0->patchID];
635  int pi1 = patchIndex[p1->patchID];
636  int l0 = src[ituple].localIndex[0];
637  int l1 = src[ituple].localIndex[1];
638  dstval.i = l0 + patches[pi0].atomStart;
639  dstval.j = l1 + patches[pi1].atomStart;
640  dstval.itype = (src[ituple].value - bond_array);
641  Position position1 = p0->x[l0].position;
642  Position position2 = p1->x[l1].position;
643  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
644  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
645  dstval.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
646  dstval.scale = src[ituple].scale;
647  if (hostAlchFlags.alchOn) {
648  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
649  const Molecule& mol = *(Node::Object()->molecule);
650  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
651  } else {
652  dstval.fepBondedType = 0;
653  }
654  dst[ituple] = dstval;
655  }
656 }
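// Index and offset convention used by copyBondData() (and the other copy*Data() routines): the GPU
// atom index is localIndex + patches[patchIndex[patchID]].atomStart, and ioffsetXYZ is the periodic
// image shift in scaled (fractional) coordinates, i.e. lattice.wrap_delta_scaled() between the two
// positions plus the patch-center difference when the atoms sit in different patches. itype is the
// offset of the tuple's parameter entry within the parameter array.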
657 
658 #ifdef NODEGROUP_FORCE_REGISTER
659 template<>
660 void ComputeBondedCUDA::copyTupleToStage(const BondElem& src,
661  const BondValue* __restrict__ p_array, CudaBondStage& dstval) {
662 
663  auto p0 = src.p[0];
664  auto p1 = src.p[1];
665  int pi0 = patchIndex[p0->patchID];
666  int pi1 = patchIndex[p1->patchID];
667  int l0 = src.localIndex[0];
668  int l1 = src.localIndex[1];
669  dstval.itype = (src.value - p_array);
670  dstval.scale = src.scale;
671  dstval.patchIDs[0] = p0->patchID;
672  dstval.patchIDs[1] = p1->patchID;
673 
674  dstval.index[0] = l0;
675  dstval.index[1] = l1;
676 
677  if (hostAlchFlags.alchOn) {
678  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
679  const Molecule& mol = *(Node::Object()->molecule);
680  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
681  } else {
682  dstval.fepBondedType = 0;
683  }
684 }
685 #endif // NODEGROUP_FORCE_REGISTER
686 
687 // XXX NOTE: Modified FP32 version
688 void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
689  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
690 
691  PatchMap* patchMap = PatchMap::Object();
692  float3 b1f, b2f, b3f;
693  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
694  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
695  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);
696 
697  for (int ituple=0;ituple < ntuples;ituple++) {
698  CudaBond dstval;
699  auto p0 = src[ituple].p[0];
700  auto p1 = src[ituple].p[1];
701  int pi0 = patchIndex[p0->patchID];
702  int pi1 = patchIndex[p1->patchID];
703  int l0 = src[ituple].localIndex[0];
704  int l1 = src[ituple].localIndex[1];
705  dstval.i = l0 + patches[pi0].atomStart;
706  dstval.j = l1 + patches[pi1].atomStart;
707  dstval.itype = (src[ituple].value - bond_array);
708 #if 0
709  Position position1 = p0->x[l0].position;
710  Position position2 = p1->x[l1].position;
711  Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
712  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
713 #endif
714  float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
715  float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
716  float3 diff = position1 - position2;
717  float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
718  float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
719  float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);
720  if(pi0 != pi1){
721  Vector c = patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
722  d1 += c.x;
723  d2 += c.y;
724  d3 += c.z;
725  }
726  dstval.ioffsetXYZ = make_float3(d1, d2, d3);
727  dstval.scale = src[ituple].scale;
728  if (hostAlchFlags.alchOn) {
729  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
730  const Molecule& mol = *(Node::Object()->molecule);
731  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
732  } else {
733  dstval.fepBondedType = 0;
734  }
735  dst[ituple] = dstval;
736  }
737 }
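// The fp32 variant above reproduces wrap_delta_scaled() in single precision: for each reciprocal
// lattice vector b_k it computes -floorf(dot(b_k, x1 - x2) + 0.5f), which is the nearest-image
// shift along that lattice direction, and then adds the patch-center correction as in the
// double-precision path.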
738 
739 void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
740  const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst) {
741  PatchMap* patchMap = PatchMap::Object();
742  for (int ituple=0;ituple < ntuples;ituple++) {
743  CudaAngle dstval;
744  auto p0 = src[ituple].p[0];
745  auto p1 = src[ituple].p[1];
746  auto p2 = src[ituple].p[2];
747  int pi0 = patchIndex[p0->patchID];
748  int pi1 = patchIndex[p1->patchID];
749  int pi2 = patchIndex[p2->patchID];
750  int l0 = src[ituple].localIndex[0];
751  int l1 = src[ituple].localIndex[1];
752  int l2 = src[ituple].localIndex[2];
753  dstval.i = l0 + patches[pi0].atomStart;
754  dstval.j = l1 + patches[pi1].atomStart;
755  dstval.k = l2 + patches[pi2].atomStart;
756  dstval.itype = (src[ituple].value - angle_array);
757  Position position1 = p0->x[l0].position;
758  Position position2 = p1->x[l1].position;
759  Position position3 = p2->x[l2].position;
760  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
761  Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
762  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
763  if(pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
764 
765  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
766  dstval.koffsetXYZ = make_float3((float)shiftVec32.x, (float)shiftVec32.y, (float)shiftVec32.z);
767  dstval.scale = src[ituple].scale;
768  if (hostAlchFlags.alchOn) {
769  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
770  const Molecule& mol = *(Node::Object()->molecule);
771  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 3);
772  } else {
773  dstval.fepBondedType = 0;
774  }
775  dst[ituple] = dstval;
776  }
777 }
778 
779 #ifdef NODEGROUP_FORCE_REGISTER
780 template<>
781 void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src,
782  const AngleValue* __restrict__ p_array, CudaAngleStage& dstval) {
783 
784  auto p0 = src.p[0];
785  auto p1 = src.p[1];
786  auto p2 = src.p[2];
787  int pi0 = patchIndex[p0->patchID];
788  int pi1 = patchIndex[p1->patchID];
789  int pi2 = patchIndex[p2->patchID];
790  int l0 = src.localIndex[0];
791  int l1 = src.localIndex[1];
792  int l2 = src.localIndex[2];
793  dstval.itype = (src.value - p_array);
794 
795  dstval.patchIDs[0] = p0->patchID;
796  dstval.patchIDs[1] = p1->patchID;
797  dstval.patchIDs[2] = p2->patchID;
798 
799  dstval.index[0] = l0;
800  dstval.index[1] = l1;
801  dstval.index[2] = l2;
802 
803  dstval.scale = src.scale;
804  if (hostAlchFlags.alchOn) {
805  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
806  const Molecule& mol = *(Node::Object()->molecule);
807  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 3);
808  } else {
809  dstval.fepBondedType = 0;
810  }
811 }
812 #endif // NODEGROUP_FORCE_REGISTER
813 
814 //
815 // Used for both dihedrals and impropers
816 //
817 template <bool doDihedral, typename T, typename P>
818 void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
819  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
820 
821  PatchMap* patchMap = PatchMap::Object();
822 
823  for (int ituple=0;ituple < ntuples;ituple++) {
824  CudaDihedral dstval;
825  auto p0 = src[ituple].p[0];
826  auto p1 = src[ituple].p[1];
827  auto p2 = src[ituple].p[2];
828  auto p3 = src[ituple].p[3];
829  int pi0 = patchIndex[p0->patchID];
830  int pi1 = patchIndex[p1->patchID];
831  int pi2 = patchIndex[p2->patchID];
832  int pi3 = patchIndex[p3->patchID];
833  int l0 = src[ituple].localIndex[0];
834  int l1 = src[ituple].localIndex[1];
835  int l2 = src[ituple].localIndex[2];
836  int l3 = src[ituple].localIndex[3];
837  dstval.i = l0 + patches[pi0].atomStart;
838  dstval.j = l1 + patches[pi1].atomStart;
839  dstval.k = l2 + patches[pi2].atomStart;
840  dstval.l = l3 + patches[pi3].atomStart;
841  if (doDihedral) {
842  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
843  } else {
844  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
845  }
846  Position position1 = p0->x[l0].position;
847  Position position2 = p1->x[l1].position;
848  Position position3 = p2->x[l2].position;
849  Position position4 = p3->x[l3].position;
850  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
851  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
852  Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
853  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
854  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
855  if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
856 
857  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
858  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
859  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
860  dstval.scale = src[ituple].scale;
861  if (hostAlchFlags.alchOn) {
862  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
863  const Molecule& mol = *(Node::Object()->molecule);
864  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
865  } else {
866  dstval.fepBondedType = 0;
867  }
868  dst[ituple] = dstval;
869  }
870 }
871 
872 #ifdef NODEGROUP_FORCE_REGISTER
873 template <>
874 void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src,
875  const DihedralValue* __restrict__ p_array, CudaDihedralStage& dstval) {
876 
877  auto p0 = src.p[0];
878  auto p1 = src.p[1];
879  auto p2 = src.p[2];
880  auto p3 = src.p[3];
881  int pi0 = patchIndex[p0->patchID];
882  int pi1 = patchIndex[p1->patchID];
883  int pi2 = patchIndex[p2->patchID];
884  int pi3 = patchIndex[p3->patchID];
885  int l0 = src.localIndex[0];
886  int l1 = src.localIndex[1];
887  int l2 = src.localIndex[2];
888  int l3 = src.localIndex[3];
889  dstval.itype = dihedralMultMap[(src.value - p_array)];
890 
891  dstval.patchIDs[0] = p0->patchID;
892  dstval.patchIDs[1] = p1->patchID;
893  dstval.patchIDs[2] = p2->patchID;
894  dstval.patchIDs[3] = p3->patchID;
895 
896  dstval.index[0] = l0;
897  dstval.index[1] = l1;
898  dstval.index[2] = l2;
899  dstval.index[3] = l3;
900 
901  dstval.scale = src.scale;
902  if (hostAlchFlags.alchOn) {
903  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
904  const Molecule& mol = *(Node::Object()->molecule);
905  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
906  } else {
907  dstval.fepBondedType = 0;
908  }
909 }
910 
911 template <>
912 void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src,
913  const ImproperValue* __restrict__ p_array, CudaDihedralStage& dstval) {
914 
915  auto p0 = src.p[0];
916  auto p1 = src.p[1];
917  auto p2 = src.p[2];
918  auto p3 = src.p[3];
919  int pi0 = patchIndex[p0->patchID];
920  int pi1 = patchIndex[p1->patchID];
921  int pi2 = patchIndex[p2->patchID];
922  int pi3 = patchIndex[p3->patchID];
923  int l0 = src.localIndex[0];
924  int l1 = src.localIndex[1];
925  int l2 = src.localIndex[2];
926  int l3 = src.localIndex[3];
927  dstval.itype = improperMultMap[(src.value - p_array)];
928 
929  dstval.patchIDs[0] = p0->patchID;
930  dstval.patchIDs[1] = p1->patchID;
931  dstval.patchIDs[2] = p2->patchID;
932  dstval.patchIDs[3] = p3->patchID;
933 
934  dstval.index[0] = l0;
935  dstval.index[1] = l1;
936  dstval.index[2] = l2;
937  dstval.index[3] = l3;
938 
939  dstval.scale = src.scale;
940  if (hostAlchFlags.alchOn) {
941  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
942  const Molecule& mol = *(Node::Object()->molecule);
943  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
944  } else {
945  dstval.fepBondedType = 0;
946  }
947 }
948 
949 template <typename T, typename P, typename D>
950  void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
951  const P* __restrict__ p_array, std::vector<D>& dst) {
952 
953  for (int ituple=0;ituple < ntuples;ituple++) {
954  D dstval;
955  copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
956  dst.push_back(dstval);
957  }
958 }
959 #endif // NODEGROUP_FORCE_REGISTER
960 
961 
962 //
963 // Used for both dihedrals and impropers
964 //
965 template <bool doDihedral, typename T, typename P>
966 void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
967  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
968 
969  PatchMap* patchMap = PatchMap::Object();
970  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
971  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
972  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);
973 
974  for (int ituple=0;ituple < ntuples;ituple++) {
975  CudaDihedral dstval;
976  auto p0 = src[ituple].p[0];
977  auto p1 = src[ituple].p[1];
978  auto p2 = src[ituple].p[2];
979  auto p3 = src[ituple].p[3];
980  int pi0 = patchIndex[p0->patchID];
981  int pi1 = patchIndex[p1->patchID];
982  int pi2 = patchIndex[p2->patchID];
983  int pi3 = patchIndex[p3->patchID];
984  int l0 = src[ituple].localIndex[0];
985  int l1 = src[ituple].localIndex[1];
986  int l2 = src[ituple].localIndex[2];
987  int l3 = src[ituple].localIndex[3];
988  dstval.i = l0 + patches[pi0].atomStart;
989  dstval.j = l1 + patches[pi1].atomStart;
990  dstval.k = l2 + patches[pi2].atomStart;
991  dstval.l = l3 + patches[pi3].atomStart;
992  if (doDihedral) {
993  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
994  } else {
995  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
996  }
997 #if 0
998  Position position1 = p0->x[l0].position;
999  Position position2 = p1->x[l1].position;
1000  Position position3 = p2->x[l2].position;
1001  Position position4 = p3->x[l3].position;
1002  Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
1003  Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
1004  Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
1005  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1006  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1007  if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
1008 
1009  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
1010  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
1011  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
1012 #endif
1013 
1014  float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
1015  float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
1016  float3 position3 = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
1017  float3 position4 = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);
1018 
1019  float3 diff12, diff23, diff43;
1020  diff12 = position1 - position2;
1021  diff23 = position2 - position3;
1022  diff43 = position4 - position3;
1023 
1024  float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
1025  float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
1026  float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);
1027 
1028  float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
1029  float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
1030  float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);
1031 
1032  float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
1033  float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
1034  float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);
1035 
1036  if(pi0 != pi1){
1037  Vector c = patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1038  d12_x += c.x;
1039  d12_y += c.y;
1040  d12_z += c.z;
1041  }
1042 
1043  if(pi1 != pi2){
1044  Vector c = patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1045  d23_x += c.x;
1046  d23_y += c.y;
1047  d23_z += c.z;
1048  }
1049 
1050  if(pi3 != pi2){
1051  Vector c = patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
1052  d43_x += c.x;
1053  d43_y += c.y;
1054  d43_z += c.z;
1055  }
1056 
1057  dstval.ioffsetXYZ = make_float3(d12_x, d12_y, d12_z);
1058  dstval.joffsetXYZ = make_float3(d23_x, d23_y, d23_z);
1059  dstval.loffsetXYZ = make_float3(d43_x, d43_y, d43_z);
1060 
1061  dstval.scale = src[ituple].scale;
1062  if (hostAlchFlags.alchOn) {
1063  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1064  const Molecule& mol = *(Node::Object()->molecule);
1065  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1066  } else {
1067  dstval.fepBondedType = 0;
1068  }
1069  dst[ituple] = dstval;
1070  }
1071 }
1072 
1073 
1074 void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
1075  CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int64_t& pos, int64_t& pos2) {
1076 
1077  PatchMap* patchMap = PatchMap::Object();
1078  for (int ituple=0;ituple < ntuples;ituple++) {
1079  auto p0 = src[ituple].p[0];
1080  auto p1 = src[ituple].p[1];
1081  int pi0 = patchIndex[p0->patchID];
1082  int pi1 = patchIndex[p1->patchID];
1083  int l0 = src[ituple].localIndex[0];
1084  int l1 = src[ituple].localIndex[1];
1085  CompAtom& ca1 = p0->x[l0];
1086  CompAtom& ca2 = p1->x[l1];
1087  Position position1 = ca1.position;
1088  Position position2 = ca2.position;
1089  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
1090  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1091  CudaExclusion ce;
1092  ce.i = l0 + patches[pi0].atomStart;
1093  ce.j = l1 + patches[pi1].atomStart;
1094  ce.vdwtypei = ca1.vdwType;
1095  ce.vdwtypej = ca2.vdwType;
1096  ce.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
1097  if (hostAlchFlags.alchOn) {
1098  const unsigned char p1 = ca1.partition;
1099  const unsigned char p2 = ca2.partition;
1100  ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];
1101  } else {
1102  ce.pswitch = 0;
1103  }
1104  if (src[ituple].modified) {
1105  *dst1 = ce;
1106  dst1++;
1107  pos += typeSize;
1108  } else {
1109  *dst2 = ce;
1110  dst2++;
1111  pos2 += typeSize;
1112  }
1113  }
1114 }
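// copyExclusionData() writes modified exclusions through dst1 and plain exclusions through dst2,
// advancing the byte offsets pos and pos2 by typeSize per tuple so the caller can keep appending
// into the shared tupleData buffer across successive tuple lists.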
1115 
1116 #ifdef NODEGROUP_FORCE_REGISTER
1117 template<>
1118 void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src,
1119  const int* __restrict__ p_array, CudaExclusionStage& dstval) {
1120 
1121  auto p0 = src.p[0];
1122  auto p1 = src.p[1];
1123  int pi0 = patchIndex[p0->patchID];
1124  int pi1 = patchIndex[p1->patchID];
1125  int l0 = src.localIndex[0];
1126  int l1 = src.localIndex[1];
1127  CompAtom& ca1 = p0->x[l0];
1128  CompAtom& ca2 = p1->x[l1];
1129 
1130  dstval.vdwtypei = ca1.vdwType;
1131  dstval.vdwtypej = ca2.vdwType;
1132 
1133  dstval.patchIDs[0] = p0->patchID;
1134  dstval.patchIDs[1] = p1->patchID;
1135 
1136  dstval.index[0] = l0;
1137  dstval.index[1] = l1;
1138 
1139  if (hostAlchFlags.alchOn) {
1140  const unsigned char p1 = ca1.partition;
1141  const unsigned char p2 = ca2.partition;
1142  dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];
1143  } else {
1144  dstval.pswitch = 0;
1145  }
1146 }
1147 
1148 
1149 
1150 void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
1151  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {
1152 
1153  for (int ituple=0;ituple < ntuples;ituple++) {
1154  CudaExclusionStage ce;
1155  copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
1156  if (src[ituple].modified) {
1157  dst1.push_back(ce);
1158  pos += typeSize;
1159  } else {
1160  dst2.push_back(ce);
1161  pos2 += typeSize;
1162  }
1163  }
1164 }
1165 #endif // NODEGROUP_FORCE_REGISTER
1166 
1167 void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
1168  const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst) {
1169 
1170  PatchMap* patchMap = PatchMap::Object();
1171  for (int ituple=0;ituple < ntuples;ituple++) {
1172  auto p0 = src[ituple].p[0];
1173  auto p1 = src[ituple].p[1];
1174  auto p2 = src[ituple].p[2];
1175  auto p3 = src[ituple].p[3];
1176  auto p4 = src[ituple].p[4];
1177  auto p5 = src[ituple].p[5];
1178  auto p6 = src[ituple].p[6];
1179  auto p7 = src[ituple].p[7];
1180  int pi0 = patchIndex[p0->patchID];
1181  int pi1 = patchIndex[p1->patchID];
1182  int pi2 = patchIndex[p2->patchID];
1183  int pi3 = patchIndex[p3->patchID];
1184  int pi4 = patchIndex[p4->patchID];
1185  int pi5 = patchIndex[p5->patchID];
1186  int pi6 = patchIndex[p6->patchID];
1187  int pi7 = patchIndex[p7->patchID];
1188  int l0 = src[ituple].localIndex[0];
1189  int l1 = src[ituple].localIndex[1];
1190  int l2 = src[ituple].localIndex[2];
1191  int l3 = src[ituple].localIndex[3];
1192  int l4 = src[ituple].localIndex[4];
1193  int l5 = src[ituple].localIndex[5];
1194  int l6 = src[ituple].localIndex[6];
1195  int l7 = src[ituple].localIndex[7];
1196  dst[ituple].i1 = l0 + patches[pi0].atomStart;
1197  dst[ituple].i2 = l1 + patches[pi1].atomStart;
1198  dst[ituple].i3 = l2 + patches[pi2].atomStart;
1199  dst[ituple].i4 = l3 + patches[pi3].atomStart;
1200  dst[ituple].i5 = l4 + patches[pi4].atomStart;
1201  dst[ituple].i6 = l5 + patches[pi5].atomStart;
1202  dst[ituple].i7 = l6 + patches[pi6].atomStart;
1203  dst[ituple].i8 = l7 + patches[pi7].atomStart;
1204  dst[ituple].itype = (src[ituple].value - crossterm_array);
1205  Position position1 = p0->x[l0].position;
1206  Position position2 = p1->x[l1].position;
1207  Position position3 = p2->x[l2].position;
1208  Position position4 = p3->x[l3].position;
1209  Position position5 = p4->x[l4].position;
1210  Position position6 = p5->x[l5].position;
1211  Position position7 = p6->x[l6].position;
1212  Position position8 = p7->x[l7].position;
1213  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
1214  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
1215  Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
1216  Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
1217  Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
1218  Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
1219  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1220  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1221  if(pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
1222  if(pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
1223  if(pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
1224  if(pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
1225  dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
1226  dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
1227  dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
1228  dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
1229  dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
1230  dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
1231  if (hostAlchFlags.alchOn) {
1232  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1233  const Molecule& mol = *(Node::Object()->molecule);
1234  int typeSum1 = 0, typeSum2 = 0;
1235  for (size_t i = 0; i < 4; ++i) {
1236  typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
1237  typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
1238  }
1239  int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
1240  if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
1241  dst[ituple].fepBondedType = 1;
1242  } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
1243  dst[ituple].fepBondedType = 2;
1244  }
1245  } else {
1246  dst[ituple].fepBondedType = 0;
1247  }
1248  dst[ituple].scale = src[ituple].scale;
1249  }
1250 }
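// For crossterms the alchemical bonded type is derived from the two 4-atom halves: per-atom
// get_fep_type() values are summed with type 2 counted as -1, and a half whose sum lies strictly
// between 0 and "order" selects fepBondedType = 1, while a sum strictly between -order and 0
// selects fepBondedType = 2 (presumably the appearing and vanishing FEP groups);
// alchBondDecouple relaxes the bound by one.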
1251 
1252 #ifdef NODEGROUP_FORCE_REGISTER
1253 template <>
1254 void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src,
1255  const CrosstermValue* __restrict__ crossterm_array, CudaCrosstermStage& dstval) {
1256 
1257  auto p0 = src.p[0];
1258  auto p1 = src.p[1];
1259  auto p2 = src.p[2];
1260  auto p3 = src.p[3];
1261  auto p4 = src.p[4];
1262  auto p5 = src.p[5];
1263  auto p6 = src.p[6];
1264  auto p7 = src.p[7];
1265  int pi0 = patchIndex[p0->patchID];
1266  int pi1 = patchIndex[p1->patchID];
1267  int pi2 = patchIndex[p2->patchID];
1268  int pi3 = patchIndex[p3->patchID];
1269  int pi4 = patchIndex[p4->patchID];
1270  int pi5 = patchIndex[p5->patchID];
1271  int pi6 = patchIndex[p6->patchID];
1272  int pi7 = patchIndex[p7->patchID];
1273  int l0 = src.localIndex[0];
1274  int l1 = src.localIndex[1];
1275  int l2 = src.localIndex[2];
1276  int l3 = src.localIndex[3];
1277  int l4 = src.localIndex[4];
1278  int l5 = src.localIndex[5];
1279  int l6 = src.localIndex[6];
1280  int l7 = src.localIndex[7];
1281  dstval.itype = (src.value - crossterm_array);
1282 
1283  dstval.patchIDs[0] = p0->patchID;
1284  dstval.patchIDs[1] = p1->patchID;
1285  dstval.patchIDs[2] = p2->patchID;
1286  dstval.patchIDs[3] = p3->patchID;
1287  dstval.patchIDs[4] = p4->patchID;
1288  dstval.patchIDs[5] = p5->patchID;
1289  dstval.patchIDs[6] = p6->patchID;
1290  dstval.patchIDs[7] = p7->patchID;
1291 
1292  dstval.index[0] = l0;
1293  dstval.index[1] = l1;
1294  dstval.index[2] = l2;
1295  dstval.index[3] = l3;
1296  dstval.index[4] = l4;
1297  dstval.index[5] = l5;
1298  dstval.index[6] = l6;
1299  dstval.index[7] = l7;
1300 
1301  dstval.scale = src.scale;
1302  if (hostAlchFlags.alchOn) {
1303  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1304  const Molecule& mol = *(Node::Object()->molecule);
1305  int typeSum1 = 0, typeSum2 = 0;
1306  for (size_t i = 0; i < 4; ++i) {
1307  typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
1308  typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
1309  }
1310  int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
1311  if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
1312  dstval.fepBondedType = 1;
1313  } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
1314  dstval.fepBondedType = 2;
1315  }
1316  } else {
1317  dstval.fepBondedType = 0;
1318  }
1319 }
1320 #endif // NODEGROUP_FORCE_REGISTER
1321 
1322 
1323 void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
1324  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
1325  c->tupleCopyWorker(first, last);
1326 }
1327 
1328 void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
1329  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
1330  c->tupleCopyWorkerExcl(first, last);
1331 }
1332 
1333 void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {
1334 
1335  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1336  // Separate exclusions into modified and non-modified
1337  auto itStart=std::next(tupleList[Tuples::EXCLUSION].begin(),first);
1338  //compute offset for this rank
1339  int64_t numModExclusionsBefore=0;
1340  int64_t numExclusionsBefore=0;
1341  // scanning all preceding tuple lists is inefficient, but it yields the correct offsets
1342  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != itStart;it++) {
1343  int ntuples = (*it)->getNumTuples();
1344  auto thisExcl = (ExclElem *)(*it)->getTupleList();
1345  for (int ituple=0;ituple < ntuples;ituple++) {
1346  if(thisExcl[ituple].modified)
1347  numModExclusionsBefore++;
1348  else
1349  numExclusionsBefore++;
1350  }
1351  }
1352 
1353  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
1354  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
1355  int numExclusionsWork=last-first;
1356  auto itEnd=std::next(itStart,numExclusionsWork+1);
1357  for ( auto it=itStart;it != itEnd;it++) {
1358  int ntuples = (*it)->getNumTuples();
1359  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
1360  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
1361  }
1362  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1363 }
1364 
1365 void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
1366 
1367  if (first == -1) {
1368  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1369  // Separate exclusions into modified and non-modified
1370  int64_t pos = exclusionStartPos;
1371  int64_t pos2 = exclusionStartPos2;
1372  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
1373  int ntuples = (*it)->getNumTuples();
1374  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
1375  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
1376  }
1377  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1378  first = 0;
1379  }
1380  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1381  // JM: Move the switch statement outside and do the for loop inside the cases
1382  for (int i=first;i <= last;i++) {
1383  switch (tupleCopyWorkList[i].tupletype) {
1384 
1385  case Tuples::BOND:
1386  {
1387 #if 1
1388  copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
1389  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1390 #else
1391  copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
1392  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1393 #endif
1394  }
1395  break;
1396 
1397  case Tuples::ANGLE:
1398  {
1399  copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
1400  Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1401  }
1402  break;
1403 
1404  case Tuples::DIHEDRAL:
1405  {
1406 
1407 #if 1
1408  copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
1409  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
1410  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1411 #else
1412  copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
1413  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
1414  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1415 #endif
1416  }
1417  break;
1418 
1419  case Tuples::IMPROPER:
1420  {
1421  copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
1422  (ImproperElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->improper_array,
1423  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1424  }
1425  break;
1426 
1427  case Tuples::CROSSTERM:
1428  {
1429  copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
1430  Node::Object()->parameters->crossterm_array, (CudaCrossterm *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1431  }
1432  break;
1433 
1434  default:
1435  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
1436  break;
1437  }
1438  }
1439  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1440 }
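// tupleCopyWorker() chunks are dispatched over tupleCopyWorkList entries, each carrying the tuple
// type, the tuple count, the source element list, and the destination byte offset into tupleData.
// The chunk with first == -1 additionally copies all exclusions (into the modified and regular
// regions) before falling through to the regular work-list loop.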
1441 
1442 
1443 #ifdef NODEGROUP_FORCE_REGISTER
1444 
1445 void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {
1446  // Atomically compute the max number of tuples across all GPUs
1447  // Each GPU will use this, scaled by OVERALLOC, as its buffer size to keep things simple
1448 
1449  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1450  PatchData *patchData = cpdata.ckLocalBranch();
1451 
1452  // Update bonds
1453  int numBondsTest = patchData->maxNumBonds.load();
1454  while (numBondsTest < counts.bond &&
1455  !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));
1456 
1457  int numAnglesTest = patchData->maxNumAngles.load();
1458  while (numAnglesTest < counts.angle &&
1459  !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));
1460 
1461  int numDihedralsTest = patchData->maxNumDihedrals.load();
1462  while (numDihedralsTest < counts.dihedral &&
1463  !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));
1464 
1465  int numImpropersTest = patchData->maxNumImpropers.load();
1466  while (numImpropersTest < counts.improper &&
1467  !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));
1468 
1469  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
1470  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
1471  !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));
1472 
1473  int numExclusionsTest = patchData->maxNumExclusions.load();
1474  while (numExclusionsTest < counts.exclusion &&
1475  !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));
1476 
1477  int numCrosstermsTest = patchData->maxNumCrossterms.load();
1478  while (numCrosstermsTest < counts.crossterm &&
1479  !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
1480 
1481 
1482 }
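// Illustrative sketch (not part of NAMD): the lock-free "atomic max" idiom used in
// updateMaxTupleCounts() above, isolated into a hypothetical standalone helper.
#if 0
#include <atomic>

// Raise 'target' to at least 'value'. compare_exchange_strong reloads 'observed'
// on failure, so the loop exits as soon as the stored value is already >= 'value'
// or our value has been published.
static void atomicMaxInt(std::atomic<int>& target, int value) {
  int observed = target.load();
  while (observed < value &&
         !target.compare_exchange_strong(observed, value)) { }
}
#endif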
1483 
1484 TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
1485  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1486  PatchData *patchData = cpdata.ckLocalBranch();
1487 
1488  TupleCounts counts;
1489 
1490  counts.bond = patchData->maxNumBonds.load();
1491  counts.angle = patchData->maxNumAngles.load();
1492  counts.dihedral = patchData->maxNumDihedrals.load();
1493  counts.improper = patchData->maxNumImpropers.load();
1494  counts.modifiedExclusion = patchData->maxNumModifiedExclusions.load();
1495  counts.exclusion = patchData->maxNumExclusions.load();
1496  counts.crossterm = patchData->maxNumCrossterms.load();
1497 
1498 #if 0
1499  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",
1500  CkMyPe(), counts.bond, counts.angle, counts.dihedral, counts.improper, counts.modifiedExclusion,
1501  counts.exclusion, counts.crossterm);
1502 #endif
1503 
1504  return counts;
1505 }
1506 
1507 
1548 void ComputeBondedCUDA::migrateTuples(bool startup) {
1549 
1550  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1551  PatchData *patchData = cpdata.ckLocalBranch();
1552  const int devInd = deviceCUDA->getDeviceIndex();
1553 
1554  bool MGPU = deviceCUDA->getNumDevice() > 1;
1555  bool amMaster = (masterPe == CkMyPe());
1556  CmiNodeBarrier();
1557 
1558 
1559  if (amMaster) {
1560  cudaStreamSynchronize(stream);
1561  copyPatchData();
1562  cudaStreamSynchronize(stream);
1563  }
1564  CmiNodeBarrier();
1565 
1566  if (amMaster) {
1567  if (!startup) {
1568  PatchMap* patchMap = PatchMap::Object();
1569  const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];
1570  // Update the device of the tuples
1571  // DMC this also updates the patches of the atoms, so it needs to be done in the single GPU case too
1572 
1573  TupleCounts counts = bondedKernel.getTupleCounts();
1574  migrationKernel.computeTupleDestination(
1575  devInd,
1576  counts,
1577  patchData->devData[devInd].numPatchesHome,
1578  migrationDestination,
1579  patchData->devData[devInd].d_patchToDeviceMap,
1580  patchData->devData[devInd].d_globalToLocalID,
1581  patchMap->getADim(), patchMap->getBDim(),
1582  patchMap->getCMaxIndex(), patchMap->getBMaxIndex(), patchMap->getAMaxIndex(),
1583  stream
1584  );
1585  cudaCheck(cudaStreamSynchronize(stream));
1586  }
1587  }
1588  CmiNodeBarrier();
1589 
1590  if (amMaster) {
1591  if (!startup) {
1592  migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
1593  cudaCheck(cudaStreamSynchronize(stream));
1594  }
1595  }
1596  CmiNodeBarrier();
1597 
1598  TupleCounts local, newMax;
1599  bool realloc = false;
1600 
1601  if (amMaster && !startup) {
1602  migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);
1603 
1604  local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
1605  updateMaxTupleCounts(local);
1606 #if 0
1607  CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",
1608  CkMyPe(), local.bond, local.angle, local.dihedral, local.improper, local.modifiedExclusion,
1609  local.exclusion, local.crossterm);
1610 #endif
1611  }
1612  CmiNodeBarrier();
1613  if (amMaster && !startup) {
1614  // Only multi-GPU needs to check for reallocation
1615  if (MGPU) {
1616  newMax = getMaxTupleCounts();
1617  realloc = migrationKernel.reallocateBufferDst(newMax);
1618  patchData->tupleReallocationFlagPerDevice[devInd] = realloc;
1619  }
1620  }
1621  CmiNodeBarrier();
1622 
1623  //
1624  // Non-master PEs need to see if reallocation is happening
1625  // They can all use the 0th GPU flag since the GPUs reallocate
1626  // based on the max
1627  if (!amMaster) {
1628  realloc = patchData->tupleReallocationFlagPerDevice[0];
1629  }
1630 
 1631  // If we reallocated, then we need to re-register the P2P pointers.
 1632  // Since the GPUs use the max tuple counts for their buffer sizes,
 1633  // if one GPU reallocated, then they all did.
1634 
1635  if (realloc) {
1636  if (amMaster) {
1637  registerPointersToHost();
1638  }
1639  CmiNodeBarrier();
1640 
1641  if (amMaster) {
1642  copyHostRegisterToDevice();
1643  cudaCheck(cudaStreamSynchronize(stream));
1644  }
1645  CmiNodeBarrier();
1646  }
1647 
1648  if (amMaster && !startup) {
1649  // Moves the tuples to the dst buffer on all GPUS
1650  migrationKernel.performTupleMigration(
1651  bondedKernel.getTupleCounts(),
1652  stream
1653  );
1654  cudaCheck(cudaStreamSynchronize(stream));
1655  }
 1656  CmiNodeBarrier(); // DMC believes we need a barrier here because of the reallocation
1657 
1658  if (amMaster && !startup) {
 1659  // If we previously reallocated, we need to reallocate the other buffers too.
 1660  // The regular tuples must be "reallocated" every time so the offsets into
 1661  // the unified buffer can be updated. The unified buffer made sense when we
 1662  // were copying the tuples from the host each migration, but I don't think
 1663  // it still makes sense to keep, since it complicates reallocation.
1664  if (MGPU) {
1665  bondedKernel.reallocateTupleBuffer(newMax, stream);
1666  migrationKernel.reallocateBufferSrc(newMax);
1667  cudaCheck(cudaStreamSynchronize(stream));
1668  }
1669  }
1670  CmiNodeBarrier();
1671 
 1672  // Update the local tuple count and copy the migrated tuples back to the
 1673  // src buffer
1674  if (amMaster) {
1675  if (!startup) {
1676  bondedKernel.setTupleCounts(local);
1677  }
1678 
1679  const int* ids = patchData->h_soa_id[devInd];
1680  migrationKernel.updateTuples(
1681  bondedKernel.getTupleCounts(),
1682  bondedKernel.getData(),
1683  ids,
1684  d_patchRecord,
1685  d_patchMapCenter,
1686  bondedKernel.getAtomBuffer(),
1687  lattice,
1688  stream
1689  );
1690  cudaCheck(cudaDeviceSynchronize());
1691  }
1692 }
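// Illustrative sketch (hypothetical helper; only CmiNodeBarrier is real Charm++ API):
// the per-phase pattern used throughout migrateTuples() above. The master PE of each
// device issues the GPU work for a phase, and every PE on the node then waits at a
// barrier so the next phase can safely consume that work or any flags the master set.
#if 0
template <typename Fn>
static void masterPhase(bool amMaster, Fn&& gpuWork) {
  if (amMaster) gpuWork();   // e.g. launch kernels and synchronize the stream
  CmiNodeBarrier();          // all PEs on this node wait before the next phase
}
#endif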
1693 
1700 template<typename T>
1701 void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {
1702  PatchMap* patchMap = PatchMap::Object();
1703 
1704  std::vector<std::pair<int,int>> downstreamPatches;
1705  for (int i = 0; i < tuples.size(); i++) {
1706  int downstream = tuples[i].patchIDs[0];
1707  for (int j = 1; j < T::size; j++) {
1708  downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);
1709  }
1710  downstreamPatches.push_back(std::make_pair(i, downstream));
1711  tupleCounts[patchIndex[downstream]]++;
1712  }
1713 
1714  //
1715  // Compute the offset into the tuples based on patch
1716  // TODO replace with STD prefix sum
1717  tupleOffsets[0] = 0;
1718  for (int i = 0; i < tupleCounts.size(); i++) {
1719  tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];
1720  }
1721 
1722  //
 1723  // Sort tuples by downstream patch
1724  //
1725  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
1726  [](std::pair<int, int> a, std::pair<int, int> b) {
1727  return a.second < b.second;
1728  });
1729 
1730  //
 1731  // Propagate order to tuples
1732  //
1733  std::vector<T> copy = tuples;
1734  for (int i = 0; i < tuples.size(); i++) {
1735  tuples[i] = copy[downstreamPatches[i].first];
1736  }
1737 }
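// Illustrative sketch: the hand-rolled offset loop in sortTupleList() could be replaced
// with the standard exclusive prefix sum (C++17), as the TODO above suggests. Assumes
// tupleOffsets already has tupleCounts.size()+1 elements, as in the caller.
#if 0
#include <numeric>
#include <vector>

static void computeTupleOffsets(const std::vector<int>& tupleCounts,
                                std::vector<int>& tupleOffsets) {
  const size_t n = tupleCounts.size();
  // offsets[i] = sum of counts[0..i-1], offsets[0] = 0
  std::exclusive_scan(tupleCounts.begin(), tupleCounts.end(), tupleOffsets.begin(), 0);
  // the final entry holds the total number of tuples
  tupleOffsets[n] = (n == 0) ? 0 : tupleOffsets[n-1] + tupleCounts[n-1];
}
#endif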
1738 
1739 void ComputeBondedCUDA::sortAndCopyToDevice() {
1740  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1741  PatchData *patchData = cpdata.ckLocalBranch();
1742  const int devInd = deviceCUDA->getDeviceIndex();
1743  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;
1744 
1745  TupleDataStage h_dataStage;
1746  TupleIntArrays h_counts;
1747  TupleIntArrays h_offsets;
1748 
1749  //
1750  // These cannot be declared within the macro because then they
1751  // would be out of scope
1752  //
1753  std::vector<int> bondPatchCounts(numPatchesHome, 0);
1754  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);
1755 
1756  std::vector<int> anglePatchCounts(numPatchesHome, 0);
1757  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);
1758 
1759  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
1760  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);
1761 
1762  std::vector<int> improperPatchCounts(numPatchesHome, 0);
1763  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);
1764 
1765  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
1766  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);
1767 
1768  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
1769  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);
1770 
1771  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
1772  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);
1773 
1774  #define CALL(fieldName) do { \
1775  sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
1776  h_dataStage.fieldName = fieldName##TupleData.data(); \
1777  h_counts.fieldName = fieldName##PatchCounts.data(); \
1778  h_offsets.fieldName = fieldName##PatchOffsets.data(); \
1779  } while(0);
1780 
1781  CALL(bond);
1782  CALL(angle);
1783  CALL(dihedral);
1784  CALL(improper);
1785  CALL(modifiedExclusion);
1786  CALL(exclusion);
1787  CALL(crossterm);
1788 
1789  #undef CALL
1790 
1791  migrationKernel.copyTupleToDevice(
1792  bondedKernel.getTupleCounts(),
1793  numPatchesHome,
1794  h_dataStage,
1795  h_counts,
1796  h_offsets,
1797  stream
1798  );
1799 
1800  cudaCheck(cudaStreamSynchronize(stream));
1801 }
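// For reference, each CALL(fieldName) in sortAndCopyToDevice() above expands to
// (roughly, using bond as the example):
//   sortTupleList(bondTupleData, bondPatchCounts, bondPatchOffsets);
//   h_dataStage.bond = bondTupleData.data();
//   h_counts.bond    = bondPatchCounts.data();
//   h_offsets.bond   = bondPatchOffsets.data();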
1802 
1810 void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {
1811  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1812 
1813  switch (tupletype) {
1814  case Tuples::EXCLUSION:
1815  {
1816  // Separate exclusions into modified, and non-modified
1817  modifiedExclusionTupleData.clear();
1818  exclusionTupleData.clear();
1819  int64_t pos = exclusionStartPos;
1820  int64_t pos2 = exclusionStartPos2;
1821  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
1822  int ntuples = (*it)->getNumTuples();
1823  copyExclusionDataStage(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSizeStage[Tuples::EXCLUSION],
1824  modifiedExclusionTupleData, exclusionTupleData, pos, pos2);
1825  }
 1826  // Reserve warp-aligned space for the copy.
1827  // TODO DMC is this necessary? Why are we copying the padded part too?
1828  modifiedExclusionTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(modifiedExclusionTupleData.size()));
1829  exclusionTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(exclusionTupleData.size()));
1830  }
1831  break;
1832 
1833  case Tuples::BOND:
1834  {
1835  bondTupleData.clear();
1836  for (auto it = tupleList[Tuples::BOND].begin();it != tupleList[Tuples::BOND].end();it++) {
1837  int ntuples = (*it)->getNumTuples();
1838  BondElem* elemList = (BondElem *)(*it)->getTupleList();
1839  copyToStage<BondElem, BondValue, CudaBondStage>(
1840  ntuples, elemList, Node::Object()->parameters->bond_array, bondTupleData
1841  );
1842  }
1843 
1844  bondTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(bondTupleData.size()));
1845  }
1846  break;
1847 
1848  case Tuples::ANGLE:
1849  {
1850  angleTupleData.clear();
1851  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
1852  int ntuples = (*it)->getNumTuples();
1853  AngleElem* elemList = (AngleElem *)(*it)->getTupleList();
1854  copyToStage<AngleElem, AngleValue, CudaAngleStage>(ntuples, elemList, Node::Object()->parameters->angle_array, angleTupleData);
1855  }
1856 
1857  angleTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(angleTupleData.size()));
1858  }
1859  break;
1860 
1861  case Tuples::DIHEDRAL:
1862  {
1863  dihedralTupleData.clear();
1864  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
1865  int ntuples = (*it)->getNumTuples();
1866  DihedralElem* elemList = (DihedralElem *)(*it)->getTupleList();
1867  copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList,
1868  Node::Object()->parameters->dihedral_array, dihedralTupleData);
1869  }
1870 
1871  dihedralTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(dihedralTupleData.size()));
1872  }
1873  break;
1874 
1875  case Tuples::IMPROPER:
1876  {
1877  improperTupleData.clear();
1878  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
1879  int ntuples = (*it)->getNumTuples();
1880  ImproperElem* elemList = (ImproperElem *)(*it)->getTupleList();
1881  copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList,
1882  Node::Object()->parameters->improper_array, improperTupleData);
1883  }
1884 
1885  improperTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(improperTupleData.size()));
1886  }
1887  break;
1888 
1889  case Tuples::CROSSTERM:
1890  {
1891  crosstermTupleData.clear();
1892  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
1893  int ntuples = (*it)->getNumTuples();
1894  CrosstermElem* elemList = (CrosstermElem *)(*it)->getTupleList();
1895  copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList,
1896  Node::Object()->parameters->crossterm_array, crosstermTupleData);
1897  }
1898 
1899  crosstermTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(crosstermTupleData.size()));
1900  }
1901  break;
1902 
1903  default:
1904  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
1905  break;
1906  }
1907 
1908  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1909 }
1910 #endif // NODEGROUP_FORCE_REGISTER
1911 
1912 //
 1913 // Copies tuple data from individual buffers to a single contiguous buffer
1914 // NOTE: This is done on the master PE
1915 //
1916 
1917 void ComputeBondedCUDA::copyTupleData() {
1918 
1919  PatchMap* patchMap = PatchMap::Object();
1920 
1921  // Count the number of exclusions
1922  int numModifiedExclusions = 0;
1923  int numExclusions = 0;
1924  for (int i=0;i < numExclPerRank.size();i++) {
1925  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
1926  numExclusions += numExclPerRank[i].numExclusions;
1927  }
1928  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
1929  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
1930 
1931  // Count the number of tuples for each type
1932  int64_t posWA = 0;
1933  exclusionStartPos = 0;
1934  exclusionStartPos2 = 0;
1935  tupleCopyWorkList.clear();
1936  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
1937  // Take temporary position
1938  int64_t pos = posWA;
1939  if (tupletype == Tuples::EXCLUSION) {
1940  exclusionStartPos = pos;
1941  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
1942  }
 1943  // Count the total number of tuples for this tuple type
1944  int num = 0;
1945  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
1946  int ntuples = (*it)->getNumTuples();
1947  num += ntuples;
1948  if (tupletype != Tuples::EXCLUSION) {
1949  TupleCopyWork tupleCopyWork;
1950  tupleCopyWork.tupletype = tupletype;
1951  tupleCopyWork.ntuples = ntuples;
1952  tupleCopyWork.tupleElemList = (*it)->getTupleList();
1953  tupleCopyWork.tupleDataPos = pos;
1954  // XXX NOTE: redundant copy happening here
1955  tupleCopyWorkList.push_back(tupleCopyWork);
1956  pos += ntuples*CudaTupleTypeSize[tupletype];
1957  }
1958  }
1959  numTuplesPerType[tupletype] = num;
1960  //
1961  if (tupletype == Tuples::EXCLUSION) {
1962  // Warp-align exclusions separately
1963  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
1964  } else {
1965  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
1966  }
1967  }
1968  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
1969  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
1970  }
1971 
1972  // Set flags for finishPatchesOnPe
1973  hasExclusions = (numExclusions > 0);
1974  hasModifiedExclusions = (numModifiedExclusions > 0);
1975 
1976  // Re-allocate storage as needed
1977  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
1978  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
1979 
1980 #if CMK_SMP && USE_CKLOOP
1981  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1982  if (useCkLoop >= 1) {
1983 #define CKLOOP_EXCLUSIONS 1
1984 #if CKLOOP_EXCLUSIONS
1985  CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);
1986  // pass 0 instead of -1 so tupleCopyWorker doesn't do exclusions
1987  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);
1988 #else
 1989  // JM NOTE: What this does is divide the work into chunks of CkMyNodeSize to be scheduled
 1990  // among PEs. To circumvent this, I have to call it by hand on all PEs.
1991  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
1992 #endif
1993  } else
1994 
1995 #endif
1996  {
1997 
1998  tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
1999  }
2000 
2001  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
2002  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
2003  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
2004  tupleData, stream);
2005 
2006  // Re-allocate forces
2007  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2008  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2009 }
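// Illustrative sketch (hypothetical helper, not the NAMD kernel API): the layout
// arithmetic used in copyTupleData() above. Each tuple type's count is presumably
// rounded up to a multiple of the warp size by ComputeBondedCUDAKernel::warpAlign,
// so every type starts at a warp-aligned slot offset in the unified tupleData buffer.
#if 0
#include <cstddef>

static size_t warpAlignExample(size_t n, size_t warpSize = 32) {
  return ((n + warpSize - 1) / warpSize) * warpSize;   // e.g. 70 -> 96
}
// The byte offset for the next type then advances by
//   warpAlignExample(numTuples) * CudaTupleTypeSize[tupletype]
// with exclusions split into a modified and a regular block, each aligned separately.
#endif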
2010 
2011 // JM: Single node version to bypass CkLoop
2012 #ifdef NODEGROUP_FORCE_REGISTER
2013 
2014 void ComputeBondedCUDA::copyTupleDataSN() {
2015 
2016  PatchMap* patchMap = PatchMap::Object();
2017  size_t numExclusions, numModifiedExclusions, copyIndex;
2018 
2019 
2020  // WORK TO BE DONE BY THE MASTERPE
2021  if(masterPe == CkMyPe()){
2022  numModifiedExclusions = 0;
2023  numExclusions = 0;
2024  for (int i=0;i < numExclPerRank.size();i++) {
2025  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2026  numExclusions += numExclPerRank[i].numExclusions;
2027  }
2028  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2029  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2030 
2031  // Count the number of tuples for each type
2032  int64_t posWA = 0;
2033  exclusionStartPos = 0;
2034  exclusionStartPos2 = 0;
2035  tupleCopyWorkList.clear();
2036  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2037  // Take temporary position
2038  int64_t pos = posWA;
2039  if (tupletype == Tuples::EXCLUSION) {
2040  exclusionStartPos = pos;
2041  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
2042  }
 2043  // Count the total number of tuples for this tuple type
2044  int num = 0;
2045  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2046  int ntuples = (*it)->getNumTuples();
2047  num += ntuples;
2048  if (tupletype != Tuples::EXCLUSION) {
2049  TupleCopyWork tupleCopyWork;
2050  tupleCopyWork.tupletype = tupletype;
2051  tupleCopyWork.ntuples = ntuples;
2052  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2053  tupleCopyWork.tupleDataPos = pos;
2054  tupleCopyWorkList.push_back(tupleCopyWork);
2055  pos += ntuples*CudaTupleTypeSize[tupletype];
2056  }
2057  }
2058  numTuplesPerType[tupletype] = num;
2059  //
2060  if (tupletype == Tuples::EXCLUSION) {
2061  // Warp-align exclusions separately
2062  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
2063  } else {
2064  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2065  }
2066  }
2067  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2068  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2069  }
2070 
2071  // Set flags for finishPatchesOnPe
2072  hasExclusions = (numExclusions > 0);
2073  hasModifiedExclusions = (numModifiedExclusions > 0);
2074 
2075  // Re-allocate storage as needed
2076  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2077  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2078  }
2079 
2080  CmiNodeBarrier();
2081 
2082 #if 0
2083  //int chunkSize = (tupleCopyWorkList.size()) /CkMyNodeSize() +
2084  // (CkMyPe() == CkMyNodeSize()-1) * tupleCopyWorkList.size() % CkMyNodeSize();
2085  //int first = ((CkMyNodeSize() -1) - CkMyPe())*chunkSize -1*(CkMyPe() == CkMyNodeSize()-1);
2086  //int last = (first + chunkSize) -1*(!CkMyPe());; // chunksize is padded for the last pe
2087  this->tupleCopyWorker(first, last);
2088 #else
2089  // JM: This probably leads to better load balance than the option above.
2090  // XXX TODO: Improve this further afterwards. I don't think this scales too well
2091  // due to lots of atomics, but you're not supposed to run this with
 2092  //           lots of PEs anyway
2093  while( (copyIndex = tupleWorkIndex.fetch_add(1) ) <= tupleCopyWorkList.size()){
2094  this->tupleCopyWorker(copyIndex -1, copyIndex -1);
2095  }
2096 #endif
2097  CmiNodeBarrier();
2098 
2099  if(masterPe == CkMyPe()){
2100  tupleWorkIndex.store(0);
2101  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
2102  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
2103  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
2104  tupleData, stream);
2105 
2106  // Re-allocate forces
2107  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2108  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2109  }
2110 }
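// Illustrative sketch: the atomic work-distribution loop in copyTupleDataSN() above,
// isolated into a generic helper (hypothetical names). Each PE keeps claiming the next
// unprocessed item via fetch_add until the shared counter runs past the end of the list.
// In the real loop the claimed index is shifted down by one so that worker index -1
// (the exclusion pass of tupleCopyWorker) is also handed out.
#if 0
#include <atomic>
#include <cstddef>

static void drainWorkList(std::atomic<size_t>& nextItem, size_t numItems,
                          void (*process)(size_t)) {
  size_t i;
  while ((i = nextItem.fetch_add(1)) < numItems) {
    process(i);
  }
}
#endif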
2111 
2112 
2120 void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {
2121 
2122  PatchMap* patchMap = PatchMap::Object();
2123  size_t numExclusions, numModifiedExclusions, copyIndex;
2124 
2125  // WORK TO BE DONE BY THE MASTERPE
2126  if (startup) {
 2127  // The atom map is no longer needed, so unmap the atoms. We need to do this in case
2128  // multiple run commands are issued. It is safe to do this because there is a node
2129  // barrier between loadTuplesOnPe and this function
2130  unmapAtoms();
2131 
2132  if(masterPe == CkMyPe()){
2133  numModifiedExclusions = 0;
2134  numExclusions = 0;
2135  for (int i=0;i < numExclPerRank.size();i++) {
2136  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2137  numExclusions += numExclPerRank[i].numExclusions;
2138  }
2139  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2140  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2141 
2142  // Count the number of tuples for each type
2143  size_t posWA = 0;
2144  exclusionStartPos = 0;
2145  exclusionStartPos2 = 0;
2146  tupleCopyWorkList.clear();
2147  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2148  // Take temporary position
2149  size_t pos = posWA;
2150  if (tupletype == Tuples::EXCLUSION) {
2151  exclusionStartPos = pos;
2152  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];
2153  }
 2154  // Count the total number of tuples for this tuple type
2155  int num = 0;
2156  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2157  int ntuples = (*it)->getNumTuples();
2158  num += ntuples;
2159  if (tupletype != Tuples::EXCLUSION) {
2160  TupleCopyWork tupleCopyWork;
2161  tupleCopyWork.tupletype = tupletype;
2162  tupleCopyWork.ntuples = ntuples;
2163  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2164  tupleCopyWork.tupleDataPos = pos;
2165  tupleCopyWorkList.push_back(tupleCopyWork);
2166  pos += ntuples*CudaTupleTypeSizeStage[tupletype];
2167  }
2168  }
2169  numTuplesPerType[tupletype] = num;
2170  //
2171  if (tupletype == Tuples::EXCLUSION) {
2172  // Warp-align exclusions separately
2173  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];
2174  } else {
2175  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2176  }
2177  }
2178  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2179  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2180  }
2181 
2182  // Set flags for finishPatchesOnPe
2183  hasExclusions = (numExclusions > 0);
2184  hasModifiedExclusions = (numModifiedExclusions > 0);
2185 
2186  // Re-allocate storage as needed
2187  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2188  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2189 
2190 
2191  TupleCounts local;
2192  local.bond = numTuplesPerType[Tuples::BOND];
2193  local.angle = numTuplesPerType[Tuples::ANGLE];
2194  local.dihedral = numTuplesPerType[Tuples::DIHEDRAL];
2195  local.improper = numTuplesPerType[Tuples::IMPROPER];
2196  local.modifiedExclusion = numModifiedExclusions;
2197  local.exclusion = numExclusions;
2198  local.crossterm = numTuplesPerType[Tuples::CROSSTERM];
2199 
2200  updateMaxTupleCounts(local);
2201  bondedKernel.setTupleCounts(local);
2202  }
2203 
2204  // Wait for tuple counts to be updated...
2205  CmiNodeBarrier();
2206 
2207  if(masterPe == CkMyPe()){
2208  TupleCounts newMax = getMaxTupleCounts();
2209  migrationKernel.reallocateBufferSrc(newMax);
2210  migrationKernel.reallocateBufferDst(newMax);
2211  bondedKernel.reallocateTupleBuffer(newMax, stream);
2212  registerPointersToHost();
2213  }
2214 
 2215  // Node barrier is needed because of the pointer registration above
2216  CmiNodeBarrier();
2217 
2218  if(masterPe == CkMyPe()){
2219  copyHostRegisterToDevice();
2220  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2221  this->tupleCopyWorkerType(tupletype);
2222  }
2223  sortAndCopyToDevice();
2224  }
2225  }
2226 
2227  migrateTuples(startup);
2228 
2229  if(masterPe == CkMyPe()){
2230  TupleCounts count = bondedKernel.getTupleCounts();
2231  numTuplesPerType[Tuples::BOND] = count.bond;
2232  numTuplesPerType[Tuples::ANGLE] = count.angle;
2233  numTuplesPerType[Tuples::DIHEDRAL] = count.dihedral;
2234  numTuplesPerType[Tuples::IMPROPER] = count.improper;
2235  numModifiedExclusions = count.modifiedExclusion;
2236  numExclusions = count.exclusion;
2237  numTuplesPerType[Tuples::CROSSTERM] = count.crossterm;
2238 
2239  // Set flags for finishPatchesOnPe
2240  hasExclusions = (numExclusions > 0);
2241  hasModifiedExclusions = (numModifiedExclusions > 0);
2242 
2243  // Re-allocate forces
2244  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2245  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2246  }
2247 }
2248 
2249 #endif
2250 
2251 
2260 #ifdef NODEGROUP_FORCE_REGISTER
2261 void ComputeBondedCUDA::updatePatchRecords() {
2262  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2263  PatchData *patchData = cpdata.ckLocalBranch();
2264  int devInd = deviceCUDA->getDeviceIndex();
2265  PatchRecord **d_pr = &(patchData->devData[devInd].bond_pr);
2266  int **d_pid = &(patchData->devData[devInd].bond_pi);
2267  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
2268  size_t st_d_pr_size = (size_t)(patchData->devData[devInd].bond_pr_size);
2269  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
2270  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size()); // st_d_pr_size changed to patches.size() here
2271  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size); // copy updated value back into devData
2272  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
2273  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size); // copy updated value back into devData
2274  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
2275  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
2276  if (params->CUDASOAintegrate && params->useDeviceMigration) {
2277  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2278  }
2279 }
2280 #endif
2281 
2282 //
2283 // Launch work on GPU
2284 //
2285 void ComputeBondedCUDA::launchWork() {
2287  if (CkMyPe() != masterPe)
2288  NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
2289 
2290  cudaCheck(cudaSetDevice(deviceID));
2291  if (atomsChanged) {
2292  if(!params->CUDASOAintegrate) copyTupleData();
 2293  // copyTupleData needs to be called before launchWork in that case
2294 
2295 // Move this once GPU migration has been merged
2296 #if 1
2297 #ifdef NODEGROUP_FORCE_REGISTER
2298  if(params->CUDASOAintegrate && !params->useDeviceMigration){
2299  updatePatchRecords();
2300  }
2301 #endif
2302 #endif
2303  }
2304 
2305  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
2306  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
2307  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
2308 
2309  int r2_delta_expc = 64 * (ComputeNonbondedUtil::r2_delta_exp - 127);
2310 
2311 #if 0
2312  if(!atomsChanged){
2313  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2314  PatchData *patchData = cpdata.ckLocalBranch();
2315 
2316  float4 *d_atoms = bondedKernel.getAtomBuffer();
2317  copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);
2318  if(!params->CUDASOAintegrate) CmiLock(printLock);
2319  else CmiLock(patchData->printlock);
2320 
2321  fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);
2322 
2323  for(int i = 0 ; i < atomStorageSize; i++){
2324  fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
2325  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);
2326  }
2327 
2328  if(!params->CUDASOAintegrate) CmiUnlock(printLock);
2329  else CmiUnlock(patchData->printlock);
2330  }
2331 #endif
2332 
2333  const CudaNBConstants nbConstants = CudaComputeNonbonded::getNonbondedCoef(params);
2334  const bool doTable = CudaComputeNonbonded::getDoTable(params, doSlow, doVirial);
2335 
2336  bondedKernel.bondedForce(
2338  atomStorageSize,
2339  doEnergy, doVirial, doSlow, doTable,
2340  lata, latb, latc,
2342  (float)ComputeNonbondedUtil::r2_delta, r2_delta_expc,
2343  nbConstants,
2344  (const float4*)atoms, forces,
2345  energies_virials, atomsChanged, params->CUDASOAintegrate,
2346  params->useDeviceMigration, stream);
2347 
2348 
2349 #ifdef NODEGROUP_FORCE_REGISTER
2350  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2351  PatchData *patchData = cpdata.ckLocalBranch();
2352  int devInd = deviceCUDA->getDeviceIndex();
2353  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
2354  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2355  }
2356  patchData->devData[devInd].f_bond = bondedKernel.getForces();
2357  patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
2358  patchData->devData[devInd].f_bond_slow = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
2359  patchData->devData[devInd].f_bond_size = bondedKernel.getForceSize(atomStorageSize);
2360 #endif
2361 
2362 #if 0
2363  if(!params->CUDASOAintegrate) CmiLock(printLock);
2364  else CmiLock(patchData->printlock);
2365  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2366  // synchronously copy array back to the host
2367  copy_DtoH_sync(bondedKernel.getForces(), forces, forceStorageSize);
2368  fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
2369  for(int i = 0; i < forceStorageSize; i++){
2370  // prints entire forces datastructure
2371  fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);
2372  }
2373  if(!params->CUDASOAintegrate) CmiUnlock(printLock);
2374  else CmiUnlock(patchData->printlock);
2375 #endif
2376  forceDoneSetCallback();
2377 }
2378 
2379 void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
2380  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
2381 
2382  if (CkMyPe() != c->masterPe)
2383  NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
2384 
2385  cudaCheck(cudaSetDevice(c->deviceID));
2386 
2387  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
2388  if (err == cudaSuccess) {
2389  // Event has occurred
2390  c->checkCount = 0;
2391  traceUserBracketEvent(CUDA_BONDED_KERNEL_EVENT, c->beforeForceCompute, walltime);
2392  c->finishPatches();
2393  return;
2394  } else if (err != cudaErrorNotReady) {
2395  // Anything else is an error
2396  char errmsg[256];
2397  sprintf(errmsg,"in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
2398  c->checkCount, walltime - c->beforeForceCompute);
2399  cudaDie(errmsg,err);
2400  }
2401 
2402  // Event has not occurred
2403  c->checkCount++;
2404  if (c->checkCount >= 1000000) {
2405  char errmsg[256];
2406  sprintf(errmsg,"ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
2407  c->checkCount, walltime - c->beforeForceCompute);
2408  cudaDie(errmsg,err);
2409  }
2410 
2411  // Call again
2412  CcdCallBacksReset(0, walltime);
2413  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
2414 }
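// Illustrative sketch (hypothetical names): the non-blocking completion check used by
// forceDoneCheck() above. Poll the CUDA event once; if the GPU is not done yet, re-arm
// a timer callback instead of blocking the PE, and treat anything else as an error.
#if 0
static void pollEvent(cudaEvent_t ev, void (*onDone)(), void (*rearm)()) {
  cudaError_t err = cudaEventQuery(ev);
  if (err == cudaSuccess)            onDone();   // GPU work has finished
  else if (err == cudaErrorNotReady) rearm();    // check again later
  else                               cudaDie("cudaEventQuery failed", err);
}
#endif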
2415 
2416 //
2417 // Set call back for all the work in the stream at this point
2418 //
2419 void ComputeBondedCUDA::forceDoneSetCallback() {
2420  if (CkMyPe() != masterPe)
2421  NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
2422  cudaCheck(cudaSetDevice(deviceID));
2423  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
2424  checkCount = 0;
2425  CcdCallBacksReset(0, CmiWallTimer());
2426  // Start timer for CUDA kernel
2427  beforeForceCompute = CkWallTimer();
2428  // Set the call back at 0.1ms
2429  if(!params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, this, 0.1);
2430 }
2431 
2432 template <bool sumNbond, bool sumSlow>
2433 void finishForceLoop(const int numAtoms, const int forceStride,
2434  const double* __restrict__ af,
2435  const double* __restrict__ af_nbond,
2436  const double* __restrict__ af_slow,
2437  Force* __restrict__ f,
2438  Force* __restrict__ f_nbond,
2439  Force* __restrict__ f_slow) {
2440 
2441 #if 0
2442  for (int j=0;j < numAtoms;j++) {
2443  {
2444  //double afx, afy, afz;
2445  //convertForceToDouble(af + j, forceStride, afx, afy, afz);
2446  //f[j].x += af[j];
2447  //f[j].y += af[j + forceStride];
2448  //f[j].z += af[j + forceStride*2];
2449  f[j].x += af[j];
2450  f[j].y += af[j + forceStride];
2451  f[j].z += af[j + forceStride*2];
2452  }
2453  if (sumNbond)
2454  {
2455  //double afx, afy, afz;
2456  //convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
2457  f_nbond[j].x += af_nbond[j];
2458  f_nbond[j].y += af_nbond[j + forceStride];
2459  f_nbond[j].z += af_nbond[j + forceStride*2];
2460 
2461  }
2462  if (sumSlow)
2463  {
2464  //double afx, afy, afz;
2465  //convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
2466  f_slow[j].x += af_slow[j];
2467  f_slow[j].y += af_slow[j + forceStride];
2468  f_slow[j].z += af_slow[j + forceStride*2];
2469 
2470  }
2471  }
2472 #endif
2473 
2474 #if 0
2475  for(int j=0; j < numAtoms; j++){
2476  f[j].x += af[j];
2477  f[j].y += af[j+ forceStride];
2478  f[j].z += af[j+ 2*forceStride];
2479  }
2480  if(sumNbond){
2481  for(int j=0; j < numAtoms; j++){
2482  f_nbond[j].x += af_nbond[j];
2483  f_nbond[j].y += af_nbond[j + forceStride];
2484  f_nbond[j].z += af_nbond[j + 2*forceStride];
2485  }
2486  }
2487  if(sumSlow){
2488  for(int j=0; j < numAtoms; j++){
2489  f_slow[j].x += af_slow[j];
2490  f_slow[j].y += af_slow[j + forceStride];
2491  f_slow[j].z += af_slow[j + 2*forceStride];
2492  }
2493  }
2494 #endif
2495  // XXX Summing into AOS buffer from SOA data layout
2496  // Try to reduce cache misses by separating loops.
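  // The af/af_nbond/af_slow buffers are SOA: all x components first, then all y, then
  // all z, with component blocks forceStride entries apart, while Force f[] is AOS
  // (x, y, z per atom); hence one pass per component below.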
2497  for(int j=0; j < numAtoms; j++) f[j].x += af[j];
2498  for(int j=0; j < numAtoms; j++) f[j].y += af[j+ forceStride];
2499  for(int j=0; j < numAtoms; j++) f[j].z += af[j+ 2*forceStride];
2500 
2501  if(sumNbond){
2502 #ifdef DEBUG_MINIMIZE
2503  int k=0;
2504  printf("%s, line %d\n", __FILE__, __LINE__);
2505  printf(" before: f_nbond[%d] = %f %f %f\n",
2506  k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2507 #endif
2508  for(int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
2509  for(int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
2510  for(int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
2511 #ifdef DEBUG_MINIMIZE
2512  printf(" after: f_nbond[%d] = %f %f %f\n",
2513  k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2514 #endif
2515  }
2516  if(sumSlow){
2517  for(int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
2518  for(int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
2519  for(int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];
2520  }
2521 #if 0
2522  for(int j = 0; j < numAtoms; j++){
2523  fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
2524  af[j], af[j + forceStride], af[j + 2*forceStride],
2525  af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
2526  af_slow[j], af_slow[j + forceStride], af_slow[j + 2*forceStride]);
2527  }
2528 #endif
2529 }
2530 
2531 //
2532 // Finish all patches that are on this pe
2533 //
2534 void ComputeBondedCUDA::finishPatchesOnPe() {
2535  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
2536 
2537  PatchMap* patchMap = PatchMap::Object();
2538  int myRank = CkMyRank();
2539 
2540  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
2541  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
2542  const bool sumNbond = hasModifiedExclusions;
2543  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
2544 
 2545  // Loop over the patches assigned to this rank
2546  for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
2547  PatchID patchID = patchIDsPerRank[myRank][i];
2548  Patch* patch = patchMap->patch(patchID);
2549  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
2550  if (tpe == NULL) {
2551  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
2552  }
2553 
2554  int pi = patchIndex[patchID];
2555  int numAtoms = patches[pi].numAtoms;
2556  int atomStart = patches[pi].atomStart;
2557 
 2558  // XXX TODO: this is a large CPU bottleneck
2560 #ifdef NODEGROUP_FORCE_REGISTER
2561  double *af = forces + atomStart;
2562  double *af_nbond = forces + forceSize + atomStart;
2563  double *af_slow = forces + 2*forceSize + atomStart;
2564  // XXX TODO: We still need forces inside patches for migration steps it seems
2565  if(!params->CUDASOAintegrate || (atomsChanged && !params->useDeviceMigration) ){
2566  Force *f = tpe->f;
2567  Force *f_nbond = tpe->r->f[Results::nbond];
2568  Force *f_slow = tpe->r->f[Results::slow];
2569  if (!sumNbond && !sumSlow) {
2570  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2571  } else if (sumNbond && !sumSlow) {
2572  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2573  } else if (!sumNbond && sumSlow) {
2574  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2575  } else if (sumNbond && sumSlow) {
2576  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2577  } else {
2578  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2579  }
2580  tpe->forceBox->close(&tpe->r);
2581  tpe->positionBox->close(&tpe->x);
2582  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
2583  }
2584 #else
2585  Force *f = tpe->f;
2586  Force *f_nbond = tpe->r->f[Results::nbond];
2587  Force *f_slow = tpe->r->f[Results::slow];
2588 
2589  double *af = forces + atomStart;
2590  double *af_nbond = forces + forceSize + atomStart;
2591  double *af_slow = forces + 2*forceSize + atomStart;
2592 
2593  if (!sumNbond && !sumSlow) {
2594  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2595  } else if (sumNbond && !sumSlow) {
2596  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2597  } else if (!sumNbond && sumSlow) {
2598  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2599  } else if (sumNbond && sumSlow) {
2600  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2601  } else {
2602  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2603  }
2604 
2605  tpe->forceBox->close(&tpe->r);
2606  tpe->positionBox->close(&tpe->x);
2607  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
2608 
2609 #endif
2610 // #endif
2611 
2612  }
2613 
2614 
2615  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
2616 
2617  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2618 
2619  bool done = false;
2620  CmiLock(lock);
2621  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
2622  if(params->CUDASOAintegrate && !atomsChanged){
 2623  // masterPe is executing this, so we can go ahead and do the reductions
 2624  // directly. However, for migration steps we still need to follow the usual
 2625  // code path because of the box setup.
2626  patchesCounter = 0;
2627  }
2628  if (patchesCounter == 0) {
2629  patchesCounter = getNumPatches();
2630  done = true;
2631  }
2632  CmiUnlock(lock);
2633  if (done) {
2634  //computeMgr->sendFinishReductions(masterPe, this);
2635  if(params->CUDASOAintegrate ) {
2636  if(!atomsChanged) this->finishReductions();
2637  }
2638  else computeMgr->sendFinishReductions(masterPe, this);
2639  }
2640 
2641  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2642 
2643 }
2644 
2645 void ComputeBondedCUDA::finishPatches() {
2646 
2647  if (atomsChanged && (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
2648  unmapAtoms();
2649  }
 2650  // XXX do we have to synchronize the stream here?
2651  //computeMgr->sendFinishPatchesOnPe(pes, this);
2652  if(params->CUDASOAintegrate) this->finishPatchesOnPe();
2653  else{
2654  computeMgr->sendFinishPatchesOnPe(pes, this);
2655  }
2656 }
2657 
2658 //
2659 // Finish & submit reductions
2660 //
2661 void ComputeBondedCUDA::finishReductions() {
2662 
2663  if (CkMyPe() != masterPe)
2664  NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
2665 
2666  //fprintf(stderr, "finishReductions\n");
2667 
2668  // static int ncall = 0;
2669  // ncall++;
2670 
2671  int pos = 0;
2672  // do I need this streamSynchronize here?
2673  if(params->CUDASOAintegrate) {
2674  cudaCheck(cudaStreamSynchronize(stream));
2675  }
2676  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2677  if (numTuplesPerType[tupletype] > 0) {
2678 
2679  if (doEnergy) {
2680  switch (tupletype) {
2681  case Tuples::BOND:
2682 #ifdef NODEGROUP_FORCE_REGISTER
2683  if (params->CUDASOAintegrate) {
2684  nodeReduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
2685  } else
2686 #endif
2687  {
2688  reduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
2689  }
2690  if (hostAlchFlags.alchFepOn) {
2691 #ifdef NODEGROUP_FORCE_REGISTER
2692  if (params->CUDASOAintegrate) {
2693  nodeReduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND_F];
2694  } else
2695 #endif
2696  {
2697  reduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND_F];
2698  }
2699  }
2700  if (hostAlchFlags.alchThermIntOn) {
2701 #ifdef NODEGROUP_FORCE_REGISTER
2702  if (params->CUDASOAintegrate) {
2703  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND_TI_1];
2704  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND_TI_2];
2705  } else
2706 #endif
2707  {
2710  }
2711  }
2712  break;
2713 
2714  case Tuples::ANGLE:
2715 #ifdef NODEGROUP_FORCE_REGISTER
2716  if (params->CUDASOAintegrate) {
2717  nodeReduction->item(REDUCTION_ANGLE_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE];
2718  } else
2719 #endif
2720  {
2721  reduction->item(REDUCTION_ANGLE_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE];
2722  }
2723  if (hostAlchFlags.alchFepOn) {
2724 #ifdef NODEGROUP_FORCE_REGISTER
2725  if (params->CUDASOAintegrate) {
2726  nodeReduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE_F];
2727  } else
2728 #endif
2729  {
2730  reduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE_F];
2731  }
2732  }
2733  if (hostAlchFlags.alchThermIntOn) {
2734 #ifdef NODEGROUP_FORCE_REGISTER
2735  if (params->CUDASOAintegrate) {
2736  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1];
2737  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2];
2738  } else
2739 #endif
2740  {
2743  }
2744  }
2745  break;
2746 
2747  case Tuples::DIHEDRAL:
2748 #ifdef NODEGROUP_FORCE_REGISTER
2749  if (params->CUDASOAintegrate) {
2750  nodeReduction->item(REDUCTION_DIHEDRAL_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL];
2751  } else
2752 #endif
2753  {
2754  reduction->item(REDUCTION_DIHEDRAL_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL];
2755  }
2756  if (hostAlchFlags.alchFepOn) {
2757 #ifdef NODEGROUP_FORCE_REGISTER
2758  if (params->CUDASOAintegrate) {
2759  nodeReduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F];
2760  } else
2761 #endif
2762  {
2763  reduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F];
2764  }
2765  }
2766  if (hostAlchFlags.alchThermIntOn) {
2767 #ifdef NODEGROUP_FORCE_REGISTER
2768  if (params->CUDASOAintegrate) {
2769  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1];
2770  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2];
2771  } else
2772 #endif
2773  {
2776  }
2777  }
2778  break;
2779 
2780  case Tuples::IMPROPER:
2781 #ifdef NODEGROUP_FORCE_REGISTER
2782  if (params->CUDASOAintegrate) {
2783  nodeReduction->item(REDUCTION_IMPROPER_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER];
2784  } else
2785 #endif
2786  {
2787  reduction->item(REDUCTION_IMPROPER_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER];
2788  }
2789  if (hostAlchFlags.alchFepOn) {
2790 #ifdef NODEGROUP_FORCE_REGISTER
2791  if (params->CUDASOAintegrate) {
2792  nodeReduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER_F];
2793  } else
2794 #endif
2795  {
2796  reduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER_F];
2797  }
2798  }
2799  if (hostAlchFlags.alchThermIntOn) {
2800 #ifdef NODEGROUP_FORCE_REGISTER
2801  if (params->CUDASOAintegrate) {
2802  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1];
2803  nodeReduction->item(REDUCTION_BONDED_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2];
2804  } else
2805 #endif
2806  {
2809  }
2810  }
2811  break;
2812 
2813  case Tuples::EXCLUSION:
2814 #ifdef NODEGROUP_FORCE_REGISTER
2815  if (params->CUDASOAintegrate) {
2816  nodeReduction->item(REDUCTION_ELECT_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT];
2817  nodeReduction->item(REDUCTION_LJ_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
2818  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW];
2819  } else
2820 #endif
2821  {
2822  reduction->item(REDUCTION_ELECT_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT];
2823  reduction->item(REDUCTION_LJ_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
2825  }
2826  if (hostAlchFlags.alchFepOn) {
2827 #ifdef NODEGROUP_FORCE_REGISTER
2828  if (params->CUDASOAintegrate) {
2829  nodeReduction->item(REDUCTION_ELECT_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_F];
2830  nodeReduction->item(REDUCTION_LJ_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_F];
2831  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F];
2832  } else
2833 #endif
2834  {
2835  reduction->item(REDUCTION_ELECT_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_F];
2836  reduction->item(REDUCTION_LJ_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_F];
2838  }
2839  }
2840  if (hostAlchFlags.alchThermIntOn) {
2841 #ifdef NODEGROUP_FORCE_REGISTER
2842  if (params->CUDASOAintegrate) {
2843  nodeReduction->item(REDUCTION_ELECT_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1];
2844  nodeReduction->item(REDUCTION_ELECT_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2];
2845  nodeReduction->item(REDUCTION_LJ_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_TI_1];
2846  nodeReduction->item(REDUCTION_LJ_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_TI_2];
2849  } else
2850 #endif
2851  {
2854  reduction->item(REDUCTION_LJ_ENERGY_TI_1) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_TI_1];
2855  reduction->item(REDUCTION_LJ_ENERGY_TI_2) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_TI_2];
2858  }
2859  }
2860  break;
2861 
2862  case Tuples::CROSSTERM:
2863 #ifdef NODEGROUP_FORCE_REGISTER
2864  if (params->CUDASOAintegrate) {
2865  nodeReduction->item(REDUCTION_CROSSTERM_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_CROSSTERM];
2866  } else
2867 #endif
2868  {
2869  reduction->item(REDUCTION_CROSSTERM_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_CROSSTERM];
2870  }
2871  if (hostAlchFlags.alchFepOn) {
2872 #ifdef NODEGROUP_FORCE_REGISTER
2873  if (params->CUDASOAintegrate) {
2874  nodeReduction->item(REDUCTION_BONDED_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F];
2875  } else
2876 #endif
2877  {
2879  }
2880  }
2881  if (hostAlchFlags.alchThermIntOn) {
2882 #ifdef NODEGROUP_FORCE_REGISTER
2883  if (params->CUDASOAintegrate) {
2886  } else
2887 #endif
2888  {
2891  }
2892  }
2893  break;
2894  default:
2895  NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
2896  break;
2897  }
2898  }
2899 
2900  auto it = tupleList[tupletype].begin();
2901  if (!params->CUDASOAintegrate) {
2902  (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
2903  }
2904  }
2905  }
2906 
2907  if (doVirial) {
2908 #ifndef WRITE_FULL_VIRIALS
2909 #error "non-WRITE_FULL_VIRIALS not implemented"
2910 #endif
2911 
2912 #ifdef NODEGROUP_FORCE_REGISTER
2913  if (params->CUDASOAintegrate) {
2914  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_NORMAL,energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
2915  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
2916  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_SLOW, energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
2917  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
2918  // NOTE: AMD_DIHE virial is also added to NORMAL virial.
2919  // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
2920  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
2921  } else
2922 #endif
2923  {
2924  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
2925  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
2926  ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW, energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
2927  ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
2928  // NOTE: AMD_DIHE virial is also added to NORMAL virial.
2929  // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
2930  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
2931  }
2932  }
2933 
2934 #ifdef NODEGROUP_FORCE_REGISTER
2935  if (params->CUDASOAintegrate) {
2936  nodeReduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
2937  } else
2938 #endif
2939  {
2940  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
2941  }
2942 
2943  if(!params->CUDASOAintegrate) reduction->submit();
2944 }
2945 
2946 //
2947 // Can only be called by master PE
2948 //
2949 void ComputeBondedCUDA::initialize() {
2950 #ifdef NODEGROUP_FORCE_REGISTER
2951  tupleWorkIndex.store(0);
2952 #endif
2953 
2954  if (CkMyPe() != masterPe)
2955  NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
2956 
2957  // Build list of PEs
2958  for (int rank=0;rank < computes.size();rank++) {
2959  if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
2960  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
2961  }
2962  }
2963 
2964  // Return if no work to do
2965  if (pes.size() == 0) return;
2966 
2967  initializeCalled = false;
2968  cudaCheck(cudaSetDevice(deviceID));
2969 
2970 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
2971  int leastPriority, greatestPriority;
2972  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
2973  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
2974 #else
2975  cudaCheck(cudaStreamCreate(&stream));
2976 #endif
2977  cudaCheck(cudaEventCreate(&forceDoneEvent));
2978  lock = CmiCreateLock();
2979  printLock = CmiCreateLock();
2980 
2982 
2983  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2984  PatchData *patchData = cpdata.ckLocalBranch();
2985  nodeReduction = patchData->reduction;
2986 
2987  PatchMap* patchMap = PatchMap::Object();
2988  // First, assign all patches in self computes.
2989  // NOTE: These never overlap between PEs. No proxies added.
2990  for (int rank=0;rank < computes.size();rank++) {
2991  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
2992  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
2993  for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
2994  if (!tuplePatchList.find( TuplePatchElem(*jt) ) ) {
2995  tuplePatchList.add( TuplePatchElem(*jt) );
2996  patchIDsPerRank[rank].push_back(*jt);
2997  allPatchIDs.push_back(*jt);
2998  }
2999  }
3000  }
3001  }
3002 
3003  // Second, assign all patches in home computes.
3004  // NOTE: The ranks always have these patches. No proxies added.
3005  for (int rank=0;rank < computes.size();rank++) {
3006  HomeCompute& homeCompute = computes[rank].homeCompute;
3007  std::vector<int>& patchIDs = homeCompute.patchIDs;
3008  for (int i=0;i < patchIDs.size();i++) {
3009  int patchID = patchIDs[i];
3010  if (!tuplePatchList.find( TuplePatchElem(patchID) ) ) {
3011  tuplePatchList.add( TuplePatchElem(patchID) );
3012  patchIDsPerRank[rank].push_back(patchID);
3013  allPatchIDs.push_back(patchID);
3014  }
3015  }
3016  }
3017 
3018  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
3019  // Find neighbors that are not added yet
3020  std::vector<int> neighborPids;
3021  for (int rank=0;rank < computes.size();rank++) {
3023  HomeCompute& homeCompute = computes[rank].homeCompute;
3024  std::vector<int>& patchIDs = homeCompute.patchIDs;
3025  for (int i=0;i < patchIDs.size();i++) {
3026  int patchID = patchIDs[i];
3027  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
3028  for (int j=0;j < numNeighbors;j++) {
3029  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
3030  neighborPids.push_back(neighbors[j]);
3031  }
3032  }
3033  }
3034  }
3035  // Remove duplicates from neighborPids
3036  {
3037  std::sort(neighborPids.begin(), neighborPids.end());
3038  auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
3039  neighborPids.resize(std::distance(neighborPids.begin(), it_end));
3040  }
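  // A minimal sketch of the same deduplication using the erase-unique idiom (illustrative
  // only, assuming a std::vector<int> v; not part of the original source):
  //   std::sort(v.begin(), v.end());
  //   v.erase(std::unique(v.begin(), v.end()), v.end());
  // The sort/unique/resize form used here is equivalent: both leave the vector sorted
  // with duplicates removed.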
3041  // Assign neighbors to the PEs on this node that have them
3042  for (int i=0;i < neighborPids.size();i++) {
3043  for (int rank=0;rank < computes.size();rank++) {
3044  int pid = neighborPids[i];
3045  int pe = rank + CkNodeFirst(CkMyNode());
3046  if (patchMap->node(pid) == pe) {
3047  // Patch pid found on PE "pe" on this node
3048  tuplePatchList.add( TuplePatchElem(pid) );
3049  patchIDsPerRank[rank].push_back(pid);
3050  allPatchIDs.push_back(pid);
3051  // Add to this rank's patches
3052  patchIDsToAppend[rank].push_back(pid);
3053  // Add to the list of PEs
3054  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3055  break;
3056  }
3057  }
3058  }
3059  // Remove duplicates from pes
3060  {
3061  std::sort(pes.begin(), pes.end());
3062  auto it_end = std::unique(pes.begin(), pes.end());
3063  pes.resize(std::distance(pes.begin(), it_end));
3064  }
3065 
3066  // Last, assign all patches in neighbors of home computes
3067  // NOTE: Will create proxies on multiple nodes
3068  for (int rank=0;rank < computes.size();rank++) {
3069  PatchID neighbors[PatchMap::MaxOneAway];
3070  HomeCompute& homeCompute = computes[rank].homeCompute;
3071  std::vector<int>& patchIDs = homeCompute.patchIDs;
3072  std::vector<int> neighborPatchIDs;
3073  for (int i=0;i < patchIDs.size();i++) {
3074  int patchID = patchIDs[i];
3075  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
3076  for (int j=0;j < numNeighbors;j++) {
3077  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
3078  // Patch not found => Add Proxy
3079  tuplePatchList.add( TuplePatchElem(neighbors[j]) );
3080  patchIDsPerRank[rank].push_back(neighbors[j]);
3081  allPatchIDs.push_back(neighbors[j]);
3082  }
3083  if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
3084  && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
3085  neighborPatchIDs.push_back(neighbors[j]);
3086  }
3087  }
3088  }
3089  // Append neighboring patchIDs to homeCompute.patchIDs
3090  // int start = patchIDs.size();
3091  // patchIDs.resize(patchIDs.size() + neighborPatchIDs.size());
3092  // for (int i=0;i < neighborPatchIDs.size();i++) {
3093  // patchIDs[start + i] = neighborPatchIDs[i];
3094  // }
3095  for (int i=0;i < neighborPatchIDs.size();i++) {
3096  patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
3097  }
3098  }
3099 
3100  for (int rank=0;rank < patchIDsToAppend.size();rank++) {
3101  for (int i=0;i < patchIDsToAppend[rank].size();i++) {
3102  computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
3103  }
3104  }
3105 
3106  // Remove duplicate patch IDs
3107  {
3108  std::sort(allPatchIDs.begin(), allPatchIDs.end());
3109  auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
3110  allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
3111  }
3112 
3113  // Set number of (unique) patches
3114  setNumPatches(allPatchIDs.size());
3115 
3116  // Reset patchesCounter
3117  patchesCounter = getNumPatches();
3118 
3119  patches.resize(getNumPatches());
3120  // Setup tupleList
3121  // NOTE: tupleList is only populated here during initialization; TODO: determine where it is updated afterwards.
3122  for (int rank=0;rank < computes.size();rank++) {
3123  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3124  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3125  tupleList[it->tuples->getType()].push_back(it->tuples);
3126  }
3127  HomeCompute& homeCompute = computes[rank].homeCompute;
3128  for (int i=0;i < homeCompute.tuples.size();i++) {
3129  tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
3130  }
3131  }
3132 
3133  // Allocate host memory for energies and virials
3134  allocate_host<double>(&energies_virials, ComputeBondedCUDAKernel::energies_virials_SIZE);
3135 
3136  // Finally, do sanity checks
3137  std::vector<char> patchIDset(patchMap->numPatches(), 0);
3138  int numPatchIDset = 0;
3139  int numPatchIDs = 0;
3140  for (int rank=0;rank < computes.size();rank++) {
3141  numPatchIDs += patchIDsPerRank[rank].size();
3142  for (int i=0;i < patchIDsPerRank[rank].size();i++) {
3143  PatchID patchID = patchIDsPerRank[rank][i];
3144  if (patchIDset[patchID] == 0) numPatchIDset++;
3145  patchIDset[patchID] = 1;
3146  if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
3147  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3148  }
3149  }
3150  }
3151  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
3152  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3153  }
3154 
3155  // Warning: Direct indexing used, patchIndex could use up a lot of memory for large systems
3156  patchIndex.resize(patchMap->numPatches());
3157  atomMappers.resize(getNumPatches());
3158  for (int i=0;i < getNumPatches();i++) {
3159  atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
3160  patchIndex[allPatchIDs[i]] = i;
3161  }
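  // NOTE: patchIndex maps a global PatchID to this compute's dense local patch index, and
  // each AtomMapper presumably handles the global-to-local atom index translation for its
  // patch through the shared atomMap.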
3162 
3163  // Initialize the alchemical flags, parameters and lambda values
3164  updateHostCudaAlchFlags();
3165  updateKernelCudaAlchFlags();
3166  if (hostAlchFlags.alchOn) {
3167  updateHostCudaAlchParameters();
3168  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3169  updateHostCudaAlchLambdas();
3170  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3171  if (hostAlchFlags.alchDecouple) {
3172  pswitchTable[1+3*1] = 0;
3173  pswitchTable[2+3*2] = 0;
3174  }
3175  }
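  // NOTE: pswitchTable appears to map an (alchemical partition, partition) pair
  // (0 = non-alchemical, 1 = appearing group, 2 = disappearing group) to the lambda
  // scaling case used by the bonded kernels, with 99 marking a 1-2 pair that should not
  // interact. The adjustment above resets the intra-group entries (1,1) and (2,2) to 0
  // when alchDecouple is on, i.e. they are presumably treated as regular, unscaled terms.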
3176 
3177  // Copy coefficients to GPU
3178  Parameters* parameters = Node::Object()->parameters;
3179  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
3180  if (tupleList[tupletype].size() > 0) {
3181  switch(tupletype) {
3182 
3183  case Tuples::BOND:
3184  {
3185  int NumBondParams = parameters->NumBondParams;
3186  BondValue* bond_array = parameters->bond_array;
3187  std::vector<CudaBondValue> bondValues(NumBondParams);
3188  for (int i=0;i < NumBondParams;i++) {
3189  bondValues[i].k = bond_array[i].k;
3190  bondValues[i].x0 = bond_array[i].x0;
3191  bondValues[i].x1 = bond_array[i].x1;
3192  }
3193  bondedKernel.setupBondValues(NumBondParams, bondValues.data());
3194  }
3195  break;
3196 
3197  case Tuples::ANGLE:
3198  {
3199  int NumAngleParams = parameters->NumAngleParams;
3200  AngleValue* angle_array = parameters->angle_array;
3201  std::vector<CudaAngleValue> angleValues(NumAngleParams);
3202  bool normal_ub_error = false;
3203  for (int i=0;i < NumAngleParams;i++) {
3204  angleValues[i].k = angle_array[i].k;
3205  if (angle_array[i].normal == 1) {
3206  angleValues[i].theta0 = angle_array[i].theta0;
3207  } else {
3208  angleValues[i].theta0 = cos(angle_array[i].theta0);
3209  }
3210  normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
3211  angleValues[i].k_ub = angle_array[i].k_ub;
3212  angleValues[i].r_ub = angle_array[i].r_ub;
3213  angleValues[i].normal = angle_array[i].normal;
3214  }
3215  if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
3216  bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
3217  }
3218  break;
3219 
3220  case Tuples::DIHEDRAL:
3221  {
3222  int NumDihedralParams = parameters->NumDihedralParams;
3223  DihedralValue* dihedral_array = parameters->dihedral_array;
3224  int NumDihedralParamsMult = 0;
3225  for (int i=0;i < NumDihedralParams;i++) {
3226  NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
3227  }
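      // The per-dihedral parameters are flattened over multiplicity below. For each term
      // the periodicity n is packed as (n << 1) | moreTermsFollow, so the kernel can walk
      // the consecutive terms belonging to one dihedral. A hypothetical device-side decode
      // (names not from the original source) would be:
      //   int periodicity = cudaN >> 1;
      //   bool lastTerm   = (cudaN & 1) == 0;
      // The improper parameters below use the same packing.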
3228  std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
3229  dihedralMultMap.resize(NumDihedralParams);
3230  int k = 0;
3231  for (int i=0;i < NumDihedralParams;i++) {
3232  int multiplicity = dihedral_array[i].multiplicity;
3233  dihedralMultMap[i] = k;
3234  for (int j=0;j < multiplicity;j++) {
3235  dihedralValues[k].k = dihedral_array[i].values[j].k;
3236  dihedralValues[k].n = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3237  dihedralValues[k].delta = dihedral_array[i].values[j].delta;
3238  k++;
3239  }
3240  }
3241  bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
3242  }
3243  break;
3244 
3245  case Tuples::IMPROPER:
3246  {
3247  int NumImproperParams = parameters->NumImproperParams;
3248  ImproperValue* improper_array = parameters->improper_array;
3249  int NumImproperParamsMult = 0;
3250  for (int i=0;i < NumImproperParams;i++) {
3251  NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
3252  }
3253  std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
3254  improperMultMap.resize(NumImproperParams);
3255  int k = 0;
3256  for (int i=0;i < NumImproperParams;i++) {
3257  int multiplicity = improper_array[i].multiplicity;
3258  improperMultMap[i] = k;
3259  for (int j=0;j < multiplicity;j++) {
3260  improperValues[k].k = improper_array[i].values[j].k;
3261  improperValues[k].n = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3262  improperValues[k].delta = improper_array[i].values[j].delta;
3263  k++;
3264  }
3265  }
3266  bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
3267  }
3268  break;
3269 
3270  case Tuples::CROSSTERM:
3271  {
3272  int NumCrosstermParams = parameters->NumCrosstermParams;
3273  CrosstermValue* crossterm_array = parameters->crossterm_array;
3274  std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
3275  const int D = CrosstermValue::dim;
3276  const int N = CrosstermValue::dim - 1;
3277  for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
3278  for (int i=0;i < N;i++) {
3279  for (int j=0;j < N;j++) {
3280 
3281  // Sets up the coefficients for bicubic interpolation.
3282  // See https://en.wikipedia.org/wiki/Bicubic_interpolation
3283 
3284  #define INDEX(ncols,i,j) ((i)*ncols + (j))
3285  CrosstermData* table = &crossterm_array[ipar].c[0][0];
3286 
3287  const double Ainv[16][16] = {
3288  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3289  { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3290  {-3, 3, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3291  { 2, -2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3292  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
3293  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0},
3294  { 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0},
3295  { 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0},
3296  {-3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0},
3297  { 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0},
3298  { 9, -9, -9, 9, 6, 3, -6, -3, 6, -6, 3, -3, 4, 2, 2, 1},
3299  {-6, 6, 6, -6, -3, -3, 3, 3, -4, 4, -2, 2, -2, -2, -1, -1},
3300  { 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
3301  { 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0},
3302  {-6, 6, 6, -6, -4, -2, 4, 2, -3, 3, -3, 3, -2, -1, -2, -1},
3303  { 4, -4, -4, 4, 2, 2, -2, -2, 2, -2, 2, -2, 1, 1, 1, 1}
3304  };
3305 
3306 #ifndef M_PI
3307 #define M_PI 3.14159265358979323846
3308 #endif
3309 
3310  const double h = M_PI/12.0;
3311 
3312  const double x[16] = {
3313  table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
3314  table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
3315  table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
3316  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
3317  };
3318 
3319  // a = Ainv*x
3320  float* a = (float *)&crosstermValues[ipar].c[i][j][0];
3321  for (int k=0;k < 16;k++) {
3322  double a_val = 0.0;
3323  for (int l=0;l < 16;l++) {
3324  a_val += Ainv[k][l]*x[l];
3325  }
3326  a[k] = (float)a_val;
3327  }
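          // The 16 floats written above are presumably the bicubic coefficients a_mn of
          // p(t,u) = sum_{m,n} a_mn * t^m * u^n over this grid cell, with t and u the
          // fractional positions inside the cell. The factors of h = pi/12 (the 15-degree
          // CMAP grid spacing) fold the spacing into the derivative samples so that t and
          // u are dimensionless.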
3328 
3329  }
3330  }
3331  }
3332  bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
3333  }
3334  break;
3335 
3336  case Tuples::EXCLUSION:
3337  // Nothing to do
3338  break;
3339 
3340  default:
3341  NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
3342  break;
3343  }
3344  }
3345  }
3346 
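  // Kick off patch assignment on every PE that contributes work; presumably each PE then
  // registers position/force boxes for its patches in assignPatchesOnPe().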
3347  computeMgr->sendAssignPatchesOnPe(pes, this);
3348 
3349  int nDevices = deviceCUDA->getNumDevice();
3350 
3351 #ifdef NODEGROUP_FORCE_REGISTER
3352  // TODO (DMC): this is not the number of home patches, but that is okay for now.
3353  if (params->CUDASOAintegrateMode && params->useDeviceMigration) {
3354  migrationKernel.setup(nDevices, patchMap->numPatches());
3355 
3356  // Tuple Migration Structures
3357  // Patch record Data
3358  allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
3359  allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());
3360 
3361  // Patch Centers
3362  allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
3363  allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
3364  for (int i = 0; i < patchMap->numPatches(); i++) {
3365  COPY_CUDAVECTOR(patchMap->center(i), h_patchMapCenter[i]);
3366  }
3367  copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
3368  }
3369 #endif // NODEGROUP_FORCE_REGISTER
3370 }
3371 
3372 #ifdef NODEGROUP_FORCE_REGISTER
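// NOTE: updatePatchOrder() rebuilds the local patches vector in the device-sorted order
// given by the CudaLocalRecord list; atomStart and numAtoms are presumably filled in
// later once per-patch atom counts are known.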
3373 void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
3374  // TODO: Is there anywhere else where the patch order matters?
3375  // DMC: This vector of CudaLocalRecords does not have the correct number of peer records.
3376  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
3377  for (int i=0;i < data.size();i++) {
3378  patchIndex[data[i].patchID] = i;
3379  }
3380 
3381  patches.clear();
3382  for (int i=0;i < data.size();i++) {
3383  PatchRecord p;
3384  p.patchID = data[i].patchID;
3385  p.atomStart = 0;
3386  p.numAtoms = 0;
3387  patches.push_back(p);
3388  }
3389 }
3390 #endif // NODEGROUP_FORCE_REGISTER
3391 
3392 void ComputeBondedCUDA::updateHostCudaAlchFlags() {
3393  const SimParameters& sim_params = *(Node::Object()->simParameters);
3394  hostAlchFlags.alchOn = sim_params.alchOn;
3395  hostAlchFlags.alchFepOn = sim_params.alchFepOn;
3396  hostAlchFlags.alchThermIntOn = sim_params.alchThermIntOn;
3397  hostAlchFlags.alchWCAOn = sim_params.alchWCAOn;
3398  hostAlchFlags.alchLJcorrection = sim_params.LJcorrection;
3399  hostAlchFlags.alchVdwForceSwitching = ComputeNonbondedUtil::vdwForceSwitching;
3400  hostAlchFlags.alchDecouple = sim_params.alchDecouple;
3401  hostAlchFlags.alchBondDecouple = sim_params.alchBondDecouple;
3402 }
3403 
3404 void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
3405  bondedKernel.updateCudaAlchFlags(hostAlchFlags);
3406 }
3407 
3408 void ComputeBondedCUDA::updateHostCudaAlchParameters() {
3409  const SimParameters& sim_params = *(Node::Object()->simParameters);
3410  hostAlchParameters.switchDist2 = ComputeNonbondedUtil::switchOn * ComputeNonbondedUtil::switchOn;
3411  hostAlchParameters.alchVdwShiftCoeff = sim_params.alchVdwShiftCoeff;
3412 }
3413 
3414 void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
3415  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3416 }
3417 
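// NOTE: in the lambda set below, the "Up"/"Down" values appear to scale interactions of
// the appearing (partition 1) and disappearing (partition 2) groups respectively, while
// the "*2" variants are evaluated at the comparison lambda (lambda2) that FEP uses for
// the energy difference.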
3418 void ComputeBondedCUDA::updateHostCudaAlchLambdas() {
3419  SimParameters& sim_params = *(Node::Object()->simParameters);
3420  hostAlchLambdas.currentLambda = float(sim_params.getCurrentLambda(step));
3421  hostAlchLambdas.currentLambda2 = float(sim_params.getCurrentLambda2(step));
3422  hostAlchLambdas.bondLambda1 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda));
3423  hostAlchLambdas.bondLambda2 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda));
3424  hostAlchLambdas.bondLambda12 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda2));
3425  hostAlchLambdas.bondLambda22 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
3426  hostAlchLambdas.elecLambdaUp = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda));
3427  hostAlchLambdas.elecLambda2Up = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda2));
3428  hostAlchLambdas.elecLambdaDown = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda));
3429  hostAlchLambdas.elecLambda2Down = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
3430  hostAlchLambdas.vdwLambdaUp = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda));
3431  hostAlchLambdas.vdwLambda2Up = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda2));
3432  hostAlchLambdas.vdwLambdaDown = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
3433  hostAlchLambdas.vdwLambda2Down = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
3434 }
3435 
3436 void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
3437  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3438 }
3439 
3440 #ifdef NODEGROUP_FORCE_REGISTER
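// NOTE: registerPointersToHost() publishes this device's tuple staging buffers, tuple
// counts and offsets into node-wide PatchData arrays indexed by deviceIndex, and
// copyHostRegisterToDevice() later gathers every device's entries onto this device,
// presumably so the migration kernel can read its peers' data directly over
// peer-to-peer access.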
3441 void ComputeBondedCUDA::registerPointersToHost() {
3442  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3443  PatchData *patchData = cpdata.ckLocalBranch();
3444 
3445  const int deviceIndex = deviceCUDA->getDeviceIndex();
3446 
3447  TupleDataStage dst = migrationKernel.getDstBuffers();
3448  patchData->h_tupleDataStage.bond[deviceIndex] = dst.bond;
3449  patchData->h_tupleDataStage.angle[deviceIndex] = dst.angle;
3450  patchData->h_tupleDataStage.dihedral[deviceIndex] = dst.dihedral;
3451  patchData->h_tupleDataStage.improper[deviceIndex] = dst.improper;
3452  patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
3453  patchData->h_tupleDataStage.exclusion[deviceIndex] = dst.exclusion;
3454  patchData->h_tupleDataStage.crossterm[deviceIndex] = dst.crossterm;
3455 
3456  TupleIntArraysContiguous count = migrationKernel.getDeviceTupleCounts();
3457  patchData->h_tupleCount.bond[deviceIndex] = count.bond();
3458  patchData->h_tupleCount.angle[deviceIndex] = count.angle();
3459  patchData->h_tupleCount.dihedral[deviceIndex] = count.dihedral();
3460  patchData->h_tupleCount.improper[deviceIndex] = count.improper();
3461  patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
3462  patchData->h_tupleCount.exclusion[deviceIndex] = count.exclusion();
3463  patchData->h_tupleCount.crossterm[deviceIndex] = count.crossterm();
3464 
3465  TupleIntArraysContiguous offset = migrationKernel.getDeviceTupleOffsets();
3466  patchData->h_tupleOffset.bond[deviceIndex] = offset.bond();
3467  patchData->h_tupleOffset.angle[deviceIndex] = offset.angle();
3468  patchData->h_tupleOffset.dihedral[deviceIndex] = offset.dihedral();
3469  patchData->h_tupleOffset.improper[deviceIndex] = offset.improper();
3470  patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
3471  patchData->h_tupleOffset.exclusion[deviceIndex] = offset.exclusion();
3472  patchData->h_tupleOffset.crossterm[deviceIndex] = offset.crossterm();
3473 }
3474 
3475 void ComputeBondedCUDA::copyHostRegisterToDevice() {
3476  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3477  PatchData *patchData = cpdata.ckLocalBranch();
3478 
3479  const int deviceIndex = deviceCUDA->getDeviceIndex();
3480  const int nDevices = deviceCUDA->getNumDevice();
3481  const int deviceID = deviceCUDA->getDeviceID();
3482  cudaCheck(cudaSetDevice(deviceID));
3483 
3484  migrationKernel.copyPeerDataToDevice(
3485  patchData->h_tupleDataStage,
3486  patchData->h_tupleCount,
3487  patchData->h_tupleOffset,
3488  nDevices,
3489  stream
3490  );
3491 }
3492 
3493 void ComputeBondedCUDA::copyPatchData() {
3494  PatchMap* patchMap = PatchMap::Object();
3495  int numPatches = patchMap->numPatches();
3496  // Constructs a patch record array that is indexed by global patchID
3497  for (int i = 0; i < numPatches; i++) {
3498  h_patchRecord[i] = patches[patchIndex[i]];
3499  }
3500 
3501  copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
3502 }
3503 #endif // NODEGROUP_FORCE_REGISTER
3504 
3505 
3506 #endif // BONDED_CUDA
3507 #endif // NAMD_CUDA