NAMD
ComputeBondedCUDA.C
1 #include <algorithm> // std::find
2 
3 #include "ComputeAniso.h"
5 #include "ComputeHomeTuples.h"
6 #include "ComputeMap.h"
7 #include "ComputeSelfTuples.h"
8 #include "ComputeThole.h"
9 #include "ReductionMgr.h"
10 #include "SimParameters.h"
11 #include "charm++.h"
12 #include "NamdTypes.h"
13 #include "ComputeMgr.h"
14 #include "WorkDistrib.h"
15 #include "ProxyMgr.h"
16 #include "CudaUtils.h"
17 #include "DeviceCUDA.h"
18 #include "ComputeBonds.h"
19 #include "ComputeAngles.h"
20 #include "ComputeDihedrals.h"
21 #include "ComputeImpropers.h"
22 #include "ComputeCrossterms.h"
24 #include "ComputeBondedCUDA.h"
25 #include "PatchData.h"
26 #include "HipDefines.h"
27 #include "NamdEventsProfiling.h"
28 #include "CudaRecord.h"
29 #include "SynchronousCollectives.h"
30 
31 #include "TestArray.h"
32 
33 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
34 #ifdef WIN32
35 #define __thread __declspec(thread)
36 #endif
37 extern __thread DeviceCUDA *deviceCUDA;
38 
39 #ifdef BONDED_CUDA
40 
41 
42 const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {
43  sizeof(CudaBond), // Bonds
44  sizeof(CudaAngle), // Angles
45  sizeof(CudaDihedral), // Dihedrals
46  sizeof(CudaDihedral), // Impropers
47  sizeof(CudaExclusion), // Exclusions
48  sizeof(CudaCrossterm), // Crossterms
49  sizeof(CudaThole), // Thole
50  sizeof(CudaAniso), // Aniso
51 };
52 
53 const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {
54  sizeof(CudaBondStage), // Bonds
55  sizeof(CudaAngleStage), // Angles
56  sizeof(CudaDihedralStage), // Dihedrals
57  sizeof(CudaDihedralStage), // Impropers
58  sizeof(CudaExclusionStage), // Exclusions
59  sizeof(CudaCrosstermStage), // Crossterms
60  sizeof(CudaTholeStage), // Thole
61  sizeof(CudaAnisoStage), // Aniso
62 };
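// Per-tuple byte sizes for each tuple type, used to size and partition the single flat
// tupleData buffer. The *Stage sizes cover the staging structs (patchID plus local index
// per atom) used when tuples are kept resident on the GPU with device-side migration
// (NODEGROUP_FORCE_REGISTER builds).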
63 
64 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
65 
66 //
67 // Class constructor
68 //
69 ComputeBondedCUDA::ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID,
70  CudaNonbondedTables& cudaNonbondedTables) :
71 Compute(c), deviceID(deviceID), masterPe(CkMyPe()),
72 bondedKernel(deviceID, cudaNonbondedTables), computeMgr(computeMgr)
73 {
74 
75  computes.resize(CkMyNodeSize());
76  patchIDsPerRank.resize(CkMyNodeSize());
77  numExclPerRank.resize(CkMyNodeSize());
78  for (int i=0;i < numExclPerRank.size();i++) {
79  numExclPerRank[i].numModifiedExclusions = 0;
80  numExclPerRank[i].numExclusions = 0;
81  }
82 
83  atomMap.allocateMap(Node::Object()->molecule->numAtoms);
84 
85  flags = NULL;
86  step = 0;
87 
88  tupleData = NULL;
89  tupleDataSize = 0;
90 
91  atoms = NULL;
92  atomsSize = 0;
93 
94  forces = NULL;
95  forcesSize = 0;
96 
97  energies_virials = NULL;
98 
99  initializeCalled = false;
100 
101  params = Node::Object()->simParameters;
102  accelMDdoDihe = false;
103  if (params->accelMDOn) {
104  if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;
105  }
106  /*pswitchTable = {0, 1, 2,
107  1, 1, 99,
108  2, 99, 2};
109  */
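 // pswitchTable is indexed with p1 + 3*p2, where p1 and p2 are the alchemical partitions
 // (0, 1 or 2) of the two atoms of an exclusion (see copyExclusionData below). 0 selects
 // the non-alchemical path, 1 and 2 select the two lambda-coupled groups, and 99 marks
 // partition-1/partition-2 pairs, which never interact.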
110  pswitchTable[0] = 0; pswitchTable[1] = 1; pswitchTable[2] = 2;
111  pswitchTable[3] = 1; pswitchTable[4] = 1; pswitchTable[5] = 99;
112  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;
113 
114  h_patchRecord = NULL;
115  d_patchRecord = NULL;
116 
117  h_patchMapCenter = NULL;
118  d_patchMapCenter = NULL;
119 
120 }
121 
122 //
123 // Class destructor
124 //
125 ComputeBondedCUDA::~ComputeBondedCUDA() {
126  cudaCheck(cudaSetDevice(deviceID));
127 
128  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
129  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
130  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
131  if (tupleData != NULL) deallocate_host<char>(&tupleData);
132 
133  if (initializeCalled) {
134  cudaCheck(cudaStreamDestroy(stream));
135  cudaCheck(cudaEventDestroy(forceDoneEvent));
136  CmiDestroyLock(lock);
137  CmiDestroyLock(printLock);
138  if (reductionGpuOffload) {
139  delete reductionGpuOffload;
140  }
141  if (reductionGpuResident) {
142  delete reductionGpuResident;
143  }
144  }
145 
146  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
147  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);
148 
149  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
150  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);
151 
152  // NOTE: unregistering happens in the [sync] entry method
153  computeMgr->sendUnregisterBoxesOnPe(pes, this);
154 }
155 
156 void ComputeBondedCUDA::unregisterBoxesOnPe() {
157  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
158  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
159  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
160  if (tpe == NULL || tpe->p == NULL) {
161  NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
162  }
163  Patch* patch = tpe->p;
164  if (tpe->positionBox != NULL) patch->unregisterPositionPickup(this, &tpe->positionBox);
165  if (tpe->avgPositionBox != NULL) patch->unregisterAvgPositionPickup(this, &tpe->avgPositionBox);
166  if (tpe->forceBox != NULL) patch->unregisterForceDeposit(this, &tpe->forceBox);
167  }
168 }
169 
170 //
171 // Register compute for a given PE. pids is a list of patches the PE has
172 // Called by master PE
173 //
174 void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {
175 
176  if (CkMyPe() != masterPe)
177  NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");
178 
179  int rank = CkRankOf(pe);
180 
181  HomeCompute& homeCompute = computes[rank].homeCompute;
182  if (homeCompute.patchIDs.size() == 0) {
183  homeCompute.isBasePatch.resize(PatchMap::Object()->numPatches(), 0);
184  homeCompute.patchIDs.resize(pids.size());
185  for (int i=0;i < pids.size();i++) {
186  homeCompute.patchIDs[i] = pids[i];
187  homeCompute.isBasePatch[pids[i]] = 1;
188  }
189  } else {
190  if (homeCompute.patchIDs.size() != pids.size()) {
191  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
192  }
193  for (int i=0;i < pids.size();i++) {
194  if (homeCompute.patchIDs[i] != pids[i]) {
195  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
196  }
197  }
198  }
199 
200  switch(type) {
201  case computeBondsType:
202  homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
203  break;
204 
205  case computeAnglesType:
206  homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
207  break;
208 
209  case computeDihedralsType:
210  homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
211  break;
212 
213  case computeImpropersType:
214  homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
215  break;
216 
217  case computeExclsType:
218  homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
219  break;
220 
221  case computeCrosstermsType:
222  homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
223  break;
224 
225  case computeTholeType:
226  homeCompute.tuples.push_back(new HomeTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE));
227  break;
228 
229  case computeAnisoType:
230  homeCompute.tuples.push_back(new HomeTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO));
231  break;
232 
233  default:
234  NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
235  break;
236  }
237 
238 }
239 
240 //
241 // Register self compute for a given PE
242 // Called by master PE
243 //
244 void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {
245 
246  if (CkMyPe() != masterPe)
247  NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");
248 
249  int rank = CkRankOf(pe);
250 
251  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
252  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
253  if (it == selfComputes.end()) {
254  // Type not found, add new one
255  selfComputes.push_back(SelfCompute(type));
256  it = selfComputes.begin() + (selfComputes.size() - 1);
257 
258  switch(type) {
259  case computeSelfBondsType:
260  it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
261  break;
262 
263  case computeSelfAnglesType:
264  it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
265  break;
266 
267  case computeSelfDihedralsType:
268  it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
269  break;
270 
271  case computeSelfImpropersType:
272  it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
273  break;
274 
275  case computeSelfExclsType:
276  it->tuples = new SelfTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION);
277  break;
278 
279  case computeSelfCrosstermsType:
280  it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
281  break;
282 
283  case computeSelfTholeType:
284  it->tuples = new SelfTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE);
285  break;
286 
287  case computeSelfAnisoType:
288  it->tuples = new SelfTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO);
289  break;
290 
291  default:
292  NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
293  break;
294  }
295 
296  }
297 
298  // Add patch ID for this type
299  it->patchIDs.push_back(pid);
300 }
301 
302 void ComputeBondedCUDA::assignPatchesOnPe() {
303 
304  PatchMap* patchMap = PatchMap::Object();
305  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
306  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
307  ProxyMgr::Object()->createProxy(patchID);
308  Patch* patch = patchMap->patch(patchID);
309  if (patch == NULL)
310  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
311  if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
312  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
313  if (tpe == NULL) {
314  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
315  }
316  if (tpe->p != NULL) {
317  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
318  }
319  // Assign patch and register coordinates and forces manually
320  tpe->p = patch;
321  tpe->positionBox = patch->registerPositionPickup(this);
322  tpe->avgPositionBox = patch->registerAvgPositionPickup(this);
323  tpe->forceBox = patch->registerForceDeposit(this);
324  }
325 }
326 
327 //
328 // atomUpdate() can be called by any Pe
329 //
330 void ComputeBondedCUDA::atomUpdate() {
331  atomsChangedIn = true;
332 }
333 
334 //
335 // Enqueue doWork on masterPe and return "no work"
336 // Can be called by any Pe
337 //
338 int ComputeBondedCUDA::noWork() {
339  computeMgr->sendMessageEnqueueWork(masterPe, this);
340  return 1;
341 }
342 
343 void ComputeBondedCUDA::messageEnqueueWork() {
344  if (masterPe != CkMyPe())
345  NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
346  WorkDistrib::messageEnqueueWork(this);
347 }
348 
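// Work-forwarding pattern: noWork() runs on every PE that shares this compute, forwards
// the compute to the master PE via sendMessageEnqueueWork(), and returns 1 so the local
// scheduler treats it as "no work here". messageEnqueueWork() then runs on the master PE
// and enqueues the compute through WorkDistrib, which later calls doWork().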
349 //
350 // Sends open-box commands to PEs
351 // Called on master PE
352 //
353 void ComputeBondedCUDA::doWork() {
354  if (CkMyPe() != masterPe)
355  NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");
356 
357  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
358  // atomsChangedIn can be set to true by any Pe
359  // atomsChanged can only be set by masterPe
360  // Using two variables makes sure we don't have a race condition
361  atomsChanged = atomsChangedIn;
362  atomsChangedIn = false;
363 
364  if (getNumPatches() == 0) {
365  return; // No work to do
366  }
367 
368  if (flags == NULL)
369  NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");
370 
371  // Read flags
372  // (flags points to the Flags of the first patch assigned to this compute; see assignPatchesOnPe)
373  lattice = flags->lattice;
374  doEnergy = flags->doEnergy;
375  doVirial = flags->doVirial;
376  doSlow = flags->doFullElectrostatics;
377  doMolly = flags->doMolly;
378  step = flags->step;
379 
380  if (hostAlchFlags.alchOn) {
381  updateHostCudaAlchLambdas();
382  updateKernelCudaAlchLambdas();
383  const int& alchOutFreq = Node::Object()->simParameters->alchOutFreq;
384  if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {
385  doEnergy = true;
386  }
387  }
388 
389  if (atomsChanged) {
390  // Re-calculate patch atom numbers and storage
391  updatePatches();
392  }
393  // Open boxes on Pes and launch work to masterPe
394  if(params->CUDASOAintegrate) {
395  if (!atomsChanged) this->openBoxesOnPe();
396  }
397  else computeMgr->sendOpenBoxesOnPe(pes, this);
398 }
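// With CUDASOAintegrate, doWork() opens the boxes directly on the master PE unless atoms
// migrated this step; without it, it broadcasts sendOpenBoxesOnPe() to every PE that owns
// patches. openBoxesOnPe() then decrements patchesCounter under the lock, and the PE that
// opens the last patch either triggers tuple loading (after migration) or launches the
// kernel work.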
399 
400 //
401 // This gets called when patch finishes on a PE
402 //
403 void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
404  if (doneMigration) {
405  // auto it = patchIndex.find(pid);
406  // if (it == patchIndex.end())
407  // NAMD_bug("ComputeBondedCUDA::patchReady, Patch ID not found");
408  patches[patchIndex[pid]].numAtoms = PatchMap::Object()->patch(pid)->getNumAtoms();
409 #ifdef NODEGROUP_FORCE_REGISTER
410  patches[patchIndex[pid]].patchID = pid;
411 #endif
412  }
413  // XXX first sum locally, then sum globally
414  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate(),
415  // which is already called in Sequencer::runComputeObjects_CUDA
416  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
417  CmiLock(lock);
418  Compute::patchReady(pid, doneMigration, seq);
419  CmiUnlock(lock);
420  }
421 }
422 
423 //
424 // Recompute per-patch atom counts and atom buffer offsets, and re-allocate the atom buffer
425 //
426 void ComputeBondedCUDA::updatePatches() {
427  if (!Node::Object()->simParameters->CUDASOAintegrate) {
428  int atomStart = 0;
429  for (int i=0;i < patches.size();i++) {
430  patches[i].atomStart = atomStart;
431  atomStart += patches[i].numAtoms;
432  }
433  atomStorageSize = atomStart;
434 
435  // Re-allocate atoms
436  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
437 
438  } else {
439 #ifdef NODEGROUP_FORCE_REGISTER
440  const int deviceIndex = deviceCUDA->getDeviceIndex();
441  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
442  PatchData *patchData = cpdata.ckLocalBranch();
443 
444  std::vector<CudaLocalRecord>& localPatches =
445  patchData->devData[deviceIndex].h_localPatches;
446  const int numPatchesHomeAndProxy =
447  patchData->devData[deviceIndex].numPatchesHomeAndProxy;
448 
449  int atomStart = 0;
450  for (int i=0;i < numPatchesHomeAndProxy; i++) {
451  patches[i].numAtoms = localPatches[i].numAtoms;
452  patches[i].atomStart = localPatches[i].bufferOffset;
453  atomStart += patches[i].numAtoms;
454  }
455 
456  atomStorageSize = atomStart;
457  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
458 
459  if (params->CUDASOAintegrate && params->useDeviceMigration) {
460  bondedKernel.updateAtomBuffer(atomStorageSize, stream);
461  updatePatchRecords();
462  }
463 #endif // NODEGROUP_FORCE_REGISTER
464  }
465 }
466 
467 //
468 // Map atoms GPU-wide
469 //
470 void ComputeBondedCUDA::mapAtoms() {
471  for (int i=0;i < getNumPatches();i++) {
472  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
473  atomMappers[i]->registerIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
474  }
475 
476 }
477 
478 //
479 // Unmap atoms GPU-wide
480 //
481 void ComputeBondedCUDA::unmapAtoms() {
482  for (int i=0;i < getNumPatches();i++) {
483  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
484  atomMappers[i]->unregisterIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
485  }
486 }
487 
488 //
489 // Open all patches that have been assigned to this Pe
490 //
491 void ComputeBondedCUDA::openBoxesOnPe(int startup) {
492  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_OPEN_BOXES);
493  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
494 #if 0
495  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);
496 #endif
497 #ifdef NODEGROUP_FORCE_REGISTER
498  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
499  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
500  PatchID patchID = *it;
501  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
502  tpe->x = tpe->positionBox->open();
503  tpe->r = tpe->forceBox->open();
504  //tpe->f = tpe->r->f[Results::normal];
505  }
506  this->launchWork();
507  }
508  else{
509 #endif
510  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
511  PatchID patchID = *it;
512  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
513  tpe->x = tpe->positionBox->open();
514  tpe->xExt = tpe->p->getCompAtomExtInfo();
515  //fprintf(stderr, "::openBoxesOnPe(%p) PE[%d] step %d PID[%d] = atomExt = %p\n", this, CkMyPe(), tpe->p->flags.step, tpe->p->getPatchID(), tpe->xExt);
516  if ( doMolly ) tpe->x_avg = tpe->avgPositionBox->open();
517  tpe->r = tpe->forceBox->open();
518  tpe->f = tpe->r->f[Results::normal];
519  if (accelMDdoDihe) tpe->af = tpe->r->f[Results::amdf]; // for dihedral-only or dual-boost accelMD
520  // Copy atoms
521  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
522  int pi = patchIndex[patchID];
523  int atomStart = patches[pi].atomStart;
524  int numAtoms = patches[pi].numAtoms;
525  CompAtom* compAtom = tpe->x;
526  const CompAtomExt *aExt = tpe->p->getCompAtomExtInfo();
527  const CudaAtom *src = tpe->p->getCudaAtomList();
528  for (int i=0;i < numAtoms;i++) {
529  int j = aExt[i].sortOrder;
530  // atomStart is this patch's offset into the flat atom array; j is atom i's sortOrder, used here to un-sort the atoms
531  atoms[atomStart + j] = src[i];
532  }
533  }
534  }
535  bool done = false;
536  // NOTE: this whole scheme of counting down the patches will help
537  // when there are multiple master PEs
538  CmiLock(lock);
539  patchesCounter -= patchIDs.size();
540  if (patchesCounter == 0) {
541  patchesCounter = getNumPatches();
542  done = true;
543  }
544  CmiUnlock(lock);
545  if (done) {
546  if (atomsChanged) {
547  if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {
548  mapAtoms();
549  }
550  //computeMgr->sendLoadTuplesOnPe(pes, this);
551  if(!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);
552  } else {
553  //computeMgr->sendLaunchWork(masterPe, this);
554  if(params->CUDASOAintegrate){
555  if(!atomsChanged) this->launchWork();
556  }
557  else computeMgr->sendLaunchWork(masterPe, this);
558  }
559  }
560 #ifdef NODEGROUP_FORCE_REGISTER
561  }
562 #endif
563  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_OPEN_BOXES);
564 
565 #if 0
566  // SynchronousCollectives* syncColl = SynchronousCollectives::Object();
567  // syncColl->barrier(SynchronousCollectiveScope::all);
568  // Patches are using different
569  CmiLock(printLock);
570  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
571  for(int i = 0 ; i < tuplePatchList.size(); i++){
572  // debug printout of the tuplePatchList contents
573  TuplePatchElem *tpe = tuplePatchList.find(TuplePatchElem(i));
574  if(tpe == NULL) break;
575  fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n",CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);
576  }
577  CmiUnlock(printLock);
578  // syncColl->barrier(SynchronousCollectiveScope::all); // Let's see
579 #endif
580 }
581 
582 void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
583  numModifiedExclusions = 0;
584  int ntuples = tuples->getNumTuples();
585  ExclElem* src = (ExclElem *)(tuples->getTupleList());
586  for (int ituple=0;ituple < ntuples;ituple++) {
587  if (src[ituple].modified) numModifiedExclusions++;
588  }
589  numExclusions = ntuples - numModifiedExclusions;
590 }
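// Modified exclusions (scaled 1-4 pairs) and full exclusions are handled separately on the
// GPU, so they are stored in separate tuple lists; this helper counts how many of each a
// tuple list contains so the buffers can be sized accordingly.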
591 
592 //
593 // Load tuples on PE. Note: this can only run after boxes on all PEs have been opened
594 //
595 void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {
596  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_LOAD_TUPLES);
597 
598  int numModifiedExclusions = 0;
599  int numExclusions = 0;
600  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
601  std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
602  // Loop over self compute types
603  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
604  // clears tupleList and reloads the tuples
605  it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
606  // For exclusions, we must count the number of modified and non-modified exclusions
607  if (it->tuples->getType() == Tuples::EXCLUSION) {
608  int tmp1, tmp2;
609  countNumExclusions(it->tuples, tmp1, tmp2);
610  numModifiedExclusions += tmp1;
611  numExclusions += tmp2;
612  }
613  }
614 
615  HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
616  for (int i=0;i < homeCompute.tuples.size();i++) {
617  homeCompute.tuples[i]->loadTuples(tuplePatchList,
618  homeCompute.isBasePatch.data(), &atomMap,
619  homeCompute.patchIDs);
620  // For exclusions, we must count the number of modified and non-modified exclusions
621  if (homeCompute.tuples[i]->getType() == Tuples::EXCLUSION) {
622  int tmp1, tmp2;
623  countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
624  numModifiedExclusions += tmp1;
625  numExclusions += tmp2;
626  }
627  }
628  } else {
629  numModifiedExclusions = modifiedExclusionTupleData.size();
630  numExclusions = exclusionTupleData.size();
631  }
632 
633  // Store number of exclusions
634  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
635  numExclPerRank[CkMyRank()].numExclusions = numExclusions;
636 
637 
638  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
639  bool done = false;
640 
641  // TODO: Swap that for a more efficient atomic operation
642  // Think of a lock-free design
643  CmiLock(lock);
644  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
645  if (patchesCounter == 0) {
646  patchesCounter = getNumPatches();
647  done = true;
648  }
649  CmiUnlock(lock);
650  if (done) {
651  //computeMgr->sendLaunchWork(masterPe, this);
652  //if(params->CUDASOAintegrate && !first) this->launchWork();
653  //else computeMgr->sendLaunchWork(masterPe, this);
654  if(!params->CUDASOAintegrate)computeMgr->sendLaunchWork(masterPe, this);
655  }
656  }
657 
658  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_LOAD_TUPLES);
659 }
660 
661 void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
662  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
663 
664  PatchMap* patchMap = PatchMap::Object();
665  for (int ituple=0;ituple < ntuples;ituple++) {
666  CudaBond dstval;
667  auto p0 = src[ituple].p[0];
668  auto p1 = src[ituple].p[1];
669  int pi0 = patchIndex[p0->patchID];
670  int pi1 = patchIndex[p1->patchID];
671  int l0 = src[ituple].localIndex[0];
672  int l1 = src[ituple].localIndex[1];
673  dstval.i = l0 + patches[pi0].atomStart;
674  dstval.j = l1 + patches[pi1].atomStart;
675  dstval.itype = (src[ituple].value - bond_array);
676  Position position1 = p0->x[l0].position;
677  Position position2 = p1->x[l1].position;
678  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
679  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
680  dstval.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
681  dstval.scale = src[ituple].scale;
682  if (hostAlchFlags.alchOn) {
683  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
684  const Molecule& mol = *(Node::Object()->molecule);
685  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
686  } else {
687  dstval.fepBondedType = 0;
688  }
689  dst[ituple] = dstval;
690  }
691 }
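// The copy*Data() routines share a common pattern: each tuple's atoms are flattened to GPU
// indices (local index + the owning patch's atomStart), the parameter index is the offset of
// the tuple's value pointer into the parameter array, and the periodic image shift between
// bonded partners is precomputed. wrap_delta_scaled() returns the shift in scaled (lattice
// fractional) coordinates; when the atoms sit in different patches, the difference of the
// patch centers is added so the kernel can rebuild the minimum-image separation.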
692 
693 #ifdef NODEGROUP_FORCE_REGISTER
694 template<>
695 void ComputeBondedCUDA::copyTupleToStage(const BondElem& src,
696  const BondValue* __restrict__ p_array, CudaBondStage& dstval) {
697 
698  auto p0 = src.p[0];
699  auto p1 = src.p[1];
700  int pi0 = patchIndex[p0->patchID];
701  int pi1 = patchIndex[p1->patchID];
702  int l0 = src.localIndex[0];
703  int l1 = src.localIndex[1];
704  dstval.itype = (src.value - p_array);
705  dstval.scale = src.scale;
706  dstval.patchIDs[0] = p0->patchID;
707  dstval.patchIDs[1] = p1->patchID;
708 
709  dstval.index[0] = l0;
710  dstval.index[1] = l1;
711 
712  if (hostAlchFlags.alchOn) {
713  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
714  const Molecule& mol = *(Node::Object()->molecule);
715  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
716  } else {
717  dstval.fepBondedType = 0;
718  }
719 }
720 #endif // NODEGROUP_FORCE_REGISTER
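// The copyTupleToStage() specializations build the *Stage records used for GPU-resident
// migration: instead of a flattened global atom index they keep the tuple in patch-relative
// form (patchID plus local index) and omit the image-shift vectors, so the indices and
// shifts can be re-resolved on the device after atoms migrate.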
721 
722 // XXX NOTE: Modified FP32 version
723 void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
724  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
725 
726  PatchMap* patchMap = PatchMap::Object();
727  float3 b1f, b2f, b3f;
728  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
729  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
730  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);
731 
732  for (int ituple=0;ituple < ntuples;ituple++) {
733  CudaBond dstval;
734  auto p0 = src[ituple].p[0];
735  auto p1 = src[ituple].p[1];
736  int pi0 = patchIndex[p0->patchID];
737  int pi1 = patchIndex[p1->patchID];
738  int l0 = src[ituple].localIndex[0];
739  int l1 = src[ituple].localIndex[1];
740  dstval.i = l0 + patches[pi0].atomStart;
741  dstval.j = l1 + patches[pi1].atomStart;
742  dstval.itype = (src[ituple].value - bond_array);
743 #if 0
744  Position position1 = p0->x[l0].position;
745  Position position2 = p1->x[l1].position;
746  Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
747  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
748 #endif
749  float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
750  float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
751  float3 diff = position1 - position2;
752  float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
753  float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
754  float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);
755  if(pi0 != pi1){
756  Vector c = patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
757  d1 += c.x;
758  d2 += c.y;
759  d3 += c.z;
760  }
761  dstval.ioffsetXYZ = make_float3(d1, d2, d3);
762  dstval.scale = src[ituple].scale;
763  if (hostAlchFlags.alchOn) {
764  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
765  const Molecule& mol = *(Node::Object()->molecule);
766  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
767  } else {
768  dstval.fepBondedType = 0;
769  }
770  dst[ituple] = dstval;
771  }
772 }
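// The fp32 variant computes the image shift directly in single precision: with b1, b2, b3
// the reciprocal lattice vectors (lattice.a_r(), b_r(), c_r()), the shift along each lattice
// direction is -floorf(b_k . (x1 - x2) + 0.5f), which mirrors what wrap_delta_scaled() does
// in the double-precision path above.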
773 
774 void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
775  const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst) {
776  PatchMap* patchMap = PatchMap::Object();
777  for (int ituple=0;ituple < ntuples;ituple++) {
778  CudaAngle dstval;
779  auto p0 = src[ituple].p[0];
780  auto p1 = src[ituple].p[1];
781  auto p2 = src[ituple].p[2];
782  int pi0 = patchIndex[p0->patchID];
783  int pi1 = patchIndex[p1->patchID];
784  int pi2 = patchIndex[p2->patchID];
785  int l0 = src[ituple].localIndex[0];
786  int l1 = src[ituple].localIndex[1];
787  int l2 = src[ituple].localIndex[2];
788  dstval.i = l0 + patches[pi0].atomStart;
789  dstval.j = l1 + patches[pi1].atomStart;
790  dstval.k = l2 + patches[pi2].atomStart;
791  dstval.itype = (src[ituple].value - angle_array);
792  Position position1 = p0->x[l0].position;
793  Position position2 = p1->x[l1].position;
794  Position position3 = p2->x[l2].position;
795  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
796  Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
797  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
798  if(pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
799 
800  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
801  dstval.koffsetXYZ = make_float3((float)shiftVec32.x, (float)shiftVec32.y, (float)shiftVec32.z);
802  dstval.scale = src[ituple].scale;
803  if (hostAlchFlags.alchOn) {
804  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
805  const Molecule& mol = *(Node::Object()->molecule);
806  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 3);
807  } else {
808  dstval.fepBondedType = 0;
809  }
810  dst[ituple] = dstval;
811  }
812 }
813 
814 #ifdef NODEGROUP_FORCE_REGISTER
815 template<>
816 void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src,
817  const AngleValue* __restrict__ p_array, CudaAngleStage& dstval) {
818 
819  auto p0 = src.p[0];
820  auto p1 = src.p[1];
821  auto p2 = src.p[2];
822  int pi0 = patchIndex[p0->patchID];
823  int pi1 = patchIndex[p1->patchID];
824  int pi2 = patchIndex[p2->patchID];
825  int l0 = src.localIndex[0];
826  int l1 = src.localIndex[1];
827  int l2 = src.localIndex[2];
828  dstval.itype = (src.value - p_array);
829 
830  dstval.patchIDs[0] = p0->patchID;
831  dstval.patchIDs[1] = p1->patchID;
832  dstval.patchIDs[2] = p2->patchID;
833 
834  dstval.index[0] = l0;
835  dstval.index[1] = l1;
836  dstval.index[2] = l2;
837 
838  dstval.scale = src.scale;
839  if (hostAlchFlags.alchOn) {
840  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
841  const Molecule& mol = *(Node::Object()->molecule);
842  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 3);
843  } else {
844  dstval.fepBondedType = 0;
845  }
846 }
847 #endif // NODEGROUP_FORCE_REGISTER
848 
849 //
850 // Used for both dihedrals and impropers
851 //
852 template <bool doDihedral, typename T, typename P>
853 void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
854  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
855 
856  PatchMap* patchMap = PatchMap::Object();
857 
858  for (int ituple=0;ituple < ntuples;ituple++) {
859  CudaDihedral dstval;
860  auto p0 = src[ituple].p[0];
861  auto p1 = src[ituple].p[1];
862  auto p2 = src[ituple].p[2];
863  auto p3 = src[ituple].p[3];
864  int pi0 = patchIndex[p0->patchID];
865  int pi1 = patchIndex[p1->patchID];
866  int pi2 = patchIndex[p2->patchID];
867  int pi3 = patchIndex[p3->patchID];
868  int l0 = src[ituple].localIndex[0];
869  int l1 = src[ituple].localIndex[1];
870  int l2 = src[ituple].localIndex[2];
871  int l3 = src[ituple].localIndex[3];
872  dstval.i = l0 + patches[pi0].atomStart;
873  dstval.j = l1 + patches[pi1].atomStart;
874  dstval.k = l2 + patches[pi2].atomStart;
875  dstval.l = l3 + patches[pi3].atomStart;
876  if (doDihedral) {
877  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
878  } else {
879  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
880  }
881  Position position1 = p0->x[l0].position;
882  Position position2 = p1->x[l1].position;
883  Position position3 = p2->x[l2].position;
884  Position position4 = p3->x[l3].position;
885  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
886  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
887  Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
888  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
889  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
890  if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
891 
892  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
893  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
894  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
895  dstval.scale = src[ituple].scale;
896  if (hostAlchFlags.alchOn) {
897  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
898  const Molecule& mol = *(Node::Object()->molecule);
899  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
900  } else {
901  dstval.fepBondedType = 0;
902  }
903  dst[ituple] = dstval;
904  }
905 }
906 
907 #ifdef NODEGROUP_FORCE_REGISTER
908 template <>
909 void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src,
910  const DihedralValue* __restrict__ p_array, CudaDihedralStage& dstval) {
911 
912  auto p0 = src.p[0];
913  auto p1 = src.p[1];
914  auto p2 = src.p[2];
915  auto p3 = src.p[3];
916  int pi0 = patchIndex[p0->patchID];
917  int pi1 = patchIndex[p1->patchID];
918  int pi2 = patchIndex[p2->patchID];
919  int pi3 = patchIndex[p3->patchID];
920  int l0 = src.localIndex[0];
921  int l1 = src.localIndex[1];
922  int l2 = src.localIndex[2];
923  int l3 = src.localIndex[3];
924  dstval.itype = dihedralMultMap[(src.value - p_array)];
925 
926  dstval.patchIDs[0] = p0->patchID;
927  dstval.patchIDs[1] = p1->patchID;
928  dstval.patchIDs[2] = p2->patchID;
929  dstval.patchIDs[3] = p3->patchID;
930 
931  dstval.index[0] = l0;
932  dstval.index[1] = l1;
933  dstval.index[2] = l2;
934  dstval.index[3] = l3;
935 
936  dstval.scale = src.scale;
937  if (hostAlchFlags.alchOn) {
938  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
939  const Molecule& mol = *(Node::Object()->molecule);
940  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
941  } else {
942  dstval.fepBondedType = 0;
943  }
944 }
945 
946 template <>
947 void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src,
948  const ImproperValue* __restrict__ p_array, CudaDihedralStage& dstval) {
949 
950  auto p0 = src.p[0];
951  auto p1 = src.p[1];
952  auto p2 = src.p[2];
953  auto p3 = src.p[3];
954  int pi0 = patchIndex[p0->patchID];
955  int pi1 = patchIndex[p1->patchID];
956  int pi2 = patchIndex[p2->patchID];
957  int pi3 = patchIndex[p3->patchID];
958  int l0 = src.localIndex[0];
959  int l1 = src.localIndex[1];
960  int l2 = src.localIndex[2];
961  int l3 = src.localIndex[3];
962  dstval.itype = improperMultMap[(src.value - p_array)];
963 
964  dstval.patchIDs[0] = p0->patchID;
965  dstval.patchIDs[1] = p1->patchID;
966  dstval.patchIDs[2] = p2->patchID;
967  dstval.patchIDs[3] = p3->patchID;
968 
969  dstval.index[0] = l0;
970  dstval.index[1] = l1;
971  dstval.index[2] = l2;
972  dstval.index[3] = l3;
973 
974  dstval.scale = src.scale;
975  if (hostAlchFlags.alchOn) {
976  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
977  const Molecule& mol = *(Node::Object()->molecule);
978  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
979  } else {
980  dstval.fepBondedType = 0;
981  }
982 }
983 
984 template <typename T, typename P, typename D>
985  void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
986  const P* __restrict__ p_array, std::vector<D>& dst) {
987 
988  for (int ituple=0;ituple < ntuples;ituple++) {
989  D dstval;
990  copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
991  dst.push_back(dstval);
992  }
993 }
994 #endif // NODEGROUP_FORCE_REGISTER
995 
996 
997 //
998 // Used for both dihedrals and impropers
999 //
1000 template <bool doDihedral, typename T, typename P>
1001 void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
1002  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
1003 
1004  PatchMap* patchMap = PatchMap::Object();
1005  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
1006  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
1007  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);
1008 
1009  for (int ituple=0;ituple < ntuples;ituple++) {
1010  CudaDihedral dstval;
1011  auto p0 = src[ituple].p[0];
1012  auto p1 = src[ituple].p[1];
1013  auto p2 = src[ituple].p[2];
1014  auto p3 = src[ituple].p[3];
1015  int pi0 = patchIndex[p0->patchID];
1016  int pi1 = patchIndex[p1->patchID];
1017  int pi2 = patchIndex[p2->patchID];
1018  int pi3 = patchIndex[p3->patchID];
1019  int l0 = src[ituple].localIndex[0];
1020  int l1 = src[ituple].localIndex[1];
1021  int l2 = src[ituple].localIndex[2];
1022  int l3 = src[ituple].localIndex[3];
1023  dstval.i = l0 + patches[pi0].atomStart;
1024  dstval.j = l1 + patches[pi1].atomStart;
1025  dstval.k = l2 + patches[pi2].atomStart;
1026  dstval.l = l3 + patches[pi3].atomStart;
1027  if (doDihedral) {
1028  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
1029  } else {
1030  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
1031  }
1032 #if 0
1033  Position position1 = p0->x[l0].position;
1034  Position position2 = p1->x[l1].position;
1035  Position position3 = p2->x[l2].position;
1036  Position position4 = p3->x[l3].position;
1037  Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
1038  Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
1039  Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
1040  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1041  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1042  if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
1043 
1044  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
1045  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
1046  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
1047 #endif
1048 
1049  float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
1050  float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
1051  float3 position3 = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
1052  float3 position4 = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);
1053 
1054  float3 diff12, diff23, diff43;
1055  diff12 = position1 - position2;
1056  diff23 = position2 - position3;
1057  diff43 = position4 - position3;
1058 
1059  float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
1060  float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
1061  float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);
1062 
1063  float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
1064  float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
1065  float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);
1066 
1067  float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
1068  float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
1069  float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);
1070 
1071  if(pi0 != pi1){
1072  Vector c = patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1073  d12_x += c.x;
1074  d12_y += c.y;
1075  d12_z += c.z;
1076  }
1077 
1078  if(pi1 != pi2){
1079  Vector c = patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1080  d23_x += c.x;
1081  d23_y += c.y;
1082  d23_z += c.z;
1083  }
1084 
1085  if(pi3 != pi2){
1086  Vector c = patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
1087  d43_x += c.x;
1088  d43_y += c.y;
1089  d43_z += c.z;
1090  }
1091 
1092  dstval.ioffsetXYZ = make_float3(d12_x, d12_y, d12_z);
1093  dstval.joffsetXYZ = make_float3(d23_x, d23_y, d23_z);
1094  dstval.loffsetXYZ = make_float3(d43_x, d43_y, d43_z);
1095 
1096  dstval.scale = src[ituple].scale;
1097  if (hostAlchFlags.alchOn) {
1098  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1099  const Molecule& mol = *(Node::Object()->molecule);
1100  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1101  } else {
1102  dstval.fepBondedType = 0;
1103  }
1104  dst[ituple] = dstval;
1105  }
1106 }
1107 
1108 
1109 void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
1110  CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int64_t& pos, int64_t& pos2) {
1111 
1112  PatchMap* patchMap = PatchMap::Object();
1113  for (int ituple=0;ituple < ntuples;ituple++) {
1114  auto p0 = src[ituple].p[0];
1115  auto p1 = src[ituple].p[1];
1116  int pi0 = patchIndex[p0->patchID];
1117  int pi1 = patchIndex[p1->patchID];
1118  int l0 = src[ituple].localIndex[0];
1119  int l1 = src[ituple].localIndex[1];
1120  CompAtom& ca1 = p0->x[l0];
1121  CompAtom& ca2 = p1->x[l1];
1122  Position position1 = ca1.position;
1123  Position position2 = ca2.position;
1124  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
1125  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1126  CudaExclusion ce;
1127  ce.i = l0 + patches[pi0].atomStart;
1128  ce.j = l1 + patches[pi1].atomStart;
1129  ce.vdwtypei = ca1.vdwType;
1130  ce.vdwtypej = ca2.vdwType;
1131  ce.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
1132  if (hostAlchFlags.alchOn) {
1133  const unsigned char p1 = ca1.partition;
1134  const unsigned char p2 = ca2.partition;
1135  ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];
1136  } else {
1137  ce.pswitch = 0;
1138  }
1139  if (src[ituple].modified) {
1140  *dst1 = ce;
1141  dst1++;
1142  pos += typeSize;
1143  } else {
1144  *dst2 = ce;
1145  dst2++;
1146  pos2 += typeSize;
1147  }
1148  }
1149 }
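// Exclusions are split into two output streams: modified (scaled 1-4) exclusions go to dst1
// and plain exclusions to dst2, with pos and pos2 advancing by typeSize so the caller knows
// how many bytes of tupleData each stream consumed.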
1150 
1151 #ifdef NODEGROUP_FORCE_REGISTER
1152 template<>
1153 void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src,
1154  const int* __restrict__ p_array, CudaExclusionStage& dstval) {
1155 
1156  auto p0 = src.p[0];
1157  auto p1 = src.p[1];
1158  int pi0 = patchIndex[p0->patchID];
1159  int pi1 = patchIndex[p1->patchID];
1160  int l0 = src.localIndex[0];
1161  int l1 = src.localIndex[1];
1162  CompAtom& ca1 = p0->x[l0];
1163  CompAtom& ca2 = p1->x[l1];
1164 
1165  dstval.vdwtypei = ca1.vdwType;
1166  dstval.vdwtypej = ca2.vdwType;
1167 
1168  dstval.patchIDs[0] = p0->patchID;
1169  dstval.patchIDs[1] = p1->patchID;
1170 
1171  dstval.index[0] = l0;
1172  dstval.index[1] = l1;
1173 
1174  if (hostAlchFlags.alchOn) {
1175  const unsigned char p1 = ca1.partition;
1176  const unsigned char p2 = ca2.partition;
1177  dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];
1178  } else {
1179  dstval.pswitch = 0;
1180  }
1181 }
1182 
1183 
1184 
1185 void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
1186  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {
1187 
1188  for (int ituple=0;ituple < ntuples;ituple++) {
1189  CudaExclusionStage ce;
1190  copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
1191  if (src[ituple].modified) {
1192  dst1.push_back(ce);
1193  pos += typeSize;
1194  } else {
1195  dst2.push_back(ce);
1196  pos2 += typeSize;
1197  }
1198  }
1199 }
1200 #endif // NODEGROUP_FORCE_REGISTER
1201 
1202 void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
1203  const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst) {
1204 
1205  PatchMap* patchMap = PatchMap::Object();
1206  for (int ituple=0;ituple < ntuples;ituple++) {
1207  auto p0 = src[ituple].p[0];
1208  auto p1 = src[ituple].p[1];
1209  auto p2 = src[ituple].p[2];
1210  auto p3 = src[ituple].p[3];
1211  auto p4 = src[ituple].p[4];
1212  auto p5 = src[ituple].p[5];
1213  auto p6 = src[ituple].p[6];
1214  auto p7 = src[ituple].p[7];
1215  int pi0 = patchIndex[p0->patchID];
1216  int pi1 = patchIndex[p1->patchID];
1217  int pi2 = patchIndex[p2->patchID];
1218  int pi3 = patchIndex[p3->patchID];
1219  int pi4 = patchIndex[p4->patchID];
1220  int pi5 = patchIndex[p5->patchID];
1221  int pi6 = patchIndex[p6->patchID];
1222  int pi7 = patchIndex[p7->patchID];
1223  int l0 = src[ituple].localIndex[0];
1224  int l1 = src[ituple].localIndex[1];
1225  int l2 = src[ituple].localIndex[2];
1226  int l3 = src[ituple].localIndex[3];
1227  int l4 = src[ituple].localIndex[4];
1228  int l5 = src[ituple].localIndex[5];
1229  int l6 = src[ituple].localIndex[6];
1230  int l7 = src[ituple].localIndex[7];
1231  dst[ituple].i1 = l0 + patches[pi0].atomStart;
1232  dst[ituple].i2 = l1 + patches[pi1].atomStart;
1233  dst[ituple].i3 = l2 + patches[pi2].atomStart;
1234  dst[ituple].i4 = l3 + patches[pi3].atomStart;
1235  dst[ituple].i5 = l4 + patches[pi4].atomStart;
1236  dst[ituple].i6 = l5 + patches[pi5].atomStart;
1237  dst[ituple].i7 = l6 + patches[pi6].atomStart;
1238  dst[ituple].i8 = l7 + patches[pi7].atomStart;
1239  dst[ituple].itype = (src[ituple].value - crossterm_array);
1240  Position position1 = p0->x[l0].position;
1241  Position position2 = p1->x[l1].position;
1242  Position position3 = p2->x[l2].position;
1243  Position position4 = p3->x[l3].position;
1244  Position position5 = p4->x[l4].position;
1245  Position position6 = p5->x[l5].position;
1246  Position position7 = p6->x[l6].position;
1247  Position position8 = p7->x[l7].position;
1248  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
1249  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
1250  Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
1251  Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
1252  Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
1253  Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
1254  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1255  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1256  if(pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
1257  if(pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
1258  if(pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
1259  if(pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
1260  dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
1261  dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
1262  dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
1263  dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
1264  dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
1265  dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
1266  if (hostAlchFlags.alchOn) {
1267  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1268  const Molecule& mol = *(Node::Object()->molecule);
1269  int typeSum1 = 0, typeSum2 = 0;
1270  for (size_t i = 0; i < 4; ++i) {
1271  typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
1272  typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
1273  }
1274  int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
1275  if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
1276  dst[ituple].fepBondedType = 1;
1277  } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
1278  dst[ituple].fepBondedType = 2;
1279  }
1280  } else {
1281  dst[ituple].fepBondedType = 0;
1282  }
1283  dst[ituple].scale = src[ituple].scale;
1284  }
1285 }
1286 
1287 #ifdef NODEGROUP_FORCE_REGISTER
1288 template <>
1289 void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src,
1290  const CrosstermValue* __restrict__ crossterm_array, CudaCrosstermStage& dstval) {
1291 
1292  auto p0 = src.p[0];
1293  auto p1 = src.p[1];
1294  auto p2 = src.p[2];
1295  auto p3 = src.p[3];
1296  auto p4 = src.p[4];
1297  auto p5 = src.p[5];
1298  auto p6 = src.p[6];
1299  auto p7 = src.p[7];
1300  int pi0 = patchIndex[p0->patchID];
1301  int pi1 = patchIndex[p1->patchID];
1302  int pi2 = patchIndex[p2->patchID];
1303  int pi3 = patchIndex[p3->patchID];
1304  int pi4 = patchIndex[p4->patchID];
1305  int pi5 = patchIndex[p5->patchID];
1306  int pi6 = patchIndex[p6->patchID];
1307  int pi7 = patchIndex[p7->patchID];
1308  int l0 = src.localIndex[0];
1309  int l1 = src.localIndex[1];
1310  int l2 = src.localIndex[2];
1311  int l3 = src.localIndex[3];
1312  int l4 = src.localIndex[4];
1313  int l5 = src.localIndex[5];
1314  int l6 = src.localIndex[6];
1315  int l7 = src.localIndex[7];
1316  dstval.itype = (src.value - crossterm_array);
1317 
1318  dstval.patchIDs[0] = p0->patchID;
1319  dstval.patchIDs[1] = p1->patchID;
1320  dstval.patchIDs[2] = p2->patchID;
1321  dstval.patchIDs[3] = p3->patchID;
1322  dstval.patchIDs[4] = p4->patchID;
1323  dstval.patchIDs[5] = p5->patchID;
1324  dstval.patchIDs[6] = p6->patchID;
1325  dstval.patchIDs[7] = p7->patchID;
1326 
1327  dstval.index[0] = l0;
1328  dstval.index[1] = l1;
1329  dstval.index[2] = l2;
1330  dstval.index[3] = l3;
1331  dstval.index[4] = l4;
1332  dstval.index[5] = l5;
1333  dstval.index[6] = l6;
1334  dstval.index[7] = l7;
1335 
1336  dstval.scale = src.scale;
1337  if (hostAlchFlags.alchOn) {
1338  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1339  const Molecule& mol = *(Node::Object()->molecule);
1340  int typeSum1 = 0, typeSum2 = 0;
1341  for (size_t i = 0; i < 4; ++i) {
1342  typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
1343  typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
1344  }
1345  int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
1346  if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
1347  dstval.fepBondedType = 1;
1348  } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
1349  dstval.fepBondedType = 2;
1350  }
1351  } else {
1352  dstval.fepBondedType = 0;
1353  }
1354 }
1355 #endif // NODEGROUP_FORCE_REGISTER
1356 
1357 void ComputeBondedCUDA::copyTholeData(
1358  const int ntuples, const TholeElem* __restrict__ src,
1359  const TholeValue* __restrict__ thole_array, CudaThole* __restrict__ dst) {
1360  const PatchMap* patchMap = PatchMap::Object();
1361  for (int ituple=0;ituple < ntuples;ituple++) {
1362  const auto p0 = src[ituple].p[0];
1363  const auto p1 = src[ituple].p[1];
1364  const auto p2 = src[ituple].p[2];
1365  const auto p3 = src[ituple].p[3];
1366  const int pi0 = patchIndex[p0->patchID];
1367  const int pi1 = patchIndex[p1->patchID];
1368  const int pi2 = patchIndex[p2->patchID];
1369  const int pi3 = patchIndex[p3->patchID];
1370  const int l0 = src[ituple].localIndex[0];
1371  const int l1 = src[ituple].localIndex[1];
1372  const int l2 = src[ituple].localIndex[2];
1373  const int l3 = src[ituple].localIndex[3];
1374  dst[ituple].i = l0 + patches[pi0].atomStart;
1375  dst[ituple].j = l1 + patches[pi1].atomStart;
1376  dst[ituple].k = l2 + patches[pi2].atomStart;
1377  dst[ituple].l = l3 + patches[pi3].atomStart;
1378  // TODO: is this correct?
1379  const TholeValue * const(&value)(src[ituple].value);
1380  if (value == NULL) {
1381  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
1382  }
1383  dst[ituple].aa = value->aa;
1384  dst[ituple].qq = value->qq;
1385  // dst[ituple].itype = (src[ituple].value - thole_array);
1386  const Position position0 = p0->x[l0].position;
1387  const Position position1 = p1->x[l1].position;
1388  const Position position2 = p2->x[l2].position;
1389  const Position position3 = p3->x[l3].position;
1390  Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
1391  Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
1392  Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
1393  Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
1394  if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
1395  if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
1396  if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1397  if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
1398  dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
1399  dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
1400  dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
1401  dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
1402  if (hostAlchFlags.alchOn) {
1403  // THIS ASSUMES THAT AN ATOM AND ITS DRUDE PARTICLE ARE ALWAYS IN THE SAME
1404  // PARTITION!
1405  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1406  const Molecule *mol = Node::Object()->molecule;
1407  int typeSum = 0;
1408  typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
1409  typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
1410  const int order = (!params->alchDecouple ? 3 : 2);
1411  if (typeSum == 0 || abs(typeSum) == order) dst[ituple].fepBondedType = 0;
1412  else if (0 < typeSum && typeSum < order) dst[ituple].fepBondedType = 1;
1413  else if (-order < typeSum && typeSum < 0) dst[ituple].fepBondedType = 2;
1414  else dst[ituple].fepBondedType = 0;
1415  } else {
1416  dst[ituple].fepBondedType = 0;
1417  }
1418  }
1419 }
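// A Thole tuple holds two atom/Drude pairs (atoms 0-1 and atoms 2-3); the four offsets cover
// the screened dipole pairs atom_i/atom_j, atom_i/drude_j, drude_i/atom_j and drude_i/drude_j,
// matching the offset_aiaj ... offset_didj fields filled above.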
1420 
1421 #ifdef NODEGROUP_FORCE_REGISTER
1422 template <>
1423 void ComputeBondedCUDA::copyTupleToStage(const TholeElem& src,
1424  const TholeValue* __restrict__ thole_array, CudaTholeStage& dstval) {
1425  auto p0 = src.p[0];
1426  auto p1 = src.p[1];
1427  auto p2 = src.p[2];
1428  auto p3 = src.p[3];
1429  // int pi0 = patchIndex[p0->patchID];
1430  // int pi1 = patchIndex[p1->patchID];
1431  // int pi2 = patchIndex[p2->patchID];
1432  // int pi3 = patchIndex[p3->patchID];
1433  int l0 = src.localIndex[0];
1434  int l1 = src.localIndex[1];
1435  int l2 = src.localIndex[2];
1436  int l3 = src.localIndex[3];
1437  dstval.patchIDs[0] = p0->patchID;
1438  dstval.patchIDs[1] = p1->patchID;
1439  dstval.patchIDs[2] = p2->patchID;
1440  dstval.patchIDs[3] = p3->patchID;
1441  dstval.index[0] = l0;
1442  dstval.index[1] = l1;
1443  dstval.index[2] = l2;
1444  dstval.index[3] = l3;
1445  const TholeValue * const(&value)(src.value);
1446  if (value == NULL) {
1447  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
1448  }
1449  dstval.aa = value->aa;
1450  dstval.qq = value->qq;
1451  if (hostAlchFlags.alchOn) {
1452  // THIS ASSUMES THAT AN ATOM AND ITS DRUDE PARTICLE ARE ALWAYS IN THE SAME
1453  // PARTITION!
1454  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1455  const Molecule *mol = Node::Object()->molecule;
1456  int typeSum = 0;
1457  typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
1458  typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
1459  const int order = (!params->alchDecouple ? 3 : 2);
1460  if (typeSum == 0 || abs(typeSum) == order) dstval.fepBondedType = 0;
1461  else if (0 < typeSum && typeSum < order) dstval.fepBondedType = 1;
1462  else if (-order < typeSum && typeSum < 0) dstval.fepBondedType = 2;
1463  else dstval.fepBondedType = 0;
1464  } else {
1465  dstval.fepBondedType = 0;
1466  }
1467 }
1468 #endif
1469 
1470 void ComputeBondedCUDA::copyAnisoData(
1471  const int ntuples, const AnisoElem* __restrict src,
1472  const AnisoValue* __restrict aniso_array, CudaAniso* __restrict dst) {
1473  const PatchMap* patchMap = PatchMap::Object();
1474  for (int ituple=0;ituple < ntuples;ituple++) {
1475  const auto p0 = src[ituple].p[0];
1476  const auto p1 = src[ituple].p[1];
1477  const auto p2 = src[ituple].p[2];
1478  const auto p3 = src[ituple].p[3];
1479  const int pi0 = patchIndex[p0->patchID];
1480  const int pi1 = patchIndex[p1->patchID];
1481  const int pi2 = patchIndex[p2->patchID];
1482  const int pi3 = patchIndex[p3->patchID];
1483  const int l0 = src[ituple].localIndex[0];
1484  const int l1 = src[ituple].localIndex[1];
1485  const int l2 = src[ituple].localIndex[2];
1486  const int l3 = src[ituple].localIndex[3];
1487  dst[ituple].i = l0 + patches[pi0].atomStart;
1488  // Atom i's Drude particle, assumed to be in the same patch as atom i
1489  dst[ituple].j = l0 + 1 + patches[pi0].atomStart;
1490  dst[ituple].l = l1 + patches[pi1].atomStart;
1491  dst[ituple].m = l2 + patches[pi2].atomStart;
1492  dst[ituple].n = l3 + patches[pi3].atomStart;
1493  const AnisoValue * const(&value)(src[ituple].value);
1494  if (value == NULL) {
1495  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyAnisoData.\n");
1496  }
1497  dst[ituple].kpar0 = 2.0 * value->k11;
1498  dst[ituple].kperp0 = 2.0 * value->k22;
1499  dst[ituple].kiso0 = 2.0 * value->k33;
1500  const Position position0 = p0->x[l0].position;
1501  const Position position1 = p1->x[l1].position;
1502  const Position position2 = p2->x[l2].position;
1503  const Position position3 = p3->x[l3].position;
1504  Vector shiftVec_il = lattice.wrap_delta_scaled(position0, position1);
1505  Vector shiftVec_mn = lattice.wrap_delta_scaled(position2, position3);
1506  if (pi0 != pi1) shiftVec_il += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1507  if (pi2 != pi3) shiftVec_mn += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
1508  dst[ituple].offset_il = make_float3((float)shiftVec_il.x, (float)shiftVec_il.y, (float)shiftVec_il.z);
1509  dst[ituple].offset_mn = make_float3((float)shiftVec_mn.x, (float)shiftVec_mn.y, (float)shiftVec_mn.z);
1510  if (hostAlchFlags.alchOn) {
1511  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1512  const Molecule& mol = *(Node::Object()->molecule);
1513  dst[ituple].fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1514  } else {
1515  dst[ituple].fepBondedType = 0;
1516  }
1517  }
1518 }
1519 
1520 #ifdef NODEGROUP_FORCE_REGISTER
1521 template<>
1522 void ComputeBondedCUDA::copyTupleToStage(const AnisoElem& src,
1523  const AnisoValue* __restrict__ p_array, CudaAnisoStage& dstval) {
1524  auto p0 = src.p[0];
1525  auto p1 = src.p[1];
1526  auto p2 = src.p[2];
1527  auto p3 = src.p[3];
1528  int l0 = src.localIndex[0];
1529  int l1 = src.localIndex[1];
1530  int l2 = src.localIndex[2];
1531  int l3 = src.localIndex[3];
1532  dstval.patchIDs[0] = p0->patchID;
1533  dstval.patchIDs[1] = p0->patchID;
1534  dstval.patchIDs[2] = p1->patchID;
1535  dstval.patchIDs[3] = p2->patchID;
1536  dstval.patchIDs[4] = p3->patchID;
1537  dstval.index[0] = l0;
1538  dstval.index[1] = l0 + 1;
1539  dstval.index[2] = l1;
1540  dstval.index[3] = l2;
1541  dstval.index[4] = l3;
1542  const AnisoValue * const(&value)(src.value);
1543  if (value == NULL) {
1544  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTupleToStage (AnisoElem).\n");
1545  }
1546  dstval.kpar0 = 2.0 * value->k11;
1547  dstval.kperp0 = 2.0 * value->k22;
1548  dstval.kiso0 = 2.0 * value->k33;
1549  if (hostAlchFlags.alchOn) {
1550  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1551  const Molecule& mol = *(Node::Object()->molecule);
1552  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1553  } else {
1554  dstval.fepBondedType = 0;
1555  }
1556 }
1557 #endif
1558 
1559 void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
1560  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
1561  c->tupleCopyWorker(first, last);
1562 }
1563 
1564 void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
1565  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
1566  c->tupleCopyWorkerExcl(first, last);
1567 }
1568 
1569 void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {
1570 
1571  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1572  // Separate exclusions into modified, and non-modified
1573  auto itStart=std::next(tupleList[Tuples::EXCLUSION].begin(),first);
1574  // compute offset for this rank
1575  int64_t numModExclusionsBefore=0;
1576  int64_t numExclusionsBefore=0;
1577  // this is the most painful way to compute this, but should be correct
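  // (A possible alternative, sketched only: have the master PE precompute
  // per-list prefix counts of modified/non-modified exclusions once per
  // migration, so each worker can look up its start offsets directly instead
  // of rescanning every preceding tuple list.)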
1578  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != itStart;it++) {
1579  int ntuples = (*it)->getNumTuples();
1580  auto thisExcl = (ExclElem *)(*it)->getTupleList();
1581  for (int ituple=0;ituple < ntuples;ituple++) {
1582  if(thisExcl[ituple].modified)
1583  numModExclusionsBefore++;
1584  else
1585  numExclusionsBefore++;
1586  }
1587  }
1588 
1589  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
1590  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
1591  int numExclusionsWork=last-first;
1592  auto itEnd=std::next(itStart,numExclusionsWork+1);
1593  for ( auto it=itStart;it != itEnd;it++) {
1594  int ntuples = (*it)->getNumTuples();
1595  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
1596  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
1597  }
1598  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1599 }
1600 
1601 void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
1602 
1603  if (first == -1) {
1604  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1605  // Separate exclusions into modified, and non-modified
1606  int64_t pos = exclusionStartPos;
1607  int64_t pos2 = exclusionStartPos2;
1608  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
1609  int ntuples = (*it)->getNumTuples();
1610  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
1611  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
1612  }
1613  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1614  first = 0;
1615  }
1616  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1617  // JM: Move the switch statement outside and do the for loop inside the cases
1618  for (int i=first;i <= last;i++) {
1619  switch (tupleCopyWorkList[i].tupletype) {
1620 
1621  case Tuples::BOND:
1622  {
1623 #if 1
1624  copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
1625  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1626 #else
1627  copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
1628  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1629 #endif
1630  }
1631  break;
1632 
1633  case Tuples::ANGLE:
1634  {
1635  copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
1636  Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1637  }
1638  break;
1639 
1640  case Tuples::DIHEDRAL:
1641  {
1642 
1643 #if 1
1644  copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
1645  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
1646  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1647 #else
1648  copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
1649  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
1650  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1651 #endif
1652  }
1653  break;
1654 
1655  case Tuples::IMPROPER:
1656  {
1657  copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
1658  (ImproperElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->improper_array,
1659  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1660  }
1661  break;
1662 
1663  case Tuples::CROSSTERM:
1664  {
1665  copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
1666  Node::Object()->parameters->crossterm_array, (CudaCrossterm *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1667  }
1668  break;
1669 
1670  case Tuples::THOLE:
1671  {
1672  copyTholeData(tupleCopyWorkList[i].ntuples, (TholeElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaThole *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1673  }
1674  break;
1675 
1676  case Tuples::ANISO:
1677  {
1678  copyAnisoData(tupleCopyWorkList[i].ntuples, (AnisoElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaAniso*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1679  }
1680  break;
1681 
1682  default:
1683  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
1684  break;
1685  }
1686  }
1687  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1688 }
1689 
1690 
1691 #ifdef NODEGROUP_FORCE_REGISTER
1692 
1693 void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {
1694  // Atomically compute the max number of tuples across all GPUs.
1695  // Each GPU will use this (times OVERALLOC) as its buffer size to keep things simple.
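 // Each counter is raised with a lock-free compare-and-swap max: retry
 // compare_exchange_strong until either the stored value is already >= the
 // local count or the local count has been installed. On failure the
 // *_Test variable is refreshed with the currently stored value, so the loop
 // condition always re-checks against the latest maximum.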
1696 
1697  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1698  PatchData *patchData = cpdata.ckLocalBranch();
1699 
1700  // Update bonds
1701  int numBondsTest = patchData->maxNumBonds.load();
1702  while (numBondsTest < counts.bond &&
1703  !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));
1704 
1705  int numAnglesTest = patchData->maxNumAngles.load();
1706  while (numAnglesTest < counts.angle &&
1707  !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));
1708 
1709  int numDihedralsTest = patchData->maxNumDihedrals.load();
1710  while (numDihedralsTest < counts.dihedral &&
1711  !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));
1712 
1713  int numImpropersTest = patchData->maxNumImpropers.load();
1714  while (numImpropersTest < counts.improper &&
1715  !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));
1716 
1717  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
1718  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
1719  !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));
1720 
1721  int numExclusionsTest = patchData->maxNumExclusions.load();
1722  while (numExclusionsTest < counts.exclusion &&
1723  !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));
1724 
1725  int numCrosstermsTest = patchData->maxNumCrossterms.load();
1726  while (numCrosstermsTest < counts.crossterm &&
1727  !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
1728 
1729 
1730 }
1731 
1732 TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
1733  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1734  PatchData *patchData = cpdata.ckLocalBranch();
1735 
1736  TupleCounts counts{0};
1737 
1738  counts.bond = patchData->maxNumBonds.load();
1739  counts.angle = patchData->maxNumAngles.load();
1740  counts.dihedral = patchData->maxNumDihedrals.load();
1741  counts.improper = patchData->maxNumImpropers.load();
1742  counts.modifiedExclusion = patchData->maxNumModifiedExclusions.load();
1743  counts.exclusion = patchData->maxNumExclusions.load();
1744  counts.crossterm = patchData->maxNumCrossterms.load();
1745 
1746 #if 0
1747  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",
1748  CkMyPe(), counts.bond, counts.angle, counts.dihedral, counts.improper, counts.modifiedExclusion,
1749  counts.exclusion, counts.crossterm);
1750 #endif
1751 
1752  return counts;
1753 }
1754 
1755 
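// Migrates bonded tuples between devices after atom migration. The master PE
// recomputes each tuple's destination device/patch, grows the staging buffers
// when the node-wide maximum tuple counts increased (re-registering the P2P
// pointers if so), performs the device-to-device tuple migration, and finally
// updates the tuple indices against the new atom order. On startup most of
// these steps are skipped; only the patch-data copy and the final tuple
// update run.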
1796 void ComputeBondedCUDA::migrateTuples(bool startup) {
1797 
1798  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1799  PatchData *patchData = cpdata.ckLocalBranch();
1801  const int devInd = deviceCUDA->getDeviceIndex();
1802 
1803  bool MGPU = deviceCUDA->getNumDevice() > 1;
1804  bool amMaster = (masterPe == CkMyPe());
1806 
1807 
1808  if (amMaster) {
1809  cudaStreamSynchronize(stream);
1810  copyPatchData();
1811  cudaStreamSynchronize(stream);
1812  }
1814 
1815  if (amMaster) {
1816  if (!startup) {
1817  PatchMap* patchMap = PatchMap::Object();
1818  const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];
1819  // Update the device of the tuples
1820  // DMC this also updates the patches of the atoms, so it needs to be done in the single GPU case too
1821 
1822  TupleCounts counts = bondedKernel.getTupleCounts();
1823  migrationKernel.computeTupleDestination(
1824  devInd,
1825  counts,
1826  patchData->devData[devInd].numPatchesHome,
1827  migrationDestination,
1828  patchData->devData[devInd].d_patchToDeviceMap,
1829  patchData->devData[devInd].d_globalToLocalID,
1830  patchMap->getADim(), patchMap->getBDim(),
1831  patchMap->getCMaxIndex(), patchMap->getBMaxIndex(), patchMap->getAMaxIndex(),
1832  stream
1833  );
1834  cudaCheck(cudaStreamSynchronize(stream));
1835  }
1836  }
1838 
1839  if (amMaster) {
1840  if (!startup) {
1841  migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
1842  cudaCheck(cudaStreamSynchronize(stream));
1843  }
1844  }
1846 
1847  TupleCounts local, newMax;
1848  bool realloc = false;
1849 
1850  if (amMaster && !startup) {
1851  migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);
1852 
1853  local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
1854  updateMaxTupleCounts(local);
1855 #if 0
1856  CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",
1857  CkMyPe(), local.bond, local.angle, local.dihedral, local.improper, local.modifiedExclusion,
1858  local.exclusion, local.crossterm);
1859 #endif
1860  }
1862  if (amMaster && !startup) {
1863  // Only multi-GPU needs to check for reallocation
1864  if (MGPU) {
1865  newMax = getMaxTupleCounts();
1866  realloc = migrationKernel.reallocateBufferDst(newMax);
1867  patchData->tupleReallocationFlagPerDevice[devInd] = realloc;
1868  }
1869  }
1871 
1872  //
1873  // Non-master PEs need to see if reallocation is happening
1874  // They can all use the 0th GPU flag since the GPUs reallocate
1875  // based on the max
1876  if (!amMaster) {
1877  realloc = patchData->tupleReallocationFlagPerDevice[0];
1878  }
1879 
1880  // If reallocated, then we need to register the P2P pointers.
1881  // Since the GPUs size their buffers from the max tuple counts,
1882  // if one GPU reallocated, then they all did.
1883 
1884  if (realloc) {
1885  if (amMaster) {
1886  registerPointersToHost();
1887  }
1889 
1890  if (amMaster) {
1891  copyHostRegisterToDevice();
1892  cudaCheck(cudaStreamSynchronize(stream));
1893  }
1895  }
1896 
1897  if (amMaster && !startup) {
1898  // Moves the tuples to the dst buffer on all GPUS
1899  migrationKernel.performTupleMigration(
1900  bondedKernel.getTupleCounts(),
1901  stream
1902  );
1903  cudaCheck(cudaStreamSynchronize(stream));
1904  }
1905  syncColl->barrier(SynchronousCollectiveScope::all); // DMC believes we need a barrier here because of the reallocation
1906 
1907  if (amMaster && !startup) {
1908  // If we previously reallocated, we need to reallocate the other buffers too.
1909  // The regular tuples must be "reallocated" every time so that the offsets into
1910  // the unified buffer can be updated. The unified buffer made sense when we
1911  // were copying the tuples from the host on each migration, but I don't think
1912  // it still makes sense to keep, since it complicates reallocation
1913  if (MGPU) {
1914  bondedKernel.reallocateTupleBuffer(newMax, stream);
1915  migrationKernel.reallocateBufferSrc(newMax);
1916  cudaCheck(cudaStreamSynchronize(stream));
1917  }
1918  }
1920 
1921  // Update the local tuple count and copy the migrated tuples back to the
1922  // src buffer
1923  if (amMaster) {
1924  if (!startup) {
1925  bondedKernel.setTupleCounts(local);
1926  }
1927 
1928  const int* ids = patchData->h_soa_id[devInd];
1929  migrationKernel.updateTuples(
1930  bondedKernel.getTupleCounts(),
1931  bondedKernel.getData(),
1932  ids,
1933  d_patchRecord,
1934  d_patchMapCenter,
1935  bondedKernel.getAtomBuffer(),
1936  lattice,
1937  stream
1938  );
1939  cudaCheck(cudaDeviceSynchronize());
1940  }
1941 }
1942 
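// Sorts a staged tuple vector by each tuple's downstream home patch (the
// most-downstream patch among all patches the tuple touches) and fills the
// per-patch tuple counts and exclusive prefix-sum offsets used when copying
// the staged tuples to the device.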
1949 template<typename T>
1950 void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {
1951  PatchMap* patchMap = PatchMap::Object();
1952 
1953  std::vector<std::pair<int,int>> downstreamPatches;
1954  for (int i = 0; i < tuples.size(); i++) {
1955  int downstream = tuples[i].patchIDs[0];
1956  for (int j = 1; j < T::size; j++) {
1957  downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);
1958  }
1959  downstreamPatches.push_back(std::make_pair(i, downstream));
1960  tupleCounts[patchIndex[downstream]]++;
1961  }
1962 
1963  //
1964  // Compute the offset into the tuples based on patch
1965  // TODO replace with STD prefix sum
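  // e.g. with <numeric> (C++17), since tupleOffsets has tupleCounts.size()+1 entries:
  //   tupleOffsets[0] = 0;
  //   std::inclusive_scan(tupleCounts.begin(), tupleCounts.end(), tupleOffsets.begin() + 1);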
1966  tupleOffsets[0] = 0;
1967  for (int i = 0; i < tupleCounts.size(); i++) {
1968  tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];
1969  }
1970 
1971  //
1972  // Sort tuples based on home patch
1973  //
1974  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
1975  [](std::pair<int, int> a, std::pair<int, int> b) {
1976  return a.second < b.second;
1977  });
1978 
1979  //
1980  // Propagate order to tuples
1981  //
1982  std::vector<T> copy = tuples;
1983  for (int i = 0; i < tuples.size(); i++) {
1984  tuples[i] = copy[downstreamPatches[i].first];
1985  }
1986 }
1987 
1988 void ComputeBondedCUDA::sortAndCopyToDevice() {
1989  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1990  PatchData *patchData = cpdata.ckLocalBranch();
1991  const int devInd = deviceCUDA->getDeviceIndex();
1992  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;
1993 
1994  TupleDataStage h_dataStage;
1995  TupleIntArrays h_counts;
1996  TupleIntArrays h_offsets;
1997 
1998  //
1999  // These cannot be declared within the macro because then they
2000  // would be out of scope
2001  //
2002  std::vector<int> bondPatchCounts(numPatchesHome, 0);
2003  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);
2004 
2005  std::vector<int> anglePatchCounts(numPatchesHome, 0);
2006  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);
2007 
2008  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
2009  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);
2010 
2011  std::vector<int> improperPatchCounts(numPatchesHome, 0);
2012  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);
2013 
2014  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
2015  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);
2016 
2017  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
2018  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);
2019 
2020  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
2021  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);
2022 
2023  #define CALL(fieldName) do { \
2024  sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
2025  h_dataStage.fieldName = fieldName##TupleData.data(); \
2026  h_counts.fieldName = fieldName##PatchCounts.data(); \
2027  h_offsets.fieldName = fieldName##PatchOffsets.data(); \
2028  } while(0);
2029 
2030  CALL(bond);
2031  CALL(angle);
2032  CALL(dihedral);
2033  CALL(improper);
2034  CALL(modifiedExclusion);
2035  CALL(exclusion);
2036  CALL(crossterm);
2037 
2038  #undef CALL
2039 
2040  migrationKernel.copyTupleToDevice(
2041  bondedKernel.getTupleCounts(),
2042  numPatchesHome,
2043  h_dataStage,
2044  h_counts,
2045  h_offsets,
2046  stream
2047  );
2048 
2049  cudaCheck(cudaStreamSynchronize(stream));
2050 }
2051 
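// Gathers all tuples of the given type from the per-PE tuple lists into the
// corresponding host staging vector (Cuda*Stage format), separating
// exclusions into modified and non-modified, and reserves warp-aligned
// capacity for each vector. Called by copyTupleDataGPU() on the
// device-migration path.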
2059 void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {
2060  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
2061 
2062  switch (tupletype) {
2063  case Tuples::EXCLUSION:
2064  {
2065  // Separate exclusions into modified, and non-modified
2066  modifiedExclusionTupleData.clear();
2067  exclusionTupleData.clear();
2068  int64_t pos = exclusionStartPos;
2069  int64_t pos2 = exclusionStartPos2;
2070  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
2071  int ntuples = (*it)->getNumTuples();
2072  copyExclusionDataStage(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSizeStage[Tuples::EXCLUSION],
2073  modifiedExclusionTupleData, exclusionTupleData, pos, pos2);
2074  }
2075  // Reserve warp-aligned capacity for the copy.
2076  // TODO DMC is this necessary? Why are we copying the padded part too?
2077  modifiedExclusionTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(modifiedExclusionTupleData.size()));
2078  exclusionTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(exclusionTupleData.size()));
2079  }
2080  break;
2081 
2082  case Tuples::BOND:
2083  {
2084  bondTupleData.clear();
2085  for (auto it = tupleList[Tuples::BOND].begin();it != tupleList[Tuples::BOND].end();it++) {
2086  int ntuples = (*it)->getNumTuples();
2087  BondElem* elemList = (BondElem *)(*it)->getTupleList();
2088  copyToStage<BondElem, BondValue, CudaBondStage>(
2089  ntuples, elemList, Node::Object()->parameters->bond_array, bondTupleData
2090  );
2091  }
2092 
2093  bondTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(bondTupleData.size()));
2094  }
2095  break;
2096 
2097  case Tuples::ANGLE:
2098  {
2099  angleTupleData.clear();
2100  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2101  int ntuples = (*it)->getNumTuples();
2102  AngleElem* elemList = (AngleElem *)(*it)->getTupleList();
2103  copyToStage<AngleElem, AngleValue, CudaAngleStage>(ntuples, elemList, Node::Object()->parameters->angle_array, angleTupleData);
2104  }
2105 
2106  angleTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(angleTupleData.size()));
2107  }
2108  break;
2109 
2110  case Tuples::DIHEDRAL:
2111  {
2112  dihedralTupleData.clear();
2113  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2114  int ntuples = (*it)->getNumTuples();
2115  DihedralElem* elemList = (DihedralElem *)(*it)->getTupleList();
2116  copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList,
2117  Node::Object()->parameters->dihedral_array, dihedralTupleData);
2118  }
2119 
2120  dihedralTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(dihedralTupleData.size()));
2121  }
2122  break;
2123 
2124  case Tuples::IMPROPER:
2125  {
2126  improperTupleData.clear();
2127  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2128  int ntuples = (*it)->getNumTuples();
2129  ImproperElem* elemList = (ImproperElem *)(*it)->getTupleList();
2130  copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList,
2131  Node::Object()->parameters->improper_array, improperTupleData);
2132  }
2133 
2134  improperTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(improperTupleData.size()));
2135  }
2136  break;
2137 
2138  case Tuples::CROSSTERM:
2139  {
2140  crosstermTupleData.clear();
2141  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2142  int ntuples = (*it)->getNumTuples();
2143  CrosstermElem* elemList = (CrosstermElem *)(*it)->getTupleList();
2144  copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList,
2145  Node::Object()->parameters->crossterm_array, crosstermTupleData);
2146  }
2147 
2148  crosstermTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(crosstermTupleData.size()));
2149  }
2150  break;
2151 
2152  case Tuples::THOLE:
2153  {
2154  tholeTupleData.clear();
2155  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2156  const int ntuples = (*it)->getNumTuples();
2157  TholeElem* elemList = (TholeElem*)(*it)->getTupleList();
2158  copyToStage<TholeElem, TholeValue, CudaTholeStage>(ntuples, elemList, nullptr, tholeTupleData);
2159  }
2160  tholeTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(tholeTupleData.size()));
2161  }
2162  break;
2163 
2164  case Tuples::ANISO:
2165  {
2166  anisoTupleData.clear();
2167  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2168  const int ntuples = (*it)->getNumTuples();
2169  AnisoElem* elemList = (AnisoElem*)(*it)->getTupleList();
2170  copyToStage<AnisoElem, AnisoValue, CudaAnisoStage>(ntuples, elemList, nullptr, anisoTupleData);
2171  }
2172  anisoTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(anisoTupleData.size()));
2173  }
2174  break;
2175 
2176  default:
2177  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
2178  break;
2179  }
2180 
2181  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
2182 }
2183 #endif // NODEGROUP_FORCE_REGISTER
2184 
2185 //
2186  // Copies tuple data from individual buffers into a single contiguous buffer
2187 // NOTE: This is done on the master PE
2188 //
2189 
2190 void ComputeBondedCUDA::copyTupleData() {
2191 
2192  PatchMap* patchMap = PatchMap::Object();
2193 
2194  // Count the number of exclusions
2195  int numModifiedExclusions = 0;
2196  int numExclusions = 0;
2197  for (int i=0;i < numExclPerRank.size();i++) {
2198  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2199  numExclusions += numExclPerRank[i].numExclusions;
2200  }
2201  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2202  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2203 
2204  // Count the number of tuples for each type
2205  int64_t posWA = 0;
2206  exclusionStartPos = 0;
2207  exclusionStartPos2 = 0;
2208  tupleCopyWorkList.clear();
2209  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2210  // Take temporary position
2211  int64_t pos = posWA;
2212  if (tupletype == Tuples::EXCLUSION) {
2213  exclusionStartPos = pos;
2214  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
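  // Layout of the exclusion block in tupleData: a warp-aligned run of modified
  // exclusions starting at exclusionStartPos, followed by the regular
  // exclusions starting at exclusionStartPos2.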
2215  }
2216  // Count for total number of tuples for this tupletype
2217  int num = 0;
2218  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2219  int ntuples = (*it)->getNumTuples();
2220  num += ntuples;
2221  if (tupletype != Tuples::EXCLUSION) {
2222  TupleCopyWork tupleCopyWork;
2223  tupleCopyWork.tupletype = tupletype;
2224  tupleCopyWork.ntuples = ntuples;
2225  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2226  tupleCopyWork.tupleDataPos = pos;
2227  // XXX NOTE: redundant copy happening here
2228  tupleCopyWorkList.push_back(tupleCopyWork);
2229  pos += ntuples*CudaTupleTypeSize[tupletype];
2230  }
2231  }
2232  numTuplesPerType[tupletype] = num;
2233  //
2234  if (tupletype == Tuples::EXCLUSION) {
2235  // Warp-align exclusions separately
2236  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
2237  } else {
2238  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2239  }
2240  }
2241  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2242  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2243  }
2244 
2245  // Set flags for finishPatchesOnPe
2246  hasExclusions = (numExclusions > 0);
2247  hasModifiedExclusions = (numModifiedExclusions > 0);
2248 
2249  // Re-allocate storage as needed
2250  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2251  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2252 
2253 #if CMK_SMP && USE_CKLOOP
2254  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2255  if (useCkLoop >= 1) {
2256 #define CKLOOP_EXCLUSIONS 1
2257 #if CKLOOP_EXCLUSIONS
2258  CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);
2259  // pass 0 instead of -1 so tupleCopyWorker doesn't do exclusions
2260  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);
2261 #else
2262  // JM NOTE: What this does is divide the work into chunks of CkMyNodeSize to be scheduled
2263  // between PEs. To circumvent this, I have to call it by hand on all PEs
2264  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
2265 #endif
2266  } else
2267 
2268 #endif
2269  {
2270 
2271  tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
2272  }
2273 
2274  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
2275  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
2276  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
2277  numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO], tupleData, stream);
2278 
2279  // Re-allocate forces
2280  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2281  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2282 }
2283 
2284 // JM: Single node version to bypass CkLoop
2285 #ifdef NODEGROUP_FORCE_REGISTER
2286 
2287 void ComputeBondedCUDA::copyTupleDataSN() {
2288 
2289  PatchMap* patchMap = PatchMap::Object();
2290  size_t numExclusions, numModifiedExclusions, copyIndex;
2292 
2293  // WORK TO BE DONE BY THE MASTERPE
2294  if(masterPe == CkMyPe()){
2295  numModifiedExclusions = 0;
2296  numExclusions = 0;
2297  for (int i=0;i < numExclPerRank.size();i++) {
2298  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2299  numExclusions += numExclPerRank[i].numExclusions;
2300  }
2301  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2302  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2303 
2304  // Count the number of tuples for each type
2305  int64_t posWA = 0;
2306  exclusionStartPos = 0;
2307  exclusionStartPos2 = 0;
2308  tupleCopyWorkList.clear();
2309  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2310  // Take temporary position
2311  int64_t pos = posWA;
2312  if (tupletype == Tuples::EXCLUSION) {
2313  exclusionStartPos = pos;
2314  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
2315  }
2316  // Count for total number of tuples for this tupletype
2317  int num = 0;
2318  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2319  int ntuples = (*it)->getNumTuples();
2320  num += ntuples;
2321  if (tupletype != Tuples::EXCLUSION) {
2322  TupleCopyWork tupleCopyWork;
2323  tupleCopyWork.tupletype = tupletype;
2324  tupleCopyWork.ntuples = ntuples;
2325  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2326  tupleCopyWork.tupleDataPos = pos;
2327  tupleCopyWorkList.push_back(tupleCopyWork);
2328  pos += ntuples*CudaTupleTypeSize[tupletype];
2329  }
2330  }
2331  numTuplesPerType[tupletype] = num;
2332  //
2333  if (tupletype == Tuples::EXCLUSION) {
2334  // Warp-align exclusions separately
2335  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
2336  } else {
2337  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2338  }
2339  }
2340  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2341  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2342  }
2343 
2344  // Set flags for finishPatchesOnPe
2345  hasExclusions = (numExclusions > 0);
2346  hasModifiedExclusions = (numModifiedExclusions > 0);
2347 
2348  // Re-allocate storage as needed
2349  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2350  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2351  }
2352 
2354 
2355 #if 0
2356  //int chunkSize = (tupleCopyWorkList.size()) /CkMyNodeSize() +
2357  // (CkMyPe() == CkMyNodeSize()-1) * tupleCopyWorkList.size() % CkMyNodeSize();
2358  //int first = ((CkMyNodeSize() -1) - CkMyPe())*chunkSize -1*(CkMyPe() == CkMyNodeSize()-1);
2359  //int last = (first + chunkSize) -1*(!CkMyPe());; // chunksize is padded for the last pe
2360  this->tupleCopyWorker(first, last);
2361 #else
2362  // JM: This probably leads to better load balance than the option above.
2363  // XXX TODO: Improve this further afterwards. I don't think this scales too well
2364  // due to lots of atomics, but you're not supposed to run this with
2365  // lots of PEs anyway
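  // Dynamic work distribution: each PE atomically grabs the next index from
  // tupleWorkIndex. Index 0 maps to tupleCopyWorker(-1,-1), which handles the
  // exclusions; index k >= 1 copies work-list entry k-1.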
2366  while( (copyIndex = tupleWorkIndex.fetch_add(1) ) <= tupleCopyWorkList.size()){
2367  this->tupleCopyWorker(copyIndex -1, copyIndex -1);
2368  }
2369 #endif
2371 
2372  if(masterPe == CkMyPe()){
2373  tupleWorkIndex.store(0);
2374  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
2375  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
2376  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
2377  numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
2378  tupleData, stream);
2379 
2380  // Re-allocate forces
2381  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2382  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2383  }
2384 }
2385 
2386 
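// GPU-resident variant of copyTupleData() used with device migration. On
// startup the master PE counts and stages all tuples on the host, sizes the
// device buffers from the node-wide maximum tuple counts, registers the
// peer-to-peer pointers, and copies the sorted tuples to the device. On later
// calls the tuples already live on the device, so only migrateTuples() and the
// per-type count / force-buffer bookkeeping below are needed.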
2394 void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {
2395 
2396  PatchMap* patchMap = PatchMap::Object();
2398  size_t numExclusions, numModifiedExclusions, copyIndex;
2399 
2400  // WORK TO BE DONE BY THE MASTERPE
2401  if (startup) {
2402  // The atom map is no longer needed, so unmap the atoms. We need to do this in case
2403  // multiple run commands are issued. It is safe to do this because there is a node
2404  // barrier between loadTuplesOnPe and this function
2405  unmapAtoms();
2406 
2407  if(masterPe == CkMyPe()){
2408  numModifiedExclusions = 0;
2409  numExclusions = 0;
2410  for (int i=0;i < numExclPerRank.size();i++) {
2411  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2412  numExclusions += numExclPerRank[i].numExclusions;
2413  }
2414  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2415  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2416 
2417  // Count the number of tuples for each type
2418  size_t posWA = 0;
2419  exclusionStartPos = 0;
2420  exclusionStartPos2 = 0;
2421  tupleCopyWorkList.clear();
2422  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2423  // Take temporary position
2424  size_t pos = posWA;
2425  if (tupletype == Tuples::EXCLUSION) {
2426  exclusionStartPos = pos;
2427  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];
2428  }
2429  // Count for total number of tuples for this tupletype
2430  int num = 0;
2431  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2432  int ntuples = (*it)->getNumTuples();
2433  num += ntuples;
2434  if (tupletype != Tuples::EXCLUSION) {
2435  TupleCopyWork tupleCopyWork;
2436  tupleCopyWork.tupletype = tupletype;
2437  tupleCopyWork.ntuples = ntuples;
2438  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2439  tupleCopyWork.tupleDataPos = pos;
2440  tupleCopyWorkList.push_back(tupleCopyWork);
2441  pos += ntuples*CudaTupleTypeSizeStage[tupletype];
2442  }
2443  }
2444  numTuplesPerType[tupletype] = num;
2445  //
2446  if (tupletype == Tuples::EXCLUSION) {
2447  // Warp-align exclusions separately
2448  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];
2449  } else {
2450  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2451  }
2452  }
2453  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2454  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2455  }
2456 
2457  // Set flags for finishPatchesOnPe
2458  hasExclusions = (numExclusions > 0);
2459  hasModifiedExclusions = (numModifiedExclusions > 0);
2460 
2461  // Re-allocate storage as needed
2462  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2463  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2464 
2465 
2466  TupleCounts local;
2467  local.bond = numTuplesPerType[Tuples::BOND];
2468  local.angle = numTuplesPerType[Tuples::ANGLE];
2469  local.dihedral = numTuplesPerType[Tuples::DIHEDRAL];
2470  local.improper = numTuplesPerType[Tuples::IMPROPER];
2471  local.modifiedExclusion = numModifiedExclusions;
2472  local.exclusion = numExclusions;
2473  local.crossterm = numTuplesPerType[Tuples::CROSSTERM];
2474  local.thole = numTuplesPerType[Tuples::THOLE];
2475  local.aniso = numTuplesPerType[Tuples::ANISO];
2476 
2477  updateMaxTupleCounts(local);
2478  bondedKernel.setTupleCounts(local);
2479  }
2480 
2481  // Wait for tuple counts to be updated...
2483 
2484  if(masterPe == CkMyPe()){
2485  TupleCounts newMax = getMaxTupleCounts();
2486  migrationKernel.reallocateBufferSrc(newMax);
2487  migrationKernel.reallocateBufferDst(newMax);
2488  bondedKernel.reallocateTupleBuffer(newMax, stream);
2489  registerPointersToHost();
2490  }
2491 
2492  // Node barrier is because of registering
2494 
2495  if(masterPe == CkMyPe()){
2496  copyHostRegisterToDevice();
2497  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2498  this->tupleCopyWorkerType(tupletype);
2499  }
2500  sortAndCopyToDevice();
2501  }
2502  }
2503 
2504  migrateTuples(startup);
2505 
2506  if(masterPe == CkMyPe()){
2507  TupleCounts count = bondedKernel.getTupleCounts();
2508  numTuplesPerType[Tuples::BOND] = count.bond;
2509  numTuplesPerType[Tuples::ANGLE] = count.angle;
2510  numTuplesPerType[Tuples::DIHEDRAL] = count.dihedral;
2511  numTuplesPerType[Tuples::IMPROPER] = count.improper;
2512  numModifiedExclusions = count.modifiedExclusion;
2513  numExclusions = count.exclusion;
2514  numTuplesPerType[Tuples::CROSSTERM] = count.crossterm;
2515  numTuplesPerType[Tuples::THOLE] = count.thole;
2516  numTuplesPerType[Tuples::ANISO] = count.aniso;
2517 
2518  // Set flags for finishPatchesOnPe
2519  hasExclusions = (numExclusions > 0);
2520  hasModifiedExclusions = (numModifiedExclusions > 0);
2521 
2522  // Re-allocate forces
2523  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2524  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2525  }
2526 }
2527 
2528 #endif
2529 
2530 
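// Publishes this device's bonded patch data to the node-group PatchData:
// (re)allocates the device-side PatchRecord and patch-index arrays, copies the
// host-side patches[] and patchIndex[] into them, and records the force stride
// and (with device migration) the bonded atom buffer pointer for use by other
// objects on this node.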
2539 #ifdef NODEGROUP_FORCE_REGISTER
2540 void ComputeBondedCUDA::updatePatchRecords() {
2541  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2542  PatchData *patchData = cpdata.ckLocalBranch();
2543  int devInd = deviceCUDA->getDeviceIndex();
2544  PatchRecord **d_pr = &(patchData->devData[devInd].bond_pr);
2545  int **d_pid = &(patchData->devData[devInd].bond_pi);
2546  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
2547  size_t st_d_pr_size = (size_t)(patchData->devData[devInd].bond_pr_size);
2548  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
2549  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size()); // st_d_pr_size changed to patches.size() here
2550  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size); // copy updated value back into devData
2551  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
2552  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size); // copy updated value back into devData
2553  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
2554  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
2555  if (params->CUDASOAintegrate && params->useDeviceMigration) {
2556  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2557  }
2558 }
2559 #endif
2560 
2561 //
2562 // Launch work on GPU
2563 //
2564 void ComputeBondedCUDA::launchWork() {
2566  if (CkMyPe() != masterPe)
2567  NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
2568 
2569  cudaCheck(cudaSetDevice(deviceID));
2570  if (atomsChanged) {
2571  if(!params->CUDASOAintegrate) copyTupleData();
2572  // copyTupleData needs to be called before launchWork in that case
2573 
2574 // Move this once GPU migration has been merged
2575 #if 1
2576 #ifdef NODEGROUP_FORCE_REGISTER
2577  if(params->CUDASOAintegrate && !params->useDeviceMigration){
2578  updatePatchRecords();
2579  }
2580 #endif
2581 #endif
2582  }
2583 
2584  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
2585  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
2586  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
2587 
2588  int r2_delta_expc = 64 * (ComputeNonbondedUtil::r2_delta_exp - 127);
2589 
2590 #if 0
2591  if(!atomsChanged){
2592  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2593  PatchData *patchData = cpdata.ckLocalBranch();
2594 
2595  float4 *d_atoms = bondedKernel.getAtomBuffer();
2596  copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);
2597  if(!params->CUDASOAintegrate) CmiLock(printLock);
2598  else CmiLock(patchData->printlock);
2599 
2600  fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);
2601 
2602  for(int i = 0 ; i < atomStorageSize; i++){
2603  fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
2604  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);
2605  }
2606 
2607  if(!params->CUDASOAintegrate) CmiUnlock(printLock);
2608  else CmiUnlock(patchData->printlock);
2609  }
2610 #endif
2611 
2612  const CudaNBConstants nbConstants = CudaComputeNonbonded::getNonbondedCoef(params);
2613  const bool doTable = CudaComputeNonbonded::getDoTable(params, doSlow, doVirial);
2614 
2615  bondedKernel.bondedForce(
2617  atomStorageSize,
2618  doEnergy, doVirial, doSlow, doTable,
2619  lata, latb, latc,
2621  (float)ComputeNonbondedUtil::r2_delta, r2_delta_expc,
2622  nbConstants,
2623  (const float4*)atoms, forces,
2624  energies_virials, atomsChanged, params->CUDASOAintegrate,
2625  params->useDeviceMigration, stream);
2626 
2627 
2628 #ifdef NODEGROUP_FORCE_REGISTER
2629  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2630  PatchData *patchData = cpdata.ckLocalBranch();
2631  int devInd = deviceCUDA->getDeviceIndex();
2632  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
2633  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2634  }
2635  patchData->devData[devInd].f_bond = bondedKernel.getForces();
2636  patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
2637  patchData->devData[devInd].f_bond_slow = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
2638  patchData->devData[devInd].f_bond_size = bondedKernel.getForceSize(atomStorageSize);
2639 #endif
2640 
2641 #if 0
2642  if(!params->CUDASOAintegrate) CmiLock(printLock);
2643  else CmiLock(patchData->printlock);
2644  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2645  // synchronously copy array back to the host
2646  copy_DtoH_sync(bondedKernel.getForces(), forces, forceStorageSize);
2647  fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
2648  for(int i = 0; i < forceStorageSize; i++){
2649  // prints entire forces datastructure
2650  fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);
2651  }
2652  if(!params->CUDASOAintegrate) CmiUnlock(printLock);
2653  else CmiUnlock(patchData->printlock);
2654 #endif
2655  forceDoneSetCallback();
2656 }
2657 
2658 void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
2659  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
2660 
2661  if (CkMyPe() != c->masterPe)
2662  NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
2663 
2664  cudaCheck(cudaSetDevice(c->deviceID));
2665 
2666  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
2667  if (err == cudaSuccess) {
2668  // Event has occurred
2669  c->checkCount = 0;
2670  traceUserBracketEvent(CUDA_BONDED_KERNEL_EVENT, c->beforeForceCompute, walltime);
2671  c->finishPatches();
2672  return;
2673  } else if (err != cudaErrorNotReady) {
2674  // Anything else is an error
2675  char errmsg[256];
2676  sprintf(errmsg,"in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
2677  c->checkCount, walltime - c->beforeForceCompute);
2678  cudaDie(errmsg,err);
2679  }
2680 
2681  // Event has not occurred
2682  c->checkCount++;
2683  if (c->checkCount >= 1000000) {
2684  char errmsg[256];
2685  sprintf(errmsg,"ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
2686  c->checkCount, walltime - c->beforeForceCompute);
2687  cudaDie(errmsg,err);
2688  }
2689 
2690  // Call again
2691  CcdCallBacksReset(0, walltime);
2692  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
2693 }
2694 
2695 //
2696 // Set call back for all the work in the stream at this point
2697 //
2698 void ComputeBondedCUDA::forceDoneSetCallback() {
2699  if (CkMyPe() != masterPe)
2700  NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
2701  cudaCheck(cudaSetDevice(deviceID));
2702  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
2703  checkCount = 0;
2704  CcdCallBacksReset(0, CmiWallTimer());
2705  // Start timer for CUDA kernel
2706  beforeForceCompute = CkWallTimer();
2707  // Set the call back at 0.1ms
2708  if(!params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, this, 0.1);
2709 }
2710 
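// Accumulates the bonded kernel's force output into the per-patch AOS Force
// arrays. The af* buffers are laid out SOA-style (x[0..n-1], y[...], z[...]
// separated by forceStride); the template flags select whether the nonbonded
// and slow contributions are summed as well.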
2711 template <bool sumNbond, bool sumSlow>
2712 void finishForceLoop(const int numAtoms, const int forceStride,
2713  const double* __restrict__ af,
2714  const double* __restrict__ af_nbond,
2715  const double* __restrict__ af_slow,
2716  Force* __restrict__ f,
2717  Force* __restrict__ f_nbond,
2718  Force* __restrict__ f_slow) {
2719 
2720 #if 0
2721  for (int j=0;j < numAtoms;j++) {
2722  {
2723  //double afx, afy, afz;
2724  //convertForceToDouble(af + j, forceStride, afx, afy, afz);
2725  //f[j].x += af[j];
2726  //f[j].y += af[j + forceStride];
2727  //f[j].z += af[j + forceStride*2];
2728  f[j].x += af[j];
2729  f[j].y += af[j + forceStride];
2730  f[j].z += af[j + forceStride*2];
2731  }
2732  if (sumNbond)
2733  {
2734  //double afx, afy, afz;
2735  //convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
2736  f_nbond[j].x += af_nbond[j];
2737  f_nbond[j].y += af_nbond[j + forceStride];
2738  f_nbond[j].z += af_nbond[j + forceStride*2];
2739 
2740  }
2741  if (sumSlow)
2742  {
2743  //double afx, afy, afz;
2744  //convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
2745  f_slow[j].x += af_slow[j];
2746  f_slow[j].y += af_slow[j + forceStride];
2747  f_slow[j].z += af_slow[j + forceStride*2];
2748 
2749  }
2750  }
2751 #endif
2752 
2753 #if 0
2754  for(int j=0; j < numAtoms; j++){
2755  f[j].x += af[j];
2756  f[j].y += af[j+ forceStride];
2757  f[j].z += af[j+ 2*forceStride];
2758  }
2759  if(sumNbond){
2760  for(int j=0; j < numAtoms; j++){
2761  f_nbond[j].x += af_nbond[j];
2762  f_nbond[j].y += af_nbond[j + forceStride];
2763  f_nbond[j].z += af_nbond[j + 2*forceStride];
2764  }
2765  }
2766  if(sumSlow){
2767  for(int j=0; j < numAtoms; j++){
2768  f_slow[j].x += af_slow[j];
2769  f_slow[j].y += af_slow[j + forceStride];
2770  f_slow[j].z += af_slow[j + 2*forceStride];
2771  }
2772  }
2773 #endif
2774  // XXX Summing into AOS buffer from SOA data layout
2775  // Try to reduce cache misses by separating loops.
2776  for(int j=0; j < numAtoms; j++) f[j].x += af[j];
2777  for(int j=0; j < numAtoms; j++) f[j].y += af[j+ forceStride];
2778  for(int j=0; j < numAtoms; j++) f[j].z += af[j+ 2*forceStride];
2779 
2780  if(sumNbond){
2781 #ifdef DEBUG_MINIMIZE
2782  int k=0;
2783  printf("%s, line %d\n", __FILE__, __LINE__);
2784  printf(" before: f_nbond[%d] = %f %f %f\n",
2785  k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2786 #endif
2787  for(int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
2788  for(int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
2789  for(int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
2790 #ifdef DEBUG_MINIMIZE
2791  printf(" after: f_nbond[%d] = %f %f %f\n",
2792  k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2793 #endif
2794  }
2795  if(sumSlow){
2796  for(int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
2797  for(int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
2798  for(int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];
2799  }
2800 #if 0
2801  for(int j = 0; j < numAtoms; j++){
2802  fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
2803  af[j], af[j + forceStride], af[j + 2*forceStride],
2804  af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
2805  af_slow[j], af_slow[j + forceStride], af_slow[j + 2*forceStride]);
2806  }
2807 #endif
2808 }
2809 
2810 //
2811 // Finish all patches that are on this pe
2812 //
2813 void ComputeBondedCUDA::finishPatchesOnPe() {
2814  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
2815 
2816  PatchMap* patchMap = PatchMap::Object();
2817  int myRank = CkMyRank();
2818 
2819  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
2820  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
2821  const bool sumNbond = hasModifiedExclusions;
2822  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
2823 
2824  // I need to print patchIDsPerRank
2825  for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
2826  PatchID patchID = patchIDsPerRank[myRank][i];
2827  Patch* patch = patchMap->patch(patchID);
2828  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
2829  if (tpe == NULL) {
2830  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
2831  }
2832 
2833  int pi = patchIndex[patchID];
2834  int numAtoms = patches[pi].numAtoms;
2835  int atomStart = patches[pi].atomStart;
2836 
2837  // XXX TODO: this is a large CPU bottleneck
2839 #ifdef NODEGROUP_FORCE_REGISTER
2840  double *af = forces + atomStart;
2841  double *af_nbond = forces + forceSize + atomStart;
2842  double *af_slow = forces + 2*forceSize + atomStart;
2843  // XXX TODO: We still need forces inside patches for migration steps it seems
2844  if(!params->CUDASOAintegrate || (atomsChanged && !params->useDeviceMigration) ){
2845  Force *f = tpe->f;
2846  Force *f_nbond = tpe->r->f[Results::nbond];
2847  Force *f_slow = tpe->r->f[Results::slow];
2848  if (!sumNbond && !sumSlow) {
2849  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2850  } else if (sumNbond && !sumSlow) {
2851  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2852  } else if (!sumNbond && sumSlow) {
2853  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2854  } else if (sumNbond && sumSlow) {
2855  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2856  } else {
2857  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2858  }
2859  tpe->forceBox->close(&tpe->r);
2860  tpe->positionBox->close(&tpe->x);
2861  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
2862  }
2863 #else
2864  Force *f = tpe->f;
2865  Force *f_nbond = tpe->r->f[Results::nbond];
2866  Force *f_slow = tpe->r->f[Results::slow];
2867 
2868  double *af = forces + atomStart;
2869  double *af_nbond = forces + forceSize + atomStart;
2870  double *af_slow = forces + 2*forceSize + atomStart;
2871 
2872  if (!sumNbond && !sumSlow) {
2873  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2874  } else if (sumNbond && !sumSlow) {
2875  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2876  } else if (!sumNbond && sumSlow) {
2877  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2878  } else if (sumNbond && sumSlow) {
2879  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2880  } else {
2881  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2882  }
2883 
2884  tpe->forceBox->close(&tpe->r);
2885  tpe->positionBox->close(&tpe->x);
2886  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
2887 
2888 #endif
2889 // #endif
2890 
2891  }
2892 
2893 
2894  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
2895 
2896  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2897 
2898  bool done = false;
2899  CmiLock(lock);
2900  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
2901  if(params->CUDASOAintegrate && !atomsChanged){
2902  // masterPe is executing this, so we can go ahead and do the reductions
2903  // right away. However, for migration steps I still need to follow the usual
2904  // code path because of the box setup
2905  patchesCounter = 0;
2906  }
2907  if (patchesCounter == 0) {
2908  patchesCounter = getNumPatches();
2909  done = true;
2910  }
2911  CmiUnlock(lock);
2912  if (done) {
2913  //computeMgr->sendFinishReductions(masterPe, this);
2914  if(params->CUDASOAintegrate ) {
2915  if(!atomsChanged) this->finishReductions();
2916  }
2917  else computeMgr->sendFinishReductions(masterPe, this);
2918  }
2919 
2920  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2921 
2922 }
2923 
2924 void ComputeBondedCUDA::finishPatches() {
2925 
2926  if (atomsChanged && (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
2927  unmapAtoms();
2928  }
2929  // do I have to synchronize the stream here??????
2930  //computeMgr->sendFinishPatchesOnPe(pes, this);
2931  if(params->CUDASOAintegrate) this->finishPatchesOnPe();
2932  else{
2933  computeMgr->sendFinishPatchesOnPe(pes, this);
2934  }
2935 }
2936 
2937 //
2938 // Finish & submit reductions
2939 //
2940 void ComputeBondedCUDA::finishReductions() {
2941 
2942  if (CkMyPe() != masterPe)
2943  NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
2944  SubmitReduction *reduction = getCurrentReduction();
2945 
2946  //fprintf(stderr, "finishReductions\n");
2947 
2948  // static int ncall = 0;
2949  // ncall++;
2950 
2951  int pos = 0;
2952  // do I need this streamSynchronize here?
2953  if(params->CUDASOAintegrate) {
2954  cudaCheck(cudaStreamSynchronize(stream));
2955  }
2956  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2957  if (numTuplesPerType[tupletype] > 0) {
2958 
2959  if (doEnergy) {
2960  switch (tupletype) {
2961  case Tuples::BOND:
2962  reduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
2963  if (hostAlchFlags.alchFepOn) {
2965  }
2966  if (hostAlchFlags.alchThermIntOn) {
2969  }
2970  break;
2971 
2972  case Tuples::ANGLE:
2974  if (hostAlchFlags.alchFepOn) {
2976  }
2977  if (hostAlchFlags.alchThermIntOn) {
2980  }
2981  break;
2982 
2983  case Tuples::DIHEDRAL:
2985  if (hostAlchFlags.alchFepOn) {
2987  }
2988  if (hostAlchFlags.alchThermIntOn) {
2991  }
2992  break;
2993 
2994  case Tuples::IMPROPER:
2996  if (hostAlchFlags.alchFepOn) {
2998  }
2999  if (hostAlchFlags.alchThermIntOn) {
3002  }
3003  break;
3004 
3005  case Tuples::EXCLUSION:
3007  reduction->item(REDUCTION_LJ_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
3009  if (hostAlchFlags.alchFepOn) {
3011  reduction->item(REDUCTION_LJ_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_F];
3013  }
3014  if (hostAlchFlags.alchThermIntOn) {
3021  }
3022  break;
3023 
3024  case Tuples::CROSSTERM:
3026  if (hostAlchFlags.alchFepOn) {
3028  }
3029  if (hostAlchFlags.alchThermIntOn) {
3032  }
3033  break;
3034 
3035  case Tuples::THOLE: {
3037  if (hostAlchFlags.alchFepOn) {
3039  }
3040  if (hostAlchFlags.alchThermIntOn) {
3043  }
3044  break;
3045  }
3046 
3047  case Tuples::ANISO: {
3049  if (hostAlchFlags.alchFepOn) {
3051  }
3052  if (hostAlchFlags.alchThermIntOn) {
3055  }
3056  break;
3057  }
3058 
3059  default:
3060  NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
3061  break;
3062  }
3063  }
3064 
3065  auto it = tupleList[tupletype].begin();
3066  if (!params->CUDASOAintegrate) {
3067  (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
3068  }
3069  }
3070  }
3071 
3072  if (doVirial) {
3073 #ifndef WRITE_FULL_VIRIALS
3074 #error "non-WRITE_FULL_VIRIALS not implemented"
3075 #endif
3076 
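  // ADD_TENSOR (see ReductionMgr.h) accumulates the nine virial tensor
  // components stored in energies_virials at the given base index into the
  // corresponding reduction items.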
3077  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
3078  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
3079  ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW, energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
3080  ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
3081  // NOTE: AMD_DIHE virial is also added to NORMAL virial.
3082  // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
3083  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
3084  }
3085 
3086  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
3087 
3088  reduction->submit();
3089 }
3090 
3091 //
3092 // Can only be called by master PE
3093 //
3094 void ComputeBondedCUDA::initialize() {
3095 #ifdef NODEGROUP_FORCE_REGISTER
3096  tupleWorkIndex.store(0);
3097 #endif
3098 
3099  if (CkMyPe() != masterPe)
3100  NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
3101 
3102  // Build list of PEs
3103  for (int rank=0;rank < computes.size();rank++) {
3104  if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
3105  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3106  }
3107  }
3108 
3109  // Return if no work to do
3110  if (pes.size() == 0) return;
3111 
3112  initializeCalled = false;
3113  cudaCheck(cudaSetDevice(deviceID));
3114 
3115 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
3116  int leastPriority, greatestPriority;
3117  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
3118  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
3119 #else
3120  cudaCheck(cudaStreamCreate(&stream));
3121 #endif
3122  cudaCheck(cudaEventCreate(&forceDoneEvent));
3123  lock = CmiCreateLock();
3124  printLock = CmiCreateLock();
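  // GPU-resident integration submits to its own reduction set; the basic set
  // is used for the offload path (see getCurrentReduction()).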
3125  if(Node::Object()->simParameters->CUDASOAintegrateMode) {
3126  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
3127  }
3128  reductionGpuOffload = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
3129 
3130  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3131  PatchData *patchData = cpdata.ckLocalBranch();
3132 
3133  PatchMap* patchMap = PatchMap::Object();
3134  // First, assign all patches in self computes.
3135  // NOTE: These never overlap between PEs. No proxies added.
3136  for (int rank=0;rank < computes.size();rank++) {
3137  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3138  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3139  for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
3140  if (!tuplePatchList.find( TuplePatchElem(*jt) ) ) {
3141  tuplePatchList.add( TuplePatchElem(*jt) );
3142  patchIDsPerRank[rank].push_back(*jt);
3143  allPatchIDs.push_back(*jt);
3144  }
3145  }
3146  }
3147  }
3148 
3149  // Second, assign all patches in home computes.
3150  // NOTE: The ranks always have these patches. No proxies added.
3151  for (int rank=0;rank < computes.size();rank++) {
3152  HomeCompute& homeCompute = computes[rank].homeCompute;
3153  std::vector<int>& patchIDs = homeCompute.patchIDs;
3154  for (int i=0;i < patchIDs.size();i++) {
3155  int patchID = patchIDs[i];
3156  if (!tuplePatchList.find( TuplePatchElem(patchID) ) ) {
3157  tuplePatchList.add( TuplePatchElem(patchID) );
3158  patchIDsPerRank[rank].push_back(patchID);
3159  allPatchIDs.push_back(patchID);
3160  }
3161  }
3162  }
3163 
3164  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
3165  // Find neighbors that are not added yet
3166  std::vector<int> neighborPids;
3167  for (int rank=0;rank < computes.size();rank++) {
3169  HomeCompute& homeCompute = computes[rank].homeCompute;
3170  std::vector<int>& patchIDs = homeCompute.patchIDs;
3171  for (int i=0;i < patchIDs.size();i++) {
3172  int patchID = patchIDs[i];
3173  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
3174  for (int j=0;j < numNeighbors;j++) {
3175  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
3176  neighborPids.push_back(neighbors[j]);
3177  }
3178  }
3179  }
3180  }
3181  // Remove duplicates from neighborPids
3182  {
3183  std::sort(neighborPids.begin(), neighborPids.end());
3184  auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
3185  neighborPids.resize(std::distance(neighborPids.begin(), it_end));
3186  }
3187  // Assign neighbors to the PEs on this node that have them
3188  for (int i=0;i < neighborPids.size();i++) {
3189  for (int rank=0;rank < computes.size();rank++) {
3190  int pid = neighborPids[i];
3191  int pe = rank + CkNodeFirst(CkMyNode());
3192  if (patchMap->node(pid) == pe) {
3193  // Patch pid found on PE "pe" on this node
3194  tuplePatchList.add( TuplePatchElem(pid) );
3195  patchIDsPerRank[rank].push_back(pid);
3196  allPatchIDs.push_back(pid);
3197  // Add to this rank's patches
3198  patchIDsToAppend[rank].push_back(pid);
3199  // Add to the list of PEs
3200  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3201  break;
3202  }
3203  }
3204  }
3205  // Remove duplicates from pes
3206  {
3207  std::sort(pes.begin(), pes.end());
3208  auto it_end = std::unique(pes.begin(), pes.end());
3209  pes.resize(std::distance(pes.begin(), it_end));
3210  }
3211 
3212  // Last, assign all patches in neighbors of home computes
3213  // NOTE: Will create proxies on multiple nodes
3214  for (int rank=0;rank < computes.size();rank++) {
3216  HomeCompute& homeCompute = computes[rank].homeCompute;
3217  std::vector<int>& patchIDs = homeCompute.patchIDs;
3218  std::vector<int> neighborPatchIDs;
3219  for (int i=0;i < patchIDs.size();i++) {
3220  int patchID = patchIDs[i];
3221  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
3222  for (int j=0;j < numNeighbors;j++) {
3223  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
3224  // Patch not found => Add Proxy
3225  tuplePatchList.add( TuplePatchElem(neighbors[j]) );
3226  patchIDsPerRank[rank].push_back(neighbors[j]);
3227  allPatchIDs.push_back(neighbors[j]);
3228  }
3229  if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
3230  && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
3231  neighborPatchIDs.push_back(neighbors[j]);
3232  }
3233  }
3234  }
3235  // Append neighboring patchIDs to homeCompute.patchIDs
3236  // int start = patchIDs.size();
3237  // patchIDs.resize(patchIDs.size() + neighborPatchIDs.size());
3238  // for (int i=0;i < neighborPatchIDs.size();i++) {
3239  // patchIDs[start + i] = neighborPatchIDs[i];
3240  // }
3241  for (int i=0;i < neighborPatchIDs.size();i++) {
3242  patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
3243  }
3244  }
3245 
3246  for (int rank=0;rank < patchIDsToAppend.size();rank++) {
3247  for (int i=0;i < patchIDsToAppend[rank].size();i++) {
3248  computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
3249  }
3250  }
3251 
3252  // Remove duplicate patch IDs
3253  {
3254  std::sort(allPatchIDs.begin(), allPatchIDs.end());
3255  auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
3256  allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
3257  }
3258 
3259  // Set number of (unique) patches
3260  setNumPatches(allPatchIDs.size());
3261 
3262  // Reset patchesCounter
3263  patchesCounter = getNumPatches();
3264 
3265  patches.resize(getNumPatches());
3266  // Setup tupleList
 3267  // NOTE: the tuple list is only populated here and is not updated afterwards; where it gets refreshed is unclear.
3268  for (int rank=0;rank < computes.size();rank++) {
3269  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3270  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3271  tupleList[it->tuples->getType()].push_back(it->tuples);
3272  }
3273  HomeCompute& homeCompute = computes[rank].homeCompute;
3274  for (int i=0;i < homeCompute.tuples.size();i++) {
3275  tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
3276  }
3277  }
3278 
3279  // Allocate host memory for energies and virials
3280  allocate_host<double>(&energies_virials, ComputeBondedCUDAKernel::energies_virials_SIZE);
3281 
3282  // Finally, do sanity checks
3283  std::vector<char> patchIDset(patchMap->numPatches(), 0);
3284  int numPatchIDset = 0;
3285  int numPatchIDs = 0;
3286  for (int rank=0;rank < computes.size();rank++) {
3287  numPatchIDs += patchIDsPerRank[rank].size();
3288  for (int i=0;i < patchIDsPerRank[rank].size();i++) {
3289  PatchID patchID = patchIDsPerRank[rank][i];
3290  if (patchIDset[patchID] == 0) numPatchIDset++;
3291  patchIDset[patchID] = 1;
3292  if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
3293  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3294  }
3295  }
3296  }
3297  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
3298  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3299  }
3300 
3301  // Warning: Direct indexing used, patchIndex could use up a lot of memory for large systems
3302  patchIndex.resize(patchMap->numPatches());
3303  atomMappers.resize(getNumPatches());
3304  for (int i=0;i < getNumPatches();i++) {
3305  atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
3306  patchIndex[allPatchIDs[i]] = i;
3307  }
3308 
3309  // Initialize the alchemical flags, parameters and lambda values
3310  updateHostCudaAlchFlags();
3311  updateKernelCudaAlchFlags();
3312  if (hostAlchFlags.alchOn) {
3313  updateHostCudaAlchParameters();
3314  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3315  updateHostCudaAlchLambdas();
3316  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3317  if (hostAlchFlags.alchDecouple) {
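  // With alchDecouple, tuples lying entirely within partition 1 or partition 2
  // are presumably treated as regular interactions (pswitch 0) rather than
  // being lambda-scaled.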
3318  pswitchTable[1+3*1] = 0;
3319  pswitchTable[2+3*2] = 0;
3320  }
3321  }
3322 
3323  // Copy coefficients to GPU
3324  Parameters* parameters = Node::Object()->parameters;
3325  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
3326  if (tupleList[tupletype].size() > 0) {
3327  switch(tupletype) {
3328 
3329  case Tuples::BOND:
3330  {
3331  int NumBondParams = parameters->NumBondParams;
3332  BondValue* bond_array = parameters->bond_array;
3333  std::vector<CudaBondValue> bondValues(NumBondParams);
3334  for (int i=0;i < NumBondParams;i++) {
3335  bondValues[i].k = bond_array[i].k;
3336  bondValues[i].x0 = bond_array[i].x0;
3337  bondValues[i].x1 = bond_array[i].x1;
3338  }
3339  bondedKernel.setupBondValues(NumBondParams, bondValues.data());
3340  }
3341  break;
3342 
3343  case Tuples::ANGLE:
3344  {
3345  int NumAngleParams = parameters->NumAngleParams;
3346  AngleValue* angle_array = parameters->angle_array;
3347  std::vector<CudaAngleValue> angleValues(NumAngleParams);
3348  bool normal_ub_error = false;
3349  for (int i=0;i < NumAngleParams;i++) {
3350  angleValues[i].k = angle_array[i].k;
3351  if (angle_array[i].normal == 1) {
3352  angleValues[i].theta0 = angle_array[i].theta0;
3353  } else {
3354  angleValues[i].theta0 = cos(angle_array[i].theta0);
3355  }
3356  normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
3357  angleValues[i].k_ub = angle_array[i].k_ub;
3358  angleValues[i].r_ub = angle_array[i].r_ub;
3359  angleValues[i].normal = angle_array[i].normal;
3360  }
3361  if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
3362  bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
3363  }
3364  break;
3365 
3366  case Tuples::DIHEDRAL:
3367  {
3368  int NumDihedralParams = parameters->NumDihedralParams;
3369  DihedralValue* dihedral_array = parameters->dihedral_array;
3370  int NumDihedralParamsMult = 0;
3371  for (int i=0;i < NumDihedralParams;i++) {
3372  NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
3373  }
3374  std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
3375  dihedralMultMap.resize(NumDihedralParams);
3376  int k = 0;
3377  for (int i=0;i < NumDihedralParams;i++) {
3378  int multiplicity = dihedral_array[i].multiplicity;
3379  dihedralMultMap[i] = k;
3380  for (int j=0;j < multiplicity;j++) {
3381  dihedralValues[k].k = dihedral_array[i].values[j].k;
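  // Pack the periodicity into the upper bits of n; the lowest bit marks
  // whether another multiplicity term for this dihedral follows.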
3382  dihedralValues[k].n = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3383  dihedralValues[k].delta = dihedral_array[i].values[j].delta;
3384  k++;
3385  }
3386  }
3387  bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
3388  }
3389  break;
3390 
3391  case Tuples::IMPROPER:
3392  {
3393  int NumImproperParams = parameters->NumImproperParams;
3394  ImproperValue* improper_array = parameters->improper_array;
3395  int NumImproperParamsMult = 0;
3396  for (int i=0;i < NumImproperParams;i++) {
3397  NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
3398  }
3399  std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
3400  improperMultMap.resize(NumImproperParams);
3401  int k = 0;
3402  for (int i=0;i < NumImproperParams;i++) {
3403  int multiplicity = improper_array[i].multiplicity;
3404  improperMultMap[i] = k;
3405  for (int j=0;j < multiplicity;j++) {
3406  improperValues[k].k = improper_array[i].values[j].k;
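  // Same packing as for dihedrals: the low bit of n flags a following
  // multiplicity term.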
3407  improperValues[k].n = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3408  improperValues[k].delta = improper_array[i].values[j].delta;
3409  k++;
3410  }
3411  }
3412  bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
3413  }
3414  break;
3415 
3416  case Tuples::CROSSTERM:
3417  {
3418  int NumCrosstermParams = parameters->NumCrosstermParams;
3419  CrosstermValue* crossterm_array = parameters->crossterm_array;
3420  std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
3421  const int D = CrosstermValue::dim;
3422  const int N = CrosstermValue::dim - 1;
3423  for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
3424  for (int i=0;i < N;i++) {
3425  for (int j=0;j < N;j++) {
3426 
 3427  // Set up the coefficients for bicubic interpolation.
3428  // See https://en.wikipedia.org/wiki/Bicubic_interpolation
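  // The 16 spline coefficients a of one grid cell satisfy a = Ainv * x, where
  // x packs the corner values d00 and the scaled derivatives h*d10, h*d01, and
  // h*h*d11, with h = M_PI/12 the grid spacing in radians.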
3429 
3430  #define INDEX(ncols,i,j) ((i)*ncols + (j))
3431  CrosstermData* table = &crossterm_array[ipar].c[0][0];
3432 
3433  const double Ainv[16][16] = {
3434  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3435  { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3436  {-3, 3, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3437  { 2, -2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3438  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
3439  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0},
3440  { 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0},
3441  { 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0},
3442  {-3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0},
3443  { 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0},
3444  { 9, -9, -9, 9, 6, 3, -6, -3, 6, -6, 3, -3, 4, 2, 2, 1},
3445  {-6, 6, 6, -6, -3, -3, 3, 3, -4, 4, -2, 2, -2, -2, -1, -1},
3446  { 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
3447  { 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0},
3448  {-6, 6, 6, -6, -4, -2, 4, 2, -3, 3, -3, 3, -2, -1, -2, -1},
3449  { 4, -4, -4, 4, 2, 2, -2, -2, 2, -2, 2, -2, 1, 1, 1, 1}
3450  };
3451 
3452 #ifndef M_PI
3453 #define M_PI 3.14159265358979323846
3454 #endif
3455 
3456  const double h = M_PI/12.0;
3457 
3458  const double x[16] = {
3459  table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
3460  table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
3461  table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
3462  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
3463  };
3464 
3465  // a = Ainv*x
3466  float* a = (float *)&crosstermValues[ipar].c[i][j][0];
3467  for (int k=0;k < 16;k++) {
3468  double a_val = 0.0;
3469  for (int l=0;l < 16;l++) {
3470  a_val += Ainv[k][l]*x[l];
3471  }
3472  a[k] = (float)a_val;
3473  }
3474 
3475  }
3476  }
3477  }
3478  bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
3479  }
3480  break;
3481 
3482  case Tuples::EXCLUSION:
3483  // Nothing to do
3484  break;
3485 
3486  case Tuples::THOLE: {
3487  // Nothing to do
3488  }
3489  break;
3490 
3491  case Tuples::ANISO: {
3492  // Nothing to do
3493  }
3494  break;
3495 
3496  default:
3497  NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
3498  break;
3499  }
3500  }
3501  }
3502 
3503  computeMgr->sendAssignPatchesOnPe(pes, this);
3504 
3505  int nDevices = deviceCUDA->getNumDevice();
3506 
3507 #ifdef NODEGROUP_FORCE_REGISTER
3508  // TODO DMC this isn't the number of home patches, but that is okay for now...
3509  if (params->CUDASOAintegrateMode && params->useDeviceMigration) {
3510  migrationKernel.setup(nDevices, patchMap->numPatches());
3511 
3512  // Tuple Migration Structures
3513  // Patch record Data
3514  allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
3515  allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());
3516 
3517  // Patch Centers
3518  allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
3519  allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
3520  for (int i = 0; i < patchMap->numPatches(); i++) {
3521  COPY_CUDAVECTOR(patchMap->center(i), h_patchMapCenter[i]);
3522  }
3523  copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
3524  }
3525 #endif // NODEGROUP_FORCE_REGISTER
3526 }
3527 
3528 #ifdef NODEGROUP_FORCE_REGISTER
3529 void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
 3530  // TODO: is there anywhere else where the order matters?
3531  // DMC This vector of CudaLocalRecords doesn't have the correct number of peer records
3532  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
3533  for (int i=0;i < data.size();i++) {
3534  patchIndex[data[i].patchID] = i;
3535  }
3536 
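  // Rebuild the patch records in the new device order; atomStart and numAtoms
  // start at zero and are presumably filled in later once atom counts are known.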
3537  patches.clear();
3538  for (int i=0;i < data.size();i++) {
3539  PatchRecord p;
3540  p.patchID = data[i].patchID;
3541  p.atomStart = 0;
3542  p.numAtoms = 0;
3543  patches.push_back(p);
3544  }
3545 }
3546 #endif // NODEGROUP_FORCE_REGISTER
3547 
3548 void ComputeBondedCUDA::updateHostCudaAlchFlags() {
3549  const SimParameters& sim_params = *(Node::Object()->simParameters);
3550  hostAlchFlags.alchOn = sim_params.alchOn;
3551  hostAlchFlags.alchFepOn = sim_params.alchFepOn;
3552  hostAlchFlags.alchThermIntOn = sim_params.alchThermIntOn;
3553  hostAlchFlags.alchWCAOn = sim_params.alchWCAOn;
3554  hostAlchFlags.alchLJcorrection = sim_params.LJcorrection;
3555  hostAlchFlags.alchVdwForceSwitching = ComputeNonbondedUtil::vdwForceSwitching;
3556  hostAlchFlags.alchDecouple = sim_params.alchDecouple;
3557  hostAlchFlags.alchBondDecouple = sim_params.alchBondDecouple;
3558 }
3559 
3560 void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
3561  bondedKernel.updateCudaAlchFlags(hostAlchFlags);
3562 }
3563 
3564 void ComputeBondedCUDA::updateHostCudaAlchParameters() {
3565  const SimParameters& sim_params = *(Node::Object()->simParameters);
3566  hostAlchParameters.switchDist2 = ComputeNonbondedUtil::switchOn * ComputeNonbondedUtil::switchOn;
3567  hostAlchParameters.alchVdwShiftCoeff = sim_params.alchVdwShiftCoeff;
3568 }
3569 
3570 void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
3571  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3572 }
3573 
3574 void ComputeBondedCUDA::updateHostCudaAlchLambdas() {
3575  SimParameters& sim_params = *(Node::Object()->simParameters);
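  // "Up" presumably refers to the appearing group scaled by lambda and "Down"
  // to the disappearing group scaled by (1 - lambda), matching the lambda
  // arguments used below.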
3576  hostAlchLambdas.currentLambda = float(sim_params.getCurrentLambda(step));
3577  hostAlchLambdas.currentLambda2 = float(sim_params.getCurrentLambda2(step));
3578  hostAlchLambdas.bondLambda1 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda));
3579  hostAlchLambdas.bondLambda2 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda));
3580  hostAlchLambdas.bondLambda12 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda2));
3581  hostAlchLambdas.bondLambda22 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
3582  hostAlchLambdas.elecLambdaUp = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda));
3583  hostAlchLambdas.elecLambda2Up = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda2));
3584  hostAlchLambdas.elecLambdaDown = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda));
3585  hostAlchLambdas.elecLambda2Down = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
3586  hostAlchLambdas.vdwLambdaUp = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda));
3587  hostAlchLambdas.vdwLambda2Up = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda2));
3588  hostAlchLambdas.vdwLambdaDown = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
3589  hostAlchLambdas.vdwLambda2Down = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
3590 }
3591 
3592 void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
3593  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3594 }
3595 
3596 #ifdef NODEGROUP_FORCE_REGISTER
3597 void ComputeBondedCUDA::registerPointersToHost() {
3598  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3599  PatchData *patchData = cpdata.ckLocalBranch();
3600 
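  // Publish this device's staged tuple buffers, counts, and offsets in the
  // node-wide PatchData, presumably so that peer devices can access them
  // during device-side tuple migration.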
3601  const int deviceIndex = deviceCUDA->getDeviceIndex();
3602 
3603  TupleDataStage dst = migrationKernel.getDstBuffers();
3604  patchData->h_tupleDataStage.bond[deviceIndex] = dst.bond;
3605  patchData->h_tupleDataStage.angle[deviceIndex] = dst.angle;
3606  patchData->h_tupleDataStage.dihedral[deviceIndex] = dst.dihedral;
3607  patchData->h_tupleDataStage.improper[deviceIndex] = dst.improper;
3608  patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
3609  patchData->h_tupleDataStage.exclusion[deviceIndex] = dst.exclusion;
3610  patchData->h_tupleDataStage.crossterm[deviceIndex] = dst.crossterm;
3611 
3612  TupleIntArraysContiguous count = migrationKernel.getDeviceTupleCounts();
3613  patchData->h_tupleCount.bond[deviceIndex] = count.bond();
3614  patchData->h_tupleCount.angle[deviceIndex] = count.angle();
3615  patchData->h_tupleCount.dihedral[deviceIndex] = count.dihedral();
3616  patchData->h_tupleCount.improper[deviceIndex] = count.improper();
3617  patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
3618  patchData->h_tupleCount.exclusion[deviceIndex] = count.exclusion();
3619  patchData->h_tupleCount.crossterm[deviceIndex] = count.crossterm();
3620 
3621  TupleIntArraysContiguous offset = migrationKernel.getDeviceTupleOffsets();
3622  patchData->h_tupleOffset.bond[deviceIndex] = offset.bond();
3623  patchData->h_tupleOffset.angle[deviceIndex] = offset.angle();
3624  patchData->h_tupleOffset.dihedral[deviceIndex] = offset.dihedral();
3625  patchData->h_tupleOffset.improper[deviceIndex] = offset.improper();
3626  patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
3627  patchData->h_tupleOffset.exclusion[deviceIndex] = offset.exclusion();
3628  patchData->h_tupleOffset.crossterm[deviceIndex] = offset.crossterm();
3629 }
3630 
3631 void ComputeBondedCUDA::copyHostRegisterToDevice() {
3632  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3633  PatchData *patchData = cpdata.ckLocalBranch();
3634 
3635  const int deviceIndex = deviceCUDA->getDeviceIndex();
3636  const int nDevices = deviceCUDA->getNumDevice();
3637  const int deviceID = deviceCUDA->getDeviceID();
3638  cudaCheck(cudaSetDevice(deviceID));
3639 
3640  migrationKernel.copyPeerDataToDevice(
3641  patchData->h_tupleDataStage,
3642  patchData->h_tupleCount,
3643  patchData->h_tupleOffset,
3644  nDevices,
3645  stream
3646  );
3647 }
3648 
3649 void ComputeBondedCUDA::copyPatchData() {
3650  PatchMap* patchMap = PatchMap::Object();
3651  int numPatches = patchMap->numPatches();
3652  // Constructs a patch record array that is indexed by global patchID
3653  for (int i = 0; i < numPatches; i++) {
3654  h_patchRecord[i] = patches[patchIndex[i]];
3655  }
3656 
3657  copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
3658 }
3659 #endif // NODEGROUP_FORCE_REGISTER
3660 
3661 SubmitReduction* ComputeBondedCUDA::getCurrentReduction() {
3662  SimParameters *simParams = Node::Object()->simParameters;
3663  return (simParams->CUDASOAintegrate) ? reductionGpuResident :
3664  reductionGpuOffload;
3665 }
3666 
3667 #endif // BONDED_CUDA
3668 #endif // NAMD_CUDA