NAMD
ComputeBondedCUDA.C
1 #include <algorithm> // std::find
2 
3 #include "ComputeAniso.h"
5 #include "ComputeHomeTuples.h"
6 #include "ComputeMap.h"
7 #include "ComputeSelfTuples.h"
9 #include "ComputeThole.h"
10 #include "ReductionMgr.h"
11 #include "SimParameters.h"
12 #include "charm++.h"
13 #include "NamdTypes.h"
14 #include "ComputeMgr.h"
15 #include "WorkDistrib.h"
16 #include "ProxyMgr.h"
17 #include "CudaUtils.h"
18 #include "DeviceCUDA.h"
19 #include "ComputeBonds.h"
20 #include "ComputeAngles.h"
21 #include "ComputeDihedrals.h"
22 #include "ComputeImpropers.h"
23 #include "ComputeCrossterms.h"
25 #include "ComputeBondedCUDA.h"
26 #include "PatchData.h"
27 #include "HipDefines.h"
28 #include "NamdEventsProfiling.h"
29 #include "CudaRecord.h"
30 #include "SynchronousCollectives.h"
31 
32 #include "TestArray.h"
33 
34 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
35 #ifdef WIN32
36 #define __thread __declspec(thread)
37 #endif
38 extern __thread DeviceCUDA *deviceCUDA;
39 
40 #ifdef BONDED_CUDA
41 
42 
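//
// Per-tuple byte sizes of the packed GPU structs for each tuple type (and of the staging
// structs used for device migration). These presumably drive the layout of the flat
// tupleData byte buffer, one size per entry in the order listed in the comments below.
//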
43 const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {
44  sizeof(CudaBond), // Bonds
45  sizeof(CudaAngle), // Angles
46  sizeof(CudaDihedral), // Dihedrals
47  sizeof(CudaDihedral), // Impropers
48  sizeof(CudaExclusion), // Exclusions
49  sizeof(CudaCrossterm), // Crossterms
50  sizeof(CudaThole), // Thole
51  sizeof(CudaAniso), // Aniso
52  sizeof(CudaOneFourNbThole),// 1-4 NbThole correction
53 };
54 
55 const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {
56  sizeof(CudaBondStage), // Bonds
57  sizeof(CudaAngleStage), // Angles
58  sizeof(CudaDihedralStage), // Dihedrals
59  sizeof(CudaDihedralStage), // Impropers
60  sizeof(CudaExclusionStage), // Exclusions
61  sizeof(CudaCrosstermStage), // Crossterms
62  sizeof(CudaTholeStage), // Thole
63  sizeof(CudaAnisoStage), // Aniso
64  sizeof(CudaOneFourNbTholeStage),// 1-4 NbThole correction
65 };
66 
67 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
68 
69 //
70 // Class constructor
71 //
72 ComputeBondedCUDA::ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID,
73  CudaNonbondedTables& cudaNonbondedTables) :
74 Compute(c), deviceID(deviceID), masterPe(CkMyPe()),
75 bondedKernel(deviceID, cudaNonbondedTables), computeMgr(computeMgr)
76 {
77 
78  computes.resize(CkMyNodeSize());
79  patchIDsPerRank.resize(CkMyNodeSize());
80  numExclPerRank.resize(CkMyNodeSize());
81  for (int i=0;i < numExclPerRank.size();i++) {
82  numExclPerRank[i].numModifiedExclusions = 0;
83  numExclPerRank[i].numExclusions = 0;
84  }
85 
86  atomMap.allocateMap(Node::Object()->molecule->numAtoms);
87 
88  flags = NULL;
89  step = 0;
90 
91  tupleData = NULL;
92  tupleDataSize = 0;
93 
94  atoms = NULL;
95  atomsSize = 0;
96 
97  forces = NULL;
98  forcesSize = 0;
99 
100  energies_virials = NULL;
101 
102  initializeCalled = false;
103 
104  params = Node::Object()->simParameters;
105  accelMDdoDihe = false;
106  if (params->accelMDOn) {
107  if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;
108  }
109  /*pswitchTable = {0, 1, 2,
110  1, 1, 99,
111  2, 99, 2};
112  */
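 // The 3x3 pswitchTable maps the FEP partitions of an excluded atom pair to an
 // alchemical interaction class. It is read as (mirrors copyExclusionData below):
 //   pswitch = pswitchTable[size_t(ca1.partition + 3*ca2.partition)];
 // 0 = no alchemical switching, 1/2 = the pair belongs to alchemical group 1 or 2,
 // and 99 presumably flags an invalid pair that spans both alchemical groups.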
113  pswitchTable[0] = 0; pswitchTable[1] = 1; pswitchTable[2] = 2;
114  pswitchTable[3] = 1; pswitchTable[4] = 1; pswitchTable[5] = 99;
115  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;
116 
117  h_patchRecord = NULL;
118  d_patchRecord = NULL;
119 
120  h_patchMapCenter = NULL;
121  d_patchMapCenter = NULL;
122 
123 }
124 
125 //
126 // Class destructor
127 //
128 ComputeBondedCUDA::~ComputeBondedCUDA() {
129  cudaCheck(cudaSetDevice(deviceID));
130 
131  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
132  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
133  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
134  if (tupleData != NULL) deallocate_host<char>(&tupleData);
135 
136  if (initializeCalled) {
137  cudaCheck(cudaStreamDestroy(stream));
138  cudaCheck(cudaEventDestroy(forceDoneEvent));
139  CmiDestroyLock(lock);
140  CmiDestroyLock(printLock);
141  if (reductionGpuOffload) {
142  delete reductionGpuOffload;
143  }
144  if (reductionGpuResident) {
145  delete reductionGpuResident;
146  }
147  }
148 
149  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
150  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);
151 
152  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
153  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);
154 
155  // NOTE: unregistering happens in the [sync] entry method
156  computeMgr->sendUnregisterBoxesOnPe(pes, this);
157 }
158 
159 void ComputeBondedCUDA::unregisterBoxesOnPe() {
160  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
161  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
162  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
163  if (tpe == NULL || tpe->p == NULL) {
164  NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
165  }
166  Patch* patch = tpe->p;
167  if (tpe->positionBox != NULL) patch->unregisterPositionPickup(this, &tpe->positionBox);
168  if (tpe->avgPositionBox != NULL) patch->unregisterAvgPositionPickup(this, &tpe->avgPositionBox);
169  if (tpe->forceBox != NULL) patch->unregisterForceDeposit(this, &tpe->forceBox);
170  }
171 }
172 
173 //
174 // Register compute for a given PE. pids is a list of patches the PE has
175 // Called by master PE
176 //
177 void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {
178 
179  if (CkMyPe() != masterPe)
180  NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");
181 
182  int rank = CkRankOf(pe);
183 
184  HomeCompute& homeCompute = computes[rank].homeCompute;
185  if (homeCompute.patchIDs.size() == 0) {
186  homeCompute.isBasePatch.resize(PatchMap::Object()->numPatches(), 0);
187  homeCompute.patchIDs.resize(pids.size());
188  for (int i=0;i < pids.size();i++) {
189  homeCompute.patchIDs[i] = pids[i];
190  homeCompute.isBasePatch[pids[i]] = 1;
191  }
192  } else {
193  if (homeCompute.patchIDs.size() != pids.size()) {
194  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
195  }
196  for (int i=0;i < pids.size();i++) {
197  if (homeCompute.patchIDs[i] != pids[i]) {
198  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
199  }
200  }
201  }
202 
203  switch(type) {
204  case computeBondsType:
205  homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
206  break;
207 
208  case computeAnglesType:
209  homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
210  break;
211 
212  case computeDihedralsType:
213  homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
214  break;
215 
216  case computeImpropersType:
217  homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
218  break;
219 
220  case computeExclsType:
221  homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
222  break;
223 
224  case computeCrosstermsType:
225  homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
226  break;
227 
228  case computeTholeType:
229  homeCompute.tuples.push_back(new HomeTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE));
230  break;
231 
232  case computeAnisoType:
233  homeCompute.tuples.push_back(new HomeTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO));
234  break;
235 
236  case computeOneFourNbTholeType: // assumed name of the 1-4 NbThole compute-type constant
237  homeCompute.tuples.push_back(new HomeTuples<OneFourNbTholeElem, OneFourNbThole, OneFourNbTholeValue>(Tuples::ONEFOURNBTHOLE));
238  break;
239 
240  default:
241  NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
242  break;
243  }
244 
245 }
246 
247 //
248 // Register self compute for a given PE
249 // Called by master PE
250 //
251 void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {
252 
253  if (CkMyPe() != masterPe)
254  NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");
255 
256  int rank = CkRankOf(pe);
257 
258  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
259  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
260  if (it == selfComputes.end()) {
261  // Type not found, add new one
262  selfComputes.push_back(SelfCompute(type));
263  it = selfComputes.begin() + (selfComputes.size() - 1);
264 
265  switch(type) {
266  case computeSelfBondsType:
267  it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
268  break;
269 
270  case computeSelfAnglesType:
271  it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
272  break;
273 
274  case computeSelfDihedralsType:
275  it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
276  break;
277 
278  case computeSelfImpropersType:
279  it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
280  break;
281 
282  case computeSelfExclsType:
283  it->tuples = new SelfTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION);
284  break;
285 
286  case computeSelfCrosstermsType:
287  it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
288  break;
289 
290  case computeSelfTholeType:
291  it->tuples = new SelfTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE);
292  break;
293 
294  case computeSelfAnisoType:
295  it->tuples = new SelfTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO);
296  break;
297 
298  case computeSelfOneFourNbTholeType: // assumed name of the self 1-4 NbThole compute-type constant
299  it->tuples = new SelfTuples<OneFourNbTholeElem, OneFourNbThole, OneFourNbTholeValue>(Tuples::ONEFOURNBTHOLE);
300  break;
301 
302  default:
303  NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
304  break;
305  }
306 
307  }
308 
309  // Add patch ID for this type
310  it->patchIDs.push_back(pid);
311 }
312 
313 void ComputeBondedCUDA::assignPatchesOnPe() {
314 
315  PatchMap* patchMap = PatchMap::Object();
316  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
317  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
318  ProxyMgr::Object()->createProxy(patchID);
319  Patch* patch = patchMap->patch(patchID);
320  if (patch == NULL)
321  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
322  if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
323  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
324  if (tpe == NULL) {
325  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
326  }
327  if (tpe->p != NULL) {
328  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
329  }
330  // Assign patch and register coordinates and forces manually
331  tpe->p = patch;
332  tpe->positionBox = patch->registerPositionPickup(this);
333  tpe->avgPositionBox = patch->registerAvgPositionPickup(this);
334  tpe->forceBox = patch->registerForceDeposit(this);
335  }
336 }
337 
338 //
339 // atomUpdate() can be called by any Pe
340 //
341 void ComputeBondedCUDA::atomUpdate() {
342  atomsChangedIn = true;
343 }
344 
345 //
346 // Enqueue doWork on masterPe and return "no work"
347 // Can be called by any Pe
348 //
349 int ComputeBondedCUDA::noWork() {
350  computeMgr->sendMessageEnqueueWork(masterPe, this);
351  return 1;
352 }
353 
354 void ComputeBondedCUDA::messageEnqueueWork() {
355  if (masterPe != CkMyPe())
356  NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
357  WorkDistrib::messageEnqueueWork(this);
358 }
359 
360 //
361 // Sends open-box commands to PEs
362 // Called on master PE
363 //
364 void ComputeBondedCUDA::doWork() {
365  if (CkMyPe() != masterPe)
366  NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");
367 
368  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
369  // atomsChangedIn can be set to true by any Pe
370  // atomsChanged can only be set by masterPe
371  // Using two separate variables makes sure we don't have a race condition
372  atomsChanged = atomsChangedIn;
373  atomsChangedIn = false;
374 
375  if (getNumPatches() == 0) {
376  return; // No work to do
377  }
378 
379  if (flags == NULL)
380  NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");
381 
382  // Read flags
383  // flags holds the per-timestep simulation flags published by the patch (lattice, doEnergy, doVirial, doFullElectrostatics, doMolly, step)
384  lattice = flags->lattice;
385  doEnergy = flags->doEnergy;
386  doVirial = flags->doVirial;
387  doSlow = flags->doFullElectrostatics;
388  doMolly = flags->doMolly;
389  step = flags->step;
390 
391  if (hostAlchFlags.alchOn) {
392  updateHostCudaAlchLambdas();
393  updateKernelCudaAlchLambdas();
394  const int& alchOutFreq = Node::Object()->simParameters->alchOutFreq;
395  if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {
396  doEnergy = true;
397  }
398  }
399 
400  if (atomsChanged) {
401  // Re-calculate patch atom numbers and storage
402  updatePatches();
403  }
404  // Open boxes on Pes and launch work to masterPe
405  if(params->CUDASOAintegrate) {
406  if (!atomsChanged) this->openBoxesOnPe();
407  }
408  else computeMgr->sendOpenBoxesOnPe(pes, this);
409 }
410 
411 //
412 // This gets called when patch finishes on a PE
413 //
414 void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
415  if (doneMigration) {
416  // auto it = patchIndex.find(pid);
417  // if (it == patchIndex.end())
418  // NAMD_bug("ComputeBondedCUDA::patchReady, Patch ID not found");
419  patches[patchIndex[pid]].numAtoms = PatchMap::Object()->patch(pid)->getNumAtoms();
420 #ifdef NODEGROUP_FORCE_REGISTER
421  patches[patchIndex[pid]].patchID = pid;
422 #endif
423  }
424  // XXX first sum locally, then sum globally
425  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate(),
426  // which is already called in Sequencer::runComputeObjects_CUDA
427  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
428  CmiLock(lock);
429  Compute::patchReady(pid, doneMigration, seq);
430  CmiUnlock(lock);
431  }
432 }
433 
434 //
435 // Update per-patch atom counts and start offsets and re-allocate the host atom buffer; called when atoms have migrated
436 //
437 void ComputeBondedCUDA::updatePatches() {
438  if (!Node::Object()->simParameters->CUDASOAintegrate) {
439  int atomStart = 0;
440  for (int i=0;i < patches.size();i++) {
441  patches[i].atomStart = atomStart;
442  atomStart += patches[i].numAtoms;
443  }
444  atomStorageSize = atomStart;
445 
446  // Re-allocate atoms
447  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
448 
449  } else {
450 #ifdef NODEGROUP_FORCE_REGISTER
451  const int deviceIndex = deviceCUDA->getDeviceIndex();
452  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
453  PatchData *patchData = cpdata.ckLocalBranch();
454 
455  std::vector<CudaLocalRecord>& localPatches =
456  patchData->devData[deviceIndex].h_localPatches;
457  const int numPatchesHomeAndProxy =
458  patchData->devData[deviceIndex].numPatchesHomeAndProxy;
459 
460  int atomStart = 0;
461  for (int i=0;i < numPatchesHomeAndProxy; i++) {
462  patches[i].numAtoms = localPatches[i].numAtoms;
463  patches[i].atomStart = localPatches[i].bufferOffset;
464  atomStart += patches[i].numAtoms;
465  }
466 
467  atomStorageSize = atomStart;
468  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
469 
470  if (params->CUDASOAintegrate && params->useDeviceMigration) {
471  bondedKernel.updateAtomBuffer(atomStorageSize, stream);
472  updatePatchRecords();
473  }
474 #endif // NODEGROUP_FORCE_REGISTER
475  }
476 }
477 
478 //
479 // Map atoms GPU-wide
480 //
481 void ComputeBondedCUDA::mapAtoms() {
482  for (int i=0;i < getNumPatches();i++) {
483  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
484  atomMappers[i]->registerIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
485  }
486 
487 }
488 
489 //
490 // Unmap atoms GPU-wide
491 //
492 void ComputeBondedCUDA::unmapAtoms() {
493  for (int i=0;i < getNumPatches();i++) {
494  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
495  atomMappers[i]->unregisterIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
496  }
497 }
498 
499 //
500 // Open all patches that have been assigned to this Pe
501 //
502 void ComputeBondedCUDA::openBoxesOnPe(int startup) {
503  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_OPEN_BOXES);
504  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
505 #if 0
506  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);
507 #endif
508 #ifdef NODEGROUP_FORCE_REGISTER
509  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
510  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
511  PatchID patchID = *it;
512  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
513  tpe->x = tpe->positionBox->open();
514  tpe->r = tpe->forceBox->open();
515  //tpe->f = tpe->r->f[Results::normal];
516  }
517  this->launchWork();
518  }
519  else{
520 #endif
521  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
522  PatchID patchID = *it;
523  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
524  tpe->x = tpe->positionBox->open();
525  tpe->xExt = tpe->p->getCompAtomExtInfo();
526  //fprintf(stderr, "::openBoxesOnPe(%p) PE[%d] step %d PID[%d] = atomExt = %p\n", this, CkMyPe(), tpe->p->flags.step, tpe->p->getPatchID(), tpe->xExt);
527  if ( doMolly ) tpe->x_avg = tpe->avgPositionBox->open();
528  tpe->r = tpe->forceBox->open();
529  tpe->f = tpe->r->f[Results::normal];
530  if (accelMDdoDihe) tpe->af = tpe->r->f[Results::amdf]; // for dihedral-only or dual-boost accelMD
531  // Copy atoms
532  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
533  int pi = patchIndex[patchID];
534  int atomStart = patches[pi].atomStart;
535  int numAtoms = patches[pi].numAtoms;
536  CompAtom* compAtom = tpe->x;
537  const CompAtomExt *aExt = tpe->p->getCompAtomExtInfo();
538  const CudaAtom *src = tpe->p->getCudaAtomList();
539  for (int i=0;i < numAtoms;i++) {
540  int j = aExt[i].sortOrder;
541  // j is the atom's sortOrder, so writing to atoms[atomStart + j] restores the original (unsorted) order
542  atoms[atomStart + j] = src[i];
543  }
544  }
545  }
546  bool done = false;
547  // NOTE: this scheme of counting down the patches will help
548  // when there are multiple master PEs
549  CmiLock(lock);
550  patchesCounter -= patchIDs.size();
551  if (patchesCounter == 0) {
552  patchesCounter = getNumPatches();
553  done = true;
554  }
555  CmiUnlock(lock);
556  if (done) {
557  if (atomsChanged) {
558  if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {
559  mapAtoms();
560  }
561  //computeMgr->sendLoadTuplesOnPe(pes, this);
562  if(!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);
563  } else {
564  //computeMgr->sendLaunchWork(masterPe, this);
565  if(params->CUDASOAintegrate){
566  if(!atomsChanged) this->launchWork();
567  }
568  else computeMgr->sendLaunchWork(masterPe, this);
569  }
570  }
571 #ifdef NODEGROUP_FORCE_REGISTER
572  }
573 #endif
574  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_OPEN_BOXES);
575 
576 #if 0
577  // SynchronousCollectives* syncColl = SynchronousCollectives::Object();
578  // syncColl->barrier(SynchronousCollectiveScope::all);
579  // Patches are using different
580  CmiLock(printLock);
581  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
582  for(int i = 0 ; i < tuplePatchList.size(); i++){
583  // debug: print the contents of tuplePatchList
584  TuplePatchElem *tpe = tuplePatchList.find(TuplePatchElem(i));
585  if(tpe == NULL) break;
586  fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n",CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);
587  }
588  CmiUnlock(printLock);
589  // syncColl->barrier(SynchronousCollectiveScope::all); // Let's see
590 #endif
591 }
592 
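//
// Count how many tuples in an exclusion tuple list are "modified" (scaled 1-4) exclusions
// versus plain exclusions; the two kinds are stored in separate lists for the GPU.
//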
593 void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
594  numModifiedExclusions = 0;
595  int ntuples = tuples->getNumTuples();
596  ExclElem* src = (ExclElem *)(tuples->getTupleList());
597  for (int ituple=0;ituple < ntuples;ituple++) {
598  if (src[ituple].modified) numModifiedExclusions++;
599  }
600  numExclusions = ntuples - numModifiedExclusions;
601 }
602 
603 //
604 // Load tuples on this PE. Note: this can only run after the boxes on all PEs have been opened
605 //
606 void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {
607  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_LOAD_TUPLES);
608 
609  int numModifiedExclusions = 0;
610  int numExclusions = 0;
611  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
612  std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
613  // Loop over self compute types
614  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
615  // clears tupleList and reloads the tuples
616  it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
617  // For exclusions, we must count the number of modified and non-modified exclusions
618  if (it->tuples->getType() == Tuples::EXCLUSION) {
619  int tmp1, tmp2;
620  countNumExclusions(it->tuples, tmp1, tmp2);
621  numModifiedExclusions += tmp1;
622  numExclusions += tmp2;
623  }
624  }
625 
626  HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
627  for (int i=0;i < homeCompute.tuples.size();i++) {
628  homeCompute.tuples[i]->loadTuples(tuplePatchList,
629  homeCompute.isBasePatch.data(), &atomMap,
630  homeCompute.patchIDs);
631  // For exclusions, we must count the number of modified and non-modified exclusions
632  if (homeCompute.tuples[i]->getType() == Tuples::EXCLUSION) {
633  int tmp1, tmp2;
634  countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
635  numModifiedExclusions += tmp1;
636  numExclusions += tmp2;
637  }
638  }
639  } else {
640  numModifiedExclusions = modifiedExclusionTupleData.size();
641  numExclusions = exclusionTupleData.size();
642  }
643 
644  // Store number of exclusions
645  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
646  numExclPerRank[CkMyRank()].numExclusions = numExclusions;
647 
648 
649  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
650  bool done = false;
651 
652  // TODO: Swap that for a more efficient atomic operation
653  // Think of a lock-free design
654  CmiLock(lock);
655  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
656  if (patchesCounter == 0) {
657  patchesCounter = getNumPatches();
658  done = true;
659  }
660  CmiUnlock(lock);
661  if (done) {
662  //computeMgr->sendLaunchWork(masterPe, this);
663  //if(params->CUDASOAintegrate && !first) this->launchWork();
664  //else computeMgr->sendLaunchWork(masterPe, this);
665  if(!params->CUDASOAintegrate)computeMgr->sendLaunchWork(masterPe, this);
666  }
667  }
668 
669  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_LOAD_TUPLES);
670 }
671 
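//
// Convert host-side BondElem tuples into packed CudaBond entries: atom indices become
// GPU-global indices (local index + the owning patch's atomStart), itype is the offset of
// the parameter entry within bond_array, and ioffsetXYZ is the lattice-scaled image shift
// between the two atoms (with a patch-center correction when they live in different patches).
//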
672 void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
673  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
674 
675  PatchMap* patchMap = PatchMap::Object();
676  for (int ituple=0;ituple < ntuples;ituple++) {
677  CudaBond dstval;
678  auto p0 = src[ituple].p[0];
679  auto p1 = src[ituple].p[1];
680  int pi0 = patchIndex[p0->patchID];
681  int pi1 = patchIndex[p1->patchID];
682  int l0 = src[ituple].localIndex[0];
683  int l1 = src[ituple].localIndex[1];
684  dstval.i = l0 + patches[pi0].atomStart;
685  dstval.j = l1 + patches[pi1].atomStart;
686  dstval.itype = (src[ituple].value - bond_array);
687  Position position1 = p0->x[l0].position;
688  Position position2 = p1->x[l1].position;
689  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
690  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
691  dstval.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
692  dstval.scale = src[ituple].scale;
693  if (hostAlchFlags.alchOn) {
694  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
695  const Molecule& mol = *(Node::Object()->molecule);
696  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
697  } else {
698  dstval.fepBondedType = 0;
699  }
700  dst[ituple] = dstval;
701  }
702 }
703 
704 #ifdef NODEGROUP_FORCE_REGISTER
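//
// copyTupleToStage specializations: staging variants used when device migration is enabled.
// Instead of resolving GPU-global atom indices and image shifts on the host, they record
// each atom's (patchID, local index) pair plus the parameter index, presumably so that the
// final index resolution can be done on the device after migration.
//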
705 template<>
706 void ComputeBondedCUDA::copyTupleToStage(const BondElem& src,
707  const BondValue* __restrict__ p_array, CudaBondStage& dstval) {
708 
709  auto p0 = src.p[0];
710  auto p1 = src.p[1];
711  int pi0 = patchIndex[p0->patchID];
712  int pi1 = patchIndex[p1->patchID];
713  int l0 = src.localIndex[0];
714  int l1 = src.localIndex[1];
715  dstval.itype = (src.value - p_array);
716  dstval.scale = src.scale;
717  dstval.patchIDs[0] = p0->patchID;
718  dstval.patchIDs[1] = p1->patchID;
719 
720  dstval.index[0] = l0;
721  dstval.index[1] = l1;
722 
723  if (hostAlchFlags.alchOn) {
724  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
725  const Molecule& mol = *(Node::Object()->molecule);
726  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
727  } else {
728  dstval.fepBondedType = 0;
729  }
730 }
731 #endif // NODEGROUP_FORCE_REGISTER
732 
733 // NOTE: FP32 variant of copyBondData: the minimum-image shift is computed directly in single precision from the reciprocal lattice vectors instead of via lattice.wrap_delta_scaled
734 void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
735  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
736 
737  PatchMap* patchMap = PatchMap::Object();
738  float3 b1f, b2f, b3f;
739  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
740  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
741  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);
742 
743  for (int ituple=0;ituple < ntuples;ituple++) {
744  CudaBond dstval;
745  auto p0 = src[ituple].p[0];
746  auto p1 = src[ituple].p[1];
747  int pi0 = patchIndex[p0->patchID];
748  int pi1 = patchIndex[p1->patchID];
749  int l0 = src[ituple].localIndex[0];
750  int l1 = src[ituple].localIndex[1];
751  dstval.i = l0 + patches[pi0].atomStart;
752  dstval.j = l1 + patches[pi1].atomStart;
753  dstval.itype = (src[ituple].value - bond_array);
754 #if 0
755  Position position1 = p0->x[l0].position;
756  Position position2 = p1->x[l1].position;
757  Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
758  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
759 #endif
760  float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
761  float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
762  float3 diff = position1 - position2;
763  float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
764  float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
765  float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);
766  if(pi0 != pi1){
767  Vector c = patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
768  d1 += c.x;
769  d2 += c.y;
770  d3 += c.z;
771  }
772  dstval.ioffsetXYZ = make_float3(d1, d2, d3);
773  dstval.scale = src[ituple].scale;
774  if (hostAlchFlags.alchOn) {
775  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
776  const Molecule& mol = *(Node::Object()->molecule);
777  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 2);
778  } else {
779  dstval.fepBondedType = 0;
780  }
781  dst[ituple] = dstval;
782  }
783 }
784 
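//
// Convert AngleElem tuples into CudaAngle entries; same conventions as copyBondData, but
// with two image-shift vectors (i-j and k-j) since an angle spans three atoms.
//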
785 void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
786  const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst) {
787  PatchMap* patchMap = PatchMap::Object();
788  for (int ituple=0;ituple < ntuples;ituple++) {
789  CudaAngle dstval;
790  auto p0 = src[ituple].p[0];
791  auto p1 = src[ituple].p[1];
792  auto p2 = src[ituple].p[2];
793  int pi0 = patchIndex[p0->patchID];
794  int pi1 = patchIndex[p1->patchID];
795  int pi2 = patchIndex[p2->patchID];
796  int l0 = src[ituple].localIndex[0];
797  int l1 = src[ituple].localIndex[1];
798  int l2 = src[ituple].localIndex[2];
799  dstval.i = l0 + patches[pi0].atomStart;
800  dstval.j = l1 + patches[pi1].atomStart;
801  dstval.k = l2 + patches[pi2].atomStart;
802  dstval.itype = (src[ituple].value - angle_array);
803  Position position1 = p0->x[l0].position;
804  Position position2 = p1->x[l1].position;
805  Position position3 = p2->x[l2].position;
806  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
807  Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
808  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
809  if(pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
810 
811  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
812  dstval.koffsetXYZ = make_float3((float)shiftVec32.x, (float)shiftVec32.y, (float)shiftVec32.z);
813  dstval.scale = src[ituple].scale;
814  if (hostAlchFlags.alchOn) {
815  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
816  const Molecule& mol = *(Node::Object()->molecule);
817  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 3);
818  } else {
819  dstval.fepBondedType = 0;
820  }
821  dst[ituple] = dstval;
822  }
823 }
824 
825 #ifdef NODEGROUP_FORCE_REGISTER
826 template<>
827 void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src,
828  const AngleValue* __restrict__ p_array, CudaAngleStage& dstval) {
829 
830  auto p0 = src.p[0];
831  auto p1 = src.p[1];
832  auto p2 = src.p[2];
833  int pi0 = patchIndex[p0->patchID];
834  int pi1 = patchIndex[p1->patchID];
835  int pi2 = patchIndex[p2->patchID];
836  int l0 = src.localIndex[0];
837  int l1 = src.localIndex[1];
838  int l2 = src.localIndex[2];
839  dstval.itype = (src.value - p_array);
840 
841  dstval.patchIDs[0] = p0->patchID;
842  dstval.patchIDs[1] = p1->patchID;
843  dstval.patchIDs[2] = p2->patchID;
844 
845  dstval.index[0] = l0;
846  dstval.index[1] = l1;
847  dstval.index[2] = l2;
848 
849  dstval.scale = src.scale;
850  if (hostAlchFlags.alchOn) {
851  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
852  const Molecule& mol = *(Node::Object()->molecule);
853  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 3);
854  } else {
855  dstval.fepBondedType = 0;
856  }
857 }
858 #endif // NODEGROUP_FORCE_REGISTER
859 
860 //
861 // Used for both dihedrals and impropers
862 //
863 template <bool doDihedral, typename T, typename P>
864 void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
865  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
866 
867  PatchMap* patchMap = PatchMap::Object();
868 
869  for (int ituple=0;ituple < ntuples;ituple++) {
870  CudaDihedral dstval;
871  auto p0 = src[ituple].p[0];
872  auto p1 = src[ituple].p[1];
873  auto p2 = src[ituple].p[2];
874  auto p3 = src[ituple].p[3];
875  int pi0 = patchIndex[p0->patchID];
876  int pi1 = patchIndex[p1->patchID];
877  int pi2 = patchIndex[p2->patchID];
878  int pi3 = patchIndex[p3->patchID];
879  int l0 = src[ituple].localIndex[0];
880  int l1 = src[ituple].localIndex[1];
881  int l2 = src[ituple].localIndex[2];
882  int l3 = src[ituple].localIndex[3];
883  dstval.i = l0 + patches[pi0].atomStart;
884  dstval.j = l1 + patches[pi1].atomStart;
885  dstval.k = l2 + patches[pi2].atomStart;
886  dstval.l = l3 + patches[pi3].atomStart;
887  if (doDihedral) {
888  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
889  } else {
890  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
891  }
892  Position position1 = p0->x[l0].position;
893  Position position2 = p1->x[l1].position;
894  Position position3 = p2->x[l2].position;
895  Position position4 = p3->x[l3].position;
896  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
897  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
898  Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
899  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
900  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
901  if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
902 
903  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
904  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
905  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
906  dstval.scale = src[ituple].scale;
907  if (hostAlchFlags.alchOn) {
908  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
909  const Molecule& mol = *(Node::Object()->molecule);
910  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
911  } else {
912  dstval.fepBondedType = 0;
913  }
914  dst[ituple] = dstval;
915  }
916 }
917 
918 #ifdef NODEGROUP_FORCE_REGISTER
919 template <>
920 void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src,
921  const DihedralValue* __restrict__ p_array, CudaDihedralStage& dstval) {
922 
923  auto p0 = src.p[0];
924  auto p1 = src.p[1];
925  auto p2 = src.p[2];
926  auto p3 = src.p[3];
927  int pi0 = patchIndex[p0->patchID];
928  int pi1 = patchIndex[p1->patchID];
929  int pi2 = patchIndex[p2->patchID];
930  int pi3 = patchIndex[p3->patchID];
931  int l0 = src.localIndex[0];
932  int l1 = src.localIndex[1];
933  int l2 = src.localIndex[2];
934  int l3 = src.localIndex[3];
935  dstval.itype = dihedralMultMap[(src.value - p_array)];
936 
937  dstval.patchIDs[0] = p0->patchID;
938  dstval.patchIDs[1] = p1->patchID;
939  dstval.patchIDs[2] = p2->patchID;
940  dstval.patchIDs[3] = p3->patchID;
941 
942  dstval.index[0] = l0;
943  dstval.index[1] = l1;
944  dstval.index[2] = l2;
945  dstval.index[3] = l3;
946 
947  dstval.scale = src.scale;
948  if (hostAlchFlags.alchOn) {
949  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
950  const Molecule& mol = *(Node::Object()->molecule);
951  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
952  } else {
953  dstval.fepBondedType = 0;
954  }
955 }
956 
957 template <>
958 void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src,
959  const ImproperValue* __restrict__ p_array, CudaDihedralStage& dstval) {
960 
961  auto p0 = src.p[0];
962  auto p1 = src.p[1];
963  auto p2 = src.p[2];
964  auto p3 = src.p[3];
965  int pi0 = patchIndex[p0->patchID];
966  int pi1 = patchIndex[p1->patchID];
967  int pi2 = patchIndex[p2->patchID];
968  int pi3 = patchIndex[p3->patchID];
969  int l0 = src.localIndex[0];
970  int l1 = src.localIndex[1];
971  int l2 = src.localIndex[2];
972  int l3 = src.localIndex[3];
973  dstval.itype = improperMultMap[(src.value - p_array)];
974 
975  dstval.patchIDs[0] = p0->patchID;
976  dstval.patchIDs[1] = p1->patchID;
977  dstval.patchIDs[2] = p2->patchID;
978  dstval.patchIDs[3] = p3->patchID;
979 
980  dstval.index[0] = l0;
981  dstval.index[1] = l1;
982  dstval.index[2] = l2;
983  dstval.index[3] = l3;
984 
985  dstval.scale = src.scale;
986  if (hostAlchFlags.alchOn) {
987  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
988  const Molecule& mol = *(Node::Object()->molecule);
989  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
990  } else {
991  dstval.fepBondedType = 0;
992  }
993 }
994 
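//
// Generic staging helper: converts ntuples host tuples of type T into staging structs of
// type D via the copyTupleToStage specializations above and appends them to dst.
//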
995 template <typename T, typename P, typename D>
996  void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
997  const P* __restrict__ p_array, std::vector<D>& dst) {
998 
999  for (int ituple=0;ituple < ntuples;ituple++) {
1000  D dstval;
1001  copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
1002  dst.push_back(dstval);
1003  }
1004 }
1005 #endif // NODEGROUP_FORCE_REGISTER
1006 
1007 
1008 //
1009 // FP32 variant; used for both dihedrals and impropers
1010 //
1011 template <bool doDihedral, typename T, typename P>
1012 void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
1013  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
1014 
1015  PatchMap* patchMap = PatchMap::Object();
1016  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
1017  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
1018  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);
1019 
1020  for (int ituple=0;ituple < ntuples;ituple++) {
1021  CudaDihedral dstval;
1022  auto p0 = src[ituple].p[0];
1023  auto p1 = src[ituple].p[1];
1024  auto p2 = src[ituple].p[2];
1025  auto p3 = src[ituple].p[3];
1026  int pi0 = patchIndex[p0->patchID];
1027  int pi1 = patchIndex[p1->patchID];
1028  int pi2 = patchIndex[p2->patchID];
1029  int pi3 = patchIndex[p3->patchID];
1030  int l0 = src[ituple].localIndex[0];
1031  int l1 = src[ituple].localIndex[1];
1032  int l2 = src[ituple].localIndex[2];
1033  int l3 = src[ituple].localIndex[3];
1034  dstval.i = l0 + patches[pi0].atomStart;
1035  dstval.j = l1 + patches[pi1].atomStart;
1036  dstval.k = l2 + patches[pi2].atomStart;
1037  dstval.l = l3 + patches[pi3].atomStart;
1038  if (doDihedral) {
1039  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
1040  } else {
1041  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
1042  }
1043 #if 0
1044  Position position1 = p0->x[l0].position;
1045  Position position2 = p1->x[l1].position;
1046  Position position3 = p2->x[l2].position;
1047  Position position4 = p3->x[l3].position;
1048  Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
1049  Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
1050  Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
1051  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1052  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1053  if(pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
1054 
1055  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
1056  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
1057  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
1058 #endif
1059 
1060  float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
1061  float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
1062  float3 position3 = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
1063  float3 position4 = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);
1064 
1065  float3 diff12, diff23, diff43;
1066  diff12 = position1 - position2;
1067  diff23 = position2 - position3;
1068  diff43 = position4 - position3;
1069 
1070  float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
1071  float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
1072  float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);
1073 
1074  float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
1075  float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
1076  float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);
1077 
1078  float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
1079  float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
1080  float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);
1081 
1082  if(pi0 != pi1){
1083  Vector c = patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1084  d12_x += c.x;
1085  d12_y += c.y;
1086  d12_z += c.z;
1087  }
1088 
1089  if(pi1 != pi2){
1090  Vector c = patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1091  d23_x += c.x;
1092  d23_y += c.y;
1093  d23_z += c.z;
1094  }
1095 
1096  if(pi3 != pi2){
1097  Vector c = patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
1098  d43_x += c.x;
1099  d43_y += c.y;
1100  d43_z += c.z;
1101  }
1102 
1103  dstval.ioffsetXYZ = make_float3(d12_x, d12_y, d12_z);
1104  dstval.joffsetXYZ = make_float3(d23_x, d23_y, d23_z);
1105  dstval.loffsetXYZ = make_float3(d43_x, d43_y, d43_z);
1106 
1107  dstval.scale = src[ituple].scale;
1108  if (hostAlchFlags.alchOn) {
1109  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1110  const Molecule& mol = *(Node::Object()->molecule);
1111  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1112  } else {
1113  dstval.fepBondedType = 0;
1114  }
1115  dst[ituple] = dstval;
1116  }
1117 }
1118 
1119 
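//
// Convert ExclElem tuples into CudaExclusion entries, writing modified (scaled 1-4)
// exclusions to dst1 and full exclusions to dst2; pos and pos2 are advanced by typeSize
// bytes per tuple so the caller can track how much of the flat tuple buffer each list uses.
//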
1120 void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
1121  CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int64_t& pos, int64_t& pos2) {
1122 
1123  PatchMap* patchMap = PatchMap::Object();
1124  for (int ituple=0;ituple < ntuples;ituple++) {
1125  auto p0 = src[ituple].p[0];
1126  auto p1 = src[ituple].p[1];
1127  int pi0 = patchIndex[p0->patchID];
1128  int pi1 = patchIndex[p1->patchID];
1129  int l0 = src[ituple].localIndex[0];
1130  int l1 = src[ituple].localIndex[1];
1131  CompAtom& ca1 = p0->x[l0];
1132  CompAtom& ca2 = p1->x[l1];
1133  Position position1 = ca1.position;
1134  Position position2 = ca2.position;
1135  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
1136  if(pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1137  CudaExclusion ce;
1138  ce.i = l0 + patches[pi0].atomStart;
1139  ce.j = l1 + patches[pi1].atomStart;
1140  ce.vdwtypei = ca1.vdwType;
1141  ce.vdwtypej = ca2.vdwType;
1142  ce.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
1143  if (hostAlchFlags.alchOn) {
1144  const unsigned char p1 = ca1.partition;
1145  const unsigned char p2 = ca2.partition;
1146  ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];
1147  } else {
1148  ce.pswitch = 0;
1149  }
1150  if (src[ituple].modified) {
1151  *dst1 = ce;
1152  dst1++;
1153  pos += typeSize;
1154  } else {
1155  *dst2 = ce;
1156  dst2++;
1157  pos2 += typeSize;
1158  }
1159  }
1160 }
1161 
1162 #ifdef NODEGROUP_FORCE_REGISTER
1163 template<>
1164 void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src,
1165  const int* __restrict__ p_array, CudaExclusionStage& dstval) {
1166 
1167  auto p0 = src.p[0];
1168  auto p1 = src.p[1];
1169  int pi0 = patchIndex[p0->patchID];
1170  int pi1 = patchIndex[p1->patchID];
1171  int l0 = src.localIndex[0];
1172  int l1 = src.localIndex[1];
1173  CompAtom& ca1 = p0->x[l0];
1174  CompAtom& ca2 = p1->x[l1];
1175 
1176  dstval.vdwtypei = ca1.vdwType;
1177  dstval.vdwtypej = ca2.vdwType;
1178 
1179  dstval.patchIDs[0] = p0->patchID;
1180  dstval.patchIDs[1] = p1->patchID;
1181 
1182  dstval.index[0] = l0;
1183  dstval.index[1] = l1;
1184 
1185  if (hostAlchFlags.alchOn) {
1186  const unsigned char p1 = ca1.partition;
1187  const unsigned char p2 = ca2.partition;
1188  dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];
1189  } else {
1190  dstval.pswitch = 0;
1191  }
1192 }
1193 
1194 
1195 
1196 void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
1197  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {
1198 
1199  for (int ituple=0;ituple < ntuples;ituple++) {
1200  CudaExclusionStage ce;
1201  copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
1202  if (src[ituple].modified) {
1203  dst1.push_back(ce);
1204  pos += typeSize;
1205  } else {
1206  dst2.push_back(ce);
1207  pos2 += typeSize;
1208  }
1209  }
1210 }
1211 #endif // NODEGROUP_FORCE_REGISTER
1212 
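//
// Convert CrosstermElem tuples (two linked dihedrals, eight atoms) into CudaCrossterm
// entries with six inter-atom image shifts; for FEP the bonded type is derived from the
// partition sums of the two four-atom groups.
//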
1213 void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
1214  const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst) {
1215 
1216  PatchMap* patchMap = PatchMap::Object();
1217  for (int ituple=0;ituple < ntuples;ituple++) {
1218  auto p0 = src[ituple].p[0];
1219  auto p1 = src[ituple].p[1];
1220  auto p2 = src[ituple].p[2];
1221  auto p3 = src[ituple].p[3];
1222  auto p4 = src[ituple].p[4];
1223  auto p5 = src[ituple].p[5];
1224  auto p6 = src[ituple].p[6];
1225  auto p7 = src[ituple].p[7];
1226  int pi0 = patchIndex[p0->patchID];
1227  int pi1 = patchIndex[p1->patchID];
1228  int pi2 = patchIndex[p2->patchID];
1229  int pi3 = patchIndex[p3->patchID];
1230  int pi4 = patchIndex[p4->patchID];
1231  int pi5 = patchIndex[p5->patchID];
1232  int pi6 = patchIndex[p6->patchID];
1233  int pi7 = patchIndex[p7->patchID];
1234  int l0 = src[ituple].localIndex[0];
1235  int l1 = src[ituple].localIndex[1];
1236  int l2 = src[ituple].localIndex[2];
1237  int l3 = src[ituple].localIndex[3];
1238  int l4 = src[ituple].localIndex[4];
1239  int l5 = src[ituple].localIndex[5];
1240  int l6 = src[ituple].localIndex[6];
1241  int l7 = src[ituple].localIndex[7];
1242  dst[ituple].i1 = l0 + patches[pi0].atomStart;
1243  dst[ituple].i2 = l1 + patches[pi1].atomStart;
1244  dst[ituple].i3 = l2 + patches[pi2].atomStart;
1245  dst[ituple].i4 = l3 + patches[pi3].atomStart;
1246  dst[ituple].i5 = l4 + patches[pi4].atomStart;
1247  dst[ituple].i6 = l5 + patches[pi5].atomStart;
1248  dst[ituple].i7 = l6 + patches[pi6].atomStart;
1249  dst[ituple].i8 = l7 + patches[pi7].atomStart;
1250  dst[ituple].itype = (src[ituple].value - crossterm_array);
1251  Position position1 = p0->x[l0].position;
1252  Position position2 = p1->x[l1].position;
1253  Position position3 = p2->x[l2].position;
1254  Position position4 = p3->x[l3].position;
1255  Position position5 = p4->x[l4].position;
1256  Position position6 = p5->x[l5].position;
1257  Position position7 = p6->x[l6].position;
1258  Position position8 = p7->x[l7].position;
1259  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
1260  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
1261  Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
1262  Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
1263  Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
1264  Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
1265  if(pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1266  if(pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1267  if(pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
1268  if(pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
1269  if(pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
1270  if(pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
1271  dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
1272  dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
1273  dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
1274  dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
1275  dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
1276  dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
1277  if (hostAlchFlags.alchOn) {
1278  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1279  const Molecule& mol = *(Node::Object()->molecule);
1280  int typeSum1 = 0, typeSum2 = 0;
1281  for (size_t i = 0; i < 4; ++i) {
1282  typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
1283  typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
1284  }
1285  int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
1286  if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
1287  dst[ituple].fepBondedType = 1;
1288  } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
1289  dst[ituple].fepBondedType = 2;
1290  }
1291  } else {
1292  dst[ituple].fepBondedType = 0;
1293  }
1294  dst[ituple].scale = src[ituple].scale;
1295  }
1296 }
1297 
1298 #ifdef NODEGROUP_FORCE_REGISTER
1299 template <>
1300 void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src,
1301  const CrosstermValue* __restrict__ crossterm_array, CudaCrosstermStage& dstval) {
1302 
1303  auto p0 = src.p[0];
1304  auto p1 = src.p[1];
1305  auto p2 = src.p[2];
1306  auto p3 = src.p[3];
1307  auto p4 = src.p[4];
1308  auto p5 = src.p[5];
1309  auto p6 = src.p[6];
1310  auto p7 = src.p[7];
1311  int pi0 = patchIndex[p0->patchID];
1312  int pi1 = patchIndex[p1->patchID];
1313  int pi2 = patchIndex[p2->patchID];
1314  int pi3 = patchIndex[p3->patchID];
1315  int pi4 = patchIndex[p4->patchID];
1316  int pi5 = patchIndex[p5->patchID];
1317  int pi6 = patchIndex[p6->patchID];
1318  int pi7 = patchIndex[p7->patchID];
1319  int l0 = src.localIndex[0];
1320  int l1 = src.localIndex[1];
1321  int l2 = src.localIndex[2];
1322  int l3 = src.localIndex[3];
1323  int l4 = src.localIndex[4];
1324  int l5 = src.localIndex[5];
1325  int l6 = src.localIndex[6];
1326  int l7 = src.localIndex[7];
1327  dstval.itype = (src.value - crossterm_array);
1328 
1329  dstval.patchIDs[0] = p0->patchID;
1330  dstval.patchIDs[1] = p1->patchID;
1331  dstval.patchIDs[2] = p2->patchID;
1332  dstval.patchIDs[3] = p3->patchID;
1333  dstval.patchIDs[4] = p4->patchID;
1334  dstval.patchIDs[5] = p5->patchID;
1335  dstval.patchIDs[6] = p6->patchID;
1336  dstval.patchIDs[7] = p7->patchID;
1337 
1338  dstval.index[0] = l0;
1339  dstval.index[1] = l1;
1340  dstval.index[2] = l2;
1341  dstval.index[3] = l3;
1342  dstval.index[4] = l4;
1343  dstval.index[5] = l5;
1344  dstval.index[6] = l6;
1345  dstval.index[7] = l7;
1346 
1347  dstval.scale = src.scale;
1348  if (hostAlchFlags.alchOn) {
1349  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1350  const Molecule& mol = *(Node::Object()->molecule);
1351  int typeSum1 = 0, typeSum2 = 0;
1352  for (size_t i = 0; i < 4; ++i) {
1353  typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
1354  typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
1355  }
1356  int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
1357  if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
1358  dstval.fepBondedType = 1;
1359  } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
1360  dstval.fepBondedType = 2;
1361  }
1362  } else {
1363  dstval.fepBondedType = 0;
1364  }
1365 }
1366 #endif // NODEGROUP_FORCE_REGISTER
1367 
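//
// Convert TholeElem tuples (atom i, its Drude particle, atom j, its Drude particle) into
// CudaThole entries with the four pairwise image shifts between the two atom/Drude pairs;
// the aa and qq parameters are taken directly from the Thole parameter entry.
//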
1368 void ComputeBondedCUDA::copyTholeData(
1369  const int ntuples, const TholeElem* __restrict__ src,
1370  const TholeValue* __restrict__ thole_array, CudaThole* __restrict__ dst) {
1371  const PatchMap* patchMap = PatchMap::Object();
1372  for (int ituple=0;ituple < ntuples;ituple++) {
1373  const auto p0 = src[ituple].p[0];
1374  const auto p1 = src[ituple].p[1];
1375  const auto p2 = src[ituple].p[2];
1376  const auto p3 = src[ituple].p[3];
1377  const int pi0 = patchIndex[p0->patchID];
1378  const int pi1 = patchIndex[p1->patchID];
1379  const int pi2 = patchIndex[p2->patchID];
1380  const int pi3 = patchIndex[p3->patchID];
1381  const int l0 = src[ituple].localIndex[0];
1382  const int l1 = src[ituple].localIndex[1];
1383  const int l2 = src[ituple].localIndex[2];
1384  const int l3 = src[ituple].localIndex[3];
1385  dst[ituple].i = l0 + patches[pi0].atomStart;
1386  dst[ituple].j = l1 + patches[pi1].atomStart;
1387  dst[ituple].k = l2 + patches[pi2].atomStart;
1388  dst[ituple].l = l3 + patches[pi3].atomStart;
1389  // TODO: is this correct?
1390  const TholeValue * const(&value)(src[ituple].value);
1391  if (value == NULL) {
1392  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
1393  }
1394  dst[ituple].aa = value->aa;
1395  dst[ituple].qq = value->qq;
1396  // dst[ituple].itype = (src[ituple].value - thole_array);
1397  const Position position0 = p0->x[l0].position;
1398  const Position position1 = p1->x[l1].position;
1399  const Position position2 = p2->x[l2].position;
1400  const Position position3 = p3->x[l3].position;
1401  Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
1402  Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
1403  Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
1404  Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
1405  if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
1406  if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
1407  if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1408  if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
1409  dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
1410  dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
1411  dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
1412  dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
1413  if (hostAlchFlags.alchOn) {
1414  // THIS ASSUMES THAT AN ATOM AND ITS DRUDE PARTICLE ARE ALWAYS IN THE SAME
1415  // PARTITION!
1416  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1417  const Molecule *mol = Node::Object()->molecule;
1418  int typeSum = 0;
1419  typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
1420  typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
1421  const int order = (!params->alchDecouple ? 3 : 2);
1422  if (typeSum == 0 || abs(typeSum) == order) dst[ituple].fepBondedType = 0;
1423  else if (0 < typeSum && typeSum < order) dst[ituple].fepBondedType = 1;
1424  else if (-order < typeSum && typeSum < 0) dst[ituple].fepBondedType = 2;
1425  else dst[ituple].fepBondedType = 0;
1426  } else {
1427  dst[ituple].fepBondedType = 0;
1428  }
1429  }
1430 }
1431 
1432 #ifdef NODEGROUP_FORCE_REGISTER
1433 template <>
1434 void ComputeBondedCUDA::copyTupleToStage(const TholeElem& src,
1435  const TholeValue* __restrict__ thole_array, CudaTholeStage& dstval) {
1436  auto p0 = src.p[0];
1437  auto p1 = src.p[1];
1438  auto p2 = src.p[2];
1439  auto p3 = src.p[3];
1440  // int pi0 = patchIndex[p0->patchID];
1441  // int pi1 = patchIndex[p1->patchID];
1442  // int pi2 = patchIndex[p2->patchID];
1443  // int pi3 = patchIndex[p3->patchID];
1444  int l0 = src.localIndex[0];
1445  int l1 = src.localIndex[1];
1446  int l2 = src.localIndex[2];
1447  int l3 = src.localIndex[3];
1448  dstval.patchIDs[0] = p0->patchID;
1449  dstval.patchIDs[1] = p1->patchID;
1450  dstval.patchIDs[2] = p2->patchID;
1451  dstval.patchIDs[3] = p3->patchID;
1452  dstval.index[0] = l0;
1453  dstval.index[1] = l1;
1454  dstval.index[2] = l2;
1455  dstval.index[3] = l3;
1456  const TholeValue * const(&value)(src.value);
1457  if (value == NULL) {
1458  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
1459  }
1460  dstval.aa = value->aa;
1461  dstval.qq = value->qq;
1462  if (hostAlchFlags.alchOn) {
1463  // THIS ASSUMES THAT AN ATOM AND ITS DRUDE PARTICLE ARE ALWAYS IN THE SAME
1464  // PARTITION!
1465  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1466  const Molecule *mol = Node::Object()->molecule;
1467  int typeSum = 0;
1468  typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
1469  typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
1470  const int order = (!params->alchDecouple ? 3 : 2);
1471  if (typeSum == 0 || abs(typeSum) == order) dstval.fepBondedType = 0;
1472  else if (0 < typeSum && typeSum < order) dstval.fepBondedType = 1;
1473  else if (-order < typeSum && typeSum < 0) dstval.fepBondedType = 2;
1474  else dstval.fepBondedType = 0;
1475  } else {
1476  dstval.fepBondedType = 0;
1477  }
1478 }
1479 #endif
1480 
1481 void ComputeBondedCUDA::copyAnisoData(
1482  const int ntuples, const AnisoElem* __restrict src,
1483  const AnisoValue* __restrict aniso_array, CudaAniso* __restrict dst) {
1484  const PatchMap* patchMap = PatchMap::Object();
1485  for (int ituple=0;ituple < ntuples;ituple++) {
1486  const auto p0 = src[ituple].p[0];
1487  const auto p1 = src[ituple].p[1];
1488  const auto p2 = src[ituple].p[2];
1489  const auto p3 = src[ituple].p[3];
1490  const int pi0 = patchIndex[p0->patchID];
1491  const int pi1 = patchIndex[p1->patchID];
1492  const int pi2 = patchIndex[p2->patchID];
1493  const int pi3 = patchIndex[p3->patchID];
1494  const int l0 = src[ituple].localIndex[0];
1495  const int l1 = src[ituple].localIndex[1];
1496  const int l2 = src[ituple].localIndex[2];
1497  const int l3 = src[ituple].localIndex[3];
1498  dst[ituple].i = l0 + patches[pi0].atomStart;
1499  // Atom i's Drude particle, assumed to be in the same patch as i
1500  dst[ituple].j = l0 + 1 + patches[pi0].atomStart;
1501  dst[ituple].l = l1 + patches[pi1].atomStart;
1502  dst[ituple].m = l2 + patches[pi2].atomStart;
1503  dst[ituple].n = l3 + patches[pi3].atomStart;
1504  const AnisoValue * const(&value)(src[ituple].value);
1505  if (value == NULL) {
1506  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyAnisoData.\n");
1507  }
1508  dst[ituple].kpar0 = 2.0 * value->k11;
1509  dst[ituple].kperp0 = 2.0 * value->k22;
1510  dst[ituple].kiso0 = 2.0 * value->k33;
1511  const Position position0 = p0->x[l0].position;
1512  const Position position1 = p1->x[l1].position;
1513  const Position position2 = p2->x[l2].position;
1514  const Position position3 = p3->x[l3].position;
1515  Vector shiftVec_il = lattice.wrap_delta_scaled(position0, position1);
1516  Vector shiftVec_mn = lattice.wrap_delta_scaled(position2, position3);
1517  if (pi0 != pi1) shiftVec_il += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
1518  if (pi2 != pi3) shiftVec_mn += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
1519  dst[ituple].offset_il = make_float3((float)shiftVec_il.x, (float)shiftVec_il.y, (float)shiftVec_il.z);
1520  dst[ituple].offset_mn = make_float3((float)shiftVec_mn.x, (float)shiftVec_mn.y, (float)shiftVec_mn.z);
1521  if (hostAlchFlags.alchOn) {
1522  const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
1523  const Molecule& mol = *(Node::Object()->molecule);
1524  dst[ituple].fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1525  } else {
1526  dst[ituple].fepBondedType = 0;
1527  }
1528  }
1529 }
1530 
1531 #ifdef NODEGROUP_FORCE_REGISTER
1532 template<>
1533 void ComputeBondedCUDA::copyTupleToStage(const AnisoElem& src,
1534  const AnisoValue* __restrict__ p_array, CudaAnisoStage& dstval) {
1535  auto p0 = src.p[0];
1536  auto p1 = src.p[1];
1537  auto p2 = src.p[2];
1538  auto p3 = src.p[3];
1539  int l0 = src.localIndex[0];
1540  int l1 = src.localIndex[1];
1541  int l2 = src.localIndex[2];
1542  int l3 = src.localIndex[3];
1543  dstval.patchIDs[0] = p0->patchID;
1544  dstval.patchIDs[1] = p0->patchID;
1545  dstval.patchIDs[2] = p1->patchID;
1546  dstval.patchIDs[3] = p2->patchID;
1547  dstval.patchIDs[4] = p3->patchID;
1548  dstval.index[0] = l0;
1549  dstval.index[1] = l0 + 1;
1550  dstval.index[2] = l1;
1551  dstval.index[3] = l2;
1552  dstval.index[4] = l3;
1553  const AnisoValue * const(&value)(src.value);
1554  if (value == NULL) {
1555  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTupleToStage (Aniso).\n");
1556  }
1557  dstval.kpar0 = 2.0 * value->k11;
1558  dstval.kperp0 = 2.0 * value->k22;
1559  dstval.kiso0 = 2.0 * value->k33;
1560  if (hostAlchFlags.alchOn) {
1561  const AtomID (&atomID)[sizeof(src.atomID)/sizeof(AtomID)](src.atomID);
1562  const Molecule& mol = *(Node::Object()->molecule);
1563  dstval.fepBondedType = mol.get_fep_bonded_type(atomID, 4);
1564  } else {
1565  dstval.fepBondedType = 0;
1566  }
1567 }
1568 #endif
1569 
1570 void ComputeBondedCUDA::copyOneFourNbTholeData(
1571  const int ntuples, const OneFourNbTholeElem* __restrict src,
1572  const OneFourNbTholeValue* __restrict one_four_nbthole_array,
1573  CudaOneFourNbThole* __restrict dst) {
1574  const PatchMap* patchMap = PatchMap::Object();
1575  for (int ituple=0;ituple < ntuples;ituple++) {
1576  const auto p0 = src[ituple].p[0];
1577  const auto p1 = src[ituple].p[1];
1578  const auto p2 = src[ituple].p[2];
1579  const auto p3 = src[ituple].p[3];
1580  const int pi0 = patchIndex[p0->patchID];
1581  const int pi1 = patchIndex[p1->patchID];
1582  const int pi2 = patchIndex[p2->patchID];
1583  const int pi3 = patchIndex[p3->patchID];
1584  const int l0 = src[ituple].localIndex[0];
1585  const int l1 = src[ituple].localIndex[1];
1586  const int l2 = src[ituple].localIndex[2];
1587  const int l3 = src[ituple].localIndex[3];
1588  dst[ituple].i = l0 + patches[pi0].atomStart;
1589  dst[ituple].j = l1 + patches[pi1].atomStart;
1590  dst[ituple].k = l2 + patches[pi2].atomStart;
1591  dst[ituple].l = l3 + patches[pi3].atomStart;
1592  const OneFourNbTholeValue * const(&value)(src[ituple].value);
1593  if (value == NULL) {
1594  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyOneFourNbTholeData.\n");
1595  }
1596  dst[ituple].aa = value->aa;
1597  const Position position0 = p0->x[l0].position;
1598  const Position position1 = p1->x[l1].position;
1599  const Position position2 = p2->x[l2].position;
1600  const Position position3 = p3->x[l3].position;
1601  Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
1602  Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
1603  Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
1604  Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
1605  if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
1606  if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
1607  if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
1608  if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
1609  dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
1610  dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
1611  dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
1612  dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
1613  // if (hostAlchFlags.alchOn) {
1614  // // TODO: Alchemical transformation
1615  // }
1616  }
1617 }
1618 
1619 #ifdef NODEGROUP_FORCE_REGISTER
1620 template <>
1621 void ComputeBondedCUDA::copyTupleToStage(const OneFourNbTholeElem& src,
1622  const OneFourNbTholeValue* __restrict__ thole_array, CudaOneFourNbTholeStage& dstval) {
1623  auto p0 = src.p[0];
1624  auto p1 = src.p[1];
1625  auto p2 = src.p[2];
1626  auto p3 = src.p[3];
1627  // int pi0 = patchIndex[p0->patchID];
1628  // int pi1 = patchIndex[p1->patchID];
1629  // int pi2 = patchIndex[p2->patchID];
1630  // int pi3 = patchIndex[p3->patchID];
1631  int l0 = src.localIndex[0];
1632  int l1 = src.localIndex[1];
1633  int l2 = src.localIndex[2];
1634  int l3 = src.localIndex[3];
1635  dstval.patchIDs[0] = p0->patchID;
1636  dstval.patchIDs[1] = p1->patchID;
1637  dstval.patchIDs[2] = p2->patchID;
1638  dstval.patchIDs[3] = p3->patchID;
1639  dstval.index[0] = l0;
1640  dstval.index[1] = l1;
1641  dstval.index[2] = l2;
1642  dstval.index[3] = l3;
1643  const OneFourNbTholeValue * const(&value)(src.value);
1644  if (value == NULL) {
1645  NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTupleToStage (OneFourNbThole).\n");
1646  }
1647  dstval.aa = value->aa;
1648  // if (hostAlchFlags.alchOn) {
1649  // // TODO: Alchemical transformation
1650  // }
1651 }
1652 #endif // NODEGROUP_FORCE_REGISTER
1653 
1654 void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
1655  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
1656  c->tupleCopyWorker(first, last);
1657 }
1658 
1659 void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
1660  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
1661  c->tupleCopyWorkerExcl(first, last);
1662 }
1663 
1664 void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {
1665 
1666  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1667  // Separate exclusions into modified, and non-modified
1668  auto itStart=std::next(tupleList[Tuples::EXCLUSION].begin(),first);
1669  //compute offset for this rank
1670  int64_t numModExclusionsBefore=0;
1671  int64_t numExclusionsBefore=0;
1672  // Rescanning all preceding tuple lists is the slowest way to compute these offsets, but it is correct
1673  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != itStart;it++) {
1674  int ntuples = (*it)->getNumTuples();
1675  auto thisExcl = (ExclElem *)(*it)->getTupleList();
1676  for (int ituple=0;ituple < ntuples;ituple++) {
1677  if(thisExcl[ituple].modified)
1678  numModExclusionsBefore++;
1679  else
1680  numExclusionsBefore++;
1681  }
1682  }
1683 
1684  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
1685  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
1686  int numExclusionsWork=last-first;
1687  auto itEnd=std::next(itStart,numExclusionsWork+1);
1688  for ( auto it=itStart;it != itEnd;it++) {
1689  int ntuples = (*it)->getNumTuples();
1690  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
1691  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
1692  }
1693  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1694 }
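// A possible alternative to the rescan above, sketched by the editor (not compiled):
// tally the modified/non-modified counts per tuple list once, take exclusive prefix
// sums, and let each chunk read its starting offsets directly. The names below
// (countsMod, countsNonMod) are illustrative and do not exist in NAMD.
#if 0
#include <cstdint>
#include <numeric>
#include <vector>
static void sketchExclusionOffsets(const std::vector<int64_t>& countsMod,
                                   const std::vector<int64_t>& countsNonMod,
                                   std::vector<int64_t>& offsetMod,
                                   std::vector<int64_t>& offsetNonMod) {
  offsetMod.assign(countsMod.size() + 1, 0);
  offsetNonMod.assign(countsNonMod.size() + 1, 0);
  std::partial_sum(countsMod.begin(), countsMod.end(), offsetMod.begin() + 1);
  std::partial_sum(countsNonMod.begin(), countsNonMod.end(), offsetNonMod.begin() + 1);
  // offsetMod[i] is the number of modified exclusions in all lists before list i,
  // so a worker whose range starts at list i would begin writing at
  //   exclusionStartPos  + offsetMod[i]    * CudaTupleTypeSize[Tuples::EXCLUSION]
  //   exclusionStartPos2 + offsetNonMod[i] * CudaTupleTypeSize[Tuples::EXCLUSION]
}
#endif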
1695 
1696 void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
1697 
1698  if (first == -1) {
1699  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1700  // Separate exclusions into modified, and non-modified
1701  int64_t pos = exclusionStartPos;
1702  int64_t pos2 = exclusionStartPos2;
1703  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
1704  int ntuples = (*it)->getNumTuples();
1705  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
1706  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
1707  }
1708  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY_EXCL);
1709  first = 0;
1710  }
1711  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1712  // JM: Move the switch statement outside and do the for loop inside the cases
1713  for (int i=first;i <= last;i++) {
1714  switch (tupleCopyWorkList[i].tupletype) {
1715 
1716  case Tuples::BOND:
1717  {
1718 #if 1
1719  copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
1720  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1721 #else
1722  copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
1723  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1724 #endif
1725  }
1726  break;
1727 
1728  case Tuples::ANGLE:
1729  {
1730  copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
1731  Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1732  }
1733  break;
1734 
1735  case Tuples::DIHEDRAL:
1736  {
1737 
1738 #if 1
1739  copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
1740  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
1741  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1742 #else
1743  copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
1744  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
1745  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1746 #endif
1747  }
1748  break;
1749 
1750  case Tuples::IMPROPER:
1751  {
1752  copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
1753  (ImproperElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->improper_array,
1754  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1755  }
1756  break;
1757 
1758  case Tuples::CROSSTERM:
1759  {
1760  copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
1761  Node::Object()->parameters->crossterm_array, (CudaCrossterm *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1762  }
1763  break;
1764 
1765  case Tuples::THOLE:
1766  {
1767  copyTholeData(tupleCopyWorkList[i].ntuples, (TholeElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaThole *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1768  }
1769  break;
1770 
1771  case Tuples::ANISO:
1772  {
1773  copyAnisoData(tupleCopyWorkList[i].ntuples, (AnisoElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaAniso*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1774  }
1775  break;
1776 
1777  case Tuples::ONEFOURNBTHOLE:
1778  {
1779  copyOneFourNbTholeData(tupleCopyWorkList[i].ntuples, (OneFourNbTholeElem*)tupleCopyWorkList[i].tupleElemList, nullptr, (CudaOneFourNbThole*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
1780  }
1781  break;
1782 
1783  default:
1784  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
1785  break;
1786  }
1787  }
1788  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
1789 }
1790 
1791 
1792 #ifdef NODEGROUP_FORCE_REGISTER
1793 
1794 void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {
1795  // Atomically compute the max number of tuples across all GPUs.
1796  // Each GPU will use this, scaled by OVERALLOC, as its buffer size to keep reallocation simple
1797 
1798  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1799  PatchData *patchData = cpdata.ckLocalBranch();
1800 
1801  // Update bonds
1802  int numBondsTest = patchData->maxNumBonds.load();
1803  while (numBondsTest < counts.bond &&
1804  !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));
1805 
1806  int numAnglesTest = patchData->maxNumAngles.load();
1807  while (numAnglesTest < counts.angle &&
1808  !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));
1809 
1810  int numDihedralsTest = patchData->maxNumDihedrals.load();
1811  while (numDihedralsTest < counts.dihedral &&
1812  !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));
1813 
1814  int numImpropersTest = patchData->maxNumImpropers.load();
1815  while (numImpropersTest < counts.improper &&
1816  !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));
1817 
1818  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
1819  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
1820  !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));
1821 
1822  int numExclusionsTest = patchData->maxNumExclusions.load();
1823  while (numExclusionsTest < counts.exclusion &&
1824  !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));
1825 
1826  int numCrosstermsTest = patchData->maxNumCrossterms.load();
1827  while (numCrosstermsTest < counts.crossterm &&
1828  !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
1829 
1830 
1831 }
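// The compare_exchange loops above implement a lock-free "atomic max": each caller
// publishes its local tuple count and only larger values replace the stored maximum.
// Editor's sketch (not compiled) of the idiom on its own, assuming only <atomic>:
#if 0
#include <atomic>
static void sketchAtomicMax(std::atomic<int>& target, int candidate) {
  int observed = target.load();
  // compare_exchange_strong reloads 'observed' on failure, so the loop retries
  // until the stored value is already >= candidate or the exchange succeeds.
  while (observed < candidate &&
         !target.compare_exchange_strong(observed, candidate));
}
#endif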
1832 
1833 TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
1834  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1835  PatchData *patchData = cpdata.ckLocalBranch();
1836 
1837  TupleCounts counts{0};
1838 
1839  counts.bond = patchData->maxNumBonds.load();
1840  counts.angle = patchData->maxNumAngles.load();
1841  counts.dihedral = patchData->maxNumDihedrals.load();
1842  counts.improper = patchData->maxNumImpropers.load();
1843  counts.modifiedExclusion = patchData->maxNumModifiedExclusions.load();
1844  counts.exclusion = patchData->maxNumExclusions.load();
1845  counts.crossterm = patchData->maxNumCrossterms.load();
1846 
1847 #if 0
1848  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",
1849  CkMyPe(), counts.bond, counts.angle, counts.dihedral, counts.improper, counts.modifiedExclusion,
1850  counts.exclusion, counts.crossterm);
1851 #endif
1852 
1853  return counts;
1854 }
1855 
1856 
1897 void ComputeBondedCUDA::migrateTuples(bool startup) {
1898 
1899  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1900  PatchData *patchData = cpdata.ckLocalBranch();
1902  const int devInd = deviceCUDA->getDeviceIndex();
1903 
1904  bool MGPU = deviceCUDA->getNumDevice() > 1;
1905  bool amMaster = (masterPe == CkMyPe());
1907 
1908 
1909  if (amMaster) {
1910  cudaStreamSynchronize(stream);
1911  copyPatchData();
1912  cudaStreamSynchronize(stream);
1913  }
1915 
1916  if (amMaster) {
1917  if (!startup) {
1918  PatchMap* patchMap = PatchMap::Object();
1919  const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];
1920  // Update the device of the tuples
1921  // DMC this also updates the patches of the atoms, so it needs to be done in the single GPU case too
1922 
1923  TupleCounts counts = bondedKernel.getTupleCounts();
1924  migrationKernel.computeTupleDestination(
1925  devInd,
1926  counts,
1927  patchData->devData[devInd].numPatchesHome,
1928  migrationDestination,
1929  patchData->devData[devInd].d_patchToDeviceMap,
1930  patchData->devData[devInd].d_globalToLocalID,
1931  patchMap->getADim(), patchMap->getBDim(),
1932  patchMap->getCMaxIndex(), patchMap->getBMaxIndex(), patchMap->getAMaxIndex(),
1933  stream
1934  );
1935  cudaCheck(cudaStreamSynchronize(stream));
1936  }
1937  }
1939 
1940  if (amMaster) {
1941  if (!startup) {
1942  migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
1943  cudaCheck(cudaStreamSynchronize(stream));
1944  }
1945  }
1947 
1948  TupleCounts local, newMax;
1949  bool realloc = false;
1950 
1951  if (amMaster && !startup) {
1952  migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);
1953 
1954  local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
1955  updateMaxTupleCounts(local);
1956 #if 0
1957  CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",
1958  CkMyPe(), local.bond, local.angle, local.dihedral, local.improper, local.modifiedExclusion,
1959  local.exclusion, local.crossterm);
1960 #endif
1961  }
1963  if (amMaster && !startup) {
1964  // Only multi-GPU needs to check for reallocation
1965  if (MGPU) {
1966  newMax = getMaxTupleCounts();
1967  realloc = migrationKernel.reallocateBufferDst(newMax);
1968  patchData->tupleReallocationFlagPerDevice[devInd] = realloc;
1969  }
1970  }
1972 
1973  //
1974  // Non-master PEs need to see if reallocation is happening
1975  // They can all use the 0th GPU flag since the GPUs reallocate
1976  // based on the max
1977  if (!amMaster) {
1978  realloc = patchData->tupleReallocationFlagPerDevice[0];
1979  }
1980 
1981  // If buffers were reallocated, then we need to re-register the P2P pointers.
1982  // Since the GPUs size their buffers from the max tuple counts,
1983  // if one GPU reallocated, then they all did.
1984 
1985  if (realloc) {
1986  if (amMaster) {
1987  registerPointersToHost();
1988  }
1990 
1991  if (amMaster) {
1992  copyHostRegisterToDevice();
1993  cudaCheck(cudaStreamSynchronize(stream));
1994  }
1996  }
1997 
1998  if (amMaster && !startup) {
1999  // Moves the tuples to the dst buffer on all GPUS
2000  migrationKernel.performTupleMigration(
2001  bondedKernel.getTupleCounts(),
2002  stream
2003  );
2004  cudaCheck(cudaStreamSynchronize(stream));
2005  }
2006  syncColl->barrier(SynchronousCollectiveScope::all); // DMC believes a barrier is needed here because of the reallocation
2007 
2008  if (amMaster && !startup) {
2009  // If we previously reallocated, we need to reallocate the other buffers too.
2010  // The regular tuples must be "reallocated" every time so that the offsets into
2011  // the unified buffer can be updated. The unified buffer made sense when we
2012  // were copying the tuples from the host on each migration, but it may no longer
2013  // be worth keeping since it complicates reallocation
2014  if (MGPU) {
2015  bondedKernel.reallocateTupleBuffer(newMax, stream);
2016  migrationKernel.reallocateBufferSrc(newMax);
2017  cudaCheck(cudaStreamSynchronize(stream));
2018  }
2019  }
2021 
2022  // Update the local tuple count and copy the migrated tuples back to the
2023  // src buffer
2024  if (amMaster) {
2025  if (!startup) {
2026  bondedKernel.setTupleCounts(local);
2027  }
2028 
2029  const int* ids = patchData->h_soa_id[devInd];
2030  migrationKernel.updateTuples(
2031  bondedKernel.getTupleCounts(),
2032  bondedKernel.getData(),
2033  ids,
2034  d_patchRecord,
2035  d_patchMapCenter,
2036  bondedKernel.getAtomBuffer(),
2037  lattice,
2038  stream
2039  );
2040  cudaCheck(cudaDeviceSynchronize());
2041  }
2042 }
2043 
2050 template<typename T>
2051 void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {
2052  PatchMap* patchMap = PatchMap::Object();
2053 
2054  std::vector<std::pair<int,int>> downstreamPatches;
2055  for (int i = 0; i < tuples.size(); i++) {
2056  int downstream = tuples[i].patchIDs[0];
2057  for (int j = 1; j < T::size; j++) {
2058  downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);
2059  }
2060  downstreamPatches.push_back(std::make_pair(i, downstream));
2061  tupleCounts[patchIndex[downstream]]++;
2062  }
2063 
2064  //
2065  // Compute the offset into the tuples based on patch
2066  // TODO replace with STD prefix sum
2067  tupleOffsets[0] = 0;
2068  for (int i = 0; i < tupleCounts.size(); i++) {
2069  tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];
2070  }
2071 
2072  //
2073  // Sort tuples based on home patch
2074  //
2075  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
2076  [](std::pair<int, int> a, std::pair<int, int> b) {
2077  return a.second < b.second;
2078  });
2079 
2080  //
2081  // Propagate order to tuples
2082  //
2083  std::vector<T> copy = tuples;
2084  for (int i = 0; i < tuples.size(); i++) {
2085  tuples[i] = copy[downstreamPatches[i].first];
2086  }
2087 }
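// The offset loop above is the exclusive prefix sum mentioned in the TODO; with
// <numeric> it can be written directly. Editor's sketch (not compiled), assuming
// tupleOffsets has tupleCounts.size() + 1 entries as in sortAndCopyToDevice below:
#if 0
#include <numeric>
#include <vector>
static void sketchTupleOffsets(const std::vector<int>& tupleCounts,
                               std::vector<int>& tupleOffsets) {
  // tupleOffsets[i] = total number of tuples in patches 0..i-1;
  // the final entry is the grand total.
  tupleOffsets[0] = 0;
  std::partial_sum(tupleCounts.begin(), tupleCounts.end(), tupleOffsets.begin() + 1);
}
#endif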
2088 
2089 void ComputeBondedCUDA::sortAndCopyToDevice() {
2090  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2091  PatchData *patchData = cpdata.ckLocalBranch();
2092  const int devInd = deviceCUDA->getDeviceIndex();
2093  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;
2094 
2095  TupleDataStage h_dataStage;
2096  TupleIntArrays h_counts;
2097  TupleIntArrays h_offsets;
2098 
2099  //
2100  // These cannot be declared within the macro because then they
2101  // would be out of scope
2102  //
2103  std::vector<int> bondPatchCounts(numPatchesHome, 0);
2104  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);
2105 
2106  std::vector<int> anglePatchCounts(numPatchesHome, 0);
2107  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);
2108 
2109  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
2110  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);
2111 
2112  std::vector<int> improperPatchCounts(numPatchesHome, 0);
2113  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);
2114 
2115  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
2116  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);
2117 
2118  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
2119  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);
2120 
2121  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
2122  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);
2123 
2124  #define CALL(fieldName) do { \
2125  sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
2126  h_dataStage.fieldName = fieldName##TupleData.data(); \
2127  h_counts.fieldName = fieldName##PatchCounts.data(); \
2128  h_offsets.fieldName = fieldName##PatchOffsets.data(); \
2129  } while(0);
2130 
2131  CALL(bond);
2132  CALL(angle);
2133  CALL(dihedral);
2134  CALL(improper);
2135  CALL(modifiedExclusion);
2136  CALL(exclusion);
2137  CALL(crossterm);
2138 
2139  #undef CALL
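  // For reference, CALL(bond) above expands to:
  //   do {
  //     sortTupleList(bondTupleData, bondPatchCounts, bondPatchOffsets);
  //     h_dataStage.bond = bondTupleData.data();
  //     h_counts.bond = bondPatchCounts.data();
  //     h_offsets.bond = bondPatchOffsets.data();
  //   } while(0);
  // i.e. each tuple type is sorted by downstream patch and its host pointers are
  // gathered into the staging structs passed to copyTupleToDevice below.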
2140 
2141  migrationKernel.copyTupleToDevice(
2142  bondedKernel.getTupleCounts(),
2143  numPatchesHome,
2144  h_dataStage,
2145  h_counts,
2146  h_offsets,
2147  stream
2148  );
2149 
2150  cudaCheck(cudaStreamSynchronize(stream));
2151 }
2152 
2160 void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {
2161  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
2162 
2163  switch (tupletype) {
2164  case Tuples::EXCLUSION:
2165  {
2166  // Separate exclusions into modified, and non-modified
2167  modifiedExclusionTupleData.clear();
2168  exclusionTupleData.clear();
2169  int64_t pos = exclusionStartPos;
2170  int64_t pos2 = exclusionStartPos2;
2171  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
2172  int ntuples = (*it)->getNumTuples();
2173  copyExclusionDataStage(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSizeStage[Tuples::EXCLUSION],
2174  modifiedExclusionTupleData, exclusionTupleData, pos, pos2);
2175  }
2176  // Reserve warp-aligned capacity for the copy.
2177  // TODO DMC is this necessary? Why are we copying the padded part too?
2178  modifiedExclusionTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(modifiedExclusionTupleData.size()));
2179  exclusionTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(exclusionTupleData.size()));
2180  }
2181  break;
2182 
2183  case Tuples::BOND:
2184  {
2185  bondTupleData.clear();
2186  for (auto it = tupleList[Tuples::BOND].begin();it != tupleList[Tuples::BOND].end();it++) {
2187  int ntuples = (*it)->getNumTuples();
2188  BondElem* elemList = (BondElem *)(*it)->getTupleList();
2189  copyToStage<BondElem, BondValue, CudaBondStage>(
2190  ntuples, elemList, Node::Object()->parameters->bond_array, bondTupleData
2191  );
2192  }
2193 
2194  bondTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(bondTupleData.size()));
2195  }
2196  break;
2197 
2198  case Tuples::ANGLE:
2199  {
2200  angleTupleData.clear();
2201  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2202  int ntuples = (*it)->getNumTuples();
2203  AngleElem* elemList = (AngleElem *)(*it)->getTupleList();
2204  copyToStage<AngleElem, AngleValue, CudaAngleStage>(ntuples, elemList, Node::Object()->parameters->angle_array, angleTupleData);
2205  }
2206 
2207  angleTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(angleTupleData.size()));
2208  }
2209  break;
2210 
2211  case Tuples::DIHEDRAL:
2212  {
2213  dihedralTupleData.clear();
2214  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2215  int ntuples = (*it)->getNumTuples();
2216  DihedralElem* elemList = (DihedralElem *)(*it)->getTupleList();
2217  copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList,
2218  Node::Object()->parameters->dihedral_array, dihedralTupleData);
2219  }
2220 
2221  dihedralTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(dihedralTupleData.size()));
2222  }
2223  break;
2224 
2225  case Tuples::IMPROPER:
2226  {
2227  improperTupleData.clear();
2228  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2229  int ntuples = (*it)->getNumTuples();
2230  ImproperElem* elemList = (ImproperElem *)(*it)->getTupleList();
2231  copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList,
2232  Node::Object()->parameters->improper_array, improperTupleData);
2233  }
2234 
2235  improperTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(improperTupleData.size()));
2236  }
2237  break;
2238 
2239  case Tuples::CROSSTERM:
2240  {
2241  crosstermTupleData.clear();
2242  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2243  int ntuples = (*it)->getNumTuples();
2244  CrosstermElem* elemList = (CrosstermElem *)(*it)->getTupleList();
2245  copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList,
2246  Node::Object()->parameters->crossterm_array, crosstermTupleData);
2247  }
2248 
2249  crosstermTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(crosstermTupleData.size()));
2250  }
2251  break;
2252 
2253  case Tuples::THOLE:
2254  {
2255  tholeTupleData.clear();
2256  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2257  const int ntuples = (*it)->getNumTuples();
2258  TholeElem* elemList = (TholeElem*)(*it)->getTupleList();
2259  copyToStage<TholeElem, TholeValue, CudaTholeStage>(ntuples, elemList, nullptr, tholeTupleData);
2260  }
2261  tholeTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(tholeTupleData.size()));
2262  }
2263  break;
2264 
2265  case Tuples::ANISO:
2266  {
2267  anisoTupleData.clear();
2268  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2269  const int ntuples = (*it)->getNumTuples();
2270  AnisoElem* elemList = (AnisoElem*)(*it)->getTupleList();
2271  copyToStage<AnisoElem, AnisoValue, CudaAnisoStage>(ntuples, elemList, nullptr, anisoTupleData);
2272  }
2273  anisoTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(anisoTupleData.size()));
2274  }
2275  break;
2276 
2277  case Tuples::ONEFOURNBTHOLE:
2278  {
2279  oneFourNbTholeTupleData.clear();
2280  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2281  const int ntuples = (*it)->getNumTuples();
2282  OneFourNbTholeElem* elemList = (OneFourNbTholeElem*)(*it)->getTupleList();
2283  copyToStage<OneFourNbTholeElem, OneFourNbTholeValue, CudaOneFourNbTholeStage>(ntuples, elemList, nullptr, oneFourNbTholeTupleData);
2284  }
2285  oneFourNbTholeTupleData.reserve(ComputeBondedCUDAKernel::warpAlign(oneFourNbTholeTupleData.size()));
2286  }
2287  break;
2288 
2289  default:
2290  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
2291  break;
2292  }
2293 
2294  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_TUPLE_COPY);
2295 }
2296 #endif // NODEGROUP_FORCE_REGISTER
2297 
2298 //
2299 // Copies tuple data from individual buffers to a single contiguous buffer
2300 // NOTE: This is done on the master PE
2301 //
2302 
2303 void ComputeBondedCUDA::copyTupleData() {
2304 
2305  PatchMap* patchMap = PatchMap::Object();
2306 
2307  // Count the number of exclusions
2308  int numModifiedExclusions = 0;
2309  int numExclusions = 0;
2310  for (int i=0;i < numExclPerRank.size();i++) {
2311  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2312  numExclusions += numExclPerRank[i].numExclusions;
2313  }
2314  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2315  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2316 
2317  // Count the number of tuples for each type
2318  int64_t posWA = 0;
2319  exclusionStartPos = 0;
2320  exclusionStartPos2 = 0;
2321  tupleCopyWorkList.clear();
2322  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2323  // Take temporary position
2324  int64_t pos = posWA;
2325  if (tupletype == Tuples::EXCLUSION) {
2326  exclusionStartPos = pos;
2327  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
2328  }
2329  // Count for total number of tuples for this tupletype
2330  int num = 0;
2331  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2332  int ntuples = (*it)->getNumTuples();
2333  num += ntuples;
2334  if (tupletype != Tuples::EXCLUSION) {
2335  TupleCopyWork tupleCopyWork;
2336  tupleCopyWork.tupletype = tupletype;
2337  tupleCopyWork.ntuples = ntuples;
2338  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2339  tupleCopyWork.tupleDataPos = pos;
2340  // XXX NOTE: redundant copy happening here
2341  tupleCopyWorkList.push_back(tupleCopyWork);
2342  pos += ntuples*CudaTupleTypeSize[tupletype];
2343  }
2344  }
2345  numTuplesPerType[tupletype] = num;
2346  //
2347  if (tupletype == Tuples::EXCLUSION) {
2348  // Warp-align exclusions separately
2349  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
2350  } else {
2351  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2352  }
2353  }
2354  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2355  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2356  }
2357 
2358  // Set flags for finishPatchesOnPe
2359  hasExclusions = (numExclusions > 0);
2360  hasModifiedExclusions = (numModifiedExclusions > 0);
2361 
2362  // Re-allocate storage as needed
2363  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2364  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2365 
2366 #if CMK_SMP && USE_CKLOOP
2367  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2368  if (useCkLoop >= 1) {
2369 #define CKLOOP_EXCLUSIONS 1
2370 #if CKLOOP_EXCLUSIONS
2371  CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);
2372  // pass 0 instead of -1 so tupleCopyWorker doesn't do exclusions
2373  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);
2374 #else
2375  // JM NOTE: What this does is divide the work into chunks of CkMyNodeSize to be scheduled
2376  // among PEs. To circumvent this, I have to call it by hand on all PEs
2377  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
2378 #endif
2379  } else
2380 
2381 #endif
2382  {
2383 
2384  tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
2385  }
2386 
2387  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
2388  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
2389  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
2390  numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
2391  numTuplesPerType[Tuples::ONEFOURNBTHOLE], tupleData, stream);
2392 
2393  // Re-allocate forces
2394  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2395  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2396 }
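// The layout built above packs every tuple type back to back in one host buffer,
// rounding each type's count up to a warp multiple so the per-type regions stay
// warp aligned on the device. Editor's sketch (not compiled) of the offset
// arithmetic, assuming a warp size of 32 (the real rounding comes from
// ComputeBondedCUDAKernel::warpAlign); the counts and element sizes below are
// hypothetical:
#if 0
#include <cstdint>
static int64_t sketchWarpAlign(int64_t n, int64_t warp = 32) {
  return ((n + warp - 1) / warp) * warp;  // round count up to a warp multiple
}
// Example with counts {100, 7} and element sizes {32, 48} bytes:
//   type 0 starts at offset 0 and reserves sketchWarpAlign(100)*32 = 128*32 bytes,
//   type 1 therefore starts at offset 4096 and reserves sketchWarpAlign(7)*48 = 32*48 bytes.
#endif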
2397 
2398 // JM: Single node version to bypass CkLoop
2399 #ifdef NODEGROUP_FORCE_REGISTER
2400 
2401 void ComputeBondedCUDA::copyTupleDataSN() {
2402 
2403  PatchMap* patchMap = PatchMap::Object();
2404  size_t numExclusions, numModifiedExclusions, copyIndex;
2406 
2407  // WORK TO BE DONE BY THE MASTERPE
2408  if(masterPe == CkMyPe()){
2409  numModifiedExclusions = 0;
2410  numExclusions = 0;
2411  for (int i=0;i < numExclPerRank.size();i++) {
2412  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2413  numExclusions += numExclPerRank[i].numExclusions;
2414  }
2415  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2416  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2417 
2418  // Count the number of tuples for each type
2419  int64_t posWA = 0;
2420  exclusionStartPos = 0;
2421  exclusionStartPos2 = 0;
2422  tupleCopyWorkList.clear();
2423  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2424  // Take temporary position
2425  int64_t pos = posWA;
2426  if (tupletype == Tuples::EXCLUSION) {
2427  exclusionStartPos = pos;
2428  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
2429  }
2430  // Count for total number of tuples for this tupletype
2431  int num = 0;
2432  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2433  int ntuples = (*it)->getNumTuples();
2434  num += ntuples;
2435  if (tupletype != Tuples::EXCLUSION) {
2436  TupleCopyWork tupleCopyWork;
2437  tupleCopyWork.tupletype = tupletype;
2438  tupleCopyWork.ntuples = ntuples;
2439  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2440  tupleCopyWork.tupleDataPos = pos;
2441  tupleCopyWorkList.push_back(tupleCopyWork);
2442  pos += ntuples*CudaTupleTypeSize[tupletype];
2443  }
2444  }
2445  numTuplesPerType[tupletype] = num;
2446  //
2447  if (tupletype == Tuples::EXCLUSION) {
2448  // Warp-align exclusions separately
2449  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
2450  } else {
2451  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2452  }
2453  }
2454  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2455  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2456  }
2457 
2458  // Set flags for finishPatchesOnPe
2459  hasExclusions = (numExclusions > 0);
2460  hasModifiedExclusions = (numModifiedExclusions > 0);
2461 
2462  // Re-allocate storage as needed
2463  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2464  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2465  }
2466 
2468 
2469 #if 0
2470  //int chunkSize = (tupleCopyWorkList.size()) /CkMyNodeSize() +
2471  // (CkMyPe() == CkMyNodeSize()-1) * tupleCopyWorkList.size() % CkMyNodeSize();
2472  //int first = ((CkMyNodeSize() -1) - CkMyPe())*chunkSize -1*(CkMyPe() == CkMyNodeSize()-1);
2473  //int last = (first + chunkSize) -1*(!CkMyPe());; // chunksize is padded for the last pe
2474  this->tupleCopyWorker(first, last);
2475 #else
2476  // JM: This probably leads to better load balance than the option above.
2477  // XXX TODO: Improve this further afterwards. I don't think this scales too well
2478  // due to lots of atomics, but you're not supposed to run this with
2479  // lots of PEs anyways
2480  while( (copyIndex = tupleWorkIndex.fetch_add(1) ) <= tupleCopyWorkList.size()){
2481  this->tupleCopyWorker(copyIndex -1, copyIndex -1);
2482  }
2483 #endif
2485 
2486  if(masterPe == CkMyPe()){
2487  tupleWorkIndex.store(0);
2488  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
2489  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
2490  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
2491  numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
2492  numTuplesPerType[Tuples::ONEFOURNBTHOLE],
2493  tupleData, stream);
2494 
2495  // Re-allocate forces
2496  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2497  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2498  }
2499 }
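// The fetch_add loop above is dynamic self-scheduling: each PE repeatedly claims the
// next unclaimed ticket, with the first ticket mapping to tupleCopyWorker(-1, -1)
// (the exclusion pass). Editor's sketch (not compiled) of the same pattern in
// isolation, assuming only <atomic>; names are illustrative:
#if 0
#include <atomic>
#include <cstddef>
static void sketchSelfSchedule(std::atomic<size_t>& nextItem, size_t numItems,
                               void (*doItem)(size_t)) {
  size_t ticket;
  // One atomic increment per item; cheap for modest PE counts, which matches the
  // scaling caveat noted above.
  while ((ticket = nextItem.fetch_add(1)) < numItems) {
    doItem(ticket);
  }
}
#endif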
2500 
2501 
2509 void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {
2510 
2511  PatchMap* patchMap = PatchMap::Object();
2513  size_t numExclusions, numModifiedExclusions, copyIndex;
2514 
2515  // WORK TO BE DONE BY THE MASTERPE
2516  if (startup) {
2517  // The atom map is no longer needed, so unmap the atoms. We need to do this in case
2518  // multiple run commands are issued. It is safe to do this because there is a node
2519  // barrier between loadTuplesOnPe and this function
2520  unmapAtoms();
2521 
2522  if(masterPe == CkMyPe()){
2523  numModifiedExclusions = 0;
2524  numExclusions = 0;
2525  for (int i=0;i < numExclPerRank.size();i++) {
2526  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
2527  numExclusions += numExclPerRank[i].numExclusions;
2528  }
2529  size_t numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
2530  size_t numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
2531 
2532  // Count the number of tuples for each type
2533  size_t posWA = 0;
2534  exclusionStartPos = 0;
2535  exclusionStartPos2 = 0;
2536  tupleCopyWorkList.clear();
2537  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2538  // Take temporary position
2539  size_t pos = posWA;
2540  if (tupletype == Tuples::EXCLUSION) {
2541  exclusionStartPos = pos;
2542  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];
2543  }
2544  // Count for total number of tuples for this tupletype
2545  int num = 0;
2546  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
2547  int ntuples = (*it)->getNumTuples();
2548  num += ntuples;
2549  if (tupletype != Tuples::EXCLUSION) {
2550  TupleCopyWork tupleCopyWork;
2551  tupleCopyWork.tupletype = tupletype;
2552  tupleCopyWork.ntuples = ntuples;
2553  tupleCopyWork.tupleElemList = (*it)->getTupleList();
2554  tupleCopyWork.tupleDataPos = pos;
2555  tupleCopyWorkList.push_back(tupleCopyWork);
2556  pos += ntuples*CudaTupleTypeSizeStage[tupletype];
2557  }
2558  }
2559  numTuplesPerType[tupletype] = num;
2560  //
2561  if (tupletype == Tuples::EXCLUSION) {
2562  // Warp-align exclusions separately
2563  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];
2564  } else {
2565  posWA += ((int64_t) ComputeBondedCUDAKernel::warpAlign(num))*CudaTupleTypeSize[tupletype];
2566  }
2567  }
2568  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
2569  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
2570  }
2571 
2572  // Set flags for finishPatchesOnPe
2573  hasExclusions = (numExclusions > 0);
2574  hasModifiedExclusions = (numModifiedExclusions > 0);
2575 
2576  // Re-allocate storage as needed
2577  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
2578  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
2579 
2580 
2581  TupleCounts local;
2582  local.bond = numTuplesPerType[Tuples::BOND];
2583  local.angle = numTuplesPerType[Tuples::ANGLE];
2584  local.dihedral = numTuplesPerType[Tuples::DIHEDRAL];
2585  local.improper = numTuplesPerType[Tuples::IMPROPER];
2586  local.modifiedExclusion = numModifiedExclusions;
2587  local.exclusion = numExclusions;
2588  local.crossterm = numTuplesPerType[Tuples::CROSSTERM];
2589  local.thole = numTuplesPerType[Tuples::THOLE];
2590  local.aniso = numTuplesPerType[Tuples::ANISO];
2591  local.oneFourNbThole = numTuplesPerType[Tuples::ONEFOURNBTHOLE];
2592 
2593  updateMaxTupleCounts(local);
2594  bondedKernel.setTupleCounts(local);
2595  }
2596 
2597  // Wait for tuple counts to be updated...
2599 
2600  if(masterPe == CkMyPe()){
2601  TupleCounts newMax = getMaxTupleCounts();
2602  migrationKernel.reallocateBufferSrc(newMax);
2603  migrationKernel.reallocateBufferDst(newMax);
2604  bondedKernel.reallocateTupleBuffer(newMax, stream);
2605  registerPointersToHost();
2606  }
2607 
2608  // Node barrier is needed because of the pointer registration
2610 
2611  if(masterPe == CkMyPe()){
2612  copyHostRegisterToDevice();
2613  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2614  this->tupleCopyWorkerType(tupletype);
2615  }
2616  sortAndCopyToDevice();
2617  }
2618  }
2619 
2620  migrateTuples(startup);
2621 
2622  if(masterPe == CkMyPe()){
2623  TupleCounts count = bondedKernel.getTupleCounts();
2624  numTuplesPerType[Tuples::BOND] = count.bond;
2625  numTuplesPerType[Tuples::ANGLE] = count.angle;
2626  numTuplesPerType[Tuples::DIHEDRAL] = count.dihedral;
2627  numTuplesPerType[Tuples::IMPROPER] = count.improper;
2628  numModifiedExclusions = count.modifiedExclusion;
2629  numExclusions = count.exclusion;
2630  numTuplesPerType[Tuples::CROSSTERM] = count.crossterm;
2631  numTuplesPerType[Tuples::THOLE] = count.thole;
2632  numTuplesPerType[Tuples::ANISO] = count.aniso;
2633  numTuplesPerType[Tuples::ONEFOURNBTHOLE] = count.oneFourNbThole;
2634 
2635  // Set flags for finishPatchesOnPe
2636  hasExclusions = (numExclusions > 0);
2637  hasModifiedExclusions = (numModifiedExclusions > 0);
2638 
2639  // Re-allocate forces
2640  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2641  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2642  }
2643 }
2644 
2645 #endif
2646 
2647 
2656 #ifdef NODEGROUP_FORCE_REGISTER
2657 void ComputeBondedCUDA::updatePatchRecords() {
2658  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2659  PatchData *patchData = cpdata.ckLocalBranch();
2660  int devInd = deviceCUDA->getDeviceIndex();
2661  PatchRecord **d_pr = &(patchData->devData[devInd].bond_pr);
2662  int **d_pid = &(patchData->devData[devInd].bond_pi);
2663  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
2664  size_t st_d_pr_size = (size_t)(patchData->devData[devInd].bond_pr_size);
2665  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
2666  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size()); // st_d_pr_size changed to patches.size() here
2667  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size); // copy updated value back into devData
2668  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
2669  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size); // copy updated value back into devData
2670  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
2671  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
2672  if (params->CUDASOAintegrate && params->useDeviceMigration) {
2673  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2674  }
2675 }
2676 #endif
2677 
2678 //
2679 // Launch work on GPU
2680 //
2681 void ComputeBondedCUDA::launchWork() {
2683  if (CkMyPe() != masterPe)
2684  NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
2685 
2686  cudaCheck(cudaSetDevice(deviceID));
2687  if (atomsChanged) {
2688  if(!params->CUDASOAintegrate) copyTupleData();
2689  // with CUDASOAintegrate, copyTupleData must already have been called before launchWork
2690 
2691 // Move this once GPU migration has been merged
2692 #if 1
2693 #ifdef NODEGROUP_FORCE_REGISTER
2694  if(params->CUDASOAintegrate && !params->useDeviceMigration){
2695  updatePatchRecords();
2696  }
2697 #endif
2698 #endif
2699  }
2700 
2701  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
2702  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
2703  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
2704 
2705  int r2_delta_expc = 64 * (ComputeNonbondedUtil::r2_delta_exp - 127);
2706 
2707 #if 0
2708  if(!atomsChanged){
2709  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2710  PatchData *patchData = cpdata.ckLocalBranch();
2711 
2712  float4 *d_atoms = bondedKernel.getAtomBuffer();
2713  copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);
2714  if(!params->CUDASOAintegrate) CmiLock(printLock);
2715  else CmiLock(patchData->printlock);
2716 
2717  fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);
2718 
2719  for(int i = 0 ; i < atomStorageSize; i++){
2720  fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
2721  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);
2722  }
2723 
2724  if(!params->CUDASOAintegrate) CmiUnlock(printLock);
2725  else CmiUnlock(patchData->printlock);
2726  }
2727 #endif
2728 
2729  const CudaNBConstants nbConstants = CudaComputeNonbonded::getNonbondedCoef(params);
2730  const bool doTable = CudaComputeNonbonded::getDoTable(params, doSlow, doVirial);
2731 
2732  bondedKernel.bondedForce(
2734  atomStorageSize,
2735  doEnergy, doVirial, doSlow, doTable,
2736  lata, latb, latc,
2738  (float)ComputeNonbondedUtil::r2_delta, r2_delta_expc,
2739  nbConstants,
2740  (const float4*)atoms, forces,
2741  energies_virials, atomsChanged, params->CUDASOAintegrate,
2742  params->useDeviceMigration, stream);
2743 
2744 
2745 #ifdef NODEGROUP_FORCE_REGISTER
2746  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2747  PatchData *patchData = cpdata.ckLocalBranch();
2748  int devInd = deviceCUDA->getDeviceIndex();
2749  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
2750  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2751  }
2752  patchData->devData[devInd].f_bond = bondedKernel.getForces();
2753  patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
2754  patchData->devData[devInd].f_bond_slow = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
2755  patchData->devData[devInd].f_bond_size = bondedKernel.getForceSize(atomStorageSize);
2756 #endif
2757 
2758 #if 0
2759  if(!params->CUDASOAintegrate) CmiLock(printLock);
2760  else CmiLock(patchData->printlock);
2761  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2762  // synchronously copy array back to the host
2763  copy_DtoH_sync(bondedKernel.getForces(), forces, forceStorageSize);
2764  fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
2765  for(int i = 0; i < forceStorageSize; i++){
2766  // prints entire forces datastructure
2767  fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);
2768  }
2769  if(!params->CUDASOAintegrate) CmiUnlock(printLock);
2770  else CmiUnlock(patchData->printlock);
2771 #endif
2772  forceDoneSetCallback();
2773 }
2774 
2775 void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
2776  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
2777 
2778  if (CkMyPe() != c->masterPe)
2779  NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
2780 
2781  cudaCheck(cudaSetDevice(c->deviceID));
2782 
2783  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
2784  if (err == cudaSuccess) {
2785  // Event has occurred
2786  c->checkCount = 0;
2787  traceUserBracketEvent(CUDA_BONDED_KERNEL_EVENT, c->beforeForceCompute, walltime);
2788  c->finishPatches();
2789  return;
2790  } else if (err != cudaErrorNotReady) {
2791  // Anything else is an error
2792  char errmsg[256];
2793  sprintf(errmsg,"in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
2794  c->checkCount, walltime - c->beforeForceCompute);
2795  cudaDie(errmsg,err);
2796  }
2797 
2798  // Event has not occurred
2799  c->checkCount++;
2800  if (c->checkCount >= 1000000) {
2801  char errmsg[256];
2802  sprintf(errmsg,"ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
2803  c->checkCount, walltime - c->beforeForceCompute);
2804  cudaDie(errmsg,err);
2805  }
2806 
2807  // Call again
2808  CcdCallBacksReset(0, walltime);
2809  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
2810 }
2811 
2812 //
2813 // Set call back for all the work in the stream at this point
2814 //
2815 void ComputeBondedCUDA::forceDoneSetCallback() {
2816  if (CkMyPe() != masterPe)
2817  NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
2818  cudaCheck(cudaSetDevice(deviceID));
2819  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
2820  checkCount = 0;
2821  CcdCallBacksReset(0, CmiWallTimer());
2822  // Start timer for CUDA kernel
2823  beforeForceCompute = CkWallTimer();
2824  // Set the call back at 0.1ms
2825  if(!params->CUDASOAintegrate) CcdCallFnAfter(forceDoneCheck, this, 0.1);
2826 }
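// forceDoneSetCallback records forceDoneEvent behind the bonded kernel and, in the
// non-CUDASOAintegrate path, arms a Charm++ timer that calls forceDoneCheck, which
// polls the event instead of blocking the PE. Editor's sketch (not compiled) of the
// polling step, using only the standard CUDA runtime calls seen above:
#if 0
#include <cuda_runtime.h>
// Returns true once the recorded event has completed; any error other than
// cudaErrorNotReady should be reported as fatal, as forceDoneCheck does.
static bool sketchPollEvent(cudaEvent_t doneEvent, cudaError_t& err) {
  err = cudaEventQuery(doneEvent);
  return (err == cudaSuccess);
}
#endif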
2827 
2828 template <bool sumNbond, bool sumSlow>
2829 void finishForceLoop(const int numAtoms, const int forceStride,
2830  const double* __restrict__ af,
2831  const double* __restrict__ af_nbond,
2832  const double* __restrict__ af_slow,
2833  Force* __restrict__ f,
2834  Force* __restrict__ f_nbond,
2835  Force* __restrict__ f_slow) {
2836 
2837 #if 0
2838  for (int j=0;j < numAtoms;j++) {
2839  {
2840  //double afx, afy, afz;
2841  //convertForceToDouble(af + j, forceStride, afx, afy, afz);
2842  //f[j].x += af[j];
2843  //f[j].y += af[j + forceStride];
2844  //f[j].z += af[j + forceStride*2];
2845  f[j].x += af[j];
2846  f[j].y += af[j + forceStride];
2847  f[j].z += af[j + forceStride*2];
2848  }
2849  if (sumNbond)
2850  {
2851  //double afx, afy, afz;
2852  //convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
2853  f_nbond[j].x += af_nbond[j];
2854  f_nbond[j].y += af_nbond[j + forceStride];
2855  f_nbond[j].z += af_nbond[j + forceStride*2];
2856 
2857  }
2858  if (sumSlow)
2859  {
2860  //double afx, afy, afz;
2861  //convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
2862  f_slow[j].x += af_slow[j];
2863  f_slow[j].y += af_slow[j + forceStride];
2864  f_slow[j].z += af_slow[j + forceStride*2];
2865 
2866  }
2867  }
2868 #endif
2869 
2870 #if 0
2871  for(int j=0; j < numAtoms; j++){
2872  f[j].x += af[j];
2873  f[j].y += af[j+ forceStride];
2874  f[j].z += af[j+ 2*forceStride];
2875  }
2876  if(sumNbond){
2877  for(int j=0; j < numAtoms; j++){
2878  f_nbond[j].x += af_nbond[j];
2879  f_nbond[j].y += af_nbond[j + forceStride];
2880  f_nbond[j].z += af_nbond[j + 2*forceStride];
2881  }
2882  }
2883  if(sumSlow){
2884  for(int j=0; j < numAtoms; j++){
2885  f_slow[j].x += af_slow[j];
2886  f_slow[j].y += af_slow[j + forceStride];
2887  f_slow[j].z += af_slow[j + 2*forceStride];
2888  }
2889  }
2890 #endif
2891  // XXX Summing into AOS buffer from SOA data layout
2892  // Try to reduce cache misses by separating loops.
2893  for(int j=0; j < numAtoms; j++) f[j].x += af[j];
2894  for(int j=0; j < numAtoms; j++) f[j].y += af[j+ forceStride];
2895  for(int j=0; j < numAtoms; j++) f[j].z += af[j+ 2*forceStride];
2896 
2897  if(sumNbond){
2898 #ifdef DEBUG_MINIMIZE
2899  int k=0;
2900  printf("%s, line %d\n", __FILE__, __LINE__);
2901  printf(" before: f_nbond[%d] = %f %f %f\n",
2902  k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2903 #endif
2904  for(int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
2905  for(int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
2906  for(int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
2907 #ifdef DEBUG_MINIMIZE
2908  printf(" after: f_nbond[%d] = %f %f %f\n",
2909  k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2910 #endif
2911  }
2912  if(sumSlow){
2913  for(int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
2914  for(int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
2915  for(int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];
2916  }
2917 #if 0
2918  for(int j = 0; j < numAtoms; j++){
2919  fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
2920  af[j], af[j + forceStride], af[j + 2*forceStride],
2921  af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
2922  af_slow[j], af_slow[j + forceStride], af_slow[j + 2*forceStride]);
2923  }
2924 #endif
2925 }
2926 
2927 //
2928 // Finish all patches that are on this pe
2929 //
2930 void ComputeBondedCUDA::finishPatchesOnPe() {
2931  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
2932 
2933  PatchMap* patchMap = PatchMap::Object();
2934  int myRank = CkMyRank();
2935 
2936  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
2937  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
2938  const bool sumNbond = hasModifiedExclusions;
2939  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
2940 
2941  // I need to print patchIDsPerRank
2942  for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
2943  PatchID patchID = patchIDsPerRank[myRank][i];
2944  Patch* patch = patchMap->patch(patchID);
2945  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
2946  if (tpe == NULL) {
2947  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
2948  }
2949 
2950  int pi = patchIndex[patchID];
2951  int numAtoms = patches[pi].numAtoms;
2952  int atomStart = patches[pi].atomStart;
2953 
2954  // XXX TODO: this is a large CPU bottleneck
2956 #ifdef NODEGROUP_FORCE_REGISTER
2957  double *af = forces + atomStart;
2958  double *af_nbond = forces + forceSize + atomStart;
2959  double *af_slow = forces + 2*forceSize + atomStart;
2960  // XXX TODO: We still need forces inside patches for migration steps it seems
2961  if(!params->CUDASOAintegrate || (atomsChanged && !params->useDeviceMigration) ){
2962  Force *f = tpe->f;
2963  Force *f_nbond = tpe->r->f[Results::nbond];
2964  Force *f_slow = tpe->r->f[Results::slow];
2965  if (!sumNbond && !sumSlow) {
2966  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2967  } else if (sumNbond && !sumSlow) {
2968  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2969  } else if (!sumNbond && sumSlow) {
2970  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2971  } else if (sumNbond && sumSlow) {
2972  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2973  } else {
2974  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2975  }
2976  tpe->forceBox->close(&tpe->r);
2977  tpe->positionBox->close(&tpe->x);
2978  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
2979  }
2980 #else
2981  Force *f = tpe->f;
2982  Force *f_nbond = tpe->r->f[Results::nbond];
2983  Force *f_slow = tpe->r->f[Results::slow];
2984 
2985  double *af = forces + atomStart;
2986  double *af_nbond = forces + forceSize + atomStart;
2987  double *af_slow = forces + 2*forceSize + atomStart;
2988 
2989  if (!sumNbond && !sumSlow) {
2990  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2991  } else if (sumNbond && !sumSlow) {
2992  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2993  } else if (!sumNbond && sumSlow) {
2994  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2995  } else if (sumNbond && sumSlow) {
2996  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2997  } else {
2998  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2999  }
3000 
3001  tpe->forceBox->close(&tpe->r);
3002  tpe->positionBox->close(&tpe->x);
3003  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
3004 
3005 #endif
3006 // #endif
3007 
3008  }
3009 
3010 
3011  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
3012 
3013  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
3014 
3015  bool done = false;
3016  CmiLock(lock);
3017  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
3018  if(params->CUDASOAintegrate && !atomsChanged){
3019  // masterPe is executing this, so the reductions can be done right away.
3020  // For migration steps, however, the usual codepath must still be
3021  // followed because of the box setup.
3022  patchesCounter = 0;
3023  }
3024  if (patchesCounter == 0) {
3025  patchesCounter = getNumPatches();
3026  done = true;
3027  }
3028  CmiUnlock(lock);
3029  if (done) {
3030  //computeMgr->sendFinishReductions(masterPe, this);
3031  if(params->CUDASOAintegrate ) {
3032  if(!atomsChanged) this->finishReductions();
3033  }
3034  else computeMgr->sendFinishReductions(masterPe, this);
3035  }
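  // Editorial sketch of the completion pattern above (myPatchCount is a
  // hypothetical stand-in for this rank's patch count): the rank that drives
  // the shared counter to zero re-arms it and triggers the reductions once.
#if 0
  CmiLock(lock);
  patchesCounter -= myPatchCount;
  const bool last = (patchesCounter == 0);
  if (last) patchesCounter = getNumPatches();
  CmiUnlock(lock);
  if (last) computeMgr->sendFinishReductions(masterPe, this);
#endif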
3036 
3037  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
3038 
3039 }
3040 
3041 void ComputeBondedCUDA::finishPatches() {
3042 
3043  if (atomsChanged && (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
3044  unmapAtoms();
3045  }
3046  // XXX check whether the stream needs to be synchronized here
3047  //computeMgr->sendFinishPatchesOnPe(pes, this);
3048  if(params->CUDASOAintegrate) this->finishPatchesOnPe();
3049  else{
3050  computeMgr->sendFinishPatchesOnPe(pes, this);
3051  }
3052 }
3053 
3054 //
3055 // Finish & submit reductions
3056 //
3057 void ComputeBondedCUDA::finishReductions() {
3058 
3059  if (CkMyPe() != masterPe)
3060  NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
3061  SubmitReduction *reduction = getCurrentReduction();
3062 
3063  //fprintf(stderr, "finishReductions\n");
3064 
3065  // static int ncall = 0;
3066  // ncall++;
3067 
3068  int pos = 0;
3069  // XXX check whether this streamSynchronize is needed here
3070  if(params->CUDASOAintegrate) {
3071  cudaCheck(cudaStreamSynchronize(stream));
3072  }
3073  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
3074  if (numTuplesPerType[tupletype] > 0) {
3075 
3076  if (doEnergy) {
3077  switch (tupletype) {
3078  case Tuples::BOND:
3079  reduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
3080  if (hostAlchFlags.alchFepOn) {
3082  }
3083  if (hostAlchFlags.alchThermIntOn) {
3086  }
3087  break;
3088 
3089  case Tuples::ANGLE:
3091  if (hostAlchFlags.alchFepOn) {
3093  }
3094  if (hostAlchFlags.alchThermIntOn) {
3097  }
3098  break;
3099 
3100  case Tuples::DIHEDRAL:
3102  if (hostAlchFlags.alchFepOn) {
3104  }
3105  if (hostAlchFlags.alchThermIntOn) {
3108  }
3109  break;
3110 
3111  case Tuples::IMPROPER:
3113  if (hostAlchFlags.alchFepOn) {
3115  }
3116  if (hostAlchFlags.alchThermIntOn) {
3119  }
3120  break;
3121 
3122  case Tuples::EXCLUSION:
3124  reduction->item(REDUCTION_LJ_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
3126  if (hostAlchFlags.alchFepOn) {
3128  reduction->item(REDUCTION_LJ_ENERGY_F) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ_F];
3130  }
3131  if (hostAlchFlags.alchThermIntOn) {
3138  }
3139  break;
3140 
3141  case Tuples::CROSSTERM:
3143  if (hostAlchFlags.alchFepOn) {
3145  }
3146  if (hostAlchFlags.alchThermIntOn) {
3149  }
3150  break;
3151 
3152  case Tuples::THOLE: {
3154  if (hostAlchFlags.alchFepOn) {
3156  }
3157  if (hostAlchFlags.alchThermIntOn) {
3160  }
3161  break;
3162  }
3163 
3164  case Tuples::ANISO: {
3166  if (hostAlchFlags.alchFepOn) {
3168  }
3169  if (hostAlchFlags.alchThermIntOn) {
3172  }
3173  break;
3174  }
3175 
3176  case Tuples::ONEFOURNBTHOLE: {
3178  if (hostAlchFlags.alchFepOn) {
3180  }
3181  if (hostAlchFlags.alchThermIntOn) {
3184  }
3185  break;
3186  }
3187 
3188  default:
3189  NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
3190  break;
3191  }
3192  }
3193 
3194  auto it = tupleList[tupletype].begin();
3195  if (!params->CUDASOAintegrate) {
3196  (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
3197  }
3198  }
3199  }
3200 
3201  if (doVirial) {
3202 #ifndef WRITE_FULL_VIRIALS
3203 #error "non-WRITE_FULL_VIRIALS not implemented"
3204 #endif
3205 
3206  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
3207  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
3208  ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW, energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
3209  ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
3210  // NOTE: AMD_DIHE virial is also added to NORMAL virial.
3211  // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
3212  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
3213  }
3214 
3215  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
3216 
3217  reduction->submit();
3218 }
3219 
3220 //
3221 // Can only be called by master PE
3222 //
3223 void ComputeBondedCUDA::initialize() {
3224 #ifdef NODEGROUP_FORCE_REGISTER
3225  tupleWorkIndex.store(0);
3226 #endif
3227 
3228  if (CkMyPe() != masterPe)
3229  NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
3230 
3231  // Build list of PEs
3232  for (int rank=0;rank < computes.size();rank++) {
3233  if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
3234  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3235  }
3236  }
3237 
3238  // Return if no work to do
3239  if (pes.size() == 0) return;
3240 
3241  initializeCalled = false;
3242  cudaCheck(cudaSetDevice(deviceID));
3243 
3244 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
3245  int leastPriority, greatestPriority;
3246  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
3247  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
3248 #else
3249  cudaCheck(cudaStreamCreate(&stream));
3250 #endif
3251  cudaCheck(cudaEventCreate(&forceDoneEvent));
3252  lock = CmiCreateLock();
3253  printLock = CmiCreateLock();
3254  if(Node::Object()->simParameters->CUDASOAintegrateMode) {
3255  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
3256  }
3257  reductionGpuOffload = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
3258 
3259  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3260  PatchData *patchData = cpdata.ckLocalBranch();
3261 
3262  PatchMap* patchMap = PatchMap::Object();
3263  // First, assign all patches in self computes.
3264  // NOTE: These never overlap between PEs. No proxies added.
3265  for (int rank=0;rank < computes.size();rank++) {
3266  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3267  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3268  for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
3269  if (!tuplePatchList.find( TuplePatchElem(*jt) ) ) {
3270  tuplePatchList.add( TuplePatchElem(*jt) );
3271  patchIDsPerRank[rank].push_back(*jt);
3272  allPatchIDs.push_back(*jt);
3273  }
3274  }
3275  }
3276  }
3277 
3278  // Second, assign all patches in home computes.
3279  // NOTE: The ranks always have these patches. No proxies added.
3280  for (int rank=0;rank < computes.size();rank++) {
3281  HomeCompute& homeCompute = computes[rank].homeCompute;
3282  std::vector<int>& patchIDs = homeCompute.patchIDs;
3283  for (int i=0;i < patchIDs.size();i++) {
3284  int patchID = patchIDs[i];
3285  if (!tuplePatchList.find( TuplePatchElem(patchID) ) ) {
3286  tuplePatchList.add( TuplePatchElem(patchID) );
3287  patchIDsPerRank[rank].push_back(patchID);
3288  allPatchIDs.push_back(patchID);
3289  }
3290  }
3291  }
3292 
3293  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
3294  // Find neighbors that are not added yet
3295  std::vector<int> neighborPids;
3296  for (int rank=0;rank < computes.size();rank++) {
3298  HomeCompute& homeCompute = computes[rank].homeCompute;
3299  std::vector<int>& patchIDs = homeCompute.patchIDs;
3300  for (int i=0;i < patchIDs.size();i++) {
3301  int patchID = patchIDs[i];
3302  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
3303  for (int j=0;j < numNeighbors;j++) {
3304  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
3305  neighborPids.push_back(neighbors[j]);
3306  }
3307  }
3308  }
3309  }
3310  // Remove duplicates from neighborPids
3311  {
3312  std::sort(neighborPids.begin(), neighborPids.end());
3313  auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
3314  neighborPids.resize(std::distance(neighborPids.begin(), it_end));
3315  }
3316  // Assign neighbors to the PEs on this node that have them
3317  for (int i=0;i < neighborPids.size();i++) {
3318  for (int rank=0;rank < computes.size();rank++) {
3319  int pid = neighborPids[i];
3320  int pe = rank + CkNodeFirst(CkMyNode());
3321  if (patchMap->node(pid) == pe) {
3322  // Patch pid found on PE "pe" on this node
3323  tuplePatchList.add( TuplePatchElem(pid) );
3324  patchIDsPerRank[rank].push_back(pid);
3325  allPatchIDs.push_back(pid);
3326  // Add to this rank's patches
3327  patchIDsToAppend[rank].push_back(pid);
3328  // Add to the list of PEs
3329  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3330  break;
3331  }
3332  }
3333  }
3334  // Remove duplicates from pes
3335  {
3336  std::sort(pes.begin(), pes.end());
3337  auto it_end = std::unique(pes.begin(), pes.end());
3338  pes.resize(std::distance(pes.begin(), it_end));
3339  }
3340 
3341  // Last, assign all patches in neighbors of home computes
3342  // NOTE: Will create proxies on multiple nodes
3343  for (int rank=0;rank < computes.size();rank++) {
3345  HomeCompute& homeCompute = computes[rank].homeCompute;
3346  std::vector<int>& patchIDs = homeCompute.patchIDs;
3347  std::vector<int> neighborPatchIDs;
3348  for (int i=0;i < patchIDs.size();i++) {
3349  int patchID = patchIDs[i];
3350  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
3351  for (int j=0;j < numNeighbors;j++) {
3352  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
3353  // Patch not found => Add Proxy
3354  tuplePatchList.add( TuplePatchElem(neighbors[j]) );
3355  patchIDsPerRank[rank].push_back(neighbors[j]);
3356  allPatchIDs.push_back(neighbors[j]);
3357  }
3358  if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
3359  && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
3360  neighborPatchIDs.push_back(neighbors[j]);
3361  }
3362  }
3363  }
3364  // Append neighboring patchIDs to homeCompute.patchIDs
3365  // int start = patchIDs.size();
3366  // patchIDs.resize(patchIDs.size() + neighborPatchIDs.size());
3367  // for (int i=0;i < neighborPatchIDs.size();i++) {
3368  // patchIDs[start + i] = neighborPatchIDs[i];
3369  // }
3370  for (int i=0;i < neighborPatchIDs.size();i++) {
3371  patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
3372  }
3373  }
3374 
3375  for (int rank=0;rank < patchIDsToAppend.size();rank++) {
3376  for (int i=0;i < patchIDsToAppend[rank].size();i++) {
3377  computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
3378  }
3379  }
3380 
3381  // Remove duplicate patch IDs
3382  {
3383  std::sort(allPatchIDs.begin(), allPatchIDs.end());
3384  auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
3385  allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
3386  }
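  // The three "remove duplicates" blocks above use the same
  // sort + std::unique idiom; a minimal sketch as a hypothetical helper
  // (not part of NAMD):
#if 0
  auto removeDuplicates = [](std::vector<int>& v) {
    std::sort(v.begin(), v.end());
    v.erase(std::unique(v.begin(), v.end()), v.end());
  };
#endif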
3387 
3388  // Set number of (unique) patches
3389  setNumPatches(allPatchIDs.size());
3390 
3391  // Reset patchesCounter
3392  patchesCounter = getNumPatches();
3393 
3394  patches.resize(getNumPatches());
3395  // Setup tupleList
3396  // XXX the tuple list is only built here; verify where it gets updated
3397  for (int rank=0;rank < computes.size();rank++) {
3398  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3399  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3400  tupleList[it->tuples->getType()].push_back(it->tuples);
3401  }
3402  HomeCompute& homeCompute = computes[rank].homeCompute;
3403  for (int i=0;i < homeCompute.tuples.size();i++) {
3404  tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
3405  }
3406  }
3407 
3408  // Allocate host memory for energies and virials
3409  allocate_host<double>(&energies_virials, ComputeBondedCUDAKernel::energies_virials_SIZE);
3410 
3411  // Finally, do sanity checks
3412  std::vector<char> patchIDset(patchMap->numPatches(), 0);
3413  int numPatchIDset = 0;
3414  int numPatchIDs = 0;
3415  for (int rank=0;rank < computes.size();rank++) {
3416  numPatchIDs += patchIDsPerRank[rank].size();
3417  for (int i=0;i < patchIDsPerRank[rank].size();i++) {
3418  PatchID patchID = patchIDsPerRank[rank][i];
3419  if (patchIDset[patchID] == 0) numPatchIDset++;
3420  patchIDset[patchID] = 1;
3421  if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
3422  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3423  }
3424  }
3425  }
3426  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
3427  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3428  }
3429 
3430  // Warning: direct indexing is used; patchIndex can use a lot of memory for large systems
3431  patchIndex.resize(patchMap->numPatches());
3432  atomMappers.resize(getNumPatches());
3433  for (int i=0;i < getNumPatches();i++) {
3434  atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
3435  patchIndex[allPatchIDs[i]] = i;
3436  }
3437 
3438  // Initialize the alchemical flags, parameters and lambda values
3439  updateHostCudaAlchFlags();
3440  updateKernelCudaAlchFlags();
3441  if (hostAlchFlags.alchOn) {
3442  updateHostCudaAlchParameters();
3443  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3444  updateHostCudaAlchLambdas();
3445  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3446  if (hostAlchFlags.alchDecouple) {
3447  pswitchTable[1+3*1] = 0;
3448  pswitchTable[2+3*2] = 0;
3449  }
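    // Editorial note: with alchDecouple the 1-1 and 2-2 entries of the 3x3
    // pswitchTable are reset to 0, i.e. tuples whose atoms are all in
    // partition 1 or all in partition 2 take the pswitch 0 code path.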
3450  }
3451 
3452  // Copy coefficients to GPU
3453  Parameters* parameters = Node::Object()->parameters;
3454  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
3455  if (tupleList[tupletype].size() > 0) {
3456  switch(tupletype) {
3457 
3458  case Tuples::BOND:
3459  {
3460  int NumBondParams = parameters->NumBondParams;
3461  BondValue* bond_array = parameters->bond_array;
3462  std::vector<CudaBondValue> bondValues(NumBondParams);
3463  for (int i=0;i < NumBondParams;i++) {
3464  bondValues[i].k = bond_array[i].k;
3465  bondValues[i].x0 = bond_array[i].x0;
3466  bondValues[i].x1 = bond_array[i].x1;
3467  }
3468  bondedKernel.setupBondValues(NumBondParams, bondValues.data());
3469  }
3470  break;
3471 
3472  case Tuples::ANGLE:
3473  {
3474  int NumAngleParams = parameters->NumAngleParams;
3475  AngleValue* angle_array = parameters->angle_array;
3476  std::vector<CudaAngleValue> angleValues(NumAngleParams);
3477  bool normal_ub_error = false;
3478  for (int i=0;i < NumAngleParams;i++) {
3479  angleValues[i].k = angle_array[i].k;
3480  if (angle_array[i].normal == 1) {
3481  angleValues[i].theta0 = angle_array[i].theta0;
3482  } else {
3483  angleValues[i].theta0 = cos(angle_array[i].theta0);
3484  }
3485  normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
3486  angleValues[i].k_ub = angle_array[i].k_ub;
3487  angleValues[i].r_ub = angle_array[i].r_ub;
3488  angleValues[i].normal = angle_array[i].normal;
3489  }
3490  if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
3491  bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
3492  }
3493  break;
3494 
3495  case Tuples::DIHEDRAL:
3496  {
3497  int NumDihedralParams = parameters->NumDihedralParams;
3498  DihedralValue* dihedral_array = parameters->dihedral_array;
3499  int NumDihedralParamsMult = 0;
3500  for (int i=0;i < NumDihedralParams;i++) {
3501  NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
3502  }
3503  std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
3504  dihedralMultMap.resize(NumDihedralParams);
3505  int k = 0;
3506  for (int i=0;i < NumDihedralParams;i++) {
3507  int multiplicity = dihedral_array[i].multiplicity;
3508  dihedralMultMap[i] = k;
3509  for (int j=0;j < multiplicity;j++) {
3510  dihedralValues[k].k = dihedral_array[i].values[j].k;
3511  dihedralValues[k].n = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3512  dihedralValues[k].delta = dihedral_array[i].values[j].delta;
3513  k++;
3514  }
3515  }
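        // Editorial note: the packed dihedralValues[].n stores the periodicity
        // shifted left by one bit, with the lowest bit set when another
        // multiplicity term for the same dihedral follows:
        //   periodicity = packed >> 1;  hasNextTerm = packed & 1;
        // The improper setup below uses the same packing.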
3516  bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
3517  }
3518  break;
3519 
3520  case Tuples::IMPROPER:
3521  {
3522  int NumImproperParams = parameters->NumImproperParams;
3523  ImproperValue* improper_array = parameters->improper_array;
3524  int NumImproperParamsMult = 0;
3525  for (int i=0;i < NumImproperParams;i++) {
3526  NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
3527  }
3528  std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
3529  improperMultMap.resize(NumImproperParams);
3530  int k = 0;
3531  for (int i=0;i < NumImproperParams;i++) {
3532  int multiplicity = improper_array[i].multiplicity;
3533  improperMultMap[i] = k;
3534  for (int j=0;j < multiplicity;j++) {
3535  improperValues[k].k = improper_array[i].values[j].k;
3536  improperValues[k].n = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3537  improperValues[k].delta = improper_array[i].values[j].delta;
3538  k++;
3539  }
3540  }
3541  bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
3542  }
3543  break;
3544 
3545  case Tuples::CROSSTERM:
3546  {
3547  int NumCrosstermParams = parameters->NumCrosstermParams;
3548  CrosstermValue* crossterm_array = parameters->crossterm_array;
3549  std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
3550  const int D = CrosstermValue::dim;
3551  const int N = CrosstermValue::dim - 1;
3552  for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
3553  for (int i=0;i < N;i++) {
3554  for (int j=0;j < N;j++) {
3555 
3556  // Sets up coefficients for bicubic interpolation.
3557  // See https://en.wikipedia.org/wiki/Bicubic_interpolation
3558 
3559  #define INDEX(ncols,i,j) ((i)*ncols + (j))
3560  CrosstermData* table = &crossterm_array[ipar].c[0][0];
3561 
3562  const double Ainv[16][16] = {
3563  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3564  { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3565  {-3, 3, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3566  { 2, -2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3567  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
3568  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0},
3569  { 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0},
3570  { 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0},
3571  {-3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0},
3572  { 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0},
3573  { 9, -9, -9, 9, 6, 3, -6, -3, 6, -6, 3, -3, 4, 2, 2, 1},
3574  {-6, 6, 6, -6, -3, -3, 3, 3, -4, 4, -2, 2, -2, -2, -1, -1},
3575  { 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
3576  { 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0},
3577  {-6, 6, 6, -6, -4, -2, 4, 2, -3, 3, -3, 3, -2, -1, -2, -1},
3578  { 4, -4, -4, 4, 2, 2, -2, -2, 2, -2, 2, -2, 1, 1, 1, 1}
3579  };
3580 
3581 #ifndef M_PI
3582 #define M_PI 3.14159265358979323846
3583 #endif
3584 
3585  const double h = M_PI/12.0;
3586 
3587  const double x[16] = {
3588  table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
3589  table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
3590  table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
3591  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
3592  };
3593 
3594  // a = Ainv*x
3595  float* a = (float *)&crosstermValues[ipar].c[i][j][0];
3596  for (int k=0;k < 16;k++) {
3597  double a_val = 0.0;
3598  for (int l=0;l < 16;l++) {
3599  a_val += Ainv[k][l]*x[l];
3600  }
3601  a[k] = (float)a_val;
3602  }
3603 
3604  }
3605  }
3606  }
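          // Editorial note: with the A^{-1} matrix above, a[] holds the 16
          // bicubic coefficients a_mn of p(u,v) = sum_{m,n} a[m + 4*n] u^m v^n
          // per grid cell, with u,v the cell-local coordinates in [0,1) and
          // h = pi/12 rescaling the tabulated derivatives to the 15-degree
          // CMAP grid spacing. A minimal evaluation sketch under that assumed
          // coefficient ordering (a, u, v hypothetical here):
#if 0
          double p = 0.0;
          for (int m = 0; m < 4; m++)
            for (int n = 0; n < 4; n++)
              p += a[m + 4*n] * pow(u, m) * pow(v, n);
#endif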
3607  bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
3608  }
3609  break;
3610 
3611  case Tuples::EXCLUSION:
3612  // Nothing to do
3613  break;
3614 
3615  case Tuples::THOLE: {
3616  // Nothing to do
3617  }
3618  break;
3619 
3620  case Tuples::ANISO: {
3621  // Nothing to do
3622  }
3623  break;
3624 
3625  case Tuples::ONEFOURNBTHOLE: {
3626  // Nothing to do
3627  }
3628  break;
3629 
3630  default:
3631  NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
3632  break;
3633  }
3634  }
3635  }
3636 
3637  computeMgr->sendAssignPatchesOnPe(pes, this);
3638 
3639  int nDevices = deviceCUDA->getNumDevice();
3640 
3641 #ifdef NODEGROUP_FORCE_REGISTER
3642  // TODO DMC this isn't the number of home patches, but that is okay for now...
3643  if (params->CUDASOAintegrateMode && params->useDeviceMigration) {
3644  migrationKernel.setup(nDevices, patchMap->numPatches());
3645 
3646  // Tuple Migration Structures
3647  // Patch record Data
3648  allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
3649  allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());
3650 
3651  // Patch Centers
3652  allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
3653  allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
3654  for (int i = 0; i < patchMap->numPatches(); i++) {
3655  COPY_CUDAVECTOR(patchMap->center(i), h_patchMapCenter[i]);
3656  }
3657  copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
3658  }
3659 #endif // NODEGROUP_FORCE_REGISTER
3660 }
3661 
3662 #ifdef NODEGROUP_FORCE_REGISTER
3663 void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
3664  // TODO: is there anywhere else where the order matters?
3665  // DMC This vector of CudaLocalRecords doesn't have the correct number of peer records
3666  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
3667  for (int i=0;i < data.size();i++) {
3668  patchIndex[data[i].patchID] = i;
3669  }
3670 
3671  patches.clear();
3672  for (int i=0;i < data.size();i++) {
3673  PatchRecord p;
3674  p.patchID = data[i].patchID;
3675  p.atomStart = 0;
3676  p.numAtoms = 0;
3677  patches.push_back(p);
3678  }
3679 }
3680 #endif // NODEGROUP_FORCE_REGISTER
3681 
3682 void ComputeBondedCUDA::updateHostCudaAlchFlags() {
3683  const SimParameters& sim_params = *(Node::Object()->simParameters);
3684  hostAlchFlags.alchOn = sim_params.alchOn;
3685  hostAlchFlags.alchFepOn = sim_params.alchFepOn;
3686  hostAlchFlags.alchThermIntOn = sim_params.alchThermIntOn;
3687  hostAlchFlags.alchWCAOn = sim_params.alchWCAOn;
3688  hostAlchFlags.alchLJcorrection = sim_params.LJcorrection;
3689  hostAlchFlags.alchVdwForceSwitching = ComputeNonbondedUtil::vdwForceSwitching;
3690  hostAlchFlags.alchDecouple = sim_params.alchDecouple;
3691  hostAlchFlags.alchBondDecouple = sim_params.alchBondDecouple;
3692 }
3693 
3694 void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
3695  bondedKernel.updateCudaAlchFlags(hostAlchFlags);
3696 }
3697 
3698 void ComputeBondedCUDA::updateHostCudaAlchParameters() {
3699  const SimParameters& sim_params = *(Node::Object()->simParameters);
3700  hostAlchParameters.switchDist2 = ComputeNonbondedUtil::switchOn * ComputeNonbondedUtil::switchOn;
3701  hostAlchParameters.alchVdwShiftCoeff = sim_params.alchVdwShiftCoeff;
3702 }
3703 
3704 void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
3705  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3706 }
3707 
3708 void ComputeBondedCUDA::updateHostCudaAlchLambdas() {
3709  SimParameters& sim_params = *(Node::Object()->simParameters);
3710  hostAlchLambdas.currentLambda = float(sim_params.getCurrentLambda(step));
3711  hostAlchLambdas.currentLambda2 = float(sim_params.getCurrentLambda2(step));
3712  hostAlchLambdas.bondLambda1 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda));
3713  hostAlchLambdas.bondLambda2 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda));
3714  hostAlchLambdas.bondLambda12 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda2));
3715  hostAlchLambdas.bondLambda22 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
3716  hostAlchLambdas.elecLambdaUp = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda));
3717  hostAlchLambdas.elecLambda2Up = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda2));
3718  hostAlchLambdas.elecLambdaDown = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda));
3719  hostAlchLambdas.elecLambda2Down = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
3720  hostAlchLambdas.vdwLambdaUp = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda));
3721  hostAlchLambdas.vdwLambda2Up = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda2));
3722  hostAlchLambdas.vdwLambdaDown = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
3723  hostAlchLambdas.vdwLambda2Down = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
3724 }
3725 
3726 void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
3727  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3728 }
3729 
3730 #ifdef NODEGROUP_FORCE_REGISTER
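// Publishes this device's tuple-migration staging buffers, tuple counts and
// tuple offsets into the node-wide PatchData arrays (indexed by device index),
// so peer devices can later fetch them in copyHostRegisterToDevice().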
3731 void ComputeBondedCUDA::registerPointersToHost() {
3732  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3733  PatchData *patchData = cpdata.ckLocalBranch();
3734 
3735  const int deviceIndex = deviceCUDA->getDeviceIndex();
3736 
3737  TupleDataStage dst = migrationKernel.getDstBuffers();
3738  patchData->h_tupleDataStage.bond[deviceIndex] = dst.bond;
3739  patchData->h_tupleDataStage.angle[deviceIndex] = dst.angle;
3740  patchData->h_tupleDataStage.dihedral[deviceIndex] = dst.dihedral;
3741  patchData->h_tupleDataStage.improper[deviceIndex] = dst.improper;
3742  patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
3743  patchData->h_tupleDataStage.exclusion[deviceIndex] = dst.exclusion;
3744  patchData->h_tupleDataStage.crossterm[deviceIndex] = dst.crossterm;
3745 
3746  TupleIntArraysContiguous count = migrationKernel.getDeviceTupleCounts();
3747  patchData->h_tupleCount.bond[deviceIndex] = count.bond();
3748  patchData->h_tupleCount.angle[deviceIndex] = count.angle();
3749  patchData->h_tupleCount.dihedral[deviceIndex] = count.dihedral();
3750  patchData->h_tupleCount.improper[deviceIndex] = count.improper();
3751  patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
3752  patchData->h_tupleCount.exclusion[deviceIndex] = count.exclusion();
3753  patchData->h_tupleCount.crossterm[deviceIndex] = count.crossterm();
3754 
3755  TupleIntArraysContiguous offset = migrationKernel.getDeviceTupleOffsets();
3756  patchData->h_tupleOffset.bond[deviceIndex] = offset.bond();
3757  patchData->h_tupleOffset.angle[deviceIndex] = offset.angle();
3758  patchData->h_tupleOffset.dihedral[deviceIndex] = offset.dihedral();
3759  patchData->h_tupleOffset.improper[deviceIndex] = offset.improper();
3760  patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
3761  patchData->h_tupleOffset.exclusion[deviceIndex] = offset.exclusion();
3762  patchData->h_tupleOffset.crossterm[deviceIndex] = offset.crossterm();
3763 }
3764 
3765 void ComputeBondedCUDA::copyHostRegisterToDevice() {
3766  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3767  PatchData *patchData = cpdata.ckLocalBranch();
3768 
3769  const int deviceIndex = deviceCUDA->getDeviceIndex();
3770  const int nDevices = deviceCUDA->getNumDevice();
3771  const int deviceID = deviceCUDA->getDeviceID();
3772  cudaCheck(cudaSetDevice(deviceID));
3773 
3774  migrationKernel.copyPeerDataToDevice(
3775  patchData->h_tupleDataStage,
3776  patchData->h_tupleCount,
3777  patchData->h_tupleOffset,
3778  nDevices,
3779  stream
3780  );
3781 }
3782 
3783 void ComputeBondedCUDA::copyPatchData() {
3784  PatchMap* patchMap = PatchMap::Object();
3785  int numPatches = patchMap->numPatches();
3786  // Constructs a patch record array that is indexed by global patchID
3787  for (int i = 0; i < numPatches; i++) {
3788  h_patchRecord[i] = patches[patchIndex[i]];
3789  }
3790 
3791  copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
3792 }
3793 #endif // NODEGROUP_FORCE_REGISTER
3794 
3795 SubmitReduction* ComputeBondedCUDA::getCurrentReduction() {
3797  return (simParams->CUDASOAintegrate) ? reductionGpuResident :
3798  reductionGpuOffload;
3799 }
3800 
3801 #endif // BONDED_CUDA
3802 #endif // NAMD_CUDA
Definition: Patch.C:227