NAMD
ComputeBondedCUDA.C
1 #include "charm++.h"
2 #include "NamdTypes.h"
3 #include "ComputeMgr.h"
4 #include "WorkDistrib.h"
5 #include "ProxyMgr.h"
6 #include "CudaUtils.h"
7 #include "DeviceCUDA.h"
8 #include "ComputeBonds.h"
9 #include "ComputeAngles.h"
10 #include "ComputeDihedrals.h"
11 #include "ComputeImpropers.h"
12 #include "ComputeCrossterms.h"
13 #include "ComputeNonbondedCUDAExcl.h"
14 #include "ComputeBondedCUDA.h"
15 #include "HipDefines.h"
16 #include "NamdEventsProfiling.h"
17 
18 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
19 #ifdef BONDED_CUDA
20 
21 #include <algorithm> // std::find
22 
23 const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {
24  sizeof(CudaBond), // Bonds
25  sizeof(CudaAngle), // Angles
26  sizeof(CudaDihedral), // Dihedrals
27  sizeof(CudaDihedral), // Impropers
28  sizeof(CudaExclusion), // Exclusions
29  sizeof(CudaCrossterm) // Crossterms
30 };
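// Byte sizes of the packed per-type tuple structs; copyTupleData() uses these
// to compute byte offsets when packing all tuples into the single contiguous
// tupleData buffer that is handed to the GPU kernel.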
31 
32 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
33 
34 //
35 // Class constructor
36 //
37 ComputeBondedCUDA::ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID,
38  CudaNonbondedTables& cudaNonbondedTables) :
39 Compute(c), computeMgr(computeMgr), deviceID(deviceID), masterPe(CkMyPe()),
40 bondedKernel(deviceID, cudaNonbondedTables)
41 {
42 
43  computes.resize(CkMyNodeSize());
44  patchIDsPerRank.resize(CkMyNodeSize());
45  numExclPerRank.resize(CkMyNodeSize());
46  for (int i=0;i < numExclPerRank.size();i++) {
47  numExclPerRank[i].numModifiedExclusions = 0;
48  numExclPerRank[i].numExclusions = 0;
49  }
50 
51  atomMap.allocateMap(Node::Object()->molecule->numAtoms);
52 
53  flags = NULL;
54 
55  tupleData = NULL;
56  tupleDataSize = 0;
57 
58  atoms = NULL;
59  atomsSize = 0;
60 
61  forces = NULL;
62  forcesSize = 0;
63 
64  energies_virials = NULL;
65 
66  initializeCalled = false;
67 
68  SimParameters *params = Node::Object()->simParameters;
69  accelMDdoDihe = false;
70  if (params->accelMDOn) {
71  if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;
72  }
73 }
74 
75 //
76 // Class destructor
77 //
78 ComputeBondedCUDA::~ComputeBondedCUDA() {
79  cudaCheck(cudaSetDevice(deviceID));
80 
81  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
82  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
83  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
84  if (tupleData != NULL) deallocate_host<char>(&tupleData);
85 
86  if (initializeCalled) {
87  cudaCheck(cudaStreamDestroy(stream));
88  cudaCheck(cudaEventDestroy(forceDoneEvent));
89  CmiDestroyLock(lock);
90  delete reduction;
91  }
92 
93  // NOTE: unregistering happens in the [sync] entry method
94  computeMgr->sendUnregisterBoxesOnPe(pes, this);
95 }
96 
97 void ComputeBondedCUDA::unregisterBoxesOnPe() {
98  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
99  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
100  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
101  if (tpe == NULL || tpe->p == NULL) {
102  NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
103  }
104  Patch* patch = tpe->p;
105  if (tpe->positionBox != NULL) patch->unregisterPositionPickup(this, &tpe->positionBox);
106  if (tpe->avgPositionBox != NULL) patch->unregisterAvgPositionPickup(this, &tpe->avgPositionBox);
107  if (tpe->forceBox != NULL) patch->unregisterForceDeposit(this, &tpe->forceBox);
108  }
109 }
110 
111 //
112 // Register compute for a given PE. pids is a list of patches the PE has
113 // Called by master PE
114 //
115 void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {
116 
117  if (CkMyPe() != masterPe)
118  NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");
119 
120  int rank = CkRankOf(pe);
121 
122  HomeCompute& homeCompute = computes[rank].homeCompute;
123  if (homeCompute.patchIDs.size() == 0) {
124  homeCompute.isBasePatch.resize(PatchMap::Object()->numPatches(), 0);
125  homeCompute.patchIDs.resize(pids.size());
126  for (int i=0;i < pids.size();i++) {
127  homeCompute.patchIDs[i] = pids[i];
128  homeCompute.isBasePatch[pids[i]] = 1;
129  }
130  } else {
131  if (homeCompute.patchIDs.size() != pids.size()) {
132  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
133  }
134  for (int i=0;i < pids.size();i++) {
135  if (homeCompute.patchIDs[i] != pids[i]) {
136  NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
137  }
138  }
139  }
140 
141  switch(type) {
142  case computeBondsType:
143  homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
144  break;
145 
146  case computeAnglesType:
147  homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
148  break;
149 
150  case computeDihedralsType:
151  homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
152  break;
153 
154  case computeImpropersType:
155  homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
156  break;
157 
158  case computeExclsType:
159  homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
160  break;
161 
162  case computeCrosstermsType:
163  homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
164  break;
165 
166  default:
167  NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
168  break;
169  }
170 
171 }
172 
173 //
174 // Register self compute for a given PE
175 // Called by master PE
176 //
177 void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {
178 
179  if (CkMyPe() != masterPe)
180  NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");
181 
182  int rank = CkRankOf(pe);
183 
184  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
185  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
186  if (it == selfComputes.end()) {
187  // Type not found, add new one
188  selfComputes.push_back(SelfCompute(type));
189  it = selfComputes.begin() + (selfComputes.size() - 1);
190 
191  switch(type) {
192  case computeSelfBondsType:
193  it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
194  break;
195 
196  case computeSelfAnglesType:
197  it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
198  break;
199 
200  case computeSelfDihedralsType:
201  it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
202  break;
203 
204  case computeSelfImpropersType:
205  it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
206  break;
207 
208  case computeSelfExclsType:
209  it->tuples = new SelfTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION);
210  break;
211 
212  case computeSelfCrosstermsType:
213  it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
214  break;
215 
216  default:
217  NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
218  break;
219  }
220 
221  }
222 
223  // Add patch ID for this type
224  it->patchIDs.push_back(pid);
225 }
226 
227 void ComputeBondedCUDA::assignPatchesOnPe() {
228 
229  PatchMap* patchMap = PatchMap::Object();
230  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
231  PatchID patchID = patchIDsPerRank[CkMyRank()][i];
232  ProxyMgr::Object()->createProxy(patchID);
233  Patch* patch = patchMap->patch(patchID);
234  if (patch == NULL)
235  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
236  if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
237  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
238  if (tpe == NULL) {
239  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
240  }
241  if (tpe->p != NULL) {
242  NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
243  }
244  // Assign patch and register coordinates and forces manually
245  tpe->p = patch;
246  tpe->positionBox = patch->registerPositionPickup(this);
247  tpe->avgPositionBox = patch->registerAvgPositionPickup(this);
248  tpe->forceBox = patch->registerForceDeposit(this);
249  }
250 }
251 
252 //
253 // atomUpdate() can be called by any Pe
254 //
255 void ComputeBondedCUDA::atomUpdate() {
256  atomsChangedIn = true;
257 }
258 
259 //
260 // Enqueue doWork on masterPe and return "no work"
261 // Can be called by any Pe
262 //
263 int ComputeBondedCUDA::noWork() {
264  computeMgr->sendMessageEnqueueWork(masterPe, this);
265  return 1;
266 }
267 
268 void ComputeBondedCUDA::messageEnqueueWork() {
269  if (masterPe != CkMyPe())
270  NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
271  WorkDistrib::messageEnqueueWork(this);
272 }
273 
274 //
275 // Sends open-box commands to PEs
276 // Called on master PE
277 //
278 void ComputeBondedCUDA::doWork() {
279 
280  if (CkMyPe() != masterPe)
281  NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");
282 
283  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
284  // atomsChangedIn can be set to true by any Pe
285  // atomsChanged can only be set by masterPe
286  // Using two separate variables avoids a race condition
287  atomsChanged = atomsChangedIn;
288  atomsChangedIn = false;
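  // Hand-off pattern: any PE may set atomsChangedIn (in atomUpdate()), but only
  // the master PE copies it into atomsChanged here, once per step, so the rest
  // of the step reads a value that cannot change underneath it.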
289 
290  if (getNumPatches() == 0) return; // No work to do
291 
292  if (flags == NULL)
293  NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");
294 
295  // Read flags
296  lattice = flags->lattice;
297  doEnergy = flags->doEnergy;
298  doVirial = flags->doVirial;
299  doSlow = flags->doFullElectrostatics;
300  doMolly = flags->doMolly;
301 
302  if (atomsChanged) {
303  // Re-calculate patch atom numbers and storage
304  updatePatches();
305  }
306 
307  // Open boxes on Pes and launch work to masterPe
308  computeMgr->sendOpenBoxesOnPe(pes, this);
309 }
310 
311 //
312 // This gets called when patch finishes on a PE
313 //
314 void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
315  if (doneMigration) {
316  // auto it = patchIndex.find(pid);
317  // if (it == patchIndex.end())
318  // NAMD_bug("ComputeBondedCUDA::patchReady, Patch ID not found");
319  patches[patchIndex[pid]].numAtoms = PatchMap::Object()->patch(pid)->getNumAtoms();
320  }
321  CmiLock(lock);
322  Compute::patchReady(pid, doneMigration, seq);
323  CmiUnlock(lock);
324 }
325 
326 //
327 // Update per-patch atom counts and offsets, and re-allocate atom storage
328 //
329 void ComputeBondedCUDA::updatePatches() {
330  int atomStart = 0;
331  for (int i=0;i < patches.size();i++) {
332  patches[i].atomStart = atomStart;
333  atomStart += patches[i].numAtoms;
334  }
335  atomStorageSize = atomStart;
336 
337  // Re-allocate atoms
338  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
339 }
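// updatePatches() is an exclusive prefix sum over per-patch atom counts.
// Illustration (hypothetical counts, not NAMD data): numAtoms = {100, 64, 32}
// gives atomStart = {0, 100, 164} and atomStorageSize = 196, so each patch owns
// the slice [atomStart, atomStart + numAtoms) of the shared atoms[] array.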
340 
341 //
342 // Map atoms GPU-wide
343 //
344 void ComputeBondedCUDA::mapAtoms() {
345 
346  for (int i=0;i < getNumPatches();i++) {
347  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
348  atomMappers[i]->registerIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
349  }
350 
351 }
352 
353 //
354 // Unmap atoms GPU-wide
355 //
356 void ComputeBondedCUDA::unmapAtoms() {
357 
358  for (int i=0;i < getNumPatches();i++) {
359  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
360  atomMappers[i]->unregisterIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
361  }
362 
363 }
364 
365 //
366 // Open all patches that have been assigned to this Pe
367 //
368 void ComputeBondedCUDA::openBoxesOnPe() {
369 
370  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
371 
372  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
373  PatchID patchID = *it;
374  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
375  tpe->x = tpe->positionBox->open();
376  tpe->xExt = tpe->p->getCompAtomExtInfo();
377  if ( doMolly ) tpe->x_avg = tpe->avgPositionBox->open();
378  tpe->r = tpe->forceBox->open();
379  tpe->f = tpe->r->f[Results::normal];
380  if (accelMDdoDihe) tpe->af = tpe->r->f[Results::amdf]; // for dihedral-only or dual-boost accelMD
381 
382  // Copy atoms
383  int pi = patchIndex[patchID];
384  int atomStart = patches[pi].atomStart;
385  int numAtoms = patches[pi].numAtoms;
386  CompAtom* compAtom = tpe->x;
387  const CompAtomExt *aExt = tpe->p->getCompAtomExtInfo();
388  const CudaAtom *src = tpe->p->getCudaAtomList();
389  for (int i=0;i < numAtoms;i++) {
390  int j = aExt[i].sortOrder;
391  atoms[atomStart + j] = src[i];
392  }
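  // Scatter this patch's CudaAtom list into its slice of the shared atoms[]
  // buffer, placing each atom at its sortOrder position within the slice.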
393 
394  }
395 
396  bool done = false;
397  CmiLock(lock);
398  patchesCounter -= patchIDs.size();
399  if (patchesCounter == 0) {
400  patchesCounter = getNumPatches();
401  done = true;
402  }
403  CmiUnlock(lock);
404  if (done) {
405  if (atomsChanged) {
406  mapAtoms();
407  computeMgr->sendLoadTuplesOnPe(pes, this);
408  } else {
409  computeMgr->sendLaunchWork(masterPe, this);
410  }
411  }
412 }
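// patchesCounter is a node-wide countdown guarded by 'lock': each PE subtracts
// the number of patches it just processed, and the PE that brings the counter
// to zero resets it and triggers the next phase (tuple loading after atom
// migration, otherwise the kernel launch on the master PE).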
413 
414 void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
415  numModifiedExclusions = 0;
416  int ntuples = tuples->getNumTuples();
417  ExclElem* src = (ExclElem *)(tuples->getTupleList());
418  for (int ituple=0;ituple < ntuples;ituple++) {
419  if (src[ituple].modified) numModifiedExclusions++;
420  }
421  numExclusions = ntuples - numModifiedExclusions;
422 }
423 
424 //
425 // Load tuples on this PE. Note: this can only be done after boxes on all PEs have been opened
426 //
427 void ComputeBondedCUDA::loadTuplesOnPe() {
428 
429  int numModifiedExclusions = 0;
430  int numExclusions = 0;
431 
432  std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
433  // Loop over self compute types
434  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
435  it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
436  // For exclusions, we must count the number of modified and non-modified exclusions
437  if (it->tuples->getType() == Tuples::EXCLUSION) {
438  int tmp1, tmp2;
439  countNumExclusions(it->tuples, tmp1, tmp2);
440  numModifiedExclusions += tmp1;
441  numExclusions += tmp2;
442  }
443  }
444 
445  HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
446  for (int i=0;i < homeCompute.tuples.size();i++) {
447  homeCompute.tuples[i]->loadTuples(tuplePatchList,
448  homeCompute.isBasePatch.data(), &atomMap,
449  homeCompute.patchIDs);
450  // For exclusions, we must count the number of modified and non-modified exclusions
451  if (homeCompute.tuples[i]->getType() == Tuples::EXCLUSION) {
452  int tmp1, tmp2;
453  countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
454  numModifiedExclusions += tmp1;
455  numExclusions += tmp2;
456  }
457  }
458 
459  // Store number of exclusions
460  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
461  numExclPerRank[CkMyRank()].numExclusions = numExclusions;
462 
463  bool done = false;
464  CmiLock(lock);
465  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
466  if (patchesCounter == 0) {
467  patchesCounter = getNumPatches();
468  done = true;
469  }
470  CmiUnlock(lock);
471  if (done) {
472  computeMgr->sendLaunchWork(masterPe, this);
473  }
474 }
475 
476 void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
477  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
478 
479  PatchMap* patchMap = PatchMap::Object();
480  for (int ituple=0;ituple < ntuples;ituple++) {
481  CudaBond dstval;
482  auto p0 = src[ituple].p[0];
483  auto p1 = src[ituple].p[1];
484  int pi0 = patchIndex[p0->patchID];
485  int pi1 = patchIndex[p1->patchID];
486  int l0 = src[ituple].localIndex[0];
487  int l1 = src[ituple].localIndex[1];
488  dstval.i = l0 + patches[pi0].atomStart;
489  dstval.j = l1 + patches[pi1].atomStart;
490  dstval.itype = (src[ituple].value - bond_array);
491  Position position1 = p0->x[l0].position;
492  Position position2 = p1->x[l1].position;
493  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
494  shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
495  dstval.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
496  dstval.scale = src[ituple].scale;
497  dst[ituple] = dstval;
498  }
499 }
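// Notes on the conversion above: 'src[ituple].value - bond_array' is pointer
// arithmetic giving the index of this bond's parameters in the BondValue table
// uploaded to the GPU in initialize(); ioffsetXYZ packs the periodic-image
// shift between the two positions plus the difference of the two patch centers,
// truncated to single precision for the kernel.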
500 
501 void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
502  const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst) {
503 
504  PatchMap* patchMap = PatchMap::Object();
505  for (int ituple=0;ituple < ntuples;ituple++) {
506  CudaAngle dstval;
507  auto p0 = src[ituple].p[0];
508  auto p1 = src[ituple].p[1];
509  auto p2 = src[ituple].p[2];
510  int pi0 = patchIndex[p0->patchID];
511  int pi1 = patchIndex[p1->patchID];
512  int pi2 = patchIndex[p2->patchID];
513  int l0 = src[ituple].localIndex[0];
514  int l1 = src[ituple].localIndex[1];
515  int l2 = src[ituple].localIndex[2];
516  dstval.i = l0 + patches[pi0].atomStart;
517  dstval.j = l1 + patches[pi1].atomStart;
518  dstval.k = l2 + patches[pi2].atomStart;
519  dstval.itype = (src[ituple].value - angle_array);
520  Position position1 = p0->x[l0].position;
521  Position position2 = p1->x[l1].position;
522  Position position3 = p2->x[l2].position;
523  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
524  Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
525  shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
526  shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
527  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
528  dstval.koffsetXYZ = make_float3((float)shiftVec32.x, (float)shiftVec32.y, (float)shiftVec32.z);
529  dstval.scale = src[ituple].scale;
530  dst[ituple] = dstval;
531  }
532 }
533 
534 //
535 // Used for both dihedrals and impropers
536 //
537 template <bool doDihedral, typename T, typename P>
538 void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
539  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
540 
541  PatchMap* patchMap = PatchMap::Object();
542  for (int ituple=0;ituple < ntuples;ituple++) {
543  CudaDihedral dstval;
544  auto p0 = src[ituple].p[0];
545  auto p1 = src[ituple].p[1];
546  auto p2 = src[ituple].p[2];
547  auto p3 = src[ituple].p[3];
548  int pi0 = patchIndex[p0->patchID];
549  int pi1 = patchIndex[p1->patchID];
550  int pi2 = patchIndex[p2->patchID];
551  int pi3 = patchIndex[p3->patchID];
552  int l0 = src[ituple].localIndex[0];
553  int l1 = src[ituple].localIndex[1];
554  int l2 = src[ituple].localIndex[2];
555  int l3 = src[ituple].localIndex[3];
556  dstval.i = l0 + patches[pi0].atomStart;
557  dstval.j = l1 + patches[pi1].atomStart;
558  dstval.k = l2 + patches[pi2].atomStart;
559  dstval.l = l3 + patches[pi3].atomStart;
560  if (doDihedral) {
561  dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
562  } else {
563  dstval.itype = improperMultMap[(src[ituple].value - p_array)];
564  }
565  Position position1 = p0->x[l0].position;
566  Position position2 = p1->x[l1].position;
567  Position position3 = p2->x[l2].position;
568  Position position4 = p3->x[l3].position;
569  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
570  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
571  Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
572  shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
573  shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
574  shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
575  dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
576  dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
577  dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
578  dstval.scale = src[ituple].scale;
579  dst[ituple] = dstval;
580  }
581 }
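// The doDihedral template flag only selects which multiplicity map is used
// (dihedralMultMap for dihedrals, improperMultMap for impropers); the geometry
// and offset handling is identical, which is why one template serves both types.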
582 
583 void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
584  CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int& pos, int& pos2) {
585 
586  PatchMap* patchMap = PatchMap::Object();
587  for (int ituple=0;ituple < ntuples;ituple++) {
588  auto p0 = src[ituple].p[0];
589  auto p1 = src[ituple].p[1];
590  int pi0 = patchIndex[p0->patchID];
591  int pi1 = patchIndex[p1->patchID];
592  int l0 = src[ituple].localIndex[0];
593  int l1 = src[ituple].localIndex[1];
594  CompAtom& ca1 = p0->x[l0];
595  CompAtom& ca2 = p1->x[l1];
596  Position position1 = ca1.position;
597  Position position2 = ca2.position;
598  Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
599  shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
600  CudaExclusion ce;
601  ce.i = l0 + patches[pi0].atomStart;
602  ce.j = l1 + patches[pi1].atomStart;
603  ce.vdwtypei = ca1.vdwType;
604  ce.vdwtypej = ca2.vdwType;
605  ce.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
606  //
607  if (src[ituple].modified) {
608  *dst1 = ce;
609  dst1++;
610  pos += typeSize;
611  } else {
612  *dst2 = ce;
613  dst2++;
614  pos2 += typeSize;
615  }
616  }
617 }
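// Modified (scaled 1-4) exclusions and regular exclusions are streamed into two
// separate regions of tupleData (dst1/dst2); pos and pos2 advance by the tuple
// byte size so the caller can track how much of each region was filled.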
618 
619 void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
620  const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst) {
621 
622  PatchMap* patchMap = PatchMap::Object();
623  for (int ituple=0;ituple < ntuples;ituple++) {
624  auto p0 = src[ituple].p[0];
625  auto p1 = src[ituple].p[1];
626  auto p2 = src[ituple].p[2];
627  auto p3 = src[ituple].p[3];
628  auto p4 = src[ituple].p[4];
629  auto p5 = src[ituple].p[5];
630  auto p6 = src[ituple].p[6];
631  auto p7 = src[ituple].p[7];
632  int pi0 = patchIndex[p0->patchID];
633  int pi1 = patchIndex[p1->patchID];
634  int pi2 = patchIndex[p2->patchID];
635  int pi3 = patchIndex[p3->patchID];
636  int pi4 = patchIndex[p4->patchID];
637  int pi5 = patchIndex[p5->patchID];
638  int pi6 = patchIndex[p6->patchID];
639  int pi7 = patchIndex[p7->patchID];
640  int l0 = src[ituple].localIndex[0];
641  int l1 = src[ituple].localIndex[1];
642  int l2 = src[ituple].localIndex[2];
643  int l3 = src[ituple].localIndex[3];
644  int l4 = src[ituple].localIndex[4];
645  int l5 = src[ituple].localIndex[5];
646  int l6 = src[ituple].localIndex[6];
647  int l7 = src[ituple].localIndex[7];
648  dst[ituple].i1 = l0 + patches[pi0].atomStart;
649  dst[ituple].i2 = l1 + patches[pi1].atomStart;
650  dst[ituple].i3 = l2 + patches[pi2].atomStart;
651  dst[ituple].i4 = l3 + patches[pi3].atomStart;
652  dst[ituple].i5 = l4 + patches[pi4].atomStart;
653  dst[ituple].i6 = l5 + patches[pi5].atomStart;
654  dst[ituple].i7 = l6 + patches[pi6].atomStart;
655  dst[ituple].i8 = l7 + patches[pi7].atomStart;
656  dst[ituple].itype = (src[ituple].value - crossterm_array);
657  Position position1 = p0->x[l0].position;
658  Position position2 = p1->x[l1].position;
659  Position position3 = p2->x[l2].position;
660  Position position4 = p3->x[l3].position;
661  Position position5 = p4->x[l4].position;
662  Position position6 = p5->x[l5].position;
663  Position position7 = p6->x[l6].position;
664  Position position8 = p7->x[l7].position;
665  Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
666  Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
667  Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
668  Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
669  Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
670  Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
671  shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
672  shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
673  shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
674  shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
675  shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
676  shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
677  dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
678  dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
679  dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
680  dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
681  dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
682  dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
683  dst[ituple].scale = src[ituple].scale;
684  }
685 }
686 
687 void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
688  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
689  c->tupleCopyWorker(first, last);
690 }
691 
692 void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
693  if (first == -1) {
694  // Separate exclusions into modified, and non-modified
695  int pos = exclusionStartPos;
696  int pos2 = exclusionStartPos2;
697  for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
698  int ntuples = (*it)->getNumTuples();
699  copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
700  (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
701  }
702  first = 0;
703  }
704  for (int i=first;i <= last;i++) {
705  switch (tupleCopyWorkList[i].tupletype) {
706 
707  case Tuples::BOND:
708  {
709  copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
710  Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
711  }
712  break;
713 
714  case Tuples::ANGLE:
715  {
716  copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
717  Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
718  }
719  break;
720 
721  case Tuples::DIHEDRAL:
722  {
723  copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
724  (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
725  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
726  }
727  break;
728 
729  case Tuples::IMPROPER:
730  {
731  copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
732  (ImproperElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->improper_array,
733  (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
734  }
735  break;
736 
737  case Tuples::CROSSTERM:
738  {
739  copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
740  Node::Object()->parameters->crossterm_array, (CudaCrossterm *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
741  }
742  break;
743 
744  default:
745  NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
746  break;
747  }
748  }
749 }
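// The first == -1 convention lets the exclusion pass ride along with the regular
// work: the index range handed to CkLoop in copyTupleData() below starts at -1,
// so exactly one chunk begins at -1 and copies the exclusion tuples before
// processing its share of tupleCopyWorkList; without CkLoop the master PE calls
// tupleCopyWorker(-1, N-1) itself.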
750 
751 //
752 // Copies tuple data from individual buffers into a single contiguous buffer
753 // NOTE: This is done on the master PE
754 //
755 void ComputeBondedCUDA::copyTupleData() {
756 
757  PatchMap* patchMap = PatchMap::Object();
758 
759  // Count the number of exclusions
760  int numModifiedExclusions = 0;
761  int numExclusions = 0;
762  for (int i=0;i < numExclPerRank.size();i++) {
763  numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
764  numExclusions += numExclPerRank[i].numExclusions;
765  }
766  int numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
767  int numExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numExclusions);
768 
769  // Count the number of tuples for each type
770  int posWA = 0;
771  exclusionStartPos = 0;
772  exclusionStartPos2 = 0;
773  tupleCopyWorkList.clear();
774  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
775  // Take temporary position
776  int pos = posWA;
777  if (tupletype == Tuples::EXCLUSION) {
778  exclusionStartPos = pos;
779  exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
780  }
781  // Count for total number of tuples for this tupletype
782  int num = 0;
783  for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
784  int ntuples = (*it)->getNumTuples();
785  num += ntuples;
786  if (tupletype != Tuples::EXCLUSION) {
787  TupleCopyWork tupleCopyWork;
788  tupleCopyWork.tupletype = tupletype;
789  tupleCopyWork.ntuples = ntuples;
790  tupleCopyWork.tupleElemList = (*it)->getTupleList();
791  tupleCopyWork.tupleDataPos = pos;
792  tupleCopyWorkList.push_back(tupleCopyWork);
793  pos += ntuples*CudaTupleTypeSize[tupletype];
794  }
795  }
796  numTuplesPerType[tupletype] = num;
797  //
798  if (tupletype == Tuples::EXCLUSION) {
799  // Warp-align exclusions separately
800  posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
801  } else {
802  posWA += ComputeBondedCUDAKernel::warpAlign(num)*CudaTupleTypeSize[tupletype];
803  }
804  }
805  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
806  NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
807  }
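  // Resulting tupleData layout, one warp-aligned block per tuple type with the
  // exclusions split in two (modified exclusions first, at exclusionStartPos,
  // then regular exclusions at exclusionStartPos2), roughly:
  //
  //   [bonds][angles][dihedrals][impropers][modified excl][excl][crossterms]
  //
  // warpAlign() pads each block so every type starts on a warp boundary.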
808 
809  // Set flags for finishPatchesOnPe
810  hasExclusions = (numExclusions > 0);
811  hasModifiedExclusions = (numModifiedExclusions > 0);
812 
813  // Re-allocate storage as needed
814  // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
815  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
816 
817 #if CMK_SMP && USE_CKLOOP
818  int useCkLoop = Node::Object()->simParameters->useCkLoop;
819  if (useCkLoop >= 1) {
820  CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
821  } else
822 #endif
823  {
824  tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
825  }
826 
827  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
828  numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
829  numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
830  tupleData, stream);
831 
832  // Re-allocate forces
833  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
834  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
835 }
836 
837 //
838 // Launch work on GPU
839 //
840 void ComputeBondedCUDA::launchWork() {
841  if (CkMyPe() != masterPe)
842  NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
843 
844  cudaCheck(cudaSetDevice(deviceID));
845 
846  if (atomsChanged) {
847  copyTupleData();
848  }
849 
850  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
851  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
852  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
853 
854  int r2_delta_expc = 64 * (ComputeNonbondedUtil::r2_delta_exp - 127);
855 
856  // Calculate forces
857  bondedKernel.bondedForce(
859  atomStorageSize,
860  doEnergy, doVirial, doSlow,
861  lata, latb, latc,
863  (float)ComputeNonbondedUtil::r2_delta, r2_delta_expc,
864  (const float4*)atoms, forces,
865  energies_virials,
866  stream);
867 
868  forceDoneSetCallback();
869 }
870 
871 void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
872  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
873 
874  if (CkMyPe() != c->masterPe)
875  NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
876 
877  cudaCheck(cudaSetDevice(c->deviceID));
878 
879  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
880  if (err == cudaSuccess) {
881  // Event has occurred
882  c->checkCount = 0;
883  traceUserBracketEvent(CUDA_BONDED_KERNEL_EVENT, c->beforeForceCompute, walltime);
884  c->finishPatches();
885  return;
886  } else if (err != cudaErrorNotReady) {
887  // Anything else is an error
888  char errmsg[256];
889  sprintf(errmsg,"in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
890  c->checkCount, walltime - c->beforeForceCompute);
891  cudaDie(errmsg,err);
892  }
893 
894  // Event has not occurred
895  c->checkCount++;
896  if (c->checkCount >= 1000000) {
897  char errmsg[256];
898  sprintf(errmsg,"ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
899  c->checkCount, walltime - c->beforeForceCompute);
900  cudaDie(errmsg,err);
901  }
902 
903  // Call again
904  CcdCallBacksReset(0, walltime);
905  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
906 }
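// Completion is detected by polling: cudaEventQuery() is checked from a Charm++
// timed callback that re-arms itself every 0.1 ms via CcdCallFnAfter(), instead
// of blocking in cudaEventSynchronize(), so the master PE keeps serving other
// messages while the bonded kernel runs on the GPU.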
907 
908 //
909 // Set call back for all the work in the stream at this point
910 //
911 void ComputeBondedCUDA::forceDoneSetCallback() {
912  if (CkMyPe() != masterPe)
913  NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
914  cudaCheck(cudaSetDevice(deviceID));
915  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
916  checkCount = 0;
917  CcdCallBacksReset(0, CmiWallTimer());
918  // Start timer for CUDA kernel
919  beforeForceCompute = CkWallTimer();
920  // Set the callback to fire after 0.1 ms
921  CcdCallFnAfter(forceDoneCheck, this, 0.1);
922 }
923 
924 inline void convertForceToDouble(const FORCE_TYPE *af, const int forceStride, double& afx, double& afy, double& afz) {
925 #ifdef USE_STRIDED_FORCE
926  FORCE_TYPE afxt = af[0];
927  FORCE_TYPE afyt = af[forceStride];
928  FORCE_TYPE afzt = af[forceStride*2];
929 #else
930  FORCE_TYPE afxt = af->x;
931  FORCE_TYPE afyt = af->y;
932  FORCE_TYPE afzt = af->z;
933 #endif
934 #ifdef USE_FP_FORCE
935  afx = ((double)afxt)*force_to_double;
936  afy = ((double)afyt)*force_to_double;
937  afz = ((double)afzt)*force_to_double;
938 #else
939  afx = afxt;
940  afy = afyt;
941  afz = afzt;
942 #endif
943 }
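// Layout note: with USE_STRIDED_FORCE the force buffer is a structure of arrays,
// so for atom i the components live at af[i], af[i + forceStride] and
// af[i + 2*forceStride]; without it FORCE_TYPE is an xyz struct. With
// USE_FP_FORCE the raw accumulator values are additionally rescaled by
// force_to_double on conversion.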
944 
945 template <bool sumNbond, bool sumSlow>
946 void finishForceLoop(const int numAtoms, const int forceStride,
947  const FORCE_TYPE* __restrict__ af, const FORCE_TYPE* __restrict__ af_nbond, const FORCE_TYPE* __restrict__ af_slow,
948  Force* __restrict__ f, Force* __restrict__ f_nbond, Force* __restrict__ f_slow) {
949 
950  for (int j=0;j < numAtoms;j++) {
951  {
952  double afx, afy, afz;
953  convertForceToDouble(af + j, forceStride, afx, afy, afz);
954  f[j].x += afx;
955  f[j].y += afy;
956  f[j].z += afz;
957  }
958  if (sumNbond)
959  {
960  double afx, afy, afz;
961  convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
962  f_nbond[j].x += afx;
963  f_nbond[j].y += afy;
964  f_nbond[j].z += afz;
965  }
966  if (sumSlow)
967  {
968  double afx, afy, afz;
969  convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
970  f_slow[j].x += afx;
971  f_slow[j].y += afy;
972  f_slow[j].z += afz;
973  }
974  }
975 
976 }
977 
978 //
979 // Finish all patches that are on this pe
980 //
981 void ComputeBondedCUDA::finishPatchesOnPe() {
982 
983  PatchMap* patchMap = PatchMap::Object();
984  int myRank = CkMyRank();
985 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
986  char buf[64];
987  sprintf(buf, "%s: %d", NamdProfileEventStr[NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES], myRank);
988  NAMD_EVENT_START_EX(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES, buf);
989 #endif
990  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
991  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
992  const bool sumNbond = hasModifiedExclusions;
993  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
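  // sumNbond/sumSlow mirror what the kernel produced: only modified exclusions
  // add nonbonded-style forces, and slow (full electrostatics) forces exist only
  // when exclusions are present and doSlow is set, so finishForceLoop is
  // instantiated with compile-time flags instead of branching per atom.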
994 
995  for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
996  PatchID patchID = patchIDsPerRank[myRank][i];
997  Patch* patch = patchMap->patch(patchID);
998  TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
999  if (tpe == NULL) {
1000  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
1001  }
1002 
1003  int pi = patchIndex[patchID];
1004  int numAtoms = patches[pi].numAtoms;
1005  int atomStart = patches[pi].atomStart;
1006 
1007  Force *f = tpe->f;
1008  Force *f_nbond = tpe->r->f[Results::nbond];
1009  Force *f_slow = tpe->r->f[Results::slow];
1010 
1011  FORCE_TYPE *af = forces + atomStart;
1012  FORCE_TYPE *af_nbond = forces + forceSize + atomStart;
1013  FORCE_TYPE *af_slow = forces + 2*forceSize + atomStart;
1014 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
1015  char buf2[64];
1016  sprintf(buf2, "%s: %d", NamdProfileEventStr[NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_FORCE_LOOP], myRank);
1017  NAMD_EVENT_START_EX(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_FORCE_LOOP, buf2);
1018 #endif
1019  if (!sumNbond && !sumSlow) {
1020  finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
1021  } else if (sumNbond && !sumSlow) {
1022  finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
1023  } else if (!sumNbond && sumSlow) {
1024  finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
1025  } else if (sumNbond && sumSlow) {
1026  finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
1027  } else {
1028  NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
1029  }
1030 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
1031  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_FORCE_LOOP);
1032 #endif
1033  // for (int j=0;j < numAtoms;j++) {
1034  // {
1035  // double afx, afy, afz;
1036  // convertForceToDouble(af + j, forceStride, afx, afy, afz);
1037  // f[j].x += afx;
1038  // f[j].y += afy;
1039  // f[j].z += afz;
1040  // }
1041  // if (sumNbond)
1042  // {
1043  // double afx, afy, afz;
1044  // convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
1045  // f_nbond[j].x += afx;
1046  // f_nbond[j].y += afy;
1047  // f_nbond[j].z += afz;
1048  // }
1049  // if (sumSlow)
1050  // {
1051  // double afx, afy, afz;
1052  // convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
1053  // f_slow[j].x += afx;
1054  // f_slow[j].y += afy;
1055  // f_slow[j].z += afz;
1056  // }
1057  // }
1058 
1059  tpe->forceBox->close(&tpe->r);
1060  tpe->positionBox->close(&tpe->x);
1061  if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
1062  }
1063 
1064  bool done = false;
1065  CmiLock(lock);
1066  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
1067  if (patchesCounter == 0) {
1068  patchesCounter = getNumPatches();
1069  done = true;
1070  }
1071  CmiUnlock(lock);
1072  if (done) {
1073  computeMgr->sendFinishReductions(masterPe, this);
1074  }
1075 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
1076  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
1077 #endif
1078 }
1079 
1080 void ComputeBondedCUDA::finishPatches() {
1081 
1082  if (atomsChanged) {
1083  unmapAtoms();
1084  }
1085 
1086  computeMgr->sendFinishPatchesOnPe(pes, this);
1087 }
1088 
1089 #ifdef WRITE_FULL_VIRIALS
1090 #ifdef USE_FP_VIRIAL
1091 void convertVirial(double *virial) {
1092  long long int *virial_lli = (long long int *)virial;
1093  for (int i=0;i < 9;i++) {
1094  virial[i] = ((double)virial_lli[i])*virial_to_double;
1095  }
1096 }
1097 #endif
1098 #endif
1099 
1100 //
1101 // Finish & submit reductions
1102 //
1103 void ComputeBondedCUDA::finishReductions() {
1104 
1105  if (CkMyPe() != masterPe)
1106  NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
1107 
1108  // static int ncall = 0;
1109  // ncall++;
1110 
1111  int pos = 0;
1112  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
1113  if (numTuplesPerType[tupletype] > 0) {
1114 
1115  if (doEnergy) {
1116  switch (tupletype) {
1117  case Tuples::BOND:
1118  reduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
1119  break;
1120 
1121  case Tuples::ANGLE:
1122  reduction->item(REDUCTION_ANGLE_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE];
1123  break;
1124 
1125  case Tuples::DIHEDRAL:
1126  reduction->item(REDUCTION_DIHEDRAL_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL];
1127  break;
1128 
1129  case Tuples::IMPROPER:
1130  reduction->item(REDUCTION_IMPROPER_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER];
1131  break;
1132 
1133  case Tuples::EXCLUSION:
1134  reduction->item(REDUCTION_ELECT_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT];
1135  reduction->item(REDUCTION_LJ_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
1136  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW];
1137  break;
1138 
1139  case Tuples::CROSSTERM:
1140  reduction->item(REDUCTION_CROSSTERM_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_CROSSTERM];
1141  break;
1142 
1143  default:
1144  NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
1145  break;
1146  }
1147  }
1148 
1149  auto it = tupleList[tupletype].begin();
1150  (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
1151  }
1152  }
1153 
1154  if (doVirial) {
1155 #ifdef WRITE_FULL_VIRIALS
1156 #ifdef USE_FP_VIRIAL
1157  convertVirial(&energies_virials[ComputeBondedCUDAKernel::normalVirialIndex_XX]);
1158  convertVirial(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX]);
1159  convertVirial(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX]);
1160  convertVirial(&energies_virials[ComputeBondedCUDAKernel::amdDiheVirialIndex_XX]);
1161 #endif
1162 #else
1163 #error "non-WRITE_FULL_VIRIALS not implemented"
1164 #endif
1165  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
1166  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
1167  ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW, energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
1168  ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
1169  // NOTE: AMD_DIHE virial is also added to NORMAL virial.
1170  // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
1171  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
1172  }
1173 
1174  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1175  reduction->submit();
1176 }
1177 
1178 //
1179 // Can only be called by master PE
1180 //
1181 void ComputeBondedCUDA::initialize() {
1182 
1183  if (CkMyPe() != masterPe)
1184  NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
1185 
1186  // Build list of PEs
1187  for (int rank=0;rank < computes.size();rank++) {
1188  if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
1189  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
1190  }
1191  }
1192 
1193  // Return if no work to do
1194  if (pes.size() == 0) return;
1195 
1196  initializeCalled = true;
1197  cudaCheck(cudaSetDevice(deviceID));
1198 
1199 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
1200  int leastPriority, greatestPriority;
1201  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
1202  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
1203 #else
1204  cudaCheck(cudaStreamCreate(&stream));
1205 #endif
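// Note: when stream priorities are available (CUDA >= 5.5 or HIP), the bonded
// stream is created at the greatest priority, presumably so that the relatively
// small bonded kernels are not queued behind larger work sharing the device.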
1206  cudaCheck(cudaEventCreate(&forceDoneEvent));
1207  lock = CmiCreateLock();
1208 
1209  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
1210 
1211  PatchMap* patchMap = PatchMap::Object();
1212 
1213  // First, assign all patches in self computes.
1214  // NOTE: These never overlap between PEs. No proxies added.
1215  for (int rank=0;rank < computes.size();rank++) {
1216  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
1217  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
1218  for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
1219  if (!tuplePatchList.find( TuplePatchElem(*jt) ) ) {
1220  tuplePatchList.add( TuplePatchElem(*jt) );
1221  patchIDsPerRank[rank].push_back(*jt);
1222  allPatchIDs.push_back(*jt);
1223  }
1224  }
1225  }
1226  }
1227 
1228  // Second, assign all patches in home computes.
1229  // NOTE: The ranks always have these patches. No proxies added.
1230  for (int rank=0;rank < computes.size();rank++) {
1231  HomeCompute& homeCompute = computes[rank].homeCompute;
1232  std::vector<int>& patchIDs = homeCompute.patchIDs;
1233  for (int i=0;i < patchIDs.size();i++) {
1234  int patchID = patchIDs[i];
1235  if (!tuplePatchList.find( TuplePatchElem(patchID) ) ) {
1236  tuplePatchList.add( TuplePatchElem(patchID) );
1237  patchIDsPerRank[rank].push_back(patchID);
1238  allPatchIDs.push_back(patchID);
1239  }
1240  }
1241  }
1242 
1243  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
1244  // Find neighbors that are not added yet
1245  std::vector<int> neighborPids;
1246  for (int rank=0;rank < computes.size();rank++) {
1247  PatchID neighbors[PatchMap::MaxOneAway];
1248  HomeCompute& homeCompute = computes[rank].homeCompute;
1249  std::vector<int>& patchIDs = homeCompute.patchIDs;
1250  for (int i=0;i < patchIDs.size();i++) {
1251  int patchID = patchIDs[i];
1252  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
1253  for (int j=0;j < numNeighbors;j++) {
1254  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
1255  neighborPids.push_back(neighbors[j]);
1256  }
1257  }
1258  }
1259  }
1260  // Remove duplicates from neighborPids
1261  {
1262  std::sort(neighborPids.begin(), neighborPids.end());
1263  auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
1264  neighborPids.resize(std::distance(neighborPids.begin(), it_end));
1265  }
1266  // Assign neighbors to the PEs on this node that have them
1267  for (int i=0;i < neighborPids.size();i++) {
1268  for (int rank=0;rank < computes.size();rank++) {
1269  int pid = neighborPids[i];
1270  int pe = rank + CkNodeFirst(CkMyNode());
1271  if (patchMap->node(pid) == pe) {
1272  // Patch pid found on PE "pe" on this node
1273  tuplePatchList.add( TuplePatchElem(pid) );
1274  patchIDsPerRank[rank].push_back(pid);
1275  allPatchIDs.push_back(pid);
1276  // Add to this rank's patches
1277  patchIDsToAppend[rank].push_back(pid);
1278  // Add to the list of PEs
1279  pes.push_back(CkNodeFirst(CkMyNode()) + rank);
1280  break;
1281  }
1282  }
1283  }
1284  // Remove duplicates from pes
1285  {
1286  std::sort(pes.begin(), pes.end());
1287  auto it_end = std::unique(pes.begin(), pes.end());
1288  pes.resize(std::distance(pes.begin(), it_end));
1289  }
1290 
1291  // Last, assign all patches in neighbors of home computes
1292  // NOTE: Will create proxies on multiple nodes
1293  for (int rank=0;rank < computes.size();rank++) {
1294  PatchID neighbors[PatchMap::MaxOneAway];
1295  HomeCompute& homeCompute = computes[rank].homeCompute;
1296  std::vector<int>& patchIDs = homeCompute.patchIDs;
1297  std::vector<int> neighborPatchIDs;
1298  for (int i=0;i < patchIDs.size();i++) {
1299  int patchID = patchIDs[i];
1300  int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
1301  for (int j=0;j < numNeighbors;j++) {
1302  if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
1303  // Patch not found => Add Proxy
1304  tuplePatchList.add( TuplePatchElem(neighbors[j]) );
1305  patchIDsPerRank[rank].push_back(neighbors[j]);
1306  allPatchIDs.push_back(neighbors[j]);
1307  }
1308  if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
1309  && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
1310  neighborPatchIDs.push_back(neighbors[j]);
1311  }
1312  }
1313  }
1314  // Append neighboring patchIDs to homeCompute.patchIDs
1315  // int start = patchIDs.size();
1316  // patchIDs.resize(patchIDs.size() + neighborPatchIDs.size());
1317  // for (int i=0;i < neighborPatchIDs.size();i++) {
1318  // patchIDs[start + i] = neighborPatchIDs[i];
1319  // }
1320  for (int i=0;i < neighborPatchIDs.size();i++) {
1321  patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
1322  }
1323  }
1324 
1325  for (int rank=0;rank < patchIDsToAppend.size();rank++) {
1326  for (int i=0;i < patchIDsToAppend[rank].size();i++) {
1327  computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
1328  }
1329  }
1330 
1331  // Remove duplicate patch IDs
1332  {
1333  std::sort(allPatchIDs.begin(), allPatchIDs.end());
1334  auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
1335  allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
1336  }
1337 
1338  // Set number of (unique) patches
1339  setNumPatches(allPatchIDs.size());
1340 
1341  // Reset patchesCounter
1342  patchesCounter = getNumPatches();
1343 
1344  patches.resize(getNumPatches());
1345 
1346  // Setup tupleList
1347  for (int rank=0;rank < computes.size();rank++) {
1348  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
1349  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
1350  tupleList[it->tuples->getType()].push_back(it->tuples);
1351  }
1352  HomeCompute& homeCompute = computes[rank].homeCompute;
1353  for (int i=0;i < homeCompute.tuples.size();i++) {
1354  tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
1355  }
1356  }
1357 
1358  // Allocate host memory for energies and virials
1359  allocate_host<double>(&energies_virials, ComputeBondedCUDAKernel::energies_virials_SIZE);
1360 
1361  // Finally, do sanity checks
1362  std::vector<char> patchIDset(patchMap->numPatches(), 0);
1363  int numPatchIDset = 0;
1364  int numPatchIDs = 0;
1365  for (int rank=0;rank < computes.size();rank++) {
1366  numPatchIDs += patchIDsPerRank[rank].size();
1367  for (int i=0;i < patchIDsPerRank[rank].size();i++) {
1368  PatchID patchID = patchIDsPerRank[rank][i];
1369  if (patchIDset[patchID] == 0) numPatchIDset++;
1370  patchIDset[patchID] = 1;
1371  if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
1372  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
1373  }
1374  }
1375  }
1376  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
1377  NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
1378  }
1379 
1380  // Warning: Direct indexing used, patchIndex could use up a lot of memory for large systems
1381  patchIndex.resize(patchMap->numPatches());
1382  atomMappers.resize(getNumPatches());
1383  for (int i=0;i < getNumPatches();i++) {
1384  atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
1385  patchIndex[allPatchIDs[i]] = i;
1386  }
1387 
1388  // Copy coefficients to GPU
1389  Parameters* parameters = Node::Object()->parameters;
1390  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
1391  if (tupleList[tupletype].size() > 0) {
1392  switch(tupletype) {
1393 
1394  case Tuples::BOND:
1395  {
1396  int NumBondParams = parameters->NumBondParams;
1397  BondValue* bond_array = parameters->bond_array;
1398  std::vector<CudaBondValue> bondValues(NumBondParams);
1399  for (int i=0;i < NumBondParams;i++) {
1400  bondValues[i].k = bond_array[i].k;
1401  bondValues[i].x0 = bond_array[i].x0;
1402  bondValues[i].x1 = bond_array[i].x1;
1403  }
1404  bondedKernel.setupBondValues(NumBondParams, bondValues.data());
1405  }
1406  break;
1407 
1408  case Tuples::ANGLE:
1409  {
1410  int NumAngleParams = parameters->NumAngleParams;
1411  AngleValue* angle_array = parameters->angle_array;
1412  std::vector<CudaAngleValue> angleValues(NumAngleParams);
1413  bool normal_ub_error = false;
1414  for (int i=0;i < NumAngleParams;i++) {
1415  angleValues[i].k = angle_array[i].k;
1416  if (angle_array[i].normal == 1) {
1417  angleValues[i].theta0 = angle_array[i].theta0;
1418  } else {
1419  angleValues[i].theta0 = cos(angle_array[i].theta0);
1420  }
1421  normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
1422  angleValues[i].k_ub = angle_array[i].k_ub;
1423  angleValues[i].r_ub = angle_array[i].r_ub;
1424  angleValues[i].normal = angle_array[i].normal;
1425  }
1426  if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
1427  bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
1428  }
1429  break;
1430 
1431  case Tuples::DIHEDRAL:
1432  {
1433  int NumDihedralParams = parameters->NumDihedralParams;
1434  DihedralValue* dihedral_array = parameters->dihedral_array;
1435  int NumDihedralParamsMult = 0;
1436  for (int i=0;i < NumDihedralParams;i++) {
1437  NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
1438  }
1439  std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
1440  dihedralMultMap.resize(NumDihedralParams);
1441  int k = 0;
1442  for (int i=0;i < NumDihedralParams;i++) {
1443  int multiplicity = dihedral_array[i].multiplicity;
1444  dihedralMultMap[i] = k;
1445  for (int j=0;j < multiplicity;j++) {
1446  dihedralValues[k].k = dihedral_array[i].values[j].k;
1447  dihedralValues[k].n = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
1448  dihedralValues[k].delta = dihedral_array[i].values[j].delta;
1449  k++;
1450  }
1451  }
1452  bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
1453  }
1454  break;
1455 
1456  case Tuples::IMPROPER:
1457  {
1458  int NumImproperParams = parameters->NumImproperParams;
1459  ImproperValue* improper_array = parameters->improper_array;
1460  int NumImproperParamsMult = 0;
1461  for (int i=0;i < NumImproperParams;i++) {
1462  NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
1463  }
1464  std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
1465  improperMultMap.resize(NumImproperParams);
1466  int k = 0;
1467  for (int i=0;i < NumImproperParams;i++) {
1468  int multiplicity = improper_array[i].multiplicity;
1469  improperMultMap[i] = k;
1470  for (int j=0;j < multiplicity;j++) {
1471  improperValues[k].k = improper_array[i].values[j].k;
1472  improperValues[k].n = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
1473  improperValues[k].delta = improper_array[i].values[j].delta;
1474  k++;
1475  }
1476  }
1477  bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
1478  }
1479  break;
1480 
1481  case Tuples::CROSSTERM:
1482  {
1483  int NumCrosstermParams = parameters->NumCrosstermParams;
1484  CrosstermValue* crossterm_array = parameters->crossterm_array;
1485  std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
1486  const int D = CrosstermValue::dim;
1487  const int N = CrosstermValue::dim - 1;
1488  for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
1489  for (int i=0;i < N;i++) {
1490  for (int j=0;j < N;j++) {
1491 
1492  // Set up coefficients for bicubic interpolation.
1493  // See https://en.wikipedia.org/wiki/Bicubic_interpolation
1494 
1495  #define INDEX(ncols,i,j) ((i)*ncols + (j))
1496  CrosstermData* table = &crossterm_array[ipar].c[0][0];
1497 
1498  const double Ainv[16][16] = {
1499  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
1500  { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
1501  {-3, 3, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
1502  { 2, -2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
1503  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
1504  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0},
1505  { 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0},
1506  { 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0},
1507  {-3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0},
1508  { 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0},
1509  { 9, -9, -9, 9, 6, 3, -6, -3, 6, -6, 3, -3, 4, 2, 2, 1},
1510  {-6, 6, 6, -6, -3, -3, 3, 3, -4, 4, -2, 2, -2, -2, -1, -1},
1511  { 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
1512  { 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0},
1513  {-6, 6, 6, -6, -4, -2, 4, 2, -3, 3, -3, 3, -2, -1, -2, -1},
1514  { 4, -4, -4, 4, 2, 2, -2, -2, 2, -2, 2, -2, 1, 1, 1, 1}
1515  };
1516 
1517 #ifndef M_PI
1518 #define M_PI 3.14159265358979323846
1519 #endif
1520 
1521  const double h = M_PI/12.0;
1522 
1523  const double x[16] = {
1524  table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
1525  table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
1526  table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
1527  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
1528  };
1529 
1530  // a = Ainv*x
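          // The 16 resulting coefficients define the bicubic patch
          // p(t,u) = sum_{i,j} a[i + 4*j] * t^i * u^j over this grid cell
          // (coefficient ordering as in the bicubic reference above), matching
          // the corner values and scaled derivatives assembled in x.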
1531  float* a = (float *)&crosstermValues[ipar].c[i][j][0];
1532  for (int k=0;k < 16;k++) {
1533  double a_val = 0.0;
1534  for (int l=0;l < 16;l++) {
1535  a_val += Ainv[k][l]*x[l];
1536  }
1537  a[k] = (float)a_val;
1538  }
1539 
1540  }
1541  }
1542  }
1543  bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
1544  }
1545  break;
1546 
1547  case Tuples::EXCLUSION:
1548  // Nothing to do
1549  break;
1550 
1551  default:
1552  NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
1553  break;
1554  }
1555  }
1556  }
1557 
1558  computeMgr->sendAssignPatchesOnPe(pes, this);
1559 }
1560 
1561 #endif // BONDED_CUDA
1562 #endif // NAMD_CUDA
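
The crossterm table setup above is a direct application of unit-square bicubic interpolation. The following standalone sketch (not part of NAMD; the helper names bicubicCoeffs/bicubicEval and the coefficient ordering a[i + 4*j] are illustrative assumptions taken from the referenced bicubic formulation) reproduces the a = Ainv*x solve for one cell of an analytically known surface and checks that the resulting patch reproduces it, which is a convenient way to sanity-check the coefficient construction:

#include <cstdio>

// Inverse bicubic interpolation matrix, copied from the listing above.
static const double Ainv[16][16] = {
  { 1,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0},
  { 0,  0,  0,  0,   1,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0},
  {-3,  3,  0,  0,  -2, -1,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0},
  { 2, -2,  0,  0,   1,  1,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0},
  { 0,  0,  0,  0,   0,  0,  0,  0,   1,  0,  0,  0,   0,  0,  0,  0},
  { 0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0,   1,  0,  0,  0},
  { 0,  0,  0,  0,   0,  0,  0,  0,  -3,  3,  0,  0,  -2, -1,  0,  0},
  { 0,  0,  0,  0,   0,  0,  0,  0,   2, -2,  0,  0,   1,  1,  0,  0},
  {-3,  0,  3,  0,   0,  0,  0,  0,  -2,  0, -1,  0,   0,  0,  0,  0},
  { 0,  0,  0,  0,  -3,  0,  3,  0,   0,  0,  0,  0,  -2,  0, -1,  0},
  { 9, -9, -9,  9,   6,  3, -6, -3,   6, -6,  3, -3,   4,  2,  2,  1},
  {-6,  6,  6, -6,  -3, -3,  3,  3,  -4,  4, -2,  2,  -2, -2, -1, -1},
  { 2,  0, -2,  0,   0,  0,  0,  0,   1,  0,  1,  0,   0,  0,  0,  0},
  { 0,  0,  0,  0,   2,  0, -2,  0,   0,  0,  0,  0,   1,  0,  1,  0},
  {-6,  6,  6, -6,  -4, -2,  4,  2,  -3,  3, -3,  3,  -2, -1, -2, -1},
  { 4, -4, -4,  4,   2,  2, -2, -2,   2, -2,  2, -2,   1,  1,  1,  1}
};

// a = Ainv*x, as in the listing: x holds, corner by corner
// ((0,0),(1,0),(0,1),(1,1)), the values, d/dt, d/du, and d2/dtdu.
static void bicubicCoeffs(const double x[16], double a[16]) {
  for (int k = 0; k < 16; k++) {
    double s = 0.0;
    for (int l = 0; l < 16; l++) s += Ainv[k][l]*x[l];
    a[k] = s;
  }
}

// Evaluate p(t,u) = sum_{i,j} a[i + 4*j] t^i u^j (assumed coefficient order).
static double bicubicEval(const double a[16], double t, double u) {
  double p = 0.0;
  for (int j = 3; j >= 0; j--) {
    double row = 0.0;
    for (int i = 3; i >= 0; i--) row = row*t + a[i + 4*j];
    p = p*u + row;
  }
  return p;
}

int main() {
  // Corner data for f(t,u) = t^3 u^3 on the unit cell [0,1]x[0,1]:
  // f, df/dt = 3 t^2 u^3, df/du = 3 t^3 u^2, d2f/dtdu = 9 t^2 u^2.
  // Since f is itself bicubic, the fitted patch must reproduce it exactly.
  const double x[16] = { 0, 0, 0, 1,    // f at the four corners
                         0, 0, 0, 3,    // df/dt
                         0, 0, 0, 3,    // df/du
                         0, 0, 0, 9 };  // d2f/dtdu
  double a[16];
  bicubicCoeffs(x, a);
  const double t = 0.25, u = 0.5;
  printf("interpolated %.9f  exact %.9f\n",
         bicubicEval(a, t, u), t*t*t*u*u*u);
  return 0;
}

In the listing itself the tabulated derivatives are additionally scaled by the grid spacing h (and h*h for the cross term) before the solve, because the CMAP tables store per-radian derivatives while the fit is done on a unit cell; the sketch omits that step since its cell spacing is 1.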