#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define __thread __declspec(thread)

const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {

const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {

  Compute(c), computeMgr(computeMgr), deviceID(deviceID), masterPe(CkMyPe()),
  bondedKernel(deviceID, cudaNonbondedTables)

  computes.resize(CkMyNodeSize());
  patchIDsPerRank.resize(CkMyNodeSize());
  numExclPerRank.resize(CkMyNodeSize());
  for (int i=0; i < numExclPerRank.size(); i++) {
    numExclPerRank[i].numModifiedExclusions = 0;
    numExclPerRank[i].numExclusions = 0;

  energies_virials = NULL;

  initializeCalled = false;

  accelMDdoDihe = false;
  if (params->accelMDOn) {
    if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe = true;

  pswitchTable[0] = 0; pswitchTable[1] = 1; pswitchTable[2] = 2;
  pswitchTable[3] = 1; pswitchTable[4] = 1; pswitchTable[5] = 99;
  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;

  h_patchRecord = NULL;
  d_patchRecord = NULL;

  h_patchMapCenter = NULL;
  d_patchMapCenter = NULL;
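  // Note: pswitchTable reads as a 3x3 lookup over the pair of per-atom FEP
  // partition values (0 = unperturbed, 1 = appearing, 2 = vanishing), flattened
  // as p1 + 3*p2 where it is indexed in copyExclusionData() below; the value 99
  // appears to flag the disallowed (1,2)/(2,1) cross-partition pairings.
  // Interpretive comment only.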
ComputeBondedCUDA::~ComputeBondedCUDA() {

  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
  if (tupleData != NULL) deallocate_host<char>(&tupleData);

  if (initializeCalled) {

    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
    CmiDestroyLock(printLock);

  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);

  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);
void ComputeBondedCUDA::unregisterBoxesOnPe() {
  for (int i=0; i < patchIDsPerRank[CkMyRank()].size(); i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
    if (tpe == NULL || tpe->p == NULL) {
      NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");

  int rank = CkRankOf(pe);

  HomeCompute& homeCompute = computes[rank].homeCompute;
  if (homeCompute.patchIDs.size() == 0) {

    homeCompute.patchIDs.resize(pids.size());
    for (int i=0; i < pids.size(); i++) {
      homeCompute.patchIDs[i] = pids[i];
      homeCompute.isBasePatch[pids[i]] = 1;

    if (homeCompute.patchIDs.size() != pids.size()) {
      NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");

    for (int i=0; i < pids.size(); i++) {
      if (homeCompute.patchIDs[i] != pids[i]) {
        NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");

      homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));

      homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));

      homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));

      homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));

      homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));

      homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));

      NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");

  int rank = CkRankOf(pe);

  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
  if (it == selfComputes.end()) {

    selfComputes.push_back(SelfCompute(type));
    it = selfComputes.begin() + (selfComputes.size() - 1);

      it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);

      it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);

      it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);

      it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);

      it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);

      NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");

  it->patchIDs.push_back(pid);
void ComputeBondedCUDA::assignPatchesOnPe() {

  for (int i=0; i < patchIDsPerRank[CkMyRank()].size(); i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];

      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
    if (flags == NULL) flags = &patchMap->patch(patchID)->flags;

      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");

    if (tpe->p != NULL) {
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
void ComputeBondedCUDA::atomUpdate() {
  atomsChangedIn = true;

int ComputeBondedCUDA::noWork() {

void ComputeBondedCUDA::messageEnqueueWork() {
  if (masterPe != CkMyPe())
    NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
void ComputeBondedCUDA::doWork() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");

  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;

  if (getNumPatches() == 0) {

    NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");

  lattice = flags->lattice;
  doEnergy = flags->doEnergy;
  doVirial = flags->doVirial;
  doSlow = flags->doFullElectrostatics;
  doMolly = flags->doMolly;

  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchLambdas();
    updateKernelCudaAlchLambdas();

    if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {

  if (params->CUDASOAintegrate) {
    if (!atomsChanged) this->openBoxesOnPe();
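  // Note: doWork() latches the per-step simulation flags (lattice, energy,
  // virial, slow electrostatics, Molly) from the Flags pointer captured in
  // assignPatchesOnPe(). In the CUDASOAintegrate path the master PE drives
  // openBoxesOnPe() directly when the atom set has not changed.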
void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {

#ifdef NODEGROUP_FORCE_REGISTER
  patches[patchIndex[pid]].patchID = pid;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
void ComputeBondedCUDA::updatePatches() {

  for (int i=0; i < patches.size(); i++) {
    patches[i].atomStart = atomStart;
    atomStart += patches[i].numAtoms;

  atomStorageSize = atomStart;

  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

#ifdef NODEGROUP_FORCE_REGISTER

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  std::vector<CudaLocalRecord>& localPatches =
    patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy =
    patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  for (int i=0; i < numPatchesHomeAndProxy; i++) {
    patches[i].numAtoms = localPatches[i].numAtoms;
    patches[i].atomStart = localPatches[i].bufferOffset;
    atomStart += patches[i].numAtoms;

  atomStorageSize = atomStart;
  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    bondedKernel.updateAtomBuffer(atomStorageSize, stream);
    updatePatchRecords();

#endif // NODEGROUP_FORCE_REGISTER
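// Note on updatePatches(): the loops above form an exclusive prefix sum over
// per-patch atom counts, so patches[i].atomStart is the number of atoms in
// patches 0..i-1 and atomStorageSize is the grand total. For example,
// numAtoms = {4, 7, 5} gives atomStart = {0, 4, 11} and atomStorageSize = 16.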
void ComputeBondedCUDA::mapAtoms() {
  for (int i=0; i < getNumPatches(); i++) {

void ComputeBondedCUDA::unmapAtoms() {
  for (int i=0; i < getNumPatches(); i++) {
void ComputeBondedCUDA::openBoxesOnPe(int startup) {

  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];

  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);

#ifdef NODEGROUP_FORCE_REGISTER
  if (Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
    for (auto it=patchIDs.begin(); it != patchIDs.end(); it++) {

  for (auto it=patchIDs.begin(); it != patchIDs.end(); it++) {

    if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
      int pi = patchIndex[patchID];
      int atomStart = patches[pi].atomStart;
      int numAtoms = patches[pi].numAtoms;

      for (int i=0; i < numAtoms; i++) {

        atoms[atomStart + j] = src[i];

  patchesCounter -= patchIDs.size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();

    if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {

      if (!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);

    if (params->CUDASOAintegrate) {
      if (!atomsChanged) this->launchWork();

#ifdef NODEGROUP_FORCE_REGISTER

  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
  for (int i = 0; i < tuplePatchList.size(); i++) {

    if (tpe == NULL) break;
    fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n", CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);

  CmiUnlock(printLock);
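// Note: patchesCounter acts as a node-wide countdown barrier. Each rank
// subtracts the patches it just opened; the rank that drives the counter to
// zero resets it to getNumPatches() and triggers the next stage (tuple loading
// or the kernel launch). This reading assumes the elided locking around the
// counter updates.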
void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
  numModifiedExclusions = 0;
  int ntuples = tuples->getNumTuples();

  for (int ituple=0; ituple < ntuples; ituple++) {
    if (src[ituple].modified) numModifiedExclusions++;

  numExclusions = ntuples - numModifiedExclusions;
void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {

  int numModifiedExclusions = 0;
  int numExclusions = 0;
  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
    std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;

    for (auto it=selfComputes.begin(); it != selfComputes.end(); it++) {

      it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);

      countNumExclusions(it->tuples, tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;

    HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
    for (int i=0; i < homeCompute.tuples.size(); i++) {
      homeCompute.tuples[i]->loadTuples(tuplePatchList,
        homeCompute.isBasePatch.data(), &atomMap,
        homeCompute.patchIDs);

      countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;

    numModifiedExclusions = modifiedExclusionTupleData.size();
    numExclusions = exclusionTupleData.size();

  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
  numExclPerRank[CkMyRank()].numExclusions = numExclusions;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {

    patchesCounter -= patchIDsPerRank[CkMyRank()].size();
    if (patchesCounter == 0) {
      patchesCounter = getNumPatches();

  if (!params->CUDASOAintegrate) computeMgr->sendLaunchWork(masterPe, this);
void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);

    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
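// Note: each host-side BondElem is flattened into a CudaBond here. The global
// atom slots i/j are the patch-local indices plus the owning patch's atomStart
// offset, itype is the pointer difference into bond_array (i.e. the parameter
// row), and shiftVec is the scaled minimum-image shift between the endpoints,
// corrected by the difference of patch centers when the atoms live in
// different patches.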
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const BondElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];

  dstval.index[0] = l0;
  dstval.index[1] = l1;

  if (hostAlchFlags.alchOn) {
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,

  float3 b1f, b2f, b3f;
  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0; ituple < ntuples; ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);

    Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 diff = position1 - position2;
    float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
    float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
    float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
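// Note: the d1/d2/d3 terms reproduce the minimum-image wrap in single
// precision: the difference vector is projected onto the reciprocal lattice
// vectors (a_r, b_r, c_r), rounded to the nearest integer via floorf(x + 0.5f),
// and negated to give the number of lattice images to add back along each
// cell vector.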
void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.itype = (src[ituple].value - angle_array);

    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;

  if (hostAlchFlags.alchOn) {
#endif // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  for (int ituple=0; ituple < ntuples; ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;

      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];

      dstval.itype = improperMultMap[(src[ituple].value - p_array)];

    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
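// Note: the doDihedral template parameter appears to select which of the two
// itype assignments above is compiled in, mapping the parameter pointer through
// dihedralMultMap for dihedrals and improperMultMap for impropers; the
// instantiations in tupleCopyWorker() use <true, DihedralElem, DihedralValue>
// and <false, ImproperElem, ImproperValue> respectively.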
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];

  dstval.itype = dihedralMultMap[(src.value - p_array)];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (hostAlchFlags.alchOn) {
void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];

  dstval.itype = improperMultMap[(src.value - p_array)];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (hostAlchFlags.alchOn) {
template <typename T, typename P, typename D>
void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, std::vector<D>& dst) {

  for (int ituple=0; ituple < ntuples; ituple++) {

    copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
    dst.push_back(dstval);
#endif // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0; ituple < ntuples; ituple++) {

    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;

      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];

      dstval.itype = improperMultMap[(src[ituple].value - p_array)];

    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);

    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 position3 = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
    float3 position4 = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);

    float3 diff12, diff23, diff43;
    diff12 = position1 - position2;
    diff23 = position2 - position3;
    diff43 = position4 - position3;

    float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
    float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
    float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);

    float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
    float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
    float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);

    float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
    float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
    float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

    dst[ituple] = dstval;
void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];

    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    ce.i = l0 + patches[pi0].atomStart;
    ce.j = l1 + patches[pi1].atomStart;

    if (hostAlchFlags.alchOn) {

      ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];

    if (src[ituple].modified) {
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int l0 = src.localIndex[0];
  int l1 = src.localIndex[1];

  dstval.index[0] = l0;
  dstval.index[1] = l1;

  if (hostAlchFlags.alchOn) {

    dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];
void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {

  for (int ituple=0; ituple < ntuples; ituple++) {

    copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
    if (src[ituple].modified) {
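// Note: copyExclusionDataStage routes each staged exclusion by its "modified"
// flag, so dst1 and dst2 appear to accumulate the modified (e.g. scaled 1-4)
// and plain exclusion streams separately, mirroring the two byte offsets
// (pos, pos2) used by the non-staged copy path.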
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    auto p4 = src[ituple].p[4];
    auto p5 = src[ituple].p[5];
    auto p6 = src[ituple].p[6];
    auto p7 = src[ituple].p[7];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int pi4 = patchIndex[p4->patchID];
    int pi5 = patchIndex[p5->patchID];
    int pi6 = patchIndex[p6->patchID];
    int pi7 = patchIndex[p7->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    int l4 = src[ituple].localIndex[4];
    int l5 = src[ituple].localIndex[5];
    int l6 = src[ituple].localIndex[6];
    int l7 = src[ituple].localIndex[7];
    dst[ituple].i1 = l0 + patches[pi0].atomStart;
    dst[ituple].i2 = l1 + patches[pi1].atomStart;
    dst[ituple].i3 = l2 + patches[pi2].atomStart;
    dst[ituple].i4 = l3 + patches[pi3].atomStart;
    dst[ituple].i5 = l4 + patches[pi4].atomStart;
    dst[ituple].i6 = l5 + patches[pi5].atomStart;
    dst[ituple].i7 = l6 + patches[pi6].atomStart;
    dst[ituple].i8 = l7 + patches[pi7].atomStart;
    dst[ituple].itype = (src[ituple].value - crossterm_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Position position5 = p4->x[l4].position;
    Position position6 = p5->x[l5].position;
    Position position7 = p6->x[l6].position;
    Position position8 = p7->x[l7].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
    Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
    Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
    Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    if (pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
    if (pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
    if (pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
    dst[ituple].offset12XYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dst[ituple].offset23XYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
    dst[ituple].offset34XYZ = make_float3((float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
    dst[ituple].offset56XYZ = make_float3((float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
    dst[ituple].offset67XYZ = make_float3((float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
    dst[ituple].offset78XYZ = make_float3((float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

      int typeSum1 = 0, typeSum2 = 0;
      for (size_t i = 0; i < 4; ++i) {
        typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
        typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));

      int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
      if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
        dst[ituple].fepBondedType = 1;
      } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
        dst[ituple].fepBondedType = 2;

        dst[ituple].fepBondedType = 0;

    dst[ituple].scale = src[ituple].scale;
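// Note: with fep_type 1 counted as +1, fep_type 2 as -1, and 0 as 0, typeSum1
// and typeSum2 measure how many atoms of each 4-atom half of the crossterm
// belong to the appearing or vanishing partition. A positive partial sum below
// `order` marks the tuple as fepBondedType 1, a negative one as type 2, and
// everything else as 0; alchBondDecouple widens the window from 4 to 5.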
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];
  int pi4 = patchIndex[p4->patchID];
  int pi5 = patchIndex[p5->patchID];
  int pi6 = patchIndex[p6->patchID];
  int pi7 = patchIndex[p7->patchID];

  dstval.itype = (src.value - crossterm_array);

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  dstval.index[4] = l4;
  dstval.index[5] = l5;
  dstval.index[6] = l6;
  dstval.index[7] = l7;

  if (hostAlchFlags.alchOn) {

    int typeSum1 = 0, typeSum2 = 0;
    for (size_t i = 0; i < 4; ++i) {

    int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
    if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {

    } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorker(first, last);

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorkerExcl(first, last);
void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {

  int64_t numModExclusionsBefore = 0;
  int64_t numExclusionsBefore = 0;

    int ntuples = (*it)->getNumTuples();
    auto thisExcl = (ExclElem *)(*it)->getTupleList();
    for (int ituple=0; ituple < ntuples; ituple++) {
      if (thisExcl[ituple].modified)
        numModExclusionsBefore++;

        numExclusionsBefore++;

  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int numExclusionsWork = last - first;
  auto itEnd = std::next(itStart, numExclusionsWork + 1);
  for (auto it=itStart; it != itEnd; it++) {
    int ntuples = (*it)->getNumTuples();
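// Note: the counting pass above lets each worker compute, without any
// synchronization, the byte offsets (pos, pos2) at which its assigned slice of
// the exclusion tuple lists starts inside the shared tupleData buffer:
// modified exclusions seen by earlier lists advance pos, unmodified ones
// advance pos2.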
void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {

    int64_t pos = exclusionStartPos;
    int64_t pos2 = exclusionStartPos2;

      int ntuples = (*it)->getNumTuples();

  for (int i=first; i <= last; i++) {
    switch (tupleCopyWorkList[i].tupletype) {

      copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
        Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
        Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
        Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
        (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
        (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
        (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,

      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  int numBondsTest = patchData->maxNumBonds.load();
  while (numBondsTest < counts.bond &&
    !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));

  int numAnglesTest = patchData->maxNumAngles.load();
  while (numAnglesTest < counts.angle &&
    !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));

  int numDihedralsTest = patchData->maxNumDihedrals.load();
  while (numDihedralsTest < counts.dihedral &&
    !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));

  int numImpropersTest = patchData->maxNumImpropers.load();
  while (numImpropersTest < counts.improper &&
    !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));

  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
    !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));

  int numExclusionsTest = patchData->maxNumExclusions.load();
  while (numExclusionsTest < counts.exclusion &&
    !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));

  int numCrosstermsTest = patchData->maxNumCrossterms.load();
  while (numCrosstermsTest < counts.crossterm &&
    !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
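// Note: each load/compare_exchange_strong loop above is an atomic fetch-max on
// the node-wide maximum tuple counts. A minimal sketch of the same idiom on a
// std::atomic<int> (illustrative only, not called anywhere in this file):
//
//   int seen = target.load();
//   while (seen < proposed && !target.compare_exchange_strong(seen, proposed)) {
//     // compare_exchange_strong reloads `seen` on failure, so simply retry
//   }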
TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  counts.bond = patchData->maxNumBonds.load();
  counts.angle = patchData->maxNumAngles.load();
  counts.dihedral = patchData->maxNumDihedrals.load();
  counts.improper = patchData->maxNumImpropers.load();

  counts.exclusion = patchData->maxNumExclusions.load();
  counts.crossterm = patchData->maxNumCrossterms.load();

  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",
void ComputeBondedCUDA::migrateTuples(bool startup) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  bool amMaster = (masterPe == CkMyPe());

    cudaStreamSynchronize(stream);

    cudaStreamSynchronize(stream);

  const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];

    TupleCounts counts = bondedKernel.getTupleCounts();
    migrationKernel.computeTupleDestination(
      patchData->devData[devInd].numPatchesHome,
      migrationDestination,
      patchData->devData[devInd].d_patchToDeviceMap,
      patchData->devData[devInd].d_globalToLocalID,

    cudaCheck(cudaStreamSynchronize(stream));

    migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
    cudaCheck(cudaStreamSynchronize(stream));

  bool realloc = false;

  if (amMaster && !startup) {
    migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);

    local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
    updateMaxTupleCounts(local);

    CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",

  if (amMaster && !startup) {

    newMax = getMaxTupleCounts();
    realloc = migrationKernel.reallocateBufferDst(newMax);
    patchData->tupleReallocationFlagPerDevice[devInd] = realloc;

  realloc = patchData->tupleReallocationFlagPerDevice[0];

    registerPointersToHost();

    copyHostRegisterToDevice();
    cudaCheck(cudaStreamSynchronize(stream));

  if (amMaster && !startup) {

    migrationKernel.performTupleMigration(
      bondedKernel.getTupleCounts(),

    cudaCheck(cudaStreamSynchronize(stream));

  if (amMaster && !startup) {

    bondedKernel.reallocateTupleBuffer(newMax, stream);
    migrationKernel.reallocateBufferSrc(newMax);
    cudaCheck(cudaStreamSynchronize(stream));

  bondedKernel.setTupleCounts(local);

  const int* ids = patchData->h_soa_id[devInd];
  migrationKernel.updateTuples(
    bondedKernel.getTupleCounts(),
    bondedKernel.getData(),

    bondedKernel.getAtomBuffer(),
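// Note: the call sequence in migrateTuples() reads as: compute each tuple's
// destination patch/device, reserve slots in the destination buffers, compute
// patch offsets and fetch the new per-device counts, grow the staging and
// kernel buffers node-wide if any device overflowed
// (tupleReallocationFlagPerDevice), perform the migration, and finally rewrite
// tuple indices against the new atom buffer. Interpretive summary of the
// surviving fragments only.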
template<typename T>
void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {

  std::vector<std::pair<int,int>> downstreamPatches;
  for (int i = 0; i < tuples.size(); i++) {
    int downstream = tuples[i].patchIDs[0];
    for (int j = 1; j < T::size; j++) {
      downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);

    downstreamPatches.push_back(std::make_pair(i, downstream));
    tupleCounts[patchIndex[downstream]]++;

  tupleOffsets[0] = 0;
  for (int i = 0; i < tupleCounts.size(); i++) {
    tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];

  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
    [](std::pair<int, int> a, std::pair<int, int> b) {
      return a.second < b.second;

  std::vector<T> copy = tuples;
  for (int i = 0; i < tuples.size(); i++) {
    tuples[i] = copy[downstreamPatches[i].first];
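// Note: sortTupleList buckets tuples by their downstream patch (the common
// downstream of all patches referenced by the tuple), builds CSR-style
// counts/offsets via an exclusive prefix sum, and uses a stable sort so tuples
// that map to the same home patch keep their original relative order.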
void ComputeBondedCUDA::sortAndCopyToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;

  std::vector<int> bondPatchCounts(numPatchesHome, 0);
  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> anglePatchCounts(numPatchesHome, 0);
  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);

  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> improperPatchCounts(numPatchesHome, 0);
  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);

#define CALL(fieldName) do { \
    sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
    h_dataStage.fieldName = fieldName##TupleData.data(); \
    h_counts.fieldName = fieldName##PatchCounts.data(); \
    h_offsets.fieldName = fieldName##PatchOffsets.data(); \

  CALL(modifiedExclusion);

  migrationKernel.copyTupleToDevice(
    bondedKernel.getTupleCounts(),

  cudaCheck(cudaStreamSynchronize(stream));
void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {

  switch (tupletype) {

      modifiedExclusionTupleData.clear();
      exclusionTupleData.clear();
      int64_t pos = exclusionStartPos;
      int64_t pos2 = exclusionStartPos2;

        int ntuples = (*it)->getNumTuples();
          modifiedExclusionTupleData, exclusionTupleData, pos, pos2);

      bondTupleData.clear();

        int ntuples = (*it)->getNumTuples();

        copyToStage<BondElem, BondValue, CudaBondStage>(

      angleTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

      dihedralTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

        copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList,

      improperTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

        copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList,

      crosstermTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

        copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList,

      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleData() {

  int numModifiedExclusions = 0;
  int numExclusions = 0;
  for (int i=0; i < numExclPerRank.size(); i++) {
    numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
    numExclusions += numExclPerRank[i].numExclusions;

  exclusionStartPos = 0;
  exclusionStartPos2 = 0;
  tupleCopyWorkList.clear();
  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {

    int64_t pos = posWA;

      exclusionStartPos = pos;
      exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];

    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();

        TupleCopyWork tupleCopyWork;
        tupleCopyWork.tupletype = tupletype;
        tupleCopyWork.ntuples = ntuples;
        tupleCopyWork.tupleElemList = (*it)->getTupleList();
        tupleCopyWork.tupleDataPos = pos;

        tupleCopyWorkList.push_back(tupleCopyWork);
        pos += ntuples*CudaTupleTypeSize[tupletype];

    numTuplesPerType[tupletype] = num;

      posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];

  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
    NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");

  hasExclusions = (numExclusions > 0);
  hasModifiedExclusions = (numModifiedExclusions > 0);

  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

#if CMK_SMP && USE_CKLOOP

  if (useCkLoop >= 1) {
#define CKLOOP_EXCLUSIONS 1
#if CKLOOP_EXCLUSIONS
    CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);

    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);

    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);

  tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);

  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
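// Note: tupleData is a single host byte buffer laid out per tuple type; the
// exclusion region is further split into a modified-exclusion block starting
// at exclusionStartPos followed by a plain-exclusion block at
// exclusionStartPos2. The work list records byte offsets so the copy can be
// parallelized over CkLoop workers, or handed to tupleCopyWorker(-1, ...) on
// the serial path.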
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleDataSN() {

  size_t numExclusions, numModifiedExclusions, copyIndex;

  if (masterPe == CkMyPe()) {
    numModifiedExclusions = 0;

    for (int i=0; i < numExclPerRank.size(); i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions += numExclPerRank[i].numExclusions;

    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {

      int64_t pos = posWA;

        exclusionStartPos = pos;
        exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];

      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

          TupleCopyWork tupleCopyWork;
          tupleCopyWork.tupletype = tupletype;
          tupleCopyWork.ntuples = ntuples;
          tupleCopyWork.tupleElemList = (*it)->getTupleList();
          tupleCopyWork.tupleDataPos = pos;
          tupleCopyWorkList.push_back(tupleCopyWork);
          pos += ntuples*CudaTupleTypeSize[tupletype];

      numTuplesPerType[tupletype] = num;

        posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];

    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");

    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);

    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

  this->tupleCopyWorker(first, last);

  while ((copyIndex = tupleWorkIndex.fetch_add(1)) <= tupleCopyWorkList.size()) {
    this->tupleCopyWorker(copyIndex - 1, copyIndex - 1);

  if (masterPe == CkMyPe()) {
    tupleWorkIndex.store(0);

    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
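// Note: copyTupleDataSN distributes the work list through the shared atomic
// counter tupleWorkIndex: every PE repeatedly fetch_add(1)s to claim one entry
// (index 0 maps to the special exclusion pass via copyIndex - 1 == -1), and
// the master PE resets the counter once all entries have been consumed.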
void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {

  size_t numExclusions, numModifiedExclusions, copyIndex;

  if (masterPe == CkMyPe()) {
    numModifiedExclusions = 0;

    for (int i=0; i < numExclPerRank.size(); i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions += numExclPerRank[i].numExclusions;

    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {

        exclusionStartPos = pos;
        exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];

      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

          TupleCopyWork tupleCopyWork;
          tupleCopyWork.tupletype = tupletype;
          tupleCopyWork.ntuples = ntuples;
          tupleCopyWork.tupleElemList = (*it)->getTupleList();
          tupleCopyWork.tupleDataPos = pos;
          tupleCopyWorkList.push_back(tupleCopyWork);
          pos += ntuples*CudaTupleTypeSizeStage[tupletype];

      numTuplesPerType[tupletype] = num;

        posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];

    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");

    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);

    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

    local.modifiedExclusion = numModifiedExclusions;
    local.exclusion = numExclusions;

    updateMaxTupleCounts(local);
    bondedKernel.setTupleCounts(local);

  if (masterPe == CkMyPe()) {

    migrationKernel.reallocateBufferSrc(newMax);
    migrationKernel.reallocateBufferDst(newMax);
    bondedKernel.reallocateTupleBuffer(newMax, stream);
    registerPointersToHost();

  if (masterPe == CkMyPe()) {
    copyHostRegisterToDevice();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
      this->tupleCopyWorkerType(tupletype);

    sortAndCopyToDevice();

  migrateTuples(startup);

  if (masterPe == CkMyPe()) {
    TupleCounts count = bondedKernel.getTupleCounts();

    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);

    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchRecords() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  PatchRecord **d_pr = &(patchData->devData[devInd].bond_pr);
  int **d_pid = &(patchData->devData[devInd].bond_pi);
  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
  size_t st_d_pr_size = (size_t)(patchData->devData[devInd].bond_pr_size);
  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size());
  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size);
  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size);
  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
void ComputeBondedCUDA::launchWork() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");

#ifdef NODEGROUP_FORCE_REGISTER

  updatePatchRecords();

  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);

    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();

    float4 *d_atoms = bondedKernel.getAtomBuffer();
    copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);

    fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);

    for (int i = 0; i < atomStorageSize; i++) {
      fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
        i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);

  bondedKernel.bondedForce(
    doEnergy, doVirial, doSlow, doTable,
    (const float4*)atoms, forces,

#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();

  patchData->devData[devInd].f_bond = bondedKernel.getForces();
  patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
  patchData->devData[devInd].f_bond_slow = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
  patchData->devData[devInd].f_bond_size = bondedKernel.getForceSize(atomStorageSize);

    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);

    copy_DtoH_sync(bondedKernel.getForces(), forces, forceStorageSize);
    fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
    for (int i = 0; i < forceStorageSize; i++) {

      fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);

  forceDoneSetCallback();
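// Note: bondedKernel.getForces() returns one contiguous device buffer holding
// the normal, nonbonded-correction, and slow (full electrostatics) force
// arrays back to back, each bondedKernel.getForceSize(atomStorageSize)
// elements long; f_bond_nbond and f_bond_slow are simply offsets into that
// buffer.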
void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;

  if (CkMyPe() != c->masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");

  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {

  } else if (err != cudaErrorNotReady) {

    sprintf(errmsg, "in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);

  if (c->checkCount >= 1000000) {

    sprintf(errmsg, "ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);

  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
void ComputeBondedCUDA::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");

  cudaCheck(cudaEventRecord(forceDoneEvent, stream));

  beforeForceCompute = CkWallTimer();
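// Note: rather than blocking the master PE on a stream synchronize, an event
// is recorded on the bonded stream and forceDoneCheck() polls it with
// cudaEventQuery from the Charm++ scheduler, re-armed via CcdCallFnAfter with
// a 0.1 delay, until the GPU work completes or the poll count trips the error
// paths above.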
template <bool sumNbond, bool sumSlow>
void finishForceLoop(const int numAtoms, const int forceStride,
  const double* __restrict__ af,
  const double* __restrict__ af_nbond,
  const double* __restrict__ af_slow,
  Force* __restrict__ f,
  Force* __restrict__ f_nbond,
  Force* __restrict__ f_slow) {

  for (int j=0; j < numAtoms; j++) {

    f[j].y += af[j + forceStride];
    f[j].z += af[j + forceStride*2];

    f_nbond[j].x += af_nbond[j];
    f_nbond[j].y += af_nbond[j + forceStride];
    f_nbond[j].z += af_nbond[j + forceStride*2];

    f_slow[j].x += af_slow[j];
    f_slow[j].y += af_slow[j + forceStride];
    f_slow[j].z += af_slow[j + forceStride*2];

  for (int j=0; j < numAtoms; j++) {

    f[j].y += af[j + forceStride];
    f[j].z += af[j + 2*forceStride];

  for (int j=0; j < numAtoms; j++) {
    f_nbond[j].x += af_nbond[j];
    f_nbond[j].y += af_nbond[j + forceStride];
    f_nbond[j].z += af_nbond[j + 2*forceStride];

  for (int j=0; j < numAtoms; j++) {
    f_slow[j].x += af_slow[j];
    f_slow[j].y += af_slow[j + forceStride];
    f_slow[j].z += af_slow[j + 2*forceStride];

  for (int j=0; j < numAtoms; j++) f[j].x += af[j];
  for (int j=0; j < numAtoms; j++) f[j].y += af[j + forceStride];
  for (int j=0; j < numAtoms; j++) f[j].z += af[j + 2*forceStride];

#ifdef DEBUG_MINIMIZE

  printf("%s, line %d\n", __FILE__, __LINE__);
  printf("  before: f_nbond[%d] = %f %f %f\n",
    k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);

  for (int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
  for (int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
  for (int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
#ifdef DEBUG_MINIMIZE
  printf("  after: f_nbond[%d] = %f %f %f\n",
    k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);

  for (int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
  for (int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
  for (int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];

  for (int j = 0; j < numAtoms; j++) {
    fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
      af[j], af[j + forceStride], af[j + 2*forceStride],
      af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
      af_slow[j], af_slow[j + forceStride], af_slow[j + 2*forceStride]);
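// Note: the kernel's force arrays are structure-of-arrays with stride
// forceStride, so component c of atom j lives at af[j + c*forceStride]. The
// sumNbond / sumSlow template flags let the compiler drop the
// nonbonded-correction and slow accumulation loops entirely when those
// contributions are not needed.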
void ComputeBondedCUDA::finishPatchesOnPe() {

  int myRank = CkMyRank();

  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
  const bool sumNbond = hasModifiedExclusions;
  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;

  for (int i=0; i < patchIDsPerRank[myRank].size(); i++) {
    PatchID patchID = patchIDsPerRank[myRank][i];

      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");

    int pi = patchIndex[patchID];
    int numAtoms = patches[pi].numAtoms;
    int atomStart = patches[pi].atomStart;

#ifdef NODEGROUP_FORCE_REGISTER
    double *af = forces + atomStart;
    double *af_nbond = forces + forceSize + atomStart;
    double *af_slow = forces + 2*forceSize + atomStart;

    if (!sumNbond && !sumSlow) {
      finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && !sumSlow) {
      finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (!sumNbond && sumSlow) {
      finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && sumSlow) {
      finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);

      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");

    double *af = forces + atomStart;
    double *af_nbond = forces + forceSize + atomStart;
    double *af_slow = forces + 2*forceSize + atomStart;

    if (!sumNbond && !sumSlow) {
      finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && !sumSlow) {
      finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (!sumNbond && sumSlow) {
      finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && sumSlow) {
      finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);

      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");

  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);

  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);

  patchesCounter -= patchIDsPerRank[CkMyRank()].size();

  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();

    if (!atomsChanged) this->finishReductions();

  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2645 void ComputeBondedCUDA::finishPatches() {
void ComputeBondedCUDA::finishReductions() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");

  // ...
  cudaCheck(cudaStreamSynchronize(stream));

  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
    if (numTuplesPerType[tupletype] > 0) {

      switch (tupletype) {
        // One case per tuple type (bond, angle, dihedral, improper, exclusion,
        // crossterm). Each case adds that tuple's energy terms to the reduction,
        // using the node-level reduction under NODEGROUP_FORCE_REGISTER and the
        // extra alchemical slots when alchFepOn or alchThermIntOn is set. The
        // case bodies are elided here.
        // ...
        default:
          NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
      }

      auto it = tupleList[tupletype].begin();
      (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
    }
  }
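  // For each tuple type the count is reported through the first Tuples object in
  // tupleList only, so the total in numTuplesPerType enters the reduction once
  // rather than once per self/home compute.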
#ifndef WRITE_FULL_VIRIALS
#error "non-WRITE_FULL_VIRIALS not implemented"
#endif

#ifdef NODEGROUP_FORCE_REGISTER
  // ...
  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_NBOND,    energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_SLOW,     energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
  // ... (the aMD dihedral virial is also folded into the normal virial)
  ADD_TENSOR(nodeReduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
#else
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND,    energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW,     energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
  // ...
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
#endif
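  // energies_virials is the flat host array into which the kernel's energy and
  // virial results are copied; the 3x3 virial tensors sit at fixed offsets
  // (normalVirialIndex, nbondVirialIndex, slowVirialIndex, amdDiheVirialIndex).
  // ADD_TENSOR folds one such tensor into the reduction used on this path:
  // nodeReduction for the GPU-resident (NODEGROUP_FORCE_REGISTER) build, the
  // compute's regular reduction otherwise.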
#ifdef NODEGROUP_FORCE_REGISTER
  // ...
#endif
  // ...
}

void ComputeBondedCUDA::initialize() {
#ifdef NODEGROUP_FORCE_REGISTER
  tupleWorkIndex.store(0);
#endif

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");

  PatchMap* patchMap = PatchMap::Object();

  // Build the list of PEs on this node that have bonded work
  for (int rank=0; rank < computes.size(); rank++) {
    if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
      pes.push_back(CkNodeFirst(CkMyNode()) + rank);
    }
  }

  // Nothing to do on this node
  if (pes.size() == 0) return;
  initializeCalled = true;
  // Create the bonded stream with the highest available priority
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
#else
  cudaCheck(cudaStreamCreate(&stream));
#endif

  cudaCheck(cudaEventCreate(&forceDoneEvent));
  lock = CmiCreateLock();
  printLock = CmiCreateLock();
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...

  for (int rank=0; rank < computes.size(); rank++) {
    std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
    for (auto it = selfComputes.begin(); it != selfComputes.end(); it++) {
      for (auto jt = it->patchIDs.begin(); jt != it->patchIDs.end(); jt++) {
        // ...
        patchIDsPerRank[rank].push_back(*jt);
        allPatchIDs.push_back(*jt);
      }
    }
  }

  for (int rank=0; rank < computes.size(); rank++) {
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    for (int i=0; i < patchIDs.size(); i++) {
      int patchID = patchIDs[i];
      // ...
      patchIDsPerRank[rank].push_back(patchID);
      allPatchIDs.push_back(patchID);
    }
  }
  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());

  // Find the neighbors of all home patches
  std::vector<int> neighborPids;
  for (int rank=0; rank < computes.size(); rank++) {
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    for (int i=0; i < patchIDs.size(); i++) {
      int patchID = patchIDs[i];
      PatchID neighbors[PatchMap::MaxOneAway];
      int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
      for (int j=0; j < numNeighbors; j++) {
        neighborPids.push_back(neighbors[j]);
      }
    }
  }
  // Remove duplicates
  {
    std::sort(neighborPids.begin(), neighborPids.end());
    auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
    neighborPids.resize(std::distance(neighborPids.begin(), it_end));
  }
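  // Duplicate removal here (and for pes and allPatchIDs below) uses the usual
  // sort + std::unique + resize idiom: sorting groups equal IDs together,
  // std::unique compacts the unique ones to the front, and the returned iterator
  // marks the new logical end.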
  // Assign neighbor patches that are homed on this node to the owning rank
  for (int i=0; i < neighborPids.size(); i++) {
    for (int rank=0; rank < computes.size(); rank++) {
      int pid = neighborPids[i];
      int pe = rank + CkNodeFirst(CkMyNode());
      if (patchMap->node(pid) == pe) {
        patchIDsPerRank[rank].push_back(pid);
        allPatchIDs.push_back(pid);
        patchIDsToAppend[rank].push_back(pid);
        pes.push_back(CkNodeFirst(CkMyNode()) + rank);
      }
    }
  }
  // Remove duplicate PEs
  {
    std::sort(pes.begin(), pes.end());
    auto it_end = std::unique(pes.begin(), pes.end());
    pes.resize(std::distance(pes.begin(), it_end));
  }
  // Add the neighbor patches to the home computes on each rank
  for (int rank=0; rank < computes.size(); rank++) {
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    std::vector<int> neighborPatchIDs;
    for (int i=0; i < patchIDs.size(); i++) {
      int patchID = patchIDs[i];
      PatchID neighbors[PatchMap::MaxOneAway];
      int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
      for (int j=0; j < numNeighbors; j++) {
        // ...
        patchIDsPerRank[rank].push_back(neighbors[j]);
        allPatchIDs.push_back(neighbors[j]);
        if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
          && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
          neighborPatchIDs.push_back(neighbors[j]);
        }
      }
    }
    // ...
    for (int i=0; i < neighborPatchIDs.size(); i++) {
      patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
    }
  }

  for (int rank=0; rank < patchIDsToAppend.size(); rank++) {
    for (int i=0; i < patchIDsToAppend[rank].size(); i++) {
      computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
    }
  }

  // Remove duplicate patch IDs
  {
    std::sort(allPatchIDs.begin(), allPatchIDs.end());
    auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
    allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
  }
  setNumPatches(allPatchIDs.size());
  patchesCounter = getNumPatches();
  patches.resize(getNumPatches());
  for (int rank=0; rank < computes.size(); rank++) {
    std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
    for (auto it = selfComputes.begin(); it != selfComputes.end(); it++) {
      tupleList[it->tuples->getType()].push_back(it->tuples);
    }
    HomeCompute& homeCompute = computes[rank].homeCompute;
    for (int i=0; i < homeCompute.tuples.size(); i++) {
      tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
    }
  }
  // Sanity check: every assigned patch ID must appear exactly once across the ranks
  std::vector<char> patchIDset(patchMap->numPatches(), 0);
  int numPatchIDset = 0;
  int numPatchIDs = 0;
  for (int rank=0; rank < computes.size(); rank++) {
    numPatchIDs += patchIDsPerRank[rank].size();
    for (int i=0; i < patchIDsPerRank[rank].size(); i++) {
      PatchID patchID = patchIDsPerRank[rank][i];
      if (patchIDset[patchID] == 0) numPatchIDset++;
      patchIDset[patchID] = 1;
      if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
        NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
      }
    }
  }
  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
    NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
  }
  atomMappers.resize(getNumPatches());
  for (int i=0; i < getNumPatches(); i++) {
    atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
    patchIndex[allPatchIDs[i]] = i;
  }
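  // patchIndex maps a global PatchID to its dense local index, so per-patch state
  // (patches[], atomMappers[]) can live in compact arrays ordered like allPatchIDs.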
  updateHostCudaAlchFlags();
  updateKernelCudaAlchFlags();
  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchParameters();
    bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
    updateHostCudaAlchLambdas();
    bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
    if (hostAlchFlags.alchDecouple) {
      pswitchTable[1+3*1] = 0;
      pswitchTable[2+3*2] = 0;
    }
  }
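  // pswitchTable is indexed by the pair of alchemical partitions of the two atoms.
  // With alchDecouple, interactions entirely inside group 1 or entirely inside
  // group 2 are treated as ordinary, unscaled bonded terms, which is why the two
  // diagonal entries above are reset to 0.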
  // Copy the force-field parameter arrays to the device for every tuple type in use
  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
    if (tupleList[tupletype].size() > 0) {
      switch (tupletype) {

      case Tuples::BOND:
      {
        // ... (NumBondParams and bond_array come from the force-field parameters)
        std::vector<CudaBondValue> bondValues(NumBondParams);
        for (int i=0; i < NumBondParams; i++) {
          bondValues[i].k  = bond_array[i].k;
          bondValues[i].x0 = bond_array[i].x0;
          bondValues[i].x1 = bond_array[i].x1;
        }
        bondedKernel.setupBondValues(NumBondParams, bondValues.data());
      }
      break;
      case Tuples::ANGLE:
      {
        // ... (NumAngleParams and angle_array come from the force-field parameters)
        std::vector<CudaAngleValue> angleValues(NumAngleParams);
        bool normal_ub_error = false;
        for (int i=0; i < NumAngleParams; i++) {
          angleValues[i].k = angle_array[i].k;
          if (angle_array[i].normal == 1) {
            angleValues[i].theta0 = angle_array[i].theta0;
          } else {
            angleValues[i].theta0 = cos(angle_array[i].theta0);
          }
          normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
          angleValues[i].k_ub   = angle_array[i].k_ub;
          angleValues[i].r_ub   = angle_array[i].r_ub;
          angleValues[i].normal = angle_array[i].normal;
        }
        if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
        bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
      }
      break;
      case Tuples::DIHEDRAL:
      {
        // ... (NumDihedralParams and dihedral_array come from the force-field parameters)
        int NumDihedralParamsMult = 0;
        for (int i=0; i < NumDihedralParams; i++) {
          NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
        }
        std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
        dihedralMultMap.resize(NumDihedralParams);
        int k = 0;
        for (int i=0; i < NumDihedralParams; i++) {
          int multiplicity = dihedral_array[i].multiplicity;
          dihedralMultMap[i] = k;
          for (int j=0; j < multiplicity; j++) {
            dihedralValues[k].k     = dihedral_array[i].values[j].k;
            dihedralValues[k].n     = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
            dihedralValues[k].delta = dihedral_array[i].values[j].delta;
            k++;
          }
        }
        bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
      }
      break;
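      // The periodicity is stored pre-packed: dihedralValues[k].n holds
      // (n << 1) | moreTermsFollow, so the kernel can walk all terms of a
      // multi-term dihedral by testing the low bit. The improper setup below uses
      // the same packing; a small decode sketch appears after updatePatchOrder().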
      case Tuples::IMPROPER:
      {
        // ... (NumImproperParams and improper_array come from the force-field parameters)
        int NumImproperParamsMult = 0;
        for (int i=0; i < NumImproperParams; i++) {
          NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
        }
        std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
        improperMultMap.resize(NumImproperParams);
        int k = 0;
        for (int i=0; i < NumImproperParams; i++) {
          int multiplicity = improper_array[i].multiplicity;
          improperMultMap[i] = k;
          for (int j=0; j < multiplicity; j++) {
            improperValues[k].k     = improper_array[i].values[j].k;
            improperValues[k].n     = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
            improperValues[k].delta = improper_array[i].values[j].delta;
            k++;
          }
        }
        bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
      }
      break;
      case Tuples::CROSSTERM:
      {
        // ... (NumCrosstermParams and crossterm_array come from the force-field parameters)
        std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
        const int D = CrosstermValue::dim;
        const int N = CrosstermValue::dim - 1;
        for (int ipar=0; ipar < NumCrosstermParams; ipar++) {
          for (int i=0; i < N; i++) {
            for (int j=0; j < N; j++) {

              // Bicubic interpolation coefficients for this grid cell: a = Ainv * x
#define INDEX(ncols,i,j) ((i)*ncols + (j))
              const double Ainv[16][16] = {
                { 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                { 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                {-3,  3,  0,  0, -2, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                { 2, -2,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                { 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0},
                { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0},
                { 0,  0,  0,  0,  0,  0,  0,  0, -3,  3,  0,  0, -2, -1,  0,  0},
                { 0,  0,  0,  0,  0,  0,  0,  0,  2, -2,  0,  0,  1,  1,  0,  0},
                {-3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0,  0,  0,  0,  0},
                { 0,  0,  0,  0, -3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0},
                { 9, -9, -9,  9,  6,  3, -6, -3,  6, -6,  3, -3,  4,  2,  2,  1},
                {-6,  6,  6, -6, -3, -3,  3,  3, -4,  4, -2,  2, -2, -2, -1, -1},
                { 2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0},
                { 0,  0,  0,  0,  2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0},
                {-6,  6,  6, -6, -4, -2,  4,  2, -3,  3, -3,  3, -2, -1, -2, -1},
                { 4, -4, -4,  4,  2,  2, -2, -2,  2, -2,  2, -2,  1,  1,  1,  1}
              };

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
              const double h = M_PI/12.0;

              const CrosstermData* table = &crossterm_array[ipar].c[0][0];
              const double x[16] = {
                table[INDEX(D,i,j)].d00,     table[INDEX(D,i+1,j)].d00,     table[INDEX(D,i,j+1)].d00,     table[INDEX(D,i+1,j+1)].d00,
                table[INDEX(D,i,j)].d10*h,   table[INDEX(D,i+1,j)].d10*h,   table[INDEX(D,i,j+1)].d10*h,   table[INDEX(D,i+1,j+1)].d10*h,
                table[INDEX(D,i,j)].d01*h,   table[INDEX(D,i+1,j)].d01*h,   table[INDEX(D,i,j+1)].d01*h,   table[INDEX(D,i+1,j+1)].d01*h,
                table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
              };

              float* a = (float *)&crosstermValues[ipar].c[i][j][0];
              for (int k=0; k < 16; k++) {
                double a_val = 0.0;
                for (int l=0; l < 16; l++) {
                  a_val += Ainv[k][l]*x[l];
                }
                a[k] = (float)a_val;
              }

            }
          }
        }
        bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
      }
      break;
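      // The 16 coefficients a[] computed above define the bicubic CMAP surface in
      // one grid cell: x packs the tabulated value d00 and the derivatives d10,
      // d01, d11 at the four cell corners (derivatives scaled by the grid spacing
      // h = pi/12, i.e. 15 degrees), and a = Ainv * x yields coefficients such
      // that the energy at fractional cell coordinates t, u in [0,1] is the sum
      // over p,q of a[p + 4*q] * t^p * u^q. The evaluation itself is done in the
      // CUDA kernel.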
      // ...

      default:
        NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
      }

    }
  }
#ifdef NODEGROUP_FORCE_REGISTER
  // ...
  migrationKernel.setup(nDevices, patchMap->numPatches());

  allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
  allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());

  allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
  allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
  for (int i = 0; i < patchMap->numPatches(); i++) {
    // ... (h_patchMapCenter[i] is set from patchMap->center(i); body elided)
  }
  copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
#endif // NODEGROUP_FORCE_REGISTER
}

#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];

  // Re-map every patch ID to its position in the device-ordered record list
  for (int i=0; i < data.size(); i++) {
    patchIndex[data[i].patchID] = i;
  }

  // Rebuild the patches array in the same order
  // ...
  for (int i=0; i < data.size(); i++) {
    PatchRecord p;
    // ... (p is filled in from data[i])
    patches.push_back(p);
  }
}
#endif // NODEGROUP_FORCE_REGISTER
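// [Illustrative sketch, not part of the original source] Decoding the packed
// periodicity written by the dihedral and improper setup in initialize() above,
// where the stored value is (n << 1) | moreTermsFollow:
static inline void decodePackedPeriodicity(int packed, int& n, bool& moreTermsFollow) {
  n = packed >> 1;                       // force-field periodicity
  moreTermsFollow = (packed & 1) != 0;   // another term of the same tuple follows
}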
void ComputeBondedCUDA::updateHostCudaAlchFlags() {
  const SimParameters& sim_params = *(Node::Object()->simParameters);
  // ...
  hostAlchFlags.alchFepOn        = sim_params.alchFepOn;
  // ...
  hostAlchFlags.alchWCAOn        = sim_params.alchWCAOn;
  hostAlchFlags.alchLJcorrection = sim_params.LJcorrection;
  // ... (the remaining alchemical flags are copied the same way)
}

void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
  bondedKernel.updateCudaAlchFlags(hostAlchFlags);
}
void ComputeBondedCUDA::updateHostCudaAlchParameters() {
  // ... (fills hostAlchParameters from the SimParameters values)
}

void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
}
3422 hostAlchLambdas.bondLambda1 = float(sim_params.
getBondLambda(hostAlchLambdas.currentLambda));
3423 hostAlchLambdas.bondLambda2 = float(sim_params.
getBondLambda(1.0 - hostAlchLambdas.currentLambda));
3424 hostAlchLambdas.bondLambda12 = float(sim_params.
getBondLambda(hostAlchLambdas.currentLambda2));
3425 hostAlchLambdas.bondLambda22 = float(sim_params.
getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
3426 hostAlchLambdas.elecLambdaUp = float(sim_params.
getElecLambda(hostAlchLambdas.currentLambda));
3427 hostAlchLambdas.elecLambda2Up = float(sim_params.
getElecLambda(hostAlchLambdas.currentLambda2));
3428 hostAlchLambdas.elecLambdaDown = float(sim_params.
getElecLambda(1.0 - hostAlchLambdas.currentLambda));
3429 hostAlchLambdas.elecLambda2Down = float(sim_params.
getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
3430 hostAlchLambdas.vdwLambdaUp = float(sim_params.
getVdwLambda(hostAlchLambdas.currentLambda));
3431 hostAlchLambdas.vdwLambda2Up = float(sim_params.
getVdwLambda(hostAlchLambdas.currentLambda2));
3432 hostAlchLambdas.vdwLambdaDown = float(sim_params.
getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
3433 hostAlchLambdas.vdwLambda2Down = float(sim_params.
getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
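// The "Up" factors scale the bonded terms of alchemical group 1 (evaluated at the
// current lambda) and the "Down" factors scale group 2 (evaluated at 1 - lambda);
// the *2 variants repeat the same evaluation at currentLambda2, the second coupling
// value used for comparison energies (e.g. FEP energy differences).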
void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
}
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::registerPointersToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ... (deviceIndex, dst, count and offset are obtained here; declarations elided)

  patchData->h_tupleDataStage.bond[deviceIndex]              = dst.bond;
  patchData->h_tupleDataStage.angle[deviceIndex]             = dst.angle;
  patchData->h_tupleDataStage.dihedral[deviceIndex]          = dst.dihedral;
  patchData->h_tupleDataStage.improper[deviceIndex]          = dst.improper;
  patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
  patchData->h_tupleDataStage.exclusion[deviceIndex]         = dst.exclusion;
  patchData->h_tupleDataStage.crossterm[deviceIndex]         = dst.crossterm;

  patchData->h_tupleCount.bond[deviceIndex]              = count.bond();
  patchData->h_tupleCount.angle[deviceIndex]             = count.angle();
  patchData->h_tupleCount.dihedral[deviceIndex]          = count.dihedral();
  patchData->h_tupleCount.improper[deviceIndex]          = count.improper();
  patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
  patchData->h_tupleCount.exclusion[deviceIndex]         = count.exclusion();
  patchData->h_tupleCount.crossterm[deviceIndex]         = count.crossterm();

  patchData->h_tupleOffset.bond[deviceIndex]              = offset.bond();
  patchData->h_tupleOffset.angle[deviceIndex]             = offset.angle();
  patchData->h_tupleOffset.dihedral[deviceIndex]          = offset.dihedral();
  patchData->h_tupleOffset.improper[deviceIndex]          = offset.improper();
  patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
  patchData->h_tupleOffset.exclusion[deviceIndex]         = offset.exclusion();
  patchData->h_tupleOffset.crossterm[deviceIndex]         = offset.crossterm();
}
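// registerPointersToHost() publishes this device's staging pointers, tuple counts
// and offsets into node-wide tables in PatchData, indexed by deviceIndex, so that
// copyHostRegisterToDevice() below can hand every peer's entries to the migration
// kernel for peer access during tuple migration.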
void ComputeBondedCUDA::copyHostRegisterToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  migrationKernel.copyPeerDataToDevice(
    patchData->h_tupleDataStage,
    patchData->h_tupleCount,
    patchData->h_tupleOffset,
    /* ... remaining arguments elided ... */);
}
void ComputeBondedCUDA::copyPatchData() {
  // ... (numPatches is the patch map's patch count)
  for (int i = 0; i < numPatches; i++) {
    h_patchRecord[i] = patches[patchIndex[i]];
  }
  copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
}
#endif // NODEGROUP_FORCE_REGISTER

#endif // BONDED_CUDA