#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define __thread __declspec(thread)

const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {

const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {

  Compute(c), deviceID(deviceID), masterPe(CkMyPe()),
  bondedKernel(deviceID, cudaNonbondedTables), computeMgr(computeMgr)
{
  computes.resize(CkMyNodeSize());
  patchIDsPerRank.resize(CkMyNodeSize());
  numExclPerRank.resize(CkMyNodeSize());
  for (int i=0; i < numExclPerRank.size(); i++) {
    numExclPerRank[i].numModifiedExclusions = 0;
    numExclPerRank[i].numExclusions = 0;
  }

  energies_virials = NULL;

  initializeCalled = false;

  accelMDdoDihe = false;
  if (params->accelMDOn) {
    if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe = true;
  }

  pswitchTable[0] = 0; pswitchTable[1] = 1; pswitchTable[2] = 2;
  pswitchTable[3] = 1; pswitchTable[4] = 1; pswitchTable[5] = 99;
  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;

  h_patchRecord = NULL;
  d_patchRecord = NULL;

  h_patchMapCenter = NULL;
  d_patchMapCenter = NULL;
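
  // Note on pswitchTable (a reading of the values initialized above, not new
  // behavior): it is laid out as a 3x3 lookup indexed by (p1 + 3*p2), where p1
  // and p2 appear to be the alchemical partition codes of the two atoms in an
  // excluded pair (0 = unperturbed, 1 = appearing, 2 = disappearing). Entries
  // 0/1/2 select the corresponding switching path, and 99 seemingly flags
  // partition combinations that should never interact (e.g. 1 paired with 2).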
ComputeBondedCUDA::~ComputeBondedCUDA() {

  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
  if (tupleData != NULL) deallocate_host<char>(&tupleData);

  if (initializeCalled) {
    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
    CmiDestroyLock(printLock);
    if (reductionGpuOffload) {
      delete reductionGpuOffload;
    }
    if (reductionGpuResident) {
      delete reductionGpuResident;
    }
  }

  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);

  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);
}
void ComputeBondedCUDA::unregisterBoxesOnPe() {
  for (int i=0; i < patchIDsPerRank[CkMyRank()].size(); i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
    if (tpe == NULL || tpe->p == NULL) {
      NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
    }
  }
}
void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");

  int rank = CkRankOf(pe);

  HomeCompute& homeCompute = computes[rank].homeCompute;
  if (homeCompute.patchIDs.size() == 0) {
    homeCompute.patchIDs.resize(pids.size());
    for (int i=0; i < pids.size(); i++) {
      homeCompute.patchIDs[i] = pids[i];
      homeCompute.isBasePatch[pids[i]] = 1;
    }
  } else {
    if (homeCompute.patchIDs.size() != pids.size()) {
      NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
    }
    for (int i=0; i < pids.size(); i++) {
      if (homeCompute.patchIDs[i] != pids[i]) {
        NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
      }
    }
  }

  homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));

  homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));

  homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));

  homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));

  homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));

  homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));

  homeCompute.tuples.push_back(new HomeTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE));

  homeCompute.tuples.push_back(new HomeTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO));

  homeCompute.tuples.push_back(new HomeTuples<OneFourNbTholeElem, OneFourNbThole, OneFourNbTholeValue>(Tuples::ONEFOURNBTHOLE));

  NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
}
void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");

  int rank = CkRankOf(pe);

  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
  if (it == selfComputes.end()) {
    selfComputes.push_back(SelfCompute(type));
    it = selfComputes.begin() + (selfComputes.size() - 1);

    it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);

    it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);

    it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);

    it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);

    it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);

    it->tuples = new SelfTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE);

    it->tuples = new SelfTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO);

    it->tuples = new SelfTuples<OneFourNbTholeElem, OneFourNbThole, OneFourNbTholeValue>(Tuples::ONEFOURNBTHOLE);

    NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
  }

  it->patchIDs.push_back(pid);
}
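
// Note on the two registration paths above: registerCompute() fills a per-rank
// HomeCompute whose HomeTuples span the full list of base patches passed in
// through `pids`, while registerSelfCompute() keeps one SelfCompute entry per
// tuple type and simply appends the single patch ID it is given. The
// type-specific `new` expressions are evidently the alternatives of a switch
// on `type` (note the "Unsupported compute type" fallback in both routines).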
void ComputeBondedCUDA::assignPatchesOnPe() {

  for (int i=0; i < patchIDsPerRank[CkMyRank()].size(); i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
    if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
    if (tpe->p != NULL) {
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
    }
  }
}
void ComputeBondedCUDA::atomUpdate() {
  atomsChangedIn = true;
}

int ComputeBondedCUDA::noWork() {

void ComputeBondedCUDA::messageEnqueueWork() {
  if (masterPe != CkMyPe())
    NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");

void ComputeBondedCUDA::doWork() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");

  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;

  if (getNumPatches() == 0) {

    NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");

  lattice = flags->lattice;
  doEnergy = flags->doEnergy;
  doVirial = flags->doVirial;
  doSlow = flags->doFullElectrostatics;
  doMolly = flags->doMolly;

  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchLambdas();
    updateKernelCudaAlchLambdas();

    if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {

  if (params->CUDASOAintegrate) {
    if (!atomsChanged) this->openBoxesOnPe();
  }
void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
#ifdef NODEGROUP_FORCE_REGISTER
  patches[patchIndex[pid]].patchID = pid;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {

void ComputeBondedCUDA::updatePatches() {

  for (int i=0; i < patches.size(); i++) {
    patches[i].atomStart = atomStart;
    atomStart += patches[i].numAtoms;
  }
  atomStorageSize = atomStart;

  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  std::vector<CudaLocalRecord>& localPatches =
    patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy =
    patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  for (int i=0; i < numPatchesHomeAndProxy; i++) {
    patches[i].numAtoms = localPatches[i].numAtoms;
    patches[i].atomStart = localPatches[i].bufferOffset;
    atomStart += patches[i].numAtoms;
  }
  atomStorageSize = atomStart;
  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    bondedKernel.updateAtomBuffer(atomStorageSize, stream);
    updatePatchRecords();
  }
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::mapAtoms() {
  for (int i=0; i < getNumPatches(); i++) {

void ComputeBondedCUDA::unmapAtoms() {
  for (int i=0; i < getNumPatches(); i++) {
void ComputeBondedCUDA::openBoxesOnPe(int startup) {

  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];

  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);

#ifdef NODEGROUP_FORCE_REGISTER
  if (Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
    for (auto it=patchIDs.begin(); it != patchIDs.end(); it++) {

  for (auto it=patchIDs.begin(); it != patchIDs.end(); it++) {

    if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
      int pi = patchIndex[patchID];
      int atomStart = patches[pi].atomStart;
      int numAtoms = patches[pi].numAtoms;

      for (int i=0; i < numAtoms; i++) {
        atoms[atomStart + j] = src[i];

  patchesCounter -= patchIDs.size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();

    if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {

      if (!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);

    if (params->CUDASOAintegrate) {
      if (!atomsChanged) this->launchWork();

#ifdef NODEGROUP_FORCE_REGISTER
  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
  for (int i = 0; i < tuplePatchList.size(); i++) {
    if (tpe == NULL) break;
    fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n", CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);
  }
  CmiUnlock(printLock);
void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
  numModifiedExclusions = 0;
  int ntuples = tuples->getNumTuples();
  for (int ituple=0; ituple < ntuples; ituple++) {
    if (src[ituple].modified) numModifiedExclusions++;
  }
  numExclusions = ntuples - numModifiedExclusions;
}
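
// Note: exclusion tuples fall into two populations. Tuples whose `modified`
// flag is set (the scaled 1-4 pairs) are counted separately from plain
// exclusions, since the two kinds are later staged into different regions of
// the tuple buffer (see exclusionStartPos / exclusionStartPos2 below). The
// per-rank totals gathered with this helper drive that buffer layout.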
void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {

  int numModifiedExclusions = 0;
  int numExclusions = 0;
  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
    std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
    for (auto it=selfComputes.begin(); it != selfComputes.end(); it++) {
      it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);

      countNumExclusions(it->tuples, tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;
    }

    HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
    for (int i=0; i < homeCompute.tuples.size(); i++) {
      homeCompute.tuples[i]->loadTuples(tuplePatchList,
        homeCompute.isBasePatch.data(), &atomMap,
        homeCompute.patchIDs);

      countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;
    }
  } else {
    numModifiedExclusions = modifiedExclusionTupleData.size();
    numExclusions = exclusionTupleData.size();
  }

  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
  numExclPerRank[CkMyRank()].numExclusions = numExclusions;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {

    patchesCounter -= patchIDsPerRank[CkMyRank()].size();
    if (patchesCounter == 0) {
      patchesCounter = getNumPatches();

      if (!params->CUDASOAintegrate) computeMgr->sendLaunchWork(masterPe, this);
void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);

    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
    }
    dst[ituple] = dstval;
  }
}
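
// Note on the pattern shared by the copy*Data() routines above and below: each
// tuple is rewritten in terms of device-global atom indices (local index plus
// the owning patch's atomStart), the parameter pointer is turned into a table
// index (value - <param>_array), and a per-tuple shift vector is computed with
// lattice.wrap_delta_scaled() so the GPU kernel can apply the minimum-image
// convention without consulting PatchMap. When the two atoms live in different
// patches, the difference of the patch centers is folded into the same shift.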
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const BondElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];

  dstval.index[0] = l0;
  dstval.index[1] = l1;

  if (hostAlchFlags.alchOn) {
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,

  float3 b1f, b2f, b3f;
  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);

    Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 diff = position1 - position2;
    float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
    float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
    float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
    }
    dst[ituple] = dstval;
  }
}
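
// Note on the single-precision path above: b1f/b2f/b3f hold the reciprocal
// lattice vectors, so for a bond displacement `diff` the values
//
//   d_k = -floorf(dot(b_k, diff) + 0.5f)   // i.e. -round(fractional coordinate)
//
// are the integer image shifts along each lattice direction, equivalent in
// spirit to what wrap_delta_scaled() computes in double precision.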
void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.itype = (src[ituple].value - angle_array);

    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
    }
    dst[ituple] = dstval;
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;

  if (hostAlchFlags.alchOn) {
#endif // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;

    if (doDihedral) {
      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
    } else {
      dstval.itype = improperMultMap[(src[ituple].value - p_array)];
    }

    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
    }
    dst[ituple] = dstval;
  }
}
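
// Note: copyDihedralData<> is shared by dihedrals and impropers; the
// compile-time flag doDihedral only selects whether the multiplicity lookup
// goes through dihedralMultMap or improperMultMap. Everything else (global
// index remapping and the three shift vectors) is identical for both types.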
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];

  dstval.itype = dihedralMultMap[(src.value - p_array)];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (hostAlchFlags.alchOn) {

void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];

  dstval.itype = improperMultMap[(src.value - p_array)];

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (hostAlchFlags.alchOn) {
template <typename T, typename P, typename D>
void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, std::vector<D>& dst) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
    dst.push_back(dstval);
  }
}
#endif // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;

    if (doDihedral) {
      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
    } else {
      dstval.itype = improperMultMap[(src[ituple].value - p_array)];
    }

    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);

    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 position3 = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
    float3 position4 = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);

    float3 diff12, diff23, diff43;
    diff12 = position1 - position2;
    diff23 = position2 - position3;
    diff43 = position4 - position3;

    float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
    float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
    float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);

    float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
    float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
    float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);

    float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
    float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
    float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
    }
    dst[ituple] = dstval;
  }
}
void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];

    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);

    ce.i = l0 + patches[pi0].atomStart;
    ce.j = l1 + patches[pi1].atomStart;

    if (hostAlchFlags.alchOn) {
      ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];
    }

    if (src[ituple].modified) {
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int l0 = src.localIndex[0];
  int l1 = src.localIndex[1];

  dstval.index[0] = l0;
  dstval.index[1] = l1;

  if (hostAlchFlags.alchOn) {
    dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];
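
  // Note: when alchemy is on, each exclusion is classified from the partition
  // codes of its two atoms (p1, p2) via pswitchTable[p1 + 3*p2], the 3x3 table
  // filled in the constructor; the resulting pswitch presumably tells the GPU
  // kernel which alchemical scaling branch to apply to that pair.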
void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
    if (src[ituple].modified) {
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    auto p4 = src[ituple].p[4];
    auto p5 = src[ituple].p[5];
    auto p6 = src[ituple].p[6];
    auto p7 = src[ituple].p[7];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int pi4 = patchIndex[p4->patchID];
    int pi5 = patchIndex[p5->patchID];
    int pi6 = patchIndex[p6->patchID];
    int pi7 = patchIndex[p7->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    int l4 = src[ituple].localIndex[4];
    int l5 = src[ituple].localIndex[5];
    int l6 = src[ituple].localIndex[6];
    int l7 = src[ituple].localIndex[7];
    dst[ituple].i1 = l0 + patches[pi0].atomStart;
    dst[ituple].i2 = l1 + patches[pi1].atomStart;
    dst[ituple].i3 = l2 + patches[pi2].atomStart;
    dst[ituple].i4 = l3 + patches[pi3].atomStart;
    dst[ituple].i5 = l4 + patches[pi4].atomStart;
    dst[ituple].i6 = l5 + patches[pi5].atomStart;
    dst[ituple].i7 = l6 + patches[pi6].atomStart;
    dst[ituple].i8 = l7 + patches[pi7].atomStart;
    dst[ituple].itype = (src[ituple].value - crossterm_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Position position5 = p4->x[l4].position;
    Position position6 = p5->x[l5].position;
    Position position7 = p6->x[l6].position;
    Position position8 = p7->x[l7].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
    Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
    Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
    Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    if (pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
    if (pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
    if (pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
    dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
    dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
    dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
    dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
    dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      int typeSum1 = 0, typeSum2 = 0;
      for (size_t i = 0; i < 4; ++i) {
        typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
        typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
      }
      int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
      if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
        dst[ituple].fepBondedType = 1;
      } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
        dst[ituple].fepBondedType = 2;
      } else {
        dst[ituple].fepBondedType = 0;
      }
    }
    dst[ituple].scale = src[ituple].scale;
  }
}
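
// Note on the alchemical classification above: each crossterm half (atoms 0-3
// and 4-7) gets a signed sum of per-atom FEP partition codes, with partition 2
// counted as -1 so that the two perturbed groups pull the sum in opposite
// directions. A nonzero same-sign sum smaller in magnitude than `order`
// (4, or 5 when alchBondDecouple is on) appears to assign the term to group 1
// or group 2 (fepBondedType = 1 or 2); everything else stays at 0 and is
// treated as unperturbed.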
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src,

  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];
  int pi4 = patchIndex[p4->patchID];
  int pi5 = patchIndex[p5->patchID];
  int pi6 = patchIndex[p6->patchID];
  int pi7 = patchIndex[p7->patchID];

  dstval.itype = (src.value - crossterm_array);

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  dstval.index[4] = l4;
  dstval.index[5] = l5;
  dstval.index[6] = l6;
  dstval.index[7] = l7;

  if (hostAlchFlags.alchOn) {
    int typeSum1 = 0, typeSum2 = 0;
    for (size_t i = 0; i < 4; ++i) {

    int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
    if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
    } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTholeData(const int ntuples, const TholeElem* __restrict__ src,

  for (int ituple=0; ituple < ntuples; ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;
    dst[ituple].j = l1 + patches[pi1].atomStart;
    dst[ituple].k = l2 + patches[pi2].atomStart;
    dst[ituple].l = l3 + patches[pi3].atomStart;

    const TholeValue* const(&value)(src[ituple].value);
    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
    }
    dst[ituple].aa = value->aa;
    dst[ituple].qq = value->qq;

    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
    Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
    Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
    if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
    if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
    if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
    dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
    dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
    dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
      typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
      const int order = (!params->alchDecouple ? 3 : 2);
      if (typeSum == 0 || abs(typeSum) == order) dst[ituple].fepBondedType = 0;
      else if (0 < typeSum && typeSum < order) dst[ituple].fepBondedType = 1;
      else if (-order < typeSum && typeSum < 0) dst[ituple].fepBondedType = 2;
      else dst[ituple].fepBondedType = 0;
    } else {
      dst[ituple].fepBondedType = 0;
    }
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const TholeElem& src,

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
  }
  dstval.aa = value->aa;
  dstval.qq = value->qq;
  if (hostAlchFlags.alchOn) {
    const int order = (!params->alchDecouple ? 3 : 2);
void ComputeBondedCUDA::copyAnisoData(const int ntuples, const AnisoElem* __restrict src,

  for (int ituple=0; ituple < ntuples; ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;

    dst[ituple].j = l0 + 1 + patches[pi0].atomStart;
    dst[ituple].l = l1 + patches[pi1].atomStart;
    dst[ituple].m = l2 + patches[pi2].atomStart;
    dst[ituple].n = l3 + patches[pi3].atomStart;
    const AnisoValue* const(&value)(src[ituple].value);
    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyAnisoData.\n");
    }
    dst[ituple].kpar0 = 2.0 * value->k11;
    dst[ituple].kperp0 = 2.0 * value->k22;
    dst[ituple].kiso0 = 2.0 * value->k33;
    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_il = lattice.wrap_delta_scaled(position0, position1);
    Vector shiftVec_mn = lattice.wrap_delta_scaled(position2, position3);
    if (pi0 != pi1) shiftVec_il += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi2 != pi3) shiftVec_mn += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_il = make_float3((float)shiftVec_il.x, (float)shiftVec_il.y, (float)shiftVec_il.z);
    dst[ituple].offset_mn = make_float3((float)shiftVec_mn.x, (float)shiftVec_mn.y, (float)shiftVec_mn.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);

      dst[ituple].fepBondedType = 0;
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AnisoElem& src,

  dstval.index[0] = l0;
  dstval.index[1] = l0 + 1;
  dstval.index[2] = l1;
  dstval.index[3] = l2;
  dstval.index[4] = l3;

  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
  }
  dstval.kpar0 = 2.0 * value->k11;
  dstval.kperp0 = 2.0 * value->k22;
  dstval.kiso0 = 2.0 * value->k33;
  if (hostAlchFlags.alchOn) {
void ComputeBondedCUDA::copyOneFourNbTholeData(

  for (int ituple=0; ituple < ntuples; ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;
    dst[ituple].j = l1 + patches[pi1].atomStart;
    dst[ituple].k = l2 + patches[pi2].atomStart;
    dst[ituple].l = l3 + patches[pi3].atomStart;

    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
    }
    dst[ituple].aa = value->aa;
    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
    Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
    Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
    if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
    if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
    if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
    dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
    dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
    dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;

  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
  }
  dstval.aa = value->aa;
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorker(first, last);
}

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorkerExcl(first, last);
}

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {

  int64_t numModExclusionsBefore = 0;
  int64_t numExclusionsBefore = 0;

    int ntuples = (*it)->getNumTuples();
    auto thisExcl = (ExclElem *)(*it)->getTupleList();
    for (int ituple=0; ituple < ntuples; ituple++) {
      if (thisExcl[ituple].modified)
        numModExclusionsBefore++;
      else
        numExclusionsBefore++;
    }

  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];

  int numExclusionsWork = last - first;
  auto itEnd = std::next(itStart, numExclusionsWork + 1);
  for (auto it=itStart; it != itEnd; it++) {
    int ntuples = (*it)->getNumTuples();
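
  // Note: each worker handling exclusions first walks the exclusion tuple
  // lists that precede its [first, last] slice, counting how many modified and
  // how many plain exclusions come before it. Those counts, scaled by
  // CudaTupleTypeSize[Tuples::EXCLUSION], give the worker its private write
  // offsets (pos for modified, pos2 for plain) into the shared tupleData
  // buffer, so concurrent workers fill disjoint ranges without synchronizing.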
void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {

  int64_t pos = exclusionStartPos;
  int64_t pos2 = exclusionStartPos2;

    int ntuples = (*it)->getNumTuples();

  for (int i=first; i <= last; i++) {
    switch (tupleCopyWorkList[i].tupletype) {

      copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
        Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
        Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
        Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
        (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
        (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
        (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,

      copyTholeData(tupleCopyWorkList[i].ntuples, (TholeElem*)tupleCopyWorkList[i].tupleElemList,
        nullptr, (CudaThole *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      copyAnisoData(tupleCopyWorkList[i].ntuples, (AnisoElem*)tupleCopyWorkList[i].tupleElemList,
        nullptr, (CudaAniso*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      case Tuples::ONEFOURNBTHOLE:
        copyOneFourNbTholeData(tupleCopyWorkList[i].ntuples, (OneFourNbTholeElem*)tupleCopyWorkList[i].tupleElemList,
          nullptr, (CudaOneFourNbThole*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);

      default:
        NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
    }
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  int numBondsTest = patchData->maxNumBonds.load();
  while (numBondsTest < counts.bond &&
    !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));

  int numAnglesTest = patchData->maxNumAngles.load();
  while (numAnglesTest < counts.angle &&
    !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));

  int numDihedralsTest = patchData->maxNumDihedrals.load();
  while (numDihedralsTest < counts.dihedral &&
    !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));

  int numImpropersTest = patchData->maxNumImpropers.load();
  while (numImpropersTest < counts.improper &&
    !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));

  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
    !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));

  int numExclusionsTest = patchData->maxNumExclusions.load();
  while (numExclusionsTest < counts.exclusion &&
    !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));

  int numCrosstermsTest = patchData->maxNumCrossterms.load();
  while (numCrosstermsTest < counts.crossterm &&
    !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
}

TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  counts.bond = patchData->maxNumBonds.load();
  counts.angle = patchData->maxNumAngles.load();
  counts.dihedral = patchData->maxNumDihedrals.load();
  counts.improper = patchData->maxNumImpropers.load();
  counts.exclusion = patchData->maxNumExclusions.load();
  counts.crossterm = patchData->maxNumCrossterms.load();

  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",
void ComputeBondedCUDA::migrateTuples(bool startup) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  bool amMaster = (masterPe == CkMyPe());

  cudaStreamSynchronize(stream);

  cudaStreamSynchronize(stream);

  const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];

  TupleCounts counts = bondedKernel.getTupleCounts();
  migrationKernel.computeTupleDestination(
    patchData->devData[devInd].numPatchesHome,
    migrationDestination,
    patchData->devData[devInd].d_patchToDeviceMap,
    patchData->devData[devInd].d_globalToLocalID,

  cudaCheck(cudaStreamSynchronize(stream));

  migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
  cudaCheck(cudaStreamSynchronize(stream));

  bool realloc = false;

  if (amMaster && !startup) {
    migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);

    local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
    updateMaxTupleCounts(local);

    CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",

  if (amMaster && !startup) {
    newMax = getMaxTupleCounts();
    realloc = migrationKernel.reallocateBufferDst(newMax);
    patchData->tupleReallocationFlagPerDevice[devInd] = realloc;

  realloc = patchData->tupleReallocationFlagPerDevice[0];

  registerPointersToHost();

  copyHostRegisterToDevice();
  cudaCheck(cudaStreamSynchronize(stream));

  if (amMaster && !startup) {
    migrationKernel.performTupleMigration(
      bondedKernel.getTupleCounts(),

    cudaCheck(cudaStreamSynchronize(stream));

  if (amMaster && !startup) {

    bondedKernel.reallocateTupleBuffer(newMax, stream);
    migrationKernel.reallocateBufferSrc(newMax);
    cudaCheck(cudaStreamSynchronize(stream));

  bondedKernel.setTupleCounts(local);

  const int* ids = patchData->h_soa_id[devInd];
  migrationKernel.updateTuples(
    bondedKernel.getTupleCounts(),
    bondedKernel.getData(),

    bondedKernel.getAtomBuffer(),
template <typename T>
void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {

  std::vector<std::pair<int,int>> downstreamPatches;
  for (int i = 0; i < tuples.size(); i++) {
    int downstream = tuples[i].patchIDs[0];
    for (int j = 1; j < T::size; j++) {
      downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);
    }
    downstreamPatches.push_back(std::make_pair(i, downstream));
    tupleCounts[patchIndex[downstream]]++;
  }

  tupleOffsets[0] = 0;
  for (int i = 0; i < tupleCounts.size(); i++) {
    tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];
  }

  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
    [](std::pair<int, int> a, std::pair<int, int> b) {
      return a.second < b.second;
    });

  std::vector<T> copy = tuples;
  for (int i = 0; i < tuples.size(); i++) {
    tuples[i] = copy[downstreamPatches[i].first];
  }
}
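
// Note: sortTupleList() reorders the staged tuples by their downstream patch.
// It (1) reduces each tuple's patchIDs to a single downstream patch via
// PatchMap::downstream(), counting tuples per patch, (2) turns those counts
// into exclusive prefix-sum offsets, and (3) stable-sorts the (tuple index,
// downstream patch) pairs, so tuples of the same patch keep their original
// relative order when the permutation is applied at the end.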
void ComputeBondedCUDA::sortAndCopyToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;

  std::vector<int> bondPatchCounts(numPatchesHome, 0);
  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> anglePatchCounts(numPatchesHome, 0);
  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);

  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> improperPatchCounts(numPatchesHome, 0);
  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);

#define CALL(fieldName) do { \
    sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
    h_dataStage.fieldName = fieldName##TupleData.data(); \
    h_counts.fieldName = fieldName##PatchCounts.data(); \
    h_offsets.fieldName = fieldName##PatchOffsets.data(); \
  } while (0)

  CALL(modifiedExclusion);

  migrationKernel.copyTupleToDevice(
    bondedKernel.getTupleCounts(),

  cudaCheck(cudaStreamSynchronize(stream));
void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {

  switch (tupletype) {

    modifiedExclusionTupleData.clear();
    exclusionTupleData.clear();
    int64_t pos = exclusionStartPos;
    int64_t pos2 = exclusionStartPos2;

      int ntuples = (*it)->getNumTuples();

        modifiedExclusionTupleData, exclusionTupleData, pos, pos2);

    bondTupleData.clear();

      int ntuples = (*it)->getNumTuples();
      copyToStage<BondElem, BondValue, CudaBondStage>(

    angleTupleData.clear();
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();

    dihedralTupleData.clear();
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();
      copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList,

    improperTupleData.clear();
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();
      copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList,

    crosstermTupleData.clear();
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();
      copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList,

    tholeTupleData.clear();
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      const int ntuples = (*it)->getNumTuples();
      copyToStage<TholeElem, TholeValue, CudaTholeStage>(ntuples, elemList, nullptr, tholeTupleData);

    anisoTupleData.clear();
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      const int ntuples = (*it)->getNumTuples();
      copyToStage<AnisoElem, AnisoValue, CudaAnisoStage>(ntuples, elemList, nullptr, anisoTupleData);

    case Tuples::ONEFOURNBTHOLE:
      oneFourNbTholeTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        const int ntuples = (*it)->getNumTuples();
        copyToStage<OneFourNbTholeElem, OneFourNbTholeValue, CudaOneFourNbTholeStage>(ntuples, elemList, nullptr, oneFourNbTholeTupleData);

    default:
      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
  }
}
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleData() {

  int numModifiedExclusions = 0;
  int numExclusions = 0;
  for (int i=0; i < numExclPerRank.size(); i++) {
    numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
    numExclusions += numExclPerRank[i].numExclusions;
  }

  exclusionStartPos = 0;
  exclusionStartPos2 = 0;
  tupleCopyWorkList.clear();
  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {

    int64_t pos = posWA;

    exclusionStartPos = pos;
    exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];

    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();

      TupleCopyWork tupleCopyWork;
      tupleCopyWork.tupletype = tupletype;
      tupleCopyWork.ntuples = ntuples;
      tupleCopyWork.tupleElemList = (*it)->getTupleList();
      tupleCopyWork.tupleDataPos = pos;
      tupleCopyWorkList.push_back(tupleCopyWork);
      pos += ntuples*CudaTupleTypeSize[tupletype];
    }
    numTuplesPerType[tupletype] = num;

    posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
  }

  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
    NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
  }

  hasExclusions = (numExclusions > 0);
  hasModifiedExclusions = (numModifiedExclusions > 0);

  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

#if CMK_SMP && USE_CKLOOP
  if (useCkLoop >= 1) {
#define CKLOOP_EXCLUSIONS 1
#if CKLOOP_EXCLUSIONS
    CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);

    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);

    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);

  tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);

    numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
    numTuplesPerType[Tuples::ONEFOURNBTHOLE], tupleData, stream);

  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
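
// Note on the work distribution above: with CkLoop enabled, the exclusion
// lists and the per-type tupleCopyWorkList entries are parallelized across the
// node's PEs, while the serial fallback calls tupleCopyWorker(-1, ...). The
// sentinel first index of -1 appears to make the worker perform the exclusion
// copies (the pos/pos2 bookkeeping at the top of tupleCopyWorker) before
// walking the regular work-list entries.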
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleDataSN() {

  size_t numExclusions, numModifiedExclusions, copyIndex;

  if (masterPe == CkMyPe()) {
    numModifiedExclusions = 0;

    for (int i=0; i < numExclPerRank.size(); i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions += numExclPerRank[i].numExclusions;
    }

    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {

      int64_t pos = posWA;

      exclusionStartPos = pos;
      exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];

      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

        TupleCopyWork tupleCopyWork;
        tupleCopyWork.tupletype = tupletype;
        tupleCopyWork.ntuples = ntuples;
        tupleCopyWork.tupleElemList = (*it)->getTupleList();
        tupleCopyWork.tupleDataPos = pos;
        tupleCopyWorkList.push_back(tupleCopyWork);
        pos += ntuples*CudaTupleTypeSize[tupletype];
      }
      numTuplesPerType[tupletype] = num;

      posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
    }

    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
    }

    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);

    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
  }

  this->tupleCopyWorker(first, last);

  while ((copyIndex = tupleWorkIndex.fetch_add(1)) <= tupleCopyWorkList.size()) {
    this->tupleCopyWorker(copyIndex - 1, copyIndex - 1);
  }

  if (masterPe == CkMyPe()) {
    tupleWorkIndex.store(0);

    numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
    numTuplesPerType[Tuples::ONEFOURNBTHOLE],

    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
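
// Note: in the single-node path above, every PE pulls work items with
// tupleWorkIndex.fetch_add(1); each returned value (minus one) names one entry
// of tupleCopyWorkList, so the copy work is distributed dynamically across the
// PEs, and the master PE resets the shared counter once the list is consumed.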
void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {

  size_t numExclusions, numModifiedExclusions, copyIndex;

  if (masterPe == CkMyPe()) {
    numModifiedExclusions = 0;

    for (int i=0; i < numExclPerRank.size(); i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions += numExclPerRank[i].numExclusions;
    }

    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {

      exclusionStartPos = pos;
      exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];

      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();

        TupleCopyWork tupleCopyWork;
        tupleCopyWork.tupletype = tupletype;
        tupleCopyWork.ntuples = ntuples;
        tupleCopyWork.tupleElemList = (*it)->getTupleList();
        tupleCopyWork.tupleDataPos = pos;
        tupleCopyWorkList.push_back(tupleCopyWork);
        pos += ntuples*CudaTupleTypeSizeStage[tupletype];
      }
      numTuplesPerType[tupletype] = num;

      posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];
    }

    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
    }

    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);

    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

    local.modifiedExclusion = numModifiedExclusions;
    local.exclusion = numExclusions;

    local.thole = numTuplesPerType[Tuples::THOLE];
    local.aniso = numTuplesPerType[Tuples::ANISO];
    local.oneFourNbThole = numTuplesPerType[Tuples::ONEFOURNBTHOLE];

    updateMaxTupleCounts(local);
    bondedKernel.setTupleCounts(local);
  }

  if (masterPe == CkMyPe()) {
    migrationKernel.reallocateBufferSrc(newMax);
    migrationKernel.reallocateBufferDst(newMax);
    bondedKernel.reallocateTupleBuffer(newMax, stream);
    registerPointersToHost();
  }

  if (masterPe == CkMyPe()) {
    copyHostRegisterToDevice();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
      this->tupleCopyWorkerType(tupletype);
    }
    sortAndCopyToDevice();
  }

  migrateTuples(startup);

  if (masterPe == CkMyPe()) {
    TupleCounts count = bondedKernel.getTupleCounts();

    numTuplesPerType[Tuples::THOLE] = count.thole;
    numTuplesPerType[Tuples::ANISO] = count.aniso;

    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);

    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchRecords() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  /* ... (device index 'devInd' obtained here; elided in this listing) ... */
  PatchRecord **d_pr  = &(patchData->devData[devInd].bond_pr);
  int         **d_pid = &(patchData->devData[devInd].bond_pi);
  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
  size_t st_d_pr_size  = (size_t)(patchData->devData[devInd].bond_pr_size);
  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size());
  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size);
  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size);
  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
  }
  /* ... */
}
void ComputeBondedCUDA::launchWork() {
  /* ... */
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
  /* ... */
#ifdef NODEGROUP_FORCE_REGISTER
  /* ... */
  updatePatchRecords();
  /* ... */
#endif
  /* ... */
  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
  /* ... */
  // Debug printout of device atom positions
  {
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();
    /* ... */
    float4 *d_atoms = bondedKernel.getAtomBuffer();
    copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);
    /* ... */
    fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);
    /* ... */
    for (int i = 0; i < atomStorageSize; i++) {
      fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
        i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);
    }
  }
  /* ... */
  bondedKernel.bondedForce(
    /* ... */
    doEnergy, doVirial, doSlow, doTable,
    /* ... */
    (const float4*)atoms, forces,
    /* ... remaining arguments elided in this listing ... */);
  /* ... */
#ifdef NODEGROUP_FORCE_REGISTER
  {
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();
    /* ... */
    patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
    /* ... */
    patchData->devData[devInd].f_bond       = bondedKernel.getForces();
    patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
    patchData->devData[devInd].f_bond_slow  = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
    patchData->devData[devInd].f_bond_size  = bondedKernel.getForceSize(atomStorageSize);
  }
#endif
  /* ... */
  // Debug printout of device forces
  {
    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    /* ... */
    copy_DtoH_sync(bondedKernel.getForces(), forces, forceStorageSize);
    fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
    for (int i = 0; i < forceStorageSize; i++) {
      /* ... */
      fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);
    }
  }
  /* ... */
  forceDoneSetCallback();
}
void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
  /* ... */
  if (CkMyPe() != c->masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
  /* ... */
  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {
    // Event has occurred: forces are done
    /* ... */
  } else if (err != cudaErrorNotReady) {
    // An actual CUDA error
    /* ... */
    sprintf(errmsg, "in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
    /* ... */
  }
  /* ... */
  // Event has not occurred yet: give up after too many polls, otherwise re-schedule
  if (c->checkCount >= 1000000) {
    /* ... */
    sprintf(errmsg, "ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
    /* ... */
  }
  /* ... */
  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
}

void ComputeBondedCUDA::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
  /* ... */
  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
  /* ... */
  beforeForceCompute = CkWallTimer();
}
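// ---------------------------------------------------------------------------
// Editor's note: an illustrative, self-contained sketch (not part of the NAMD
// source) of the record-and-poll pattern used by forceDoneSetCallback() and
// forceDoneCheck() above: an event is recorded on the stream after the force
// kernels, and completion is detected with the non-blocking cudaEventQuery()
// rather than a blocking synchronize, so the PE stays responsive.  Requires
// the CUDA runtime; error handling is reduced to one check, and a real
// scheduler would yield between polls (NAMD re-schedules via CcdCallFnAfter).
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaEvent_t done;
  cudaStreamCreate(&stream);
  cudaEventCreate(&done);

  // ... enqueue asynchronous work on 'stream' here ...
  cudaEventRecord(done, stream);

  int polls = 0;
  cudaError_t err;
  while ((err = cudaEventQuery(done)) == cudaErrorNotReady) {
    ++polls;  // busy-poll only for illustration
  }
  if (err != cudaSuccess) {
    std::fprintf(stderr, "event query failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  std::printf("event complete after %d polls\n", polls);
  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  return 0;
}
// ---------------------------------------------------------------------------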
template <bool sumNbond, bool sumSlow>
void finishForceLoop(const int numAtoms, const int forceStride,
  const double* __restrict__ af,
  const double* __restrict__ af_nbond,
  const double* __restrict__ af_slow,
  Force* __restrict__ f,
  Force* __restrict__ f_nbond,
  Force* __restrict__ f_slow) {
  // NOTE: several build-time variants of this accumulation appear below; the
  // preprocessor conditionals selecting between them are elided in this listing.
  /* ... */
  // Fused variant: one pass over the atoms accumulates the strided
  // (x | y | z) blocks of each requested force array
  for (int j=0; j < numAtoms; j++) {
    /* ... */
    f[j].y += af[j + forceStride];
    f[j].z += af[j + forceStride*2];
    /* ... */
    if (sumNbond) {
      f_nbond[j].x += af_nbond[j];
      f_nbond[j].y += af_nbond[j + forceStride];
      f_nbond[j].z += af_nbond[j + forceStride*2];
    }
    /* ... */
    if (sumSlow) {
      f_slow[j].x += af_slow[j];
      f_slow[j].y += af_slow[j + forceStride];
      f_slow[j].z += af_slow[j + forceStride*2];
    }
    /* ... */
  }
  /* ... */
  // Variant with a separate pass per force array
  for (int j=0; j < numAtoms; j++) {
    /* ... */
    f[j].y += af[j + forceStride];
    f[j].z += af[j + 2*forceStride];
  }
  if (sumNbond) {
    for (int j=0; j < numAtoms; j++) {
      f_nbond[j].x += af_nbond[j];
      f_nbond[j].y += af_nbond[j + forceStride];
      f_nbond[j].z += af_nbond[j + 2*forceStride];
    }
  }
  if (sumSlow) {
    for (int j=0; j < numAtoms; j++) {
      f_slow[j].x += af_slow[j];
      f_slow[j].y += af_slow[j + forceStride];
      f_slow[j].z += af_slow[j + 2*forceStride];
    }
  }
  /* ... */
  // Variant with one pass per force component
  for (int j=0; j < numAtoms; j++) f[j].x += af[j];
  for (int j=0; j < numAtoms; j++) f[j].y += af[j + forceStride];
  for (int j=0; j < numAtoms; j++) f[j].z += af[j + 2*forceStride];
  /* ... */
#ifdef DEBUG_MINIMIZE
  /* ... */
  printf("%s, line %d\n", __FILE__, __LINE__);
  printf("  before: f_nbond[%d] = %f %f %f\n",
    k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
#endif
  for (int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
  for (int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
  for (int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
#ifdef DEBUG_MINIMIZE
  printf("  after: f_nbond[%d] = %f %f %f\n",
    k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
#endif
  /* ... */
  for (int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
  for (int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
  for (int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];
  /* ... */
  // Debug printout of the three force arrays
  for (int j = 0; j < numAtoms; j++) {
    fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
      af[j], af[j + forceStride], af[j + 2*forceStride],
      af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
      af_slow[j], af_slow[j + forceStride], af_slow[j + 2*forceStride]);
  }
  /* ... */
}
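// ---------------------------------------------------------------------------
// Editor's note: an illustrative, self-contained sketch (not part of the NAMD
// source) of the strided layout consumed by finishForceLoop() above.  The
// device force buffer stores all x components first, then y at offset
// 'stride', then z at 2*stride, and the host accumulates them into an
// array-of-structures force type.  Vec3 is a hypothetical stand-in for NAMD's
// Force type; the template's sumNbond/sumSlow flags merely select at compile
// time which of the normal, nonbonded-correction and slow buffers receive
// this same treatment.
#include <cstdio>
#include <vector>

struct Vec3 { double x, y, z; };

static void accumulateStrided(int numAtoms, int stride, const double* af, Vec3* f) {
  for (int j = 0; j < numAtoms; j++) {
    f[j].x += af[j];                // x block
    f[j].y += af[j + stride];       // y block
    f[j].z += af[j + 2 * stride];   // z block
  }
}

int main() {
  const int numAtoms = 4, stride = 4;
  std::vector<double> af(3 * stride, 1.0);             // x | y | z blocks
  std::vector<Vec3> f(numAtoms, Vec3{0.0, 0.0, 0.0});
  accumulateStrided(numAtoms, stride, af.data(), f.data());
  std::printf("f[0] = %g %g %g\n", f[0].x, f[0].y, f[0].z);  // prints 1 1 1
  return 0;
}
// ---------------------------------------------------------------------------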
void ComputeBondedCUDA::finishPatchesOnPe() {
  /* ... */
  int myRank = CkMyRank();
  /* ... */
  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
  const int forceSize   = bondedKernel.getForceSize(atomStorageSize);
  const bool sumNbond = hasModifiedExclusions;
  const bool sumSlow  = (hasModifiedExclusions || hasExclusions) && doSlow;
  /* ... */
  for (int i=0; i < patchIDsPerRank[myRank].size(); i++) {
    PatchID patchID = patchIDsPerRank[myRank][i];
    /* ... */
      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
    /* ... */
    int pi = patchIndex[patchID];
    int numAtoms  = patches[pi].numAtoms;
    int atomStart = patches[pi].atomStart;
    /* ... */
#ifdef NODEGROUP_FORCE_REGISTER
    {
      double *af       = forces + atomStart;
      double *af_nbond = forces + forceSize + atomStart;
      double *af_slow  = forces + 2*forceSize + atomStart;
      /* ... */
      if (!sumNbond && !sumSlow) {
        finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (sumNbond && !sumSlow) {
        finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (!sumNbond && sumSlow) {
        finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (sumNbond && sumSlow) {
        finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else {
        NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
      }
      /* ... */
    }
#else
    {
      double *af       = forces + atomStart;
      double *af_nbond = forces + forceSize + atomStart;
      double *af_slow  = forces + 2*forceSize + atomStart;
      /* ... */
      if (!sumNbond && !sumSlow) {
        finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (sumNbond && !sumSlow) {
        finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (!sumNbond && sumSlow) {
        finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else if (sumNbond && sumSlow) {
        finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
      } else {
        NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
      }
    }
#endif
    /* ... */
  }
  /* ... */
  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
  /* ... */
  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
  /* ... */
  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
  /* ... */
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();
    /* ... */
    if (!atomsChanged) this->finishReductions();
    /* ... */
  }
  /* ... */
  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
}

void ComputeBondedCUDA::finishPatches() {
  /* ... body elided in this listing ... */
}
void ComputeBondedCUDA::finishReductions() {
  /* ... */
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
  /* ... */
  cudaCheck(cudaStreamSynchronize(stream));
  /* ... */
  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
    if (numTuplesPerType[tupletype] > 0) {
      /* ... */
      switch (tupletype) {
        // Each bonded tuple type submits its energy terms here; when
        // alchemical FEP or TI is active the corresponding extra terms are
        // submitted as well.  The per-type case bodies all follow the same
        // pattern and are largely elided in this listing:
        //   if (hostAlchFlags.alchFepOn)      { /* submit FEP energy */ }
        //   if (hostAlchFlags.alchThermIntOn) { /* submit TI energies */ }
        /* ... cases for BOND, ANGLE, DIHEDRAL, IMPROPER, EXCLUSION, CROSSTERM ... */
        case Tuples::THOLE: {
          /* ... */
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          /* ... */
        } break;
        case Tuples::ANISO: {
          /* ... */
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          /* ... */
        } break;
        case Tuples::ONEFOURNBTHOLE: {
          /* ... */
          if (hostAlchFlags.alchFepOn) { /* ... */ }
          if (hostAlchFlags.alchThermIntOn) { /* ... */ }
          /* ... */
        } break;
        default:
          NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
      }
      /* ... */
      auto it = tupleList[tupletype].begin();
      /* ... */
      (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
      /* ... */
    }
  }
  /* ... */
#ifndef WRITE_FULL_VIRIALS
#error "non-WRITE_FULL_VIRIALS not implemented"
#endif
  /* ... */
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND,    energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW,     energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
  /* ... */
  ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
  /* ... */
}
void ComputeBondedCUDA::initialize() {
#ifdef NODEGROUP_FORCE_REGISTER
  tupleWorkIndex.store(0);
#endif
  /* ... */
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
  /* ... */
  // Collect the PEs on this node that contribute self or home computes
  for (int rank=0; rank < computes.size(); rank++) {
    if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
      pes.push_back(CkNodeFirst(CkMyNode()) + rank);
    }
  }
  /* ... */
  if (pes.size() == 0) return;
  /* ... */
  initializeCalled = false;
  /* ... */
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
#else
  /* ... */
#endif
  cudaCheck(cudaEventCreate(&forceDoneEvent));
  lock = CmiCreateLock();
  printLock = CmiCreateLock();
  if (Node::Object()->simParameters->CUDASOAintegrateMode) {
    /* ... */
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();
    /* ... */
  }
  /* ... */
  // Patch IDs from self computes
  for (int rank=0; rank < computes.size(); rank++) {
    std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
    for (auto it=selfComputes.begin(); it != selfComputes.end(); it++) {
      for (auto jt=it->patchIDs.begin(); jt != it->patchIDs.end(); jt++) {
        /* ... */
        patchIDsPerRank[rank].push_back(*jt);
        allPatchIDs.push_back(*jt);
        /* ... */
      }
    }
  }
  /* ... */
  // Patch IDs from home computes
  for (int rank=0; rank < computes.size(); rank++) {
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    for (int i=0; i < patchIDs.size(); i++) {
      int patchID = patchIDs[i];
      /* ... */
      patchIDsPerRank[rank].push_back(patchID);
      allPatchIDs.push_back(patchID);
      /* ... */
    }
  }
  /* ... */
  std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
  /* ... */
  // Neighboring patch IDs of the home patches
  std::vector<int> neighborPids;
  for (int rank=0; rank < computes.size(); rank++) {
    /* ... */
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    for (int i=0; i < patchIDs.size(); i++) {
      int patchID = patchIDs[i];
      /* ... (neighbors[] and numNeighbors obtained from the patch map; elided) ... */
      for (int j=0; j < numNeighbors; j++) {
        /* ... */
        neighborPids.push_back(neighbors[j]);
        /* ... */
      }
    }
  }
  /* ... */
  // Make the neighbor list unique
  std::sort(neighborPids.begin(), neighborPids.end());
  auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
  neighborPids.resize(std::distance(neighborPids.begin(), it_end));
  /* ... */
  // Assign each neighbor patch to the rank of its home PE on this node
  for (int i=0; i < neighborPids.size(); i++) {
    for (int rank=0; rank < computes.size(); rank++) {
      int pid = neighborPids[i];
      int pe = rank + CkNodeFirst(CkMyNode());
      if (patchMap->node(pid) == pe) {
        /* ... */
        patchIDsPerRank[rank].push_back(pid);
        allPatchIDs.push_back(pid);
        /* ... */
        patchIDsToAppend[rank].push_back(pid);
        /* ... */
        pes.push_back(CkNodeFirst(CkMyNode()) + rank);
        /* ... */
      }
    }
  }
  /* ... */
  // Make the PE list unique
  {
    std::sort(pes.begin(), pes.end());
    auto it_end = std::unique(pes.begin(), pes.end());
    pes.resize(std::distance(pes.begin(), it_end));
  }
  /* ... */
  for (int rank=0; rank < computes.size(); rank++) {
    /* ... */
    HomeCompute& homeCompute = computes[rank].homeCompute;
    std::vector<int>& patchIDs = homeCompute.patchIDs;
    std::vector<int> neighborPatchIDs;
    for (int i=0; i < patchIDs.size(); i++) {
      int patchID = patchIDs[i];
      /* ... (neighbors[] and numNeighbors obtained from the patch map; elided) ... */
      for (int j=0; j < numNeighbors; j++) {
        /* ... */
        patchIDsPerRank[rank].push_back(neighbors[j]);
        allPatchIDs.push_back(neighbors[j]);
        /* ... */
        if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
          && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
          neighborPatchIDs.push_back(neighbors[j]);
        }
        /* ... */
      }
    }
    /* ... */
    for (int i=0; i < neighborPatchIDs.size(); i++) {
      patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
    }
  }
  /* ... */
  for (int rank=0; rank < patchIDsToAppend.size(); rank++) {
    for (int i=0; i < patchIDsToAppend[rank].size(); i++) {
      computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
    }
  }
  /* ... */
  // Make the list of all patch IDs unique
  {
    std::sort(allPatchIDs.begin(), allPatchIDs.end());
    auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
    allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
  }
  /* ... */
  setNumPatches(allPatchIDs.size());
  /* ... */
  patchesCounter = getNumPatches();
  /* ... */
  patches.resize(getNumPatches());
  /* ... */
  // Gather all tuple lists, indexed by tuple type
  for (int rank=0; rank < computes.size(); rank++) {
    std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
    for (auto it=selfComputes.begin(); it != selfComputes.end(); it++) {
      tupleList[it->tuples->getType()].push_back(it->tuples);
    }
    /* ... */
    HomeCompute& homeCompute = computes[rank].homeCompute;
    for (int i=0; i < homeCompute.tuples.size(); i++) {
      tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
    }
  }
  /* ... */
  // Sanity check of the patch mapping
  std::vector<char> patchIDset(patchMap->numPatches(), 0);
  int numPatchIDset = 0;
  int numPatchIDs = 0;
  for (int rank=0; rank < computes.size(); rank++) {
    numPatchIDs += patchIDsPerRank[rank].size();
    for (int i=0; i < patchIDsPerRank[rank].size(); i++) {
      PatchID patchID = patchIDsPerRank[rank][i];
      if (patchIDset[patchID] == 0) numPatchIDset++;
      patchIDset[patchID] = 1;
      if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
        NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
      }
    }
  }
  if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
    NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
  }
  /* ... */
  // Set up the atom mappers and the patch index
  atomMappers.resize(getNumPatches());
  for (int i=0; i < getNumPatches(); i++) {
    atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
    patchIndex[allPatchIDs[i]] = i;
  }
  /* ... */
  // Alchemical (FEP/TI) setup
  updateHostCudaAlchFlags();
  updateKernelCudaAlchFlags();
  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchParameters();
    bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
    updateHostCudaAlchLambdas();
    bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
    if (hostAlchFlags.alchDecouple) {
      pswitchTable[1+3*1] = 0;
      pswitchTable[2+3*2] = 0;
    }
  }
  /* ... */
  // Copy the parameter tables to the device, one bonded tuple type at a time
  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
    if (tupleList[tupletype].size() > 0) {
      /* ... */
      switch (tupletype) {
        case Tuples::BOND: {
          /* ... (NumBondParams and bond_array come from the Parameters object; elided) ... */
          std::vector<CudaBondValue> bondValues(NumBondParams);
          for (int i=0; i < NumBondParams; i++) {
            bondValues[i].k  = bond_array[i].k;
            bondValues[i].x0 = bond_array[i].x0;
            bondValues[i].x1 = bond_array[i].x1;
          }
          /* ... */
          bondedKernel.setupBondValues(NumBondParams, bondValues.data());
        } break;
        case Tuples::ANGLE: {
          /* ... (NumAngleParams and angle_array; elided) ... */
          std::vector<CudaAngleValue> angleValues(NumAngleParams);
          bool normal_ub_error = false;
          for (int i=0; i < NumAngleParams; i++) {
            angleValues[i].k = angle_array[i].k;
            if (angle_array[i].normal == 1) {
              angleValues[i].theta0 = angle_array[i].theta0;
            } else {
              // cosAngles store cos(theta0) instead of theta0
              angleValues[i].theta0 = cos(angle_array[i].theta0);
            }
            normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
            angleValues[i].k_ub   = angle_array[i].k_ub;
            angleValues[i].r_ub   = angle_array[i].r_ub;
            angleValues[i].normal = angle_array[i].normal;
          }
          if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
          /* ... */
          bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
        } break;
        case Tuples::DIHEDRAL: {
          /* ... (NumDihedralParams and dihedral_array; elided) ... */
          int NumDihedralParamsMult = 0;
          for (int i=0; i < NumDihedralParams; i++) {
            NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
          }
          std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
          dihedralMultMap.resize(NumDihedralParams);
          /* ... (running index k and per-entry multiplicity; partially elided) ... */
          for (int i=0; i < NumDihedralParams; i++) {
            /* ... */
            dihedralMultMap[i] = k;
            for (int j=0; j < multiplicity; j++) {
              dihedralValues[k].k = dihedral_array[i].values[j].k;
              // Periodicity is shifted left one bit; the low bit flags that another
              // term of the same dihedral follows (see the packing sketch after initialize())
              dihedralValues[k].n = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
              dihedralValues[k].delta = dihedral_array[i].values[j].delta;
              /* ... (k incremented; elided) ... */
            }
          }
          /* ... */
          bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
        } break;
        case Tuples::IMPROPER: {
          /* ... (NumImproperParams and improper_array; elided) ... */
          int NumImproperParamsMult = 0;
          for (int i=0; i < NumImproperParams; i++) {
            NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
          }
          std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
          improperMultMap.resize(NumImproperParams);
          /* ... */
          for (int i=0; i < NumImproperParams; i++) {
            /* ... */
            improperMultMap[i] = k;
            for (int j=0; j < multiplicity; j++) {
              improperValues[k].k = improper_array[i].values[j].k;
              improperValues[k].n = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
              improperValues[k].delta = improper_array[i].values[j].delta;
              /* ... (k incremented; elided) ... */
            }
          }
          /* ... */
          bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
        } break;
        case Tuples::CROSSTERM: {
          /* ... (NumCrosstermParams, crossterm_array, and the per-parameter table/N/D used below; elided) ... */
          std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
          /* ... */
          for (int ipar=0; ipar < NumCrosstermParams; ipar++) {
            for (int i=0; i < N; i++) {
              for (int j=0; j < N; j++) {
                /* ... */
#define INDEX(ncols,i,j)  ((i)*ncols + (j))
                // Inverse of the bicubic interpolation matrix
                const double Ainv[16][16] = {
                  { 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  {-3,  3,  0,  0, -2, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 2, -2,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0, -3,  3,  0,  0, -2, -1,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  2, -2,  0,  0,  1,  1,  0,  0},
                  {-3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0, -3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0},
                  { 9, -9, -9,  9,  6,  3, -6, -3,  6, -6,  3, -3,  4,  2,  2,  1},
                  {-6,  6,  6, -6, -3, -3,  3,  3, -4,  4, -2,  2, -2, -2, -1, -1},
                  { 2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0},
                  {-6,  6,  6, -6, -4, -2,  4,  2, -3,  3, -3,  3, -2, -1, -2, -1},
                  { 4, -4, -4,  4,  2,  2, -2, -2,  2, -2,  2, -2,  1,  1,  1,  1}
                };
                /* ... */
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
                // Grid spacing of the CMAP table (15 degrees)
                const double h = M_PI/12.0;
                /* ... */
                // Corner values and scaled derivatives of this grid cell
                const double x[16] = {
                  table[INDEX(D,i,j)].d00,     table[INDEX(D,i+1,j)].d00,     table[INDEX(D,i,j+1)].d00,     table[INDEX(D,i+1,j+1)].d00,
                  table[INDEX(D,i,j)].d10*h,   table[INDEX(D,i+1,j)].d10*h,   table[INDEX(D,i,j+1)].d10*h,   table[INDEX(D,i+1,j+1)].d10*h,
                  table[INDEX(D,i,j)].d01*h,   table[INDEX(D,i+1,j)].d01*h,   table[INDEX(D,i,j+1)].d01*h,   table[INDEX(D,i+1,j+1)].d01*h,
                  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
                };
                /* ... */
                // Bicubic coefficients a = Ainv * x
                // (see the illustrative sketch after initialize() below)
                float* a = (float *)&crosstermValues[ipar].c[i][j][0];
                for (int k=0; k < 16; k++) {
                  double a_val = 0.0;
                  for (int l=0; l < 16; l++) {
                    a_val += Ainv[k][l]*x[l];
                  }
                  a[k] = (float)a_val;
                }
              }
            }
          }
          /* ... */
          bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
        } break;
        case Tuples::THOLE: {
          /* ... */
        } break;
        case Tuples::ANISO: {
          /* ... */
        } break;
        case Tuples::ONEFOURNBTHOLE: {
          /* ... */
        } break;
        default:
          NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
      }
    }
  }
  /* ... */
#ifdef NODEGROUP_FORCE_REGISTER
  /* ... */
  migrationKernel.setup(nDevices, patchMap->numPatches());
  /* ... */
  allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
  allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());
  /* ... */
  allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
  allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
  for (int i = 0; i < patchMap->numPatches(); i++) {
    /* ... (h_patchMapCenter[i] filled from the patch map center; elided) ... */
  }
  copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
  /* ... */
#endif // NODEGROUP_FORCE_REGISTER
}
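// ---------------------------------------------------------------------------
// Editor's note: an illustrative, self-contained sketch (not part of the NAMD
// source) of the bit packing used for CudaDihedralValue::n and
// improperValues[].n in initialize() above.  The periodicity is shifted left
// by one bit and the low bit flags whether another term of the same
// (multi-term) dihedral or improper follows, so the kernel can walk the terms
// without a separate multiplicity count.  The helper names are hypothetical.
#include <cassert>

inline int packPeriodicity(int n, bool moreTermsFollow) {
  return (n << 1) | (moreTermsFollow ? 1 : 0);
}
inline int unpackPeriodicity(int packed) { return packed >> 1; }
inline bool hasNextTerm(int packed) { return (packed & 1) != 0; }

int main() {
  const int packed = packPeriodicity(3, true);   // e.g. j < multiplicity - 1
  assert(unpackPeriodicity(packed) == 3);
  assert(hasNextTerm(packed));
  assert(!hasNextTerm(packPeriodicity(2, false)));
  return 0;
}
// ---------------------------------------------------------------------------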
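// ---------------------------------------------------------------------------
// Editor's note: an illustrative sketch (not part of the NAMD source) of the
// per-cell coefficient solve in the CROSSTERM branch of initialize() above.
// Given the 16-vector x of corner values and scaled derivatives (d00, d10*h,
// d01*h, d11*h*h), the bicubic coefficients are simply a = Ainv * x, a plain
// 16x16 matrix-vector product.  The identity matrix below stands in for the
// real Ainv just to keep the example runnable.
#include <cstdio>

static void bicubicCoefficients(const double Ainv[16][16], const double x[16], float a[16]) {
  for (int k = 0; k < 16; k++) {
    double a_val = 0.0;
    for (int l = 0; l < 16; l++) a_val += Ainv[k][l] * x[l];
    a[k] = static_cast<float>(a_val);
  }
}

int main() {
  double Ainv[16][16] = {};                        // identity stand-in
  for (int k = 0; k < 16; k++) Ainv[k][k] = 1.0;
  double x[16];
  for (int l = 0; l < 16; l++) x[l] = l + 1.0;
  float a[16];
  bicubicCoefficients(Ainv, x, a);
  std::printf("a[0]=%g a[15]=%g\n", a[0], a[15]);  // 1 and 16 with the identity
  return 0;
}
// ---------------------------------------------------------------------------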
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
  /* ... */
  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
  for (int i=0; i < data.size(); i++) {
    patchIndex[data[i].patchID] = i;
  }
  /* ... */
  for (int i=0; i < data.size(); i++) {
    /* ... (PatchRecord p built from data[i]; elided in this listing) ... */
    patches.push_back(p);
  }
  /* ... */
}
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::updateHostCudaAlchFlags() {
  /* ... (the remaining flags are set the same way; only these appear in this listing) ... */
  hostAlchFlags.alchFepOn        = sim_params.alchFepOn;
  /* ... */
  hostAlchFlags.alchWCAOn        = sim_params.alchWCAOn;
  hostAlchFlags.alchLJcorrection = sim_params.LJcorrection;
  /* ... */
}

void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
  bondedKernel.updateCudaAlchFlags(hostAlchFlags);
}

void ComputeBondedCUDA::updateHostCudaAlchParameters() {
  /* ... body elided in this listing ... */
}

void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
  bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
}

void ComputeBondedCUDA::updateHostCudaAlchLambdas() {
  /* ... (sim_params reference and currentLambda/currentLambda2 obtained here; elided) ... */
  hostAlchLambdas.bondLambda1     = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda));
  hostAlchLambdas.bondLambda2     = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda));
  hostAlchLambdas.bondLambda12    = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda2));
  hostAlchLambdas.bondLambda22    = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
  hostAlchLambdas.elecLambdaUp    = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda));
  hostAlchLambdas.elecLambda2Up   = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda2));
  hostAlchLambdas.elecLambdaDown  = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda));
  hostAlchLambdas.elecLambda2Down = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
  hostAlchLambdas.vdwLambdaUp     = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda));
  hostAlchLambdas.vdwLambda2Up    = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda2));
  hostAlchLambdas.vdwLambdaDown   = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
  hostAlchLambdas.vdwLambda2Down  = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
}

void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
  bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
}
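// ---------------------------------------------------------------------------
// Editor's note: an illustrative, self-contained sketch (not part of the NAMD
// source) of how updateHostCudaAlchLambdas() above fills the "up"/"down"
// lambda pairs: each coupling function is evaluated at lambda for one
// alchemical group and at 1 - lambda for the other.  A linear schedule stands
// in for SimParameters::getElecLambda()/getVdwLambda(); the real schedules
// depend on parameters such as alchElecLambdaStart and are generally not linear.
#include <cstdio>

struct AlchLambdas {
  float elecLambdaUp, elecLambdaDown;
  float vdwLambdaUp, vdwLambdaDown;
};

static double linearSchedule(double lambda) { return lambda; }  // stand-in

static AlchLambdas makeLambdas(double currentLambda) {
  AlchLambdas l;
  l.elecLambdaUp   = float(linearSchedule(currentLambda));        // one alchemical group
  l.elecLambdaDown = float(linearSchedule(1.0 - currentLambda));  // the other group
  l.vdwLambdaUp    = float(linearSchedule(currentLambda));
  l.vdwLambdaDown  = float(linearSchedule(1.0 - currentLambda));
  return l;
}

int main() {
  const AlchLambdas l = makeLambdas(0.25);
  std::printf("elec up/down = %g/%g, vdw up/down = %g/%g\n",
              l.elecLambdaUp, l.elecLambdaDown, l.vdwLambdaUp, l.vdwLambdaDown);
  return 0;
}
// ---------------------------------------------------------------------------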
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::registerPointersToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  /* ... (deviceIndex, the staging pointers 'dst', and the tuple counts/offsets 'count'/'offset' obtained here; elided) ... */
  patchData->h_tupleDataStage.bond[deviceIndex]              = dst.bond;
  patchData->h_tupleDataStage.angle[deviceIndex]             = dst.angle;
  patchData->h_tupleDataStage.dihedral[deviceIndex]          = dst.dihedral;
  patchData->h_tupleDataStage.improper[deviceIndex]          = dst.improper;
  patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
  patchData->h_tupleDataStage.exclusion[deviceIndex]         = dst.exclusion;
  patchData->h_tupleDataStage.crossterm[deviceIndex]         = dst.crossterm;
  /* ... */
  patchData->h_tupleCount.bond[deviceIndex]              = count.bond();
  patchData->h_tupleCount.angle[deviceIndex]             = count.angle();
  patchData->h_tupleCount.dihedral[deviceIndex]          = count.dihedral();
  patchData->h_tupleCount.improper[deviceIndex]          = count.improper();
  patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
  patchData->h_tupleCount.exclusion[deviceIndex]         = count.exclusion();
  patchData->h_tupleCount.crossterm[deviceIndex]         = count.crossterm();
  /* ... */
  patchData->h_tupleOffset.bond[deviceIndex]              = offset.bond();
  patchData->h_tupleOffset.angle[deviceIndex]             = offset.angle();
  patchData->h_tupleOffset.dihedral[deviceIndex]          = offset.dihedral();
  patchData->h_tupleOffset.improper[deviceIndex]          = offset.improper();
  patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
  patchData->h_tupleOffset.exclusion[deviceIndex]         = offset.exclusion();
  patchData->h_tupleOffset.crossterm[deviceIndex]         = offset.crossterm();
}

void ComputeBondedCUDA::copyHostRegisterToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  /* ... */
  migrationKernel.copyPeerDataToDevice(
    patchData->h_tupleDataStage,
    patchData->h_tupleCount,
    patchData->h_tupleOffset,
    /* ... remaining arguments elided in this listing ... */);
}

void ComputeBondedCUDA::copyPatchData() {
  /* ... (numPatches defined earlier in the function; elided) ... */
  for (int i = 0; i < numPatches; i++) {
    h_patchRecord[i] = patches[patchIndex[i]];
  }
  /* ... */
  copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
}
#endif // NODEGROUP_FORCE_REGISTER

/* ... (reduction accessor; its signature is elided in this listing) ... */
  return (simParams->CUDASOAintegrate) ? reductionGpuResident : reductionGpuOffload;
/* ... */

#endif // BONDED_CUDA