#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define __thread __declspec(thread)

const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {

const int ComputeBondedCUDA::CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES] = {

  // ...
  Compute(c), deviceID(deviceID), masterPe(CkMyPe()),
  bondedKernel(deviceID, cudaNonbondedTables), computeMgr(computeMgr)

  computes.resize(CkMyNodeSize());
  patchIDsPerRank.resize(CkMyNodeSize());
  numExclPerRank.resize(CkMyNodeSize());
  for (int i=0; i < numExclPerRank.size(); i++) {
    numExclPerRank[i].numModifiedExclusions = 0;
    numExclPerRank[i].numExclusions = 0;
  }
  // ...

  energies_virials = NULL;

  initializeCalled = false;

  accelMDdoDihe = false;
  if (params->accelMDOn) {
    if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe = true;
  }
  // ...

  pswitchTable[0] = 0; pswitchTable[1] = 1; pswitchTable[2] = 2;
  pswitchTable[3] = 1; pswitchTable[4] = 1; pswitchTable[5] = 99;
  pswitchTable[6] = 2; pswitchTable[7] = 99; pswitchTable[8] = 2;

  h_patchRecord = NULL;
  d_patchRecord = NULL;

  h_patchMapCenter = NULL;
  d_patchMapCenter = NULL;
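/*
 * The 3x3 pswitchTable initialized above is consulted later (see copyExclusionData()
 * and the ExclElem copyTupleToStage() overload) as pswitchTable[p1 + 3*p2], where p1
 * and p2 are presumably the per-atom FEP partition flags (0, 1, 2) of the two excluded
 * atoms; the value 99 appears to mark partition pairs that should not occur. A minimal
 * sketch of the lookup under that assumption (illustrative only, not part of this file):
 *
 *   // int p1 = FEP flag of atom i, p2 = FEP flag of atom j, each in {0, 1, 2}
 *   int pswitch = pswitchTable[p1 + 3*p2];   // row-major 3x3 lookup
 *   // pswitch is stored on the exclusion and presumably selects the alchemical path
 */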
ComputeBondedCUDA::~ComputeBondedCUDA() {

  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
  if (tupleData != NULL) deallocate_host<char>(&tupleData);

  if (initializeCalled) {
    // ...
    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
    CmiDestroyLock(printLock);
    if (reductionGpuOffload) {
      delete reductionGpuOffload;
    }
    if (reductionGpuResident) {
      delete reductionGpuResident;
    }
  }

  if (h_patchMapCenter != NULL) deallocate_host<double3>(&h_patchMapCenter);
  if (d_patchMapCenter != NULL) deallocate_device<double3>(&d_patchMapCenter);

  if (h_patchRecord != NULL) deallocate_host<PatchRecord>(&h_patchRecord);
  if (d_patchRecord != NULL) deallocate_device<PatchRecord>(&d_patchRecord);
}
void ComputeBondedCUDA::unregisterBoxesOnPe() {
  for (int i=0; i < patchIDsPerRank[CkMyRank()].size(); i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
    // ...
    if (tpe == NULL || tpe->p == NULL) {
      NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
    }
    // ...
  }
}
void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");

  int rank = CkRankOf(pe);

  HomeCompute& homeCompute = computes[rank].homeCompute;
  if (homeCompute.patchIDs.size() == 0) {
    homeCompute.patchIDs.resize(pids.size());
    for (int i=0; i < pids.size(); i++) {
      homeCompute.patchIDs[i] = pids[i];
      homeCompute.isBasePatch[pids[i]] = 1;
    }
  }
  // ...
  if (homeCompute.patchIDs.size() != pids.size()) {
    NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
  }
  for (int i=0; i < pids.size(); i++) {
    if (homeCompute.patchIDs[i] != pids[i]) {
      NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
    }
  }
  // ...

  switch (type) {
    // ...
      homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE));
    // ...
      homeCompute.tuples.push_back(new HomeTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO));
    // ...
      NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
    // ...
  }
}
void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");

  int rank = CkRankOf(pe);

  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
  if (it == selfComputes.end()) {
    // ...
    selfComputes.push_back(SelfCompute(type));
    it = selfComputes.begin() + (selfComputes.size() - 1);

    switch (type) {
      // ...
        it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
      // ...
        it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
      // ...
        it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
      // ...
        it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
      // ...
        it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
      // ...
        it->tuples = new SelfTuples<TholeElem, Thole, TholeValue>(Tuples::THOLE);
      // ...
        it->tuples = new SelfTuples<AnisoElem, Aniso, AnisoValue>(Tuples::ANISO);
      // ...
        NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
      // ...
    }
  }
  // ...

  it->patchIDs.push_back(pid);
}
void ComputeBondedCUDA::assignPatchesOnPe() {

  for (int i=0; i < patchIDsPerRank[CkMyRank()].size(); i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
    // ...
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
    if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
    // ...
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
    if (tpe->p != NULL) {
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
    }
    // ...
  }
}
void ComputeBondedCUDA::atomUpdate() {
  atomsChangedIn = true;
}

int ComputeBondedCUDA::noWork() {
  // ...
}

void ComputeBondedCUDA::messageEnqueueWork() {
  if (masterPe != CkMyPe())
    NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
  // ...
}

void ComputeBondedCUDA::doWork() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");

  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;

  if (getNumPatches() == 0) {
    // ...
  }

  // ...
    NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");

  lattice = flags->lattice;
  doEnergy = flags->doEnergy;
  doVirial = flags->doVirial;
  doSlow = flags->doFullElectrostatics;
  doMolly = flags->doMolly;

  if (hostAlchFlags.alchOn) {
    updateHostCudaAlchLambdas();
    updateKernelCudaAlchLambdas();
    // ...
    if (alchOutFreq > 0 && (step % alchOutFreq == 0)) {
      // ...
    }
  }

  if (params->CUDASOAintegrate) {
    if (!atomsChanged) this->openBoxesOnPe();
    // ...
  }
}
void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
  // ...
#ifdef NODEGROUP_FORCE_REGISTER
  patches[patchIndex[pid]].patchID = pid;
  // ...
  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
    // ...
  }
}
void ComputeBondedCUDA::updatePatches() {

  for (int i=0; i < patches.size(); i++) {
    patches[i].atomStart = atomStart;
    atomStart += patches[i].numAtoms;
  }
  atomStorageSize = atomStart;

  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

#ifdef NODEGROUP_FORCE_REGISTER
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  std::vector<CudaLocalRecord>& localPatches =
    patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy =
    patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  for (int i=0; i < numPatchesHomeAndProxy; i++) {
    patches[i].numAtoms = localPatches[i].numAtoms;
    patches[i].atomStart = localPatches[i].bufferOffset;
    atomStart += patches[i].numAtoms;
  }

  atomStorageSize = atomStart;
  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    bondedKernel.updateAtomBuffer(atomStorageSize, stream);
    updatePatchRecords();
  }
#endif // NODEGROUP_FORCE_REGISTER
}

void ComputeBondedCUDA::mapAtoms() {
  for (int i=0; i < getNumPatches(); i++) {
    // ...
  }
}

void ComputeBondedCUDA::unmapAtoms() {
  for (int i=0; i < getNumPatches(); i++) {
    // ...
  }
}
void ComputeBondedCUDA::openBoxesOnPe(int startup) {

  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];

  fprintf(stderr, "PE[%d] calling ComputeBondedCUDA::openBoxesOnPe(%p)\n", CkMyPe(), this);

#ifdef NODEGROUP_FORCE_REGISTER
  if (Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
    for (auto it=patchIDs.begin(); it != patchIDs.end(); it++) {
      // ...
    }
  }
  // ...

  for (auto it=patchIDs.begin(); it != patchIDs.end(); it++) {
    // ...
    if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
      int pi = patchIndex[patchID];
      int atomStart = patches[pi].atomStart;
      int numAtoms = patches[pi].numAtoms;
      // ...
      for (int i=0; i < numAtoms; i++) {
        // ...
        atoms[atomStart + j] = src[i];
      }
    }
    // ...
  }

  patchesCounter -= patchIDs.size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();
    // ...
    if (!params->CUDASOAintegrate || !params->useDeviceMigration || startup) {
      // ...
      if (!params->CUDASOAintegrate) computeMgr->sendLoadTuplesOnPe(pes, this);
      // ...
    }
    if (params->CUDASOAintegrate) {
      if (!atomsChanged) this->launchWork();
      // ...
    }
  }

#ifdef NODEGROUP_FORCE_REGISTER
  // ...
  fprintf(stderr, "PE[%d] (%p) tuplePatchList printout\n", CkMyPe(), this);
  for (int i = 0; i < tuplePatchList.size(); i++) {
    // ...
    if (tpe == NULL) break;
    fprintf(stderr, "PE[%d] (%p) %d PID[%d] atomExt = %p\n", CkMyPe(), this, i, tpe->p->getPatchID(), tpe->xExt);
  }
  CmiUnlock(printLock);
  // ...
}
void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
  numModifiedExclusions = 0;
  int ntuples = tuples->getNumTuples();
  // ...
  for (int ituple=0; ituple < ntuples; ituple++) {
    if (src[ituple].modified) numModifiedExclusions++;
  }
  numExclusions = ntuples - numModifiedExclusions;
}
void ComputeBondedCUDA::loadTuplesOnPe(const int startup) {

  int numModifiedExclusions = 0;
  int numExclusions = 0;
  if (startup || (!params->CUDASOAintegrate || !params->useDeviceMigration)) {
    std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
    // ...
    for (auto it=selfComputes.begin(); it != selfComputes.end(); it++) {
      // ...
      it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
      // ...
      countNumExclusions(it->tuples, tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;
      // ...
    }

    HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
    for (int i=0; i < homeCompute.tuples.size(); i++) {
      homeCompute.tuples[i]->loadTuples(tuplePatchList,
        homeCompute.isBasePatch.data(), &atomMap,
        homeCompute.patchIDs);
      // ...
      countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;
      // ...
    }
  }
  // ...
    numModifiedExclusions = modifiedExclusionTupleData.size();
    numExclusions = exclusionTupleData.size();
  // ...

  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
  numExclPerRank[CkMyRank()].numExclusions = numExclusions;

  if (!params->CUDASOAintegrate || !params->useDeviceMigration) {
    // ...
    patchesCounter -= patchIDsPerRank[CkMyRank()].size();
    if (patchesCounter == 0) {
      patchesCounter = getNumPatches();
      // ...
      if (!params->CUDASOAintegrate) computeMgr->sendLaunchWork(masterPe, this);
      // ...
    }
  }
}
void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
  /* ... */) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);
    // ...
    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    // ...
    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
    }
    dst[ituple] = dstval;
  }
}
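/*
 * Note on the offset computation above: each CUDA tuple stores a shift vector in
 * scaled (fractional) lattice coordinates. lattice.wrap_delta_scaled(x1, x2) gives
 * the minimum-image correction for the atom pair, and when the two atoms live in
 * different patches the difference of the patch centers (kept in scaled coordinates
 * by PatchMap) is added so both positions are expressed relative to a common origin.
 * A minimal sketch of that idea (hypothetical helper, not part of this file; assumes
 * the Lattice/PatchMap interfaces used above):
 *
 *   Vector tupleShift(const Lattice& lattice, const PatchMap* patchMap,
 *                     Position xa, Position xb, PatchID pa, PatchID pb) {
 *     Vector shift = lattice.wrap_delta_scaled(xa, xb);   // minimum-image part
 *     if (pa != pb) shift += patchMap->center(pa) - patchMap->center(pb);
 *     return shift;                                       // scaled coordinates
 *   }
 */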
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const BondElem& src, /* ... */) {
  // ...
  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  // ...
  if (hostAlchFlags.alchOn) {
    // ...
  }
}
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
  /* ... */) {

  float3 b1f, b2f, b3f;
  b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);
    // ...
    Vector shiftVec = lattice.wrap_delta_scaled_fast(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    // ...
    float3 position1 = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2 = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 diff = position1 - position2;
    float d1 = -floorf(b1f.x * diff.x + b1f.y * diff.y + b1f.z * diff.z + 0.5f);
    float d2 = -floorf(b2f.x * diff.x + b2f.y * diff.y + b2f.z * diff.z + 0.5f);
    float d3 = -floorf(b3f.x * diff.x + b3f.y * diff.y + b3f.z * diff.z + 0.5f);
    // ...
    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
    }
    dst[ituple] = dstval;
  }
}
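/*
 * The fp32 path above recomputes the shift directly from the reciprocal lattice
 * rows (a_r, b_r, c_r): for a Cartesian displacement diff, the scaled shift along
 * each reciprocal vector b_k is -floorf(b_k . diff + 0.5f), i.e. minus the integer
 * nearest to the fractional displacement (halves rounding up). A minimal sketch of
 * that rounding trick (illustrative only, plain C types assumed):
 *
 *   #include <math.h>
 *   // nearest-integer image shift of a 1-D fractional displacement
 *   static inline float imageShift(float frac) { return -floorf(frac + 0.5f); }
 *
 *   // frac = dot(reciprocal lattice row, Cartesian displacement), e.g.
 *   // float d1 = imageShift(b1f.x*diff.x + b1f.y*diff.y + b1f.z*diff.z);
 */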
void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
  /* ... */) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.itype = (src[ituple].value - angle_array);
    // ...
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi2 != pi1) shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
    // ...
    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
    }
    dst[ituple] = dstval;
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AngleElem& src, /* ... */) {
  // ...
  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  // ...
  if (hostAlchFlags.alchOn) {
    // ...
  }
}
#endif // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;
    if (doDihedral) {
      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
    } else {
      dstval.itype = improperMultMap[(src[ituple].value - p_array)];
    }
    // ...
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
    // ...
    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
    }
    dst[ituple] = dstval;
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const DihedralElem& src, /* ... */) {
  // ...
  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];
  // ...
  dstval.itype = dihedralMultMap[(src.value - p_array)];
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  // ...
  if (hostAlchFlags.alchOn) {
    // ...
  }
}

void ComputeBondedCUDA::copyTupleToStage(const ImproperElem& src, /* ... */) {
  // ...
  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];
  // ...
  dstval.itype = improperMultMap[(src.value - p_array)];
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  // ...
  if (hostAlchFlags.alchOn) {
    // ...
  }
}
template <typename T, typename P, typename D>
void ComputeBondedCUDA::copyToStage(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, std::vector<D>& dst) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    copyTupleToStage<T, P, D>(src[ituple], p_array, dstval);
    dst.push_back(dstval);
  }
}
#endif // NODEGROUP_FORCE_REGISTER

template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  float3 b1f = make_float3(lattice.a_r().x, lattice.a_r().y, lattice.a_r().z);
  float3 b2f = make_float3(lattice.b_r().x, lattice.b_r().y, lattice.b_r().z);
  float3 b3f = make_float3(lattice.c_r().x, lattice.c_r().y, lattice.c_r().z);

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;
    if (doDihedral) {
      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
    } else {
      dstval.itype = improperMultMap[(src[ituple].value - p_array)];
    }

    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled_fast(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled_fast(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled_fast(position4, position3);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi3 != pi2) shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
    // ...

    float3 position1f = make_float3(p0->x[l0].position.x, p0->x[l0].position.y, p0->x[l0].position.z);
    float3 position2f = make_float3(p1->x[l1].position.x, p1->x[l1].position.y, p1->x[l1].position.z);
    float3 position3f = make_float3(p2->x[l2].position.x, p2->x[l2].position.y, p2->x[l2].position.z);
    float3 position4f = make_float3(p3->x[l3].position.x, p3->x[l3].position.y, p3->x[l3].position.z);

    float3 diff12, diff23, diff43;
    diff12 = position1f - position2f;
    diff23 = position2f - position3f;
    diff43 = position4f - position3f;

    float d12_x = -floorf(b1f.x * diff12.x + b1f.y * diff12.y + b1f.z * diff12.z + 0.5f);
    float d12_y = -floorf(b2f.x * diff12.x + b2f.y * diff12.y + b2f.z * diff12.z + 0.5f);
    float d12_z = -floorf(b3f.x * diff12.x + b3f.y * diff12.y + b3f.z * diff12.z + 0.5f);

    float d23_x = -floorf(b1f.x * diff23.x + b1f.y * diff23.y + b1f.z * diff23.z + 0.5f);
    float d23_y = -floorf(b2f.x * diff23.x + b2f.y * diff23.y + b2f.z * diff23.z + 0.5f);
    float d23_z = -floorf(b3f.x * diff23.x + b3f.y * diff23.y + b3f.z * diff23.z + 0.5f);

    float d43_x = -floorf(b1f.x * diff43.x + b1f.y * diff43.y + b1f.z * diff43.z + 0.5f);
    float d43_y = -floorf(b2f.x * diff43.x + b2f.y * diff43.y + b2f.z * diff43.z + 0.5f);
    float d43_z = -floorf(b3f.x * diff43.x + b3f.y * diff43.y + b3f.z * diff43.z + 0.5f);
    // ...

    dstval.scale = src[ituple].scale;
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
    }
    dst[ituple] = dstval;
  }
}
void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
  /* ... */) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    // ...
    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    if (pi0 != pi1) shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    // ...
    ce.i = l0 + patches[pi0].atomStart;
    ce.j = l1 + patches[pi1].atomStart;
    // ...
    if (hostAlchFlags.alchOn) {
      // ...
      ce.pswitch = pswitchTable[size_t(p1 + 3*p2)];
      // ...
    }
    if (src[ituple].modified) {
      // ...
    }
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const ExclElem& __restrict__ src, /* ... */) {
  // ...
  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int l0 = src.localIndex[0];
  int l1 = src.localIndex[1];
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  // ...
  if (hostAlchFlags.alchOn) {
    // ...
    dstval.pswitch = pswitchTable[size_t(p1 + 3*p2)];
    // ...
  }
}
void ComputeBondedCUDA::copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
  std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    // ...
    copyTupleToStage<ExclElem, int, CudaExclusionStage>(src[ituple], nullptr, ce);
    if (src[ituple].modified) {
      // ...
    }
  }
}
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
  /* ... */) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    auto p4 = src[ituple].p[4];
    auto p5 = src[ituple].p[5];
    auto p6 = src[ituple].p[6];
    auto p7 = src[ituple].p[7];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int pi4 = patchIndex[p4->patchID];
    int pi5 = patchIndex[p5->patchID];
    int pi6 = patchIndex[p6->patchID];
    int pi7 = patchIndex[p7->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    int l4 = src[ituple].localIndex[4];
    int l5 = src[ituple].localIndex[5];
    int l6 = src[ituple].localIndex[6];
    int l7 = src[ituple].localIndex[7];
    dst[ituple].i1 = l0 + patches[pi0].atomStart;
    dst[ituple].i2 = l1 + patches[pi1].atomStart;
    dst[ituple].i3 = l2 + patches[pi2].atomStart;
    dst[ituple].i4 = l3 + patches[pi3].atomStart;
    dst[ituple].i5 = l4 + patches[pi4].atomStart;
    dst[ituple].i6 = l5 + patches[pi5].atomStart;
    dst[ituple].i7 = l6 + patches[pi6].atomStart;
    dst[ituple].i8 = l7 + patches[pi7].atomStart;
    dst[ituple].itype = (src[ituple].value - crossterm_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Position position5 = p4->x[l4].position;
    Position position6 = p5->x[l5].position;
    Position position7 = p6->x[l6].position;
    Position position8 = p7->x[l7].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
    Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
    Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
    Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
    if (pi0 != pi1) shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi1 != pi2) shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi2 != pi3) shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    if (pi4 != pi5) shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
    if (pi5 != pi6) shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
    if (pi6 != pi7) shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
    dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
    dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
    dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
    dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
    dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
      int typeSum1 = 0, typeSum2 = 0;
      for (size_t i = 0; i < 4; ++i) {
        typeSum1 += (mol.get_fep_type(atomID[i]) == 2 ? -1 : mol.get_fep_type(atomID[i]));
        typeSum2 += (mol.get_fep_type(atomID[i+4]) == 2 ? -1 : mol.get_fep_type(atomID[i+4]));
      }
      int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
      if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
        dst[ituple].fepBondedType = 1;
      } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
        dst[ituple].fepBondedType = 2;
      } else {
        dst[ituple].fepBondedType = 0;
      }
    }
    dst[ituple].scale = src[ituple].scale;
  }
}
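/*
 * FEP classification used above: each of the two four-atom halves of the crossterm
 * gets a signed sum of per-atom partition flags (type 1 counts +1, type 2 counts -1,
 * type 0 counts 0). If either half is partially appearing (0 < sum < order) the term
 * is tagged fepBondedType 1, if partially disappearing (-order < sum < 0) it is tagged
 * 2, and everything else stays 0. A minimal sketch of the same rule for one group
 * (hypothetical helper, not part of this file):
 *
 *   int fepBondedType(const int* fepType, int n, int order) {
 *     int sum = 0;
 *     for (int i = 0; i < n; ++i) sum += (fepType[i] == 2 ? -1 : fepType[i]);
 *     if (0 < sum && sum < order) return 1;    // partially appearing group
 *     if (-order < sum && sum < 0) return 2;   // partially disappearing group
 *     return 0;                                // fully coupled or fully decoupled
 *   }
 */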
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const CrosstermElem& src, /* ... */) {
  // ...
  int pi0 = patchIndex[p0->patchID];
  int pi1 = patchIndex[p1->patchID];
  int pi2 = patchIndex[p2->patchID];
  int pi3 = patchIndex[p3->patchID];
  int pi4 = patchIndex[p4->patchID];
  int pi5 = patchIndex[p5->patchID];
  int pi6 = patchIndex[p6->patchID];
  int pi7 = patchIndex[p7->patchID];
  // ...
  dstval.itype = (src.value - crossterm_array);
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  dstval.index[4] = l4;
  dstval.index[5] = l5;
  dstval.index[6] = l6;
  dstval.index[7] = l7;
  // ...
  if (hostAlchFlags.alchOn) {
    // ...
    int typeSum1 = 0, typeSum2 = 0;
    for (size_t i = 0; i < 4; ++i) {
      // ...
    }
    int order = (hostAlchFlags.alchBondDecouple ? 5 : 4);
    if ((0 < typeSum1 && typeSum1 < order) || (0 < typeSum2 && typeSum2 < order)) {
      // ...
    } else if ((0 > typeSum1 && typeSum1 > -order) || (0 > typeSum2 && typeSum2 > -order)) {
      // ...
    }
  }
}
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTholeData(
  const int ntuples, const TholeElem* __restrict__ src,
  /* ... */) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;
    dst[ituple].j = l1 + patches[pi1].atomStart;
    dst[ituple].k = l2 + patches[pi2].atomStart;
    dst[ituple].l = l3 + patches[pi3].atomStart;
    // ...
    const TholeValue * const(&value)(src[ituple].value);
    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
    }
    dst[ituple].aa = value->aa;
    dst[ituple].qq = value->qq;

    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_aiaj = lattice.wrap_delta_scaled(position0, position2);
    Vector shiftVec_aidj = lattice.wrap_delta_scaled(position0, position3);
    Vector shiftVec_diaj = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec_didj = lattice.wrap_delta_scaled(position1, position3);
    if (pi0 != pi2) shiftVec_aiaj += patchMap->center(p0->patchID) - patchMap->center(p2->patchID);
    if (pi0 != pi3) shiftVec_aidj += patchMap->center(p0->patchID) - patchMap->center(p3->patchID);
    if (pi1 != pi2) shiftVec_diaj += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    if (pi1 != pi3) shiftVec_didj += patchMap->center(p1->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_aiaj = make_float3((float)shiftVec_aiaj.x, (float)shiftVec_aiaj.y, (float)shiftVec_aiaj.z);
    dst[ituple].offset_aidj = make_float3((float)shiftVec_aidj.x, (float)shiftVec_aidj.y, (float)shiftVec_aidj.z);
    dst[ituple].offset_diaj = make_float3((float)shiftVec_diaj.x, (float)shiftVec_diaj.y, (float)shiftVec_diaj.z);
    dst[ituple].offset_didj = make_float3((float)shiftVec_didj.x, (float)shiftVec_didj.y, (float)shiftVec_didj.z);
    if (hostAlchFlags.alchOn) {
      // ...
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
      typeSum += (mol->get_fep_type(atomID[0]) == 2 ? -1 : mol->get_fep_type(atomID[0]));
      typeSum += (mol->get_fep_type(atomID[2]) == 2 ? -1 : mol->get_fep_type(atomID[2]));
      const int order = (!params->alchDecouple ? 3 : 2);
      if (typeSum == 0 || abs(typeSum) == order) dst[ituple].fepBondedType = 0;
      else if (0 < typeSum && typeSum < order) dst[ituple].fepBondedType = 1;
      else if (-order < typeSum && typeSum < 0) dst[ituple].fepBondedType = 2;
      else dst[ituple].fepBondedType = 0;
    } else {
      dst[ituple].fepBondedType = 0;
    }
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const TholeElem& src, /* ... */) {
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l1;
  dstval.index[2] = l2;
  dstval.index[3] = l3;
  // ...
  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTholeData.\n");
  }
  dstval.aa = value->aa;
  dstval.qq = value->qq;
  if (hostAlchFlags.alchOn) {
    // ...
    const int order = (!params->alchDecouple ? 3 : 2);
    // ...
  }
}
void ComputeBondedCUDA::copyAnisoData(
  const int ntuples, const AnisoElem* __restrict src,
  /* ... */) {

  for (int ituple=0; ituple < ntuples; ituple++) {
    const auto p0 = src[ituple].p[0];
    const auto p1 = src[ituple].p[1];
    const auto p2 = src[ituple].p[2];
    const auto p3 = src[ituple].p[3];
    const int pi0 = patchIndex[p0->patchID];
    const int pi1 = patchIndex[p1->patchID];
    const int pi2 = patchIndex[p2->patchID];
    const int pi3 = patchIndex[p3->patchID];
    const int l0 = src[ituple].localIndex[0];
    const int l1 = src[ituple].localIndex[1];
    const int l2 = src[ituple].localIndex[2];
    const int l3 = src[ituple].localIndex[3];
    dst[ituple].i = l0 + patches[pi0].atomStart;
    // ...
    dst[ituple].j = l0 + 1 + patches[pi0].atomStart;
    dst[ituple].l = l1 + patches[pi1].atomStart;
    dst[ituple].m = l2 + patches[pi2].atomStart;
    dst[ituple].n = l3 + patches[pi3].atomStart;
    const AnisoValue * const(&value)(src[ituple].value);
    if (value == NULL) {
      NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyAnisoData.\n");
    }
    dst[ituple].kpar0 = 2.0 * value->k11;
    dst[ituple].kperp0 = 2.0 * value->k22;
    dst[ituple].kiso0 = 2.0 * value->k33;
    const Position position0 = p0->x[l0].position;
    const Position position1 = p1->x[l1].position;
    const Position position2 = p2->x[l2].position;
    const Position position3 = p3->x[l3].position;
    Vector shiftVec_il = lattice.wrap_delta_scaled(position0, position1);
    Vector shiftVec_mn = lattice.wrap_delta_scaled(position2, position3);
    if (pi0 != pi1) shiftVec_il += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    if (pi2 != pi3) shiftVec_mn += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    dst[ituple].offset_il = make_float3((float)shiftVec_il.x, (float)shiftVec_il.y, (float)shiftVec_il.z);
    dst[ituple].offset_mn = make_float3((float)shiftVec_mn.x, (float)shiftVec_mn.y, (float)shiftVec_mn.z);
    if (hostAlchFlags.alchOn) {
      const AtomID (&atomID)[sizeof(src[ituple].atomID)/sizeof(AtomID)](src[ituple].atomID);
      // ...
    } else {
      dst[ituple].fepBondedType = 0;
    }
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleToStage(const AnisoElem& src, /* ... */) {
  // ...
  dstval.index[0] = l0;
  dstval.index[1] = l0 + 1;
  dstval.index[2] = l1;
  dstval.index[3] = l2;
  dstval.index[4] = l3;
  // ...
  if (value == NULL) {
    NAMD_bug("Unexpected NULL value in ComputeBondedCUDA::copyTupleToStage.\n");
  }
  dstval.kpar0 = 2.0 * value->k11;
  dstval.kperp0 = 2.0 * value->k22;
  dstval.kiso0 = 2.0 * value->k33;
  if (hostAlchFlags.alchOn) {
    // ...
  }
}
void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorker(first, last);
}

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorkerExcl(first, last);
}

void ComputeBondedCUDA::tupleCopyWorkerExcl(int first, int last) {
  // ...
  int64_t numModExclusionsBefore = 0;
  int64_t numExclusionsBefore = 0;
  // ...
    int ntuples = (*it)->getNumTuples();
    auto thisExcl = (ExclElem *)(*it)->getTupleList();
    for (int ituple=0; ituple < ntuples; ituple++) {
      if (thisExcl[ituple].modified)
        numModExclusionsBefore++;
      else
        numExclusionsBefore++;
    }
  // ...
  int64_t pos = exclusionStartPos + numModExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int64_t pos2 = exclusionStartPos2 + numExclusionsBefore * CudaTupleTypeSize[Tuples::EXCLUSION];
  int numExclusionsWork = last - first;
  auto itEnd = std::next(itStart, numExclusionsWork + 1);
  for (auto it=itStart; it != itEnd; it++) {
    int ntuples = (*it)->getNumTuples();
    // ...
  }
}
void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
  // ...
    int64_t pos = exclusionStartPos;
    int64_t pos2 = exclusionStartPos2;
    // ...
      int ntuples = (*it)->getNumTuples();
      // ...

  for (int i=first; i <= last; i++) {
    switch (tupleCopyWorkList[i].tupletype) {
      // ...
        copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyBondDatafp32(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
          /* ... */
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyDihedralDatafp32<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
          /* ... */
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
          /* ... */
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
          /* ... */);
      // ...
        copyTholeData(tupleCopyWorkList[i].ntuples, (TholeElem*)tupleCopyWorkList[i].tupleElemList,
          nullptr, (CudaThole *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        copyAnisoData(tupleCopyWorkList[i].ntuples, (AnisoElem*)tupleCopyWorkList[i].tupleElemList,
          nullptr, (CudaAniso*)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      // ...
        NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
      // ...
    }
  }
}
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::updateMaxTupleCounts(TupleCounts counts) {
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  int numBondsTest = patchData->maxNumBonds.load();
  while (numBondsTest < counts.bond &&
         !patchData->maxNumBonds.compare_exchange_strong(numBondsTest, counts.bond));

  int numAnglesTest = patchData->maxNumAngles.load();
  while (numAnglesTest < counts.angle &&
         !patchData->maxNumAngles.compare_exchange_strong(numAnglesTest, counts.angle));

  int numDihedralsTest = patchData->maxNumDihedrals.load();
  while (numDihedralsTest < counts.dihedral &&
         !patchData->maxNumDihedrals.compare_exchange_strong(numDihedralsTest, counts.dihedral));

  int numImpropersTest = patchData->maxNumImpropers.load();
  while (numImpropersTest < counts.improper &&
         !patchData->maxNumImpropers.compare_exchange_strong(numImpropersTest, counts.improper));

  int numModifiedExclusionsTest = patchData->maxNumModifiedExclusions.load();
  while (numModifiedExclusionsTest < counts.modifiedExclusion &&
         !patchData->maxNumModifiedExclusions.compare_exchange_strong(numModifiedExclusionsTest, counts.modifiedExclusion));

  int numExclusionsTest = patchData->maxNumExclusions.load();
  while (numExclusionsTest < counts.exclusion &&
         !patchData->maxNumExclusions.compare_exchange_strong(numExclusionsTest, counts.exclusion));

  int numCrosstermsTest = patchData->maxNumCrossterms.load();
  while (numCrosstermsTest < counts.crossterm &&
         !patchData->maxNumCrossterms.compare_exchange_strong(numCrosstermsTest, counts.crossterm));
}
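/*
 * The loops above keep node-wide running maxima in atomic counters without a lock:
 * each loop retries compare_exchange_strong until either the stored maximum is
 * already >= the candidate or the exchange succeeds (on failure the expected value
 * is reloaded automatically). A minimal self-contained sketch of the same pattern
 * (illustrative only, not part of this file):
 *
 *   #include <atomic>
 *   void atomicMax(std::atomic<int>& target, int candidate) {
 *     int current = target.load();
 *     while (current < candidate &&
 *            !target.compare_exchange_strong(current, candidate)) {
 *       // on failure, 'current' now holds the latest stored value; retry
 *     }
 *   }
 */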
TupleCounts ComputeBondedCUDA::getMaxTupleCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  counts.bond = patchData->maxNumBonds.load();
  counts.angle = patchData->maxNumAngles.load();
  counts.dihedral = patchData->maxNumDihedrals.load();
  counts.improper = patchData->maxNumImpropers.load();
  // ...
  counts.exclusion = patchData->maxNumExclusions.load();
  counts.crossterm = patchData->maxNumCrossterms.load();
  // ...
  CkPrintf("[%d] Max: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d, cross %d\n",
    /* ... */);
  // ...
}
void ComputeBondedCUDA::migrateTuples(bool startup) {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  bool amMaster = (masterPe == CkMyPe());
  // ...
    cudaStreamSynchronize(stream);
  // ...
    cudaStreamSynchronize(stream);
  // ...
  const int4* migrationDestination = patchData->h_soa_migrationDestination[devInd];
  // ...
    TupleCounts counts = bondedKernel.getTupleCounts();
    migrationKernel.computeTupleDestination(
      // ...
      patchData->devData[devInd].numPatchesHome,
      migrationDestination,
      patchData->devData[devInd].d_patchToDeviceMap,
      patchData->devData[devInd].d_globalToLocalID,
      /* ... */);
    // ...
    cudaCheck(cudaStreamSynchronize(stream));
  // ...
    migrationKernel.reserveTupleDestination(devInd, patchData->devData[devInd].numPatchesHome, stream);
    cudaCheck(cudaStreamSynchronize(stream));
  // ...

  bool realloc = false;
  // ...
  if (amMaster && !startup) {
    migrationKernel.computePatchOffsets(patchData->devData[devInd].numPatchesHome, stream);
    // ...
    local = migrationKernel.fetchTupleCounts(patchData->devData[devInd].numPatchesHome, stream);
    updateMaxTupleCounts(local);
    // ...
    CkPrintf("[%d] Actual: Bonds %d, angles %d, dihedral %d, improper %d, modexl %d, exl %d cross %d\n",
      /* ... */);
  }
  // ...
  if (amMaster && !startup) {
    // ...
    newMax = getMaxTupleCounts();
    realloc = migrationKernel.reallocateBufferDst(newMax);
    patchData->tupleReallocationFlagPerDevice[devInd] = realloc;
    // ...
  }
  // ...
  realloc = patchData->tupleReallocationFlagPerDevice[0];
  // ...
    registerPointersToHost();
  // ...
    copyHostRegisterToDevice();
    cudaCheck(cudaStreamSynchronize(stream));
  // ...
  if (amMaster && !startup) {
    // ...
    migrationKernel.performTupleMigration(
      bondedKernel.getTupleCounts(),
      /* ... */);
    // ...
    cudaCheck(cudaStreamSynchronize(stream));
  }
  // ...
  if (amMaster && !startup) {
    // ...
      bondedKernel.reallocateTupleBuffer(newMax, stream);
      migrationKernel.reallocateBufferSrc(newMax);
      cudaCheck(cudaStreamSynchronize(stream));
    // ...
    bondedKernel.setTupleCounts(local);
    // ...
    const int* ids = patchData->h_soa_id[devInd];
    migrationKernel.updateTuples(
      bondedKernel.getTupleCounts(),
      bondedKernel.getData(),
      // ...
      bondedKernel.getAtomBuffer(),
      /* ... */);
    // ...
  }
}
template<typename T>
void ComputeBondedCUDA::sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets) {
  // ...
  std::vector<std::pair<int,int>> downstreamPatches;
  for (int i = 0; i < tuples.size(); i++) {
    int downstream = tuples[i].patchIDs[0];
    for (int j = 1; j < T::size; j++) {
      downstream = patchMap->downstream(downstream, tuples[i].patchIDs[j]);
    }
    downstreamPatches.push_back(std::make_pair(i, downstream));
    tupleCounts[patchIndex[downstream]]++;
  }
  // ...
  tupleOffsets[0] = 0;
  for (int i = 0; i < tupleCounts.size(); i++) {
    tupleOffsets[i+1] = tupleCounts[i] + tupleOffsets[i];
  }
  // ...
  std::stable_sort(downstreamPatches.begin(), downstreamPatches.end(),
    [](std::pair<int, int> a, std::pair<int, int> b) {
      return a.second < b.second;
    });
  // ...
  std::vector<T> copy = tuples;
  for (int i = 0; i < tuples.size(); i++) {
    tuples[i] = copy[downstreamPatches[i].first];
  }
}
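/*
 * sortTupleList() above groups tuples by their downstream patch: it records
 * (tuple index, downstream patch) pairs, accumulates per-patch counts, turns the
 * counts into exclusive prefix-sum offsets, stable-sorts the pairs by patch, and
 * finally permutes the tuple array into that order. A minimal sketch of the same
 * bucketing on plain integers (illustrative only; counts must be pre-sized to the
 * number of buckets and offsets to one more than that):
 *
 *   #include <algorithm>
 *   #include <numeric>
 *   #include <vector>
 *   void groupByKey(std::vector<int>& items, const std::vector<int>& key,
 *                   std::vector<int>& counts, std::vector<int>& offsets) {
 *     std::vector<std::pair<int,int>> order;               // (original index, key)
 *     for (int i = 0; i < (int)items.size(); i++) {
 *       order.push_back(std::make_pair(i, key[i]));
 *       counts[key[i]]++;
 *     }
 *     offsets[0] = 0;                                      // exclusive prefix sum
 *     std::partial_sum(counts.begin(), counts.end(), offsets.begin() + 1);
 *     std::stable_sort(order.begin(), order.end(),
 *       [](const std::pair<int,int>& a, const std::pair<int,int>& b) {
 *         return a.second < b.second;
 *       });
 *     std::vector<int> copy = items;                       // permute into sorted order
 *     for (int i = 0; i < (int)items.size(); i++) items[i] = copy[order[i].first];
 *   }
 */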
void ComputeBondedCUDA::sortAndCopyToDevice() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  const int numPatchesHome = patchData->devData[devInd].numPatchesHome;
  // ...
  std::vector<int> bondPatchCounts(numPatchesHome, 0);
  std::vector<int> bondPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> anglePatchCounts(numPatchesHome, 0);
  std::vector<int> anglePatchOffsets(numPatchesHome+1, 0);

  std::vector<int> dihedralPatchCounts(numPatchesHome, 0);
  std::vector<int> dihedralPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> improperPatchCounts(numPatchesHome, 0);
  std::vector<int> improperPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> modifiedExclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> modifiedExclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> exclusionPatchCounts(numPatchesHome, 0);
  std::vector<int> exclusionPatchOffsets(numPatchesHome+1, 0);

  std::vector<int> crosstermPatchCounts(numPatchesHome, 0);
  std::vector<int> crosstermPatchOffsets(numPatchesHome+1, 0);

#define CALL(fieldName) do { \
    sortTupleList(fieldName##TupleData, fieldName##PatchCounts, fieldName##PatchOffsets); \
    h_dataStage.fieldName = fieldName##TupleData.data(); \
    h_counts.fieldName = fieldName##PatchCounts.data(); \
    h_offsets.fieldName = fieldName##PatchOffsets.data(); \
    /* ... */

  CALL(modifiedExclusion);
  // ...

  migrationKernel.copyTupleToDevice(
    bondedKernel.getTupleCounts(),
    /* ... */);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
}
void ComputeBondedCUDA::tupleCopyWorkerType(int tupletype) {
  // ...
  switch (tupletype) {
    // ...
      modifiedExclusionTupleData.clear();
      exclusionTupleData.clear();
      int64_t pos = exclusionStartPos;
      int64_t pos2 = exclusionStartPos2;
      // ...
        int ntuples = (*it)->getNumTuples();
        // ...
          modifiedExclusionTupleData, exclusionTupleData, pos, pos2);
      // ...

    // ...
      bondTupleData.clear();
      // ...
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<BondElem, BondValue, CudaBondStage>(
          /* ... */);
      // ...

    // ...
      angleTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
      }
      // ...

    // ...
      dihedralTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<DihedralElem, DihedralValue, CudaDihedralStage>(ntuples, elemList,
          /* ... */);
      }
      // ...

    // ...
      improperTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<ImproperElem, ImproperValue, CudaDihedralStage>(ntuples, elemList,
          /* ... */);
      }
      // ...

    // ...
      crosstermTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<CrosstermElem, CrosstermValue, CudaCrosstermStage>(ntuples, elemList,
          /* ... */);
      }
      // ...

    // ...
      tholeTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        const int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<TholeElem, TholeValue, CudaTholeStage>(ntuples, elemList, nullptr, tholeTupleData);
      }
      // ...

    // ...
      anisoTupleData.clear();
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        const int ntuples = (*it)->getNumTuples();
        // ...
        copyToStage<AnisoElem, AnisoValue, CudaAnisoStage>(ntuples, elemList, nullptr, anisoTupleData);
      }
      // ...

    // ...
      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
    // ...
  }
}
#endif // NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleData() {
  // ...
  int numModifiedExclusions = 0;
  int numExclusions = 0;
  for (int i=0; i < numExclPerRank.size(); i++) {
    numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
    numExclusions += numExclPerRank[i].numExclusions;
  }
  // ...
  exclusionStartPos = 0;
  exclusionStartPos2 = 0;
  tupleCopyWorkList.clear();
  for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
    // ...
    int64_t pos = posWA;
    // ...
      exclusionStartPos = pos;
      exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
    // ...
    for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
      int ntuples = (*it)->getNumTuples();
      // ...
        TupleCopyWork tupleCopyWork;
        tupleCopyWork.tupletype = tupletype;
        tupleCopyWork.ntuples = ntuples;
        tupleCopyWork.tupleElemList = (*it)->getTupleList();
        tupleCopyWork.tupleDataPos = pos;
        // ...
        tupleCopyWorkList.push_back(tupleCopyWork);
        pos += ntuples*CudaTupleTypeSize[tupletype];
      // ...
    }
    numTuplesPerType[tupletype] = num;
    // ...
      posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
    // ...
  }

  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
    NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
  }
  // ...
  hasExclusions = (numExclusions > 0);
  hasModifiedExclusions = (numModifiedExclusions > 0);
  // ...
  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

#if CMK_SMP && USE_CKLOOP
  // ...
  if (useCkLoop >= 1) {
#define CKLOOP_EXCLUSIONS 1
#if CKLOOP_EXCLUSIONS
    CkLoop_Parallelize(tupleCopyWorkerExcl, 1, (void *)this, CkMyNodeSize(), 0, tupleList[Tuples::EXCLUSION].size()-1);
    // ...
    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), 0, tupleCopyWorkList.size() - 1);
    // ...
    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
    // ...
  }
  // ...
    tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
  // ...

  // ...
    numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO], tupleData, stream);
  // ...
  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
}
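/*
 * Layout note for the copyTupleData*() routines: every tuple type is packed into
 * the single host byte buffer tupleData. Each tuple list of a given type occupies
 * ntuples * CudaTupleTypeSize[type] bytes (CudaTupleTypeSizeStage[] on the staged
 * GPU path) starting at its recorded tupleDataPos, and the exclusion block is
 * further split at exclusionStartPos / exclusionStartPos2 into modified and
 * regular exclusions. A minimal sketch of the same offset bookkeeping
 * (illustrative only, hypothetical names):
 *
 *   int64_t pos = 0;
 *   std::vector<int64_t> offsets;                   // one entry per work item
 *   for (int type = 0; type < numTypes; type++) {
 *     for (int n : listSizes[type]) {               // tuple lists of this type
 *       offsets.push_back(pos);
 *       pos += int64_t(n) * typeSizeBytes[type];    // CudaTupleTypeSize analogue
 *     }
 *   }
 *   // 'pos' is now the total byte count to allocate for the packed buffer
 */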
#ifdef NODEGROUP_FORCE_REGISTER

void ComputeBondedCUDA::copyTupleDataSN() {
  // ...
  size_t numExclusions, numModifiedExclusions, copyIndex;
  // ...
  if (masterPe == CkMyPe()) {
    numModifiedExclusions = 0;
    // ...
    for (int i=0; i < numExclPerRank.size(); i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions += numExclPerRank[i].numExclusions;
    }
    // ...
    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
      // ...
      int64_t pos = posWA;
      // ...
        exclusionStartPos = pos;
        exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
      // ...
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
          TupleCopyWork tupleCopyWork;
          tupleCopyWork.tupletype = tupletype;
          tupleCopyWork.ntuples = ntuples;
          tupleCopyWork.tupleElemList = (*it)->getTupleList();
          tupleCopyWork.tupleDataPos = pos;
          tupleCopyWorkList.push_back(tupleCopyWork);
          pos += ntuples*CudaTupleTypeSize[tupletype];
        // ...
      }
      numTuplesPerType[tupletype] = num;
      // ...
        posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
      // ...
    }

    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
    }
    // ...
    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);
    // ...
    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
    // ...
  }
  // ...
    this->tupleCopyWorker(first, last);
  // ...
  while ((copyIndex = tupleWorkIndex.fetch_add(1)) <= tupleCopyWorkList.size()) {
    this->tupleCopyWorker(copyIndex - 1, copyIndex - 1);
  }
  // ...
  if (masterPe == CkMyPe()) {
    tupleWorkIndex.store(0);
    // ...
      numTuplesPerType[Tuples::THOLE], numTuplesPerType[Tuples::ANISO],
      /* ... */);
    // ...
    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
    // ...
  }
}
void ComputeBondedCUDA::copyTupleDataGPU(const int startup) {
  // ...
  size_t numExclusions, numModifiedExclusions, copyIndex;
  // ...
  if (masterPe == CkMyPe()) {
    numModifiedExclusions = 0;
    // ...
    for (int i=0; i < numExclPerRank.size(); i++) {
      numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
      numExclusions += numExclPerRank[i].numExclusions;
    }
    // ...
    exclusionStartPos = 0;
    exclusionStartPos2 = 0;
    tupleCopyWorkList.clear();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
      // ...
        exclusionStartPos = pos;
        exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSizeStage[Tuples::EXCLUSION];
      // ...
      for (auto it = tupleList[tupletype].begin(); it != tupleList[tupletype].end(); it++) {
        int ntuples = (*it)->getNumTuples();
        // ...
          TupleCopyWork tupleCopyWork;
          tupleCopyWork.tupletype = tupletype;
          tupleCopyWork.ntuples = ntuples;
          tupleCopyWork.tupleElemList = (*it)->getTupleList();
          tupleCopyWork.tupleDataPos = pos;
          tupleCopyWorkList.push_back(tupleCopyWork);
          pos += ntuples*CudaTupleTypeSizeStage[tupletype];
        // ...
      }
      numTuplesPerType[tupletype] = num;
      // ...
        posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSizeStage[tupletype];
      // ...
    }

    if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
      NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
    }
    // ...
    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);
    // ...
    reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
    // ...
    local.modifiedExclusion = numModifiedExclusions;
    local.exclusion = numExclusions;
    // ...
    local.thole = numTuplesPerType[Tuples::THOLE];
    local.aniso = numTuplesPerType[Tuples::ANISO];
    // ...
    updateMaxTupleCounts(local);
    bondedKernel.setTupleCounts(local);
    // ...
  }
  // ...
  if (masterPe == CkMyPe()) {
    // ...
    migrationKernel.reallocateBufferSrc(newMax);
    migrationKernel.reallocateBufferDst(newMax);
    bondedKernel.reallocateTupleBuffer(newMax, stream);
    registerPointersToHost();
    // ...
  }
  // ...
  if (masterPe == CkMyPe()) {
    copyHostRegisterToDevice();
    for (int tupletype=0; tupletype < Tuples::NUM_TUPLE_TYPES; tupletype++) {
      this->tupleCopyWorkerType(tupletype);
    }
    // ...
    sortAndCopyToDevice();
    // ...
  }
  // ...
  migrateTuples(startup);
  // ...
  if (masterPe == CkMyPe()) {
    TupleCounts count = bondedKernel.getTupleCounts();
    // ...
    numTuplesPerType[Tuples::THOLE] = count.thole;
    numTuplesPerType[Tuples::ANISO] = count.aniso;
    // ...
    hasExclusions = (numExclusions > 0);
    hasModifiedExclusions = (numModifiedExclusions > 0);
    // ...
    int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
    reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
    // ...
  }
}
#ifdef NODEGROUP_FORCE_REGISTER
void ComputeBondedCUDA::updatePatchRecords() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  // ...
  PatchRecord **d_pr = &(patchData->devData[devInd].bond_pr);
  int **d_pid = &(patchData->devData[devInd].bond_pi);
  size_t st_d_pid_size = (size_t)(patchData->devData[devInd].bond_pi_size);
  size_t st_d_pr_size = (size_t)(patchData->devData[devInd].bond_pr_size);
  patchData->devData[devInd].forceStride = bondedKernel.getForceStride(atomStorageSize);
  reallocate_device<PatchRecord>(d_pr, &st_d_pr_size, patches.size());
  patchData->devData[devInd].bond_pr_size = (int)(st_d_pr_size);
  reallocate_device<int>(d_pid, &st_d_pid_size, patches.size());
  patchData->devData[devInd].bond_pi_size = (int)(st_d_pid_size);
  copy_HtoD<PatchRecord>(&(patches[0]), *d_pr, patches.size(), stream);
  copy_HtoD<int>(&(patchIndex[0]), *d_pid, patches.size(), stream);
  if (params->CUDASOAintegrate && params->useDeviceMigration) {
    patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
  }
}
2564 void ComputeBondedCUDA::launchWork() {
2566 if (CkMyPe() != masterPe)
2567 NAMD_bug(
"ComputeBondedCUDA::launchWork() called on non master PE");
2576 #ifdef NODEGROUP_FORCE_REGISTER 2578 updatePatchRecords();
2584 float3 lata =
make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
2585 float3 latb =
make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
2586 float3 latc =
make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
2592 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2593 PatchData *patchData = cpdata.ckLocalBranch();
2595 float4 *d_atoms = bondedKernel.getAtomBuffer();
2596 copy_DtoH_sync<float4>(d_atoms, (float4*) atoms, atomStorageSize);
2600 fprintf(stderr, "DEV[%d] BOND POS PRINTOUT\n", deviceID);
2602 for(int i = 0 ; i < atomStorageSize; i++){
2603 fprintf(stderr, " ATOMS[%d] = %lf %lf %lf %lf\n",
2604 i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q);
2615 bondedKernel.bondedForce(
2618 doEnergy, doVirial, doSlow, doTable,
2623 (const float4*)atoms, forces,
2628 #ifdef NODEGROUP_FORCE_REGISTER
2629 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
2630 PatchData *patchData = cpdata.ckLocalBranch();
2633 patchData->devData[devInd].b_datoms = bondedKernel.getAtomBuffer();
2635 patchData->devData[devInd].f_bond = bondedKernel.getForces();
2636 patchData->devData[devInd].f_bond_nbond = patchData->devData[devInd].f_bond + bondedKernel.getForceSize(atomStorageSize);
2637 patchData->devData[devInd].f_bond_slow = patchData->devData[devInd].f_bond + 2*bondedKernel.getForceSize(atomStorageSize);
2638 patchData->devData[devInd].f_bond_size = bondedKernel.getForceSize(atomStorageSize);
2644 int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
2646 copy_DtoH_sync(bondedKernel.getForces(), forces, forceStorageSize);
2647 fprintf(stderr, "DEV[%d] BOND FORCE PRINTOUT\n", deviceID);
2648 for(int i = 0; i < forceStorageSize; i++){
2650 fprintf(stderr, "BOND[%d] = %lf\n", i, forces[i]);
2655 forceDoneSetCallback();
2658 void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
2659 ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
2661 if (CkMyPe() != c->masterPe)
2662 NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
2666 cudaError_t err = cudaEventQuery(c->forceDoneEvent);
2667 if (err == cudaSuccess) {
2673 }
else if (err != cudaErrorNotReady) {
2676 sprintf(errmsg, "in ComputeBondedCUDA::forceDoneCheck after polling %d times over %f s",
2677 c->checkCount, walltime - c->beforeForceCompute);
2683 if (c->checkCount >= 1000000) {
2685 sprintf(errmsg, "ComputeBondedCUDA::forceDoneCheck polled %d times over %f s",
2686 c->checkCount, walltime - c->beforeForceCompute);
2692 CcdCallFnAfter(forceDoneCheck, arg, 0.1);
2698 void ComputeBondedCUDA::forceDoneSetCallback() {
2699 if (CkMyPe() != masterPe)
2700 NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
2702 cudaCheck(cudaEventRecord(forceDoneEvent, stream));
2706 beforeForceCompute = CkWallTimer();
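// --- Added commentary (not part of the original source) ---
// forceDoneSetCallback()/forceDoneCheck() above implement non-blocking
// completion detection: an event is recorded on the bonded stream, and the
// Charm++ scheduler re-polls it via CcdCallFnAfter() instead of blocking the
// PE.  A stripped-down sketch of the same idea using only plain CUDA runtime
// calls (the busy-wait loop stands in for the scheduler callback):
#include <cuda_runtime.h>
#include <cstdio>

static bool forceWorkDone(cudaEvent_t ev) {
  cudaError_t err = cudaEventQuery(ev);
  if (err == cudaSuccess) return true;        // GPU work has finished
  if (err != cudaErrorNotReady)               // a real error, not "still running"
    fprintf(stderr, "event query failed: %s\n", cudaGetErrorString(err));
  return false;                               // poll again later
}

static void waitForForces(cudaStream_t stream) {
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, stream);                // mirrors forceDoneSetCallback()
  while (!forceWorkDone(ev)) { /* yield to other host work here */ }
  cudaEventDestroy(ev);
}
// --- end added commentary ---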
2711 template <bool sumNbond, bool sumSlow>
2712 void finishForceLoop(const int numAtoms, const int forceStride,
2713 const double* __restrict__ af,
2714 const double* __restrict__ af_nbond,
2715 const double* __restrict__ af_slow,
2716 Force* __restrict__ f,
2717 Force* __restrict__ f_nbond,
2718 Force* __restrict__ f_slow) {
2721 for (int j=0;j < numAtoms;j++) {
2729 f[j].y += af[j + forceStride];
2730 f[j].z += af[j + forceStride*2];
2736 f_nbond[j].x += af_nbond[j];
2737 f_nbond[j].y += af_nbond[j + forceStride];
2738 f_nbond[j].z += af_nbond[j + forceStride*2];
2745 f_slow[j].x += af_slow[j];
2746 f_slow[j].y += af_slow[j + forceStride];
2747 f_slow[j].z += af_slow[j + forceStride*2];
2754 for(int j=0; j < numAtoms; j++){
2756 f[j].y += af[j+ forceStride];
2757 f[j].z += af[j+ 2*forceStride];
2760 for(int j=0; j < numAtoms; j++){
2761 f_nbond[j].x += af_nbond[j];
2762 f_nbond[j].y += af_nbond[j + forceStride];
2763 f_nbond[j].z += af_nbond[j + 2*forceStride];
2767 for(int j=0; j < numAtoms; j++){
2768 f_slow[j].x += af_slow[j];
2769 f_slow[j].y += af_slow[j + forceStride];
2770 f_slow[j].z += af_slow[j + 2*forceStride];
2776 for(int j=0; j < numAtoms; j++) f[j].x += af[j];
2777 for(int j=0; j < numAtoms; j++) f[j].y += af[j+ forceStride];
2778 for(int j=0; j < numAtoms; j++) f[j].z += af[j+ 2*forceStride];
2781 #ifdef DEBUG_MINIMIZE
2783 printf("%s, line %d\n", __FILE__, __LINE__);
2784 printf(" before: f_nbond[%d] = %f %f %f\n",
2785 k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2787 for(int j=0; j < numAtoms; j++) f_nbond[j].x += af_nbond[j];
2788 for(int j=0; j < numAtoms; j++) f_nbond[j].y += af_nbond[j + forceStride];
2789 for(int j=0; j < numAtoms; j++) f_nbond[j].z += af_nbond[j + 2*forceStride];
2790 #ifdef DEBUG_MINIMIZE
2791 printf(" after: f_nbond[%d] = %f %f %f\n",
2792 k, f_nbond[k].x, f_nbond[k].y, f_nbond[k].z);
2796 for(int j=0; j < numAtoms; j++) f_slow[j].x += af_slow[j];
2797 for(int j=0; j < numAtoms; j++) f_slow[j].y += af_slow[j + forceStride];
2798 for(int j=0; j < numAtoms; j++) f_slow[j].z += af_slow[j + 2*forceStride];
2801 for(int j = 0; j < numAtoms; j++){
2802 fprintf(stderr, "f[%d] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", j,
2803 af[j], af[j + forceStride], af[j + 2*forceStride],
2804 af_nbond[j], af_nbond[j + forceStride], af_nbond[j + 2*forceStride],
2805 af_slow[j], af_slow[j + forceStride], af_slow[j + 2*forceStride]);
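// --- Added commentary (not part of the original source) ---
// finishForceLoop() reads the kernel's structure-of-arrays force buffers:
// for atom j the x, y, z components live at af[j], af[j + forceStride] and
// af[j + 2*forceStride], and are accumulated into the array-of-structures
// host forces.  A standalone sketch of that mapping (Vec3 is a stand-in for
// NAMD's Force type):
struct Vec3 { double x, y, z; };

static void addStridedForces(int numAtoms, int stride,
                             const double* af, Vec3* f) {
  for (int j = 0; j < numAtoms; j++) {
    f[j].x += af[j];                // x block starts at offset 0
    f[j].y += af[j + stride];       // y block starts at offset stride
    f[j].z += af[j + 2*stride];     // z block starts at offset 2*stride
  }
}
// --- end added commentary ---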
2813 void ComputeBondedCUDA::finishPatchesOnPe() {
2817 int myRank = CkMyRank();
2819 const int forceStride = bondedKernel.getForceStride(atomStorageSize);
2820 const int forceSize = bondedKernel.getForceSize(atomStorageSize);
2821 const bool sumNbond = hasModifiedExclusions;
2822 const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
2825 for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
2826 PatchID patchID = patchIDsPerRank[myRank][i];
2830 NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
2833 int pi = patchIndex[patchID];
2834 int numAtoms = patches[pi].numAtoms;
2835 int atomStart = patches[pi].atomStart;
2839 #ifdef NODEGROUP_FORCE_REGISTER
2840 double *af = forces + atomStart;
2841 double *af_nbond = forces + forceSize + atomStart;
2842 double *af_slow = forces + 2*forceSize + atomStart;
2848 if (!sumNbond && !sumSlow) {
2849 finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2850 }
else if (sumNbond && !sumSlow) {
2851 finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2852 }
else if (!sumNbond && sumSlow) {
2853 finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2854 }
else if (sumNbond && sumSlow) {
2855 finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2857 NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2868 double *af = forces + atomStart;
2869 double *af_nbond = forces + forceSize + atomStart;
2870 double *af_slow = forces + 2*forceSize + atomStart;
2872 if (!sumNbond && !sumSlow) {
2873 finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2874 }
else if (sumNbond && !sumSlow) {
2875 finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2876 }
else if (!sumNbond && sumSlow) {
2877 finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2878 }
else if (sumNbond && sumSlow) {
2879 finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
2881 NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
2894 NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES);
2896 NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2900 patchesCounter -= patchIDsPerRank[CkMyRank()].size();
2907 if (patchesCounter == 0) {
2908 patchesCounter = getNumPatches();
2915 if(!atomsChanged) this->finishReductions();
2920 NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_BONDED_CUDA_FINISH_PATCHES_LOCK);
2924 void ComputeBondedCUDA::finishPatches() {
2940 void ComputeBondedCUDA::finishReductions() {
2942 if (CkMyPe() != masterPe)
2943 NAMD_bug(
"ComputeBondedCUDA::finishReductions() called on non masterPe");
2954 cudaCheck(cudaStreamSynchronize(stream));
2956 for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
2957 if (numTuplesPerType[tupletype] > 0) {
2960 switch (tupletype) {
2963 if (hostAlchFlags.alchFepOn) {
2966 if (hostAlchFlags.alchThermIntOn) {
2974 if (hostAlchFlags.alchFepOn) {
2977 if (hostAlchFlags.alchThermIntOn) {
2985 if (hostAlchFlags.alchFepOn) {
2988 if (hostAlchFlags.alchThermIntOn) {
2996 if (hostAlchFlags.alchFepOn) {
2999 if (hostAlchFlags.alchThermIntOn) {
3009 if (hostAlchFlags.alchFepOn) {
3014 if (hostAlchFlags.alchThermIntOn) {
3026 if (hostAlchFlags.alchFepOn) {
3029 if (hostAlchFlags.alchThermIntOn) {
3035 case Tuples::THOLE: {
3037 if (hostAlchFlags.alchFepOn) {
3040 if (hostAlchFlags.alchThermIntOn) {
3047 case Tuples::ANISO: {
3049 if (hostAlchFlags.alchFepOn) {
3052 if (hostAlchFlags.alchThermIntOn) {
3060 NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
3065 auto it = tupleList[tupletype].begin();
3067 (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
3073 #ifndef WRITE_FULL_VIRIALS
3074 #error "non-WRITE_FULL_VIRIALS not implemented"
3077 ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
3078 ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
3079 ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW, energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
3080 ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
3083 ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
3094 void ComputeBondedCUDA::initialize() {
3095 #ifdef NODEGROUP_FORCE_REGISTER
3096 tupleWorkIndex.store(0);
3099 if (CkMyPe() != masterPe)
3100 NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
3103 for (int rank=0;rank < computes.size();rank++) {
3104 if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
3105 pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3110 if (pes.size() == 0) return;
3112 initializeCalled = false;
3115 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
3116 int leastPriority, greatestPriority;
3117 cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
3118 cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
3122 cudaCheck(cudaEventCreate(&forceDoneEvent));
3123 lock = CmiCreateLock();
3124 printLock = CmiCreateLock();
3125 if(Node::Object()->simParameters->CUDASOAintegrateMode) {
3130 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3131 PatchData *patchData = cpdata.ckLocalBranch();
3136 for (int rank=0;rank < computes.size();rank++) {
3137 std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3138 for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3139 for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
3142 patchIDsPerRank[rank].push_back(*jt);
3143 allPatchIDs.push_back(*jt);
3151 for (int rank=0;rank < computes.size();rank++) {
3152 HomeCompute& homeCompute = computes[rank].homeCompute;
3153 std::vector<int>& patchIDs = homeCompute.patchIDs;
3154 for (int i=0;i < patchIDs.size();i++) {
3155 int patchID = patchIDs[i];
3158 patchIDsPerRank[rank].push_back(patchID);
3159 allPatchIDs.push_back(patchID);
3164 std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
3166 std::vector<int> neighborPids;
3167 for (int rank=0;rank < computes.size();rank++) {
3169 HomeCompute& homeCompute = computes[rank].homeCompute;
3170 std::vector<int>& patchIDs = homeCompute.patchIDs;
3171 for (int i=0;i < patchIDs.size();i++) {
3172 int patchID = patchIDs[i];
3174 for (int j=0;j < numNeighbors;j++) {
3176 neighborPids.push_back(neighbors[j]);
3183 std::sort(neighborPids.begin(), neighborPids.end());
3184 auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
3185 neighborPids.resize(std::distance(neighborPids.begin(), it_end));
3188 for (int i=0;i < neighborPids.size();i++) {
3189 for (int rank=0;rank < computes.size();rank++) {
3190 int pid = neighborPids[i];
3191 int pe = rank + CkNodeFirst(CkMyNode());
3192 if (patchMap->node(pid) == pe) {
3195 patchIDsPerRank[rank].push_back(pid);
3196 allPatchIDs.push_back(pid);
3198 patchIDsToAppend[rank].push_back(pid);
3200 pes.push_back(CkNodeFirst(CkMyNode()) + rank);
3207 std::sort(pes.begin(), pes.end());
3208 auto it_end = std::unique(pes.begin(), pes.end());
3209 pes.resize(std::distance(pes.begin(), it_end));
3214 for (int rank=0;rank < computes.size();rank++) {
3216 HomeCompute& homeCompute = computes[rank].homeCompute;
3217 std::vector<int>& patchIDs = homeCompute.patchIDs;
3218 std::vector<int> neighborPatchIDs;
3219 for (int i=0;i < patchIDs.size();i++) {
3220 int patchID = patchIDs[i];
3222 for (int j=0;j < numNeighbors;j++) {
3226 patchIDsPerRank[rank].push_back(neighbors[j]);
3227 allPatchIDs.push_back(neighbors[j]);
3229 if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0
3230 && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
3231 neighborPatchIDs.push_back(neighbors[j]);
3241 for (int i=0;i < neighborPatchIDs.size();i++) {
3242 patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
3246 for (int rank=0;rank < patchIDsToAppend.size();rank++) {
3247 for (int i=0;i < patchIDsToAppend[rank].size();i++) {
3248 computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
3254 std::sort(allPatchIDs.begin(), allPatchIDs.end());
3255 auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
3256 allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
3260 setNumPatches(allPatchIDs.size());
3263 patchesCounter = getNumPatches();
3265 patches.resize(getNumPatches());
3268 for (int rank=0;rank < computes.size();rank++) {
3269 std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
3270 for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
3271 tupleList[it->tuples->getType()].push_back(it->tuples);
3273 HomeCompute& homeCompute = computes[rank].homeCompute;
3274 for (int i=0;i < homeCompute.tuples.size();i++) {
3275 tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
3283 std::vector<char> patchIDset(patchMap->numPatches(), 0);
3284 int numPatchIDset = 0;
3285 int numPatchIDs = 0;
3286 for (int rank=0;rank < computes.size();rank++) {
3287 numPatchIDs += patchIDsPerRank[rank].size();
3288 for (int i=0;i < patchIDsPerRank[rank].size();i++) {
3289 PatchID patchID = patchIDsPerRank[rank][i];
3290 if (patchIDset[patchID] == 0) numPatchIDset++;
3291 patchIDset[patchID] = 1;
3292 if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
3293 NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3297 if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
3298 NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
3303 atomMappers.resize(getNumPatches());
3304 for (int i=0;i < getNumPatches();i++) {
3305 atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
3306 patchIndex[allPatchIDs[i]] = i;
3310 updateHostCudaAlchFlags();
3311 updateKernelCudaAlchFlags();
3312 if (hostAlchFlags.alchOn) {
3313 updateHostCudaAlchParameters();
3314 bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3315 updateHostCudaAlchLambdas();
3316 bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
3317 if (hostAlchFlags.alchDecouple) {
3318 pswitchTable[1+3*1] = 0;
3319 pswitchTable[2+3*2] = 0;
3325 for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
3326 if (tupleList[tupletype].size() > 0) {
3333 std::vector<CudaBondValue> bondValues(NumBondParams);
3334 for (int i=0;i < NumBondParams;i++) {
3335 bondValues[i].k = bond_array[i].k;
3336 bondValues[i].x0 = bond_array[i].x0;
3337 bondValues[i].x1 = bond_array[i].x1;
3339 bondedKernel.setupBondValues(NumBondParams, bondValues.data());
3347 std::vector<CudaAngleValue> angleValues(NumAngleParams);
3348 bool normal_ub_error = false;
3349 for (int i=0;i < NumAngleParams;i++) {
3350 angleValues[i].k = angle_array[i].k;
3351 if (angle_array[i].normal == 1) {
3352 angleValues[i].theta0 = angle_array[i].theta0;
3354 angleValues[i].theta0 = cos(angle_array[i].theta0);
3356 normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
3357 angleValues[i].k_ub = angle_array[i].k_ub;
3358 angleValues[i].r_ub = angle_array[i].r_ub;
3359 angleValues[i].normal = angle_array[i].normal;
3361 if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
3362 bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
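// --- Added commentary (not part of the original source) ---
// The loop above stores theta0 directly for normal harmonic angles and
// cos(theta0) for the cosine-based form (normal != 1), so the kernel can work
// with the cosine of the bend angle without an acos().  Urey-Bradley terms
// (k_ub != 0) only make sense for the normal form, hence the NAMD_die() when
// the two are mixed.  A hedged, self-contained sketch of that host-side
// conversion (struct names here are illustrative, not NAMD's):
#include <cmath>

struct HostAngleParam   { double k, theta0, k_ub, r_ub; int normal; };
struct DeviceAngleParam { float  k, theta0, k_ub, r_ub; int normal; };

static DeviceAngleParam convertAngleParam(const HostAngleParam& a) {
  DeviceAngleParam d;
  d.k      = (float)a.k;
  d.theta0 = (float)((a.normal == 1) ? a.theta0 : std::cos(a.theta0));
  d.k_ub   = (float)a.k_ub;
  d.r_ub   = (float)a.r_ub;
  d.normal = a.normal;
  return d;
}
// --- end added commentary ---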
3370 int NumDihedralParamsMult = 0;
3371 for (int i=0;i < NumDihedralParams;i++) {
3372 NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
3374 std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
3375 dihedralMultMap.resize(NumDihedralParams);
3377 for (int i=0;i < NumDihedralParams;i++) {
3379 dihedralMultMap[i] = k;
3380 for (int j=0;j < multiplicity;j++) {
3381 dihedralValues[k].k = dihedral_array[i].values[j].k;
3382 dihedralValues[k].n = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3383 dihedralValues[k].delta = dihedral_array[i].values[j].delta;
3387 bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
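// --- Added commentary (not part of the original source) ---
// The dihedral (and improper) tables are flattened over multiplicity:
// dihedralValues[k].n packs the periodicity in the upper bits and uses the
// lowest bit as a "another term for this dihedral follows" flag, so the kernel
// can walk consecutive entries without a separate per-dihedral count.  A tiny
// sketch of that packing (helper names are illustrative):
static unsigned packDihedralN(unsigned periodicity, bool moreTermsFollow) {
  return (periodicity << 1) | (moreTermsFollow ? 1u : 0u);
}
static unsigned periodicityOf(unsigned packed) { return packed >> 1; }
static bool     hasNextTerm(unsigned packed)   { return (packed & 1u) != 0; }
// --- end added commentary ---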
3395 int NumImproperParamsMult = 0;
3396 for (int i=0;i < NumImproperParams;i++) {
3397 NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
3399 std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
3400 improperMultMap.resize(NumImproperParams);
3402 for (int i=0;i < NumImproperParams;i++) {
3404 improperMultMap[i] = k;
3405 for (int j=0;j < multiplicity;j++) {
3406 improperValues[k].k = improper_array[i].values[j].k;
3407 improperValues[k].n = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
3408 improperValues[k].delta = improper_array[i].values[j].delta;
3412 bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
3420 std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
3423 for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
3424 for (int i=0;i < N;i++) {
3425 for (int j=0;j < N;j++) {
3430 #define INDEX(ncols,i,j) ((i)*ncols + (j))
3433 const double Ainv[16][16] = {
3434 { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3435 { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3436 {-3, 3, 0, 0, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3437 { 2, -2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
3438 { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
3439 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0},
3440 { 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, -2, -1, 0, 0},
3441 { 0, 0, 0, 0, 0, 0, 0, 0, 2, -2, 0, 0, 1, 1, 0, 0},
3442 {-3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0, 0, 0, 0, 0},
3443 { 0, 0, 0, 0, -3, 0, 3, 0, 0, 0, 0, 0, -2, 0, -1, 0},
3444 { 9, -9, -9, 9, 6, 3, -6, -3, 6, -6, 3, -3, 4, 2, 2, 1},
3445 {-6, 6, 6, -6, -3, -3, 3, 3, -4, 4, -2, 2, -2, -2, -1, -1},
3446 { 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
3447 { 0, 0, 0, 0, 2, 0, -2, 0, 0, 0, 0, 0, 1, 0, 1, 0},
3448 {-6, 6, 6, -6, -4, -2, 4, 2, -3, 3, -3, 3, -2, -1, -2, -1},
3449 { 4, -4, -4, 4, 2, 2, -2, -2, 2, -2, 2, -2, 1, 1, 1, 1}
3453 #define M_PI 3.14159265358979323846
3456 const double h = M_PI/12.0;
3458 const double x[16] = {
3459 table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
3460 table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
3461 table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
3462 table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
3466 float* a = (float *)&crosstermValues[ipar].c[i][j][0];
3467 for (int k=0;k < 16;k++) {
3469 for (int l=0;l < 16;l++) {
3470 a_val += Ainv[k][l]*x[l];
3472 a[k] = (float)a_val;
3478 bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
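// --- Added commentary (not part of the original source; best-effort reading) ---
// For each cell of the CMAP grid (spacing h = pi/12, i.e. 15 degrees) the
// vector x packs the energy d00 and the grid-scaled derivatives d10*h, d01*h
// and d11*h*h at the four cell corners; the 16 bicubic coefficients are then
// obtained as a = Ainv * x, the standard bicubic-interpolation construction,
// so the kernel can evaluate the crossterm map and its gradients inside the
// cell as a polynomial in the fractional coordinates.
// --- end added commentary ---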
3486 case Tuples::THOLE: {
3491 case Tuples::ANISO: {
3497 NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
3507 #ifdef NODEGROUP_FORCE_REGISTER
3510 migrationKernel.setup(nDevices, patchMap->numPatches());
3514 allocate_host<PatchRecord>(&h_patchRecord, patchMap->numPatches());
3515 allocate_device<PatchRecord>(&d_patchRecord, patchMap->numPatches());
3518 allocate_host<double3>(&h_patchMapCenter, patchMap->numPatches());
3519 allocate_device<double3>(&d_patchMapCenter, patchMap->numPatches());
3520 for (int i = 0; i < patchMap->numPatches(); i++) {
3523 copy_HtoD_sync<double3>(h_patchMapCenter, d_patchMapCenter, patchMap->numPatches());
3525 #endif // NODEGROUP_FORCE_REGISTER
3528 #ifdef NODEGROUP_FORCE_REGISTER
3529 void ComputeBondedCUDA::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
3532 std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
3533 for (int i=0;i < data.size();i++) {
3534 patchIndex[data[i].patchID] = i;
3538 for (int i=0;i < data.size();i++) {
3543 patches.push_back(p);
3546 #endif // NODEGROUP_FORCE_REGISTER
3548 void ComputeBondedCUDA::updateHostCudaAlchFlags() {
3551 hostAlchFlags.alchFepOn = sim_params.alchFepOn;
3553 hostAlchFlags.alchWCAOn = sim_params.alchWCAOn;
3554 hostAlchFlags.alchLJcorrection = sim_params.LJcorrection;
3560 void ComputeBondedCUDA::updateKernelCudaAlchFlags() {
3561 bondedKernel.updateCudaAlchFlags(hostAlchFlags);
3564 void ComputeBondedCUDA::updateHostCudaAlchParameters() {
3570 void ComputeBondedCUDA::updateKernelCudaAlchParameters() {
3571 bondedKernel.updateCudaAlchParameters(&hostAlchParameters, stream);
3574 void ComputeBondedCUDA::updateHostCudaAlchLambdas() {
3578 hostAlchLambdas.bondLambda1 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda));
3579 hostAlchLambdas.bondLambda2 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda));
3580 hostAlchLambdas.bondLambda12 = float(sim_params.getBondLambda(hostAlchLambdas.currentLambda2));
3581 hostAlchLambdas.bondLambda22 = float(sim_params.getBondLambda(1.0 - hostAlchLambdas.currentLambda2));
3582 hostAlchLambdas.elecLambdaUp = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda));
3583 hostAlchLambdas.elecLambda2Up = float(sim_params.getElecLambda(hostAlchLambdas.currentLambda2));
3584 hostAlchLambdas.elecLambdaDown = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda));
3585 hostAlchLambdas.elecLambda2Down = float(sim_params.getElecLambda(1.0 - hostAlchLambdas.currentLambda2));
3586 hostAlchLambdas.vdwLambdaUp = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda));
3587 hostAlchLambdas.vdwLambda2Up = float(sim_params.getVdwLambda(hostAlchLambdas.currentLambda2));
3588 hostAlchLambdas.vdwLambdaDown = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda));
3589 hostAlchLambdas.vdwLambda2Down = float(sim_params.getVdwLambda(1.0 - hostAlchLambdas.currentLambda2));
3592 void ComputeBondedCUDA::updateKernelCudaAlchLambdas() {
3593 bondedKernel.updateCudaAlchLambdas(&hostAlchLambdas, stream);
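// --- Added commentary (not part of the original source) ---
// updateHostCudaAlchLambdas() evaluates the SimParameters scaling functions
// (getBondLambda, getElecLambda, getVdwLambda) at the current coupling values
// and at their complements (the Up/Down pairs), for both currentLambda and
// currentLambda2, and collects the results in hostAlchLambdas;
// updateKernelCudaAlchLambdas() then pushes the whole struct to the device in
// a single call so the bonded kernels see a consistent set of lambdas.
// --- end added commentary ---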
3596 #ifdef NODEGROUP_FORCE_REGISTER
3597 void ComputeBondedCUDA::registerPointersToHost() {
3598 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3599 PatchData *patchData = cpdata.ckLocalBranch();
3604 patchData->h_tupleDataStage.bond[deviceIndex] = dst.bond;
3605 patchData->h_tupleDataStage.angle[deviceIndex] = dst.angle;
3606 patchData->h_tupleDataStage.dihedral[deviceIndex] = dst.dihedral;
3607 patchData->h_tupleDataStage.improper[deviceIndex] = dst.improper;
3608 patchData->h_tupleDataStage.modifiedExclusion[deviceIndex] = dst.modifiedExclusion;
3609 patchData->h_tupleDataStage.exclusion[deviceIndex] = dst.exclusion;
3610 patchData->h_tupleDataStage.crossterm[deviceIndex] = dst.crossterm;
3613 patchData->h_tupleCount.bond[deviceIndex] = count.bond();
3614 patchData->h_tupleCount.angle[deviceIndex] = count.angle();
3615 patchData->h_tupleCount.dihedral[deviceIndex] = count.dihedral();
3616 patchData->h_tupleCount.improper[deviceIndex] = count.improper();
3617 patchData->h_tupleCount.modifiedExclusion[deviceIndex] = count.modifiedExclusion();
3618 patchData->h_tupleCount.exclusion[deviceIndex] = count.exclusion();
3619 patchData->h_tupleCount.crossterm[deviceIndex] = count.crossterm();
3622 patchData->h_tupleOffset.bond[deviceIndex] = offset.bond();
3623 patchData->h_tupleOffset.angle[deviceIndex] = offset.angle();
3624 patchData->h_tupleOffset.dihedral[deviceIndex] = offset.dihedral();
3625 patchData->h_tupleOffset.improper[deviceIndex] = offset.improper();
3626 patchData->h_tupleOffset.modifiedExclusion[deviceIndex] = offset.modifiedExclusion();
3627 patchData->h_tupleOffset.exclusion[deviceIndex] = offset.exclusion();
3628 patchData->h_tupleOffset.crossterm[deviceIndex] = offset.crossterm();
3631 void ComputeBondedCUDA::copyHostRegisterToDevice() {
3632 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3633 PatchData *patchData = cpdata.ckLocalBranch();
3640 migrationKernel.copyPeerDataToDevice(
3641 patchData->h_tupleDataStage,
3642 patchData->h_tupleCount,
3643 patchData->h_tupleOffset,
3649 void ComputeBondedCUDA::copyPatchData() {
3653 for (int i = 0; i < numPatches; i++) {
3654 h_patchRecord[i] = patches[patchIndex[i]];
3657 copy_HtoD<PatchRecord>(h_patchRecord, d_patchRecord, numPatches, stream);
3659 #endif // NODEGROUP_FORCE_REGISTER
3663 return (simParams->CUDASOAintegrate) ? reductionGpuResident :
3664 reductionGpuOffload;
3667 #endif // BONDED_CUDA