#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define __thread __declspec(thread)

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
    CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
  Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
  tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {

  exclusionsByAtom = NULL;

  exclIndexMaxDiff = NULL;
  exclIndexMaxDiffSize = 0;

  lambdaWindowUpdated = false;

  atomsChangedIn = true;
  computesChanged = true;
  forceDoneEventRecord = false;

  drudeAtomAlpha = nullptr;
  drudeAtomAlphaSize = 0;

  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
CudaComputeNonbonded::~CudaComputeNonbonded() {
  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (part != NULL) deallocate_host<char>(&part);
  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);

  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);

  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);

  if (patches.size() > 0) {
    deallocate_host<VirialEnergy>(&h_virialEnergy);
    deallocate_device<VirialEnergy>(&d_virialEnergy);
    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
    if (reductionGpuOffload) {
      delete reductionGpuOffload;
    }
    if (reductionGpuResident) {
      delete reductionGpuResident;
    }
  }
void CudaComputeNonbonded::unregisterBox(int i) {
  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
}

void CudaComputeNonbonded::unregisterBoxesOnPe() {
  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
  for (int i=0; i < rankPatches[CkMyRank()].size(); i++) {
    unregisterBox(rankPatches[CkMyRank()][i]);
  }
void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {

  computesChanged = true;

  addCompute(cid, pid, pid, 0.);
}

void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID *pid, int *trans) {

  computesChanged = true;

  offset.x += (t1%3-1) - (t2%3-1);
  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
  offset.z += (t1/9-1) - (t2/9-1);
  addCompute(cid, pid[0], pid[1], offset);
}
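// Illustrative sketch (not part of the original source): registerComputePair() above folds
// each periodic-image transform index t (0..26) into an offset whose components lie in
// {-1, 0, +1} via x = t%3 - 1, y = (t/3)%3 - 1, z = t/9 - 1. A standalone check of that
// decoding, assuming that 3x3x3 encoding is what the trans indices carry:
#if 0
#include <cstdio>

int main() {
  for (int t = 0; t < 27; t++) {
    int dx = t % 3 - 1;        // fastest-varying axis
    int dy = (t / 3) % 3 - 1;
    int dz = t / 9 - 1;        // slowest-varying axis
    printf("t=%2d -> offset (%2d, %2d, %2d)\n", t, dx, dy, dz);
  }
  return 0;
}
#endif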
void CudaComputeNonbonded::addPatch(PatchID pid) {

  computes.push_back(cr);

void CudaComputeNonbonded::updatePatch(int i) {
  int numAtoms = patches[i].patch->getNumAtoms();
  int numFreeAtoms = numAtoms;
  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
  for ( int j=0; j < numAtoms; ++j ) {
    if ( aExt[j].atomFixed ) --numFreeAtoms;
  }
  patches[i].numAtoms = numAtoms;
  patches[i].numFreeAtoms = numFreeAtoms;
#ifdef NODEGROUP_FORCE_REGISTER
  cudaPatches[i].patchID = patches[i].patchID;
#endif

int CudaComputeNonbonded::findPid(PatchID pid) {
  for (int i=0; i < rankPatches[CkMyRank()].size(); i++) {
    int j = rankPatches[CkMyRank()][i];
    if (patches[j].patchID == pid) return j;
  }

  int i = findPid(pid);
  if (i == -1)
    NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
void CudaComputeNonbonded::assignPatch(int i) {
  PatchID pid = patches[i].patchID;

  patch = patchMap->patch(pid);
  patches[i].patch = patch;
  if (patches[i].patch == NULL) {
    NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
  }
  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);

  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);

  if (patches[i].pe != CkMyPe()) {
    NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
  }
  patches[i].pe = CkMyPe();

  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
    if ( ppi != ppj ) return ppi < ppj;
    return pidi.x < pidj.x;

void CudaComputeNonbonded::assignPatchesOnPe() {
  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");

  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
    int i = rankPatches[CkMyRank()][k];
    int pid = patches[i].patchID;
    int homePe = patchMap->node(pid);
    if ( CkNodeOf(homePe) == CkMyNode() ) {
      homePatchByRank[CkRankOf(homePe)].add(pid_index);
    }
  }

  for ( int i=0; i<CkMyNodeSize(); ++i ) {
    std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
    int masterBoost = ( CkMyRank() == i ? 2 : 0 );
    for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
      int index = homePatchByRank[i][j].y;
      patches[index].reversePriorityRankInPe = j + masterBoost;
    }
  }

  for (int i=0; i < rankPatches[CkMyRank()].size(); i++) {
    assignPatch(rankPatches[CkMyRank()][i]);
  }

int findHomePatchPe(PatchIDList *rankPatchIDs, PatchID pid) {
  for (int i=0; i < CkMyNodeSize(); i++) {
    if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
  }

void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
  proxyPatchPes.clear();
  for (int i=0; i < CkMyNodeSize(); i++) {
    int pe = CkNodeFirst(CkMyNode()) + i;

    proxyPatchPes.push_back(pe);
  }
void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
  // Remove duplicate patches
  std::sort(patches.begin(), patches.end());
  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
  patches.erase(last, patches.end());

  computeMgr = computeMgrIn;

  std::map<PatchID, int> pidMap;

  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
  int numPesOnNodeSharingDevice = 0;
  int masterIndex = -1;

    if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
    if ( CkNodeOf(pe) == CkMyNode() ) {
      pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
    }

  std::vector<int> count(patches.size(), 0);
  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
  std::vector<int> rankpcount(CkMyNodeSize(), 0);
  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);

  int unassignedpatches = patches.size();

  for (int i=0; i < patches.size(); ++i) {

  for (int i=0; i < patches.size(); ++i) {
    int pid = patches[i].patchID;
    int homePe = patchMap->node(pid);
    for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
      int pe = pesOnNodeSharingDevice[j];
      if ( pe == homePe ) {

        table[i*numPesOnNodeSharingDevice + j] = 1;
      }
    }
    if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
      patches[i].pe = homePe;
      rankpcount[CkRankOf(homePe)] += 1;
    }
  }

  for (int i=0; i < patches.size(); ++i) {
    int pid = patches[i].patchID;
    if ( patches[i].pe != -1 ) continue;
    for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
      if ( table[i*numPesOnNodeSharingDevice + j] ) {

    patches[i].pe = pesOnNodeSharingDevice[lastj];
  }

  while ( unassignedpatches ) {
    for (i=0; i < patches.size(); ++i) {
      if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
      int pid = patches[i].patchID;
      if ( patches[i].pe != -1 ) continue;
      patches[i].pe = pesOnNodeSharingDevice[assignj];
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
    }
    if ( i < patches.size() ) continue;
    for ( i=0; i < patches.size(); ++i ) {
      int pid = patches[i].patchID;
      if ( patches[i].pe != -1 ) continue;
      if ( count[i] ) continue;
      patches[i].pe = pesOnNodeSharingDevice[assignj];
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
    }
    if ( i < patches.size() ) continue;
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
  }

  rankPatches.resize(CkMyNodeSize());
  for (int i=0; i < patches.size(); ++i) {
    rankPatches[CkRankOf(patches[i].pe)].push_back(i);
    pidMap[patches[i].patchID] = i;
  }
  rankPatches.resize(CkMyNodeSize());

  for (int i=0; i < CkMyNodeSize(); i++) {
    int pe = CkNodeFirst(CkMyNode()) + i;

  std::vector<int> proxyPatchPes;
  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);

  std::vector<int> pesToAvoid;

    if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);

  for (int pe=CkNodeFirst(CkMyNode()); pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize(); pe++) {
    if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
  }

  for (int i=0; i < pesToAvoid.size(); i++) {
    int pe = pesToAvoid[i];
    peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
  }

  peProxyPatchCounter[CkRankOf(masterPe)] = 2;

  for (int i=0; i < patches.size(); i++) {
    PatchID pid = patches[i].patchID;

    if (proxyPatchPes.size() == 0) {
      int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
      pe = CkNodeFirst(CkMyNode()) + rank;
      peProxyPatchCounter[rank]++;

      int minCounter = (1 << 30);
      for (int j=0; j < proxyPatchPes.size(); j++) {
        if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
          pe = proxyPatchPes[j];
          minCounter = peProxyPatchCounter[CkRankOf(pe)];
        }
      }

      NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
      peProxyPatchCounter[CkRankOf(pe)]++;

    } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
      int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
      pe = CkNodeFirst(CkMyNode()) + rank;
      peProxyPatchCounter[rank]++;
    }
    if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
      NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
    rankPatches[CkRankOf(pe)].push_back(i);
  }

  delete [] rankHomePatchIDs;

  for (int i=0; i < computes.size(); i++) {
    computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
    computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
  }
  for (int i=0; i < CkMyNodeSize(); i++) {
    if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
  }
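// Illustrative sketch (not part of the original source): the proxy-placement logic in
// assignPatches() above repeatedly picks the rank with the fewest assigned proxy patches
// using std::min_element over a per-rank counter, with "avoided" ranks pre-loaded with a
// large penalty (1 << 20) so they lose every comparison. A minimal standalone version of
// that selection, with hypothetical counter values:
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> peProxyPatchCounter = {3, 1 << 20, 0, 2};  // rank 1 is "avoided"
  for (int assignment = 0; assignment < 3; assignment++) {
    int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end())
               - peProxyPatchCounter.begin();
    peProxyPatchCounter[rank]++;   // the chosen rank becomes less attractive next time
    printf("assignment %d -> rank %d\n", assignment, rank);
  }
  return 0;
}
#endif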
void CudaComputeNonbonded::updatePatchOrder(const std::vector<CudaLocalRecord>& data) {
  std::map<int, int> pidMap;
  for (int i=0; i < data.size(); ++i) {
    pidMap[data[i].patchID] = i;
  }

  std::vector<PatchRecord> copy = patches;
  for (int i=0; i < copy.size(); i++) {
    const int new_idx = pidMap[copy[i].patchID];
    patches[new_idx] = copy[i];
  }

  for (int i=0; i < rankPatches.size(); i++) {
    rankPatches[i].clear();
  }
  for (int i=0; i < patches.size(); ++i) {
    rankPatches[CkRankOf(patches[i].pe)].push_back(i);
  }

  for (int i=0; i < computes.size(); i++) {
    computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
    computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
  }
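// Illustrative sketch (not part of the original source): updatePatchOrder() above builds a
// patchID -> new-index map from the device ordering and scatters a copy of the old patch
// list into that order. The same pattern on plain integers, with hypothetical IDs:
#if 0
#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<int> patchIDs = {7, 3, 9};          // current order
  std::vector<int> desiredOrder = {9, 7, 3};      // order reported by the device layer
  std::map<int, int> pidMap;
  for (int i = 0; i < (int)desiredOrder.size(); ++i) pidMap[desiredOrder[i]] = i;

  std::vector<int> copy = patchIDs;
  std::vector<int> reordered(copy.size());
  for (int i = 0; i < (int)copy.size(); i++) reordered[pidMap[copy[i]]] = copy[i];

  for (int id : reordered) printf("%d ", id);     // prints: 9 7 3
  printf("\n");
  return 0;
}
#endif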
void CudaComputeNonbonded::initialize() {
  if (patches.size() > 0) {

    allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());

    allocate_host<VirialEnergy>(&h_virialEnergy, 1);
    allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);

    cudaDeviceProp props;
    cudaCheck(cudaGetDeviceProperties(&props, deviceID));
    maxShmemPerBlock = props.sharedMemPerBlock;

#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
    int leastPriority, greatestPriority;
    cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
    int priority = (doStreaming) ? leastPriority : greatestPriority;

    cudaCheck(cudaEventCreate(&forceDoneEvent));

    lock = CmiCreateLock();

#ifdef NODEGROUP_FORCE_REGISTER
    CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
    PatchData *patchData = cpdata.ckLocalBranch();
    patchData->devData[devInd].nbond_stream = stream;

    allocate_host<bool>( &(patchData->devData[devInd].h_hasPatches), nGlobalPatches);
    memset(patchData->devData[devInd].h_hasPatches, 0, sizeof(bool)*nGlobalPatches);
    for (int i = 0; i < patches.size(); i++) {
      patchData->devData[devInd].h_hasPatches[patches[i].patchID] = true;
    }
    allocate_device<bool>( &(patchData->devData[devInd].d_hasPatches), nGlobalPatches);
    copy_HtoD_sync<bool>( patchData->devData[devInd].h_hasPatches, patchData->devData[devInd].d_hasPatches, nGlobalPatches);
#endif

void CudaComputeNonbonded::atomUpdate() {
  atomsChangedIn = true;
}
void CudaComputeNonbonded::updatePatches() {

#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  for (int i=0; i < numPatchesHomeAndProxy; i++) {
    patches[i].numAtoms = localPatches[i].numAtoms;
    patches[i].numFreeAtoms = localPatches[i].numAtoms;
    patches[i].atomStart = localPatches[i].bufferOffsetNBPad;
    cudaPatches[i].numAtoms = localPatches[i].numAtoms;
    cudaPatches[i].atomStart = localPatches[i].bufferOffsetNBPad;
    cudaPatches[i].patchID = localPatches[i].patchID;

      patch = pm->patch(localPatches[i].patchID);
      if (patch != NULL) break;

    if (patch == NULL) NAMD_bug("CudaComputeNonbonded::updatePatches cannot find patch.\n");
    if (patch->getNumAtoms() != localPatches[i].numAtoms) {
      NAMD_bug("CudaComputeNonbonded::updatePatches numAtoms mismatches!\n");
    }
    const CompAtomExt *aExt = patch->getCompAtomExtInfo();
    for (int j = 0; j < localPatches[i].numAtoms; ++j) {
      if (aExt[j].atomFixed) {
        --patches[i].numFreeAtoms;
      }
    }

    int numAtoms = patches[i].numAtoms;
#if defined(NAMD_CUDA)
    maxTileListLen = std::max(maxTileListLen, numTiles);
#else
    maxTileListLen = std::max(maxTileListLen, numTiles);
#endif
  }

  atomStorageSize = atomStart;

  if (maxTileListLen >= 65536) {
    NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
  }

  for (int i=0; i < patches.size(); i++) {
    patches[i].atomStart = atomStart;
#ifdef NODEGROUP_FORCE_REGISTER
    cudaPatches[i].patchID = patches[i].patchID;
#endif
    int numAtoms = patches[i].numAtoms;

    maxTileListLen = std::max(maxTileListLen, numTiles);

    maxTileListLen = std::max(maxTileListLen, numTiles);
  }

  atomStorageSize = atomStart;

  if (maxTileListLen >= 65536) {
    NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
  }
void CudaComputeNonbonded::skipPatch(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
  Flags &flags = patches[i].patch->flags;
  patches[i].positionBox->skip();
  patches[i].forceBox->skip();

  patches[i].psiSumBox->skip();
  patches[i].intRadBox->skip();
  patches[i].bornRadBox->skip();
  patches[i].dEdaSumBox->skip();
  patches[i].dHdrPrefixBox->skip();

void CudaComputeNonbonded::skipPatchesOnPe() {
  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
  for (int i=0; i < rankPatches[CkMyRank()].size(); i++) {
    skipPatch(rankPatches[CkMyRank()][i]);
  }

  patchesCounter -= rankPatches[CkMyRank()].size();
  if (patchesCounter == 0) {

void CudaComputeNonbonded::skip() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");

  if (patches.size() == 0) return;
void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");

  for (int i=0; i < patches.size(); ++i) {
    PatchRecord &pr = patches[i];

    float maxMove = pr.patch->flags.maxAtomMovement;
    if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;

    float maxTol = pr.patch->flags.pairlistTolerance;
    if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
  }
inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
  c->updateVdwTypesExclSubset(first, last);
}

void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
  for (int i=first; i <= last; i++) {

    const CompAtom *compAtom = pr.compAtom;
    const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
    int2* exclp = exclIndexMaxDiff + start;
    int* aip = atomIndex + start;
    if (doAlch) pst = part + start;

    float* drudeAtomAlphaStart;
    isDrudeStart = isDrude + start;
    drudeAtomAlphaStart = drudeAtomAlpha + start;

    for (int k=0; k < numAtoms; ++k ) {
      vdwTypes[start + k] = compAtom[j].vdwType;
      aip[k] = compAtomExt[j].id;
      if (doAlch) pst[k] = compAtom[j].partition;
#ifdef MEM_OPT_VERSION
      exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
      exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
#else // ! MEM_OPT_VERSION
      exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
      exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
#endif // MEM_OPT_VERSION

      atomIndexToNBindex[aip[k]] = start + k;
void CudaComputeNonbonded::updateVdwTypesExcl() {
  // Re-allocate per-atom host buffers as needed
  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
  if (doAlch) reallocate_host<char>(&part, &partSize, atomStorageSize, 1.4f);

  std::fill(isDrude, isDrude + atomStorageSize, -1);
  reallocate_host(&drudeAtomAlpha, &drudeAtomAlphaSize, atomStorageSize, 1.4f);
  std::fill(drudeAtomAlpha, drudeAtomAlpha + atomStorageSize, 0.0);

#if CMK_SMP && USE_CKLOOP
  if (useCkLoop >= 1) {
    CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
  } else
#endif
  {
    updateVdwTypesExclSubset(0, patches.size()-1);
  }

  auto updateIsDrudeNBIndexWorker = [&](int start, int end, void* unused){
    for (int i = start; i <= end; i++) {
      if (isDrude[i] > -1) {
        isDrude[i] = atomIndexToNBindex[isDrude[i]];
      }
    }
  };
#if CMK_SMP && USE_CKLOOP
  if (useCkLoop >= 1) {
    const int numChunks = CkMyNodeSize();
    const int lowerRange = 0;
    const int upperRange = atomStorageSize - 1;
    CkLoop_Parallelize(numChunks, lowerRange, upperRange, updateIsDrudeNBIndexWorker, NULL, CKLOOP_NONE, NULL);
  } else
#endif
  {
    updateIsDrudeNBIndexWorker(0, atomStorageSize - 1, nullptr);
  }

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  nonbondedKernel.updateDrudeData(atomIndexSize, drudeAtomAlpha, isDrude, stream);
  cudaCheck(cudaStreamSynchronize(stream));
#endif

  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);

#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  nonbondedKernel.updateVdwTypesExclOnGPU(tileListKernel,
    patchData->devData[deviceIndex].numPatchesHomeAndProxy,
    atomStorageSize, params->alchOn,
    patchData->devData[deviceIndex].d_localPatches,
    patchData->h_soa_vdwType[deviceIndex],
    patchData->h_soa_id[deviceIndex],
    patchData->h_soa_sortOrder[deviceIndex],
    patchData->h_soa_partition[deviceIndex],
    stream);
#endif // NODEGROUP_FORCE_REGISTER

    NAMD_bug("Unimplemented feature combination: Nbthole and GPU atom migration.\n");
inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
  CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
  c->copyAtomsSubset(first, last);
}

void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
  for (int i=first; i <= last; ++i) {

    const CudaAtom *src = pr.patch->getCudaAtomList();

    memcpy(dst, src, sizeof(CudaAtom)*numAtoms);

    // Pad the patch to its aligned length by replicating the last atom
    CudaAtom lastAtom = src[numAtoms-1];
    for (int j=numAtoms; j < numAtomsAlign; j++) {

    fprintf(stderr, " printing patch %d\n", pr.patch->getPatchID());
    for (int k = 0; k < numAtoms; k++) {
      fprintf(stderr, "%lf %lf %lf\n", dst[k].x, dst[k].y, dst[k].z);
    }
void CudaComputeNonbonded::copyGBISphase(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");

  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();

    float *intRad0 = intRad0H + pr.atomStart;
    float *intRadS = intRadSH + pr.atomStart;
    for (int k=0; k < pr.numAtoms; ++k) {
      intRad0[k] = pr.intRad[2*j+0];
      intRadS[k] = pr.intRad[2*j+1];
    }

    float *bornRad = bornRadH + pr.atomStart;
    for (int k=0; k < pr.numAtoms; ++k ) {
      bornRad[k] = pr.bornRad[j];
    }

    float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
    for (int k=0; k < pr.numAtoms; ++k ) {
      dHdrPrefix[k] = pr.dHdrPrefix[j];
    }
void CudaComputeNonbonded::openBox(int i) {

  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");

  patches[i].compAtom = patches[i].positionBox->open();

#ifdef NODEGROUP_FORCE_REGISTER
    if (atomsChanged && !simParams->useDeviceMigration) copyAtomsSubset(i, i);
  } else copyAtomsSubset(i, i);
#else
  copyAtomsSubset(i, i);
#endif

  patches[i].intRad = patches[i].intRadBox->open();
  patches[i].psiSum = patches[i].psiSumBox->open();

  patches[i].bornRad = patches[i].bornRadBox->open();
  patches[i].dEdaSum = patches[i].dEdaSumBox->open();

  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();

void CudaComputeNonbonded::messageEnqueueWork() {
  if (masterPe != CkMyPe())
    NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");

void CudaComputeNonbonded::openBoxesOnPe() {
  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");

#ifdef NODEGROUP_FORCE_REGISTER
  if (Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {

    for (int i=0; i < rankPatches[CkMyRank()].size(); i++) {
      int j = rankPatches[CkMyRank()][i];
      patches[j].positionBox->open();
    }
    if (masterPe == CkMyPe()) {

  for (int i=0; i < rankPatches[CkMyRank()].size(); i++) {
    openBox(rankPatches[CkMyRank()][i]);
  }

  patchesCounter -= rankPatches[CkMyRank()].size();
  if (patchesCounter == 0) {
#ifdef NODEGROUP_FORCE_REGISTER

void CudaComputeNonbonded::reallocateArrays() {

  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);

  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);

  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);

  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);

  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
void CudaComputeNonbonded::doWork() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");

  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;

  if (patches.size() == 0) return;

  Flags &flags = patches[0].patch->flags;

  if ( computesChanged ) {

#ifdef NODEGROUP_FORCE_REGISTER
  tileListKernel.prepareBuffers(atomStorageSize, patches.size(), cudaPatches, stream);
  updatePatchRecord();
#endif // NODEGROUP_FORCE_REGISTER

void CudaComputeNonbonded::launchWork() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");

  beforeForceCompute = CkWallTimer();

  if ( atomsChanged || computesChanged ) {
    // Invalidate pairlists
    pairlistsValid = false;
    pairlistTolerance = 0.0f;
  }

  float maxAtomMovement = 0.0f;
  float maxPatchTolerance = 0.0f;
  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);

  Flags &flags = patches[0].patch->flags;
  savePairlists = false;
  usePairlists = false;

    savePairlists = true;
    usePairlists = true;

    if ( ! pairlistsValid || ( 2. * maxAtomMovement > pairlistTolerance ) ) {

      usePairlists = true;

  if ( ! usePairlists ) {
    pairlistsValid = false;
  }

  if ( savePairlists ) {
    pairlistsValid = true;
    pairlistTolerance = 2. * maxPatchTolerance;
    plcutoff += pairlistTolerance;
  }
  plcutoff2 = plcutoff * plcutoff;
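// Illustrative sketch (not part of the original source): the pairlist bookkeeping above
// reuses saved pairlists only while no atom has moved more than half of the slack that was
// added to the cutoff when the lists were built (pairlistTolerance = 2 * maxPatchTolerance,
// reuse requires 2 * maxAtomMovement <= pairlistTolerance), and extends the list-building
// cutoff to plcutoff = cutoff + pairlistTolerance. A standalone sketch of that decision,
// with hypothetical distances:
#if 0
#include <cstdio>

int main() {
  const float cutoff = 12.0f;
  float pairlistTolerance = 0.0f;
  bool pairlistsValid = false;

  // Step that saves pairlists with a per-patch tolerance of 0.5 A (hypothetical):
  float maxPatchTolerance = 0.5f;
  pairlistsValid = true;
  pairlistTolerance = 2.0f * maxPatchTolerance;
  float plcutoff = cutoff + pairlistTolerance;
  float plcutoff2 = plcutoff * plcutoff;
  printf("built lists with plcutoff=%g (plcutoff2=%g)\n", plcutoff, plcutoff2);

  // Later step: decide whether the saved lists are still usable.
  float maxAtomMovement = 0.45f;                 // hypothetical
  bool usePairlists = pairlistsValid && (2.0f * maxAtomMovement <= pairlistTolerance);
  printf("2*movement=%g vs tolerance=%g -> usePairlists=%d\n",
         2.0f * maxAtomMovement, pairlistTolerance, (int)usePairlists);
  return 0;
}
#endif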
  if (savePairlists || !usePairlists) {

  patchReadyQueueNext = 0;

  for (int i=0; i < numEmptyPatches; i++) {

    patchReadyQueue[i] = emptyPatches[i];
  }
  if (patchReadyQueueLen != patches.size())
    NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");

  forceDoneSetCallback();

#ifdef NODEGROUP_FORCE_REGISTER
  if (!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration)) {
    copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
    if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
  }
#else
  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
#endif

  nonbondedKernel.reduceVirialEnergy(tileListKernel,
    atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
    d_forces, d_forcesSlow, d_virialEnergy, stream);
  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);

  forceDoneSetCallback();

  cudaCheck(cudaStreamSynchronize(stream));

  fprintf(stderr, "CudaNonbonded data\n");
  for (int i = 0 ; i < atomStorageSize; i++) {
    fprintf(stderr, "pos[%d] = %lf, %lf, %lf, %lf | (%f %f %f) (%f %f %f) \n",
      i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q,
      h_forces[i].x, h_forces[i].y, h_forces[i].z,
      h_forcesSlow[i].x, h_forcesSlow[i].y, h_forcesSlow[i].z);
  }
void CudaComputeNonbonded::doGBISphase1() {

  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);

  Lattice lattice = patches[0].patch->flags.lattice;

  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,

void CudaComputeNonbonded::doGBISphase2() {

  Lattice lattice = patches[0].patch->flags.lattice;

  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);

  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,

    d_forces, dEdaSumH, stream);

void CudaComputeNonbonded::doGBISphase3() {

  Lattice lattice = patches[0].patch->flags.lattice;

  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
void CudaComputeNonbonded::doForce() {

  Lattice lattice = patches[0].patch->flags.lattice;
  bool CUDASOAintegrator = simParams->CUDASOAintegrate;

  bool doPairlist = savePairlists || (!usePairlists);
  bool doFEP=false, doTI=false, doAlchVdwForceSwitching=false;

  static thread_local bool firsttime = true;

    doAlchVdwForceSwitching = simParams->vdwForceSwitching;

    const decltype(alchFlags.lambdaUp) currentLambda = simParams->getCurrentLambda(patches[0].patch->flags.step);
    const decltype(alchFlags.lambda2Up) currentLambda2 = simParams->getCurrentLambda2(patches[0].patch->flags.step);

      lambdaWindowUpdated = true;

    if (alchFlags.lambdaUp != currentLambda ||
        alchFlags.lambda2Up != currentLambda2 ||

      lambdaWindowUpdated = true;

      lambdaWindowUpdated = false;

    if (lambdaWindowUpdated) {

      alchFlags.lambdaUp = currentLambda;

  int numTileLists = calcNumTileLists();

  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
    maxTileListLen, lata, latb, latc,
    cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream,
    atomsChanged, doAlch, CUDASOAintegrator, simParams->useDeviceMigration);

  updateVdwTypesExcl();

  beforeForceCompute = CkWallTimer();

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
  for (int p = 0; p < patches.size(); p++) {
    fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
    for (int i = 0; i < patches[p].numAtoms; i++) {
      const int ai = i + patches[p].atomStart;
      fprintf(stderr, "POS[%d,%d,%d] = %lf %lf %lf %lf. Type %d\n", i, ai, atomIndex[ai],
        atoms[ai].x, atoms[ai].y, atoms[ai].z, atoms[ai].q, vdwTypes[ai]);
    }
  }

#ifdef DEBUG_MINIMIZE
  printf("%s, line %d:\n", __FILE__, __LINE__);
  printf(" atomsChanged = %d\n", atomsChanged);
  printf(" doMinimize = %d\n", doMinimize);
  printf(" doPairlist = %d\n", doPairlist);
  printf(" doEnergy = %d\n", doEnergy);
  printf(" doVirial = %d\n", doVirial);
  printf(" doSlow = %d\n", doSlow);
#endif

  float drudeNbTholeCut2;

  nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize,
    atomsChanged, doMinimize, doPairlist, doEnergy, doVirial,
    doSlow, doAlch, doAlchVdwForceSwitching, doFEP, doTI,
    doNbThole, doTable, lata, latb, latc,
    (const float4*)atoms, cutoff2, c,
    d_forces, d_forcesSlow, h_forces, h_forcesSlow,
    &alchFlags, lambdaWindowUpdated, part,

#ifdef NODEGROUP_FORCE_REGISTER
  updatePatchRecord();

  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
  cudaStreamSynchronize(stream);
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
  for (int p = 0; p < patches.size(); p++) {
    fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
    for (int i = 0; i < patches[p].numAtoms; i++) {
      const int ai = i + patches[p].atomStart;
      fprintf(stderr, "POS[%d,%d,%d] = Type %d (%lf %lf %lf) (%lf %lf %lf)\n", i, ai, atomIndex[ai],
        h_forces[ai].x, h_forces[ai].y, h_forces[ai].z,
        h_forcesSlow[ai].x, h_forcesSlow[ai].y, h_forcesSlow[ai].z);
    }
  }

  traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
#ifdef NODEGROUP_FORCE_REGISTER
void CudaComputeNonbonded::updatePatchRecord() {

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();

  patchData->devData[devInd].f_nbond = d_forces;
  patchData->devData[devInd].f_nbond_slow = d_forcesSlow;
  patchData->devData[devInd].f_nbond_size = atomStorageSize;

  patchData->devData[devInd].nbond_precord = tileListKernel.getCudaPatches();

  patchData->devData[devInd].nb_datoms = tileListKernel.get_xyzq();
  patchData->devData[devInd].nbond_tkernel = &tileListKernel;
  patchData->devData[devInd].size_nb_datoms = atomStorageSize;
}
#endif // NODEGROUP_FORCE_REGISTER

int CudaComputeNonbonded::calcNumTileLists() {
  int numTileLists = 0;
  for (int i=0; i < computes.size(); i++) {
    int pi1 = computes[i].patchInd[0];
    int numAtoms1 = patches[pi1].numAtoms;

    numTileLists += numTiles1;
  }
  return numTileLists;
void CudaComputeNonbonded::finishReductions() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");

  if (doStreaming && (doVirial || doEnergy)) {
    // In the streaming case, the event is recorded only when virial or energy is requested
    if (!forceDoneEventRecord)
      NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
    cudaCheck(cudaEventSynchronize(forceDoneEvent));
    forceDoneEventRecord = false;
  }

    virialTensor.xx = h_virialEnergy->virial[0];
    virialTensor.xy = h_virialEnergy->virial[1];
    virialTensor.xz = h_virialEnergy->virial[2];
    virialTensor.yx = h_virialEnergy->virial[3];
    virialTensor.yy = h_virialEnergy->virial[4];
    virialTensor.yz = h_virialEnergy->virial[5];
    virialTensor.zx = h_virialEnergy->virial[6];
    virialTensor.zy = h_virialEnergy->virial[7];
    virialTensor.zz = h_virialEnergy->virial[8];

  computesChanged = false;
void CudaComputeNonbonded::finishPatch(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");

  pr.results = pr.forceBox->open();

  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();

#ifdef NODEGROUP_FORCE_REGISTER
  if (numAtoms > 0 && (!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration))) {

    float4 *af = h_forces + atomStart;
    float4 *af_slow = h_forcesSlow + atomStart;

    for ( int k=0; k<numAtoms; ++k ) {

#ifdef DEBUG_MINIMIZE
      printf("%s, line %d\n", __FILE__, __LINE__);
      printf("  before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
#endif

#ifdef DEBUG_MINIMIZE
      printf("  after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
#endif

      f_slow[j].x += af_slow[k].x;
      f_slow[j].y += af_slow[k].y;
      f_slow[j].z += af_slow[k].z;
    }
  }

  float4 *af = h_forces + atomStart;
  float4 *af_slow = h_forcesSlow + atomStart;

  for ( int k=0; k<numAtoms; ++k ) {

#ifdef DEBUG_MINIMIZE
    printf("%s, line %d\n", __FILE__, __LINE__);
    printf("  before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
#endif

#ifdef DEBUG_MINIMIZE
    printf("  after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
#endif

    f_slow[j].x += af_slow[k].x;
    f_slow[j].y += af_slow[k].y;
    f_slow[j].z += af_slow[k].z;
  }

  if (!simParams->CUDASOAintegrate || atomsChanged) {
    pr.positionBox->close(&(pr.compAtom));
    pr.forceBox->close(&(pr.results));
  }
void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
  if (patchSet.size() == 0)
    NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");

  for (int i=0; i < patchSet.size(); i++) {
    finishGBISPhase(patchSet[i]);
  }

  if (!simParams->GBISOn || gbisPhaseSave == 3) {
    for (int i=0; i < patchSet.size(); i++) {
      finishPatch(patchSet[i]);
    }
  }

  patchesCounter -= patchSet.size();

  if (patchesCounter == 0) {

  if (!simParams->GBISOn || gbisPhaseSave == 3) {

  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);

void CudaComputeNonbonded::finishPatchesOnPe() {
  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
}

void CudaComputeNonbonded::finishPatchOnPe(int i) {
  std::vector<int> v(1, i);
  finishSetOfPatchesOnPe(v);
}

  if (atomsChanged || doEnergy || doVirial) cudaCheck(cudaStreamSynchronize(stream));
void CudaComputeNonbonded::finishGBISPhase(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");

  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();

    GBReal *psiSumMaster = psiSumH + atomStart;
    for ( int k=0; k<pr.numAtoms; ++k ) {
      pr.psiSum[j] += psiSumMaster[k];
    }
    pr.psiSumBox->close(&(pr.psiSum));

    GBReal *dEdaSumMaster = dEdaSumH + atomStart;
    for ( int k=0; k<pr.numAtoms; ++k ) {
      pr.dEdaSum[j] += dEdaSumMaster[k];
    }
    pr.dEdaSumBox->close(&(pr.dEdaSum));

    pr.intRadBox->close(&(pr.intRad));
    pr.bornRadBox->close(&(pr.bornRad));
    pr.dHdrPrefixBox->close(&(pr.dHdrPrefix));

void CudaComputeNonbonded::finishTimers() {
void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
  CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;

  if (CkMyPe() != c->masterPe)
    NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");

  if (c->doStreaming) {
    int patchInd;
    while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
      c->patchReadyQueue[c->patchReadyQueueNext] = -1;
      c->patchReadyQueueNext++;

      if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {

        if (c->atomsChanged && !c->reSortDone) {
          c->reSortDone = true;
        }
        if ( (c->savePairlists || !(c->usePairlists)) && (c->gbisPhase == 1) && !c->reSortDone) {
          c->reSortDone = true;
        }

        c->forceDoneSetCallback();
      }

      int pe = c->patches[patchInd].pe;
      PatchID patchID = c->patches[patchInd].patchID;
    }

    if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
  }

  if (!c->forceDoneEventRecord)
    NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {

    c->forceDoneEventRecord = false;

    if (c->atomsChanged && !c->reSortDone) {
      c->reSortDone = true;
    }
    if ( (c->savePairlists || !(c->usePairlists)) && (c->gbisPhase == 1) && !c->reSortDone) {
      c->reSortDone = true;
    }

    c->forceDoneSetCallback();

  } else if (err != cudaErrorNotReady) {
    sprintf(errmsg, "in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
  }

  if (c->checkCount >= 1000000) {
    sprintf(errmsg, "CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
  }

void CudaComputeNonbonded::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
  beforeForceCompute = CkWallTimer();

  if (!doStreaming || doVirial || doEnergy) {
    cudaCheck(cudaEventRecord(forceDoneEvent, stream));
    forceDoneEventRecord = true;
  }
static bool sortop_bitreverse(int a, int b) {
  if ( a == b ) return 0;
  for ( int bit = 1; bit; bit *= 2 ) {
    if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
  }
}

    if ( rpri != rprj ) return rpri > rprj;

    if ( ppi != ppj ) return ppi < ppj;
    return pidi.x < pidj.x;
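// Illustrative sketch (not part of the original source): sortop_bitreverse() above orders
// integers by comparing bits from least significant to most significant, i.e. by their
// bit-reversed value, which interleaves nearby indices instead of keeping them adjacent.
// A standalone check of that ordering on 0..7:
#if 0
#include <algorithm>
#include <cstdio>

static bool sortop_bitreverse_sketch(int a, int b) {
  if (a == b) return false;
  for (int bit = 1; bit; bit *= 2) {
    if ((a & bit) != (b & bit)) return ((a & bit) < (b & bit));
  }
  return false;
}

int main() {
  int v[] = {0, 1, 2, 3, 4, 5, 6, 7};
  std::sort(v, v + 8, sortop_bitreverse_sketch);
  for (int x : v) printf("%d ", x);   // prints: 0 4 2 6 1 5 3 7
  printf("\n");
  return 0;
}
#endif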
void CudaComputeNonbonded::updateComputes() {

  Lattice lattice = patches[0].patch->flags.lattice;

  std::stable_sort(computes.begin(), computes.end(), so);

  std::stable_sort(computes.begin(), computes.end(), sorp);

  for (int i=0; i < computes.size(); i++) {
    cudaComputes[i].patchInd.x = computes[i].patchInd[0];
    cudaComputes[i].patchInd.y = computes[i].patchInd[1];
    cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
    cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
    cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
  }

  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
  cudaCheck(cudaStreamSynchronize(stream));

  delete [] cudaComputes;
    return ( li[1] < lj[1] );

void CudaComputeNonbonded::buildExclusions() {

#ifdef MEM_OPT_VERSION
  int natoms = mol->exclSigPoolSize;
#endif

  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
  exclusionsByAtom = new int2[natoms];

  for ( int i=0; i<natoms; ++i ) {

#ifdef MEM_OPT_VERSION
    for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
#else
    int n = mol_list[0] + 1;
    for ( int j=1; j<n; ++j ) {
      curList.add(mol_list[j] - i);
    }
#endif

    for ( j=0; j<unique_lists.size(); ++j ) {
      if ( n != unique_lists[j][0] ) continue;

      for ( k=0; k<n; ++k ) {
        if ( unique_lists[j][k+3] != curList[k] ) break;
      }
      if ( k == n ) break;
    }
    if ( j == unique_lists.size() ) {

      maxdiff = -1 * curList[0];
      if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];

      for ( int k=0; k<n; ++k ) {
        list[k+3] = curList[k];
      }
      unique_lists.add(list);
    }
    listsByAtom[i] = unique_lists[j];
  }

  long int totalbits = 0;
  int nlists = unique_lists.size();
  for ( int j=0; j<nlists; ++j ) {
    int32 *list = unique_lists[j];
    int maxdiff = list[1];
    list[2] = totalbits + maxdiff;
    totalbits += 2*maxdiff + 1;
  }
  for ( int i=0; i<natoms; ++i ) {
    exclusionsByAtom[i].x = listsByAtom[i][1];
    exclusionsByAtom[i].y = listsByAtom[i][2];
  }
  delete [] listsByAtom;

  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );

  long int bytesneeded = totalbits / 8;
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
      unique_lists.size(), bytesneeded);
  }

  if ( bytesneeded > bytesavail ) {
    sprintf(errmsg, "Found %d unique exclusion lists needing %ld bytes "
      "but only %ld bytes can be addressed with 32-bit int.",
      unique_lists.size(), bytesneeded, bytesavail);
  }

#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))

  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
  memset(exclusion_bits, 0, totalbits/8);

  for ( int i=0; i<unique_lists.size(); ++i ) {
    base += unique_lists[i][1];
    if ( unique_lists[i][2] != (int32)base ) {
      NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
    }
    int n = unique_lists[i][0];
    for ( int j=0; j<n; ++j ) {
      SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
    }
    base += unique_lists[i][1] + 1;
  }

  int numExclusions = totalbits/32;

  delete [] exclusion_bits;
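// Illustrative sketch (not part of the original source): each unique exclusion list above
// owns a window of 2*maxdiff+1 bits in one packed bitmap; list[2] stores the window's
// center ("base"), and SET_EXCL(excl, base, diff) sets bit base+diff, so a relative atom
// index diff in [-maxdiff, +maxdiff] can later be tested with a shift and a mask. A
// minimal standalone version of that bit layout, with a hypothetical exclusion list:
#if 0
#include <cstdio>
#include <cstring>

#define SET_EXCL_SKETCH(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1u<<(((BASE)+(DIFF))&31))
#define GET_EXCL_SKETCH(EXCL,BASE,DIFF) \
  (((EXCL)[((BASE)+(DIFF))>>5] >> (((BASE)+(DIFF))&31)) & 1u)

int main() {
  const int maxdiff = 3;                       // exclusions within +/-3 of the atom
  const int diffs[] = {-3, -1, 1, 2};          // hypothetical relative exclusions
  unsigned int bits[2];
  memset(bits, 0, sizeof(bits));

  const int base = maxdiff;                    // center of this list's bit window
  for (int d : diffs) SET_EXCL_SKETCH(bits, base, d);

  for (int d = -maxdiff; d <= maxdiff; d++)
    printf("diff %+d excluded? %u\n", d, GET_EXCL_SKETCH(bits, base, d));
  return 0;
}
#endif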
CudaNBConstants CudaComputeNonbonded::getNonbondedCoef(SimParameters *params) {
  const float cutoffInv = 1.0f / cutoff;
  const float cutoff2Inv = 1.0f / cutoff2;

  const float scutoff2Inv = 1.0f / scutoff2;

  const float slowScale = ((float) simParams->fullElectFrequency) / simParams->nonbondedFrequency;

  c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
  c.lj_1 = scutoff_denom * 2.0f;
  c.lj_2 = scutoff_denom * -12.0f;
  c.lj_3 = 12.0f * scutoff_denom * scutoff2;

  c.e_0 = cutoff2Inv * cutoffInv;
  c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);

bool CudaComputeNonbonded::getDoTable(SimParameters *params, const bool doSlow, const bool doVirial) {
  bool doTable = simParams->useCUDANonbondedForceTable;

  doTable = doTable || doSlow;

  doTable = doTable || (doSlow && doVirial);

  return doTable;
}

SubmitReduction* CudaComputeNonbonded::getCurrentReduction() {
  return (simParams->CUDASOAintegrate) ? reductionGpuResident : reductionGpuOffload;
}