#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define __thread __declspec(thread)

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
  CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) :
  Compute(c), deviceID(deviceID), doStreaming(doStreaming),
  nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
  tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {

  exclusionsByAtom = NULL;
  exclIndexMaxDiff = NULL;
  exclIndexMaxDiffSize = 0;
  lambdaWindowUpdated = false;
  atomsChangedIn = true;
  computesChanged = true;
  forceDoneEventRecord = false;
  NAMD_die("CudaComputeNonbonded, pressure profile not supported");

CudaComputeNonbonded::~CudaComputeNonbonded() {
  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (part != NULL) deallocate_host<char>(&part);
  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
  if (patches.size() > 0) {
    deallocate_host<VirialEnergy>(&h_virialEnergy);
    deallocate_device<VirialEnergy>(&d_virialEnergy);
    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
void CudaComputeNonbonded::unregisterBox(int i) {
  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);

if (rankPatches[CkMyRank()].size() == 0)
  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
  unregisterBox(rankPatches[CkMyRank()][i]);
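// Boxes are registered per patch on the PE that owns it (see assignPatch below), so
// unregistration likewise runs per rank over rankPatches[CkMyRank()].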
computesChanged = true;
addCompute(cid, pid, pid, 0.);

computesChanged = true;
offset.x += (t1%3-1) - (t2%3-1);
offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
offset.z += (t1/9-1) - (t2/9-1);
addCompute(cid, pid[0], pid[1], offset);

void CudaComputeNonbonded::addPatch(PatchID pid) {

computes.push_back(cr);
void CudaComputeNonbonded::updatePatch(int i) {
  int numAtoms = patches[i].patch->getNumAtoms();
  int numFreeAtoms = numAtoms;
  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
  for ( int j=0; j< numAtoms; ++j ) {
    if ( aExt[j].atomFixed ) --numFreeAtoms;
  patches[i].numAtoms = numAtoms;
  patches[i].numFreeAtoms = numFreeAtoms;
#ifdef NODEGROUP_FORCE_REGISTER
  cudaPatches[i].patchID = patches[i].patchID;
int CudaComputeNonbonded::findPid(PatchID pid) {
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    int j = rankPatches[CkMyRank()][i];
    if (patches[j].patchID == pid) return j;

int i = findPid(pid);
NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
void CudaComputeNonbonded::assignPatch(int i) {
  PatchID pid = patches[i].patchID;
  patch = patchMap->patch(pid);
  patches[i].patch = patch;
  if (patches[i].patch == NULL) {
    NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);
  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
  if (patches[i].pe != CkMyPe()) {
    NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
  patches[i].pe = CkMyPe();
  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );

if ( ppi != ppj ) return ppi < ppj;
return pidi.x < pidj.x;
if (rankPatches[CkMyRank()].size() == 0)
  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
  int i = rankPatches[CkMyRank()][k];
  int pid = patches[i].patchID;
  int homePe = patchMap->node(pid);
  if ( CkNodeOf(homePe) == CkMyNode() ) {
    homePatchByRank[CkRankOf(homePe)].add(pid_index);
for ( int i=0; i<CkMyNodeSize(); ++i ) {
  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
    int index = homePatchByRank[i][j].y;
    patches[index].reversePriorityRankInPe = j + masterBoost;
for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
  assignPatch(rankPatches[CkMyRank()][i]);

for (int i=0;i < CkMyNodeSize();i++) {
  if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
proxyPatchPes.clear();
for (int i=0;i < CkMyNodeSize();i++) {
  int pe = CkNodeFirst(CkMyNode()) + i;
  proxyPatchPes.push_back(pe);

std::sort(patches.begin(), patches.end());
std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
patches.erase(last, patches.end());
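// A patch can be added once per self/pair compute that touches it; the sort /
// std::unique / erase sequence above collapses those duplicates to a single
// PatchRecord per PatchID before PEs are assigned.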
computeMgr = computeMgrIn;

std::map<PatchID, int> pidMap;

std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
int numPesOnNodeSharingDevice = 0;
int masterIndex = -1;
if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
if ( CkNodeOf(pe) == CkMyNode() ) {
  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;

std::vector<int> count(patches.size(), 0);
std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
std::vector<int> rankpcount(CkMyNodeSize(), 0);
std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);

int unassignedpatches = patches.size();

for (int i=0;i < patches.size(); ++i) {

for (int i=0;i < patches.size(); ++i) {
  int pid = patches[i].patchID;
  int homePe = patchMap->node(pid);
  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
    int pe = pesOnNodeSharingDevice[j];
    if ( pe == homePe ) {
      table[i*numPesOnNodeSharingDevice + j] = 1;
  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
    patches[i].pe = homePe;
    rankpcount[CkRankOf(homePe)] += 1;

for (int i=0; i < patches.size(); ++i) {
  int pid = patches[i].patchID;
  if ( patches[i].pe != -1 ) continue;
  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
    if ( table[i*numPesOnNodeSharingDevice + j] ) {
  patches[i].pe = pesOnNodeSharingDevice[lastj];

while ( unassignedpatches ) {
  for (i=0;i < patches.size(); ++i) {
    if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
    int pid = patches[i].patchID;
    if ( patches[i].pe != -1 ) continue;
    patches[i].pe = pesOnNodeSharingDevice[assignj];
    pcount[assignj] += 1;
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
  if (i < patches.size() ) continue;
  for ( i=0;i < patches.size(); ++i ) {
    int pid = patches[i].patchID;
    if ( patches[i].pe != -1 ) continue;
    if ( count[i] ) continue;
    patches[i].pe = pesOnNodeSharingDevice[assignj];
    pcount[assignj] += 1;
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
  if ( i < patches.size() ) continue;
  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
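// Patch-to-PE assignment: patches homed on this node are pinned to their home PE; the
// remaining patches are first matched against PEs marked in table[] for that patch and
// then dealt out round-robin over pesOnNodeSharingDevice (assignj wraps at
// numPesOnNodeSharingDevice), with per-PE tallies kept in pcount/rankpcount.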
rankPatches.resize(CkMyNodeSize());
for (int i=0; i < patches.size(); ++i) {
  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
  pidMap[patches[i].patchID] = i;

rankPatches.resize(CkMyNodeSize());
for (int i=0;i < CkMyNodeSize();i++) {
  int pe = CkNodeFirst(CkMyNode()) + i;

std::vector<int> proxyPatchPes;
std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
std::vector<int> pesToAvoid;
if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
for (int i=0;i < pesToAvoid.size();i++) {
  int pe = pesToAvoid[i];
  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
peProxyPatchCounter[CkRankOf(masterPe)] = 2;
for (int i=0;i < patches.size();i++) {
  PatchID pid = patches[i].patchID;
  if (proxyPatchPes.size() == 0) {
    int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
    pe = CkNodeFirst(CkMyNode()) + rank;
    peProxyPatchCounter[rank]++;
    int minCounter = (1 << 30);
    for (int j=0;j < proxyPatchPes.size();j++) {
      if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
        pe = proxyPatchPes[j];
        minCounter = peProxyPatchCounter[CkRankOf(pe)];
      NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
    peProxyPatchCounter[CkRankOf(pe)]++;
  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
    int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
    pe = CkNodeFirst(CkMyNode()) + rank;
    peProxyPatchCounter[rank]++;
  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
    NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
  rankPatches[CkRankOf(pe)].push_back(i);

delete [] rankHomePatchIDs;
for (int i=0;i < computes.size();i++) {
  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
for (int i=0;i < CkMyNodeSize();i++) {
  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
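// Proxy placement: for a patch homed on another node, the PE is chosen among the ranks
// that already hold a proxy of it (proxyPatchPes), taking the one with the smallest
// peProxyPatchCounter; PEs in pesToAvoid (PME PEs and other master PEs) are effectively
// excluded by seeding their counters with 1 << 20, while this device's masterPe starts at 2.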
std::map<int, int> pidMap;
for (int i=0; i < data.size(); ++i) {
  pidMap[data[i].patchID] = i;

std::vector<PatchRecord> copy = patches;
for (int i=0; i < copy.size(); i++) {
  const int new_idx = pidMap[copy[i].patchID];
  patches[new_idx] = copy[i];

for (int i=0; i < rankPatches.size(); i++) {
  rankPatches[i].clear();
for (int i=0; i < patches.size(); ++i) {
  rankPatches[CkRankOf(patches[i].pe)].push_back(i);

for (int i=0;i < computes.size();i++) {
  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];

if (patches.size() > 0) {
  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
cudaDeviceProp props;
cudaCheck(cudaGetDeviceProperties(&props, deviceID));
maxShmemPerBlock = props.sharedMemPerBlock;
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
int leastPriority, greatestPriority;
cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
int priority = (doStreaming) ? leastPriority : greatestPriority;
cudaCheck(cudaEventCreate(&forceDoneEvent));
lock = CmiCreateLock();
#ifdef NODEGROUP_FORCE_REGISTER
CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
PatchData *patchData = cpdata.ckLocalBranch();
patchData->devData[devInd].nbond_stream = stream;
allocate_host<bool>( &(patchData->devData[devInd].h_hasPatches), nGlobalPatches);
memset(patchData->devData[devInd].h_hasPatches, 0, sizeof(bool)*nGlobalPatches);
for(int i = 0; i < patches.size(); i++){
  patchData->devData[devInd].h_hasPatches[patches[i].patchID] = true;
allocate_device<bool>( &(patchData->devData[devInd].d_hasPatches), nGlobalPatches);
copy_HtoD_sync<bool>( patchData->devData[devInd].h_hasPatches, patchData->devData[devInd].d_hasPatches, nGlobalPatches);
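// Nodegroup bookkeeping: h_hasPatches is a host-side flag array over all global patch
// IDs marking the patches this device handles; it is filled from patches[] and mirrored
// once to the device as d_hasPatches via copy_HtoD_sync.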
atomsChangedIn = true;

void CudaComputeNonbonded::updatePatches() {
#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  for (int i=0;i < numPatchesHomeAndProxy; i++) {
    patches[i].numAtoms = localPatches[i].numAtoms;
    patches[i].numFreeAtoms = localPatches[i].numAtoms;
    patches[i].atomStart = localPatches[i].bufferOffsetNBPad;
    cudaPatches[i].numAtoms = localPatches[i].numAtoms;
    cudaPatches[i].atomStart = localPatches[i].bufferOffsetNBPad;
    cudaPatches[i].patchID = localPatches[i].patchID;
    patch = pm->patch(localPatches[i].patchID);
    if (patch != NULL) break;
    if (patch == NULL) NAMD_bug("CudaComputeNonbonded::updatePatches cannot find patch.\n");
    if (patch->getNumAtoms() != localPatches[i].numAtoms) {
      NAMD_bug("CudaComputeNonbonded::updatePatches numAtoms mismatches!\n");
    const CompAtomExt *aExt = patch->getCompAtomExtInfo();
    for (int j = 0; j < localPatches[i].numAtoms; ++j) {
      if (aExt[j].atomFixed) {
        --patches[i].numFreeAtoms;
    int numAtoms = patches[i].numAtoms;
#if defined(NAMD_CUDA)
    maxTileListLen = std::max(maxTileListLen, numTiles);
    maxTileListLen = std::max(maxTileListLen, numTiles);
  atomStorageSize = atomStart;
  if (maxTileListLen >= 65536) {
    NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");

  for (int i=0;i < patches.size();i++) {
    patches[i].atomStart = atomStart;
#ifdef NODEGROUP_FORCE_REGISTER
    cudaPatches[i].patchID = patches[i].patchID;
    int numAtoms = patches[i].numAtoms;
    maxTileListLen = std::max(maxTileListLen, numTiles);
    maxTileListLen = std::max(maxTileListLen, numTiles);
  atomStorageSize = atomStart;
  if (maxTileListLen >= 65536) {
    NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile lists (65536) blown");
void CudaComputeNonbonded::skipPatch(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
  Flags &flags = patches[i].patch->flags;
  patches[i].positionBox->skip();
  patches[i].forceBox->skip();
  patches[i].psiSumBox->skip();
  patches[i].intRadBox->skip();
  patches[i].bornRadBox->skip();
  patches[i].dEdaSumBox->skip();
  patches[i].dHdrPrefixBox->skip();

if (rankPatches[CkMyRank()].size() == 0)
  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
  skipPatch(rankPatches[CkMyRank()][i]);
patchesCounter -= rankPatches[CkMyRank()].size();
if (patchesCounter == 0) {

void CudaComputeNonbonded::skip() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
  if (patches.size() == 0) return;
void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
  for (int i=0;i < patches.size();++i) {
    float maxMove = pr.patch->flags.maxAtomMovement;
    if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
    float maxTol = pr.patch->flags.pairlistTolerance;
    if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;

inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
  c->updateVdwTypesExclSubset(first, last);

void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
  for (int i=first;i <= last;i++) {
    const CompAtom *compAtom = pr.compAtom;
    const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
    int2* exclp = exclIndexMaxDiff + start;
    int* aip = atomIndex + start;
    if(doAlch) pst = part + start;
    for (int k=0;k < numAtoms; ++k ) {
      vdwTypes[start + k] = compAtom[j].vdwType;
      aip[k] = compAtomExt[j].id;
      if(doAlch) pst[k] = compAtom[j].partition;
#ifdef MEM_OPT_VERSION
      exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
      exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
#else // ! MEM_OPT_VERSION
      exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
      exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
#endif // MEM_OPT_VERSION

void CudaComputeNonbonded::updateVdwTypesExcl() {
  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
  if (doAlch) reallocate_host<char>(&part, &partSize, atomStorageSize, 1.4f);
#if CMK_SMP && USE_CKLOOP
  if (useCkLoop >= 1) {
    CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
  updateVdwTypesExclSubset(0, patches.size()-1);
  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
    patchData->devData[deviceIndex].numPatchesHomeAndProxy,
    atomStorageSize, params->alchOn,
    patchData->devData[deviceIndex].d_localPatches,
    patchData->h_soa_vdwType[deviceIndex],
    patchData->h_soa_id[deviceIndex],
    patchData->h_soa_sortOrder[deviceIndex],
    patchData->h_soa_partition[deviceIndex],
#endif // NODEGROUP_FORCE_REGISTER

inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
  c->copyAtomsSubset(first, last);

void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
  for (int i=first;i <= last;++i) {
    const CudaAtom *src = pr.patch->getCudaAtomList();
    memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
    CudaAtom lastAtom = src[numAtoms-1];
    for (int j=numAtoms;j < numAtomsAlign;j++) {
    fprintf(stderr, " printing patch %d\n", pr.patch->getPatchID());
    for(int k = 0; k < numAtoms; k++){
      fprintf(stderr, "%lf %lf %lf\n", dst[k].x, dst[k].y, dst[k].z);
void CudaComputeNonbonded::copyGBISphase(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
  float *intRad0 = intRad0H + pr.atomStart;
  float *intRadS = intRadSH + pr.atomStart;
    intRad0[k] = pr.intRad[2*j+0];
    intRadS[k] = pr.intRad[2*j+1];
  float *bornRad = bornRadH + pr.atomStart;
  for ( int k=0; k < pr.numAtoms; ++k ) {
    bornRad[k] = pr.bornRad[j];
  float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
  for ( int k=0; k < pr.numAtoms; ++k ) {
    dHdrPrefix[k] = pr.dHdrPrefix[j];

void CudaComputeNonbonded::openBox(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
  patches[i].compAtom = patches[i].positionBox->open();
#ifdef NODEGROUP_FORCE_REGISTER
  if(atomsChanged && !simParams->useDeviceMigration) copyAtomsSubset(i, i);
  } else copyAtomsSubset(i, i);
  copyAtomsSubset(i, i);
  patches[i].intRad = patches[i].intRadBox->open();
  patches[i].psiSum = patches[i].psiSumBox->open();
  patches[i].bornRad = patches[i].bornRadBox->open();
  patches[i].dEdaSum = patches[i].dEdaSumBox->open();
  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
if (masterPe != CkMyPe())
  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");

if (rankPatches[CkMyRank()].size() == 0)
  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
#ifdef NODEGROUP_FORCE_REGISTER
if(Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    int j = rankPatches[CkMyRank()][i];
    patches[j].positionBox->open();
  if(masterPe == CkMyPe()) {
for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
  openBox(rankPatches[CkMyRank()][i]);
patchesCounter -= rankPatches[CkMyRank()].size();
if (patchesCounter == 0) {

#ifdef NODEGROUP_FORCE_REGISTER

void CudaComputeNonbonded::reallocateArrays() {
  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);
  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
if (CkMyPe() != masterPe)
  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
atomsChanged = atomsChangedIn;
atomsChangedIn = false;
if (patches.size() == 0) return;
Flags &flags = patches[0].patch->flags;
if ( computesChanged ) {
#ifdef NODEGROUP_FORCE_REGISTER
tileListKernel.prepareBuffers(atomStorageSize, patches.size(), cudaPatches, stream);
updatePatchRecord();
#endif // NODEGROUP_FORCE_REGISTER

if (CkMyPe() != masterPe)
  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
beforeForceCompute = CkWallTimer();
if ( atomsChanged || computesChanged ) {
  pairlistsValid = false;
  pairlistTolerance = 0.0f;
float maxAtomMovement = 0.0f;
float maxPatchTolerance = 0.0f;
getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
Flags &flags = patches[0].patch->flags;
savePairlists = false;
usePairlists = false;
savePairlists = true;
usePairlists = true;
if ( ! pairlistsValid || ( 2. * maxAtomMovement > pairlistTolerance ) ) {
#ifdef NODEGROUP_FORCE_REGISTER
usePairlists = true;
if ( ! usePairlists ) {
  pairlistsValid = false;
if ( savePairlists ) {
  pairlistsValid = true;
  pairlistTolerance = 2. * maxPatchTolerance;
  plcutoff += pairlistTolerance;
plcutoff2 = plcutoff * plcutoff;
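// Pairlist reuse: lists are rebuilt when invalid or when atoms have drifted too far
// (2 * maxAtomMovement > pairlistTolerance). When a fresh list is saved, the stored
// tolerance becomes twice the largest per-patch pairlistTolerance and the pairlist
// cutoff is padded by that margin before being squared into plcutoff2.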
if(savePairlists || !usePairlists){
patchReadyQueueNext = 0;
for (int i=0;i < numEmptyPatches;i++) {
  patchReadyQueue[i] = emptyPatches[i];
if (patchReadyQueueLen != patches.size())
  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
forceDoneSetCallback();

#ifdef NODEGROUP_FORCE_REGISTER
if(!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration)){
  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);

  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
  d_forces, d_forcesSlow, d_virialEnergy, stream);
copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);

forceDoneSetCallback();
cudaCheck(cudaStreamSynchronize(stream));

fprintf(stderr, "CudaNonbonded data\n");
for(int i = 0 ; i < atomStorageSize; i++){
  fprintf(stderr, "pos[%d] = %lf, %lf, %lf, %lf | (%f %f %f) (%f %f %f) \n",
    i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q,
    h_forces[i].x, h_forces[i].y, h_forces[i].z,
    h_forcesSlow[i].x, h_forcesSlow[i].y, h_forcesSlow[i].z);
void CudaComputeNonbonded::doGBISphase1() {
  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
  Lattice lattice = patches[0].patch->flags.lattice;
  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,

void CudaComputeNonbonded::doGBISphase2() {
  Lattice lattice = patches[0].patch->flags.lattice;
  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
    d_forces, dEdaSumH, stream);

void CudaComputeNonbonded::doGBISphase3() {
  Lattice lattice = patches[0].patch->flags.lattice;
  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
void CudaComputeNonbonded::doForce() {
  Lattice lattice = patches[0].patch->flags.lattice;
  bool CUDASOAintegrator = simParams->CUDASOAintegrate;
  bool doPairlist = savePairlists || (!usePairlists);
  bool doFEP=false, doTI=false, doAlchVdwForceSwitching=false;
  static thread_local bool firsttime = true;
  doAlchVdwForceSwitching = simParams->vdwForceSwitching;
  const decltype(alchFlags.lambdaUp) currentLambda = simParams->getCurrentLambda(patches[0].patch->flags.step);
  const decltype(alchFlags.lambda2Up) currentLambda2 = simParams->getCurrentLambda2(patches[0].patch->flags.step);
  lambdaWindowUpdated = true;
  if (alchFlags.lambdaUp != currentLambda ||
      alchFlags.lambda2Up != currentLambda2 ||
    lambdaWindowUpdated = true;
    lambdaWindowUpdated = false;
  if (lambdaWindowUpdated) {
    alchFlags.lambdaUp = currentLambda;

  int numTileLists = calcNumTileLists();
  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
    maxTileListLen, lata, latb, latc,
    cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream, atomsChanged, doAlch,
    CUDASOAintegrator, simParams->useDeviceMigration);
  updateVdwTypesExcl();
  beforeForceCompute = CkWallTimer();
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
  for (int p = 0; p < patches.size(); p++) {
    fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
    for (int i = 0; i < patches[p].numAtoms; i++) {
      const int ai = i + patches[p].atomStart;
      fprintf(stderr, "POS[%d,%d,%d] = %lf %lf %lf %lf. Type %d\n", i, ai, atomIndex[ai],
        atoms[ai].x, atoms[ai].y, atoms[ai].z, atoms[ai].q, vdwTypes[ai]);

#ifdef DEBUG_MINIMIZE
  printf("%s, line %d:\n", __FILE__, __LINE__);
  printf(" atomsChanged = %d\n", atomsChanged);
  printf(" doMinimize = %d\n", doMinimize);
  printf(" doPairlist = %d\n", doPairlist);
  printf(" doEnergy = %d\n", doEnergy);
  printf(" doVirial = %d\n", doVirial);
  printf(" doSlow = %d\n", doSlow);

    atomsChanged, doMinimize, doPairlist, doEnergy, doVirial,
    doSlow, doAlch, doAlchVdwForceSwitching, doFEP, doTI,
    doTable, lata, latb, latc,
    (const float4*)atoms, cutoff2, c,
    d_forces, d_forcesSlow, h_forces, h_forcesSlow,
    &alchFlags, lambdaWindowUpdated, part, CUDASOAintegrator,
#ifdef NODEGROUP_FORCE_REGISTER
  updatePatchRecord();

copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
cudaStreamSynchronize(stream);
CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
PatchData *patchData = cpdata.ckLocalBranch();
fprintf(stderr, "DEV[%d] MIG POS PRINTOUT\n", deviceID);
for (int p = 0; p < patches.size(); p++) {
  fprintf(stderr, "Patch Index %d. Patch ID %d\n", p, cudaPatches[p].patchID);
  for (int i = 0; i < patches[p].numAtoms; i++) {
    const int ai = i + patches[p].atomStart;
    fprintf(stderr, "POS[%d,%d,%d] = Type %d (%lf %lf %lf) (%lf %lf %lf)\n", i, ai, atomIndex[ai],
      h_forces[ai].x, h_forces[ai].y, h_forces[ai].z,
      h_forcesSlow[ai].x, h_forcesSlow[ai].y, h_forcesSlow[ai].z);

traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
#ifdef NODEGROUP_FORCE_REGISTER
void CudaComputeNonbonded::updatePatchRecord() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *patchData = cpdata.ckLocalBranch();
  patchData->devData[devInd].f_nbond = d_forces;
  patchData->devData[devInd].f_nbond_slow = d_forcesSlow;
  patchData->devData[devInd].f_nbond_size = atomStorageSize;
  patchData->devData[devInd].nbond_precord = tileListKernel.getCudaPatches();
  patchData->devData[devInd].nb_datoms = tileListKernel.get_xyzq();
  patchData->devData[devInd].nbond_tkernel = &tileListKernel;
  patchData->devData[devInd].size_nb_datoms = atomStorageSize;

int CudaComputeNonbonded::calcNumTileLists() {
  int numTileLists = 0;
  for (int i=0;i < computes.size();i++) {
    int pi1 = computes[i].patchInd[0];
    int numAtoms1 = patches[pi1].numAtoms;
    numTileLists += numTiles1;
  return numTileLists;
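// Each self/pair compute contributes one tile list per tile of its first patch, so
// numTileLists is the sum of the per-compute tile counts (numTiles1, derived from
// patches[patchInd[0]].numAtoms); the total sizes tileListKernel.buildTileLists(), while
// the 65536 check in updatePatches() bounds the length of any single tile list.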
if (CkMyPe() != masterPe)
  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
if (doStreaming && (doVirial || doEnergy)) {
  if (!forceDoneEventRecord)
    NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
  cudaCheck(cudaEventSynchronize(forceDoneEvent));
  forceDoneEventRecord = false;

virialTensor.xx = h_virialEnergy->virial[0];
virialTensor.xy = h_virialEnergy->virial[1];
virialTensor.xz = h_virialEnergy->virial[2];
virialTensor.yx = h_virialEnergy->virial[3];
virialTensor.yy = h_virialEnergy->virial[4];
virialTensor.yz = h_virialEnergy->virial[5];
virialTensor.zx = h_virialEnergy->virial[6];
virialTensor.zy = h_virialEnergy->virial[7];
virialTensor.zz = h_virialEnergy->virial[8];

#ifdef NODEGROUP_FORCE_REGISTER

computesChanged = false;
void CudaComputeNonbonded::finishPatch(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
  pr.results = pr.forceBox->open();
  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
#ifdef NODEGROUP_FORCE_REGISTER
  if (numAtoms > 0 && (!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration))) {
    float4 *af = h_forces + atomStart;
    float4 *af_slow = h_forcesSlow + atomStart;
    for ( int k=0; k<numAtoms; ++k ) {
#ifdef DEBUG_MINIMIZE
      printf("%s, line %d\n", __FILE__, __LINE__);
      printf(" before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
#ifdef DEBUG_MINIMIZE
      printf(" after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
      f_slow[j].x += af_slow[k].x;
      f_slow[j].y += af_slow[k].y;
      f_slow[j].z += af_slow[k].z;

  float4 *af = h_forces + atomStart;
  float4 *af_slow = h_forcesSlow + atomStart;
  for ( int k=0; k<numAtoms; ++k ) {
#ifdef DEBUG_MINIMIZE
    printf("%s, line %d\n", __FILE__, __LINE__);
    printf(" before: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
#ifdef DEBUG_MINIMIZE
    printf(" after: f[%d] = %f %f %f\n", j, f[j].x, f[j].y, f[j].z);
    f_slow[j].x += af_slow[k].x;
    f_slow[j].y += af_slow[k].y;
    f_slow[j].z += af_slow[k].z;

  if(!simParams->CUDASOAintegrate || atomsChanged){
    pr.positionBox->close(&(pr.compAtom));
    pr.forceBox->close(&(pr.results));
void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
  if (patchSet.size() == 0)
    NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
  for (int i=0;i < patchSet.size();i++) {
    finishGBISPhase(patchSet[i]);
  if (!simParams->GBISOn || gbisPhaseSave == 3) {
    for (int i=0;i < patchSet.size();i++) {
      finishPatch(patchSet[i]);
  patchesCounter -= patchSet.size();
  if (patchesCounter == 0) {
    if (!simParams->GBISOn || gbisPhaseSave == 3) {
  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);

finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);

std::vector<int> v(1, i);
finishSetOfPatchesOnPe(v);

if (atomsChanged || doEnergy || doVirial) cudaCheck(cudaStreamSynchronize(stream));
void CudaComputeNonbonded::finishGBISPhase(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
  GBReal *psiSumMaster = psiSumH + atomStart;
  for ( int k=0; k<pr.numAtoms; ++k ) {
    pr.psiSum[j] += psiSumMaster[k];
  pr.psiSumBox->close(&(pr.psiSum));
  GBReal *dEdaSumMaster = dEdaSumH + atomStart;
  for ( int k=0; k<pr.numAtoms; ++k ) {
    pr.dEdaSum[j] += dEdaSumMaster[k];
  pr.dEdaSumBox->close(&(pr.dEdaSum));
  pr.intRadBox->close(&(pr.intRad));
  pr.bornRadBox->close(&(pr.bornRad));
  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix));

void CudaComputeNonbonded::finishTimers() {
void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
  if (CkMyPe() != c->masterPe)
    NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
  if (c->doStreaming) {
    while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
      c->patchReadyQueue[c->patchReadyQueueNext] = -1;
      c->patchReadyQueueNext++;
      if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
        if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
          c->reSortDone = true;
        c->forceDoneSetCallback();
      int pe = c->patches[patchInd].pe;
      PatchID patchID = c->patches[patchInd].patchID;
    if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
  if (!c->forceDoneEventRecord)
    NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {
    c->forceDoneEventRecord = false;
    if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
      c->reSortDone = true;
    c->forceDoneSetCallback();
  } else if (err != cudaErrorNotReady) {
    sprintf(errmsg, "in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
  if (c->checkCount >= 1000000) {
    sprintf(errmsg, "CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
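// Completion is detected by polling rather than blocking: forceDoneCheck keeps re-arming
// itself on the master PE (via forceDoneSetCallback below), draining the mapped
// patchReadyQueue in the streaming path or querying forceDoneEvent otherwise, and it
// aborts with the diagnostics above if the event reports an error or roughly a million
// polls elapse without completion.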
void CudaComputeNonbonded::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
  beforeForceCompute = CkWallTimer();
  if (!doStreaming || doVirial || doEnergy) {
    cudaCheck(cudaEventRecord(forceDoneEvent, stream));
    forceDoneEventRecord = true;
if ( a == b ) return 0;
for ( int bit = 1; bit; bit *= 2 ) {
  if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));

if ( rpri != rprj ) return rpri > rprj;
if ( ppi != ppj ) return ppi < ppj;
return pidi.x < pidj.x;

void CudaComputeNonbonded::updateComputes() {
  Lattice lattice = patches[0].patch->flags.lattice;
  std::stable_sort(computes.begin(), computes.end(), so);
  std::stable_sort(computes.begin(), computes.end(), sorp);
  for (int i=0;i < computes.size();i++) {
    cudaComputes[i].patchInd.x = computes[i].patchInd[0];
    cudaComputes[i].patchInd.y = computes[i].patchInd[1];
    cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
    cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
    cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] cudaComputes;

return ( li[1] < lj[1] );
void CudaComputeNonbonded::buildExclusions() {
#ifdef MEM_OPT_VERSION
  int natoms = mol->exclSigPoolSize;
  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
  exclusionsByAtom = new int2[natoms];
  for ( int i=0; i<natoms; ++i ) {
#ifdef MEM_OPT_VERSION
    for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
    int n = mol_list[0] + 1;
    for ( int j=1; j<n; ++j ) {
      curList.add(mol_list[j] - i);
    for ( j=0; j<unique_lists.size(); ++j ) {
      if ( n != unique_lists[j][0] ) continue;
      for ( k=0; k<n; ++k ) {
        if ( unique_lists[j][k+3] != curList[k] ) break;
      if ( k == n ) break;
    if ( j == unique_lists.size() ) {
      maxdiff = -1 * curList[0];
      if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
      for ( int k=0; k<n; ++k ) {
        list[k+3] = curList[k];
      unique_lists.add(list);
    listsByAtom[i] = unique_lists[j];

  long int totalbits = 0;
  int nlists = unique_lists.size();
  for ( int j=0; j<nlists; ++j ) {
    int32 *list = unique_lists[j];
    int maxdiff = list[1];
    list[2] = totalbits + maxdiff;
    totalbits += 2*maxdiff + 1;
  for ( int i=0; i<natoms; ++i ) {
    exclusionsByAtom[i].x = listsByAtom[i][1];
    exclusionsByAtom[i].y = listsByAtom[i][2];
  delete [] listsByAtom;

  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
  long int bytesneeded = totalbits / 8;
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
      unique_lists.size(), bytesneeded);
  if ( bytesneeded > bytesavail ) {
    sprintf(errmsg, "Found %d unique exclusion lists needing %ld bytes "
      "but only %ld bytes can be addressed with 32-bit int.",
      unique_lists.size(), bytesneeded, bytesavail);

#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))

  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
  memset(exclusion_bits, 0, totalbits/8);
  for ( int i=0; i<unique_lists.size(); ++i ) {
    base += unique_lists[i][1];
    if ( unique_lists[i][2] != (int32)base ) {
      NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
    int n = unique_lists[i][0];
    for ( int j=0; j<n; ++j ) {
      SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
    base += unique_lists[i][1] + 1;

  int numExclusions = totalbits/32;
  delete [] exclusion_bits;
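// Exclusion encoding: each unique exclusion list owns a window of 2*maxdiff+1 bits in
// exclusion_bits, with list[2] holding the bit index of the self term (diff == 0);
// exclusionsByAtom[i] stores {x = maxdiff, y = that start index}. A matching lookup for
// an atom pair (i, j) -- written here only as an illustration mirroring SET_EXCL -- is:
//   excluded = exclusion_bits[(start + (j - i)) >> 5] & (1 << ((start + (j - i)) & 31));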
const float cutoffInv = 1.0f / cutoff;
const float cutoff2Inv = 1.0f / cutoff2;
const float scutoff2Inv = 1.0f / scutoff2;
const float slowScale = ((float) simParams->fullElectFrequency) / simParams->nonbondedFrequency;

c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
c.lj_1 = scutoff_denom * 2.0f;
c.lj_2 = scutoff_denom * -12.0f;
c.lj_3 = 12.0f * scutoff_denom * scutoff2;

c.e_0 = cutoff2Inv * cutoffInv;
c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);
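// These constants presumably feed the table-free (analytic) force path through the
// CudaNBConstants argument of nonbondedForce(): lj_0..lj_3 are the van der Waals
// switching-function coefficients built from scutoff2 and cutoff2, e_0 is 1/cutoff^3, and
// e_0_slow carries the extra (1 - slowScale) factor, with
// slowScale = fullElectFrequency / nonbondedFrequency.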
bool doTable = simParams->useCUDANonbondedForceTable;
doTable = doTable || doSlow;
doTable = doTable || (doSlow && doVirial);