#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define __thread __declspec(thread)
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

  Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
  tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {

  exclusionsByAtom = NULL;
  exclIndexMaxDiff = NULL;
  exclIndexMaxDiffSize = 0;
  atomsChangedIn = true;
  computesChanged = true;
  forceDoneEventRecord = false;
  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
  if (patches.size() > 0) {
    deallocate_host<VirialEnergy>(&h_virialEnergy);
    deallocate_device<VirialEnergy>(&d_virialEnergy);
    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
void CudaComputeNonbonded::unregisterBox(int i) {
  if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
  if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
  if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
  if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
  if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
  if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
  if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    unregisterBox(rankPatches[CkMyRank()][i]);

  computesChanged = true;
  addCompute(cid, pid, pid, 0.);
  computesChanged = true;
  offset.x += (t1%3-1) - (t2%3-1);
  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
  offset.z += (t1/9-1) - (t2/9-1);
  addCompute(cid, pid[0], pid[1], offset);
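// A minimal sketch of the periodic-image decoding applied to t1 and t2 above,
// assuming each trans index t is in [0,26] and encodes a 3x3x3 image shift
// (x varies fastest, then y, then z); decodeImageOffset is a hypothetical helper:
//
//   int3 decodeImageOffset(int t) {
//     return make_int3(t % 3 - 1, (t / 3) % 3 - 1, t / 9 - 1);
//   }
//
// The offset accumulated above is then decodeImageOffset(t1) - decodeImageOffset(t2),
// i.e. the relative image shift between the two patches of the pair.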
void CudaComputeNonbonded::addPatch(PatchID pid) {
  patches.push_back(PatchRecord(pid));

  computes.push_back(cr);

void CudaComputeNonbonded::updatePatch(int i) {
  int numAtoms = patches[i].patch->getNumAtoms();
  int numFreeAtoms = numAtoms;
  const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
  for (int j=0; j < numAtoms; ++j) {
    if ( aExt[j].atomFixed ) --numFreeAtoms;
  patches[i].numAtoms = numAtoms;
  patches[i].numFreeAtoms = numFreeAtoms;

int CudaComputeNonbonded::findPid(PatchID pid) {
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    int j = rankPatches[CkMyRank()][i];
    if (patches[j].patchID == pid) return j;

  int i = findPid(pid);
  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
void CudaComputeNonbonded::assignPatch(int i) {
  PatchID pid = patches[i].patchID;
  patch = patchMap->patch(pid);
  patches[i].patch = patch;
  if (patches[i].patch == NULL) {
    NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
  patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
  patches[i].forceBox = patches[i].patch->registerForceDeposit(this);
  patches[i].intRadBox = patches[i].patch->registerIntRadPickup(this);
  patches[i].psiSumBox = patches[i].patch->registerPsiSumDeposit(this);
  patches[i].bornRadBox = patches[i].patch->registerBornRadPickup(this);
  patches[i].dEdaSumBox = patches[i].patch->registerDEdaSumDeposit(this);
  patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
  if (patches[i].pe != CkMyPe()) {
    NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
  patches[i].pe = CkMyPe();
  patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
  patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );

  if ( ppi != ppj ) return ppi < ppj;
  return pidi.x < pidj.x;
  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
  for (int k=0; k < rankPatches[CkMyRank()].size(); ++k) {
    int i = rankPatches[CkMyRank()][k];
    int pid = patches[i].patchID;
    int homePe = patchMap->node(pid);
    if ( CkNodeOf(homePe) == CkMyNode() ) {
      homePatchByRank[CkRankOf(homePe)].add(pid_index);
  for (int i=0; i<CkMyNodeSize(); ++i) {
    std::sort(homePatchByRank[i].begin(), homePatchByRank[i].end(), so);
    int masterBoost = ( CkMyRank() == i ? 2 : 0 );
    for (int j=0; j<homePatchByRank[i].size(); ++j) {
      int index = homePatchByRank[i][j].y;
      patches[index].reversePriorityRankInPe = j + masterBoost;
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    assignPatch(rankPatches[CkMyRank()][i]);
  for (int i=0;i < CkMyNodeSize();i++) {
    if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;

  proxyPatchPes.clear();
  for (int i=0;i < CkMyNodeSize();i++) {
    int pe = CkNodeFirst(CkMyNode()) + i;
    proxyPatchPes.push_back(pe);

  std::sort(patches.begin(), patches.end());
  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
  patches.erase(last, patches.end());
  computeMgr = computeMgrIn;
  std::map<PatchID, int> pidMap;
  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
  int numPesOnNodeSharingDevice = 0;
  int masterIndex = -1;
  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
  if ( CkNodeOf(pe) == CkMyNode() ) {
    pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
  std::vector<int> count(patches.size(), 0);
  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
  std::vector<int> rankpcount(CkMyNodeSize(), 0);
  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
  int unassignedpatches = patches.size();
  for (int i=0;i < patches.size(); ++i) {
  for (int i=0;i < patches.size(); ++i) {
    int pid = patches[i].patchID;
    int homePe = patchMap->node(pid);
    for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
      int pe = pesOnNodeSharingDevice[j];
      if ( pe == homePe ) {
        table[i*numPesOnNodeSharingDevice + j] = 1;
    if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
      patches[i].pe = homePe;
      rankpcount[CkRankOf(homePe)] += 1;
  for (int i=0; i < patches.size(); ++i) {
    int pid = patches[i].patchID;
    if ( patches[i].pe != -1 ) continue;
    for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
      if ( table[i*numPesOnNodeSharingDevice + j] ) {
    patches[i].pe = pesOnNodeSharingDevice[lastj];
  while ( unassignedpatches ) {
    for (i=0;i < patches.size(); ++i) {
      if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
      int pid = patches[i].patchID;
      if ( patches[i].pe != -1 ) continue;
      patches[i].pe = pesOnNodeSharingDevice[assignj];
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
    if ( i < patches.size() ) continue;
    for (i=0;i < patches.size(); ++i) {
      int pid = patches[i].patchID;
      if ( patches[i].pe != -1 ) continue;
      if ( count[i] ) continue;
      patches[i].pe = pesOnNodeSharingDevice[assignj];
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
    if ( i < patches.size() ) continue;
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;

  rankPatches.resize(CkMyNodeSize());
  for (int i=0; i < patches.size(); ++i) {
    rankPatches[CkRankOf(patches[i].pe)].push_back(i);
    pidMap[patches[i].patchID] = i;
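// In outline: patches whose home PE shares this device are pinned to that PE
// first, and the remainder are dealt out round-robin over the device's PEs,
// preferring PEs marked in the upstream-proxy table. The dealing loop above is
// the usual round-robin pattern; a minimal standalone sketch of the same idea
// (names hypothetical):
//
//   int assignj = 0;
//   for (int i = 0; i < numItems; ++i) {
//     owner[i] = workers[assignj];
//     if (++assignj == numWorkers) assignj = 0;
//   }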
  rankPatches.resize(CkMyNodeSize());
  for (int i=0;i < CkMyNodeSize();i++) {
    int pe = CkNodeFirst(CkMyNode()) + i;
  std::vector<int> proxyPatchPes;
  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
  std::vector<int> pesToAvoid;
  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
    if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
  for (int i=0;i < pesToAvoid.size();i++) {
    int pe = pesToAvoid[i];
    peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
  peProxyPatchCounter[CkRankOf(masterPe)] = 2;
  for (int i=0;i < patches.size();i++) {
    PatchID pid = patches[i].patchID;
    if (proxyPatchPes.size() == 0) {
      int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
      pe = CkNodeFirst(CkMyNode()) + rank;
      peProxyPatchCounter[rank]++;
      int minCounter = (1 << 30);
      for (int j=0;j < proxyPatchPes.size();j++) {
        if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
          pe = proxyPatchPes[j];
          minCounter = peProxyPatchCounter[CkRankOf(pe)];
        NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
      peProxyPatchCounter[CkRankOf(pe)]++;
    } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
      int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
      pe = CkNodeFirst(CkMyNode()) + rank;
      peProxyPatchCounter[rank]++;
    if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize())
      NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
    rankPatches[CkRankOf(pe)].push_back(i);
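// The PE choice above is a least-loaded pick: std::min_element over the
// per-rank proxy counters returns an iterator, and subtracting begin()
// converts it to a rank index. An equivalent standalone sketch:
//
//   std::vector<int> counter = {3, 1, 2};
//   int rank = std::min_element(counter.begin(), counter.end()) - counter.begin();
//   // rank == 1, the least-loaded entry; its counter is then incremented
//
// Seeding pesToAvoid entries with (1 << 20) makes those ranks lose every
// min_element comparison without removing them from the vector.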
  delete [] rankHomePatchIDs;
  for (int i=0;i < computes.size();i++) {
    computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
    computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
  for (int i=0;i < CkMyNodeSize();i++) {
    if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);

  if (patches.size() > 0) {
    allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
    allocate_host<VirialEnergy>(&h_virialEnergy, 1);
    allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
  cudaDeviceProp props;
  cudaCheck(cudaGetDeviceProperties(&props, deviceID));
  maxShmemPerBlock = props.sharedMemPerBlock;
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  int priority = (doStreaming) ? leastPriority : greatestPriority;
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority));
  cudaCheck(cudaEventCreate(&forceDoneEvent));
  lock = CmiCreateLock();
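// Note on the priority choice above: in the CUDA runtime, numerically lower
// values mean higher priority, and cudaDeviceGetStreamPriorityRange() returns
// leastPriority >= greatestPriority. Streaming mode therefore takes the lowest
// priority, letting other device work overtake the per-patch force kernels,
// while the non-streaming path takes the highest.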
  atomsChangedIn = true;

void CudaComputeNonbonded::updatePatches() {
  for (int i=0;i < patches.size();i++) {
    patches[i].atomStart = atomStart;
    int numAtoms = patches[i].numAtoms;
    int numTiles = ((numAtoms-1)/WARPSIZE+1);
    maxTileListLen = std::max(maxTileListLen, numTiles);
  atomStorageSize = atomStart;
  if (maxTileListLen >= 65536) {
    NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile list (65536) exceeded");
void CudaComputeNonbonded::skipPatch(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
  Flags &flags = patches[i].patch->flags;
  patches[i].positionBox->skip();
  patches[i].forceBox->skip();
  patches[i].psiSumBox->skip();
  patches[i].intRadBox->skip();
  patches[i].bornRadBox->skip();
  patches[i].dEdaSumBox->skip();
  patches[i].dHdrPrefixBox->skip();

  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    skipPatch(rankPatches[CkMyRank()][i]);
  patchesCounter -= rankPatches[CkMyRank()].size();
  if (patchesCounter == 0) {

void CudaComputeNonbonded::skip() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
  if (patches.size() == 0) return;
void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
  for (int i=0;i < patches.size();++i) {
    PatchRecord &pr = patches[i];
    float maxMove = pr.patch->flags.maxAtomMovement;
    if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
    float maxTol = pr.patch->flags.pairlistTolerance;
    if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
  c->updateVdwTypesExclSubset(first, last);

void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
  for (int i=first;i <= last;i++) {
    PatchRecord &pr = patches[i];
    int start = pr.atomStart;
    int numAtoms = pr.numAtoms;
    const CompAtom *compAtom = pr.compAtom;
    const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
    int2* exclp = exclIndexMaxDiff + start;
    int* aip = atomIndex + start;
    for (int k=0;k < numAtoms; ++k) {
      vdwTypes[start + k] = compAtom[j].vdwType;
      aip[k] = compAtomExt[j].id;
#ifdef MEM_OPT_VERSION
      exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
      exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
#else // ! MEM_OPT_VERSION
      exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
      exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
#endif // MEM_OPT_VERSION
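// Note the swap above: exclusionsByAtom[] stores {x = maxDiff, y = bit-table
// offset} (see buildExclusions() below), while the per-atom exclIndexMaxDiff
// array consumed by the kernel stores {x = bit-table offset, y = maxDiff},
// matching the array's name.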
void CudaComputeNonbonded::updateVdwTypesExcl() {
  reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
  reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
  reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
#if CMK_SMP && USE_CKLOOP
  if (useCkLoop >= 1) {
    CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
  updateVdwTypesExclSubset(0, patches.size()-1);
  nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
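// CkLoop_Parallelize() above splits the inclusive patch range
// [0, patches.size()-1] across the node's PEs and invokes
// updateVdwTypesExclLoop on each sub-range; the serial fall-through calls
// updateVdwTypesExclSubset on the full range, so both paths fill the same
// host arrays before the kernel upload.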
inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
  c->copyAtomsSubset(first, last);

void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
  for (int i=first;i <= last;++i) {
    PatchRecord &pr = patches[i];
    int numAtoms = pr.numAtoms;
    int start = pr.atomStart;
    const CudaAtom *src = pr.patch->getCudaAtomList();
    memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
    CudaAtom lastAtom = src[numAtoms-1];
    for (int j=numAtoms;j < numAtomsAlign;j++) {
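// Padding sketch: each patch's atom block is rounded up to a tile boundary and
// the pad entries are filled with copies of the last real atom, so the force
// kernel never reads uninitialized coordinates. A minimal standalone version
// of the same idea (assuming numAtomsAlign is the rounded-up count):
//
//   int numAtomsAlign = ((numAtoms - 1) / WARPSIZE + 1) * WARPSIZE;
//   for (int j = numAtoms; j < numAtomsAlign; j++) dst[start + j] = lastAtom;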
void CudaComputeNonbonded::copyGBISphase(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
  PatchRecord &pr = patches[i];
  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
  float *intRad0 = intRad0H + pr.atomStart;
  float *intRadS = intRadSH + pr.atomStart;
  for (int k=0;k < pr.numAtoms;++k) {
    intRad0[k] = pr.intRad[2*j+0];
    intRadS[k] = pr.intRad[2*j+1];
  float *bornRad = bornRadH + pr.atomStart;
  for (int k=0; k < pr.numAtoms; ++k) {
    bornRad[k] = pr.bornRad[j];
  float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
  for (int k=0; k < pr.numAtoms; ++k) {
    dHdrPrefix[k] = pr.dHdrPrefix[j];
void CudaComputeNonbonded::openBox(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
  patches[i].compAtom = patches[i].positionBox->open();
  copyAtomsSubset(i, i);
  patches[i].intRad = patches[i].intRadBox->open();
  patches[i].psiSum = patches[i].psiSumBox->open();
  patches[i].bornRad = patches[i].bornRadBox->open();
  patches[i].dEdaSum = patches[i].dEdaSumBox->open();
  patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();

  if (masterPe != CkMyPe())
    NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");

  if (rankPatches[CkMyRank()].size() == 0)
    NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
    openBox(rankPatches[CkMyRank()][i]);
  patchesCounter -= rankPatches[CkMyRank()].size();
  if (patchesCounter == 0) {
void CudaComputeNonbonded::reallocateArrays() {
  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
  reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
  reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
  reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
  reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);
  reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
  reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
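// The 1.4f / 1.2f arguments above are over-allocation factors: as I read the
// reallocate_host / reallocate_device helpers, a buffer is grown only when the
// requested size exceeds its current capacity, and then to factor * size, so
// small step-to-step changes in atom count do not trigger a reallocation
// every step.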
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;
  if (patches.size() == 0) return;
  Flags &flags = patches[0].patch->flags;
  if ( computesChanged ) {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
  beforeForceCompute = CkWallTimer();
  if ( atomsChanged || computesChanged ) {
    pairlistsValid = false;
    pairlistTolerance = 0.0f;
  float maxAtomMovement = 0.0f;
  float maxPatchTolerance = 0.0f;
  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
  Flags &flags = patches[0].patch->flags;
  savePairlists = false;
  usePairlists = false;
  savePairlists = true;
  usePairlists = true;
  if ( ! pairlistsValid ||
       ( 2. * maxAtomMovement > pairlistTolerance ) ) {
  usePairlists = true;
  if ( ! usePairlists ) {
    pairlistsValid = false;
  if ( savePairlists ) {
    pairlistsValid = true;
    pairlistTolerance = 2. * maxPatchTolerance;
    plcutoff += pairlistTolerance;
  plcutoff2 = plcutoff * plcutoff;
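// Pairlist safety margin, in brief: when pairlists are saved, the list cutoff
// is extended to cutoff + 2 * maxPatchTolerance. A saved list stays valid as
// long as 2 * maxAtomMovement <= pairlistTolerance, since two atoms each
// moving maxAtomMovement toward each other close the gap at twice that rate;
// once the test above fails, the lists must be rebuilt.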
  patchReadyQueueNext = 0;
  for (int i=0;i < numEmptyPatches;i++) {
    patchReadyQueue[i] = emptyPatches[i];
  if (patchReadyQueueLen != patches.size())
    NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
  forceDoneSetCallback();

  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
      atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
      d_forces, d_forcesSlow, d_virialEnergy, stream);
    copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
  forceDoneSetCallback();
void CudaComputeNonbonded::doGBISphase1() {
  GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
  Lattice lattice = patches[0].patch->flags.lattice;
  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
  GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
void CudaComputeNonbonded::doGBISphase2() {
  Lattice lattice = patches[0].patch->flags.lattice;
  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
  GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
  GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
    d_forces, dEdaSumH, stream);
void CudaComputeNonbonded::doGBISphase3() {
  Lattice lattice = patches[0].patch->flags.lattice;
  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
  GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
void CudaComputeNonbonded::doForce() {
  Lattice lattice = patches[0].patch->flags.lattice;
  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
  bool doPairlist = (savePairlists || !usePairlists);
  tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
    cudaPatches, (const float4*)atoms, plcutoff2, maxShmemPerBlock, stream);
  updateVdwTypesExcl();
  beforeForceCompute = CkWallTimer();
  nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, doPairlist,
    doEnergy, doVirial, doSlow, lata, latb, latc,
    (const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow,
  traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
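// Flow of doForce(), in brief: when a pairlist rebuild is needed (doPairlist),
// the tile lists are rebuilt on the GPU from the current coordinates
// (buildTileLists) and the per-atom VdW types / exclusion indices are
// re-uploaded; nonbondedForce then runs the force kernel over those tile
// lists, writing d_forces / d_forcesSlow and, with streaming enabled, the
// mapped host buffers h_forces / h_forcesSlow directly.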
int CudaComputeNonbonded::calcNumTileLists() {
  int numTileLists = 0;
  for (int i=0;i < computes.size();i++) {
    int pi1 = computes[i].patchInd[0];
    int numAtoms1 = patches[pi1].numAtoms;
    int numTiles1 = (numAtoms1-1)/WARPSIZE+1;
    numTileLists += numTiles1;
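// Roughly speaking, each compute (patch pair) contributes one tile list per
// tile of its first patch: a list is anchored on a WARPSIZE-wide row of atoms
// from patch 1 and enumerates the tiles of patch 2 it interacts with, so the
// total here is the sum over computes of ceil(numAtoms1 / WARPSIZE).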
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
  if (doStreaming && (doVirial || doEnergy)) {
    if (!forceDoneEventRecord)
      NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
    cudaCheck(cudaEventSynchronize(forceDoneEvent));
    forceDoneEventRecord = false;
  virialTensor.xx = h_virialEnergy->virial[0];
  virialTensor.xy = h_virialEnergy->virial[1];
  virialTensor.xz = h_virialEnergy->virial[2];
  virialTensor.yx = h_virialEnergy->virial[3];
  virialTensor.yy = h_virialEnergy->virial[4];
  virialTensor.yz = h_virialEnergy->virial[5];
  virialTensor.zx = h_virialEnergy->virial[6];
  virialTensor.zy = h_virialEnergy->virial[7];
  virialTensor.zz = h_virialEnergy->virial[8];
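// The 3x3 virial tensor comes back from the device as a flat row-major
// 9-element array, unpacked above as virial[3*row + col]:
//
//   [ xx xy xz ]   [ virial[0] virial[1] virial[2] ]
//   [ yx yy yz ] = [ virial[3] virial[4] virial[5] ]
//   [ zx zy zz ]   [ virial[6] virial[7] virial[8] ]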
  computesChanged = false;
void CudaComputeNonbonded::finishPatch(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
#if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
  sprintf(buf, "%s: %d", NamdProfileEventStr[NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES], i);
  PatchRecord &pr = patches[i];
  pr.results = pr.forceBox->open();
  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
  int atomStart = pr.atomStart;
  int numAtoms = pr.numAtoms;
  float4 *af = h_forces + atomStart;
  float4 *af_slow = h_forcesSlow + atomStart;
  for (int k=0; k<numAtoms; ++k) {
    f_slow[j].x += af_slow[k].x;
    f_slow[j].y += af_slow[k].y;
    f_slow[j].z += af_slow[k].z;
  pr.positionBox->close(&(pr.compAtom));
  pr.forceBox->close(&(pr.results));
#if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_CUDA_FINISH_PATCHES);
void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
  if (patchSet.size() == 0)
    NAMD_bug("CudaComputeNonbonded::finishSetOfPatchesOnPe, empty patch set");
  for (int i=0;i < patchSet.size();i++) {
    finishGBISPhase(patchSet[i]);
  if (!simParams->GBISOn || gbisPhaseSave == 3) {
    for (int i=0;i < patchSet.size();i++) {
      finishPatch(patchSet[i]);
  patchesCounter -= patchSet.size();
  if (patchesCounter == 0) {
  if (!simParams->GBISOn || gbisPhaseSave == 3) {

  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);

  std::vector<int> v(1, i);
  finishSetOfPatchesOnPe(v);
void CudaComputeNonbonded::finishPatches() {

void CudaComputeNonbonded::finishGBISPhase(int i) {
  if (CkMyPe() != patches[i].pe)
    NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
  PatchRecord &pr = patches[i];
  const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
  int atomStart = pr.atomStart;
  GBReal *psiSumMaster = psiSumH + atomStart;
  for (int k=0; k<pr.numAtoms; ++k) {
    pr.psiSum[j] += psiSumMaster[k];
  pr.psiSumBox->close(&(pr.psiSum));
  GBReal *dEdaSumMaster = dEdaSumH + atomStart;
  for (int k=0; k<pr.numAtoms; ++k) {
    pr.dEdaSum[j] += dEdaSumMaster[k];
  pr.dEdaSumBox->close(&(pr.dEdaSum));
  pr.intRadBox->close(&(pr.intRad));
  pr.bornRadBox->close(&(pr.bornRad));
  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix));

void CudaComputeNonbonded::finishTimers() {

void CudaComputeNonbonded::reSortTileLists() {
void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
  if (CkMyPe() != c->masterPe)
    NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
  if (c->doStreaming) {
    while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
      c->patchReadyQueue[c->patchReadyQueueNext] = -1;
      c->patchReadyQueueNext++;
      if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
        if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
          c->reSortTileLists();
          c->reSortDone = true;
        c->forceDoneSetCallback();
      int pe = c->patches[patchInd].pe;
      PatchID patchID = c->patches[patchInd].patchID;
    if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
  if (!c->forceDoneEventRecord)
    NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {
    c->forceDoneEventRecord = false;
    if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
      c->reSortTileLists();
      c->reSortDone = true;
    c->forceDoneSetCallback();
  } else if (err != cudaErrorNotReady) {
    sprintf(errmsg, "in CudaComputeNonbonded::forceDoneCheck after polling %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
  if (c->checkCount >= 1000000) {
    sprintf(errmsg, "CudaComputeNonbonded::forceDoneCheck polled %d times over %f s",
      c->checkCount, walltime - c->beforeForceCompute);
  CcdCallFnAfter(forceDoneCheck, arg, 0.1);

void CudaComputeNonbonded::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
  beforeForceCompute = CkWallTimer();
  if (!doStreaming || doVirial || doEnergy) {
    cudaCheck(cudaEventRecord(forceDoneEvent, stream));
    forceDoneEventRecord = true;
  CcdCallFnAfter(forceDoneCheck, this, 0.1);
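// Completion is detected by polling rather than blocking: forceDoneSetCallback
// records forceDoneEvent on the stream and schedules forceDoneCheck via
// CcdCallFnAfter(..., 0.1), i.e. 0.1 ms later. Each check calls
// cudaEventQuery, which returns cudaSuccess once the GPU work has finished,
// cudaErrorNotReady while it is still running (re-schedule and keep counting),
// and anything else is a genuine CUDA error. This keeps the master PE free to
// process Charm++ messages while the kernels run.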
  if ( a == b ) return 0;
  for (int bit = 1; bit; bit *= 2) {
    if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
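// sortop_bitreverse above orders two integers by comparing bits from least to
// most significant, i.e. by their bit-reversed values: 4 (100b) sorts before
// 3 (011b) because reversed they read 001b and 110b. Presumably this
// interleaves patch indices so that consecutive work items land far apart
// in the spatial decomposition.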
  if ( rpri != rprj ) return rpri > rprj;
  if ( ppi != ppj ) return ppi < ppj;
  return pidi.x < pidj.x;
void CudaComputeNonbonded::updateComputes() {
  Lattice lattice = patches[0].patch->flags.lattice;
  std::stable_sort(computes.begin(), computes.end(), so);
  std::stable_sort(computes.begin(), computes.end(), sorp);
  for (int i=0;i < computes.size();i++) {
    cudaComputes[i].patchInd.x = computes[i].patchInd[0];
    cudaComputes[i].patchInd.y = computes[i].patchInd[1];
    cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
    cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
    cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
  tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] cudaComputes;

  return ( li[1] < lj[1] );
void CudaComputeNonbonded::buildExclusions() {
#ifdef MEM_OPT_VERSION
  int natoms = mol->exclSigPoolSize;
  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
  exclusionsByAtom = new int2[natoms];
  for (int i=0; i<natoms; ++i) {
#ifdef MEM_OPT_VERSION
    for (int j=0; j<n; ++j) { curList.add(sig->fullOffset[j]); }
    int n = mol_list[0] + 1;
    for (int j=1; j<n; ++j) {
      curList.add(mol_list[j] - i);
    for ( j=0; j<unique_lists.size(); ++j ) {
      if ( n != unique_lists[j][0] ) continue;
      for ( k=0; k<n; ++k ) {
        if ( unique_lists[j][k+3] != curList[k] ) break;
      if ( k == n ) break;
    if ( j == unique_lists.size() ) {
      maxdiff = -1 * curList[0];
      if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
      for (int k=0; k<n; ++k) {
        list[k+3] = curList[k];
      unique_lists.add(list);
    listsByAtom[i] = unique_lists[j];
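// Deduplication above: each atom's exclusion list is expressed as offsets
// relative to the atom's own index, so identical neighborhoods (e.g. every
// equivalent group along a chain) collapse to one shared list. The linear
// scan over unique_lists either finds a matching list or appends a new one,
// and listsByAtom[i] points at the shared copy.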
  long int totalbits = 0;
  int nlists = unique_lists.size();
  for (int j=0; j<nlists; ++j) {
    int32 *list = unique_lists[j];
    int maxdiff = list[1];
    list[2] = totalbits + maxdiff;
    totalbits += 2*maxdiff + 1;
  for (int i=0; i<natoms; ++i) {
    exclusionsByAtom[i].x = listsByAtom[i][1];
    exclusionsByAtom[i].y = listsByAtom[i][2];
  delete [] listsByAtom;
  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
  long int bytesneeded = totalbits / 8;
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
      unique_lists.size(), bytesneeded);
  if ( bytesneeded > bytesavail ) {
    sprintf(errmsg, "Found %d unique exclusion lists needing %ld bytes "
      "but only %ld bytes can be addressed with 32-bit int.",
      unique_lists.size(), bytesneeded, bytesavail);

#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
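// SET_EXCL sets bit (BASE+DIFF) in a packed array of 32-bit words: ">> 5"
// picks the word (divide by 32) and "& 31" the bit within it. Each unique
// list occupies 2*maxdiff+1 bits centered at its stored base
// (list[2] = start + maxdiff), so an exclusion between atoms i and j is
// tested by probing bit base_i + (j - i), with |j - i| <= maxdiff.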
  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
  memset(exclusion_bits, 0, totalbits/8);
  for (int i=0; i<unique_lists.size(); ++i) {
    base += unique_lists[i][1];
    if ( unique_lists[i][2] != (int32)base ) {
      NAMD_bug("CudaComputeNonbonded::buildExclusions, base != stored");
    int n = unique_lists[i][0];
    for (int j=0; j<n; ++j) {
      SET_EXCL(exclusion_bits, base, unique_lists[i][j+3]);
    base += unique_lists[i][1] + 1;
  int numExclusions = totalbits/32;
  delete [] exclusion_bits;