20 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
22 #define __thread __declspec(thread)
27 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
28 int leastPriority, greatestPriority;
29 cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
30 cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority));
44 if (
atom != NULL) dealloc_((
void *)
atom);
52 void* alloc_(
const int size) {
59 void dealloc_(
void *p) {
72 if (
atom != NULL) dealloc_((
void *)
atom);
80 void* alloc_(
const int size) {
81 return (
void *)(
new char[size]);
85 void dealloc_(
void *p) {
92 for (
int p=0;p < 10;p++) {
94 pencilCapacity[p] = 0;
98 for (
int p=0;p < 10;p++) {
100 pencilCapacity[p] = 0;
104 for (
int p=0;p < 10;p++) {
105 if (pencil[p] != NULL)
delete [] pencil[p];
116 const int pencilIndexY,
const int pencilIndexZ,
const int ylo,
const int yhi,
const int zlo,
const int zhi) {
119 for (
int p=0;p < 10;p++) {
120 if (pencil[p] != NULL && pencilCapacity[p] < numAtoms) {
123 pencilCapacity[p] = 0;
125 if (pencil[p] == NULL) {
126 int newSize = (int)(numAtoms*1.5);
127 pencil[p] =
new int[newSize];
128 pencilCapacity[p] = newSize;
133 const float recip11 = lattice.
a_r().
x;
134 const float recip22 = lattice.
b_r().
y;
135 const float recip33 = lattice.
c_r().
z;
136 const int order1 = pmeGrid.
order - 1;
137 const int K1 = pmeGrid.
K1;
138 const int K2 = pmeGrid.
K2;
139 const int K3 = pmeGrid.
K3;
140 const int yBlocks = pmeGrid.
yBlocks;
141 const int zBlocks = pmeGrid.
zBlocks;
143 for (
int i=0;i < numAtoms;i++) {
151 if (y0 < 0 || y0 >= K2 || z0 < 0 || z0 >= K3) {
153 pencil[9][pencilSize[9]++] = i;
163 for (
int j=0;j < 4;j++) {
165 int py =
getPencilIndexY(pmeGrid, (y0 + (j%2)*order1) % K2) - pencilIndexY;
166 if (py < ylo) py += yBlocks;
167 if (py > yhi) py -= yBlocks;
169 int pz =
getPencilIndexZ(pmeGrid, (z0 + (j/2)*order1) % K3) - pencilIndexZ;
170 if (pz < zlo) pz += zBlocks;
171 if (pz > zhi) pz -= zBlocks;
173 if (py < ylo || py > yhi || pz < zlo || pz > zhi) {
175 pencil[9][pencilSize[9]++] = i;
181 plist[j] = (py-ylo) + (pz-zlo)*3;
185 for (
int j=0;j < 4;j++) {
189 if (!(occupied & pbit)) {
190 pencil[p][pencilSize[p]++] = i;
208 numNodesContributed = 0;
217 NAMD_bug(
"ComputePmeCUDAMgr cannot be migrated");
220 numNodesContributed = 0;
228 for (
int i=0;i < extraDevices.size();i++) {
229 cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
250 int patch_y0 = floor((miny+0.5)*pmeGrid.
K2);
251 int patch_y1 = floor((maxy+0.5)*pmeGrid.
K2)-1;
252 int patch_z0 = floor((minz+0.5)*pmeGrid.
K3);
253 int patch_z1 = floor((maxz+0.5)*pmeGrid.
K3)-1;
255 if (patch_y0 < 0 || patch_y1 >= pmeGrid.
K2 || patch_z0 < 0 || patch_z1 >= pmeGrid.
K3) {
256 NAMD_bug(
"ComputePmeCUDAMgr::getHomePencil, patch bounds are outside grid bounds");
262 for (
int iz=0;iz < pmeGrid.
zBlocks;iz++) {
263 for (
int iy=0;iy < pmeGrid.
yBlocks;iy++) {
264 int pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1;
266 pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1);
268 if (pencil_y1 - pencil_y0 < pmeGrid.
order || pencil_z1 - pencil_z0 < pmeGrid.
order)
269 NAMD_bug(
"ComputePmeCUDAMgr::getHomePencil, pencil size must be >= PMEInterpOrder");
271 int y0 = std::max(patch_y0, pencil_y0);
272 int y1 = std::min(patch_y1, pencil_y1);
273 int z0 = std::max(patch_z0, pencil_z0);
274 int z1 = std::min(patch_z1, pencil_z1);
276 int overlap = (y1-y0 > 0 && z1-z0 > 0) ? (y1-y0)*(z1-z0) : -1;
278 if (overlap > maxOverlap) {
279 maxOverlap = overlap;
286 if (homey == -1 || homez == -1)
287 NAMD_bug(
"ComputePmeCUDAMgr::getHomePencil, home pencil not found");
293 void ComputePmeCUDAMgr::restrictToMaxPMEPencils() {
301 BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
302 BigReal marginc = 0.5 * ( patchdim - cutoff ) / sysdimc;
308 while (pid < numPatches) {
317 int min2 = ((int) floor(pmeGrid.
K2 * (miny+0.5 - marginb)));
318 int max2 = ((int) floor(pmeGrid.
K2 * (maxy+0.5 + marginb))) + (pmeGrid.
order - 1);
320 if (min2 < 0) min2 += pmeGrid.
K2;
321 if (max2 >= pmeGrid.
K2) max2 -= pmeGrid.
K2;
326 int dmin2pi = homey - min2pi;
327 if (dmin2pi < 0) dmin2pi += pmeGrid.
yBlocks;
329 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin2pi");
333 if (pmeGrid.
yBlocks <= 0)
break;
336 int dmax2pi = max2pi - homey;
337 if (dmax2pi < 0) dmax2pi += pmeGrid.
yBlocks;
339 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax2pi");
343 if (pmeGrid.
yBlocks <= 0)
break;
353 int min3 = ((int) floor(pmeGrid.
K3 * (minz+0.5 - marginc)));
354 int max3 = ((int) floor(pmeGrid.
K3 * (maxz+0.5 + marginc))) + (pmeGrid.
order - 1);
356 if (min3 < 0) min3 += pmeGrid.
K3;
357 if (max3 >= pmeGrid.
K3) max3 -= pmeGrid.
K3;
362 int dmin3pi = homez - min3pi;
363 if (dmin3pi < 0) dmin3pi += pmeGrid.
zBlocks;
365 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin3pi");
369 if (pmeGrid.
zBlocks <= 0)
break;
372 int dmax3pi = max3pi - homez;
373 if (dmax3pi < 0) dmax3pi += pmeGrid.
zBlocks;
375 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax3pi");
379 if (pmeGrid.
zBlocks <= 0)
break;
391 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, zero PME pencils found");
394 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, unable to restrict number of PME pencils");
407 pmeGrid.
dim2 = pmeGrid.
K2;
408 pmeGrid.
dim3 = 2 * (pmeGrid.
K3/2 + 1);
414 int numDeviceTot = CkNumNodes() * numDevicesTmp;
416 int numDeviceToUse = std::max(1, numDeviceTot/4);
418 if (numDeviceToUse < 4) {
424 pmeGrid.
yBlocks = (int)sqrt((
double)numDeviceToUse);
434 restrictToMaxPMEPencils();
439 if (pmeGrid.
xBlocks > numDeviceTot) pmeGrid.
xBlocks = numDeviceTot;
440 if (pmeGrid.
zBlocks > numDeviceTot) pmeGrid.
zBlocks = numDeviceTot;
446 pmeGrid.
yBlocks = std::min(pmeGrid.
yBlocks, (
int)sqrt((
double)numDeviceTot));
461 }
else if (pmeGrid.
yBlocks == 1) {
473 if (pmePencilType == 1 || pmePencilType == 2) {
476 if (pmePencilType == 1) {
483 for (
int i=0;i < xPes.size();i++) {
484 int node = (i % numDeviceTot)/numDevicesTmp;
485 xPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
487 for (
int i=0;i < yPes.size();i++) {
488 int node = (i % numDeviceTot)/numDevicesTmp;
489 yPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
491 for (
int i=0;i < zPes.size();i++) {
492 int node = (i % numDeviceTot)/numDevicesTmp;
493 zPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
511 nodeDeviceList.resize(xPes.size());
513 for (
int k=0;k < xPes.size();k++) {
514 nodeDeviceList[k].node = CkNodeOf(xPes[k]);
515 nodeDeviceList[k].device = -1;
516 if (nodeDeviceList[k].node == CkMyNode()) {
517 nodeDeviceList[k].device = numDevices++;
526 for (
int k=0;k < xPes.size();k++) {
527 if (CkMyNode() == CkNodeOf(xPes[k])) {
531 ijPencilX.push_back(ij);
534 if (ijPencilX.size() != numDevices)
535 NAMD_bug(
"ComputePmeCUDAMgr::setupPencils, error setting up x-pencils and devices");
538 for (
int k=0;k < yPes.size();k++) {
539 if (CkMyNode() == CkNodeOf(yPes[k])) {
543 ijPencilY.push_back(ij);
549 for (
int k=0;k < zPes.size();k++) {
550 if (CkMyNode() == CkNodeOf(zPes[k])) {
554 ijPencilZ.push_back(ij);
564 for (
int i=0;i < xPes.size();i++) {
565 if (pe == xPes[i])
return true;
574 for (
int i=0;i < nodeDeviceList.size();i++) {
575 if (nodeDeviceList[i].node == node) {
586 for (
int i=0;i < nodeDeviceList.size();i++) {
599 if (msg != NULL)
delete msg;
603 if ( ! CkMyNode() ) {
606 " pencil grid for FFT and reciprocal sum.\n" <<
endi;
610 numHomePatchesList.resize(numDevices, 0);
616 thisProxy[0].initializeDevicesAndAtomFiler(
new NumDevicesMsg(numDevices));
619 if (CkMyNode() == 0) {
621 if (pmePencilType == 3) {
623 CProxy_PmePencilXYZMap xyzMap = CProxy_PmePencilXYZMap::ckNew(xPes[0]);
624 CkArrayOptions xyzOpts(1);
625 xyzOpts.setMap(xyzMap);
626 xyzOpts.setAnytimeMigration(
false);
627 xyzOpts.setStaticInsertion(
true);
628 pmePencilXYZ = CProxy_CudaPmePencilXYZ::ckNew(xyzOpts);
630 thisProxy.recvPencils(pmePencilXYZ);
631 }
else if (pmePencilType == 2) {
633 CProxy_PmePencilXYMap xyMap = CProxy_PmePencilXYMap::ckNew(xPes);
634 CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.
xBlocks, zPes);
635 CkArrayOptions xyOpts(1, 1, pmeGrid.
zBlocks);
636 CkArrayOptions zOpts(pmeGrid.
xBlocks, 1, 1);
637 xyOpts.setMap(xyMap);
639 xyOpts.setAnytimeMigration(
false);
640 zOpts.setAnytimeMigration(
false);
641 xyOpts.setStaticInsertion(
true);
642 zOpts.setStaticInsertion(
true);
643 pmePencilXY = CProxy_CudaPmePencilXY::ckNew(xyOpts);
644 pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
646 thisProxy.recvPencils(pmePencilXY, pmePencilZ);
647 pmePencilXY.initialize(
new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
648 pmePencilZ.initialize(
new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
651 CProxy_PmePencilXMap xMap = CProxy_PmePencilXMap::ckNew(1, 2, pmeGrid.
yBlocks, xPes);
652 CProxy_PmePencilXMap yMap = CProxy_PmePencilXMap::ckNew(0, 2, pmeGrid.
xBlocks, yPes);
653 CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.
xBlocks, zPes);
660 xOpts.setAnytimeMigration(
false);
661 yOpts.setAnytimeMigration(
false);
662 zOpts.setAnytimeMigration(
false);
663 xOpts.setStaticInsertion(
true);
664 yOpts.setStaticInsertion(
true);
665 zOpts.setStaticInsertion(
true);
666 pmePencilX = CProxy_CudaPmePencilX::ckNew(xOpts);
667 pmePencilY = CProxy_CudaPmePencilY::ckNew(yOpts);
668 pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
670 thisProxy.recvPencils(pmePencilX, pmePencilY, pmePencilZ);
671 pmePencilX.initialize(
new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
672 pmePencilY.initialize(
new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
673 pmePencilZ.initialize(
new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
681 NAMD_bug(
"ComputePmeCUDAMgr::createDevicesAndAtomFiler can only be called on root node");
687 for (
int i=0;i < numDevicesMax;i++) {
688 CProxy_ComputePmeCUDADevice dev = CProxy_ComputePmeCUDADevice::ckNew();
689 memcpy(&msg->
dev[i], &dev,
sizeof(CProxy_ComputePmeCUDADevice));
691 thisProxy.recvDevices(msg);
693 CProxy_PmeAtomFiler filer = CProxy_PmeAtomFiler::ckNew();
694 thisProxy.recvAtomFiler(filer);
699 pmeAtomFiler = filer;
704 if (numDevices > numDevicesMax)
705 NAMD_bug(
"ComputePmeCUDAMgr::recvDevices, numDevices > numDevicesMax");
706 deviceProxy.resize(numDevices);
707 for (
int i=0;i < numDevices;i++) {
708 deviceProxy[i] = msg->
dev[i];
733 if (msg != NULL)
delete msg;
738 for (
int i=0;i < ijPencilX.size();i++) {
741 deviceProxy[i].ckLocalBranch()->initialize(pmeGrid, ijPencilX[i].i, ijPencilX[i].j,
742 deviceID, pmePencilType, thisProxy, pmeAtomFiler);
743 if (pmePencilType == 1) {
744 deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilX);
745 }
else if (pmePencilType == 2) {
746 deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXY);
748 deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXYZ);
753 for (
int i=0;i < ijPencilX.size();i++) {
754 if (pmePencilType == 1) {
755 pmePencilX(0, ijPencilX[i].i, ijPencilX[i].j).initializeDevice(
new InitDeviceMsg(deviceProxy[i]));
756 }
else if (pmePencilType == 2) {
757 pmePencilXY(0, 0, ijPencilX[i].j).initializeDevice(
new InitDeviceMsg(deviceProxy[i]));
759 pmePencilXYZ[0].initializeDevice(
new InitDeviceMsg(deviceProxy[i]));
764 int n = std::max(ijPencilY.size(), ijPencilZ.size());
765 if (n > ijPencilX.size()) {
766 int nextra = n - ijPencilX.size();
767 extraDevices.resize(nextra);
768 for (
int i=0;i < nextra;i++) {
770 cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
776 for (
int i=0;i < ijPencilY.size();i++) {
779 if (i < ijPencilX.size()) {
780 deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
781 stream = deviceProxy[i].ckLocalBranch()->getStream();
783 deviceID = extraDevices[i-ijPencilX.size()].deviceID;
784 stream = extraDevices[i-ijPencilX.size()].stream;
786 pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).initializeDevice(
new InitDeviceMsg2(deviceID, stream, thisProxy));
790 for (
int i=0;i < ijPencilZ.size();i++) {
793 if (i < ijPencilX.size()) {
794 deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
795 stream = deviceProxy[i].ckLocalBranch()->getStream();
797 deviceID = extraDevices[i-ijPencilX.size()].deviceID;
798 stream = extraDevices[i-ijPencilX.size()].stream;
800 pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).initializeDevice(
new InitDeviceMsg2(deviceID, stream, thisProxy));
810 if (msg != NULL)
delete msg;
812 for (
int device=0;device < numDevices;device++) {
813 deviceProxy[device].ckLocalBranch()->activate_pencils();
816 for (
int i=0;i < ijPencilY.size();i++) {
818 pmeStartYMsg->
data = NULL;
820 pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).start(pmeStartYMsg);
823 for (
int i=0;i < ijPencilZ.size();i++) {
825 pmeStartZMsg->
data = NULL;
827 pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).start(pmeStartZMsg);
836 if (i < 0 || i >= pmeGrid.
yBlocks || j < 0 || j >= pmeGrid.
zBlocks)
837 NAMD_bug(
"ComputePmeCUDAMgr::getNode, pencil index out of bounds");
838 int ind = i + j*pmeGrid.
yBlocks;
839 return nodeDeviceList[ind].node;
855 if (i < 0 || i >= pmeGrid.
yBlocks || j < 0 || j >= pmeGrid.
zBlocks)
856 NAMD_bug(
"ComputePmeCUDAMgr::getDevice, pencil index out of bounds");
857 int ind = i + j*pmeGrid.
yBlocks;
858 int device = nodeDeviceList[ind].device;
860 NAMD_bug(
"ComputePmeCUDAMgr::getDevice, no device found");
868 if (i < 0 || i >= pmeGrid.
xBlocks || j < 0 || j >= pmeGrid.
zBlocks)
869 NAMD_bug(
"ComputePmeCUDAMgr::getDevicePencilY, pencil index out of bounds");
870 for (
int device=0;device < ijPencilY.size();device++) {
871 if (ijPencilY[device].i == i && ijPencilY[device].j == j)
return device;
874 sprintf(str,
"ComputePmeCUDAMgr::getDevicePencilY, no device found at i %d j %d",i,j);
883 if (i < 0 || i >= pmeGrid.
xBlocks || j < 0 || j >= pmeGrid.
yBlocks)
884 NAMD_bug(
"ComputePmeCUDAMgr::getDevicePencilZ, pencil index out of bounds");
885 for (
int device=0;device < ijPencilZ.size();device++) {
886 if (ijPencilZ[device].i == i && ijPencilZ[device].j == j)
return device;
888 NAMD_bug(
"ComputePmeCUDAMgr::getDevicePencilZ, no device found");
897 return deviceProxy[device].ckLocalBranch()->getDeviceID();
905 return deviceProxy[device].ckLocalBranch()->getDeviceID();
913 return deviceProxy[device].ckLocalBranch()->getDeviceID();
920 switch(pmePencilType) {
928 pmePencilXYZ[0].skip();
935 deviceProxy[device].ckLocalBranch()->recvAtoms(msg);
943 pmeRealSpaceCompute = NULL;
944 streamCreated =
false;
945 lock_numHomePatchesMerged = CmiCreateLock();
946 lock_numPencils = CmiCreateLock();
947 lock_numNeighborsRecv = CmiCreateLock();
948 lock_recvAtoms = CmiCreateLock();
949 numNeighborsExpected = 0;
952 numNeighborsRecv = 0;
953 numHomePatchesRecv = 0;
954 numHomePatchesMerged = 0;
964 pmeRealSpaceCompute = NULL;
965 streamCreated =
false;
966 lock_numHomePatchesMerged = CmiCreateLock();
967 lock_numPencils = CmiCreateLock();
968 lock_numNeighborsRecv = CmiCreateLock();
969 lock_recvAtoms = CmiCreateLock();
970 numNeighborsExpected = 0;
973 numNeighborsRecv = 0;
974 numHomePatchesRecv = 0;
975 numHomePatchesMerged = 0;
985 for (
int j=0;j < 2;j++)
986 for (
int i=0;i < pmeAtomStorage[j].size();i++) {
987 if (pmeAtomStorageAllocatedHere[i])
delete pmeAtomStorage[j][i];
989 if (force != NULL) deallocate_host<CudaForce>(&force);
990 if (pmeRealSpaceCompute != NULL)
delete pmeRealSpaceCompute;
991 CmiDestroyLock(lock_numHomePatchesMerged);
992 CmiDestroyLock(lock_numPencils);
993 CmiDestroyLock(lock_numNeighborsRecv);
994 CmiDestroyLock(lock_recvAtoms);
998 int deviceID_in,
int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
999 CProxy_PmeAtomFiler pmeAtomFiler_in) {
1001 deviceID = deviceID_in;
1003 pmePencilType = pmePencilType_in;
1004 pmeGrid = pmeGrid_in;
1005 pencilIndexY = pencilIndexY_in;
1006 pencilIndexZ = pencilIndexZ_in;
1007 mgrProxy = mgrProxy_in;
1008 pmeAtomFiler = pmeAtomFiler_in;
1010 yNBlocks = std::min(pmeGrid.
yBlocks, 3);
1011 zNBlocks = std::min(pmeGrid.
zBlocks, 3);
1013 if (yNBlocks == 1) {
1016 }
else if (yNBlocks == 2) {
1023 if (zNBlocks == 1) {
1026 }
else if (zNBlocks == 2) {
1034 neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
1036 for (
int j=0;j < 2;j++)
1037 homePatchIndexList[j].resize(yNBlocks*zNBlocks);
1038 neighborPatchIndex.resize(yNBlocks*zNBlocks);
1040 pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks,
false);
1041 for (
int j=0;j < 2;j++) {
1042 pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
1043 for (
int z=zlo;
z <= zhi;
z++) {
1044 for (
int y=ylo;
y <= yhi;
y++) {
1045 int pp =
y-ylo + (
z-zlo)*yNBlocks;
1048 if (
y == 0 &&
z == 0) {
1054 pmeAtomStorageAllocatedHere[pp] =
true;
1061 streamCreated =
true;
1080 if (pmePencilType != 3)
1081 NAMD_bug(
"ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
1082 pmePencilXYZ = pmePencilXYZ_in;
1086 if (pmePencilType != 2)
1087 NAMD_bug(
"ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
1088 pmePencilXY = pmePencilXY_in;
1092 if (pmePencilType != 1)
1093 NAMD_bug(
"ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
1094 pmePencilX = pmePencilX_in;
1098 if (pmePencilType == 1) {
1100 pmeStartXMsg->
data = pmeRealSpaceCompute->
getData();
1102 pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
1103 }
else if (pmePencilType == 2) {
1105 pmeStartXMsg->
data = pmeRealSpaceCompute->
getData();
1107 pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
1108 }
else if (pmePencilType == 3) {
1110 pmeStartMsg->
data = pmeRealSpaceCompute->
getData();
1112 pmePencilXYZ[0].start(pmeStartMsg);
1117 numHomePatches = numHomePatches_in;
1118 for (
int j=0;j < 2;j++)
1119 numPencils[j].resize(numHomePatches);
1120 for (
int j=0;j < 2;j++)
1121 plList[j].resize(numHomePatches);
1122 for (
int j=0;j < 2;j++)
1123 homePatchForceMsgs[j].resize(numHomePatches);
1127 if (numHomePatches > 0) {
1128 for (
int z=zlo;
z <= zhi;
z++) {
1129 for (
int y=ylo;
y <= yhi;
y++) {
1132 int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1133 mgrProxy[node].registerNeighbor(yt, zt);
1140 CmiLock(lock_numHomePatchesMerged);
1141 numNeighborsExpected++;
1142 CmiUnlock(lock_numHomePatchesMerged);
1150 PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
1158 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1160 int pencilPatchIndex[9];
1161 int numStrayAtomsPatch = 0;
1162 if (pmePencilType == 3) {
1165 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms);
1170 pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
1174 int numAtomsCheck = 0;
1175 for (
int p=0;p < 9;p++) {
1180 int pp = y + z*yNBlocks;
1182 if (pp == pp0) p0 = p;
1183 if (pp == pp0 || numAtoms > 0) {
1184 if (pmeGrid.
yBlocks == 1 && pmeGrid.
zBlocks == 1 && (y != 0 || z != 0))
1185 NAMD_bug(
"ComputePmeCUDADevice::recvAtoms, problem with atom filing");
1187 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index);
1190 numAtomsCheck += numAtoms;
1195 numStrayAtomsPatch = pmeAtomFilerPtr->
getNumAtoms(9);
1196 if (numStrayAtomsPatch > 0) {
1198 CkPrintf(
"%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
1199 for (
int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
1205 if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
1206 NAMD_bug(
"ComputePmeCUDADevice::recvAtoms, missing atoms");
1211 if (pmePencilType == 3 && CkNodeOf(msg->
pe) == CkMyNode()) {
1220 forceMsg->
pe = msg->
pe;
1229 CmiLock(lock_recvAtoms);
1230 numStrayAtoms += numStrayAtomsPatch;
1232 int homePatchIndex = numHomePatchesRecv;
1234 plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
1235 if (pmePencilType != 3) {
1237 for (
int p=0;p < 9;p++) {
1242 int pp = y + z*yNBlocks;
1244 if (pp != pp0 && numAtoms > 0) {
1245 homePatchIndexList[atomI][pp].push_back(homePatchIndex);
1249 plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
1253 homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
1256 numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
1258 numHomePatchesRecv++;
1259 if (numHomePatchesRecv == numHomePatches) {
1261 numHomePatchesRecv = 0;
1264 CmiUnlock(lock_recvAtoms);
1281 for (
int z=zlo;
z <= zhi;
z++) {
1282 for (
int y=ylo;
y <= yhi;
y++) {
1284 if (
y != 0 ||
z != 0) {
1287 thisProxy[CkMyNode()].sendAtomsToNeighbor(
y,
z, atomI);
1297 int pp = y-ylo + (z-zlo)*yNBlocks;
1299 pmeAtomStorage[atomIval][pp]->finish();
1303 int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
1312 msgPencil->
srcY = pencilIndexY;
1313 msgPencil->
srcZ = pencilIndexZ;
1319 int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1320 mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
1325 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1327 int y = msg->
srcY - pencilIndexY;
1328 if (y < ylo) y += pmeGrid.
yBlocks;
1329 if (y > yhi) y -= pmeGrid.
yBlocks;
1330 int z = msg->
srcZ - pencilIndexZ;
1331 if (z < zlo) z += pmeGrid.
zBlocks;
1332 if (z > zhi) z -= pmeGrid.
zBlocks;
1333 if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1334 NAMD_bug(
"ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
1342 int pp = y-ylo + (z-zlo)*yNBlocks;
1344 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms);
1353 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1357 CmiLock(lock_numNeighborsRecv);
1359 if (numNeighborsRecv == numNeighborsExpected) {
1361 numNeighborsRecv = 0;
1364 CmiUnlock(lock_numNeighborsRecv);
1375 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1378 pmeAtomStorage[atomI][pp0]->finish();
1380 int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
1385 reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
1387 pmeRealSpaceCompute->
copyAtoms(numAtoms, atoms);
1389 beforeWalltime = CmiWallTimer();
1399 switch(pmePencilType) {
1401 pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
1404 pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
1407 pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
1417 beforeWalltime = CmiWallTimer();
1432 for (
int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
1438 if (pmePencilType != 3) CmiLock(lock_numPencils);
1439 numPencils[forceI][homePatchIndex]--;
1440 if (numPencils[forceI][homePatchIndex] == 0) done =
true;
1441 if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1458 #if CMK_SMP && USE_CKLOOP
1460 if (useCkLoop >= 1) {
1461 CkLoop_Parallelize(
gatherForceDoneLoop, 1, (
void *)
this, CkMyNodeSize(), 0, numHomePatches-1);
1467 for (
int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
1473 if (pmePencilType != 3) CmiLock(lock_numPencils);
1474 numPencils[forceI][homePatchIndex]--;
1475 if (numPencils[forceI][homePatchIndex] == 0) done =
true;
1476 if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1480 thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1486 if (numHomePatches == 0) {
1487 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1488 pmeAtomStorage[forceI][pp0]->clear();
1498 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1499 int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1501 for (
int z=zlo;
z <= zhi;
z++) {
1502 for (
int y=ylo;
y <= yhi;
y++) {
1504 if (
y != 0 ||
z != 0) {
1505 int pp =
y-ylo + (
z-zlo)*yNBlocks;
1506 int patchIndex = neighborPatchIndex[pp];
1507 int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
1508 int atomEnd = patchPos[patchIndex];
1509 int natom = atomEnd-atomStart;
1517 int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
1521 msg->
srcY = pencilIndexY;
1522 msg->
srcZ = pencilIndexZ;
1523 mgrProxy[node].recvForcesFromNeighbor(msg);
1532 int y = msg->
srcY - pencilIndexY;
1533 if (y < ylo) y += pmeGrid.
yBlocks;
1534 if (y > yhi) y -= pmeGrid.
yBlocks;
1535 int z = msg->
srcZ - pencilIndexZ;
1536 if (z < zlo) z += pmeGrid.
zBlocks;
1537 if (z > zhi) z -= pmeGrid.
zBlocks;
1539 if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1540 NAMD_bug(
"ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
1544 int pp = y-ylo + (z-zlo)*yNBlocks;
1547 neighborForcePencilMsgs[pp] = msg;
1559 int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
1560 if (numPatches != homePatchIndexList[forceI][pp].size()) {
1561 NAMD_bug(
"ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
1565 int homePatchIndex = homePatchIndexList[forceI][pp][i];
1571 CmiLock(lock_numPencils);
1572 numPencils[forceI][homePatchIndex]--;
1573 if (numPencils[forceI][homePatchIndex] == 0) done =
true;
1574 CmiUnlock(lock_numPencils);
1578 thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1587 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1590 PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
1592 if (pmePencilType == 3) {
1595 int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1597 int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1598 int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1599 int atomEnd = patchPos[pencilPatchIndex];
1600 int numAtoms = atomEnd-atomStart;
1601 if (forceMsg->zeroCopy) {
1603 forceMsg->force = force+atomStart;
1605 memcpy(forceMsg->force, force+atomStart, numAtoms*
sizeof(
CudaForce));
1611 memset(forceMsg->force, 0, forceMsg->numAtoms*
sizeof(
CudaForce));
1615 int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1616 int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
1617 int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1618 int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1619 int atomEnd = patchPos[pencilPatchIndex];
1620 int numAtoms = atomEnd-atomStart;
1623 for (
int i=0;i < numAtoms;i++) {
1624 forceMsg->force[index[atomStart + i]] = force[atomStart + i];
1630 for (
int j=1;j < plList[forceI][homePatchIndex].size();j++) {
1631 int pp = plList[forceI][homePatchIndex][j].pp;
1632 int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
1634 int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
1635 int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
1636 int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1637 int atomEnd = patchPos[pencilPatchIndex];
1638 int numAtoms = atomEnd-atomStart;
1641 CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
1643 for (
int i=0;i < numAtoms;i++) {
1644 dstForce[index[atomStart + i]].
x += srcForce[atomStart + i].
x;
1645 dstForce[index[atomStart + i]].
y += srcForce[atomStart + i].
y;
1646 dstForce[index[atomStart + i]].
z += srcForce[atomStart + i].
z;
1653 plList[forceI][homePatchIndex].clear();
1657 CmiLock(lock_numHomePatchesMerged);
1658 numHomePatchesMerged++;
1659 if (numHomePatchesMerged == numHomePatches) {
1661 numHomePatchesMerged = 0;
1664 for (
int i=0;i < neighborForcePencilMsgs.size();i++) {
1665 if (neighborForcePencilMsgs[i] != NULL) {
1666 delete neighborForcePencilMsgs[i];
1667 neighborForcePencilMsgs[i] = NULL;
1672 for (
int pp=0;pp < homePatchIndexList[forceI].size();pp++)
1673 homePatchIndexList[forceI][pp].clear();
1674 for (
int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
1675 pmeAtomStorage[forceI][pp]->clear();
1678 CmiUnlock(lock_numHomePatchesMerged);
1683 int pe = forceMsg->pe;
1684 if (CkNodeOf(pe) != CkMyNode())
1685 thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
1693 int pe = forceMsg->
pe;
1700 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
1701 wdProxy[pe].enqueuePme(lmsg);
1706 #include "ComputePmeCUDAMgr.def.h"
int getHomeNode(PatchID patchID)
virtual void gatherForce(Lattice &lattice, CudaForce *force)=0
void sendAtomsToNeighbor(int y, int z, int atomIval)
void sendForcesToNeighbors()
void recvForcesFromNeighbor(PmeForcePencilMsg *msg)
CProxy_ComputePmeCUDAMgr getMgrProxy()
std::ostream & iINFO(std::ostream &s)
void initialize(CkQdMsg *msg)
CpuPmeAtomStorage(const bool useIndex)
int getDeviceIDPencilX(int i, int j)
static void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param)
void sendForcesToPatch(PmeForceMsg *forceMsg)
static PatchMap * Object()
void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void getHomePencil(PatchID patchID, int &homey, int &homez)
BigReal max_c(int pid) const
SimParameters * simParameters
#define CUDA_PME_SPREADCHARGE_EVENT
static __thread atom * atoms
static __thread unsigned int * plist
void recvAtomFiler(CProxy_PmeAtomFiler filer)
void recvAtoms(PmeAtomMsg *msg)
std::ostream & endi(std::ostream &s)
static int getPencilIndexY(const PmeGrid &pmeGrid, const int y)
void mergeForcesOnPatch(int homePatchIndex)
if(ComputeNonbondedUtil::goMethod==2)
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void createDevicesAndAtomFiler()
void swap(Array< T > &s, Array< T > &t)
void recvDevices(RecvDeviceMsg *msg)
int getDeviceIDPencilZ(int i, int j)
LocalWorkMsg *const localWorkMsg
BigReal min_b(int pid) const
__thread cudaStream_t stream
bool storePmeForceMsg(PmeForceMsg *msg)
void fileAtoms(const int numAtoms, const CudaAtom *atoms, Lattice &lattice, const PmeGrid &pmeGrid, const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi)
static double calcGridCoord(const double x, const double recip11, const int nfftx)
void initialize(PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
void NAMD_bug(const char *err_msg)
int getDevicePencilZ(int i, int j)
void registerRecvAtomsFromNeighbor()
void recvAtomsFromNeighbor(PmeAtomPencilMsg *msg)
void initializePatches(int numHomePatches_in)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void createStream(cudaStream_t &stream)
CudaPmeAtomStorage(const bool useIndex)
int getDevice(int i, int j)
int getDeviceIDPencilY(int i, int j)
BigReal max_b(int pid) const
void sendAtomsToNeighbors()
void recvAtoms(PmeAtomMsg *msg)
int numPatches(void) const
void activate_pencils(CkQdMsg *msg)
int getDeviceIDbyRank(int rank)
__thread DeviceCUDA * deviceCUDA
int getNode(int i, int j)
void initialize_pencils(CkQdMsg *msg)
virtual void spreadCharge(Lattice &lattice)=0
BigReal min_c(int pid) const
int getDevicePencilY(int i, int j)
int * getAtomIndex(int p)
void recvPencils(CProxy_CudaPmePencilXYZ xyz)
CProxy_ComputePmeCUDADevice * dev
void gatherForceDoneSubset(int first, int last)
static int getPencilIndexZ(const PmeGrid &pmeGrid, const int z)
virtual void copyAtoms(const int numAtoms, const CudaAtom *atoms)=0
bool isPmeDevice(int deviceID)
#define CUDA_PME_GATHERFORCE_EVENT