21 #define MIN_DEBUG_LEVEL 4 25 #if defined(NAMD_CUDA) || defined(NAMD_HIP) 27 #define __thread __declspec(thread) 32 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP) 33 int leastPriority, greatestPriority;
34 cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
35 cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority));
49 if (
atom != NULL) dealloc_((
void *)
atom);
63 void* alloc_(
const size_t size) {
70 void dealloc_(
void *p) {
83 if (
atom != NULL) dealloc_((
void *)
atom);
97 void* alloc_(
const size_t size) {
98 return (
void *)(
new char[size]);
102 void dealloc_(
void *p) {
109 for (
int p=0;p < 10;p++) {
111 pencilCapacity[p] = 0;
115 for (
int p=0;p < 10;p++) {
117 pencilCapacity[p] = 0;
121 for (
int p=0;p < 10;p++) {
122 if (pencil[p] != NULL)
delete [] pencil[p];
// PmeAtomFiler::fileAtoms (fragment) — sorts a patch's atoms into up to 9
// neighbor-pencil index buckets (3x3 window around the home pencil) plus
// bucket 9 for "stray" atoms that fall outside the window or the grid.
// NOTE(review): the embedded original line numbers jump (145 -> 150,
// 160 -> 168, ...), so statements between them are elided from this view;
// comments below describe only what is visible.
133 const int pencilIndexY,
const int pencilIndexZ,
const int ylo,
const int yhi,
const int zlo,
const int zhi) {
134 DebugM(2,
"PmeAtomFiler::fileAtoms\n" <<
endi);
// (Re)size the 10 per-pencil index buffers: any buffer too small for
// numAtoms is dropped (capacity reset to 0) and re-allocated below.
136 for (
int p=0;p < 10;p++) {
137 if (pencil[p] != NULL && pencilCapacity[p] < numAtoms) {
140 pencilCapacity[p] = 0;
142 if (pencil[p] == NULL) {
// 1.5x over-allocation to amortize future reallocations.
143 int newSize = (int)(numAtoms*1.5);
144 pencil[p] =
new int[newSize];
145 pencilCapacity[p] = newSize;
// Diagonal reciprocal-lattice components and PME grid constants hoisted
// out of the per-atom loop.
150 const float recip11 = lattice.
a_r().
x;
151 const float recip22 = lattice.
b_r().
y;
152 const float recip33 = lattice.
c_r().
z;
153 const int order1 = pmeGrid.
order - 1;
154 const int K1 = pmeGrid.
K1;
155 const int K2 = pmeGrid.
K2;
156 const int K3 = pmeGrid.
K3;
157 const int yBlocks = pmeGrid.
yBlocks;
158 const int zBlocks = pmeGrid.
zBlocks;
160 for (
int i=0;i < numAtoms;i++) {
// Grid cell outside the charge grid: file atom i as a stray (bucket 9).
168 if (y0 < 0 || y0 >= K2 || z0 < 0 || z0 >= K3) {
170 pencil[9][pencilSize[9]++] = i;
// An order-point spread can touch at most 4 distinct (y,z) pencils;
// j encodes the low/high corner in y (j%2) and z (j/2).
180 for (
int j=0;j < 4;j++) {
182 int py =
getPencilIndexY(pmeGrid, (y0 + (j%2)*order1) % K2) - pencilIndexY;
// Wrap the pencil offset periodically into the neighbor window [ylo, yhi].
183 if (py < ylo) py += yBlocks;
184 if (py > yhi) py -= yBlocks;
186 int pz =
getPencilIndexZ(pmeGrid, (z0 + (j/2)*order1) % K3) - pencilIndexZ;
187 if (pz < zlo) pz += zBlocks;
188 if (pz > zhi) pz -= zBlocks;
// Still outside the window after wrapping => stray atom.
190 if (py < ylo || py > yhi || pz < zlo || pz > zhi) {
192 pencil[9][pencilSize[9]++] = i;
// Linearized index of the candidate pencil within the 3-wide window.
198 plist[j] = (py-ylo) + (pz-zlo)*3;
// Second pass over the (up to 4) candidates: file the atom once per
// distinct pencil, using an occupancy bitmask to skip duplicates.
// (occupied / pbit / p come from elided lines — TODO confirm against full source.)
202 for (
int j=0;j < 4;j++) {
206 if (!(occupied & pbit)) {
207 pencil[p][pencilSize[p]++] = i;
225 numNodesContributed = 0;
234 NAMD_bug(
"ComputePmeCUDAMgr cannot be migrated");
237 numNodesContributed = 0;
245 for (
int i=0;i < extraDevices.size();i++) {
246 cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
247 cudaCheck(cudaStreamDestroy(extraDevices[i].stream));
// ComputePmeCUDAMgr::getHomePencil (fragment) — scans all (y,z) pencils and
// selects the one whose grid range overlaps the patch the most as the home
// pencil (homey/homez, assigned in elided lines).
// NOTE(review): embedded original line numbers jump (273 -> 279, 296 -> 303),
// so this view is elided; comments describe only the visible statements.
// Patch bounds in grid cells: fractional coordinates shifted by +0.5 into
// [0,1) before scaling by K2 (y) and K3 (z); upper bound is inclusive (-1).
267 int patch_y0 = floor((miny+0.5)*pmeGrid.
K2);
268 int patch_y1 = floor((maxy+0.5)*pmeGrid.
K2)-1;
269 int patch_z0 = floor((minz+0.5)*pmeGrid.
K3);
270 int patch_z1 = floor((maxz+0.5)*pmeGrid.
K3)-1;
// A patch outside the charge grid is a hard logic error.
272 if (patch_y0 < 0 || patch_y1 >= pmeGrid.
K2 || patch_z0 < 0 || patch_z1 >= pmeGrid.
K3) {
273 NAMD_bug(
"ComputePmeCUDAMgr::getHomePencil, patch bounds are outside grid bounds");
// Exhaustive scan over the zBlocks x yBlocks pencil decomposition.
279 for (
int iz=0;iz < pmeGrid.
zBlocks;iz++) {
280 for (
int iy=0;iy < pmeGrid.
yBlocks;iy++) {
281 int pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1;
283 pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1);
// Every pencil must span at least one interpolation order in y and z.
285 if (pencil_y1 - pencil_y0 < pmeGrid.
order || pencil_z1 - pencil_z0 < pmeGrid.
order)
286 NAMD_bug(
"ComputePmeCUDAMgr::getHomePencil, pencil size must be >= PMEInterpOrder");
// Rectangle intersection of patch and pencil, in grid cells.
288 int y0 = std::max(patch_y0, pencil_y0);
289 int y1 = std::min(patch_y1, pencil_y1);
290 int z0 = std::max(patch_z0, pencil_z0);
291 int z1 = std::min(patch_z1, pencil_z1);
// Overlap area, or -1 when the rectangles do not properly intersect.
293 int overlap = (y1-y0 > 0 && z1-z0 > 0) ? (y1-y0)*(z1-z0) : -1;
// Track the best pencil seen so far (home indices recorded in elided lines).
295 if (overlap > maxOverlap) {
296 maxOverlap = overlap;
// -1 sentinel means no pencil ever produced a positive overlap.
303 if (homey == -1 || homez == -1)
304 NAMD_bug(
"ComputePmeCUDAMgr::getHomePencil, home pencil not found");
// Iteratively reduces pmeGrid.yBlocks / pmeGrid.zBlocks until, for every
// patch, the margin-expanded y/z grid footprint maps into its home pencil's
// neighbor window; raises NAMD_bug when this cannot be achieved.
// NOTE(review): embedded original line numbers jump (320 -> 326, 339 -> 344,
// ...), so the loop bookkeeping and most conditions are elided from this
// view; comments describe only the visible statements.
310 void ComputePmeCUDAMgr::restrictToMaxPMEPencils() {
// Margin = half the patch overhang beyond the cutoff, expressed in
// fractional units of the corresponding cell dimension (b and c).
319 BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
320 BigReal marginc = 0.5 * ( patchdim - cutoff ) / sysdimc;
// Re-check every patch; pid presumably restarts after each reduction of the
// pencil count — TODO confirm against the elided loop body.
326 while (pid < numPatches) {
// Patch extent along y in grid cells, widened by the margin and by the
// PME interpolation stencil reach (order - 1).
335 int min2 = ((int) floor(pmeGrid.
K2 * (miny+0.5 - marginb)));
336 int max2 = ((int) floor(pmeGrid.
K2 * (maxy+0.5 + marginb))) + (pmeGrid.
order - 1);
// Periodic wrap into [0, K2).
338 if (min2 < 0) min2 += pmeGrid.
K2;
339 if (max2 >= pmeGrid.
K2) max2 -= pmeGrid.
K2;
// Wrapped pencil distance from min2's pencil back to the home pencil;
// the failing condition guarding this NAMD_bug is elided.
344 int dmin2pi = homey - min2pi;
345 if (dmin2pi < 0) dmin2pi += pmeGrid.
yBlocks;
347 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin2pi");
// No y-pencils left to shrink: abandon the y direction.
351 if (pmeGrid.
yBlocks <= 0)
break;
// Same distance check forward from the home pencil to max2's pencil.
354 int dmax2pi = max2pi - homey;
355 if (dmax2pi < 0) dmax2pi += pmeGrid.
yBlocks;
357 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax2pi");
361 if (pmeGrid.
yBlocks <= 0)
break;
// Identical treatment for the z direction (K3 / zBlocks / homez).
371 int min3 = ((int) floor(pmeGrid.
K3 * (minz+0.5 - marginc)));
372 int max3 = ((int) floor(pmeGrid.
K3 * (maxz+0.5 + marginc))) + (pmeGrid.
order - 1);
374 if (min3 < 0) min3 += pmeGrid.
K3;
375 if (max3 >= pmeGrid.
K3) max3 -= pmeGrid.
K3;
380 int dmin3pi = homez - min3pi;
381 if (dmin3pi < 0) dmin3pi += pmeGrid.
zBlocks;
383 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin3pi");
387 if (pmeGrid.
zBlocks <= 0)
break;
390 int dmax3pi = max3pi - homez;
391 if (dmax3pi < 0) dmax3pi += pmeGrid.
zBlocks;
393 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax3pi");
397 if (pmeGrid.
zBlocks <= 0)
break;
// Post-loop sanity checks (their guarding conditions are elided here).
409 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, zero PME pencils found");
412 NAMD_bug(
"ComputePmeCUDAMgr::restrictToMaxPMEPencils, unable to restrict number of PME pencils");
420 DebugM(4,
"ComputePmeCUDAMgr::setupPencils\n"<<
endi);
425 pmeGrid.
dim2 = pmeGrid.
K2;
426 pmeGrid.
dim3 = 2 * (pmeGrid.
K3/2 + 1);
432 int numDeviceTot = CkNumNodes() * numDevicesTmp;
435 int numDeviceToUse = 1;
440 DebugM(4,
"ComputePmeCUDAMgr::setupPencils numDeviceToUse "<<numDeviceToUse<<
" numDeviceTot "<< numDeviceTot <<
"\n"<<
endi);
441 if (numDeviceToUse < 4) {
447 pmeGrid.
yBlocks = (int)sqrt((
double)numDeviceToUse);
457 restrictToMaxPMEPencils();
462 if (pmeGrid.
xBlocks > numDeviceTot) pmeGrid.
xBlocks = numDeviceTot;
463 if (pmeGrid.
zBlocks > numDeviceTot) pmeGrid.
zBlocks = numDeviceTot;
469 pmeGrid.
yBlocks = std::min(pmeGrid.
yBlocks, (
int)sqrt((
double)numDeviceTot));
484 iout <<
iINFO <<
"Use 3D box decompostion in PME FFT.\n" <<
endi;
485 }
else if (pmeGrid.
yBlocks == 1) {
488 iout <<
iINFO <<
"Use 2D slab decompostion in PME FFT.\n" <<
endi;
492 iout <<
iINFO <<
"Use 1D pencil decompostion in PME FFT.\n" <<
endi;
499 if (pmePencilType == 1 || pmePencilType == 2) {
502 if (pmePencilType == 1) {
509 for (
int i=0;i < xPes.size();i++) {
510 int node = (i % numDeviceTot)/numDevicesTmp;
511 xPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
513 for (
int i=0;i < yPes.size();i++) {
514 int node = (i % numDeviceTot)/numDevicesTmp;
515 yPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
517 for (
int i=0;i < zPes.size();i++) {
518 int node = (i % numDeviceTot)/numDevicesTmp;
519 zPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
537 nodeDeviceList.resize(xPes.size());
539 for (
int k=0;k < xPes.size();k++) {
540 nodeDeviceList[k].node = CkNodeOf(xPes[k]);
541 nodeDeviceList[k].device = -1;
542 if (nodeDeviceList[k].node == CkMyNode()) {
543 nodeDeviceList[k].device = numDevices++;
552 for (
int k=0;k < xPes.size();k++) {
553 if (CkMyNode() == CkNodeOf(xPes[k])) {
557 ijPencilX.push_back(ij);
560 if (ijPencilX.size() != numDevices)
561 NAMD_bug(
"ComputePmeCUDAMgr::setupPencils, error setting up x-pencils and devices");
564 for (
int k=0;k < yPes.size();k++) {
565 if (CkMyNode() == CkNodeOf(yPes[k])) {
569 ijPencilY.push_back(ij);
575 for (
int k=0;k < zPes.size();k++) {
576 if (CkMyNode() == CkNodeOf(zPes[k])) {
580 ijPencilZ.push_back(ij);
590 for (
int i=0;i < xPes.size();i++) {
591 if (pe == xPes[i])
return true;
600 for (
int i=0;i < nodeDeviceList.size();i++) {
601 if (nodeDeviceList[i].node == node) {
612 for (
int i=0;i < nodeDeviceList.size();i++) {
625 if (msg != NULL)
delete msg;
626 DebugM(4,
"ComputePmeCUDAMgr::initialize(CkQdMsg)\n" <<
endi);
629 if ( ! CkMyNode() ) {
632 " pencil grid for FFT and reciprocal sum.\n" <<
endi;
636 numHomePatchesList.resize(numDevices, 0);
642 thisProxy[0].initializeDevicesAndAtomFiler(
new NumDevicesMsg(numDevices));
645 if (CkMyNode() == 0) {
647 if (pmePencilType == 3) {
649 CProxy_PmePencilXYZMap xyzMap = CProxy_PmePencilXYZMap::ckNew(xPes[0]);
650 CkArrayOptions xyzOpts(1);
651 xyzOpts.setMap(xyzMap);
652 xyzOpts.setAnytimeMigration(
false);
653 xyzOpts.setStaticInsertion(
true);
654 pmePencilXYZ = CProxy_CudaPmePencilXYZ::ckNew(xyzOpts);
656 thisProxy.recvPencils(pmePencilXYZ);
657 }
else if (pmePencilType == 2) {
659 CProxy_PmePencilXYMap xyMap = CProxy_PmePencilXYMap::ckNew(xPes);
660 CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.
xBlocks, zPes);
661 CkArrayOptions xyOpts(1, 1, pmeGrid.
zBlocks);
662 CkArrayOptions zOpts(pmeGrid.
xBlocks, 1, 1);
663 xyOpts.setMap(xyMap);
665 xyOpts.setAnytimeMigration(
false);
666 zOpts.setAnytimeMigration(
false);
667 xyOpts.setStaticInsertion(
true);
668 zOpts.setStaticInsertion(
true);
669 pmePencilXY = CProxy_CudaPmePencilXY::ckNew(xyOpts);
670 pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
672 thisProxy.recvPencils(pmePencilXY, pmePencilZ);
673 pmePencilXY.initialize(
new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
674 pmePencilZ.initialize(
new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
677 CProxy_PmePencilXMap xMap = CProxy_PmePencilXMap::ckNew(1, 2, pmeGrid.
yBlocks, xPes);
678 CProxy_PmePencilXMap yMap = CProxy_PmePencilXMap::ckNew(0, 2, pmeGrid.
xBlocks, yPes);
679 CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.
xBlocks, zPes);
686 xOpts.setAnytimeMigration(
false);
687 yOpts.setAnytimeMigration(
false);
688 zOpts.setAnytimeMigration(
false);
689 xOpts.setStaticInsertion(
true);
690 yOpts.setStaticInsertion(
true);
691 zOpts.setStaticInsertion(
true);
692 pmePencilX = CProxy_CudaPmePencilX::ckNew(xOpts);
693 pmePencilY = CProxy_CudaPmePencilY::ckNew(yOpts);
694 pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
696 thisProxy.recvPencils(pmePencilX, pmePencilY, pmePencilZ);
697 pmePencilX.initialize(
new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
698 pmePencilY.initialize(
new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
699 pmePencilZ.initialize(
new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
707 NAMD_bug(
"ComputePmeCUDAMgr::createDevicesAndAtomFiler can only be called on root node");
713 for (
int i=0;i < numDevicesMax;i++) {
714 CProxy_ComputePmeCUDADevice dev = CProxy_ComputePmeCUDADevice::ckNew();
715 memcpy(&msg->
dev[i], &dev,
sizeof(CProxy_ComputePmeCUDADevice));
717 thisProxy.recvDevices(msg);
719 CProxy_PmeAtomFiler filer = CProxy_PmeAtomFiler::ckNew();
720 thisProxy.recvAtomFiler(filer);
725 pmeAtomFiler = filer;
731 DebugM(4,
"ComputePmeCUDAMgr::recvDevices() numDevicesMax " << numDevicesMax <<
"\n"<<
endi);
732 if (numDevices > numDevicesMax)
733 NAMD_bug(
"ComputePmeCUDAMgr::recvDevices, numDevices > numDevicesMax");
734 deviceProxy.resize(numDevices);
735 for (
int i=0;i < numDevices;i++) {
736 deviceProxy[i] = msg->
dev[i];
// ComputePmeCUDAMgr::initialize_pencils (fragment) — wires each local
// x-pencil to its ComputePmeCUDADevice branch, then hands (deviceID, stream)
// pairs to the local y- and z-pencils, creating extra device contexts when
// this node hosts more y/z pencils than x-pencils.
// NOTE(review): embedded original line numbers jump (776 -> 781, 798 -> 804,
// ...), so some statements (e.g. deviceID selection, stream creation for
// extraDevices) are elided from this view.
761 if (msg != NULL)
delete msg;
764 DebugM(4,
"ComputePmeCUDAMgr::initialize_pencils() numDevicesTmp " << numDevicesTmp <<
"\n"<<
endi);
// One local device branch per local x-pencil; tell each which pencil proxy
// variant (1D/2D/3D decomposition) it should talk to.
766 for (
int i=0;i < ijPencilX.size();i++) {
769 deviceProxy[i].ckLocalBranch()->initialize(pmeGrid, ijPencilX[i].i, ijPencilX[i].j,
770 deviceID, pmePencilType, thisProxy, pmeAtomFiler);
771 if (pmePencilType == 1) {
772 deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilX);
773 }
else if (pmePencilType == 2) {
774 deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXY);
776 deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXYZ);
// Send the device proxy to the matching pencil chare element.
781 for (
int i=0;i < ijPencilX.size();i++) {
782 if (pmePencilType == 1) {
783 pmePencilX(0, ijPencilX[i].i, ijPencilX[i].j).initializeDevice(
new InitDeviceMsg(deviceProxy[i]));
784 }
else if (pmePencilType == 2) {
785 pmePencilXY(0, 0, ijPencilX[i].j).initializeDevice(
new InitDeviceMsg(deviceProxy[i]));
787 pmePencilXYZ[0].initializeDevice(
new InitDeviceMsg(deviceProxy[i]));
// Surplus y/z pencils get their own (deviceID, stream) pairs; the stream
// creation for each entry is in elided lines after cudaSetDevice.
792 int n = std::max(ijPencilY.size(), ijPencilZ.size());
793 if (n > ijPencilX.size()) {
794 int nextra = n - ijPencilX.size();
795 extraDevices.resize(nextra);
796 for (
int i=0;i < nextra;i++) {
798 cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
// y-pencils: reuse an x-pencil's device/stream when available, otherwise
// consume one of the extra pairs created above.
804 for (
int i=0;i < ijPencilY.size();i++) {
807 if (i < ijPencilX.size()) {
808 deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
809 stream = deviceProxy[i].ckLocalBranch()->getStream();
811 deviceID = extraDevices[i-ijPencilX.size()].deviceID;
812 stream = extraDevices[i-ijPencilX.size()].stream;
814 pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).initializeDevice(
new InitDeviceMsg2(deviceID, stream, thisProxy, deviceProxy[i]));
// z-pencils: identical device/stream assignment policy as y-pencils.
818 for (
int i=0;i < ijPencilZ.size();i++) {
821 if (i < ijPencilX.size()) {
822 deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
823 stream = deviceProxy[i].ckLocalBranch()->getStream();
825 deviceID = extraDevices[i-ijPencilX.size()].deviceID;
826 stream = extraDevices[i-ijPencilX.size()].stream;
828 pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).initializeDevice(
new InitDeviceMsg2(deviceID, stream, thisProxy, deviceProxy[i]));
838 if (msg != NULL)
delete msg;
840 for (
int device=0;device < numDevices;device++) {
841 deviceProxy[device].ckLocalBranch()->activate_pencils();
844 for (
int i=0;i < ijPencilY.size();i++) {
846 for (
unsigned iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
847 pmeStartYMsg->
dataGrid[iGrid] = NULL;
849 pmeStartYMsg->
enabledGrid[iGrid] = deviceProxy[0].ckLocalBranch()->isGridEnabled(iGrid);
851 pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).start(pmeStartYMsg);
854 for (
int i=0;i < ijPencilZ.size();i++) {
856 for (
unsigned iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
857 pmeStartZMsg->
dataGrid[iGrid] = NULL;
859 pmeStartZMsg->
enabledGrid[iGrid] = deviceProxy[0].ckLocalBranch()->isGridEnabled(iGrid);
861 pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).start(pmeStartZMsg);
870 if (i < 0 || i >= pmeGrid.
yBlocks || j < 0 || j >= pmeGrid.
zBlocks)
871 NAMD_bug(
"ComputePmeCUDAMgr::getNode, pencil index out of bounds");
872 int ind = i + j*pmeGrid.
yBlocks;
873 return nodeDeviceList[ind].node;
889 if (i < 0 || i >= pmeGrid.
yBlocks || j < 0 || j >= pmeGrid.
zBlocks)
890 NAMD_bug(
"ComputePmeCUDAMgr::getDevice, pencil index out of bounds");
891 int ind = i + j*pmeGrid.
yBlocks;
892 int device = nodeDeviceList[ind].device;
894 NAMD_bug(
"ComputePmeCUDAMgr::getDevice, no device found");
902 if (i < 0 || i >= pmeGrid.
xBlocks || j < 0 || j >= pmeGrid.
zBlocks)
903 NAMD_bug(
"ComputePmeCUDAMgr::getDevicePencilY, pencil index out of bounds");
904 for (
int device=0;device < ijPencilY.size();device++) {
905 if (ijPencilY[device].i == i && ijPencilY[device].j == j)
return device;
908 sprintf(str,
"ComputePmeCUDAMgr::getDevicePencilY, no device found at i %d j %d",i,j);
917 if (i < 0 || i >= pmeGrid.
xBlocks || j < 0 || j >= pmeGrid.
yBlocks)
918 NAMD_bug(
"ComputePmeCUDAMgr::getDevicePencilZ, pencil index out of bounds");
919 for (
int device=0;device < ijPencilZ.size();device++) {
920 if (ijPencilZ[device].i == i && ijPencilZ[device].j == j)
return device;
922 NAMD_bug(
"ComputePmeCUDAMgr::getDevicePencilZ, no device found");
931 return deviceProxy[device].ckLocalBranch()->getDeviceID();
939 return deviceProxy[device].ckLocalBranch()->getDeviceID();
947 return deviceProxy[device].ckLocalBranch()->getDeviceID();
954 DebugM(2,
"ComputePmeCUDADevice::skip\n" <<
endi);
955 switch(pmePencilType) {
963 pmePencilXYZ[0].skip();
969 DebugM(2,
"ComputePmeCUDADevice::recvAtoms\n" <<
endi);
971 deviceProxy[device].ckLocalBranch()->recvAtoms(msg);
979 DebugM(4,
"ComputePmeCUDADevice::ComputePmeCUDADevice\n" <<
endi);
980 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
981 pmeRealSpaceComputes[iGrid] = NULL;
982 enabledGrid[iGrid] =
false;
983 forces[iGrid] = NULL;
984 forceCapacities[iGrid] = 0;
987 streamCreated =
false;
988 lock_numHomePatchesMerged = CmiCreateLock();
989 lock_numPencils = CmiCreateLock();
990 lock_numNeighborsRecv = CmiCreateLock();
991 lock_recvAtoms = CmiCreateLock();
992 numNeighborsExpected = 0;
995 numNeighborsRecv = 0;
996 numHomePatchesRecv = 0;
997 numHomePatchesMerged = 0;
1007 DebugM(4,
"ComputePmeCUDADevice::ComputePmeCUDADevice(CkMigrateMessage)\n" <<
endi);
1008 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1009 pmeRealSpaceComputes[iGrid] = NULL;
1010 enabledGrid[iGrid] =
false;
1011 forces[iGrid] = NULL;
1012 forceCapacities[iGrid] = 0;
1014 streamCreated =
false;
1015 lock_numHomePatchesMerged = CmiCreateLock();
1016 lock_numPencils = CmiCreateLock();
1017 lock_numNeighborsRecv = CmiCreateLock();
1018 lock_recvAtoms = CmiCreateLock();
1019 numNeighborsExpected = 0;
1022 numNeighborsRecv = 0;
1023 numHomePatchesRecv = 0;
1024 numHomePatchesMerged = 0;
1030 if (streamCreated) {
1034 for (
int j=0;j < 2;j++)
1035 for (
int i=0;i < pmeAtomStorage[j].size();i++) {
1036 if (pmeAtomStorageAllocatedHere[i])
delete pmeAtomStorage[j][i];
1039 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1040 if (pmeRealSpaceComputes[iGrid] != NULL)
delete pmeRealSpaceComputes[iGrid];
1041 if (forces[iGrid] != NULL) deallocate_host<CudaForce>(&forces[iGrid]);
1042 enabledGrid[iGrid] =
false;
1044 CmiDestroyLock(lock_numHomePatchesMerged);
1045 CmiDestroyLock(lock_numPencils);
1046 CmiDestroyLock(lock_numNeighborsRecv);
1047 CmiDestroyLock(lock_recvAtoms);
1051 int deviceID_in,
int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
1052 CProxy_PmeAtomFiler pmeAtomFiler_in) {
1054 deviceID = deviceID_in;
1055 DebugM(4,
"ComputePmeCUDADevice::initialize deviceID "<< deviceID <<
"\n"<<
endi);
1057 pmePencilType = pmePencilType_in;
1058 pmeGrid = pmeGrid_in;
1062 pencilIndexY = pencilIndexY_in;
1063 pencilIndexZ = pencilIndexZ_in;
1064 mgrProxy = mgrProxy_in;
1065 pmeAtomFiler = pmeAtomFiler_in;
1067 yNBlocks = std::min(pmeGrid.
yBlocks, 3);
1068 zNBlocks = std::min(pmeGrid.
zBlocks, 3);
1070 if (yNBlocks == 1) {
1073 }
else if (yNBlocks == 2) {
1080 if (zNBlocks == 1) {
1083 }
else if (zNBlocks == 2) {
1091 neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
1093 for (
int j=0;j < 2;j++)
1094 homePatchIndexList[j].resize(yNBlocks*zNBlocks);
1095 neighborPatchIndex.resize(yNBlocks*zNBlocks);
1097 pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks,
false);
1099 for (
int j=0;j < 2;j++) {
1100 pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
1101 for (
int z=zlo;z <= zhi;z++) {
1102 for (
int y=ylo;y <= yhi;y++) {
1103 int pp = y-ylo + (z-zlo)*yNBlocks;
1106 if (y == 0 && z == 0) {
1109 pmeAtomStorage[j][pp]->setupAlch(*
simParams);
1112 pmeAtomStorage[j][pp]->setupAlch(*
simParams);
1114 pmeAtomStorageAllocatedHere[pp] =
true;
1121 streamCreated =
true;
1125 pmeRealSpaceComputes[0]->setGrid(0);
1126 enabledGrid[0] =
true;
1129 pmeRealSpaceComputes[1]->setGrid(1);
1131 enabledGrid[1] =
true;
1134 pmeRealSpaceComputes[2]->setGrid(2);
1135 enabledGrid[2] =
true;
1137 pmeRealSpaceComputes[3]->setGrid(3);
1138 enabledGrid[3] =
true;
1142 pmeRealSpaceComputes[4]->setGrid(4);
1143 enabledGrid[4] =
true;
1146 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1147 if (enabledGrid[iGrid]) {
1148 forceReady[iGrid] = 0;
1150 forceReady[iGrid] = -1;
1168 return enabledGrid[i];
1172 if (pmePencilType != 3)
1173 NAMD_bug(
"ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
1174 pmePencilXYZ = pmePencilXYZ_in;
1178 if (pmePencilType != 2)
1179 NAMD_bug(
"ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
1180 pmePencilXY = pmePencilXY_in;
1184 if (pmePencilType != 1)
1185 NAMD_bug(
"ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
1186 pmePencilX = pmePencilX_in;
1190 if (pmePencilType == 1) {
1192 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1193 if (enabledGrid[iGrid] ==
true) {
1194 pmeStartXMsg->
dataGrid[iGrid] = pmeRealSpaceComputes[iGrid]->getData();
1195 pmeStartXMsg->
dataSizes[iGrid] = pmeRealSpaceComputes[iGrid]->getDataSize();
1198 pmeStartXMsg->
dataGrid[iGrid] = NULL;
1203 pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
1204 }
else if (pmePencilType == 2) {
1206 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1207 if (enabledGrid[iGrid] ==
true) {
1208 pmeStartXMsg->
dataGrid[iGrid] = pmeRealSpaceComputes[iGrid]->getData();
1209 pmeStartXMsg->
dataSizes[iGrid] = pmeRealSpaceComputes[iGrid]->getDataSize();
1212 pmeStartXMsg->
dataGrid[iGrid] = NULL;
1217 pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
1218 }
else if (pmePencilType == 3) {
1220 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1221 if (enabledGrid[iGrid] ==
true) {
1222 pmeStartMsg->
dataGrid[iGrid] = pmeRealSpaceComputes[iGrid]->getData();
1223 pmeStartMsg->
dataSizes[iGrid] = pmeRealSpaceComputes[iGrid]->getDataSize();
1226 pmeStartMsg->
dataGrid[iGrid] = NULL;
1231 pmePencilXYZ[0].start(pmeStartMsg);
1236 numHomePatches = numHomePatches_in;
1237 for (
int j=0;j < 2;j++)
1238 numPencils[j].resize(numHomePatches);
1239 for (
int j=0;j < 2;j++)
1240 plList[j].resize(numHomePatches);
1241 for (
int j=0;j < 2;j++)
1242 homePatchForceMsgs[j].resize(numHomePatches);
1246 if (numHomePatches > 0) {
1247 for (
int z=zlo;z <= zhi;z++) {
1248 for (
int y=ylo;y <= yhi;y++) {
1251 int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1252 mgrProxy[node].registerNeighbor(yt, zt);
1259 CmiLock(lock_numHomePatchesMerged);
1260 numNeighborsExpected++;
1261 CmiUnlock(lock_numHomePatchesMerged);
1269 PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
1279 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1281 int pencilPatchIndex[9];
1282 int numStrayAtomsPatch = 0;
1283 if (pmePencilType == 3) {
1289 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2});
1293 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4});
1297 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1301 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1305 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1309 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1313 pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{});
1319 pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
1323 int numAtomsCheck = 0;
1324 for (
int p=0;p < 9;p++) {
1329 int pp = y + z*yNBlocks;
1331 if (pp == pp0) p0 = p;
1332 if (pp == pp0 || numAtoms > 0) {
1333 if (pmeGrid.
yBlocks == 1 && pmeGrid.
zBlocks == 1 && (y != 0 || z != 0))
1334 NAMD_bug(
"ComputePmeCUDADevice::recvAtoms, problem with atom filing");
1339 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2});
1343 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4});
1347 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1351 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1355 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1359 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1363 pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->
atoms, index, std::vector<float*>{});
1367 numAtomsCheck += numAtoms;
1372 numStrayAtomsPatch = pmeAtomFilerPtr->
getNumAtoms(9);
1373 if (numStrayAtomsPatch > 0) {
1375 CkPrintf(
"%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
1376 for (
int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
1382 if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
1383 NAMD_bug(
"ComputePmeCUDADevice::recvAtoms, missing atoms");
1388 if (pmePencilType == 3 && CkNodeOf(msg->
pe) == CkMyNode()) {
1394 const int alchGrid =
simParams->alchOn ? 1 : 0;
1395 const int alchDecoupleGrid =
simParams->alchDecouple ? 1: 0;
1396 const int alchSoftCoreOrTI = (
simParams->alchElecLambdaStart > 0 ||
simParams->alchThermIntOn) ? 1 : 0;
1403 forceMsg->
pe = msg->
pe;
1412 CmiLock(lock_recvAtoms);
1413 numStrayAtoms += numStrayAtomsPatch;
1415 int homePatchIndex = numHomePatchesRecv;
1417 plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
1418 if (pmePencilType != 3) {
1420 for (
int p=0;p < 9;p++) {
1425 int pp = y + z*yNBlocks;
1427 if (pp != pp0 && numAtoms > 0) {
1428 homePatchIndexList[atomI][pp].push_back(homePatchIndex);
1432 plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
1436 homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
1439 numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
1441 numHomePatchesRecv++;
1442 if (numHomePatchesRecv == numHomePatches) {
1444 numHomePatchesRecv = 0;
1447 CmiUnlock(lock_recvAtoms);
1464 for (
int z=zlo;z <= zhi;z++) {
1465 for (
int y=ylo;y <= yhi;y++) {
1467 if (y != 0 || z != 0) {
1470 thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
1480 int pp = y-ylo + (z-zlo)*yNBlocks;
1482 pmeAtomStorage[atomIval][pp]->finish();
1486 int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
1487 CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
1493 float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1494 float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1495 memcpy(msgPencil->
chargeFactors1, chargeFactors1, numAtoms*
sizeof(
float));
1496 memcpy(msgPencil->
chargeFactors2, chargeFactors2, numAtoms*
sizeof(
float));
1500 float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1501 float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1502 float* chargeFactors3 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(2);
1503 float* chargeFactors4 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(3);
1504 memcpy(msgPencil->
chargeFactors1, chargeFactors1, numAtoms*
sizeof(
float));
1505 memcpy(msgPencil->
chargeFactors2, chargeFactors2, numAtoms*
sizeof(
float));
1506 memcpy(msgPencil->
chargeFactors3, chargeFactors3, numAtoms*
sizeof(
float));
1507 memcpy(msgPencil->
chargeFactors4, chargeFactors4, numAtoms*
sizeof(
float));
1511 float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1512 float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1513 float* chargeFactors3 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(2);
1514 float* chargeFactors4 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(3);
1515 float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1516 memcpy(msgPencil->
chargeFactors1, chargeFactors1, numAtoms*
sizeof(
float));
1517 memcpy(msgPencil->
chargeFactors2, chargeFactors2, numAtoms*
sizeof(
float));
1518 memcpy(msgPencil->
chargeFactors3, chargeFactors3, numAtoms*
sizeof(
float));
1519 memcpy(msgPencil->
chargeFactors4, chargeFactors4, numAtoms*
sizeof(
float));
1520 memcpy(msgPencil->
chargeFactors5, chargeFactors5, numAtoms*
sizeof(
float));
1524 float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1525 float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1526 float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1527 memcpy(msgPencil->
chargeFactors1, chargeFactors1, numAtoms*
sizeof(
float));
1528 memcpy(msgPencil->
chargeFactors2, chargeFactors2, numAtoms*
sizeof(
float));
1529 memcpy(msgPencil->
chargeFactors5, chargeFactors5, numAtoms*
sizeof(
float));
1533 float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1534 float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1535 float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1536 memcpy(msgPencil->
chargeFactors1, chargeFactors1, numAtoms*
sizeof(
float));
1537 memcpy(msgPencil->
chargeFactors2, chargeFactors2, numAtoms*
sizeof(
float));
1538 memcpy(msgPencil->
chargeFactors5, chargeFactors5, numAtoms*
sizeof(
float));
1542 float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1543 float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1544 float* chargeFactors3 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(2);
1545 float* chargeFactors4 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(3);
1546 float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1547 memcpy(msgPencil->
chargeFactors1, chargeFactors1, numAtoms*
sizeof(
float));
1548 memcpy(msgPencil->
chargeFactors2, chargeFactors2, numAtoms*
sizeof(
float));
1549 memcpy(msgPencil->
chargeFactors3, chargeFactors3, numAtoms*
sizeof(
float));
1550 memcpy(msgPencil->
chargeFactors4, chargeFactors4, numAtoms*
sizeof(
float));
1551 memcpy(msgPencil->
chargeFactors5, chargeFactors5, numAtoms*
sizeof(
float));
1562 msgPencil->
srcY = pencilIndexY;
1563 msgPencil->
srcZ = pencilIndexZ;
1570 int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1571 mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
// ComputePmeCUDADevice::recvAtomsFromNeighbor (fragment) — wraps the sender's
// (srcY, srcZ) pencil offset into this pencil's neighbor window, validates it,
// and appends the message's atoms (plus the alchemical charge-factor arrays
// that accompany them, when present) into the home-pencil atom storage.
// NOTE(review): embedded original line numbers jump (1585 -> 1594,
// 1599 -> 1602, ...), so the branch conditions selecting among the addAtoms
// variants below are elided from this view.
// Linearized index of the home pencil (offset (0,0)) in the neighbor window.
1576 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
// Offset from this pencil to the sender, wrapped periodically into
// [ylo, yhi] x [zlo, zhi].
1578 int y = msg->
srcY - pencilIndexY;
1579 if (y < ylo) y += pmeGrid.
yBlocks;
1580 if (y > yhi) y -= pmeGrid.
yBlocks;
1581 int z = msg->
srcZ - pencilIndexZ;
1582 if (z < zlo) z += pmeGrid.
zBlocks;
1583 if (z > zhi) z -= pmeGrid.
zBlocks;
// (0,0) would mean "sent to myself", which never happens; anything outside
// the neighbor window is a logic error.
1584 if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1585 NAMD_bug(
"ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
// Linearized neighbor index recording where this sender's batch landed.
1594 int pp = y-ylo + (z-zlo)*yNBlocks;
// The addAtoms variants differ only in which chargeFactors arrays are
// forwarded (presumably chosen by alchemical mode — conditions elided);
// all of them store into the home pencil's storage pmeAtomStorage[atomI][pp0].
1599 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2});
1602 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4});
1605 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1608 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, 0, 0, msg->chargeFactors5});
1611 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, 0, 0, msg->chargeFactors5});
1614 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1617 neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->
numAtoms, msg->
atoms, std::vector<float*>{});
1627 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1631 CmiLock(lock_numNeighborsRecv);
1633 if (numNeighborsRecv == numNeighborsExpected) {
1635 numNeighborsRecv = 0;
1638 CmiUnlock(lock_numNeighborsRecv);
1649 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1652 pmeAtomStorage[atomI][pp0]->finish();
1654 int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
1655 CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
1661 float* chargeFactors1 = NULL;
1662 float* chargeFactors2 = NULL;
1663 float* chargeFactors3 = NULL;
1664 float* chargeFactors4 = NULL;
1665 float* chargeFactors5 = NULL;
1667 chargeFactors1 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(0);
1668 chargeFactors2 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(1);
1669 allocate_host<CudaAtom>(&atoms2, numAtoms);
1671 chargeFactors3 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(2);
1672 chargeFactors4 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(3);
1673 allocate_host<CudaAtom>(&atoms3, numAtoms);
1674 allocate_host<CudaAtom>(&atoms4, numAtoms);
1677 chargeFactors5 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(4);
1678 allocate_host<CudaAtom>(&atoms5, numAtoms);
1684 reallocate_host<CudaForce>(&forces[0], &forceCapacities[0], numAtoms, 1.5f);
1686 reallocate_host<CudaForce>(&forces[1], &forceCapacities[1], numAtoms, 1.5f);
1688 reallocate_host<CudaForce>(&forces[2], &forceCapacities[2], numAtoms, 1.5f);
1689 reallocate_host<CudaForce>(&forces[3], &forceCapacities[3], numAtoms, 1.5f);
1692 reallocate_host<CudaForce>(&forces[4], &forceCapacities[4], numAtoms, 1.5f);
1698 for (
int i = 0; i < numAtoms; ++i) {
1700 atoms2[i].
x = atoms[i].
x;
1701 atoms2[i].
y = atoms[i].
y;
1702 atoms2[i].
z = atoms[i].
z;
1703 atoms2[i].
q = atoms[i].
q * chargeFactors2[i];
1705 atoms3[i].
x = atoms[i].
x;
1706 atoms3[i].
y = atoms[i].
y;
1707 atoms3[i].
z = atoms[i].
z;
1708 atoms3[i].
q = atoms[i].
q * chargeFactors3[i];
1709 atoms4[i].
x = atoms[i].
x;
1710 atoms4[i].
y = atoms[i].
y;
1711 atoms4[i].
z = atoms[i].
z;
1712 atoms4[i].
q = atoms[i].
q * chargeFactors4[i];
1715 atoms5[i].
x = atoms[i].
x;
1716 atoms5[i].
y = atoms[i].
y;
1717 atoms5[i].
z = atoms[i].
z;
1718 atoms5[i].
q = atoms[i].
q * chargeFactors5[i];
1720 atoms[i].
q *= chargeFactors1[i];
1722 pmeRealSpaceComputes[0]->copyAtoms(numAtoms, atoms);
1723 pmeRealSpaceComputes[1]->copyAtoms(numAtoms, atoms2);
1725 pmeRealSpaceComputes[2]->copyAtoms(numAtoms, atoms3);
1726 pmeRealSpaceComputes[3]->copyAtoms(numAtoms, atoms4);
1727 deallocate_host<CudaAtom>(&atoms4);
1728 deallocate_host<CudaAtom>(&atoms3);
1731 pmeRealSpaceComputes[4]->copyAtoms(numAtoms, atoms5);
1732 deallocate_host<CudaAtom>(&atoms5);
1734 deallocate_host<CudaAtom>(&atoms2);
1736 pmeRealSpaceComputes[0]->copyAtoms(numAtoms, atoms);
1739 beforeWalltime = CmiWallTimer();
1740 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1741 if (enabledGrid[iGrid] ==
true) {
1742 pmeRealSpaceComputes[iGrid]->spreadCharge(lattice);
1754 switch(pmePencilType) {
1756 pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
1759 pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
1762 pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
1772 beforeWalltime = CmiWallTimer();
1779 for (
unsigned int iGrid = 0; iGrid <
NUM_GRID_MAX; ++iGrid) {
1780 if (enabledGrid[iGrid]) {
1782 pmeRealSpaceComputes[iGrid]->gatherForce(lattice, forces[iGrid]);
1798 for (
int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
1804 if (pmePencilType != 3) CmiLock(lock_numPencils);
1805 numPencils[forceI][homePatchIndex]--;
1806 if (numPencils[forceI][homePatchIndex] == 0) done =
true;
1807 if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1818 forceReady[iGrid] = 1;
1819 bool all_force_ready =
true;
1823 if (forceReady[i] == -1)
continue;
1824 if (forceReady[i] == 0) all_force_ready =
false;
1826 if (all_force_ready) {
1828 if (forceReady[i] == -1)
continue;
1829 if (forceReady[i] == 1) forceReady[i] = 0;
1839 #if CMK_SMP && USE_CKLOOP 1841 if (useCkLoop >= 1) {
1842 CkLoop_Parallelize(
gatherForceDoneLoop, 1, (
void *)
this, CkMyNodeSize(), 0, numHomePatches-1);
1848 for (
int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
1854 if (pmePencilType != 3) CmiLock(lock_numPencils);
1855 numPencils[forceI][homePatchIndex]--;
1856 if (numPencils[forceI][homePatchIndex] == 0) done =
true;
1857 if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1861 thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1867 if (numHomePatches == 0) {
1868 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1869 pmeAtomStorage[forceI][pp0]->clear();
1879 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1880 int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1882 const int alchGrid =
simParams->alchOn ? 1 : 0;
1883 const int alchDecoupleGrid =
simParams->alchDecouple ? 1: 0;
1884 const int alchSoftCoreOrTI = (
simParams->alchElecLambdaStart > 0 ||
simParams->alchThermIntOn) ? 1 : 0;
1886 for (
int z=zlo;z <= zhi;z++) {
1887 for (
int y=ylo;y <= yhi;y++) {
1889 if (y != 0 || z != 0) {
1890 int pp = y-ylo + (z-zlo)*yNBlocks;
1891 int patchIndex = neighborPatchIndex[pp];
1892 int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
1893 int atomEnd = patchPos[patchIndex];
1894 int natom = atomEnd-atomStart;
1897 msg =
new (natom, alchGrid * natom, alchDecoupleGrid * natom,
1898 alchDecoupleGrid * natom, alchSoftCoreOrTI * natom,
1901 memcpy(msg->
force, forces[0]+atomStart, natom*
sizeof(
CudaForce));
1913 int dstY = (pencilIndexY + y + pmeGrid.
yBlocks) % pmeGrid.
yBlocks;
1914 int dstZ = (pencilIndexZ + z + pmeGrid.
zBlocks) % pmeGrid.
zBlocks;
1915 int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
1919 msg->
srcY = pencilIndexY;
1920 msg->
srcZ = pencilIndexZ;
1921 mgrProxy[node].recvForcesFromNeighbor(msg);
1930 int y = msg->
srcY - pencilIndexY;
1931 if (y < ylo) y += pmeGrid.
yBlocks;
1932 if (y > yhi) y -= pmeGrid.
yBlocks;
1933 int z = msg->
srcZ - pencilIndexZ;
1934 if (z < zlo) z += pmeGrid.
zBlocks;
1935 if (z > zhi) z -= pmeGrid.
zBlocks;
1937 if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1938 NAMD_bug(
"ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
1942 int pp = y-ylo + (z-zlo)*yNBlocks;
1945 neighborForcePencilMsgs[pp] = msg;
1957 int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
1958 if (numPatches != homePatchIndexList[forceI][pp].size()) {
1959 NAMD_bug(
"ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
1961 for (
int i=0;i < numPatches;i++) {
1963 int homePatchIndex = homePatchIndexList[forceI][pp][i];
1969 CmiLock(lock_numPencils);
1970 numPencils[forceI][homePatchIndex]--;
1971 if (numPencils[forceI][homePatchIndex] == 0) done =
true;
1972 CmiUnlock(lock_numPencils);
1976 thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1985 int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1988 PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
1991 if (pmePencilType == 3) {
1994 int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1996 int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1997 int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1998 int atomEnd = patchPos[pencilPatchIndex];
1999 int numAtoms = atomEnd-atomStart;
2000 if (forceMsg->zeroCopy) {
2002 forceMsg->force = forces[0]+atomStart;
2004 forceMsg->force2 = forces[1]+atomStart;
2006 forceMsg->force3 = forces[2]+atomStart;
2007 forceMsg->force4 = forces[3]+atomStart;
2010 forceMsg->force5 = forces[4]+atomStart;
2014 memcpy(forceMsg->force, forces[0]+atomStart, numAtoms*
sizeof(
CudaForce));
2016 memcpy(forceMsg->force2, forces[1]+atomStart, numAtoms*
sizeof(
CudaForce));
2018 memcpy(forceMsg->force3, forces[2]+atomStart, numAtoms*
sizeof(
CudaForce));
2019 memcpy(forceMsg->force4, forces[3]+atomStart, numAtoms*
sizeof(
CudaForce));
2022 memcpy(forceMsg->force5, forces[4]+atomStart, numAtoms*
sizeof(
CudaForce));
2030 memset(forceMsg->force, 0, forceMsg->numAtoms*
sizeof(
CudaForce));
2032 memset(forceMsg->force2, 0, forceMsg->numAtoms*
sizeof(
CudaForce));
2034 memset(forceMsg->force3, 0, forceMsg->numAtoms*
sizeof(
CudaForce));
2035 memset(forceMsg->force4, 0, forceMsg->numAtoms*
sizeof(
CudaForce));
2038 memset(forceMsg->force5, 0, forceMsg->numAtoms*
sizeof(
CudaForce));
2044 int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
2045 int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
2046 int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
2047 int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
2048 int atomEnd = patchPos[pencilPatchIndex];
2049 int numAtoms = atomEnd-atomStart;
2052 for (
int i=0;i < numAtoms;i++) {
2053 forceMsg->force[index[atomStart + i]] = forces[0][atomStart + i];
2055 forceMsg->force2[index[atomStart + i]] = forces[1][atomStart + i];
2057 forceMsg->force3[index[atomStart + i]] = forces[2][atomStart + i];
2058 forceMsg->force4[index[atomStart + i]] = forces[3][atomStart + i];
2061 forceMsg->force5[index[atomStart + i]] = forces[4][atomStart + i];
2069 for (
int j=1;j < plList[forceI][homePatchIndex].size();j++) {
2070 int pp = plList[forceI][homePatchIndex][j].pp;
2071 int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
2073 int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
2074 int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
2075 int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
2076 int atomEnd = patchPos[pencilPatchIndex];
2077 int numAtoms = atomEnd-atomStart;
2080 CudaForce *dstForce2 = forceMsg->force2;
2081 CudaForce *dstForce3 = forceMsg->force3;
2082 CudaForce *dstForce4 = forceMsg->force4;
2083 CudaForce *dstForce5 = forceMsg->force5;
2084 CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
2085 CudaForce *srcForce2 = neighborForcePencilMsgs[pp]->force2;
2086 CudaForce *srcForce3 = neighborForcePencilMsgs[pp]->force3;
2087 CudaForce *srcForce4 = neighborForcePencilMsgs[pp]->force4;
2088 CudaForce *srcForce5 = neighborForcePencilMsgs[pp]->force5;
2090 for (
int i=0;i < numAtoms;i++) {
2091 dstForce[index[atomStart + i]].
x += srcForce[atomStart + i].
x;
2092 dstForce[index[atomStart + i]].
y += srcForce[atomStart + i].
y;
2093 dstForce[index[atomStart + i]].
z += srcForce[atomStart + i].
z;
2095 dstForce2[index[atomStart + i]].
x += srcForce2[atomStart + i].
x;
2096 dstForce2[index[atomStart + i]].
y += srcForce2[atomStart + i].
y;
2097 dstForce2[index[atomStart + i]].
z += srcForce2[atomStart + i].
z;
2099 dstForce3[index[atomStart + i]].
x += srcForce3[atomStart + i].
x;
2100 dstForce3[index[atomStart + i]].
y += srcForce3[atomStart + i].
y;
2101 dstForce3[index[atomStart + i]].
z += srcForce3[atomStart + i].
z;
2102 dstForce4[index[atomStart + i]].
x += srcForce4[atomStart + i].
x;
2103 dstForce4[index[atomStart + i]].
y += srcForce4[atomStart + i].
y;
2104 dstForce4[index[atomStart + i]].
z += srcForce4[atomStart + i].
z;
2107 dstForce5[index[atomStart + i]].
x += srcForce5[atomStart + i].
x;
2108 dstForce5[index[atomStart + i]].
y += srcForce5[atomStart + i].
y;
2109 dstForce5[index[atomStart + i]].
z += srcForce5[atomStart + i].
z;
2118 plList[forceI][homePatchIndex].clear();
2122 CmiLock(lock_numHomePatchesMerged);
2123 numHomePatchesMerged++;
2124 if (numHomePatchesMerged == numHomePatches) {
2126 numHomePatchesMerged = 0;
2129 for (
int i=0;i < neighborForcePencilMsgs.size();i++) {
2130 if (neighborForcePencilMsgs[i] != NULL) {
2131 delete neighborForcePencilMsgs[i];
2132 neighborForcePencilMsgs[i] = NULL;
2137 for (
int pp=0;pp < homePatchIndexList[forceI].size();pp++)
2138 homePatchIndexList[forceI][pp].clear();
2139 for (
int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
2140 pmeAtomStorage[forceI][pp]->clear();
2143 CmiUnlock(lock_numHomePatchesMerged);
2148 int pe = forceMsg->pe;
2149 if (CkNodeOf(pe) != CkMyNode())
2150 thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
2158 int pe = forceMsg->
pe;
2165 CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
2166 wdProxy[pe].enqueuePme(lmsg);
2171 #include "ComputePmeCUDAMgr.def.h"
int getHomeNode(PatchID patchID)
void sendAtomsToNeighbor(int y, int z, int atomIval)
void sendForcesToNeighbors()
void recvForcesFromNeighbor(PmeForcePencilMsg *msg)
CProxy_ComputePmeCUDAMgr getMgrProxy()
std::ostream & iINFO(std::ostream &s)
__thread DeviceCUDA * deviceCUDA
void initialize(CkQdMsg *msg)
CpuPmeAtomStorage(const bool useIndex)
NAMD_HOST_DEVICE Vector c() const
void gatherForceDone(unsigned int iGrid)
int getDeviceIDPencilX(int i, int j)
static void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param)
void sendForcesToPatch(PmeForceMsg *forceMsg)
static PatchMap * Object()
void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void getHomePencil(PatchID patchID, int &homey, int &homez)
SimParameters * simParameters
#define CUDA_PME_SPREADCHARGE_EVENT
void recvAtomFiler(CProxy_PmeAtomFiler filer)
void recvAtoms(PmeAtomMsg *msg)
std::ostream & endi(std::ostream &s)
static int getPencilIndexY(const PmeGrid &pmeGrid, const int y)
void mergeForcesOnPatch(int homePatchIndex)
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void createDevicesAndAtomFiler()
const unsigned int NUM_GRID_MAX
void swap(Array< T > &s, Array< T > &t)
void recvDevices(RecvDeviceMsg *msg)
int getDeviceIDPencilZ(int i, int j)
LocalWorkMsg *const localWorkMsg
bool storePmeForceMsg(PmeForceMsg *msg)
std::array< int, NUM_GRID_MAX > dataSizes
void fileAtoms(const int numAtoms, const CudaAtom *atoms, Lattice &lattice, const PmeGrid &pmeGrid, const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi)
static double calcGridCoord(const double x, const double recip11, const int nfftx)
std::array< float *, NUM_GRID_MAX > dataGrid
void initialize(PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
int numPatches(void) const
bool isGridEnabled(unsigned int i) const
void NAMD_bug(const char *err_msg)
int getDevicePencilZ(int i, int j)
BigReal min_c(int pid) const
void registerRecvAtomsFromNeighbor()
void recvAtomsFromNeighbor(PmeAtomPencilMsg *msg)
void initializePatches(int numHomePatches_in)
NAMD_HOST_DEVICE Vector a_r() const
NAMD_HOST_DEVICE Vector b_r() const
unsigned int totalFactorArrays
void createStream(cudaStream_t &stream)
NAMD_HOST_DEVICE Vector c_r() const
NAMD_HOST_DEVICE Vector b() const
CudaPmeAtomStorage(const bool useIndex)
int getDevice(int i, int j)
int getDeviceIDPencilY(int i, int j)
void sendAtomsToNeighbors()
void recvAtoms(PmeAtomMsg *msg)
void activate_pencils(CkQdMsg *msg)
BigReal max_b(int pid) const
int getDeviceIDbyRank(int rank)
int getNode(int i, int j)
void initialize_pencils(CkQdMsg *msg)
BigReal max_c(int pid) const
std::vector< float * > overflowAtomElecFactorArrays
std::array< bool, NUM_GRID_MAX > enabledGrid
int getDevicePencilY(int i, int j)
int * getAtomIndex(int p)
void recvPencils(CProxy_CudaPmePencilXYZ xyz)
CProxy_ComputePmeCUDADevice * dev
void gatherForceDoneSubset(int first, int last)
BigReal min_b(int pid) const
NAMD_HOST_DEVICE Vector unit(void) const
static int getPencilIndexZ(const PmeGrid &pmeGrid, const int z)
bool isPmeDevice(int deviceID)
std::vector< float * > atomElecFactorArrays
#define CUDA_PME_GATHERFORCE_EVENT