#include "ComputePmeCUDAMgr.decl.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#ifdef WIN32
#define __thread __declspec(thread)
#endif

//
// CUDA-specific initialization: pick up the device ID, stream, and manager
// proxy from the node-local device branch, then set up per-grid compute
// objects for every enabled PME grid.
//
void CudaPmePencilXYZ::initializeDevice(InitDeviceMsg *msg) {
  // ...
  int deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
      // ... create fftComputes[iGrid] and pmeKSpaceComputes[iGrid] ...
      energyReady[iGrid] = 0;
    } else {
      fftComputes[iGrid] = NULL;
      pmeKSpaceComputes[iGrid] = NULL;
      energyReady[iGrid] = -1;
    }
  }
}
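//
// Convention used throughout this file: a disabled grid leaves its compute
// pointers NULL and its energyReady flag at -1, so later loops only need to
// test the pointers. NUM_GRID_MAX is the compile-time maximum number of
// simultaneous PME charge grids (multiple grids are used, for example, by
// alchemical simulations).
//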
void CudaPmePencilXYZ::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeKSpaceComputes[iGrid] != NULL) {
      // ...
    }
  }
}

void CudaPmePencilXYZ::energyAndVirialDone(unsigned int iGrid) {
  submitReductions(iGrid);
}
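//
// Backward-path sequencing (sketch): after the backward FFT has produced
// real-space charges, backwardDone() hands off to the node-level
// ComputePmeCUDADevice object, which interpolates forces back onto the atoms
// (gatherForce). Energy and virial results arrive asynchronously from the
// k-space kernels, and energyAndVirialDone() feeds them into the Charm++
// reductions via submitReductions().
//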
CudaPmePencilXY::~CudaPmePencilXY() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}
//
// CUDA-specific initialization for the XY slab pencil
//
void CudaPmePencilXY::initializeDevice(InitDeviceMsg *msg) {
  // ...
  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
      // ... create fftComputes[iGrid] and pmeTransposes[iGrid] ...
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
    }
  }
  // ...
  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false));
  numDeviceBuffers = 0;
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  // Find the Z pencils that live on this node and request a device buffer
  // from each, enabling CUDA peer access where the hardware allows it.
  for (int x=0;x < pmeGrid.xBlocks;x++) {
    int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
    if (CkNodeOf(pe) == CkMyNode()) {
      numDeviceBuffers++;
      // ...
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(x, 0);
      // ...
      int canAccessPeer = 0;
      if (deviceID != deviceID0) {
        // ...
        cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
        if (canAccessPeer) {
          unsigned int flags = 0;
          cudaCheck(cudaDeviceEnablePeerAccess(deviceID0, flags));
        }
        // ...
      }
      // ...
      deviceBuffers[x] = DeviceBuffer(deviceID0, canAccessPeer);
      pmePencilZ(x,0,0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
    }
  }
}
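//
// The peer-access setup above follows the standard CUDA recipe; a minimal
// sketch (devA and devB are illustrative device IDs, not names from this
// file):
//
//   int canAccessPeer = 0;
//   cudaCheck(cudaSetDevice(devA));
//   cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, devA, devB)); // query only
//   if (canAccessPeer) {
//     // Maps devB's allocations into devA's address space; one direction
//     // only, and the flags argument is reserved and must be 0.
//     cudaCheck(cudaDeviceEnablePeerAccess(devB, 0));
//   }
//
// When the query comes back false, the code falls back to staged transfers,
// which is what the DeviceBuffer isPeerDevice flag tracks.
//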
void CudaPmePencilXY::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
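//
// start() does not begin the forward path directly: it re-sends the callback
// to this same chare element as a recvDeviceBuffers() message, so computation
// only starts once all getDeviceBuffer() replies have been processed (a
// message-driven rendezvous).
//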
void CudaPmePencilXY::setDeviceBuffers() {
  // Gather the device pointers received from the Z pencils; entries stay
  // NULL for blocks that must be communicated via host memory.
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.xBlocks, (float2*)0);
    for (int x=0;x < pmeGrid.xBlocks;x++) {
      if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
        if (deviceBuffers[x].deviceID == deviceID || deviceBuffers[x].isPeerDevice) {
          // Buffer lives on this device or is reachable via peer access
          dataPtrsGrid[iGrid][x] = deviceBuffers[x].dataGrid[iGrid];
        }
      }
    }
    if (pmeTransposes[iGrid] != NULL) {
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
    }
  }
}
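//
// A non-NULL entry in dataPtrsGrid tells the transpose kernel it may write a
// block directly into the destination pencil's device buffer; NULL entries
// fall back to the transpose's own staging buffers.
//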
std::array<float2*, NUM_GRID_MAX> CudaPmePencilXY::getData(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
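//
// Unless P2P_ENABLE_3D is defined, a same-device consumer reads directly from
// the FFT destination array at the block offset i0 computed by getBlockDim(),
// while a different-device consumer is handed the transpose's per-block
// staging buffer instead.
//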
void CudaPmePencilXY::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
}
void CudaPmePencilXY::forwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally into the ZXY layout expected by the Z pencils
      pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataDst());
      // Direct device-to-device communication
      if (numDeviceBuffers > 0) {
        // Stage blocks whose destination device is neither this device nor a
        // mapped peer
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            if (deviceBuffers[x].deviceID != deviceID && !deviceBuffers[x].isPeerDevice) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(x, deviceBuffers[x].deviceID,
                  /* ...remaining arguments elided... */);
            }
          }
        }
        // Record event for this pencil
        cudaCheck(cudaEventRecord(event, stream));
        // Notification-only messages; the payload stays on the device
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            // ... allocate an empty PmeBlockMsg ...
            msg->x = x;
            msg->y = thisIndex.y;
            msg->z = thisIndex.z;
            // ... set grid index and flags ...
            pmePencilZ(x,0,0).recvBlock(msg);
          }
        }
      }
      // Copy-via-host communication for the remaining blocks
      for (int x=0;x < pmeGrid.xBlocks;x++) {
        if (deviceBuffers[x].dataGrid[iGrid] == NULL) {
          // ... allocate a full PmeBlockMsg and copy the block data to host ...
          msg->x = x;
          msg->y = thisIndex.y;
          msg->z = thisIndex.z;
          // ... set grid index and flags ...
          pmePencilZ(x,0,0).recvBlock(msg);
        }
      }
    }
  }
}
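//
// forwardDone() thus picks one of three transports per destination block:
//   1. same device or mapped peer: the transpose has already written the
//      block straight into the destination pencil's buffer
//   2. different device, unmapped: copyDataToPeerDeviceZXY() moves the block
//      across devices before the notification message is sent
//   3. no device buffer on this node: the block travels through host memory
//      as the PmeBlockMsg payload
// In cases 1 and 2 the PmeBlockMsg is only a notification; the data stays on
// the GPU, ordered by the CUDA event recorded above.
//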
void CudaPmePencilXY::recvDataFromZ(PmeBlockMsg *msg) {
  // ...
  // Wait for the event recorded by the sending Z pencil
  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
#ifndef P2P_ENABLE_3D
  if (deviceBuffers[msg->x].dataGrid[msg->grid] != NULL && deviceBuffers[msg->x].deviceID != deviceID && !deviceBuffers[msg->x].isPeerDevice) {
    // Data was staged into an intermediate device buffer; copy it into the
    // FFT destination array
    ((CudaPmeTranspose *)(pmeTransposes[msg->grid]))->copyDataDeviceToDevice(msg->x, (float2 *)fftComputes[msg->grid]->getDataDst());
  }
#endif
  // ...
}
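//
// cudaStreamWaitEvent() makes this pencil's stream wait for the event the
// sender recorded after its transpose, without blocking the host; this is
// what orders the consumer behind the producer across devices.
//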
CudaPmePencilX::~CudaPmePencilX() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}
//
// CUDA-specific initialization for the X pencil
//
void CudaPmePencilX::initializeDevice(InitDeviceMsg *msg) {
  // ...
  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
  stream = deviceProxy.ckLocalBranch()->getStream();
  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
      // ... create fftComputes[iGrid] and pmeTransposes[iGrid] ...
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
    }
  }
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false));
  numDeviceBuffers = 0;
  // Request direct device buffers from the Y pencils on this node
  for (int x=0;x < pmeGrid.xBlocks;x++) {
    int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,thisIndex.z));
    if (CkNodeOf(pe) == CkMyNode()) {
      numDeviceBuffers++;
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(x, thisIndex.z);
      // ...
      pmePencilY(x,0,thisIndex.z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
    }
  }
}
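//
// Unlike the XY slab above, the X pencil only requests a direct buffer when
// the neighbouring Y pencil lives on the same device (deviceID0 == deviceID);
// no peer-access query is attempted on this path.
//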
void CudaPmePencilX::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
void CudaPmePencilX::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.xBlocks, (float2*)0);
    for (int x=0;x < pmeGrid.xBlocks;x++) {
      if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
        if (deviceBuffers[x].deviceID == deviceID) {
          // Y pencil shares this device; the transpose can write in place
          dataPtrsGrid[iGrid][x] = deviceBuffers[x].dataGrid[iGrid];
        }
      }
    }
    if (pmeTransposes[iGrid] != NULL) {
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
    }
  }
}
std::array<float2*, NUM_GRID_MAX> CudaPmePencilX::getData(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
void CudaPmePencilX::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
}
void CudaPmePencilX::forwardDone() {
  if (pmeTransposes[0] == NULL)
    NAMD_bug("CudaPmePencilX::forwardDone, pmeTranspose not initialized");
  if (blockSizes.size() == 0)
    NAMD_bug("CudaPmePencilX::forwardDone, blockSizes not initialized");
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally into the YZX layout expected by the Y pencils
      pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataDst());
      // Direct device-to-device communication
      if (numDeviceBuffers > 0) {
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            if (deviceBuffers[x].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(x, deviceBuffers[x].deviceID,
                  /* ...remaining arguments elided... */);
            }
          }
        }
        // Record event for this pencil
        cudaCheck(cudaEventRecord(event, stream));
        // Notification-only messages; payload stays on the device
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            // ... allocate an empty PmeBlockMsg ...
            msg->x = x;
            msg->y = thisIndex.y;
            msg->z = thisIndex.z;
            // ... set grid index and flags ...
            pmePencilY(x,0,thisIndex.z).recvBlock(msg);
          }
        }
      }
      // Copy-via-host for the remaining blocks
      for (int x=0;x < pmeGrid.xBlocks;x++) {
        if (deviceBuffers[x].dataGrid[iGrid] == NULL) {
          // ... allocate a full PmeBlockMsg and copy the block data to host ...
          msg->x = x;
          msg->y = thisIndex.y;
          msg->z = thisIndex.z;
          // ... set grid index and flags ...
          pmePencilY(x,0,thisIndex.z).recvBlock(msg);
        }
      }
    }
  }
}
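//
// The NAMD_bug() guards at the top make the failure mode explicit:
// forwardDone() must never run before the transpose objects and block sizes
// have been initialized.
//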
void CudaPmePencilX::recvDataFromY(PmeBlockMsg *msg) {
  // ...
  // Wait for the event recorded by the sending Y pencil
  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
#ifndef P2P_ENABLE_3D
  if (deviceBuffers[msg->x].dataGrid[msg->grid] != NULL && deviceBuffers[msg->x].deviceID != deviceID) {
    // Data was staged into an intermediate device buffer; copy it into the
    // FFT destination array
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->x, (float2 *)fftComputes[msg->grid]->getDataDst());
  }
#endif
  // ...
}
CudaPmePencilY::~CudaPmePencilY() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}
//
// CUDA-specific initialization for the Y pencil
//
void CudaPmePencilY::initializeDevice(InitDeviceMsg2 *msg) {
  // Get the device proxy and manager proxy from the message
  CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
  // ...
  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
      // ... create fftComputes[iGrid] and pmeTransposes[iGrid] ...
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
    }
  }
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  // Two neighbour sets: Z pencils ahead of us, X pencils behind us
  deviceBuffersZ.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false));
  deviceBuffersX.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false));
  numDeviceBuffersZ = 0;
  numDeviceBuffersX = 0;
  for (int y=0;y < pmeGrid.yBlocks;y++) {
    int pe;
    pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x, y, 0));
    if (CkNodeOf(pe) == CkMyNode()) {
      numDeviceBuffersZ++;
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(thisIndex.x, y);
      // ...
      pmePencilZ(thisIndex.x, y, 0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID), thisProxy);
    }
    pe = xMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0, y, thisIndex.z));
    if (CkNodeOf(pe) == CkMyNode()) {
      numDeviceBuffersX++;
      int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(y, thisIndex.z);
      // ...
      pmePencilX(0, y, thisIndex.z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID), thisProxy);
    }
  }
}
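//
// The Y pencil sits in the middle of the transpose chain and therefore keeps
// two buffer sets: deviceBuffersZ for blocks exchanged with the Z pencils
// (forward direction) and deviceBuffersX for the X pencils (backward
// direction), each with its own counter.
//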
void CudaPmePencilY::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
void CudaPmePencilY::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsYZXGrid;
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsZXYGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsYZXGrid[iGrid] = std::vector<float2*>(pmeGrid.yBlocks, (float2*)0);
    dataPtrsZXYGrid[iGrid] = std::vector<float2*>(pmeGrid.yBlocks, (float2*)0);
    for (int y=0;y < pmeGrid.yBlocks;y++) {
      if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
        if (deviceBuffersZ[y].deviceID == deviceID) {
          dataPtrsYZXGrid[iGrid][y] = deviceBuffersZ[y].dataGrid[iGrid];
        }
      }
      if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
        if (deviceBuffersX[y].deviceID == deviceID) {
          dataPtrsZXYGrid[iGrid][y] = deviceBuffersX[y].dataGrid[iGrid];
        }
      }
    }
    if (pmeTransposes[iGrid] != NULL) {
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsYZXGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
      ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsZXYGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
    }
  }
}
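//
// Note the pairing: YZX pointers (forward, toward the Z pencils) go with the
// FFT destination array, while ZXY pointers (backward, toward the X pencils)
// go with the FFT source array. This matches getDataForZ()/getDataForX()
// below, which read from getDataDst() and getDataSrc() respectively.
//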
std::array<float2*, NUM_GRID_MAX> CudaPmePencilY::getDataForX(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
std::array<float2*, NUM_GRID_MAX> CudaPmePencilY::getDataForZ(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
void CudaPmePencilY::backwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally into the ZXY layout expected by the X pencils
      pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataSrc());
      // Direct device-to-device communication
      if (numDeviceBuffersX > 0) {
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
            if (deviceBuffersX[y].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(y, deviceBuffersX[y].deviceID,
                  /* ...remaining arguments elided... */);
            }
          }
        }
        // Record event for this pencil
        cudaCheck(cudaEventRecord(event, stream));
        // Notification-only messages; payload stays on the device
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
            // ... allocate an empty PmeBlockMsg ...
            msg->x = thisIndex.x;
            msg->y = y;
            msg->z = thisIndex.z;
            // ... set grid index and flags ...
            pmePencilX(0,y,thisIndex.z).recvBlock(msg);
          }
        }
      }
      // Copy-via-host for the remaining blocks
      for (int y=0;y < pmeGrid.yBlocks;y++) {
        if (deviceBuffersX[y].dataGrid[iGrid] == NULL) {
          // ... allocate a full PmeBlockMsg and copy the block data to host ...
          msg->x = thisIndex.x;
          msg->y = y;
          msg->z = thisIndex.z;
          // ... set grid index and flags ...
          pmePencilX(0,y,thisIndex.z).recvBlock(msg);
        }
      }
    }
  }
}
void CudaPmePencilY::forwardDone() {
  if (pmeTransposes[0] == NULL)
    NAMD_bug("CudaPmePencilY::forwardDone, pmeTranspose not initialized");
  if (blockSizes.size() == 0)
    NAMD_bug("CudaPmePencilY::forwardDone, blockSizes not initialized");
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally into the YZX layout expected by the Z pencils
      pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataDst());
      // Direct device-to-device communication
      if (numDeviceBuffersZ > 0) {
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
            if (deviceBuffersZ[y].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(y, deviceBuffersZ[y].deviceID,
                  /* ...remaining arguments elided... */);
            }
          }
        }
        // Record event for this pencil
        cudaCheck(cudaEventRecord(event, stream));
        // Notification-only messages; payload stays on the device
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
            // ... allocate an empty PmeBlockMsg ...
            msg->x = thisIndex.x;
            msg->y = y;
            msg->z = thisIndex.z;
            // ... set grid index and flags ...
            pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
          }
        }
      }
      // Copy-via-host for the remaining blocks
      for (int y=0;y < pmeGrid.yBlocks;y++) {
        if (deviceBuffersZ[y].dataGrid[iGrid] == NULL) {
          // ... allocate a full PmeBlockMsg and copy the block data to host ...
          msg->x = thisIndex.x;
          msg->y = y;
          msg->z = thisIndex.z;
          // ... set grid index and flags ...
          pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
        }
      }
    }
  }
}
void CudaPmePencilY::recvDataFromX(PmeBlockMsg *msg) {
  // ...
  // Wait for the event recorded by the sending X pencil
  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersX[msg->y].event, 0));
#ifndef P2P_ENABLE_3D
  if (deviceBuffersX[msg->y].dataGrid[msg->grid] != NULL && deviceBuffersX[msg->y].deviceID != deviceID) {
    // Data was staged into an intermediate device buffer; copy it into the
    // FFT source array
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->y, (float2 *)fftComputes[msg->grid]->getDataSrc());
  }
#endif
  // ...
}
void CudaPmePencilY::recvDataFromZ(PmeBlockMsg *msg) {
  // ...
  // Wait for the event recorded by the sending Z pencil
  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersZ[msg->y].event, 0));
#ifndef P2P_ENABLE_3D
  if (deviceBuffersZ[msg->y].dataGrid[msg->grid] != NULL && deviceBuffersZ[msg->y].deviceID != deviceID) {
    // Data was staged into an intermediate device buffer; copy it into the
    // FFT destination array
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->y, (float2 *)fftComputes[msg->grid]->getDataDst());
  }
#endif
  // ...
}
CudaPmePencilZ::~CudaPmePencilZ() {
  if (eventCreated) cudaCheck(cudaEventDestroy(event));
}
//
// CUDA-specific initialization for the Z pencil
//
void CudaPmePencilZ::initializeDevice(InitDeviceMsg2 *msg) {
  CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
  // ...
  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
  // ...
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
      // ... create fftComputes[iGrid], pmeTransposes[iGrid], and
      //     pmeKSpaceComputes[iGrid] ...
      energyReady[iGrid] = 0;
    } else {
      fftComputes[iGrid] = NULL;
      pmeTransposes[iGrid] = NULL;
      pmeKSpaceComputes[iGrid] = NULL;
      energyReady[iGrid] = -1;
    }
  }
  // ...
  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
  eventCreated = true;
  // ...
  deviceBuffers.resize(pmeGrid.zBlocks, DeviceBuffer(-1, false));
  numDeviceBuffers = 0;
  if (useXYslab) {
    // Slab mode: blocks come back from the XY pencils
    for (int z=0;z < pmeGrid.zBlocks;z++) {
      int pe = xyMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,0,z));
      if (CkNodeOf(pe) == CkMyNode()) {
        numDeviceBuffers++;
        int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(0, z);
        // ...
        int canAccessPeer = 0;
        if (deviceID != deviceID0) {
          // ...
          cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
          // ...
        }
        // ...
        deviceBuffers[z] = DeviceBuffer(deviceID0, canAccessPeer);
        pmePencilXY(0,0,z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
      }
    }
  } else {
    // Pencil mode: blocks come back from the Y pencils
    for (int z=0;z < pmeGrid.zBlocks;z++) {
      int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,0,z));
      if (CkNodeOf(pe) == CkMyNode()) {
        numDeviceBuffers++;
        int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(thisIndex.x, z);
        // ...
        pmePencilY(thisIndex.x,0,z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
      }
    }
  }
}
void CudaPmePencilZ::start(const CkCallback &cb) {
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
void CudaPmePencilZ::setDeviceBuffers() {
  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.zBlocks, (float2*)0);
    for (int z=0;z < pmeGrid.zBlocks;z++) {
      if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
        if (deviceBuffers[z].deviceID == deviceID || deviceBuffers[z].isPeerDevice) {
          dataPtrsGrid[iGrid][z] = deviceBuffers[z].dataGrid[iGrid];
        }
      }
    }
    if (useXYslab) {
      if (pmeTransposes[iGrid] != NULL) {
        ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
      }
    } else {
      if (pmeTransposes[iGrid] != NULL) {
        ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
      }
    }
  }
}
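//
// The Z pencil supports both decompositions: in slab mode the backward
// transpose produces YZX blocks for the XY slabs, otherwise ZXY blocks for
// the Y pencils; both are sourced from the FFT source array.
//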
std::array<float2*, NUM_GRID_MAX> CudaPmePencilZ::getData(const int i, const bool sameDevice) {
  std::array<float2*, NUM_GRID_MAX> data_grid;
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (fftComputes[iGrid] != NULL) {
#ifndef P2P_ENABLE_3D
      if (sameDevice) {
        int i0, i1, j0, j1, k0, k1;
        getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
        data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
      } else {
        data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
      }
#else
      int i0, i1, j0, j1, k0, k1;
      getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
      data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
#endif
    } else {
      data_grid[iGrid] = NULL;
    }
  }
  return data_grid;
}
void CudaPmePencilZ::backwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally into the layout the previous stage expects
      if (useXYslab) {
        pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataSrc());
      } else {
        pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataSrc());
      }
      if (useXYslab) {
        // Return blocks to the XY pencils
        if (numDeviceBuffers > 0) {
          for (int z=0;z < pmeGrid.zBlocks;z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              if (deviceBuffers[z].deviceID != deviceID && !deviceBuffers[z].isPeerDevice) {
                ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(z, deviceBuffers[z].deviceID,
                    /* ...remaining arguments elided... */);
              }
            }
          }
          // Record event for this pencil
          cudaCheck(cudaEventRecord(event, stream));
          // Notification-only messages; payload stays on the device
          for (int z=0;z < pmeGrid.zBlocks;z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              // ... allocate an empty PmeBlockMsg ...
              msg->x = thisIndex.x;
              msg->y = thisIndex.y;
              msg->z = z;
              // ... set grid index and flags ...
              pmePencilXY(0,0,z).recvBlock(msg);
            }
          }
        }
        // Copy-via-host for the remaining blocks
        for (int z=0;z < pmeGrid.zBlocks;z++) {
          if (deviceBuffers[z].dataGrid[iGrid] == NULL) {
            // ... allocate a full PmeBlockMsg and copy the block data to host ...
            msg->x = thisIndex.x;
            msg->y = thisIndex.y;
            msg->z = z;
            // ... set grid index and flags ...
            pmePencilXY(0,0,z).recvBlock(msg);
          }
        }
      } else {
        // Return blocks to the Y pencils
        if (numDeviceBuffers > 0) {
          for (int z=0;z < pmeGrid.zBlocks;z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              if (deviceBuffers[z].deviceID != deviceID) {
                ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(z, deviceBuffers[z].deviceID,
                    /* ...remaining arguments elided... */);
              }
            }
          }
          // Record event for this pencil
          cudaCheck(cudaEventRecord(event, stream));
          // Notification-only messages; payload stays on the device
          for (int z=0;z < pmeGrid.zBlocks;z++) {
            if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
              // ... allocate an empty PmeBlockMsg ...
              msg->x = thisIndex.x;
              msg->y = thisIndex.y;
              msg->z = z;
              // ... set grid index and flags ...
              pmePencilY(thisIndex.x,0,z).recvBlock(msg);
            }
          }
        }
        // Copy-via-host for the remaining blocks
        for (int z=0;z < pmeGrid.zBlocks;z++) {
          if (deviceBuffers[z].dataGrid[iGrid] == NULL) {
            // ... allocate a full PmeBlockMsg and copy the block data to host ...
            msg->x = thisIndex.x;
            msg->y = thisIndex.y;
            msg->z = z;
            // ... set grid index and flags ...
            pmePencilY(thisIndex.x,0,z).recvBlock(msg);
          }
        }
      }
    }
  }
}

void CudaPmePencilZ::energyAndVirialDone(unsigned int iGrid) {
  submitReductions(iGrid);
}
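//
// As in CudaPmePencilXYZ, energy and virial reductions are submitted per
// grid once the k-space kernel for that grid has reported its results.
//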
void CudaPmePencilZ::recvDataFromY(PmeBlockMsg *msg) {
  // ...
  // Wait for the event recorded by the sending Y pencil
  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->z].event, 0));
#ifndef P2P_ENABLE_3D
  if (deviceBuffers[msg->z].dataGrid[0] != NULL && deviceBuffers[msg->z].deviceID != deviceID && !deviceBuffers[msg->z].isPeerDevice) {
    // Data was staged into an intermediate device buffer; copy it into the
    // FFT source array
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->z, (float2 *)fftComputes[msg->grid]->getDataSrc());
  }
#endif
  // ...
}
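//
// Note that the guard above tests dataGrid[0] rather than dataGrid[msg->grid];
// this appears to assume that every enabled grid shares grid 0's buffer
// topology.
//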
#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)

#include "CudaPmeSolver.def.h"