6 #include "ComputePmeCUDAMgr.decl.h"
10 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
12 #define __thread __declspec(thread)
// NOTE(review): sampled fragment — the enclosing function (device initialization
// for a pencil object, per context) is not fully visible here. The leading digits
// on statement lines are original-file line numbers left by the extraction.
// Fetch this node's device context from the node-local ComputePmeCUDADevice branch.
29 int deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
// CUDA stream obtained from the same node-local device object.
30 cudaStream_t
stream = deviceProxy.ckLocalBranch()->getStream();
// Proxy to the PME manager group — presumably used later for pencil device-ID lookups.
31 CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
// Backward (inverse) FFT completion hook for the single-pencil XYZ solver:
// notify the node-local device object so it can gather forces.
// (The method's remaining lines/closing brace are outside this sampled chunk.)
38 void CudaPmePencilXYZ::backwardDone() {
39 deviceProxy[CkMyNode()].gatherForce();
// Destructor fragment: destroy the CUDA event only if it was actually created
// (eventCreated guards against destroying an uninitialized handle).
67 if (eventCreated)
cudaCheck(cudaEventDestroy(event));
// NOTE(review): sampled fragment of CudaPmePencilXY device initialization;
// non-consecutive embedded line numbers show interior lines are missing.
// Cache device ID, stream, and manager proxy from the node-local device branch.
77 deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
78 stream = deviceProxy.ckLocalBranch()->getStream();
79 CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
// One DeviceBuffer slot per x block; (-1, false, NULL) marks "no device buffer yet".
84 deviceBuffers.resize(pmeGrid.xBlocks,
DeviceBuffer(-1,
false, NULL));
// Timing disabled: the event is used only for stream ordering, not profiling.
89 cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
// For each x block whose z-pencil chare is mapped onto this node, determine
// whether this device can address that pencil's device memory directly.
107 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
108 int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(
x,0,0));
109 if (CkNodeOf(pe) == CkMyNode()) {
111 int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(
x, 0);
// Peer access is only a question across distinct devices.
113 int canAccessPeer = 0;
114 if (deviceID != deviceID0) {
116 cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
// flags is reserved and must be 0 for cudaDeviceEnablePeerAccess.
121 unsigned int flags = 0;
122 cudaCheck(cudaDeviceEnablePeerAccess(deviceID0, flags));
// Record the peer device metadata; data pointer stays NULL here — presumably
// filled in later via the getDeviceBuffer exchange below (not visible in chunk).
127 deviceBuffers[
x] =
DeviceBuffer(deviceID0, canAccessPeer, NULL);
128 pmePencilZ(
x,0,0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
// Kick off this pencil: bounce through the chare proxy so recvDeviceBuffers
// runs as a proper entry method with the caller-supplied completion callback.
139 void CudaPmePencilXY::start(
const CkCallback &cb) {
140 thisProxy[thisIndex].recvDeviceBuffers(cb);
// Build the per-x-block destination pointer table: a direct device pointer is
// recorded only when the target buffer lives on this device or on a device we
// have peer access to. (Remaining lines of this method not visible in chunk.)
143 void CudaPmePencilXY::setDeviceBuffers() {
144 std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (
float2*)0);
145 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
146 if (deviceBuffers[
x].data != NULL) {
147 if (deviceBuffers[
x].deviceID == deviceID || deviceBuffers[
x].isPeerDevice) {
149 dataPtrs[
x] = deviceBuffers[
x].data;
// Return a pointer to transpose-block i within the FFT destination buffer.
// Both preprocessor branches visible here compute the same i0 offset with
// Perm_cX_Y_Z; the lines that distinguish them (and the use of sameDevice)
// are outside this sampled chunk — TODO confirm against the full source.
158 float2* CudaPmePencilXY::getData(
const int i,
const bool sameDevice) {
160 #ifndef P2P_ENABLE_3D
162 int i0, i1, j0, j1, k0, k1;
163 getBlockDim(pmeGrid,
Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
164 data = (
float2 *)fftCompute->getDataDst() + i0;
169 int i0, i1, j0, j1, k0, k1;
170 getBlockDim(pmeGrid,
Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
171 data = (
float2 *)fftCompute->getDataDst() + i0;
// Backward-FFT completion: hand off to the node-local device object to gather
// forces. (Closing lines of this method are outside the visible chunk.)
176 void CudaPmePencilXY::backwardDone() {
177 deviceProxy[CkMyNode()].gatherForce();
// Forward-FFT completion: transpose XYZ->ZXY, stage any cross-device copies,
// then notify the z pencils. Sampled fragment — interior lines are missing.
180 void CudaPmePencilXY::forwardDone() {
182 pmeTranspose->transposeXYZtoZXY((
float2 *)fftCompute->getDataDst());
// Device-buffer path: for blocks whose target buffer is on another,
// non-peer device, push the transposed data over explicitly.
185 if (numDeviceBuffers > 0) {
187 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
188 if (deviceBuffers[
x].data != NULL) {
189 if (deviceBuffers[
x].deviceID != deviceID && !deviceBuffers[
x].isPeerDevice) {
190 ((
CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(
x, deviceBuffers[
x].deviceID,
// Publish completion of all copies above on this pencil's stream.
196 cudaCheck(cudaEventRecord(event, stream));
// Notify z pencils that already have a device buffer (message presumably
// constructed in the elided lines between 199 and 203).
198 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
199 if (deviceBuffers[
x].data != NULL) {
203 msg->
y = thisIndex.y;
204 msg->
z = thisIndex.z;
209 pmePencilZ(
x,0,0).recvBlock(msg);
// Fallback path for blocks with no device buffer (data == NULL).
215 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
216 if (deviceBuffers[
x].data == NULL) {
220 msg->
y = thisIndex.y;
221 msg->
z = thisIndex.z;
228 pmePencilZ(
x,0,0).recvBlock(msg);
// Receive a block back from a z pencil: make this stream wait on the sender's
// event before consuming the data. The non-P2P branch below (body not visible)
// handles blocks coming from a different, non-peer device.
233 void CudaPmePencilXY::recvDataFromZ(
PmeBlockMsg *msg) {
242 cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->
x].event, 0));
243 #ifndef P2P_ENABLE_3D
244 if (deviceBuffers[msg->
x].data != NULL && deviceBuffers[msg->
x].deviceID != deviceID && !deviceBuffers[msg->
x].isPeerDevice) {
// Destructor fragment: release the CUDA event only when it was created.
269 if (eventCreated)
cudaCheck(cudaEventDestroy(event));
// NOTE(review): sampled fragment of CudaPmePencilX device initialization.
// Cache device context from the node-local device branch.
279 deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
280 stream = deviceProxy.ckLocalBranch()->getStream();
281 CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
// Ordering-only event (timing disabled).
288 cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
// One slot per x block, initially marked as having no device buffer.
291 deviceBuffers.resize(pmeGrid.xBlocks,
DeviceBuffer(-1,
false, NULL));
292 numDeviceBuffers = 0;
// Request device buffers from the y pencils co-located on this node. Unlike
// the XY pencil above, only the same-device case is checked here — no
// explicit peer-access probing is visible in this fragment.
294 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
295 int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(
x,0,thisIndex.z));
296 if (CkNodeOf(pe) == CkMyNode()) {
297 int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(
x, thisIndex.z);
300 pmePencilY(
x,0,thisIndex.z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
// Kick off this pencil via the chare proxy so recvDeviceBuffers runs as an
// entry method with the completion callback.
309 void CudaPmePencilX::start(
const CkCallback &cb) {
310 thisProxy[thisIndex].recvDeviceBuffers(cb);
// Build the per-x-block pointer table; only same-device buffers get a direct
// pointer here (no isPeerDevice check, unlike CudaPmePencilXY's version).
316 void CudaPmePencilX::setDeviceBuffers() {
317 std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (
float2*)0);
318 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
319 if (deviceBuffers[
x].data != NULL) {
320 if (deviceBuffers[
x].deviceID == deviceID) {
322 dataPtrs[
x] = deviceBuffers[
x].data;
// Return a pointer to transpose-block i inside the FFT destination buffer
// (Perm_cX_Y_Z layout). Both #ifndef branches shown compute the same offset;
// the differing/elided lines and the use of sameDevice are not in this chunk.
331 float2* CudaPmePencilX::getData(
const int i,
const bool sameDevice) {
333 #ifndef P2P_ENABLE_3D
335 int i0, i1, j0, j1, k0, k1;
336 getBlockDim(pmeGrid,
Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
337 data = (
float2 *)fftCompute->getDataDst() + i0;
342 int i0, i1, j0, j1, k0, k1;
343 getBlockDim(pmeGrid,
Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
344 data = (
float2 *)fftCompute->getDataDst() + i0;
// Backward-FFT completion: notify the node-local device object to gather
// forces. (Closing lines outside the visible chunk.)
349 void CudaPmePencilX::backwardDone() {
350 deviceProxy[CkMyNode()].gatherForce();
// Forward-FFT completion: sanity-check initialization, transpose XYZ->YZX,
// push cross-device copies, then send blocks to the y pencils.
// Sampled fragment — interior lines are missing.
353 void CudaPmePencilX::forwardDone() {
// Fail loudly if setup never happened; NAMD_bug aborts with a message.
354 if (pmeTranspose == NULL)
355 NAMD_bug(
"CudaPmePencilX::forwardDone, pmeTranspose not initialized");
356 if (blockSizes.size() == 0)
357 NAMD_bug(
"CudaPmePencilX::forwardDone, blockSizes not initialized");
359 pmeTranspose->transposeXYZtoYZX((
float2 *)fftCompute->getDataDst());
// Explicit copy for blocks whose target buffer is on a different device.
363 if (numDeviceBuffers > 0) {
365 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
366 if (deviceBuffers[
x].data != NULL) {
367 if (deviceBuffers[
x].deviceID != deviceID) {
368 ((
CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(
x, deviceBuffers[
x].deviceID,
// Publish completion of the copies on this stream.
374 cudaCheck(cudaEventRecord(event, stream));
// Notify y pencils that hold a device buffer (msg built in elided lines).
376 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
377 if (deviceBuffers[
x].data != NULL) {
381 msg->
y = thisIndex.y;
382 msg->
z = thisIndex.z;
387 pmePencilY(
x,0,thisIndex.z).recvBlock(msg);
// Fallback path for blocks without a device buffer.
393 for (
int x=0;
x < pmeGrid.xBlocks;
x++) {
394 if (deviceBuffers[
x].data == NULL) {
398 msg->
y = thisIndex.y;
399 msg->
z = thisIndex.z;
406 pmePencilY(
x,0,thisIndex.z).recvBlock(msg);
// Receive a block back from a y pencil: order this stream after the sender's
// recorded event, then (non-P2P build, body elided) handle the cross-device case.
411 void CudaPmePencilX::recvDataFromY(
PmeBlockMsg *msg) {
420 cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->
x].event, 0));
421 #ifndef P2P_ENABLE_3D
422 if (deviceBuffers[msg->
x].data != NULL && deviceBuffers[msg->
x].deviceID != deviceID) {
// Destructor fragment: release the CUDA event only when it was created.
448 if (eventCreated)
cudaCheck(cudaEventDestroy(event));
// NOTE(review): sampled fragment of CudaPmePencilY device initialization.
// Here the manager proxy comes from the init message, not the device branch.
459 CProxy_ComputePmeCUDAMgr mgrProxy = msg->
mgrProxy;
// Ordering-only event (timing disabled).
470 cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
// The y pencil talks to both neighbors, so it keeps two buffer tables:
// deviceBuffersZ for the z side and deviceBuffersX for the x side.
473 deviceBuffersZ.resize(pmeGrid.yBlocks,
DeviceBuffer(-1,
false, NULL));
474 deviceBuffersX.resize(pmeGrid.yBlocks,
DeviceBuffer(-1,
false, NULL));
475 numDeviceBuffersZ = 0;
476 numDeviceBuffersX = 0;
// For every y block, request device buffers from co-located z and x pencils.
478 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
480 pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,
y, 0));
481 if (CkNodeOf(pe) == CkMyNode()) {
482 int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(thisIndex.x,
y);
485 pmePencilZ(thisIndex.x,
y, 0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID), thisProxy);
487 pe = xMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,
y, thisIndex.z));
488 if (CkNodeOf(pe) == CkMyNode()) {
489 int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(
y, thisIndex.z);
492 pmePencilX(0,
y, thisIndex.z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID), thisProxy);
// Kick off this pencil via the chare proxy so recvDeviceBuffers runs as an
// entry method with the completion callback.
501 void CudaPmePencilY::start(
const CkCallback &cb) {
502 thisProxy[thisIndex].recvDeviceBuffers(cb);
// Build both pointer tables — YZX targets (toward z pencils) and ZXY targets
// (toward x pencils) — using direct pointers only for same-device buffers.
508 void CudaPmePencilY::setDeviceBuffers() {
509 std::vector<float2*> dataPtrsYZX(pmeGrid.yBlocks, (
float2*)0);
510 std::vector<float2*> dataPtrsZXY(pmeGrid.yBlocks, (
float2*)0);
511 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
512 if (deviceBuffersZ[
y].data != NULL) {
513 if (deviceBuffersZ[
y].deviceID == deviceID) {
514 dataPtrsYZX[
y] = deviceBuffersZ[
y].data;
517 if (deviceBuffersX[
y].data != NULL) {
518 if (deviceBuffersX[
y].deviceID == deviceID) {
519 dataPtrsZXY[
y] = deviceBuffersX[
y].data;
// Pointer into the FFT *source* buffer for block i (Perm_Y_Z_cX layout),
// i.e. data headed back toward the x pencils. Both #ifndef branches shown
// compute the same offset; sameDevice usage is in elided lines.
527 float2* CudaPmePencilY::getDataForX(
const int i,
const bool sameDevice) {
529 #ifndef P2P_ENABLE_3D
531 int i0, i1, j0, j1, k0, k1;
532 getBlockDim(pmeGrid,
Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
533 data = (
float2 *)fftCompute->getDataSrc() + i0;
538 int i0, i1, j0, j1, k0, k1;
539 getBlockDim(pmeGrid,
Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
540 data = (
float2 *)fftCompute->getDataSrc() + i0;
// Pointer into the FFT *destination* buffer for block i (Perm_Y_Z_cX layout),
// i.e. forward-direction data headed toward the z pencils. Mirrors
// getDataForX but reads getDataDst() instead of getDataSrc().
545 float2* CudaPmePencilY::getDataForZ(
const int i,
const bool sameDevice) {
547 #ifndef P2P_ENABLE_3D
549 int i0, i1, j0, j1, k0, k1;
550 getBlockDim(pmeGrid,
Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
551 data = (
float2 *)fftCompute->getDataDst() + i0;
556 int i0, i1, j0, j1, k0, k1;
557 getBlockDim(pmeGrid,
Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
558 data = (
float2 *)fftCompute->getDataDst() + i0;
// Backward path: transpose XYZ->ZXY out of the FFT source buffer and ship
// blocks back to the x pencils. Sampled fragment — interior lines missing.
563 void CudaPmePencilY::backwardDone() {
565 pmeTranspose->transposeXYZtoZXY((
float2 *)fftCompute->getDataSrc());
// Explicit peer copies for x-side buffers living on another device.
569 if (numDeviceBuffersX > 0) {
570 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
571 if (deviceBuffersX[
y].data != NULL) {
572 if (deviceBuffersX[
y].deviceID != deviceID) {
573 ((
CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(
y, deviceBuffersX[
y].deviceID,
// Publish completion of the copies on this stream.
579 cudaCheck(cudaEventRecord(event, stream));
// Notify x pencils with device buffers (msg built in elided lines).
581 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
582 if (deviceBuffersX[
y].data != NULL) {
585 msg->
x = thisIndex.x;
587 msg->
z = thisIndex.z;
588 pmePencilX(0,
y,thisIndex.z).recvBlock(msg);
// Fallback path for blocks without a device buffer.
594 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
595 if (deviceBuffersX[
y].data == NULL) {
598 msg->
x = thisIndex.x;
600 msg->
z = thisIndex.z;
603 pmePencilX(0,
y,thisIndex.z).recvBlock(msg);
// Forward path: sanity-check initialization, transpose XYZ->YZX from the FFT
// destination buffer, and ship blocks on to the z pencils.
// Sampled fragment — interior lines missing.
608 void CudaPmePencilY::forwardDone() {
609 if (pmeTranspose == NULL)
610 NAMD_bug(
"CudaPmePencilY::forwardDone, pmeTranspose not initialized");
611 if (blockSizes.size() == 0)
612 NAMD_bug(
"CudaPmePencilY::forwardDone, blockSizes not initialized");
615 pmeTranspose->transposeXYZtoYZX((
float2 *)fftCompute->getDataDst());
// Explicit peer copies for z-side buffers living on another device.
619 if (numDeviceBuffersZ > 0) {
620 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
621 if (deviceBuffersZ[
y].data != NULL) {
622 if (deviceBuffersZ[
y].deviceID != deviceID) {
623 ((
CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(
y, deviceBuffersZ[
y].deviceID,
// Publish completion of the copies on this stream.
629 cudaCheck(cudaEventRecord(event, stream));
// Notify z pencils with device buffers (msg built in elided lines).
631 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
632 if (deviceBuffersZ[
y].data != NULL) {
635 msg->
x = thisIndex.x;
637 msg->
z = thisIndex.z;
642 pmePencilZ(thisIndex.x,
y,0).recvBlock(msg);
// Fallback path for blocks without a device buffer.
648 for (
int y=0;
y < pmeGrid.yBlocks;
y++) {
649 if (deviceBuffersZ[
y].data == NULL) {
652 msg->
x = thisIndex.x;
654 msg->
z = thisIndex.z;
661 pmePencilZ(thisIndex.x,
y,0).recvBlock(msg);
// Receive a block from an x pencil: order this stream after the sender's
// event; the non-P2P cross-device branch's body is elided from this chunk.
666 void CudaPmePencilY::recvDataFromX(
PmeBlockMsg *msg) {
675 cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersX[msg->
y].event, 0));
676 #ifndef P2P_ENABLE_3D
677 if (deviceBuffersX[msg->
y].data != NULL && deviceBuffersX[msg->
y].deviceID != deviceID) {
// Receive a block from a z pencil: same event-ordering pattern as
// recvDataFromX, but against the z-side buffer table.
686 void CudaPmePencilY::recvDataFromZ(
PmeBlockMsg *msg) {
695 cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersZ[msg->
y].event, 0));
696 #ifndef P2P_ENABLE_3D
697 if (deviceBuffersZ[msg->
y].data != NULL && deviceBuffersZ[msg->
y].deviceID != deviceID) {
// Destructor fragment: release the CUDA event only when it was created.
733 if (eventCreated)
cudaCheck(cudaEventDestroy(event));
// NOTE(review): sampled fragment of CudaPmePencilZ device initialization.
// Manager proxy arrives in the init message.
744 CProxy_ComputePmeCUDAMgr mgrProxy = msg->
mgrProxy;
// Ordering-only event (timing disabled).
757 cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
// One slot per z block, initially "no device buffer".
760 deviceBuffers.resize(pmeGrid.zBlocks,
DeviceBuffer(-1,
false, NULL));
761 numDeviceBuffers = 0;
// First loop: XY-pencil neighbors (3D-decomposition-with-XY path) — probe
// and enable P2P access where the devices differ.
764 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
765 int pe = xyMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,0,
z));
766 if (CkNodeOf(pe) == CkMyNode()) {
767 int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(0,
z);
769 int canAccessPeer = 0;
770 if (deviceID != deviceID0) {
772 cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
778 deviceBuffers[
z] =
DeviceBuffer(deviceID0, canAccessPeer, NULL);
779 pmePencilXY(0,0,
z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
// Second loop: y-pencil neighbors — only the same-device check is visible.
783 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
784 int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,0,
z));
785 if (CkNodeOf(pe) == CkMyNode()) {
786 int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(thisIndex.x,
z);
789 pmePencilY(thisIndex.x,0,
z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
// Kick off this pencil via the chare proxy so recvDeviceBuffers runs as an
// entry method with the completion callback.
799 void CudaPmePencilZ::start(
const CkCallback &cb) {
800 thisProxy[thisIndex].recvDeviceBuffers(cb);
// Build the per-z-block pointer table; direct pointers are used for buffers
// on this device or on a peer-accessible device (matches the XY variant).
803 void CudaPmePencilZ::setDeviceBuffers() {
804 std::vector<float2*> dataPtrs(pmeGrid.zBlocks, (
float2*)0);
805 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
806 if (deviceBuffers[
z].data != NULL) {
807 if (deviceBuffers[
z].deviceID == deviceID || deviceBuffers[
z].isPeerDevice) {
808 dataPtrs[
z] = deviceBuffers[
z].data;
// Pointer into the FFT *source* buffer for block i (Perm_Z_cX_Y layout).
// Both #ifndef branches shown compute the same offset; differing lines and
// the use of sameDevice are elided from this chunk.
819 float2* CudaPmePencilZ::getData(
const int i,
const bool sameDevice) {
821 #ifndef P2P_ENABLE_3D
823 int i0, i1, j0, j1, k0, k1;
824 getBlockDim(pmeGrid,
Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
825 data = (
float2 *)fftCompute->getDataSrc() + i0;
830 int i0, i1, j0, j1, k0, k1;
831 getBlockDim(pmeGrid,
Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
832 data = (
float2 *)fftCompute->getDataSrc() + i0;
// Backward path out of the z pencil. Two sibling send paths are visible:
// first toward XY pencils (transposeXYZtoYZX), then toward Y pencils
// (transposeXYZtoZXY) — presumably selected by a decomposition-mode branch
// in the elided lines. Sampled fragment — interior lines are missing.
837 void CudaPmePencilZ::backwardDone() {
840 pmeTranspose->transposeXYZtoYZX((
float2 *)fftCompute->getDataSrc());
842 pmeTranspose->transposeXYZtoZXY((
float2 *)fftCompute->getDataSrc());
// XY-pencil path: explicit copies for non-peer remote-device buffers.
847 if (numDeviceBuffers > 0) {
848 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
849 if (deviceBuffers[
z].data != NULL) {
850 if (deviceBuffers[
z].deviceID != deviceID && !deviceBuffers[
z].isPeerDevice) {
851 ((
CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(
z, deviceBuffers[
z].deviceID,
857 cudaCheck(cudaEventRecord(event, stream));
// Notify XY pencils with device buffers (msg built in elided lines).
859 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
860 if (deviceBuffers[
z].data != NULL) {
863 msg->
x = thisIndex.x;
864 msg->
y = thisIndex.y;
866 pmePencilXY(0,0,
z).recvBlock(msg);
// XY fallback path for blocks without a device buffer.
872 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
873 if (deviceBuffers[
z].data == NULL) {
876 msg->
x = thisIndex.x;
877 msg->
y = thisIndex.y;
881 pmePencilXY(0,0,
z).recvBlock(msg);
// Y-pencil path: note only the deviceID check here (no isPeerDevice).
887 if (numDeviceBuffers > 0) {
888 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
889 if (deviceBuffers[
z].data != NULL) {
890 if (deviceBuffers[
z].deviceID != deviceID) {
891 ((
CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(
z, deviceBuffers[
z].deviceID,
897 cudaCheck(cudaEventRecord(event, stream));
// Notify Y pencils with device buffers.
899 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
900 if (deviceBuffers[
z].data != NULL) {
903 msg->
x = thisIndex.x;
904 msg->
y = thisIndex.y;
906 pmePencilY(thisIndex.x,0,
z).recvBlock(msg);
// Y fallback path for blocks without a device buffer.
912 for (
int z=0;
z < pmeGrid.zBlocks;
z++) {
913 if (deviceBuffers[
z].data == NULL) {
916 msg->
x = thisIndex.x;
917 msg->
y = thisIndex.y;
921 pmePencilY(thisIndex.x,0,
z).recvBlock(msg);
// Receive a block from a y pencil: order this stream after the sender's event;
// the non-P2P cross-device branch's body is elided from this chunk.
936 void CudaPmePencilZ::recvDataFromY(
PmeBlockMsg *msg) {
946 cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->
z].event, 0));
947 #ifndef P2P_ENABLE_3D
948 if (deviceBuffers[msg->
z].data != NULL && deviceBuffers[msg->
z].deviceID != deviceID && !deviceBuffers[msg->
z].isPeerDevice) {
958 #include "CudaPmeSolver.def.h"
CProxy_PmePencilXMap zMap
void initialize(CudaPmeXInitMsg *msg)
void initialize(CudaPmeXYInitMsg *msg)
CProxy_PmePencilXYMap xyMap
CProxy_CudaPmePencilZ pmePencilZ
CProxy_PmePencilXMap xMap
void energyAndVirialDone()
CProxy_CudaPmePencilX pmePencilX
void initializeDevice(InitDeviceMsg2 *msg)
CProxy_ComputePmeCUDADevice deviceProxy
void initialize(CudaPmeXInitMsg *msg)
void energyAndVirialDone()
__thread cudaStream_t stream
void initializeDevice(InitDeviceMsg *msg)
CProxy_PmePencilXMap zMap
void initializeDevice(InitDeviceMsg *msg)
void NAMD_bug(const char *err_msg)
void initializeDevice(InitDeviceMsg2 *msg)
CProxy_CudaPmePencilZ pmePencilZ
void initialize(CudaPmeXYZInitMsg *msg)
CProxy_CudaPmePencilY pmePencilY
void initialize(CudaPmeXInitMsg *msg)
void initializeDevice(InitDeviceMsg *msg)
CProxy_CudaPmePencilXY pmePencilXY
CProxy_ComputePmeCUDAMgr mgrProxy
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
__thread DeviceCUDA * deviceCUDA
CProxy_PmePencilXMap yMap