CudaPmeSolver.C

Go to the documentation of this file.
00001 #include "Node.h"
00002 #include "Priorities.h"
00003 #include "ComputeNonbondedUtil.h"
00004 #include "CudaPmeSolverUtil.h"
00005 #include "ComputePmeCUDAMgr.h"
00006 #include "ComputePmeCUDAMgr.decl.h"
00007 #include "CudaPmeSolver.h"
00008 #include "DeviceCUDA.h"
00009 
00010 #ifdef NAMD_CUDA
00011 #ifdef WIN32
00012 #define __thread __declspec(thread)
00013 #endif
00014 extern __thread DeviceCUDA *deviceCUDA;
00015 //#define DISABLE_P2P
00016 
00017 void CudaPmePencilXYZ::initialize(CudaPmeXYZInitMsg *msg) {
00018   pmeGrid = msg->pmeGrid;
00019   delete msg;
00020 }
00021 
00022 //
00023 // CUDA specific initialization
00024 //
00025 void CudaPmePencilXYZ::initializeDevice(InitDeviceMsg *msg) {
00026   // Store device proxy
00027   deviceProxy = msg->deviceProxy;
00028   delete msg;
00029   int deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
00030   cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
00031   CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
00032   // Setup fftCompute and pmeKSpaceCompute
00033   fftCompute = new CudaFFTCompute(deviceID, stream);
00034   pmeKSpaceCompute = new CudaPmeKSpaceCompute(pmeGrid, Perm_cX_Y_Z, 0, 0, 
00035     ComputeNonbondedUtil::ewaldcof, deviceID, stream);
00036 }
00037 
00038 void CudaPmePencilXYZ::backwardDone() {
00039   deviceProxy[CkMyNode()].gatherForce();
00040   ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->energyAndVirialSetCallback(this);
00041 
00042   // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->waitEnergyAndVirial();
00043   // submitReductions();
00044   // deviceProxy[CkMyNode()].gatherForce();
00045 }
00046 
00047 void CudaPmePencilXYZ::energyAndVirialDone() {
00048   submitReductions();
00049   // deviceProxy[CkMyNode()].gatherForce();
00050 }
00051 
00052 //###########################################################################
00053 //###########################################################################
00054 //###########################################################################
00055 
00056 void CudaPmePencilXY::initialize(CudaPmeXYInitMsg *msg) {
00057   pmeGrid = msg->pmeGrid;
00058   pmePencilZ = msg->pmePencilZ;
00059   zMap = msg->zMap;
00060 
00061   delete msg;
00062 
00063   initBlockSizes();
00064 }
00065 
00066 CudaPmePencilXY::~CudaPmePencilXY() {
00067   if (eventCreated) cudaCheck(cudaEventDestroy(event));
00068 }
00069 
00070 //
00071 // CUDA specific initialization
00072 //
00073 void CudaPmePencilXY::initializeDevice(InitDeviceMsg *msg) {
00074   // Store device proxy
00075   deviceProxy = msg->deviceProxy;
00076   delete msg;
00077   deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
00078   stream = deviceProxy.ckLocalBranch()->getStream();
00079   CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
00080   // Setup fftCompute and pmeKSpaceCompute
00081   fftCompute = new CudaFFTCompute(deviceID, stream);
00082   pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, 0, thisIndex.z, deviceID, stream);  
00083 
00084   deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false, NULL));
00085   numDeviceBuffers = 0;
00086 
00087   // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
00088   cudaCheck(cudaSetDevice(deviceID));
00089   cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
00090   eventCreated = true;
00091 
00092 /*
00093   bool useMultiGPUfft = true;
00094   bool allDeviceOnSameNode = true;
00095   for (int x=0;x < pmeGrid.xBlocks;x++) {
00096     int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
00097     allDeviceOnSameNode &= (CkNodeOf(pe) == CkMyNode());
00098   }
00099 
00100   if (useMultiGPUfft && allDeviceOnSameNode && pmeGrid.xBlocks > 1) {
00101 
00102 
00103 
00104   } else {
00105 */
00106 
00107   for (int x=0;x < pmeGrid.xBlocks;x++) {
00108     int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
00109     if (CkNodeOf(pe) == CkMyNode()) {
00110       // Get device ID on a device on this node
00111       int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(x, 0);
00112       // Check for Peer-to-Peer access
00113       int canAccessPeer = 0;
00114       if (deviceID != deviceID0) {
00115         cudaCheck(cudaSetDevice(deviceID));
00116         cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
00117 #ifdef DISABLE_P2P
00118         canAccessPeer = 0;
00119 #endif
00120         if (canAccessPeer) {
00121           unsigned int flags = 0;
00122           cudaCheck(cudaDeviceEnablePeerAccess(deviceID0, flags));
00123           // fprintf(stderr, "device %d can access device %d\n", deviceID, deviceID0);
00124         }
00125       }
00126       numDeviceBuffers++;
00127       deviceBuffers[x] = DeviceBuffer(deviceID0, canAccessPeer, NULL);
00128       pmePencilZ(x,0,0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
00129     }
00130   }
00131 
00132   // }
00133 
00134 }
00135 
00136 //
00137 // CUDA specific start
00138 //
00139 void CudaPmePencilXY::start() {
00140   recvDeviceBuffers();
00141 }
00142 
00143 void CudaPmePencilXY::setDeviceBuffers() {
00144   std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (float2*)0);
00145   for (int x=0;x < pmeGrid.xBlocks;x++) {
00146     if (deviceBuffers[x].data != NULL) {
00147       if (deviceBuffers[x].deviceID == deviceID || deviceBuffers[x].isPeerDevice) {
00148         // Device buffer on same device => directly transpose into destination pencil
00149         dataPtrs[x] = deviceBuffers[x].data;
00150         // Otherwise, when device buffer on different device on same node => transpose locally and then 
00151         // use cudaMemcpy3DPeerAsync to perform the copying
00152       }
00153     }
00154   }
00155   ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsZXY(dataPtrs, (float2 *)fftCompute->getDataDst());
00156 }
00157 
00158 float2* CudaPmePencilXY::getData(const int i, const bool sameDevice) {
00159   float2* data;
00160 #ifndef P2P_ENABLE_3D
00161   if (sameDevice) {
00162     int i0, i1, j0, j1, k0, k1;
00163     getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
00164     data = (float2 *)fftCompute->getDataDst() + i0;
00165   } else {
00166     data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
00167   }
00168 #else
00169   int i0, i1, j0, j1, k0, k1;
00170   getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
00171   data = (float2 *)fftCompute->getDataDst() + i0;
00172 #endif
00173   return data;
00174 }
00175 
00176 void CudaPmePencilXY::backwardDone() {
00177   deviceProxy[CkMyNode()].gatherForce();
00178 }
00179 
00180 void CudaPmePencilXY::forwardDone() {
00181   // Transpose locally
00182   pmeTranspose->transposeXYZtoZXY((float2 *)fftCompute->getDataDst());
00183 
00184   // Direct Device-To-Device communication within node
00185   if (numDeviceBuffers > 0) {
00186     // Copy data
00187     for (int x=0;x < pmeGrid.xBlocks;x++) {
00188       if (deviceBuffers[x].data != NULL) {
00189         if (deviceBuffers[x].deviceID != deviceID && !deviceBuffers[x].isPeerDevice) {
00190           ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(x, deviceBuffers[x].deviceID,
00191             Perm_Z_cX_Y, deviceBuffers[x].data);
00192         }
00193       }
00194     }
00195     // Record event for this pencil
00196     cudaCheck(cudaEventRecord(event, stream));
00197     // Send empty message
00198     for (int x=0;x < pmeGrid.xBlocks;x++) {
00199       if (deviceBuffers[x].data != NULL) {
00200         PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
00201         msg->dataSize = 0;
00202         msg->x = x;
00203         msg->y = thisIndex.y;
00204         msg->z = thisIndex.z;
00205         msg->doEnergy = doEnergy;
00206         msg->doVirial = doVirial;
00207         msg->lattice  = lattice;
00208         msg->numStrayAtoms = numStrayAtoms;
00209         pmePencilZ(x,0,0).recvBlock(msg);
00210       }
00211     }
00212   }
00213 
00214   // Copy-Via-Host communication
00215   for (int x=0;x < pmeGrid.xBlocks;x++) {
00216     if (deviceBuffers[x].data == NULL) {
00217       PmeBlockMsg* msg = new (blockSizes[x], PRIORITY_SIZE) PmeBlockMsg();
00218       msg->dataSize = blockSizes[x];
00219       msg->x = x;
00220       msg->y = thisIndex.y;
00221       msg->z = thisIndex.z;
00222       msg->doEnergy = doEnergy;
00223       msg->doVirial = doVirial;
00224       msg->lattice  = lattice;
00225       msg->numStrayAtoms = numStrayAtoms;
00226       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(x, msg->data, msg->dataSize);
00227       ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
00228       pmePencilZ(x,0,0).recvBlock(msg);
00229     }
00230   }
00231 }
00232 
00233 void CudaPmePencilXY::recvDataFromZ(PmeBlockMsg *msg) {
00234   if (msg->dataSize != 0) {
00235     // Buffer is coming from a different node
00236     ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->x, msg->data, (float2 *)fftCompute->getDataDst());
00237   } else {
00238     // Buffer is coming from the same node
00239     // Wait for event that was recorded on the sending pencil
00240     // device ID = deviceBuffers[msg->x].deviceID
00241     // event     = deviceBuffers[msg->x].event
00242     cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
00243 #ifndef P2P_ENABLE_3D
00244     if (deviceBuffers[msg->x].data != NULL && deviceBuffers[msg->x].deviceID != deviceID && !deviceBuffers[msg->x].isPeerDevice) {
00245       // Data is in temporary device buffer, copy it into final fft-buffer
00246       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->x, (float2 *)fftCompute->getDataDst());
00247     }
00248 #endif
00249   }
00250   delete msg;
00251 }
00252 
00253 //###########################################################################
00254 //###########################################################################
00255 //###########################################################################
00256 
00257 void CudaPmePencilX::initialize(CudaPmeXInitMsg *msg) {
00258   pmeGrid = msg->pmeGrid;
00259   pmePencilY = msg->pmePencilY;
00260   yMap = msg->yMap;
00261 
00262   delete msg;
00263 
00264   initBlockSizes();
00265 
00266 }
00267 
00268 CudaPmePencilX::~CudaPmePencilX() {
00269   if (eventCreated) cudaCheck(cudaEventDestroy(event));
00270 }
00271 
00272 //
00273 // CUDA specific initialization
00274 //
00275 void CudaPmePencilX::initializeDevice(InitDeviceMsg *msg) {
00276   // Store device proxy
00277   deviceProxy = msg->deviceProxy;
00278   delete msg;
00279   deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
00280   stream = deviceProxy.ckLocalBranch()->getStream();
00281   CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
00282   // Setup fftCompute and pmeKSpaceCompute
00283   fftCompute = new CudaFFTCompute(deviceID, stream);
00284   pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, thisIndex.y, thisIndex.z, deviceID, stream);  
00285 
00286   // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
00287   cudaCheck(cudaSetDevice(deviceID));
00288   cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
00289   eventCreated = true;
00290 
00291   deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false, NULL));
00292   numDeviceBuffers = 0;
00293 
00294   for (int x=0;x < pmeGrid.xBlocks;x++) {
00295     int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,thisIndex.z));
00296     if (CkNodeOf(pe) == CkMyNode()) {
00297       int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(x, thisIndex.z);
00298       numDeviceBuffers++;
00299       deviceBuffers[x] = DeviceBuffer(deviceID0, false, NULL);
00300       pmePencilY(x,0,thisIndex.z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
00301     }
00302   }
00303 
00304 }
00305 
00306 //
00307 // CUDA specific start
00308 //
00309 void CudaPmePencilX::start() {
00310   recvDeviceBuffers();
00311 }
00312 
00313 //
00314 // Setup direct device buffers
00315 //
00316 void CudaPmePencilX::setDeviceBuffers() {
00317   std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (float2*)0);
00318   for (int x=0;x < pmeGrid.xBlocks;x++) {
00319     if (deviceBuffers[x].data != NULL) {
00320       if (deviceBuffers[x].deviceID == deviceID) {
00321         // Device buffer on same device => directly transpose into destination pencil
00322         dataPtrs[x] = deviceBuffers[x].data;
00323         // Otherwise, when device buffer on different device on same node => transpose locally and then 
00324         // use cudaMemcpy3DPeerAsync to perform the copying
00325       }
00326     }
00327   }
00328   ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsYZX(dataPtrs, (float2 *)fftCompute->getDataDst());
00329 }
00330 
00331 float2* CudaPmePencilX::getData(const int i, const bool sameDevice) {
00332   float2* data;
00333 #ifndef P2P_ENABLE_3D
00334   if (sameDevice) {
00335     int i0, i1, j0, j1, k0, k1;
00336     getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
00337     data = (float2 *)fftCompute->getDataDst() + i0;
00338   } else {
00339     data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
00340   }
00341 #else
00342   int i0, i1, j0, j1, k0, k1;
00343   getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
00344   data = (float2 *)fftCompute->getDataDst() + i0;
00345 #endif
00346   return data;
00347 }
00348 
00349 void CudaPmePencilX::backwardDone() {
00350   deviceProxy[CkMyNode()].gatherForce();
00351 }
00352 
00353 void CudaPmePencilX::forwardDone() {
00354   if (pmeTranspose == NULL)
00355     NAMD_bug("CudaPmePencilX::forwardDone, pmeTranspose not initialized");
00356   if (blockSizes.size() == 0)
00357     NAMD_bug("CudaPmePencilX::forwardDone, blockSizes not initialized");
00358   // Transpose locally
00359   pmeTranspose->transposeXYZtoYZX((float2 *)fftCompute->getDataDst());
00360 
00361   // Send data to y-pencils that share the same z-coordinate. There are pmeGrid.xBlocks of them
00362   // Direct-Device-To-Device communication
00363   if (numDeviceBuffers > 0) {
00364     // Copy data
00365     for (int x=0;x < pmeGrid.xBlocks;x++) {
00366       if (deviceBuffers[x].data != NULL) {
00367         if (deviceBuffers[x].deviceID != deviceID) {
00368           ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(x, deviceBuffers[x].deviceID,
00369             Perm_Y_Z_cX, deviceBuffers[x].data);
00370         }
00371       }
00372     }
00373     // Record event for this pencil
00374     cudaCheck(cudaEventRecord(event, stream));
00375     // Send empty messages
00376     for (int x=0;x < pmeGrid.xBlocks;x++) {
00377       if (deviceBuffers[x].data != NULL) {
00378         PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
00379         msg->dataSize = 0;
00380         msg->x = x;
00381         msg->y = thisIndex.y;
00382         msg->z = thisIndex.z;
00383         msg->doEnergy = doEnergy;
00384         msg->doVirial = doVirial;
00385         msg->lattice  = lattice;
00386         msg->numStrayAtoms = numStrayAtoms;
00387         pmePencilY(x,0,thisIndex.z).recvBlock(msg);     
00388       }
00389     }
00390   }
00391 
00392   // Copy-To-Host communication
00393   for (int x=0;x < pmeGrid.xBlocks;x++) {
00394     if (deviceBuffers[x].data == NULL) {
00395       PmeBlockMsg* msg = new (blockSizes[x], PRIORITY_SIZE) PmeBlockMsg();
00396       msg->dataSize = blockSizes[x];
00397       msg->x = x;
00398       msg->y = thisIndex.y;
00399       msg->z = thisIndex.z;
00400       msg->doEnergy = doEnergy;
00401       msg->doVirial = doVirial;
00402       msg->lattice  = lattice;
00403       msg->numStrayAtoms = numStrayAtoms;
00404       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(x, msg->data, msg->dataSize);
00405       ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
00406       pmePencilY(x,0,thisIndex.z).recvBlock(msg);
00407     }
00408   }
00409 }
00410 
00411 void CudaPmePencilX::recvDataFromY(PmeBlockMsg *msg) {
00412   if (msg->dataSize != 0) {
00413     // Buffer is coming from a different node
00414     ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->x, msg->data, (float2 *)fftCompute->getDataDst());
00415   } else {
00416     // Buffer is coming from the same node
00417     // Wait for event that was recorded on the sending pencil
00418     // device ID = deviceBuffers[msg->x].deviceID
00419     // event     = deviceBuffers[msg->x].event
00420     cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
00421 #ifndef P2P_ENABLE_3D
00422     if (deviceBuffers[msg->x].data != NULL && deviceBuffers[msg->x].deviceID != deviceID) {
00423       // Data is in temporary device buffer, copy it into final fft-buffer
00424       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->x, (float2 *)fftCompute->getDataDst());
00425     }
00426 #endif
00427   }
00428   delete msg;
00429 }
00430 
00431 //###########################################################################
00432 //###########################################################################
00433 //###########################################################################
00434 
00435 void CudaPmePencilY::initialize(CudaPmeXInitMsg *msg) {
00436   pmeGrid = msg->pmeGrid;
00437   pmePencilX = msg->pmePencilX;
00438   pmePencilZ = msg->pmePencilZ;
00439   xMap = msg->xMap;
00440   zMap = msg->zMap;
00441 
00442   delete msg;
00443 
00444   initBlockSizes();
00445 }
00446 
00447 CudaPmePencilY::~CudaPmePencilY() {
00448   if (eventCreated) cudaCheck(cudaEventDestroy(event));
00449 }
00450 
00451 //
00452 // CUDA specific initialization
00453 //
00454 void CudaPmePencilY::initializeDevice(InitDeviceMsg2 *msg) {
00455   // Get device proxy
00456   // CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
00457   deviceID = msg->deviceID;
00458   stream = msg->stream;
00459   CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
00460   delete msg;
00461   // deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
00462   // cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
00463   // CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
00464   // Setup fftCompute and pmeKSpaceCompute
00465   fftCompute = new CudaFFTCompute(deviceID, stream);
00466   pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_Y_Z_cX, thisIndex.z, thisIndex.x, deviceID, stream);
00467 
00468   // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
00469   cudaCheck(cudaSetDevice(deviceID));
00470   cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
00471   eventCreated = true;
00472 
00473   deviceBuffersZ.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false, NULL));
00474   deviceBuffersX.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false, NULL));
00475   numDeviceBuffersZ = 0;
00476   numDeviceBuffersX = 0;
00477 
00478   for (int y=0;y < pmeGrid.yBlocks;y++) {
00479     int pe;
00480     pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x, y, 0));
00481     if (CkNodeOf(pe) == CkMyNode()) {
00482       int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(thisIndex.x, y);
00483       numDeviceBuffersZ++;
00484       deviceBuffersZ[y] = DeviceBuffer(deviceID0, false, NULL);
00485       pmePencilZ(thisIndex.x, y, 0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID), thisProxy);
00486     }
00487     pe = xMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0, y, thisIndex.z));
00488     if (CkNodeOf(pe) == CkMyNode()) {
00489       int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(y, thisIndex.z);
00490       numDeviceBuffersX++;
00491       deviceBuffersX[y] = DeviceBuffer(deviceID0, false, NULL);
00492       pmePencilX(0, y, thisIndex.z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID), thisProxy);
00493     }
00494   }
00495 
00496 }
00497 
00498 //
00499 // CUDA specific start
00500 //
00501 void CudaPmePencilY::start() {
00502   recvDeviceBuffers();
00503 }
00504 
00505 //
00506 // Setup direct device buffers
00507 //
00508 void CudaPmePencilY::setDeviceBuffers() {
00509   std::vector<float2*> dataPtrsYZX(pmeGrid.yBlocks, (float2*)0);
00510   std::vector<float2*> dataPtrsZXY(pmeGrid.yBlocks, (float2*)0);
00511   for (int y=0;y < pmeGrid.yBlocks;y++) {
00512     if (deviceBuffersZ[y].data != NULL) {
00513       if (deviceBuffersZ[y].deviceID == deviceID) {
00514         dataPtrsYZX[y] = deviceBuffersZ[y].data;
00515       }
00516     }
00517     if (deviceBuffersX[y].data != NULL) {
00518       if (deviceBuffersX[y].deviceID == deviceID) {
00519         dataPtrsZXY[y] = deviceBuffersX[y].data;
00520       }
00521     }
00522   }
00523   ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsYZX(dataPtrsYZX, (float2 *)fftCompute->getDataDst());
00524   ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsZXY(dataPtrsZXY, (float2 *)fftCompute->getDataSrc());
00525 }
00526 
00527 float2* CudaPmePencilY::getDataForX(const int i, const bool sameDevice) {
00528   float2* data;
00529 #ifndef P2P_ENABLE_3D
00530   if (sameDevice) {
00531     int i0, i1, j0, j1, k0, k1;
00532     getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
00533     data = (float2 *)fftCompute->getDataSrc() + i0;
00534   } else {
00535     data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
00536   }
00537 #else
00538   int i0, i1, j0, j1, k0, k1;
00539   getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
00540   data = (float2 *)fftCompute->getDataSrc() + i0;
00541 #endif
00542   return data;
00543 }
00544 
00545 float2* CudaPmePencilY::getDataForZ(const int i, const bool sameDevice) {
00546   float2* data;
00547 #ifndef P2P_ENABLE_3D
00548   if (sameDevice) {
00549     int i0, i1, j0, j1, k0, k1;
00550     getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
00551     data = (float2 *)fftCompute->getDataDst() + i0;
00552   } else {
00553     data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
00554   }
00555 #else
00556   int i0, i1, j0, j1, k0, k1;
00557   getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
00558   data = (float2 *)fftCompute->getDataDst() + i0;
00559 #endif
00560   return data;
00561 }
00562 
00563 void CudaPmePencilY::backwardDone() {
00564   // Transpose locally
00565   pmeTranspose->transposeXYZtoZXY((float2 *)fftCompute->getDataSrc());
00566 
00567   // Send data to x-pencils that share the same x-coordinate. There are pmeGrid.yBlocks of them
00568   // Direct-Device-To-Device communication
00569   if (numDeviceBuffersX > 0) {
00570     for (int y=0;y < pmeGrid.yBlocks;y++) {
00571       if (deviceBuffersX[y].data != NULL) {
00572         if (deviceBuffersX[y].deviceID != deviceID) {
00573           ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(y, deviceBuffersX[y].deviceID,
00574             Perm_cX_Y_Z, deviceBuffersX[y].data);
00575         }
00576       }
00577     }
00578     // Record event for this pencil
00579     cudaCheck(cudaEventRecord(event, stream));
00580     // Send empty message
00581     for (int y=0;y < pmeGrid.yBlocks;y++) {
00582       if (deviceBuffersX[y].data != NULL) {
00583         PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
00584         msg->dataSize = 0;
00585         msg->x = thisIndex.x;
00586         msg->y = y;
00587         msg->z = thisIndex.z;
00588         pmePencilX(0,y,thisIndex.z).recvBlock(msg);
00589       }
00590     }
00591   }
00592 
00593   // Copy via host
00594   for (int y=0;y < pmeGrid.yBlocks;y++) {
00595     if (deviceBuffersX[y].data == NULL) {
00596       PmeBlockMsg* msg = new (blockSizes[y], PRIORITY_SIZE) PmeBlockMsg();
00597       msg->dataSize = blockSizes[y];
00598       msg->x = thisIndex.x;
00599       msg->y = y;
00600       msg->z = thisIndex.z;
00601       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(y, msg->data, msg->dataSize);
00602       ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
00603       pmePencilX(0,y,thisIndex.z).recvBlock(msg);
00604     }
00605   }
00606 }
00607 
00608 void CudaPmePencilY::forwardDone() {
00609   if (pmeTranspose == NULL)
00610     NAMD_bug("CudaPmePencilY::forwardDone, pmeTranspose not initialized");
00611   if (blockSizes.size() == 0)
00612     NAMD_bug("CudaPmePencilY::forwardDone, blockSizes not initialized");
00613 
00614   // Transpose locally
00615   pmeTranspose->transposeXYZtoYZX((float2 *)fftCompute->getDataDst());
00616 
00617   // Send data to z-pencils that share the same x-coordinate. There are pmeGrid.yBlocks of them
00618   // Direct-Device-To-Device communication
00619   if (numDeviceBuffersZ > 0) {
00620     for (int y=0;y < pmeGrid.yBlocks;y++) {
00621       if (deviceBuffersZ[y].data != NULL) {
00622         if (deviceBuffersZ[y].deviceID != deviceID) {
00623           ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(y, deviceBuffersZ[y].deviceID,
00624             Perm_Z_cX_Y, deviceBuffersZ[y].data);
00625         }
00626       }
00627     }
00628     // Record event for this pencil
00629     cudaCheck(cudaEventRecord(event, stream));
00630     // Send empty message
00631     for (int y=0;y < pmeGrid.yBlocks;y++) {
00632       if (deviceBuffersZ[y].data != NULL) {
00633         PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
00634         msg->dataSize = 0;
00635         msg->x = thisIndex.x;
00636         msg->y = y;
00637         msg->z = thisIndex.z;
00638         msg->doEnergy = doEnergy;
00639         msg->doVirial = doVirial;
00640         msg->lattice  = lattice;
00641         msg->numStrayAtoms = numStrayAtoms;
00642         pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
00643       }
00644     }
00645   }
00646 
00647   // Copy-To-Host communication
00648   for (int y=0;y < pmeGrid.yBlocks;y++) {
00649     if (deviceBuffersZ[y].data == NULL) {
00650       PmeBlockMsg* msg = new (blockSizes[y], PRIORITY_SIZE) PmeBlockMsg();
00651       msg->dataSize = blockSizes[y];
00652       msg->x = thisIndex.x;
00653       msg->y = y;
00654       msg->z = thisIndex.z;
00655       msg->doEnergy = doEnergy;
00656       msg->doVirial = doVirial;
00657       msg->lattice  = lattice;
00658       msg->numStrayAtoms = numStrayAtoms;
00659       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(y, msg->data, msg->dataSize);
00660       ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
00661       pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
00662     }
00663   }
00664 }
00665 
00666 void CudaPmePencilY::recvDataFromX(PmeBlockMsg *msg) {
00667   if (msg->dataSize != 0) {
00668     // Buffer is coming from a different node
00669     ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->y, msg->data, (float2 *)fftCompute->getDataSrc());
00670   } else {
00671     // Buffer is coming from the same node
00672     // Wait for event that was recorded on the sending pencil
00673     // device ID = deviceBuffersX[msg->y].deviceID
00674     // event     = deviceBuffersX[msg->y].event
00675     cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersX[msg->y].event, 0));
00676 #ifndef P2P_ENABLE_3D
00677     if (deviceBuffersX[msg->y].data != NULL && deviceBuffersX[msg->y].deviceID != deviceID) {
00678       // Data is in temporary device buffer, copy it into final fft-buffer
00679       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->y, (float2 *)fftCompute->getDataSrc());
00680     }
00681 #endif
00682   }
00683   delete msg;
00684 }
00685 
00686 void CudaPmePencilY::recvDataFromZ(PmeBlockMsg *msg) {
00687   if (msg->dataSize != 0) {
00688     // Buffer is coming from a different node
00689     ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->y, msg->data, (float2 *)fftCompute->getDataDst());
00690   } else {
00691     // Buffer is coming from the same node
00692     // Wait for event that was recorded on the sending pencil
00693     // device ID = deviceBuffersZ[msg->y].deviceID
00694     // event     = deviceBuffersZ[msg->y].event
00695     cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersZ[msg->y].event, 0));
00696 #ifndef P2P_ENABLE_3D
00697     if (deviceBuffersZ[msg->y].data != NULL && deviceBuffersZ[msg->y].deviceID != deviceID) {
00698       // Data is in temporary device buffer, copy it into final fft-buffer
00699       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->y, (float2 *)fftCompute->getDataDst());
00700     }
00701 #endif
00702   }
00703   delete msg;
00704 }
00705 
00706 //###########################################################################
00707 //###########################################################################
00708 //###########################################################################
00709 
00710 void CudaPmePencilZ::initialize(CudaPmeXInitMsg *msg) {
00711   useXYslab = false;
00712   pmeGrid = msg->pmeGrid;
00713   pmePencilY = msg->pmePencilY;
00714   yMap = msg->yMap;
00715 
00716   delete msg;
00717 
00718   initBlockSizes();
00719 }
00720 
00721 void CudaPmePencilZ::initialize(CudaPmeXYInitMsg *msg) {
00722   useXYslab = true;
00723   pmeGrid = msg->pmeGrid;
00724   pmePencilXY = msg->pmePencilXY;
00725   xyMap = msg->xyMap;
00726 
00727   delete msg;
00728 
00729   initBlockSizes();
00730 }
00731 
00732 CudaPmePencilZ::~CudaPmePencilZ() {
00733   if (eventCreated) cudaCheck(cudaEventDestroy(event));
00734 }
00735 
00736 //
00737 // CUDA specific initialization
00738 //
00739 void CudaPmePencilZ::initializeDevice(InitDeviceMsg2 *msg) {
00740   // Get device proxy
00741   // CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
00742   deviceID = msg->deviceID;
00743   stream = msg->stream;
00744   CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
00745   delete msg;
00746   // deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
00747   // cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
00748   // CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
00749   // Setup fftCompute and pmeKSpaceCompute
00750   fftCompute = new CudaFFTCompute(deviceID, stream);
00751   pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_Z_cX_Y, thisIndex.x, thisIndex.y, deviceID, stream);
00752   pmeKSpaceCompute = new CudaPmeKSpaceCompute(pmeGrid, Perm_Z_cX_Y, thisIndex.x, thisIndex.y,
00753     ComputeNonbondedUtil::ewaldcof, deviceID, stream);
00754 
00755   // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
00756   cudaCheck(cudaSetDevice(deviceID));
00757   cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
00758   eventCreated = true;
00759 
00760   deviceBuffers.resize(pmeGrid.zBlocks, DeviceBuffer(-1, false, NULL));
00761   numDeviceBuffers = 0;
00762 
00763   if (useXYslab) {
00764     for (int z=0;z < pmeGrid.zBlocks;z++) {
00765       int pe = xyMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,0,z));
00766       if (CkNodeOf(pe) == CkMyNode()) {
00767         int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(0, z);
00768         // Check for Peer-to-Peer access
00769         int canAccessPeer = 0;
00770         if (deviceID != deviceID0) {
00771           cudaCheck(cudaSetDevice(deviceID));
00772           cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
00773         }
00774 #ifdef DISABLE_P2P
00775         canAccessPeer = 0;
00776 #endif
00777         numDeviceBuffers++;
00778         deviceBuffers[z] = DeviceBuffer(deviceID0, canAccessPeer, NULL);
00779         pmePencilXY(0,0,z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
00780       }
00781     }
00782   } else {
00783     for (int z=0;z < pmeGrid.zBlocks;z++) {
00784       int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,0,z));
00785       if (CkNodeOf(pe) == CkMyNode()) {
00786         int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(thisIndex.x, z);
00787         numDeviceBuffers++;
00788         deviceBuffers[z] = DeviceBuffer(deviceID0, false, NULL);
00789         pmePencilY(thisIndex.x,0,z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
00790       }
00791     }
00792   }
00793 
00794 }
00795 
00796 //
00797 // CUDA specific start
00798 //
00799 void CudaPmePencilZ::start() {
00800   recvDeviceBuffers();
00801 }
00802 
00803 void CudaPmePencilZ::setDeviceBuffers() {
00804   std::vector<float2*> dataPtrs(pmeGrid.zBlocks, (float2*)0);
00805   for (int z=0;z < pmeGrid.zBlocks;z++) {
00806     if (deviceBuffers[z].data != NULL) {
00807       if (deviceBuffers[z].deviceID == deviceID || deviceBuffers[z].isPeerDevice) {
00808         dataPtrs[z] = deviceBuffers[z].data;
00809       }
00810     }
00811   }
00812   if (useXYslab) {
00813     ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsYZX(dataPtrs, (float2 *)fftCompute->getDataSrc());
00814   } else {
00815     ((CudaPmeTranspose *)pmeTranspose)->setDataPtrsZXY(dataPtrs, (float2 *)fftCompute->getDataSrc());
00816   }
00817 }
00818 
00819 float2* CudaPmePencilZ::getData(const int i, const bool sameDevice) {
00820   float2* data;
00821 #ifndef P2P_ENABLE_3D
00822   if (sameDevice) {
00823     int i0, i1, j0, j1, k0, k1;
00824     getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
00825     data = (float2 *)fftCompute->getDataSrc() + i0;
00826   } else {
00827     data = ((CudaPmeTranspose *)pmeTranspose)->getBuffer(i);
00828   }
00829 #else
00830   int i0, i1, j0, j1, k0, k1;
00831   getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
00832   data = (float2 *)fftCompute->getDataSrc() + i0;
00833 #endif
00834   return data;
00835 }
00836 
00837 void CudaPmePencilZ::backwardDone() {
00838   // Transpose locally
00839   if (useXYslab) {
00840     pmeTranspose->transposeXYZtoYZX((float2 *)fftCompute->getDataSrc());
00841   } else {
00842     pmeTranspose->transposeXYZtoZXY((float2 *)fftCompute->getDataSrc());   
00843   }
00844 
00845   if (useXYslab) {
00846     // Direct-Device-To-Device communication
00847     if (numDeviceBuffers > 0) {
00848       for (int z=0;z < pmeGrid.zBlocks;z++) {
00849         if (deviceBuffers[z].data != NULL) {
00850           if (deviceBuffers[z].deviceID != deviceID && !deviceBuffers[z].isPeerDevice) {
00851             ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceYZX(z, deviceBuffers[z].deviceID,
00852               Perm_cX_Y_Z, deviceBuffers[z].data);
00853           }
00854         }
00855       }
00856       // Record event for this pencil
00857       cudaCheck(cudaEventRecord(event, stream));
00858       // Send empty message
00859       for (int z=0;z < pmeGrid.zBlocks;z++) {
00860         if (deviceBuffers[z].data != NULL) {
00861           PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
00862           msg->dataSize = 0;
00863           msg->x = thisIndex.x;
00864           msg->y = thisIndex.y;
00865           msg->z = z;
00866           pmePencilXY(0,0,z).recvBlock(msg);
00867         }
00868       }
00869     }
00870 
00871     // Copy-To-Host communication
00872     for (int z=0;z < pmeGrid.zBlocks;z++) {
00873       if (deviceBuffers[z].data == NULL) {
00874         PmeBlockMsg* msg = new (blockSizes[z], PRIORITY_SIZE) PmeBlockMsg();
00875         msg->dataSize = blockSizes[z];
00876         msg->x = thisIndex.x;
00877         msg->y = thisIndex.y;
00878         msg->z = z;
00879         ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(z, msg->data, msg->dataSize);
00880         ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
00881         pmePencilXY(0,0,z).recvBlock(msg);
00882       }
00883     }
00884   } else {
00885     // Send data to y-pencils that share the same x-coordinate. There are pmeGrid.zBlocks of them
00886     // Direct-Device-To-Device communication
00887     if (numDeviceBuffers > 0) {
00888       for (int z=0;z < pmeGrid.zBlocks;z++) {
00889         if (deviceBuffers[z].data != NULL) {
00890           if (deviceBuffers[z].deviceID != deviceID) {
00891             ((CudaPmeTranspose *)pmeTranspose)->copyDataToPeerDeviceZXY(z, deviceBuffers[z].deviceID,
00892               Perm_Y_Z_cX, deviceBuffers[z].data);
00893           }
00894         }
00895       }
00896       // Record event for this pencil
00897       cudaCheck(cudaEventRecord(event, stream));
00898       // Send empty message
00899       for (int z=0;z < pmeGrid.zBlocks;z++) {
00900         if (deviceBuffers[z].data != NULL) {
00901           PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
00902           msg->dataSize = 0;
00903           msg->x = thisIndex.x;
00904           msg->y = thisIndex.y;
00905           msg->z = z;
00906           pmePencilY(thisIndex.x,0,z).recvBlock(msg);
00907         }
00908       }
00909     }
00910 
00911     // Copy-To-Host communication
00912     for (int z=0;z < pmeGrid.zBlocks;z++) {
00913       if (deviceBuffers[z].data == NULL) {
00914         PmeBlockMsg* msg = new (blockSizes[z], PRIORITY_SIZE) PmeBlockMsg();
00915         msg->dataSize = blockSizes[z];
00916         msg->x = thisIndex.x;
00917         msg->y = thisIndex.y;
00918         msg->z = z;
00919         ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToHost(z, msg->data, msg->dataSize);
00920         ((CudaPmeTranspose *)pmeTranspose)->waitStreamSynchronize();
00921         pmePencilY(thisIndex.x,0,z).recvBlock(msg);
00922       }
00923     }
00924   }
00925 
00926   // Submit reductions
00927   ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->energyAndVirialSetCallback(this);
00928   // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->waitEnergyAndVirial();
00929   // submitReductions();
00930 }
00931 
00932 void CudaPmePencilZ::energyAndVirialDone() {
00933   submitReductions();
00934 }
00935 
00936 void CudaPmePencilZ::recvDataFromY(PmeBlockMsg *msg) {
00937   // NOTE: No need to synchronize stream here since memory copies are in the stream
00938   if (msg->dataSize != 0) {
00939     // Buffer is coming from a different node
00940     ((CudaPmeTranspose *)pmeTranspose)->copyDataHostToDevice(msg->z, msg->data, (float2 *)fftCompute->getDataSrc());
00941   } else {
00942     // Buffer is coming from the same node
00943     // Wait for event that was recorded on the sending pencil
00944     // device ID = deviceBuffers[msg->z].deviceID
00945     // event     = deviceBuffers[msg->z].event
00946     cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->z].event, 0));
00947 #ifndef P2P_ENABLE_3D
00948     if (deviceBuffers[msg->z].data != NULL && deviceBuffers[msg->z].deviceID != deviceID && !deviceBuffers[msg->z].isPeerDevice) {
00949       // Data is in temporary device buffer, copy it into final fft-buffer
00950       ((CudaPmeTranspose *)pmeTranspose)->copyDataDeviceToDevice(msg->z, (float2 *)fftCompute->getDataSrc());
00951     }
00952 #endif
00953   }
00954   delete msg;
00955 }
00956 #endif // NAMD_CUDA
00957 
00958 #include "CudaPmeSolver.def.h"

Generated on Tue Sep 19 01:17:12 2017 for NAMD by  doxygen 1.4.7