NAMD
CudaPmeSolver.C
Go to the documentation of this file.
1 #include <iomanip>
2 #include "Node.h"
3 #include "Priorities.h"
4 #include "ComputeNonbondedUtil.h"
5 #include "CudaPmeSolverUtil.h"
6 #include "ComputePmeCUDAMgr.h"
7 #include "ComputePmeCUDAMgr.decl.h"
8 #include "CudaPmeSolver.h"
9 #include "DeviceCUDA.h"
10 
11 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
12 #ifdef WIN32
13 #define __thread __declspec(thread)
14 #endif
15 extern __thread DeviceCUDA *deviceCUDA;
16 //#define DISABLE_P2P
17 
19  pmeGrid = msg->pmeGrid;
20  delete msg;
21 }
22 
23 //
24 // CUDA specific initialization
25 //
27  // Store device proxy
28  deviceProxy = msg->deviceProxy;
29  delete msg;
30  int deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
31  cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
32  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
33  // Setup fftCompute and pmeKSpaceCompute
34  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
35  if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
36  fftComputes[iGrid] = new CudaFFTCompute(deviceID, stream);
37  pmeKSpaceComputes[iGrid] = new CudaPmeKSpaceCompute(pmeGrid, Perm_cX_Y_Z, 0, 0, ComputeNonbondedUtil::ewaldcof, deviceID, stream, iGrid);
38  energyReady[iGrid] = 0;
39  } else {
40  fftComputes[iGrid] = NULL;
41  pmeKSpaceComputes[iGrid] = NULL;
42  energyReady[iGrid] = -1;
43  }
44  }
45 }
46 
47 void CudaPmePencilXYZ::backwardDone() {
48  deviceProxy[CkMyNode()].gatherForce();
49 // ((CudaPmeKSpaceCompute *)pmeKSpaceComputes[0])->energyAndVirialSetCallback(this);
50 // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute2)->energyAndVirialSetCallback(this);
51  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
52  if (pmeKSpaceComputes[iGrid] != NULL)
53  ((CudaPmeKSpaceCompute *)pmeKSpaceComputes[iGrid])->energyAndVirialSetCallback(this);
54  }
55 
56  // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->waitEnergyAndVirial();
57  // submitReductions();
58  // deviceProxy[CkMyNode()].gatherForce();
59 }
60 
//
// Callback fired (per grid) once the energy/virial values computed by the
// k-space kernel are available on the host; forwards them to the reduction
// system. Force gathering was already triggered in backwardDone().
//
void CudaPmePencilXYZ::energyAndVirialDone(unsigned int iGrid) {
  submitReductions(iGrid);
  // deviceProxy[CkMyNode()].gatherForce();
}
65 
66 //###########################################################################
67 //###########################################################################
68 //###########################################################################
69 
71  pmeGrid = msg->pmeGrid;
72  pmePencilZ = msg->pmePencilZ;
73  zMap = msg->zMap;
74 
75  delete msg;
76 
77  initBlockSizes();
78 }
79 
81  if (eventCreated) cudaCheck(cudaEventDestroy(event));
82 }
83 
84 //
85 // CUDA specific initialization
86 //
88  // Store device proxy
89  deviceProxy = msg->deviceProxy;
90  delete msg;
91  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
92  stream = deviceProxy.ckLocalBranch()->getStream();
93  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
94  // Setup fftCompute and pmeKSpaceCompute
95  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
96  if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
97  fftComputes[iGrid] = new CudaFFTCompute(deviceID, stream);
98  pmeTransposes[iGrid] = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, 0, thisIndex.z, deviceID, stream);
99  } else {
100  fftComputes[iGrid] = NULL;
101  pmeTransposes[iGrid] = NULL;
102  }
103  }
104 
105  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false));
106  numDeviceBuffers = 0;
107 
108  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
109  cudaCheck(cudaSetDevice(deviceID));
110  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
111  eventCreated = true;
112 
113 /*
114  bool useMultiGPUfft = true;
115  bool allDeviceOnSameNode = true;
116  for (int x=0;x < pmeGrid.xBlocks;x++) {
117  int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
118  allDeviceOnSameNode &= (CkNodeOf(pe) == CkMyNode());
119  }
120 
121  if (useMultiGPUfft && allDeviceOnSameNode && pmeGrid.xBlocks > 1) {
122  // WARNING: code may be incomplete here!
123  // CHC: Assuming there are two GPUs on the same node and we use:
124  // PMEGridSpacing 2.0
125  // PMEPencilsX 2
126  // PMEPencilsY 1
127  // PMEPencilsZ 1
128  // and running NAMD with all GPUs and two CPU threads,
129  // this "if" statement is satisfied
130 
131 
132  } else {
133 */
134 
135  for (int x=0;x < pmeGrid.xBlocks;x++) {
136  int pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,0));
137  if (CkNodeOf(pe) == CkMyNode()) {
138  // Get device ID on a device on this node
139  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(x, 0);
140  // Check for Peer-to-Peer access
141  int canAccessPeer = 0;
142  if (deviceID != deviceID0) {
143  cudaCheck(cudaSetDevice(deviceID));
144  cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
145 #ifdef DISABLE_P2P
146  canAccessPeer = 0;
147 #endif
148  if (canAccessPeer) {
149  unsigned int flags = 0;
150  cudaCheck(cudaDeviceEnablePeerAccess(deviceID0, flags));
151  // fprintf(stderr, "device %d can access device %d\n", deviceID, deviceID0);
152  }
153  }
154  numDeviceBuffers++;
155  // CHC: I have tried to use deviceID instead of deviceID0, but NAMD still crashes.
156  deviceBuffers[x] = DeviceBuffer(deviceID0, canAccessPeer);
157  pmePencilZ(x,0,0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
158  }
159  }
160 
161  // }
162 
163 }
164 
165 //
166 // CUDA specific start
167 //
void CudaPmePencilXY::start(const CkCallback &cb) {
  // Bounce through our own entry method so that the pending getDeviceBuffer()
  // replies are drained before cb is invoked — presumably relies on the
  // message ordering declared in the .ci file; confirm there if modifying.
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
171 
172 void CudaPmePencilXY::setDeviceBuffers() {
173  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
174 // std::vector<float2*> data2Ptrs(pmeGrid.xBlocks, (float2*)0);
175  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
176  dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.xBlocks, (float2*)0);
177  for (int x=0;x < pmeGrid.xBlocks;x++) {
178  if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
179  if (deviceBuffers[x].deviceID == deviceID || deviceBuffers[x].isPeerDevice) {
180  // Device buffer on same device => directly transpose into destination pencil
181  // dataPtrs[x] = deviceBuffers[x].dataGrid[0];
182  dataPtrsGrid[iGrid][x] = deviceBuffers[x].dataGrid[iGrid];
183  // Otherwise, when device buffer on different device on same node => transpose locally and then
184  // use cudaMemcpy3DPeerAsync to perform the copying
185  // WARNING: code may be incomplete here!
186  }
187  }
188  if (pmeTransposes[iGrid] != NULL) {
189  ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
190  }
191  }
192  }
193 }
194 
195 std::array<float2*, NUM_GRID_MAX> CudaPmePencilXY::getData(const int i, const bool sameDevice) {
196  std::array<float2*, NUM_GRID_MAX> data_grid;
197  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
198  if (fftComputes[iGrid] != NULL) {
199 #ifndef P2P_ENABLE_3D
200  if (sameDevice) {
201  int i0, i1, j0, j1, k0, k1;
202  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
203  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
204  } else {
205  data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
206  }
207 #else
208  int i0, i1, j0, j1, k0, k1;
209  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
210  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
211 #endif
212  } else {
213  data_grid[iGrid] = NULL;
214  }
215  }
216  return data_grid;
217 }
218 
//
// Backward FFT of the XY slab finished: trigger force gathering on the
// node's PME device object.
//
void CudaPmePencilXY::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
}
222 
//
// Forward FFT of the XY slab finished: transpose the result into Z-X-Y order
// and deliver each x-block to the z-pencil that owns it. Same-node blocks go
// device-to-device (followed by a zero-size notification message); all other
// blocks are staged through host memory inside a Charm++ message.
// NOTE(review): unlike CudaPmePencilX/Y::forwardDone, this method has no
// NAMD_bug guards for uninitialized pmeTransposes/blockSizes — consider
// adding them for consistency.
//
void CudaPmePencilXY::forwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally
      pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataDst());
      // Direct Device-To-Device communication within node
      if (numDeviceBuffers > 0) {
        // Copy data to destination devices that are neither this device nor
        // P2P-accessible (those were transposed into place directly).
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            if (deviceBuffers[x].deviceID != deviceID && !deviceBuffers[x].isPeerDevice) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(x, deviceBuffers[x].deviceID,
                Perm_Z_cX_Y, deviceBuffers[x].dataGrid[iGrid]);
            }
          }
        }

        // Record event for this pencil; receivers wait on it with
        // cudaStreamWaitEvent before touching the copied data.
        cudaCheck(cudaEventRecord(event, stream));
        // Send empty message: dataSize == 0 tells the receiver the payload
        // already arrived device-to-device.
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
            msg->dataSize = 0;
            msg->x = x;
            msg->y = thisIndex.y;
            msg->z = thisIndex.z;
            msg->doEnergy = doEnergy;
            msg->doVirial = doVirial;
            msg->lattice = lattice;
            msg->numStrayAtoms = numStrayAtoms;
            msg->grid = iGrid;
            msg->simulationStep = simulationStep;
            pmePencilZ(x,0,0).recvBlock(msg);
          }
        }
      }

      // Copy-Via-Host communication for destinations without a device buffer.
      // waitStreamSynchronize() blocks until the D2H copy lands before the
      // message (which carries the data) is sent.
      for (int x=0;x < pmeGrid.xBlocks;x++) {
        if (deviceBuffers[x].dataGrid[iGrid] == NULL) {
          PmeBlockMsg* msg = new (blockSizes[x], PRIORITY_SIZE) PmeBlockMsg();
          msg->dataSize = blockSizes[x];
          msg->x = x;
          msg->y = thisIndex.y;
          msg->z = thisIndex.z;
          msg->doEnergy = doEnergy;
          msg->doVirial = doVirial;
          msg->lattice = lattice;
          msg->numStrayAtoms = numStrayAtoms;
          msg->simulationStep = simulationStep;
          msg->grid = iGrid;
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataDeviceToHost(x, msg->data, msg->dataSize);
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->waitStreamSynchronize();
          pmePencilZ(x,0,0).recvBlock(msg);
        }
      }
    }
  }
}
283 
//
// Receives one x-block back from a z-pencil during the backward pass.
// dataSize != 0 => payload travelled in the message (host staging);
// dataSize == 0 => payload was already written device-to-device and we only
// need to order our stream after the sender's recorded event.
//
void CudaPmePencilXY::recvDataFromZ(PmeBlockMsg *msg) {
  if (msg->dataSize != 0) {
    // CHC: is the checking of null pointer redundant?
    // Buffer is coming from a different node
    ((CudaPmeTranspose *)(pmeTransposes[msg->grid]))->copyDataHostToDevice(msg->x, msg->data, (float2 *)fftComputes[msg->grid]->getDataDst());
  } else {
    // Buffer is coming from the same node
    // Wait for event that was recorded on the sending pencil
    // device ID = deviceBuffers[msg->x].deviceID
    // event = deviceBuffers[msg->x].event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffers[msg->x].dataGrid[msg->grid] != NULL && deviceBuffers[msg->x].deviceID != deviceID && !deviceBuffers[msg->x].isPeerDevice) {
      // Data is in temporary device buffer, copy it into final fft-buffer
      ((CudaPmeTranspose *)(pmeTransposes[msg->grid]))->copyDataDeviceToDevice(msg->x, (float2 *)fftComputes[msg->grid]->getDataDst());
    }
#endif
  }
  delete msg;
}
304 
305 //###########################################################################
306 //###########################################################################
307 //###########################################################################
308 
310  pmeGrid = msg->pmeGrid;
311  pmePencilY = msg->pmePencilY;
312  yMap = msg->yMap;
313 
314  delete msg;
315 
316  initBlockSizes();
317 
318 }
319 
321  if (eventCreated) cudaCheck(cudaEventDestroy(event));
322 }
323 
324 //
325 // CUDA specific initialization
326 //
328  // Store device proxy
329  deviceProxy = msg->deviceProxy;
330  delete msg;
331  deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
332  stream = deviceProxy.ckLocalBranch()->getStream();
333  CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
334  // Setup fftCompute and pmeKSpaceCompute
335 // fftCompute = new CudaFFTCompute(deviceID, stream);
336 // fftCompute2 = new CudaFFTCompute(deviceID, stream);
337 // pmeTranspose = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, thisIndex.y, thisIndex.z, deviceID, stream);
338 // pmeTranspose2 = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, thisIndex.y, thisIndex.z, deviceID, stream);
339  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
340  if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
341  fftComputes[iGrid] = new CudaFFTCompute(deviceID, stream);
342  pmeTransposes[iGrid] = new CudaPmeTranspose(pmeGrid, Perm_cX_Y_Z, thisIndex.y, thisIndex.z, deviceID, stream);
343  } else {
344  fftComputes[iGrid] = NULL;
345  pmeTransposes[iGrid] = NULL;
346  }
347  }
348 
349  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
350  cudaCheck(cudaSetDevice(deviceID));
351  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
352  eventCreated = true;
353 
354  deviceBuffers.resize(pmeGrid.xBlocks, DeviceBuffer(-1, false));
355  numDeviceBuffers = 0;
356 
357  for (int x=0;x < pmeGrid.xBlocks;x++) {
358  int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(x,0,thisIndex.z));
359  if (CkNodeOf(pe) == CkMyNode()) {
360  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(x, thisIndex.z);
361  numDeviceBuffers++;
362  deviceBuffers[x] = DeviceBuffer(deviceID0, false);
363  pmePencilY(x,0,thisIndex.z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
364  }
365  }
366 
367 }
368 
369 //
370 // CUDA specific start
371 //
void CudaPmePencilX::start(const CkCallback &cb) {
  // Bounce through our own entry method so the outstanding getDeviceBuffer()
  // replies are processed before cb fires — presumably guaranteed by the .ci
  // entry-method ordering; confirm there if modifying.
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
375 
376 //
377 // Setup direct device buffers
378 //
379 void CudaPmePencilX::setDeviceBuffers() {
380  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
381 // std::vector<float2*> dataPtrs(pmeGrid.xBlocks, (float2*)0);
382  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
383  dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.xBlocks, (float2*)0);
384  for (int x=0;x < pmeGrid.xBlocks;x++) {
385  if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
386  if (deviceBuffers[x].deviceID == deviceID) {
387  // Device buffer on same device => directly transpose into destination pencil
388  dataPtrsGrid[iGrid][x] = deviceBuffers[x].dataGrid[iGrid];
389  // Otherwise, when device buffer on different device on same node => transpose locally and then
390  // use cudaMemcpy3DPeerAsync to perform the copying
391  }
392  }
393  }
394  if (pmeTransposes[iGrid] != NULL) {
395  ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
396  }
397  }
398 }
399 
400 std::array<float2*, NUM_GRID_MAX> CudaPmePencilX::getData(const int i, const bool sameDevice) {
401  std::array<float2*, NUM_GRID_MAX> data_grid;
402  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
403  if (fftComputes[iGrid] != NULL) {
404 #ifndef P2P_ENABLE_3D
405  if (sameDevice) {
406  int i0, i1, j0, j1, k0, k1;
407  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
408  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
409  } else {
410  data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
411  }
412 #else
413  int i0, i1, j0, j1, k0, k1;
414  getBlockDim(pmeGrid, Perm_cX_Y_Z, i, 0, 0, i0, i1, j0, j1, k0, k1);
415  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
416 #endif
417  } else {
418  data_grid[iGrid] = NULL;
419  }
420  }
421  return data_grid;
422 }
423 
//
// Backward FFT of this x-pencil finished: trigger force gathering on the
// node's PME device object.
//
void CudaPmePencilX::backwardDone() {
  deviceProxy[CkMyNode()].gatherForce();
}
427 
//
// Forward FFT of this x-pencil finished: transpose the result into Y-Z-X
// order and deliver each block to the y-pencil sharing this z-coordinate.
// Same-node blocks go device-to-device followed by a zero-size notification
// message; all others are staged through host memory inside the message.
//
void CudaPmePencilX::forwardDone() {
  if (pmeTransposes[0] == NULL)
    NAMD_bug("CudaPmePencilX::forwardDone, pmeTranspose not initialized");
  if (blockSizes.size() == 0)
    NAMD_bug("CudaPmePencilX::forwardDone, blockSizes not initialized");
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally
      pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataDst());

      // Send data to y-pencils that share the same z-coordinate. There are pmeGrid.xBlocks of them
      // Direct-Device-To-Device communication
      if (numDeviceBuffers > 0) {
        // Copy data to destination devices other than this one
        // (same-device blocks were transposed into place directly).
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            if (deviceBuffers[x].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(x, deviceBuffers[x].deviceID,
                Perm_Y_Z_cX, deviceBuffers[x].dataGrid[iGrid]);
            }
          }
        }

        // Record event for this pencil; receivers order their stream on it
        // via cudaStreamWaitEvent before reading the copied data.
        cudaCheck(cudaEventRecord(event, stream));
        // Send empty messages: dataSize == 0 tells the receiver the payload
        // already arrived device-to-device.
        for (int x=0;x < pmeGrid.xBlocks;x++) {
          if (deviceBuffers[x].dataGrid[iGrid] != NULL) {
            PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
            msg->dataSize = 0;
            msg->x = x;
            msg->y = thisIndex.y;
            msg->z = thisIndex.z;
            msg->doEnergy = doEnergy;
            msg->doVirial = doVirial;
            msg->lattice = lattice;
            msg->numStrayAtoms = numStrayAtoms;
            msg->simulationStep = simulationStep;
            msg->grid = iGrid;
            pmePencilY(x,0,thisIndex.z).recvBlock(msg);
          }
        }
      }

      // Copy-To-Host communication for destinations without a device buffer.
      // waitStreamSynchronize() blocks until the D2H copy lands before the
      // data-carrying message is sent.
      for (int x=0;x < pmeGrid.xBlocks;x++) {
        if (deviceBuffers[x].dataGrid[iGrid] == NULL) {
          PmeBlockMsg* msg = new (blockSizes[x], PRIORITY_SIZE) PmeBlockMsg();
          msg->dataSize = blockSizes[x];
          msg->x = x;
          msg->y = thisIndex.y;
          msg->z = thisIndex.z;
          msg->doEnergy = doEnergy;
          msg->doVirial = doVirial;
          msg->lattice = lattice;
          msg->numStrayAtoms = numStrayAtoms;
          msg->simulationStep = simulationStep;
          msg->grid = iGrid;
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataDeviceToHost(x, msg->data, msg->dataSize);
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->waitStreamSynchronize();
          pmePencilY(x,0,thisIndex.z).recvBlock(msg);
        }
      }
    }
  }
}
494 
//
// Receives one block back from a y-pencil during the backward pass.
// dataSize != 0 => payload travelled in the message (host staging);
// dataSize == 0 => payload was already written device-to-device and we only
// need to order our stream after the sender's recorded event.
//
void CudaPmePencilX::recvDataFromY(PmeBlockMsg *msg) {
  if (msg->dataSize != 0) {
    // Buffer is coming from a different node
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataHostToDevice(msg->x, msg->data, (float2 *)fftComputes[msg->grid]->getDataDst());
  } else {
    // Buffer is coming from the same node
    // Wait for event that was recorded on the sending pencil
    // device ID = deviceBuffers[msg->x].deviceID
    // event = deviceBuffers[msg->x].event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->x].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffers[msg->x].dataGrid[msg->grid] != NULL && deviceBuffers[msg->x].deviceID != deviceID) {
      // Data is in temporary device buffer, copy it into final fft-buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->x, (float2 *)fftComputes[msg->grid]->getDataDst());
    }
#endif
  }
  delete msg;
}
514 
515 //###########################################################################
516 //###########################################################################
517 //###########################################################################
518 
520  pmeGrid = msg->pmeGrid;
521  pmePencilX = msg->pmePencilX;
522  pmePencilZ = msg->pmePencilZ;
523  xMap = msg->xMap;
524  zMap = msg->zMap;
525 
526  delete msg;
527 
528  initBlockSizes();
529 }
530 
532  if (eventCreated) cudaCheck(cudaEventDestroy(event));
533 }
534 
535 //
536 // CUDA specific initialization
537 //
539  // Get device proxy
540  CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
541  deviceID = msg->deviceID;
542  stream = msg->stream;
543  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
544  delete msg;
545  // deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
546  // cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
547  // CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
548  // Setup fftCompute and pmeKSpaceCompute
549  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
550  if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
551  fftComputes[iGrid] = new CudaFFTCompute(deviceID, stream);
552  pmeTransposes[iGrid] = new CudaPmeTranspose(pmeGrid, Perm_Y_Z_cX, thisIndex.z, thisIndex.x, deviceID, stream);
553  } else {
554  fftComputes[iGrid] = NULL;
555  pmeTransposes[iGrid] = NULL;
556  }
557  }
558 
559  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
560  cudaCheck(cudaSetDevice(deviceID));
561  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
562  eventCreated = true;
563 
564  deviceBuffersZ.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false));
565  deviceBuffersX.resize(pmeGrid.yBlocks, DeviceBuffer(-1, false));
566  numDeviceBuffersZ = 0;
567  numDeviceBuffersX = 0;
568 
569  for (int y=0;y < pmeGrid.yBlocks;y++) {
570  int pe;
571  pe = zMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x, y, 0));
572  if (CkNodeOf(pe) == CkMyNode()) {
573  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilZ(thisIndex.x, y);
574  numDeviceBuffersZ++;
575  deviceBuffersZ[y] = DeviceBuffer(deviceID0, false);
576  pmePencilZ(thisIndex.x, y, 0).getDeviceBuffer(thisIndex.z, (deviceID0 == deviceID), thisProxy);
577  }
578  pe = xMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0, y, thisIndex.z));
579  if (CkNodeOf(pe) == CkMyNode()) {
580  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(y, thisIndex.z);
581  numDeviceBuffersX++;
582  deviceBuffersX[y] = DeviceBuffer(deviceID0, false);
583  pmePencilX(0, y, thisIndex.z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID), thisProxy);
584  }
585  }
586 
587 }
588 
589 //
590 // CUDA specific start
591 //
void CudaPmePencilY::start(const CkCallback &cb) {
  // Bounce through our own entry method so the outstanding getDeviceBuffer()
  // replies (from both x- and z-pencils) are processed before cb fires —
  // presumably guaranteed by the .ci entry-method ordering; confirm there.
  thisProxy[thisIndex].recvDeviceBuffers(cb);
}
595 
596 //
597 // Setup direct device buffers
598 //
599 void CudaPmePencilY::setDeviceBuffers() {
600  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsYZXGrid;
601  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsZXYGrid;
602  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
603  dataPtrsYZXGrid[iGrid] = std::vector<float2*>(pmeGrid.yBlocks, (float2*)0);
604  dataPtrsZXYGrid[iGrid] = std::vector<float2*>(pmeGrid.yBlocks, (float2*)0);
605  for (int y=0;y < pmeGrid.yBlocks;y++) {
606  if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
607  if (deviceBuffersZ[y].deviceID == deviceID) {
608  dataPtrsYZXGrid[iGrid][y] = deviceBuffersZ[y].dataGrid[iGrid];
609  }
610  }
611  if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
612  if (deviceBuffersX[y].deviceID == deviceID) {
613  dataPtrsZXYGrid[iGrid][y] = deviceBuffersX[y].dataGrid[iGrid];
614  }
615  }
616  }
617  if (pmeTransposes[iGrid] != NULL) {
618  ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsYZXGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataDst());
619  ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsZXYGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
620  }
621  }
622 }
623 
624 std::array<float2*, NUM_GRID_MAX> CudaPmePencilY::getDataForX(const int i, const bool sameDevice) {
625  std::array<float2*, NUM_GRID_MAX> data_grid;
626  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
627  if (fftComputes[iGrid] != NULL) {
628 #ifndef P2P_ENABLE_3D
629  if (sameDevice) {
630  int i0, i1, j0, j1, k0, k1;
631  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
632  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
633  } else {
634  data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
635  }
636 #else
637  int i0, i1, j0, j1, k0, k1;
638  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
639  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
640 #endif
641  } else {
642  data_grid[iGrid] = NULL;
643  }
644  }
645  return data_grid;
646 }
647 
648 std::array<float2*, NUM_GRID_MAX> CudaPmePencilY::getDataForZ(const int i, const bool sameDevice) {
649  std::array<float2*, NUM_GRID_MAX> data_grid;
650  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
651  if (fftComputes[iGrid] != NULL) {
652 #ifndef P2P_ENABLE_3D
653  if (sameDevice) {
654  int i0, i1, j0, j1, k0, k1;
655  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
656  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
657  } else {
658  data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
659  }
660 #else
661  int i0, i1, j0, j1, k0, k1;
662  getBlockDim(pmeGrid, Perm_Y_Z_cX, i, 0, 0, i0, i1, j0, j1, k0, k1);
663  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataDst() + i0;
664 #endif
665  } else {
666  data_grid[iGrid] = NULL;
667  }
668  }
669  return data_grid;
670 }
671 
//
// Backward pass through this y-pencil finished: transpose into Z-X-Y order
// and deliver each block to the x-pencil sharing this z-coordinate. Note the
// backward-path messages carry no energy/virial flags — only routing indices,
// grid and step.
//
void CudaPmePencilY::backwardDone() {
  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally
      pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataSrc());

      // Send data to x-pencils that share this z-coordinate. There are pmeGrid.yBlocks of them
      // Direct-Device-To-Device communication
      if (numDeviceBuffersX > 0) {
        // Copy data to destination devices other than this one
        // (same-device blocks were transposed into place directly).
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
            if (deviceBuffersX[y].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(y, deviceBuffersX[y].deviceID,
                Perm_cX_Y_Z, deviceBuffersX[y].dataGrid[iGrid]);
            }
          }
        }
        // Record event for this pencil; receivers wait on it before reading.
        cudaCheck(cudaEventRecord(event, stream));
        // Send empty message (dataSize == 0 => payload went device-to-device)
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersX[y].dataGrid[iGrid] != NULL) {
            PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
            msg->dataSize = 0;
            msg->x = thisIndex.x;
            msg->y = y;
            msg->z = thisIndex.z;
            msg->grid = iGrid;
            msg->simulationStep = simulationStep;
            pmePencilX(0,y,thisIndex.z).recvBlock(msg);
          }
        }
      }

      // Copy via host for destinations without a device buffer;
      // waitStreamSynchronize() ensures the D2H copy is done before sending.
      for (int y=0;y < pmeGrid.yBlocks;y++) {
        if (deviceBuffersX[y].dataGrid[iGrid] == NULL) {
          PmeBlockMsg* msg = new (blockSizes[y], PRIORITY_SIZE) PmeBlockMsg();
          msg->dataSize = blockSizes[y];
          msg->x = thisIndex.x;
          msg->y = y;
          msg->z = thisIndex.z;
          msg->grid = iGrid;
          msg->simulationStep = simulationStep;
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataDeviceToHost(y, msg->data, msg->dataSize);
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->waitStreamSynchronize();
          pmePencilX(0,y,thisIndex.z).recvBlock(msg);
        }
      }
    }
  }
}
724 
//
// Forward pass through this y-pencil finished: transpose into Y-Z-X order and
// deliver each block to the z-pencil sharing this x-coordinate. Same-node
// blocks go device-to-device followed by a zero-size notification message;
// others are staged through host memory inside the message.
//
void CudaPmePencilY::forwardDone() {
  if (pmeTransposes[0] == NULL)
    NAMD_bug("CudaPmePencilY::forwardDone, pmeTranspose not initialized");
  if (blockSizes.size() == 0)
    NAMD_bug("CudaPmePencilY::forwardDone, blockSizes not initialized");

  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
    if (pmeTransposes[iGrid] != NULL) {
      // Transpose locally
      pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataDst());

      // Send data to z-pencils that share the same x-coordinate. There are pmeGrid.yBlocks of them
      // Direct-Device-To-Device communication
      if (numDeviceBuffersZ > 0) {
        // Copy data to destination devices other than this one
        // (same-device blocks were transposed into place directly).
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
            if (deviceBuffersZ[y].deviceID != deviceID) {
              ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(y, deviceBuffersZ[y].deviceID,
                Perm_Z_cX_Y, deviceBuffersZ[y].dataGrid[iGrid]);
            }
          }
        }

        // Record event for this pencil; receivers wait on it before reading.
        cudaCheck(cudaEventRecord(event, stream));
        // Send empty message (dataSize == 0 => payload went device-to-device)
        for (int y=0;y < pmeGrid.yBlocks;y++) {
          if (deviceBuffersZ[y].dataGrid[iGrid] != NULL) {
            PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
            msg->dataSize = 0;
            msg->x = thisIndex.x;
            msg->y = y;
            msg->z = thisIndex.z;
            msg->doEnergy = doEnergy;
            msg->doVirial = doVirial;
            msg->lattice = lattice;
            msg->numStrayAtoms = numStrayAtoms;
            msg->grid = iGrid;
            msg->simulationStep = simulationStep;
            pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
          }
        }
      }

      // Copy-To-Host communication for destinations without a device buffer;
      // waitStreamSynchronize() ensures the D2H copy is done before sending.
      for (int y=0;y < pmeGrid.yBlocks;y++) {
        if (deviceBuffersZ[y].dataGrid[iGrid] == NULL) {
          PmeBlockMsg* msg = new (blockSizes[y], PRIORITY_SIZE) PmeBlockMsg();
          msg->dataSize = blockSizes[y];
          msg->x = thisIndex.x;
          msg->y = y;
          msg->z = thisIndex.z;
          msg->doEnergy = doEnergy;
          msg->doVirial = doVirial;
          msg->lattice = lattice;
          msg->numStrayAtoms = numStrayAtoms;
          msg->grid = iGrid;
          msg->simulationStep = simulationStep;
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataDeviceToHost(y, msg->data, msg->dataSize);
          ((CudaPmeTranspose *)pmeTransposes[iGrid])->waitStreamSynchronize();
          pmePencilZ(thisIndex.x,y,0).recvBlock(msg);
        }
      }
    }
  }
}
791 
//
// Receives one block from an x-pencil during the forward pass.
// dataSize != 0 => payload travelled in the message (host staging) and goes
// into the FFT source array; dataSize == 0 => payload was already written
// device-to-device, so we only order our stream after the sender's event.
//
void CudaPmePencilY::recvDataFromX(PmeBlockMsg *msg) {
  if (msg->dataSize != 0) {
    // Buffer is coming from a different node
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataHostToDevice(msg->y, msg->data, (float2 *)fftComputes[msg->grid]->getDataSrc());
  } else {
    // Buffer is coming from the same node
    // Wait for event that was recorded on the sending pencil
    // device ID = deviceBuffersX[msg->y].deviceID
    // event = deviceBuffersX[msg->y].event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersX[msg->y].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffersX[msg->y].dataGrid[msg->grid] != NULL && deviceBuffersX[msg->y].deviceID != deviceID) {
      // Data is in temporary device buffer, copy it into final fft-buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->y, (float2 *)fftComputes[msg->grid]->getDataSrc());
    }
#endif
  }
  delete msg;
}
811 
//
// Receives one block back from a z-pencil during the backward pass.
// dataSize != 0 => payload travelled in the message (host staging) and goes
// into the FFT destination array; dataSize == 0 => payload was already
// written device-to-device, so we only order our stream after the sender's
// recorded event.
//
void CudaPmePencilY::recvDataFromZ(PmeBlockMsg *msg) {
  if (msg->dataSize != 0) {
    // Buffer is coming from a different node
    ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataHostToDevice(msg->y, msg->data, (float2 *)fftComputes[msg->grid]->getDataDst());
  } else {
    // Buffer is coming from the same node
    // Wait for event that was recorded on the sending pencil
    // device ID = deviceBuffersZ[msg->y].deviceID
    // event = deviceBuffersZ[msg->y].event
    cudaCheck(cudaStreamWaitEvent(stream, deviceBuffersZ[msg->y].event, 0));
#ifndef P2P_ENABLE_3D
    if (deviceBuffersZ[msg->y].dataGrid[msg->grid] != NULL && deviceBuffersZ[msg->y].deviceID != deviceID) {
      // Data is in temporary device buffer, copy it into final fft-buffer
      ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->y, (float2 *)fftComputes[msg->grid]->getDataDst());
    }
#endif
  }
  delete msg;
}
831 
832 //###########################################################################
833 //###########################################################################
834 //###########################################################################
835 
837  useXYslab = false;
838  pmeGrid = msg->pmeGrid;
839  pmePencilY = msg->pmePencilY;
840  yMap = msg->yMap;
841 
842  delete msg;
843 
844  initBlockSizes();
845 }
846 
848  useXYslab = true;
849  pmeGrid = msg->pmeGrid;
850  pmePencilXY = msg->pmePencilXY;
851  xyMap = msg->xyMap;
852 
853  delete msg;
854 
855  initBlockSizes();
856 }
857 
859  if (eventCreated) cudaCheck(cudaEventDestroy(event));
860 }
861 
862 //
863 // CUDA specific initialization
864 //
866  // Get device proxy
867  CProxy_ComputePmeCUDADevice deviceProxy = msg->deviceProxy;
868  deviceID = msg->deviceID;
869  stream = msg->stream;
870  CProxy_ComputePmeCUDAMgr mgrProxy = msg->mgrProxy;
871  delete msg;
872  // deviceID = deviceProxy.ckLocalBranch()->getDeviceID();
873  // cudaStream_t stream = deviceProxy.ckLocalBranch()->getStream();
874  // CProxy_ComputePmeCUDAMgr mgrProxy = deviceProxy.ckLocalBranch()->getMgrProxy();
875  // Setup fftCompute and pmeKSpaceCompute
876  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
877  if (deviceProxy.ckLocalBranch()->isGridEnabled(iGrid) == true) {
878  fftComputes[iGrid] = new CudaFFTCompute(deviceID, stream);
879  pmeTransposes[iGrid] = new CudaPmeTranspose(pmeGrid, Perm_Z_cX_Y, thisIndex.x, thisIndex.y, deviceID, stream);
880  pmeKSpaceComputes[iGrid] = new CudaPmeKSpaceCompute(pmeGrid, Perm_Z_cX_Y, thisIndex.x, thisIndex.y, ComputeNonbondedUtil::ewaldcof, deviceID, stream, iGrid);
881  energyReady[iGrid] = 0;
882  } else {
883  fftComputes[iGrid] = NULL;
884  pmeTransposes[iGrid] = NULL;
885  pmeKSpaceComputes[iGrid] = NULL;
886  energyReady[iGrid] = -1;
887  }
888  }
889 
890  // Create event. NOTE: Events are tied to devices, hence the cudaSetDevice() here
891  cudaCheck(cudaSetDevice(deviceID));
892  cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
893  eventCreated = true;
894 
895  deviceBuffers.resize(pmeGrid.zBlocks, DeviceBuffer(-1, false));
896  numDeviceBuffers = 0;
897 
898  if (useXYslab) {
899  for (int z=0;z < pmeGrid.zBlocks;z++) {
900  int pe = xyMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(0,0,z));
901  if (CkNodeOf(pe) == CkMyNode()) {
902  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilX(0, z);
903  // Check for Peer-to-Peer access
904  int canAccessPeer = 0;
905  if (deviceID != deviceID0) {
906  cudaCheck(cudaSetDevice(deviceID));
907  cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, deviceID0));
908  }
909 #ifdef DISABLE_P2P
910  canAccessPeer = 0;
911 #endif
912  numDeviceBuffers++;
913  deviceBuffers[z] = DeviceBuffer(deviceID0, canAccessPeer);
914  pmePencilXY(0,0,z).getDeviceBuffer(thisIndex.x, (deviceID0 == deviceID) || canAccessPeer, thisProxy);
915  }
916  }
917  } else {
918  for (int z=0;z < pmeGrid.zBlocks;z++) {
919  int pe = yMap.ckLocalBranch()->procNum(0, CkArrayIndex3D(thisIndex.x,0,z));
920  if (CkNodeOf(pe) == CkMyNode()) {
921  int deviceID0 = mgrProxy.ckLocalBranch()->getDeviceIDPencilY(thisIndex.x, z);
922  numDeviceBuffers++;
923  deviceBuffers[z] = DeviceBuffer(deviceID0, false);
924  pmePencilY(thisIndex.x,0,z).getDeviceBuffer(thisIndex.y, (deviceID0 == deviceID), thisProxy);
925  }
926  }
927  }
928 
929 }
930 
931 //
932 // CUDA specific start
933 //
934 void CudaPmePencilZ::start(const CkCallback &cb) {
935  thisProxy[thisIndex].recvDeviceBuffers(cb);
936 }
937 
938 void CudaPmePencilZ::setDeviceBuffers() {
939  std::array<std::vector<float2*>, NUM_GRID_MAX> dataPtrsGrid;
940  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
941  dataPtrsGrid[iGrid] = std::vector<float2*>(pmeGrid.zBlocks, (float2*)0);
942  for (int z=0;z < pmeGrid.zBlocks;z++) {
943  if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
944  if (deviceBuffers[z].deviceID == deviceID || deviceBuffers[z].isPeerDevice) {
945  dataPtrsGrid[iGrid][z] = deviceBuffers[z].dataGrid[iGrid];
946  }
947  }
948  }
949  if (useXYslab) {
950  if (pmeTransposes[iGrid] != NULL) {
951  ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsYZX(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
952  }
953  } else {
954  if (pmeTransposes[iGrid] != NULL) {
955  ((CudaPmeTranspose *)pmeTransposes[iGrid])->setDataPtrsZXY(dataPtrsGrid[iGrid], (float2 *)fftComputes[iGrid]->getDataSrc());
956  }
957  }
958  }
959 }
960 
961 std::array<float2*, NUM_GRID_MAX> CudaPmePencilZ::getData(const int i, const bool sameDevice) {
962  std::array<float2*, NUM_GRID_MAX> data_grid;
963  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
964  if (fftComputes[iGrid] != NULL) {
965 #ifndef P2P_ENABLE_3D
966  if (sameDevice) {
967  int i0, i1, j0, j1, k0, k1;
968  getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
969  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
970  } else {
971  data_grid[iGrid] = ((CudaPmeTranspose *)pmeTransposes[iGrid])->getBuffer(i);
972  }
973 #else
974  int i0, i1, j0, j1, k0, k1;
975  getBlockDim(pmeGrid, Perm_Z_cX_Y, i, 0, 0, i0, i1, j0, j1, k0, k1);
976  data_grid[iGrid] = (float2 *)fftComputes[iGrid]->getDataSrc() + i0;
977 #endif
978  } else {
979  data_grid[iGrid] = NULL;
980  }
981  }
982  return data_grid;
983 }
984 
985 void CudaPmePencilZ::backwardDone() {
986  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
987  if (pmeTransposes[iGrid] != NULL) {
988  // Transpose locally
989  if (useXYslab) {
990  pmeTransposes[iGrid]->transposeXYZtoYZX((float2 *)fftComputes[iGrid]->getDataSrc());
991  } else {
992  pmeTransposes[iGrid]->transposeXYZtoZXY((float2 *)fftComputes[iGrid]->getDataSrc());
993  }
994 
995  if (useXYslab) {
996  // Direct-Device-To-Device communication
997  if (numDeviceBuffers > 0) {
998  for (int z=0;z < pmeGrid.zBlocks;z++) {
999  if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
1000  if (deviceBuffers[z].deviceID != deviceID && !deviceBuffers[z].isPeerDevice) {
1001  ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceYZX(z, deviceBuffers[z].deviceID,
1002  Perm_cX_Y_Z, deviceBuffers[z].dataGrid[iGrid]);
1003  }
1004  }
1005  }
1006  // Record event for this pencil
1007  cudaCheck(cudaEventRecord(event, stream));
1008  // Send empty message
1009  for (int z=0;z < pmeGrid.zBlocks;z++) {
1010  if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
1011  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
1012  msg->dataSize = 0;
1013  msg->x = thisIndex.x;
1014  msg->y = thisIndex.y;
1015  msg->z = z;
1016  msg->grid = iGrid;
1017  msg->simulationStep = simulationStep;
1018  pmePencilXY(0,0,z).recvBlock(msg);
1019  }
1020  }
1021  }
1022 
1023  // Copy-To-Host communication
1024  for (int z=0;z < pmeGrid.zBlocks;z++) {
1025  if (deviceBuffers[z].dataGrid[iGrid] == NULL) {
1026  PmeBlockMsg* msg = new (blockSizes[z], PRIORITY_SIZE) PmeBlockMsg();
1027  msg->dataSize = blockSizes[z];
1028  msg->x = thisIndex.x;
1029  msg->y = thisIndex.y;
1030  msg->z = z;
1031  msg->grid = iGrid;
1032  msg->simulationStep = simulationStep;
1033  ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataDeviceToHost(z, msg->data, msg->dataSize);
1034  ((CudaPmeTranspose *)pmeTransposes[iGrid])->waitStreamSynchronize();
1035  pmePencilXY(0,0,z).recvBlock(msg);
1036  }
1037  }
1038  } else {
1039  // Send data to y-pencils that share the same x-coordinate. There are pmeGrid.zBlocks of them
1040  // Direct-Device-To-Device communication
1041  if (numDeviceBuffers > 0) {
1042  for (int z=0;z < pmeGrid.zBlocks;z++) {
1043  if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
1044  if (deviceBuffers[z].deviceID != deviceID) {
1045  ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataToPeerDeviceZXY(z, deviceBuffers[z].deviceID,
1046  Perm_Y_Z_cX, deviceBuffers[z].dataGrid[iGrid]);
1047  }
1048  }
1049  }
1050  // Record event for this pencil
1051  cudaCheck(cudaEventRecord(event, stream));
1052  // Send empty message
1053  for (int z=0;z < pmeGrid.zBlocks;z++) {
1054  if (deviceBuffers[z].dataGrid[iGrid] != NULL) {
1055  PmeBlockMsg* msg = new (0, PRIORITY_SIZE) PmeBlockMsg();
1056  msg->dataSize = 0;
1057  msg->x = thisIndex.x;
1058  msg->y = thisIndex.y;
1059  msg->z = z;
1060  msg->grid = iGrid;
1061  msg->simulationStep = simulationStep;
1062  pmePencilY(thisIndex.x,0,z).recvBlock(msg);
1063  }
1064  }
1065  }
1066 
1067  // Copy-To-Host communication
1068  for (int z=0;z < pmeGrid.zBlocks;z++) {
1069  if (deviceBuffers[z].dataGrid[iGrid] == NULL) {
1070  PmeBlockMsg* msg = new (blockSizes[z], PRIORITY_SIZE) PmeBlockMsg();
1071  msg->dataSize = blockSizes[z];
1072  msg->x = thisIndex.x;
1073  msg->y = thisIndex.y;
1074  msg->z = z;
1075  msg->grid = iGrid;
1076  msg->simulationStep = simulationStep;
1077  ((CudaPmeTranspose *)pmeTransposes[iGrid])->copyDataDeviceToHost(z, msg->data, msg->dataSize);
1078  ((CudaPmeTranspose *)pmeTransposes[iGrid])->waitStreamSynchronize();
1079  pmePencilY(thisIndex.x,0,z).recvBlock(msg);
1080  }
1081  }
1082  }
1083 
1084  // Submit reductions
1085  ((CudaPmeKSpaceCompute *)pmeKSpaceComputes[iGrid])->energyAndVirialSetCallback(this);
1086  // ((CudaPmeKSpaceCompute *)pmeKSpaceCompute)->waitEnergyAndVirial();
1087  // submitReductions();
1088  }
1089  }
1090 }
1091 
1092 void CudaPmePencilZ::energyAndVirialDone(unsigned int iGrid) {
1093  submitReductions(iGrid);
1094 }
1095 
1096 void CudaPmePencilZ::recvDataFromY(PmeBlockMsg *msg) {
1097  // NOTE: No need to synchronize stream here since memory copies are in the stream
1098  if (msg->dataSize != 0) {
1099  // Buffer is coming from a different node
1100  ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataHostToDevice(msg->z, msg->data, (float2 *)fftComputes[msg->grid]->getDataSrc());
1101  } else {
1102  // Buffer is coming from the same node
1103  // Wait for event that was recorded on the sending pencil
1104  // device ID = deviceBuffers[msg->z].deviceID
1105  // event = deviceBuffers[msg->z].event
1106  cudaCheck(cudaStreamWaitEvent(stream, deviceBuffers[msg->z].event, 0));
1107 #ifndef P2P_ENABLE_3D
1108  if (deviceBuffers[msg->z].dataGrid[0] != NULL && deviceBuffers[msg->z].deviceID != deviceID && !deviceBuffers[msg->z].isPeerDevice) {
1109  // Data is in temporary device buffer, copy it into final fft-buffer
1110  ((CudaPmeTranspose *)pmeTransposes[msg->grid])->copyDataDeviceToDevice(msg->z, (float2 *)fftComputes[msg->grid]->getDataSrc());
1111  }
1112 #endif
1113  }
1114  delete msg;
1115 }
1116 #endif // NAMD_CUDA
1117 
1118 #include "CudaPmeSolver.def.h"
bool doEnergy
Definition: PmeSolver.h:130
CProxy_PmePencilXMap zMap
Definition: CudaPmeSolver.h:21
void initialize(CudaPmeXInitMsg *msg)
void initialize(CudaPmeXYInitMsg *msg)
Definition: CudaPmeSolver.C:70
CProxy_PmePencilXYMap xyMap
Definition: CudaPmeSolver.h:22
CProxy_ComputePmeCUDADevice deviceProxy
Definition: CudaPmeSolver.h:54
CProxy_CudaPmePencilZ pmePencilZ
Definition: CudaPmeSolver.h:35
CProxy_PmePencilXMap xMap
Definition: CudaPmeSolver.h:36
void energyAndVirialDone(unsigned int iGrid)
CProxy_CudaPmePencilX pmePencilX
Definition: CudaPmeSolver.h:33
float2 * data
Definition: PmeSolver.h:127
const unsigned int NUM_GRID_MAX
Definition: PmeSolverUtil.h:9
void initializeDevice(InitDeviceMsg2 *msg)
CProxy_ComputePmeCUDADevice deviceProxy
Definition: CudaPmeSolver.h:44
void initialize(CudaPmeXInitMsg *msg)
int dataSize
Definition: PmeSolver.h:128
#define PRIORITY_SIZE
Definition: Priorities.h:13
void initializeDevice(InitDeviceMsg *msg)
Definition: CudaPmeSolver.C:87
CProxy_PmePencilXMap zMap
Definition: CudaPmeSolver.h:38
void initializeDevice(InitDeviceMsg *msg)
int numStrayAtoms
Definition: PmeSolver.h:131
void NAMD_bug(const char *err_msg)
Definition: common.C:195
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
void initializeDevice(InitDeviceMsg2 *msg)
CProxy_CudaPmePencilZ pmePencilZ
Definition: CudaPmeSolver.h:20
void energyAndVirialDone(unsigned int iGrid)
Definition: CudaPmeSolver.C:61
cudaStream_t stream
Definition: CudaPmeSolver.h:52
unsigned int grid
Definition: PmeSolver.h:133
void initialize(CudaPmeXYZInitMsg *msg)
Definition: CudaPmeSolver.C:18
CProxy_CudaPmePencilY pmePencilY
Definition: CudaPmeSolver.h:34
void initialize(CudaPmeXInitMsg *msg)
bool doVirial
Definition: PmeSolver.h:130
void initializeDevice(InitDeviceMsg *msg)
Definition: CudaPmeSolver.C:26
CProxy_CudaPmePencilXY pmePencilXY
Definition: CudaPmeSolver.h:19
CProxy_ComputePmeCUDAMgr mgrProxy
Definition: CudaPmeSolver.h:53
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
int simulationStep
Definition: PmeSolver.h:134
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
CProxy_PmePencilXMap yMap
Definition: CudaPmeSolver.h:37
Lattice lattice
Definition: PmeSolver.h:132