NAMD
ComputePmeCUDAMgr.C
1 #include <vector>
2 #include <numeric>
3 #include <algorithm>
4 #include "Node.h"
5 #include "PatchMap.h"
6 #include "HomePatch.h"
7 #include "WorkDistrib.h"
8 #include "Priorities.h"
9 #include "PatchData.h"
10 #include "CudaUtils.h"
11 
12 #include "SimParameters.h"
13 #include "CudaPmeSolverUtil.h"
14 
15 #include "ComputePmeCUDAMgr.h"
16 
17 #include "CudaPmeSolver.h"
18 #include "ComputePmeCUDA.h"
19 
20 #include "DeviceCUDA.h"
21 #define MIN_DEBUG_LEVEL 4
22 //#define DEBUGM
23 #include "Debug.h"
24 
25 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
26 #ifdef WIN32
27 #define __thread __declspec(thread)
28 #endif
29 extern __thread DeviceCUDA *deviceCUDA;
30 
31 void createStream(cudaStream_t& stream) {
32 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
33  int leastPriority, greatestPriority;
34  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
35  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority));
36  // cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,leastPriority));
37 #else
38  cudaCheck(cudaStreamCreate(&stream));
39 #endif
40 }
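
A minimal usage sketch of this helper (illustrative only, not part of the file; assumes <cuda_runtime.h> and the cudaCheck macro from CudaUtils.h are in scope):

  cudaStream_t s;
  createStream(s);                  // maximum-priority stream on the current device
  // ... enqueue PME kernels and copies on s ...
  cudaCheck(cudaStreamDestroy(s));  // the owner destroys the stream when done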
41 
42 //
43 // CUDA implementation of atom storage
44 //
45 class CudaPmeAtomStorage : public PmeAtomStorage {
46 public:
47  CudaPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
48  ~CudaPmeAtomStorage() {
49  if (atom != NULL) dealloc_((void *)atom);
50  if (atomIndex != NULL) dealloc_((void *)atomIndex);
51  if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
52  if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
53  if (alchOn) {
54  for (unsigned int i = 0; i < totalFactorArrays; ++i) {
55  if (atomElecFactorArrays[i] != NULL) dealloc_((void *)(atomElecFactorArrays[i]));
56  if (overflowAtomElecFactorArrays[i] != NULL) dealloc_((void *)(overflowAtomElecFactorArrays[i]));
57  }
58  }
59  }
60 private:
61 
62  // Allocate array of size bytes
63  void* alloc_(const size_t size) {
64  void* p;
65  cudaCheck(cudaMallocHost(&p, size));
66  return p;
67  }
68 
69  // Deallocate array
70  void dealloc_(void *p) {
71  cudaCheck(cudaFreeHost(p));
72  }
73 
74 };
75 
76 //
77 // CPU implementation of atom storage
78 //
79 class CpuPmeAtomStorage : public PmeAtomStorage {
80 public:
81  CpuPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
82  ~CpuPmeAtomStorage() {
83  if (atom != NULL) dealloc_((void *)atom);
84  if (atomIndex != NULL) dealloc_((void *)atomIndex);
85  if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
86  if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
87  if (alchOn) {
88  for (unsigned int i = 0; i < totalFactorArrays; ++i) {
89  if (atomElecFactorArrays[i] != NULL) dealloc_((void *)(atomElecFactorArrays[i]));
90  if (overflowAtomElecFactorArrays[i] != NULL) dealloc_((void *)(overflowAtomElecFactorArrays[i]));
91  }
92  }
93  }
94 private:
95 
96  // Allocate array of size bytes
97  void* alloc_(const size_t size) {
98  return (void *)(new char[size]);
99  }
100 
101  // Deallocate array
102  void dealloc_(void *p) {
103  delete [] (char *)p;
104  }
105 
106 };
107 
108 PmeAtomFiler::PmeAtomFiler() {
109  for (int p=0;p < 10;p++) {
110  pencil[p] = NULL;
111  pencilCapacity[p] = 0;
112  }
113 }
114 PmeAtomFiler::PmeAtomFiler(CkMigrateMessage *m) {
115  for (int p=0;p < 10;p++) {
116  pencil[p] = NULL;
117  pencilCapacity[p] = 0;
118  }
119 }
120 PmeAtomFiler::~PmeAtomFiler() {
121  for (int p=0;p < 10;p++) {
122  if (pencil[p] != NULL) delete [] pencil[p];
123  }
124 }
125 
126 //
127 // File atoms into PME pencils. Each atom can belong to at most 9 pencils
128 // (oy, oz) = origin of the atoms
129 // (miny, minz) = grid minimum corner for this patch
130 // NOTE: This method can only be called locally from the same Pe
131 //
132 void PmeAtomFiler::fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
133  const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi) {
134  DebugM(2, "PmeAtomFiler::fileAtoms\n" << endi);
135  // Make sure there's enough room in the pencil arrays
136  for (int p=0;p < 10;p++) {
137  if (pencil[p] != NULL && pencilCapacity[p] < numAtoms) {
138  delete [] pencil[p];
139  pencil[p] = NULL;
140  pencilCapacity[p] = 0;
141  }
142  if (pencil[p] == NULL) {
143  int newSize = (int)(numAtoms*1.5);
144  pencil[p] = new int[newSize];
145  pencilCapacity[p] = newSize;
146  }
147  pencilSize[p] = 0;
148  }
149 
150  const float recip11 = lattice.a_r().x;
151  const float recip22 = lattice.b_r().y;
152  const float recip33 = lattice.c_r().z;
153  const int order1 = pmeGrid.order - 1;
154  const int K1 = pmeGrid.K1;
155  const int K2 = pmeGrid.K2;
156  const int K3 = pmeGrid.K3;
157  const int yBlocks = pmeGrid.yBlocks;
158  const int zBlocks = pmeGrid.zBlocks;
159 
160  for (int i=0;i < numAtoms;i++) {
161  float frx, fry, frz;
162  // PmeRealSpaceCompute::calcGridCoord(atoms[i].uix, atoms[i].uiy, atoms[i].uiz,
163  // K1, K2, K3, frx, fry, frz);
164  PmeRealSpaceCompute::calcGridCoord(atoms[i].x, atoms[i].y, atoms[i].z, K1, K2, K3, frx, fry, frz);
165  // Charge is spread in the region [y0 ... y0+order-1] x [z0 ... z0+order-1]
166  int y0 = (int)fry;
167  int z0 = (int)frz;
168  if (y0 < 0 || y0 >= K2 || z0 < 0 || z0 >= K3) {
169  // Add to "Stray pencil" and skip to next atom
170  pencil[9][pencilSize[9]++] = i;
171  continue;
172  // fprintf(stderr, "%lf %lf %lf\n", atoms[i].x, atoms[i].y, atoms[i].z);
173  // NAMD_bug("PmeAtomFiler::fileAtoms, charge out of bounds");
174  }
175  // Calculate pencil index for the four corners of the order X order area
176  // The corners determine the pencil indices for this atom.
177  int occupied = 0;
178  int plist[4];
179 #pragma unroll
180  for (int j=0;j < 4;j++) {
181 
182  int py = getPencilIndexY(pmeGrid, (y0 + (j%2)*order1) % K2) - pencilIndexY;
183  if (py < ylo) py += yBlocks;
184  if (py > yhi) py -= yBlocks;
185 
186  int pz = getPencilIndexZ(pmeGrid, (z0 + (j/2)*order1) % K3) - pencilIndexZ;
187  if (pz < zlo) pz += zBlocks;
188  if (pz > zhi) pz -= zBlocks;
189 
190  if (py < ylo || py > yhi || pz < zlo || pz > zhi) {
191  // Add to "Stray pencil" and skip to next atom
192  pencil[9][pencilSize[9]++] = i;
193  goto breakjloop;
194  // fprintf(stderr, "py %d [%d ... %d] pz %d [%d ... %d]\n", pz, zlo, zhi);
195  // NAMD_bug("PmeAtomFiler::fileAtoms, py,pz outside bounds");
196  }
197  // p = 0,1,2,3,4,5,6,7,8 (maximum range)
198  plist[j] = (py-ylo) + (pz-zlo)*3;
199  }
200 
201 #pragma unroll
202  for (int j=0;j < 4;j++) {
203  int p = plist[j];
204  // pbit = 1, 2, 4, 8, 16, 32, 64, 128, 256
205  int pbit = (1 << p);
206  if (!(occupied & pbit)) {
207  pencil[p][pencilSize[p]++] = i;
208  occupied |= pbit;
209  }
210  }
211 
212 breakjloop:
213  continue;
214  }
215 
216 }
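
The corner loop above files each atom into at most four distinct pencils of its 3x3 neighborhood; the following standalone sketch (illustrative only, with made-up plist values) isolates the bitmask de-duplication it uses:

  #include <cstdio>

  int main() {
    int plist[4] = {0, 1, 3, 1};   // hypothetical pencil slots for the 4 corners
    int occupied = 0;              // bit p is set once the atom is filed into slot p
    for (int j = 0; j < 4; j++) {
      int pbit = 1 << plist[j];
      if (!(occupied & pbit)) {    // slot 1 appears twice but is filed only once
        std::printf("file atom into pencil slot %d\n", plist[j]);
        occupied |= pbit;
      }
    }
    return 0;
  }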
217 
218 //
219 // Class constructor
220 //
221 ComputePmeCUDAMgr::ComputePmeCUDAMgr() {
222  __sdag_init();
223  numDevices = 0;
224  numTotalPatches = 0;
225  numNodesContributed = 0;
226  numDevicesMax = 0;
227 }
228 
229 //
230 // Class constructor
231 //
232 ComputePmeCUDAMgr::ComputePmeCUDAMgr(CkMigrateMessage *m) {
233  __sdag_init();
234  NAMD_bug("ComputePmeCUDAMgr cannot be migrated");
235  numDevices = 0;
236  numTotalPatches = 0;
237  numNodesContributed = 0;
238  numDevicesMax = 0;
239 }
240 
241 //
242 // Class destructor
243 //
244 ComputePmeCUDAMgr::~ComputePmeCUDAMgr() {
245  for (int i=0;i < extraDevices.size();i++) {
246  cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
247  cudaCheck(cudaStreamDestroy(extraDevices[i].stream));
248  }
249 }
250 
251 //
252 // Returns home pencil (homey, homez)
253 // Home pencil = pencil with most overlap with this patch
254 //
255 void ComputePmeCUDAMgr::getHomePencil(PatchID patchID, int& homey, int& homez) {
256  PatchMap *patchMap = PatchMap::Object();
257 
258  BigReal miny = patchMap->min_b(patchID);
259  BigReal maxy = patchMap->max_b(patchID);
260 
261  BigReal minz = patchMap->min_c(patchID);
262  BigReal maxz = patchMap->max_c(patchID);
263 
264  // Determine home pencil = pencil with most overlap
265 
266  // Calculate patch grid coordinates
267  int patch_y0 = floor((miny+0.5)*pmeGrid.K2);
268  int patch_y1 = floor((maxy+0.5)*pmeGrid.K2)-1;
269  int patch_z0 = floor((minz+0.5)*pmeGrid.K3);
270  int patch_z1 = floor((maxz+0.5)*pmeGrid.K3)-1;
271 
272  if (patch_y0 < 0 || patch_y1 >= pmeGrid.K2 || patch_z0 < 0 || patch_z1 >= pmeGrid.K3) {
273  NAMD_bug("ComputePmeCUDAMgr::getHomePencil, patch bounds are outside grid bounds");
274  }
275 
276  int maxOverlap = 0;
277  homey = -1;
278  homez = -1;
279  for (int iz=0;iz < pmeGrid.zBlocks;iz++) {
280  for (int iy=0;iy < pmeGrid.yBlocks;iy++) {
281  int pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1;
282  getPencilDim(pmeGrid, Perm_X_Y_Z, iy, iz,
283  pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1);
284 
285  if (pencil_y1 - pencil_y0 < pmeGrid.order || pencil_z1 - pencil_z0 < pmeGrid.order)
286  NAMD_bug("ComputePmeCUDAMgr::getHomePencil, pencil size must be >= PMEInterpOrder");
287 
288  int y0 = std::max(patch_y0, pencil_y0);
289  int y1 = std::min(patch_y1, pencil_y1);
290  int z0 = std::max(patch_z0, pencil_z0);
291  int z1 = std::min(patch_z1, pencil_z1);
292 
293  int overlap = (y1-y0 > 0 && z1-z0 > 0) ? (y1-y0)*(z1-z0) : -1;
294 
295  if (overlap > maxOverlap) {
296  maxOverlap = overlap;
297  homey = iy;
298  homez = iz;
299  }
300  }
301  }
302 
303  if (homey == -1 || homez == -1)
304  NAMD_bug("ComputePmeCUDAMgr::getHomePencil, home pencil not found");
305 }
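
A worked example of the overlap metric above (standalone illustration with made-up patch and pencil bounds): a patch spanning grid lines [10,25] in y and [4,9] in z against a pencil covering [16,31] x [0,7] clips to [16,25] x [4,7] and scores (25-16)*(7-4) = 27.

  #include <algorithm>
  #include <cstdio>

  int main() {
    int patch_y0 = 10, patch_y1 = 25, patch_z0 = 4, patch_z1 = 9;
    int pen_y0 = 16, pen_y1 = 31, pen_z0 = 0, pen_z1 = 7;
    int y0 = std::max(patch_y0, pen_y0), y1 = std::min(patch_y1, pen_y1);
    int z0 = std::max(patch_z0, pen_z0), z1 = std::min(patch_z1, pen_z1);
    int overlap = (y1 - y0 > 0 && z1 - z0 > 0) ? (y1 - y0) * (z1 - z0) : -1;
    std::printf("overlap = %d\n", overlap);  // prints 27
    return 0;
  }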
306 
307 //
308 // Calculates maximum number of PME pencils
309 //
310 void ComputePmeCUDAMgr::restrictToMaxPMEPencils() {
311  PatchMap *patchMap = PatchMap::Object();
312  SimParameters *simParams = Node::Object()->simParameters;
313  // need to initialize with Lattice from SimParameters
314  Lattice lattice = simParams->lattice;
315  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
316  BigReal sysdimc = lattice.c_r().unit() * lattice.c();
317  BigReal cutoff = simParams->cutoff;
318  BigReal patchdim = simParams->patchDimension;
319  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
320  BigReal marginc = 0.5 * ( patchdim - cutoff ) / sysdimc;
321  int numPatches = patchMap->numPatches();
322 
323  pmeGrid.xBlocks = std::min(pmeGrid.xBlocks, pmeGrid.K1);
324 
325  int pid = 0;
326  while (pid < numPatches) {
327  // Get home pencil
328  int homey, homez;
329  getHomePencil(pid, homey, homez);
330  // Y
331  {
332  BigReal miny = patchMap->min_b(pid);
333  BigReal maxy = patchMap->max_b(pid);
334  // min2 (max2) is smallest (largest) grid line for this patch
335  int min2 = ((int) floor(pmeGrid.K2 * (miny+0.5 - marginb)));
336  int max2 = ((int) floor(pmeGrid.K2 * (maxy+0.5 + marginb))) + (pmeGrid.order - 1);
337  // Restrict grid lines to [0 ... pmeGrid.K2-1]
338  if (min2 < 0) min2 += pmeGrid.K2;
339  if (max2 >= pmeGrid.K2) max2 -= pmeGrid.K2;
340  // Get pencil indices for the grid lines
341  int min2pi = getPencilIndexY(pmeGrid, min2);
342  int max2pi = getPencilIndexY(pmeGrid, max2);
343  // Distance from home pencil
344  int dmin2pi = homey - min2pi;
345  if (dmin2pi < 0) dmin2pi += pmeGrid.yBlocks;
346  if (dmin2pi < 0)
347  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin2pi");
348  // If distance is > 1, must decrease the number of y-pencils and try again
349  if (dmin2pi > 1) {
350  pmeGrid.yBlocks--;
351  if (pmeGrid.yBlocks <= 0) break;
352  continue;
353  }
354  int dmax2pi = max2pi - homey;
355  if (dmax2pi < 0) dmax2pi += pmeGrid.yBlocks;
356  if (dmax2pi < 0)
357  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax2pi");
358  // If distance is > 1, must decrease the number of y-pencils and try again
359  if (dmax2pi > 1) {
360  pmeGrid.yBlocks--;
361  if (pmeGrid.yBlocks <= 0) break;
362  continue;
363  }
364  }
365 
366  // Z
367  {
368  BigReal minz = patchMap->min_c(pid);
369  BigReal maxz = patchMap->max_c(pid);
370  // min3 (max3) is smallest (largest) grid line for this patch
371  int min3 = ((int) floor(pmeGrid.K3 * (minz+0.5 - marginc)));
372  int max3 = ((int) floor(pmeGrid.K3 * (maxz+0.5 + marginc))) + (pmeGrid.order - 1);
373  // Restrict grid lines to [0 ... pmeGrid.K3-1]
374  if (min3 < 0) min3 += pmeGrid.K3;
375  if (max3 >= pmeGrid.K3) max3 -= pmeGrid.K3;
376  // Get pencil indices for the grid lines
377  int min3pi = getPencilIndexZ(pmeGrid, min3);
378  int max3pi = getPencilIndexZ(pmeGrid, max3);
379  // Distance from home pencil
380  int dmin3pi = homez - min3pi;
381  if (dmin3pi < 0) dmin3pi += pmeGrid.zBlocks;
382  if (dmin3pi < 0)
383  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin3pi");
384  // If distance is > 1, must decrease the number of z-pencils and try again
385  if (dmin3pi > 1) {
386  pmeGrid.zBlocks--;
387  if (pmeGrid.zBlocks <= 0) break;
388  continue;
389  }
390  int dmax3pi = max3pi - homez;
391  if (dmax3pi < 0) dmax3pi += pmeGrid.zBlocks;
392  if (dmax3pi < 0)
393  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax3pi");
394  // If distance is > 1, must decrease the number of z-pencils and try again
395  if (dmax3pi > 1) {
396  pmeGrid.zBlocks--;
397  if (pmeGrid.zBlocks <= 0) break;
398  continue;
399  }
400  }
401 
402  pid++;
403  }
404 
405  // if (CkMyNode() == 0)
406  // fprintf(stderr, "pmeGrid.yBlocks %d pmeGrid.zBlocks %d\n", pmeGrid.yBlocks, pmeGrid.zBlocks);
407 
408  if (pmeGrid.xBlocks <= 0 || pmeGrid.yBlocks <= 0 || pmeGrid.zBlocks <= 0)
409  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, zero PME pencils found");
410 
411  if (pmeGrid.xBlocks > pmeGrid.K1 || pmeGrid.yBlocks > pmeGrid.K2 || pmeGrid.zBlocks > pmeGrid.K3)
412  NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, unable to restrict number of PME pencils");
413 }
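
The restriction hinges on a periodic "distance from the home pencil"; a standalone sketch of that wrap-around difference (illustrative only, not NAMD code):

  #include <cstdio>

  // Wrapped difference a - b modulo nBlocks, matching dmin2pi/dmax2pi above.
  int wrappedDiff(int a, int b, int nBlocks) {
    int d = a - b;
    if (d < 0) d += nBlocks;
    return d;
  }

  int main() {
    // With 4 y-pencils, a patch whose lowest grid line falls in pencil 3 while its
    // home pencil is 0 is distance 1 away once wrapped, so 4 pencils would be allowed.
    std::printf("%d\n", wrappedDiff(0, 3, 4));  // homey=0, min2pi=3 -> prints 1
    return 0;
  }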
414 
415 //
416 // Sets up pencils. May be called multiple times.
417 //
418 void ComputePmeCUDAMgr::setupPencils() {
419  SimParameters *simParams = Node::Object()->simParameters;
420  DebugM(4, "ComputePmeCUDAMgr::setupPencils\n"<< endi);
421  pmeGrid.K1 = simParams->PMEGridSizeX;
422  pmeGrid.K2 = simParams->PMEGridSizeY;
423  pmeGrid.K3 = simParams->PMEGridSizeZ;
424  pmeGrid.order = simParams->PMEInterpOrder;
425  pmeGrid.dim2 = pmeGrid.K2;
426  pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
427 
428  // Count the total number of devices assuming all nodes have the same number as this node
429  // NOTE: This should be changed in the future to support heterogeneous nodes!!!
430  int numDevicesTmp = deviceCUDA->getNumDevice();
431 
432  int numDeviceTot = CkNumNodes() * numDevicesTmp;
433  // Use approximately 1/4th of the devices for PME
434  // int numDeviceToUse = std::max(1, numDeviceTot/4);
435  int numDeviceToUse = 1;// THERE CAN BE ONLY 1
436  // there is a correctness problem in the multi-pencil case, so don't
437  // go there until that is fixed and we have a better multi-device
438  // parallelization.
439 
440  DebugM(4, "ComputePmeCUDAMgr::setupPencils numDeviceToUse "<<numDeviceToUse<< " numDeviceTot "<< numDeviceTot <<"\n"<< endi);
441  if (numDeviceToUse < 4) {
442  // 2D Slab
443  pmeGrid.yBlocks = 1;
444  pmeGrid.xBlocks = pmeGrid.zBlocks = numDeviceToUse;
445  } else {
446  // 1D Pencil
447  pmeGrid.yBlocks = (int)sqrt((double)numDeviceToUse);
448  pmeGrid.zBlocks = numDeviceToUse/pmeGrid.yBlocks;
449  pmeGrid.xBlocks = pmeGrid.zBlocks;
450  }
451 
452  if ( simParams->PMEPencilsX > 0 ) pmeGrid.xBlocks = simParams->PMEPencilsX;
453  if ( simParams->PMEPencilsY > 0 ) pmeGrid.yBlocks = simParams->PMEPencilsY;
454  if ( simParams->PMEPencilsZ > 0 ) pmeGrid.zBlocks = simParams->PMEPencilsZ;
455 
456  // Restrict number of pencils to the maximum number
457  restrictToMaxPMEPencils();
458 
459  // Fix pencil numbers if they don't make sense w.r.t. number of devices
460  if (pmeGrid.yBlocks == 1) {
461  // 2D Slab
462  if (pmeGrid.xBlocks > numDeviceTot) pmeGrid.xBlocks = numDeviceTot;
463  if (pmeGrid.zBlocks > numDeviceTot) pmeGrid.zBlocks = numDeviceTot;
464  } else {
465  // 1D Pencil
466  if (pmeGrid.yBlocks*pmeGrid.zBlocks > numDeviceTot ||
467  pmeGrid.xBlocks*pmeGrid.zBlocks > numDeviceTot ||
468  pmeGrid.xBlocks*pmeGrid.yBlocks > numDeviceTot) {
469  pmeGrid.yBlocks = std::min(pmeGrid.yBlocks, (int)sqrt((double)numDeviceTot));
470  pmeGrid.zBlocks = std::min(pmeGrid.zBlocks, numDeviceTot/pmeGrid.yBlocks);
471  }
472  pmeGrid.xBlocks = std::min(pmeGrid.yBlocks, pmeGrid.zBlocks);
473  }
474 
475  // Here (block1, block2, block3) define the size of charge grid pencil in each direction
476  pmeGrid.block1 = ( pmeGrid.K1 + pmeGrid.xBlocks - 1 ) / pmeGrid.xBlocks;
477  pmeGrid.block2 = ( pmeGrid.K2 + pmeGrid.yBlocks - 1 ) / pmeGrid.yBlocks;
478  pmeGrid.block3 = ( pmeGrid.K3 + pmeGrid.zBlocks - 1 ) / pmeGrid.zBlocks;
479 
480  // Determine type of FFT
481  if (pmeGrid.xBlocks == 1 && pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1) {
482  // Single block => 3D FFT
483  pmePencilType = 3;
484  iout << iINFO << "Use 3D box decomposition in PME FFT.\n" << endi;
485  } else if (pmeGrid.yBlocks == 1) {
486  // Blocks in all but y-dimension => 2D FFT
487  pmePencilType = 2;
488  iout << iINFO << "Use 2D slab decomposition in PME FFT.\n" << endi;
489  } else {
490  // Blocks in all dimensions => 1D FFT
491  pmePencilType = 1;
492  iout << iINFO << "Use 1D pencil decomposition in PME FFT.\n" << endi;
493  }
494 
495  //--------------------------------------------------------------------------
496  // Map pencils into Pes
497  xPes.resize(pmeGrid.yBlocks*pmeGrid.zBlocks);
498 
499  if (pmePencilType == 1 || pmePencilType == 2) {
500  zPes.resize(pmeGrid.xBlocks*pmeGrid.yBlocks);
501  }
502  if (pmePencilType == 1) {
503  yPes.resize(pmeGrid.xBlocks*pmeGrid.zBlocks);
504  }
505 
506  // i % numDeviceTot = device index
507  // (i % numDeviceTot)/deviceCUDA->getNumDevice() = node index
508  // (i % CkNodeSize(node)) = pe displacement
509  for (int i=0;i < xPes.size();i++) {
510  int node = (i % numDeviceTot)/numDevicesTmp;
511  xPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
512  }
513  for (int i=0;i < yPes.size();i++) {
514  int node = (i % numDeviceTot)/numDevicesTmp;
515  yPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
516  }
517  for (int i=0;i < zPes.size();i++) {
518  int node = (i % numDeviceTot)/numDevicesTmp;
519  zPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
520  }
521 
522  // char peStr[256];
523  // char *p = peStr;
524  // p += sprintf(p, "%2d | xPes", CkMyPe());
525  // for (int i=0;i < xPes.size();i++)
526  // p += sprintf(p, " %d", xPes[i]);
527  // p += sprintf(p, " yPes");
528  // for (int i=0;i < yPes.size();i++)
529  // p += sprintf(p, " %d", yPes[i]);
530  // p += sprintf(p, " zPes");
531  // for (int i=0;i < zPes.size();i++)
532  // p += sprintf(p, " %d", zPes[i]);
533  // fprintf(stderr, "%s | %d %d\n",peStr, CkNodeFirst(CkMyNode()), CkNodeSize(CkMyNode()));
534 
535  //--------------------------------------------------------------------------
536  // Build global node list for x-pencils
537  nodeDeviceList.resize(xPes.size());
538  numDevices = 0;
539  for (int k=0;k < xPes.size();k++) {
540  nodeDeviceList[k].node = CkNodeOf(xPes[k]);
541  nodeDeviceList[k].device = -1;
542  if (nodeDeviceList[k].node == CkMyNode()) {
543  nodeDeviceList[k].device = numDevices++;
544  }
545  }
546 
547  ijPencilX.clear();
548  ijPencilY.clear();
549  ijPencilZ.clear();
550 
551  // Construct list of pencil coordinates (i,j) for each device held by this node
552  for (int k=0;k < xPes.size();k++) {
553  if (CkMyNode() == CkNodeOf(xPes[k])) {
554  IJ ij;
555  ij.i = k % pmeGrid.yBlocks;
556  ij.j = k / pmeGrid.yBlocks;
557  ijPencilX.push_back(ij);
558  }
559  }
560  if (ijPencilX.size() != numDevices)
561  NAMD_bug("ComputePmeCUDAMgr::setupPencils, error setting up x-pencils and devices");
562 
563  int numDevicesY = 0;
564  for (int k=0;k < yPes.size();k++) {
565  if (CkMyNode() == CkNodeOf(yPes[k])) {
566  IJ ij;
567  ij.i = k % pmeGrid.xBlocks;
568  ij.j = k / pmeGrid.xBlocks;
569  ijPencilY.push_back(ij);
570  numDevicesY++;
571  }
572  }
573 
574  int numDevicesZ = 0;
575  for (int k=0;k < zPes.size();k++) {
576  if (CkMyNode() == CkNodeOf(zPes[k])) {
577  IJ ij;
578  ij.i = k % pmeGrid.xBlocks;
579  ij.j = k / pmeGrid.xBlocks;
580  ijPencilZ.push_back(ij);
581  numDevicesZ++;
582  }
583  }
584 }
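
The pencil-to-Pe assignment above can be mimicked with plain integers; a standalone sketch (CkNodeFirst/CkNodeSize replaced by toy stand-ins, all values made up):

  #include <cstdio>

  int main() {
    const int numNodes = 2, devicesPerNode = 2, pesPerNode = 4;
    const int numDeviceTot = numNodes * devicesPerNode;
    const int numPencils = 6;
    for (int k = 0; k < numPencils; k++) {
      int node = (k % numDeviceTot) / devicesPerNode;  // node owning pencil k
      int pe   = node * pesPerNode + k % pesPerNode;   // stand-in for CkNodeFirst(node) + k % CkNodeSize(node)
      std::printf("pencil %d -> node %d, pe %d\n", k, node, pe);
    }
    return 0;
  }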
585 
586 //
587 // Returns true if PE "pe" is used in PME
588 //
589 bool ComputePmeCUDAMgr::isPmePe(int pe) {
590  for (int i=0;i < xPes.size();i++) {
591  if (pe == xPes[i]) return true;
592  }
593  return false;
594 }
595 
596 //
597 // Returns true if node "node" is used for PME
598 //
599 bool ComputePmeCUDAMgr::isPmeNode(int node) {
600  for (int i=0;i < nodeDeviceList.size();i++) {
601  if (nodeDeviceList[i].node == node) {
602  return true;
603  }
604  }
605  return false;
606 }
607 
608 //
609 // Returns true if device "deviceID" is used for PME
610 //
611 bool ComputePmeCUDAMgr::isPmeDevice(int deviceID) {
612  for (int i=0;i < nodeDeviceList.size();i++) {
613  if (deviceCUDA->getDeviceIDbyRank(nodeDeviceList[i].device % deviceCUDA->getNumDevice()) == deviceID) {
614  return true;
615  }
616  }
617  return false;
618 }
619 
620 //
621 // Initialize compute manager
622 // This gets called on one Pe on each node from Node::startup()
623 //
624 void ComputePmeCUDAMgr::initialize(CkQdMsg *msg) {
625  if (msg != NULL) delete msg;
626  DebugM(4, "ComputePmeCUDAMgr::initialize(CkQdMsg)\n" << endi);
627  setupPencils();
628 
629  if ( ! CkMyNode() ) {
630  iout << iINFO << "PME using " << pmeGrid.xBlocks << " x " <<
631  pmeGrid.yBlocks << " x " << pmeGrid.zBlocks <<
632  " pencil grid for FFT and reciprocal sum.\n" << endi;
633  }
634 
635  // Initialize list that contains the number of home patches for each device on this manager
636  numHomePatchesList.resize(numDevices, 0);
637 
638  //--------------------------------------------------------------------------
639  // Create devices and atom filer
640  // numDevices = number of devices we'll be using, possibly different on each node
641  // Use root node to compute the maximum number of devices in use over all nodes
642  thisProxy[0].initializeDevicesAndAtomFiler(new NumDevicesMsg(numDevices));
643  //--------------------------------------------------------------------------
644 
645  if (CkMyNode() == 0) {
646 
647  if (pmePencilType == 3) {
648  // Single block => 3D FFT
649  CProxy_PmePencilXYZMap xyzMap = CProxy_PmePencilXYZMap::ckNew(xPes[0]);
650  CkArrayOptions xyzOpts(1);
651  xyzOpts.setMap(xyzMap);
652  xyzOpts.setAnytimeMigration(false);
653  xyzOpts.setStaticInsertion(true);
654  pmePencilXYZ = CProxy_CudaPmePencilXYZ::ckNew(xyzOpts);
655  pmePencilXYZ[0].initialize(new CudaPmeXYZInitMsg(pmeGrid));
656  thisProxy.recvPencils(pmePencilXYZ);
657  } else if (pmePencilType == 2) {
658  // Blocks in all but y-dimension => 2D FFT
659  CProxy_PmePencilXYMap xyMap = CProxy_PmePencilXYMap::ckNew(xPes);
660  CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
661  CkArrayOptions xyOpts(1, 1, pmeGrid.zBlocks);
662  CkArrayOptions zOpts(pmeGrid.xBlocks, 1, 1);
663  xyOpts.setMap(xyMap);
664  zOpts.setMap(zMap);
665  xyOpts.setAnytimeMigration(false);
666  zOpts.setAnytimeMigration(false);
667  xyOpts.setStaticInsertion(true);
668  zOpts.setStaticInsertion(true);
669  pmePencilXY = CProxy_CudaPmePencilXY::ckNew(xyOpts);
670  pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
671  // Send pencil proxies to other nodes
672  thisProxy.recvPencils(pmePencilXY, pmePencilZ);
673  pmePencilXY.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
674  pmePencilZ.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
675  } else {
676  // Blocks in all dimensions => 1D FFT
677  CProxy_PmePencilXMap xMap = CProxy_PmePencilXMap::ckNew(1, 2, pmeGrid.yBlocks, xPes);
678  CProxy_PmePencilXMap yMap = CProxy_PmePencilXMap::ckNew(0, 2, pmeGrid.xBlocks, yPes);
679  CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
680  CkArrayOptions xOpts(1, pmeGrid.yBlocks, pmeGrid.zBlocks);
681  CkArrayOptions yOpts(pmeGrid.xBlocks, 1, pmeGrid.zBlocks);
682  CkArrayOptions zOpts(pmeGrid.xBlocks, pmeGrid.yBlocks, 1);
683  xOpts.setMap(xMap);
684  yOpts.setMap(yMap);
685  zOpts.setMap(zMap);
686  xOpts.setAnytimeMigration(false);
687  yOpts.setAnytimeMigration(false);
688  zOpts.setAnytimeMigration(false);
689  xOpts.setStaticInsertion(true);
690  yOpts.setStaticInsertion(true);
691  zOpts.setStaticInsertion(true);
692  pmePencilX = CProxy_CudaPmePencilX::ckNew(xOpts);
693  pmePencilY = CProxy_CudaPmePencilY::ckNew(yOpts);
694  pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
695  // Send pencil proxies to other nodes
696  thisProxy.recvPencils(pmePencilX, pmePencilY, pmePencilZ);
697  pmePencilX.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
698  pmePencilY.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
699  pmePencilZ.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
700  }
701  }
702 
703 }
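
For reference, the chare-array shapes created above are: the 3D box case uses a single XYZ pencil element; the 2D slab case uses XY pencils of shape (1, 1, zBlocks) and Z pencils of shape (xBlocks, 1, 1); the 1D pencil case uses X pencils (1, yBlocks, zBlocks), Y pencils (xBlocks, 1, zBlocks), and Z pencils (xBlocks, yBlocks, 1).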
704 
705 void ComputePmeCUDAMgr::createDevicesAndAtomFiler() {
706  if (CkMyNode() != 0)
707  NAMD_bug("ComputePmeCUDAMgr::createDevicesAndAtomFiler can only be called on root node");
708 
709  // Root node creates all device proxies
710  // NOTE: Only root node has numDevicesMax
711  RecvDeviceMsg* msg = new (numDevicesMax, PRIORITY_SIZE) RecvDeviceMsg();
712  msg->numDevicesMax = numDevicesMax;
713  for (int i=0;i < numDevicesMax;i++) {
714  CProxy_ComputePmeCUDADevice dev = CProxy_ComputePmeCUDADevice::ckNew();
715  memcpy(&msg->dev[i], &dev, sizeof(CProxy_ComputePmeCUDADevice));
716  }
717  thisProxy.recvDevices(msg);
718 
719  CProxy_PmeAtomFiler filer = CProxy_PmeAtomFiler::ckNew();
720  thisProxy.recvAtomFiler(filer);
721 
722 }
723 
724 void ComputePmeCUDAMgr::recvAtomFiler(CProxy_PmeAtomFiler filer) {
725  pmeAtomFiler = filer;
726 }
727 
728 void ComputePmeCUDAMgr::recvDevices(RecvDeviceMsg* msg) {
729 
730  numDevicesMax = msg->numDevicesMax;
731  DebugM(4, "ComputePmeCUDAMgr::recvDevices() numDevicesMax " << numDevicesMax <<"\n"<< endi);
732  if (numDevices > numDevicesMax)
733  NAMD_bug("ComputePmeCUDAMgr::recvDevices, numDevices > numDevicesMax");
734  deviceProxy.resize(numDevices);
735  for (int i=0;i < numDevices;i++) {
736  deviceProxy[i] = msg->dev[i];
737  }
738  delete msg;
739 }
740 
741 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXYZ xyz) {
742  pmePencilXYZ = xyz;
743 }
744 
745 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z) {
746  pmePencilXY = xy;
747  pmePencilZ = z;
748 }
749 
750 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z) {
751  pmePencilX = x;
752  pmePencilY = y;
753  pmePencilZ = z;
754 }
755 
756 //
757 // Initialize pencils on this node
758 // This gets called on one rank on each node
759 //
760 void ComputePmeCUDAMgr::initialize_pencils(CkQdMsg *msg) {
761  if (msg != NULL) delete msg;
762 
763  int numDevicesTmp = deviceCUDA->getNumDevice();
764  DebugM(4, "ComputePmeCUDAMgr::initialize_pencils() numDevicesTmp " << numDevicesTmp <<"\n"<< endi);
765  // Initialize device proxies for real-space interfacing
766  for (int i=0;i < ijPencilX.size();i++) {
767  // NOTE: i is here the device ID
768  int deviceID = deviceCUDA->getDeviceIDbyRank(i % numDevicesTmp);
769  deviceProxy[i].ckLocalBranch()->initialize(pmeGrid, ijPencilX[i].i, ijPencilX[i].j,
770  deviceID, pmePencilType, thisProxy, pmeAtomFiler);
771  if (pmePencilType == 1) {
772  deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilX);
773  } else if (pmePencilType == 2) {
774  deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXY);
775  } else {
776  deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXYZ);
777  }
778  }
779 
780  // Use above initialized device proxies for the PME pencils that interface with real-space
781  for (int i=0;i < ijPencilX.size();i++) {
782  if (pmePencilType == 1) {
783  pmePencilX(0, ijPencilX[i].i, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
784  } else if (pmePencilType == 2) {
785  pmePencilXY(0, 0, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
786  } else {
787  pmePencilXYZ[0].initializeDevice(new InitDeviceMsg(deviceProxy[i]));
788  }
789  }
790 
791  // Create extra devices for Y and Z pencils if necessary
792  int n = std::max(ijPencilY.size(), ijPencilZ.size());
793  if (n > ijPencilX.size()) {
794  int nextra = n - ijPencilX.size();
795  extraDevices.resize(nextra);
796  for (int i=0;i < nextra;i++) {
797  extraDevices[i].deviceID = deviceCUDA->getDeviceIDbyRank((i + ijPencilX.size()) % numDevicesTmp);
798  cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
799  createStream(extraDevices[i].stream);
800  }
801  }
802 
803  // Initialize Y pencils
804  for (int i=0;i < ijPencilY.size();i++) {
805  int deviceID;
806  cudaStream_t stream;
807  if (i < ijPencilX.size()) {
808  deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
809  stream = deviceProxy[i].ckLocalBranch()->getStream();
810  } else {
811  deviceID = extraDevices[i-ijPencilX.size()].deviceID;
812  stream = extraDevices[i-ijPencilX.size()].stream;
813  }
814  pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy, deviceProxy[i]));
815  }
816 
817  // Initialize Z pencils
818  for (int i=0;i < ijPencilZ.size();i++) {
819  int deviceID;
820  cudaStream_t stream;
821  if (i < ijPencilX.size()) {
822  deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
823  stream = deviceProxy[i].ckLocalBranch()->getStream();
824  } else {
825  deviceID = extraDevices[i-ijPencilX.size()].deviceID;
826  stream = extraDevices[i-ijPencilX.size()].stream;
827  }
828  pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy, deviceProxy[i]));
829  }
830 
831 }
832 
833 //
834 // Activate (start) pencils
835 // This gets called on rank 0 Pe on each node
836 //
837 void ComputePmeCUDAMgr::activate_pencils(CkQdMsg *msg) {
838  if (msg != NULL) delete msg;
839 
840  for (int device=0;device < numDevices;device++) {
841  deviceProxy[device].ckLocalBranch()->activate_pencils();
842  }
843 
844  for (int i=0;i < ijPencilY.size();i++) {
845  PmeStartMsg* pmeStartYMsg = new PmeStartMsg();
846  for (unsigned iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
847  pmeStartYMsg->dataGrid[iGrid] = NULL;
848  pmeStartYMsg->dataSizes[iGrid] = 0;
849  pmeStartYMsg->enabledGrid[iGrid] = deviceProxy[0].ckLocalBranch()->isGridEnabled(iGrid);
850  }
851  pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).start(pmeStartYMsg);
852  }
853 
854  for (int i=0;i < ijPencilZ.size();i++) {
855  PmeStartMsg* pmeStartZMsg = new PmeStartMsg();
856  for (unsigned iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
857  pmeStartZMsg->dataGrid[iGrid] = NULL;
858  pmeStartZMsg->dataSizes[iGrid] = 0;
859  pmeStartZMsg->enabledGrid[iGrid] = deviceProxy[0].ckLocalBranch()->isGridEnabled(iGrid);
860  }
861  pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).start(pmeStartZMsg);
862  }
863 
864 }
865 
866 //
867 // Returns node that contains x-pencil i,j
868 //
869 int ComputePmeCUDAMgr::getNode(int i, int j) {
870  if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
871  NAMD_bug("ComputePmeCUDAMgr::getNode, pencil index out of bounds");
872  int ind = i + j*pmeGrid.yBlocks;
873  return nodeDeviceList[ind].node;
874 }
875 
876 //
877 // Returns home node for a patch
878 //
879 int ComputePmeCUDAMgr::getHomeNode(PatchID patchID) {
880  int homey, homez;
881  getHomePencil(patchID, homey, homez);
882  return getNode(homey, homez);
883 }
884 
885 //
886 // Returns device index on this node that contains x-pencil i,j
887 //
888 int ComputePmeCUDAMgr::getDevice(int i, int j) {
889  if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
890  NAMD_bug("ComputePmeCUDAMgr::getDevice, pencil index out of bounds");
891  int ind = i + j*pmeGrid.yBlocks;
892  int device = nodeDeviceList[ind].device;
893  if (device == -1)
894  NAMD_bug("ComputePmeCUDAMgr::getDevice, no device found");
895  return device;
896 }
897 
898 //
899 // Returns device index on this node that contains y-pencil i,j
900 //
901 int ComputePmeCUDAMgr::getDevicePencilY(int i, int j) {
902  if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.zBlocks)
903  NAMD_bug("ComputePmeCUDAMgr::getDevicePencilY, pencil index out of bounds");
904  for (int device=0;device < ijPencilY.size();device++) {
905  if (ijPencilY[device].i == i && ijPencilY[device].j == j) return device;
906  }
907  char str[256];
908  sprintf(str, "ComputePmeCUDAMgr::getDevicePencilY, no device found at i %d j %d",i,j);
909  NAMD_bug(str);
910  return -1;
911 }
912 
913 //
914 // Returns device index on this node that contains z-pencil i,j
915 //
916 int ComputePmeCUDAMgr::getDevicePencilZ(int i, int j) {
917  if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.yBlocks)
918  NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, pencil index out of bounds");
919  for (int device=0;device < ijPencilZ.size();device++) {
920  if (ijPencilZ[device].i == i && ijPencilZ[device].j == j) return device;
921  }
922  NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, no device found");
923  return -1;
924 }
925 
926 //
927 // Returns device ID on this node that contains x-pencil i,j
928 //
929 int ComputePmeCUDAMgr::getDeviceIDPencilX(int i, int j) {
930  int device = getDevice(i, j);
931  return deviceProxy[device].ckLocalBranch()->getDeviceID();
932 }
933 
934 //
935 // Returns device ID on this node that contains y-pencil i,j
936 //
937 int ComputePmeCUDAMgr::getDeviceIDPencilY(int i, int j) {
938  int device = getDevicePencilY(i, j);
939  return deviceProxy[device].ckLocalBranch()->getDeviceID();
940 }
941 
942 //
943 // Returns device ID on this node that contains z-pencil i,j
944 //
945 int ComputePmeCUDAMgr::getDeviceIDPencilZ(int i, int j) {
946  int device = getDevicePencilZ(i, j);
947  return deviceProxy[device].ckLocalBranch()->getDeviceID();
948 }
949 
950 //
951 // Skip this round of PME, call skip on all Z-pencils (this is needed to get the reductions submitted)
952 //
953 void ComputePmeCUDAMgr::skip() {
954  DebugM(2, "ComputePmeCUDADevice::skip\n" << endi);
955  switch(pmePencilType) {
956  case 1:
957  pmePencilZ.skip();
958  break;
959  case 2:
960  pmePencilZ.skip();
961  break;
962  case 3:
963  pmePencilXYZ[0].skip();
964  break;
965  }
966 }
967 
968 void ComputePmeCUDAMgr::recvAtoms(PmeAtomMsg *msg) {
969  DebugM(2, "ComputePmeCUDADevice::recvAtoms\n" << endi);
970  int device = getDevice(msg->i, msg->j);
971  deviceProxy[device].ckLocalBranch()->recvAtoms(msg);
972 }
973 
974 ComputePmeCUDADevice::ComputePmeCUDADevice() {
975  // __sdag_init();
976  numHomePatches = 0;
977 // forceCapacity = 0;
978 // force = NULL;
979  DebugM(4, "ComputePmeCUDADevice::ComputePmeCUDADevice\n" << endi);
980  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
981  pmeRealSpaceComputes[iGrid] = NULL;
982  enabledGrid[iGrid] = false;
983  forces[iGrid] = NULL;
984  forceCapacities[iGrid] = 0;
985  }
986 // pmeRealSpaceCompute = NULL;
987  streamCreated = false;
988  lock_numHomePatchesMerged = CmiCreateLock();
989  lock_numPencils = CmiCreateLock();
990  lock_numNeighborsRecv = CmiCreateLock();
991  lock_recvAtoms = CmiCreateLock();
992  numNeighborsExpected = 0;
993  numStrayAtoms = 0;
994  // Reset counters
995  numNeighborsRecv = 0;
996  numHomePatchesRecv = 0;
997  numHomePatchesMerged = 0;
998  atomI = 0;
999  forceI = 1;
1000 }
1001 
1002 ComputePmeCUDADevice::ComputePmeCUDADevice(CkMigrateMessage *m) {
1003  // __sdag_init();
1004  numHomePatches = 0;
1005 // forceCapacity = 0;
1006 // force = NULL;
1007  DebugM(4, "ComputePmeCUDADevice::ComputePmeCUDADevice(CkMigrateMessage)\n" << endi);
1008  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1009  pmeRealSpaceComputes[iGrid] = NULL;
1010  enabledGrid[iGrid] = false;
1011  forces[iGrid] = NULL;
1012  forceCapacities[iGrid] = 0;
1013  }
1014  streamCreated = false;
1015  lock_numHomePatchesMerged = CmiCreateLock();
1016  lock_numPencils = CmiCreateLock();
1017  lock_numNeighborsRecv = CmiCreateLock();
1018  lock_recvAtoms = CmiCreateLock();
1019  numNeighborsExpected = 0;
1020  numStrayAtoms = 0;
1021  // Reset counters
1022  numNeighborsRecv = 0;
1023  numHomePatchesRecv = 0;
1024  numHomePatchesMerged = 0;
1025  atomI = 0;
1026  forceI = 1;
1027 }
1028 
1029 ComputePmeCUDADevice::~ComputePmeCUDADevice() {
1030  if (streamCreated) {
1031  cudaCheck(cudaSetDevice(deviceID));
1032  cudaCheck(cudaStreamDestroy(stream));
1033  }
1034  for (int j=0;j < 2;j++)
1035  for (int i=0;i < pmeAtomStorage[j].size();i++) {
1036  if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
1037  }
1038 // if (force != NULL) deallocate_host<CudaForce>(&force);
1039  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1040  if (pmeRealSpaceComputes[iGrid] != NULL) delete pmeRealSpaceComputes[iGrid];
1041  if (forces[iGrid] != NULL) deallocate_host<CudaForce>(&forces[iGrid]);
1042  enabledGrid[iGrid] = false;
1043  }
1044  CmiDestroyLock(lock_numHomePatchesMerged);
1045  CmiDestroyLock(lock_numPencils);
1046  CmiDestroyLock(lock_numNeighborsRecv);
1047  CmiDestroyLock(lock_recvAtoms);
1048 }
1049 
1050 void ComputePmeCUDADevice::initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
1051  int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
1052  CProxy_PmeAtomFiler pmeAtomFiler_in) {
1053 
1054  deviceID = deviceID_in;
1055  DebugM(4, "ComputePmeCUDADevice::initialize deviceID "<< deviceID <<"\n"<< endi);
1056  cudaCheck(cudaSetDevice(deviceID));
1057  pmePencilType = pmePencilType_in;
1058  pmeGrid = pmeGrid_in;
1059 #ifdef DEBUGM
1060  // pmeGrid.print();
1061 #endif
1062  pencilIndexY = pencilIndexY_in;
1063  pencilIndexZ = pencilIndexZ_in;
1064  mgrProxy = mgrProxy_in;
1065  pmeAtomFiler = pmeAtomFiler_in;
1066  // Size of the neighboring pencil grid, max 3x3
1067  yNBlocks = std::min(pmeGrid.yBlocks, 3);
1068  zNBlocks = std::min(pmeGrid.zBlocks, 3);
1069  // Local pencil is at y=0,z=0
1070  if (yNBlocks == 1) {
1071  ylo = 0;
1072  yhi = 0;
1073  } else if (yNBlocks == 2) {
1074  ylo = -1;
1075  yhi = 0;
1076  } else {
1077  ylo = -1;
1078  yhi = 1;
1079  }
1080  if (zNBlocks == 1) {
1081  zlo = 0;
1082  zhi = 0;
1083  } else if (zNBlocks == 2) {
1084  zlo = -1;
1085  zhi = 0;
1086  } else {
1087  zlo = -1;
1088  zhi = 1;
1089  }
1090 
1091  neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
1092  // neighborForcePencils.resize(yNBlocks*zNBlocks);
1093  for (int j=0;j < 2;j++)
1094  homePatchIndexList[j].resize(yNBlocks*zNBlocks);
1095  neighborPatchIndex.resize(yNBlocks*zNBlocks);
1096 
1097  pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
1099  for (int j=0;j < 2;j++) {
1100  pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
1101  for (int z=zlo;z <= zhi;z++) {
1102  for (int y=ylo;y <= yhi;y++) {
1103  int pp = y-ylo + (z-zlo)*yNBlocks;
1104  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1105  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1106  if (y == 0 && z == 0) {
1107  // Primary pencil
1108  pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
1109  pmeAtomStorage[j][pp]->setupAlch(*simParams);
1110  } else {
1111  pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
1112  pmeAtomStorage[j][pp]->setupAlch(*simParams);
1113  }
1114  pmeAtomStorageAllocatedHere[pp] = true;
1115  }
1116  }
1117  }
1118 
1119  // Create stream for this device
1120  createStream(stream);
1121  streamCreated = true;
1122  // CHC: enable at least 1 grid
1123  // CHC: do we need a different stream?
1124  pmeRealSpaceComputes[0] = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ, deviceID, stream);
1125  pmeRealSpaceComputes[0]->setGrid(0);
1126  enabledGrid[0] = true;
1127  if (simParams->alchOn) {
1128  pmeRealSpaceComputes[1] = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ, deviceID, stream);
1129  pmeRealSpaceComputes[1]->setGrid(1);
1130  // at least two grids are required for alchemical transformation
1131  enabledGrid[1] = true;
1132  if (simParams->alchDecouple) {
1133  pmeRealSpaceComputes[2] = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ, deviceID, stream);
1134  pmeRealSpaceComputes[2]->setGrid(2);
1135  enabledGrid[2] = true;
1136  pmeRealSpaceComputes[3] = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ, deviceID, stream);
1137  pmeRealSpaceComputes[3]->setGrid(3);
1138  enabledGrid[3] = true;
1139  }
1140  if (simParams->alchElecLambdaStart || simParams->alchThermIntOn) {
1141  pmeRealSpaceComputes[4] = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ, deviceID, stream);
1142  pmeRealSpaceComputes[4]->setGrid(4);
1143  enabledGrid[4] = true;
1144  }
1145  }
1146  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1147  if (enabledGrid[iGrid]) {
1148  forceReady[iGrid] = 0;
1149  } else {
1150  forceReady[iGrid] = -1;
1151  }
1152  }
1153 }
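
As a compact restatement of the grid-enable logic above (a sketch, not a NAMD API; the function name is illustrative):

  // Grid 0 is always enabled; grid 1 whenever alchOn; grids 2 and 3 with alchDecouple;
  // grid 4 when alchElecLambdaStart != 0 or alchThermIntOn.
  int numEnabledGrids(bool alchOn, bool alchDecouple, bool softCoreOrTI) {
    if (!alchOn) return 1;
    int n = 2;
    if (alchDecouple) n += 2;
    if (softCoreOrTI) n += 1;
    return n;
  }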
1154 
1155 cudaStream_t ComputePmeCUDADevice::getStream() {
1156  return stream;
1157 }
1158 
1159 int ComputePmeCUDADevice::getDeviceID() {
1160  return deviceID;
1161 }
1162 
1163 CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy() {
1164  return mgrProxy;
1165 }
1166 
1167 bool ComputePmeCUDADevice::isGridEnabled(unsigned int i) const {
1168  return enabledGrid[i];
1169 }
1170 
1171 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in) {
1172  if (pmePencilType != 3)
1173  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
1174  pmePencilXYZ = pmePencilXYZ_in;
1175 }
1176 
1177 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in) {
1178  if (pmePencilType != 2)
1179  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
1180  pmePencilXY = pmePencilXY_in;
1181 }
1182 
1183 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in) {
1184  if (pmePencilType != 1)
1185  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
1186  pmePencilX = pmePencilX_in;
1187 }
1188 
1189 void ComputePmeCUDADevice::activate_pencils() {
1190  if (pmePencilType == 1) {
1191  PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
1192  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1193  if (enabledGrid[iGrid] == true) {
1194  pmeStartXMsg->dataGrid[iGrid] = pmeRealSpaceComputes[iGrid]->getData();
1195  pmeStartXMsg->dataSizes[iGrid] = pmeRealSpaceComputes[iGrid]->getDataSize();
1196  pmeStartXMsg->enabledGrid[iGrid] = true;
1197  } else {
1198  pmeStartXMsg->dataGrid[iGrid] = NULL;
1199  pmeStartXMsg->dataSizes[iGrid] = 0;
1200  pmeStartXMsg->enabledGrid[iGrid] = false;
1201  }
1202  }
1203  pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
1204  } else if (pmePencilType == 2) {
1205  PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
1206  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1207  if (enabledGrid[iGrid] == true) {
1208  pmeStartXMsg->dataGrid[iGrid] = pmeRealSpaceComputes[iGrid]->getData();
1209  pmeStartXMsg->dataSizes[iGrid] = pmeRealSpaceComputes[iGrid]->getDataSize();
1210  pmeStartXMsg->enabledGrid[iGrid] = true;
1211  } else {
1212  pmeStartXMsg->dataGrid[iGrid] = NULL;
1213  pmeStartXMsg->dataSizes[iGrid] = 0;
1214  pmeStartXMsg->enabledGrid[iGrid] = false;
1215  }
1216  }
1217  pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
1218  } else if (pmePencilType == 3) {
1219  PmeStartMsg* pmeStartMsg = new PmeStartMsg();
1220  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1221  if (enabledGrid[iGrid] == true) {
1222  pmeStartMsg->dataGrid[iGrid] = pmeRealSpaceComputes[iGrid]->getData();
1223  pmeStartMsg->dataSizes[iGrid] = pmeRealSpaceComputes[iGrid]->getDataSize();
1224  pmeStartMsg->enabledGrid[iGrid] = true;
1225  } else {
1226  pmeStartMsg->dataGrid[iGrid] = NULL;
1227  pmeStartMsg->dataSizes[iGrid] = 0;
1228  pmeStartMsg->enabledGrid[iGrid] = false;
1229  }
1230  }
1231  pmePencilXYZ[0].start(pmeStartMsg);
1232  }
1233 }
1234 
1235 void ComputePmeCUDADevice::initializePatches(int numHomePatches_in) {
1236  numHomePatches = numHomePatches_in;
1237  for (int j=0;j < 2;j++)
1238  numPencils[j].resize(numHomePatches);
1239  for (int j=0;j < 2;j++)
1240  plList[j].resize(numHomePatches);
1241  for (int j=0;j < 2;j++)
1242  homePatchForceMsgs[j].resize(numHomePatches);
1243  // for (int j=0;j < 2;j++)
1244  // numHomeAtoms[j].resize(numHomePatches);
1245  // If we have home patches, register this pencil with the neighbors and with self
1246  if (numHomePatches > 0) {
1247  for (int z=zlo;z <= zhi;z++) {
1248  for (int y=ylo;y <= yhi;y++) {
1249  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1250  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1251  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1252  mgrProxy[node].registerNeighbor(yt, zt);
1253  }
1254  }
1255  }
1256 }
1257 
1258 void ComputePmeCUDADevice::registerNeighbor() {
1259  CmiLock(lock_numHomePatchesMerged);
1260  numNeighborsExpected++;
1261  CmiUnlock(lock_numHomePatchesMerged);
1262 }
1263 
1264 //
1265 // Receive atoms from patch and file them into pencils
1266 //
1267 void ComputePmeCUDADevice::recvAtoms(PmeAtomMsg *msg) {
1268 
1269  PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
1270  // Store "virial" and "energy" flags
1271  doVirial = msg->doVirial;
1272  doEnergy = msg->doEnergy;
1273  simulationStep = msg->simulationStep;
1274  // Store lattice
1276  lattice = msg->lattice;
1277 
1278  // Primary pencil index
1279  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1280  int p0 = 0;
1281  int pencilPatchIndex[9];
1282  int numStrayAtomsPatch = 0;
1283  if (pmePencilType == 3) {
1284  // 3D box => store atoms directly without index
1285  // NOTE: We don't check for stray atoms here!
1286  if (simParams->alchOn) {
1287  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1288  // only FEP, no alchDecouple and alchElecLambdaStart == 0, use 2 grids
1289  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2});
1290  }
1291  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1292  // FEP with alchDecouple, use 4 grids
1293  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4});
1294  }
1295  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1296  // FEP with alchDecouple and alchElecLambdaStart > 0, use 5 grids
1297  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1298  }
1299  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1300  // FEP without alchDecouple and alchElecLambdaStart > 0, use 3 grids (1,2,5)
1301  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1302  }
1303  if (simParams->alchThermIntOn && !simParams->alchDecouple) {
1304  // TI estimator without alchDecouple, use 3 grids (1,2,5)
1305  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1306  }
1307  if (simParams->alchThermIntOn && simParams->alchDecouple) {
1308  // TI estimator with alchDecouple, use all grids
1309  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1310  }
1311  } else {
1312  // no alchemistry
1313  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{});
1314  }
1315  } else {
1316 
1317  // File atoms
1318  pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
1319  pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
1320 
1321  // Loop through pencils and add atoms to pencil atom lists
1322  // NOTE: we only store to neighboring pencil if there are atoms to store
1323  int numAtomsCheck = 0;
1324  for (int p=0;p < 9;p++) {
1325 
1326  int y = (p % 3);
1327  int z = (p / 3);
1328 
1329  int pp = y + z*yNBlocks;
1330  int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
1331  if (pp == pp0) p0 = p;
1332  if (pp == pp0 || numAtoms > 0) {
1333  if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
1334  NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
1335  int* index = pmeAtomFilerPtr->getAtomIndex(p);
1336  if (simParams->alchOn) {
1337  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1338  // only FEP, no alchDecouple and alchElecLambdaStart == 0, use 2 grids
1339  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2});
1340  }
1341  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1342  // FEP with alchDecouple, use 4 grids
1343  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4});
1344  }
1345  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1346  // FEP without alchDecouple and alchElecLambdaStart > 0, use 3 grids (1,2,5)
1347  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1348  }
1349  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1350  // FEP with alchDecouple and alchElecLambdaStart > 0, use 5 grids
1351  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1352  }
1353  if (simParams->alchThermIntOn && !simParams->alchDecouple) {
1354  // TI estimator without alchDecouple, use 3 grids (1,2,5)
1355  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, NULL, NULL, msg->chargeFactors5});
1356  }
1357  if (simParams->alchThermIntOn && simParams->alchDecouple) {
1358  // TI estimator with alchDecouple, use all grids
1359  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1360  }
1361  } else {
1362  // no alchemistry
1363  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index, std::vector<float*>{});
1364  }
1365  // Number of patches in this storage tells you how many home patches contributed and
1366  // homePatchIndex (pe) tells you which patch contributed
1367  numAtomsCheck += numAtoms;
1368  }
1369  }
1370 
1371  // Deal with stray atoms
1372  numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
1373  if (numStrayAtomsPatch > 0) {
1374  int* index = pmeAtomFilerPtr->getAtomIndex(9);
1375  CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
1376  for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
1377  int j = index[i];
1378  CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
1379  }
1380  }
1381 
1382  if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
1383  NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
1384  }
1385 
1386  // Create storage for home patch forces
1387  PmeForceMsg *forceMsg;
1388  if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
1389  // 3D FFT and compute resides on the same node => use zero-copy forces
1390  // CHC: forces are zero-copy so do we need this for alchDecouple?
1391  forceMsg = new (0, 0, 0, 0, 0, PRIORITY_SIZE) PmeForceMsg();
1392  forceMsg->zeroCopy = true;
1393  } else {
1394  const int alchGrid = simParams->alchOn ? 1 : 0;
1395  const int alchDecoupleGrid = simParams->alchDecouple ? 1: 0;
1396  const int alchSoftCoreOrTI = (simParams->alchElecLambdaStart > 0 || simParams->alchThermIntOn) ? 1 : 0;
1397  forceMsg = new (msg->numAtoms, alchGrid * msg->numAtoms,
1398  alchDecoupleGrid * msg->numAtoms, alchDecoupleGrid * msg->numAtoms,
1399  alchSoftCoreOrTI * msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
1400  forceMsg->zeroCopy = false;
1401  }
1402  forceMsg->numAtoms = msg->numAtoms;
1403  forceMsg->pe = msg->pe;
1404  forceMsg->compute = msg->compute;
1405  forceMsg->numStrayAtoms = numStrayAtomsPatch;
1406 
1407  bool done = false;
1408  // ----------------------------- lock start ---------------------------
1409  // Only after writing has finished do we get homePatchIndex
1410  // This guarantees that for whichever thread receives "done=true", writing has finished on
1411  // ALL threads.
1412  CmiLock(lock_recvAtoms);
1413  numStrayAtoms += numStrayAtomsPatch;
1414  // Secure homePatchIndex. All writes after this must be inside lock-region
1415  int homePatchIndex = numHomePatchesRecv;
1416  // Store primary pencil first
1417  plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
1418  if (pmePencilType != 3) {
1419  // Go back through neighboring pencils and store "homePatchIndex"
1420  for (int p=0;p < 9;p++) {
1421 
1422  int y = (p % 3);
1423  int z = (p / 3);
1424 
1425  int pp = y + z*yNBlocks;
1426  int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
1427  if (pp != pp0 && numAtoms > 0) {
1428  homePatchIndexList[atomI][pp].push_back(homePatchIndex);
1429  // plList[0...numHomePatches-1] = for each home patch stores the location of pencils that are
1430  // sharing it
1431  // plList[homePatchIndex].size() tells the number of pencils that the home patch is shared with
1432  plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
1433  }
1434  }
1435  }
1436  homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
1437  // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
1438  // Set the number of pencils contributing to this home patch
1439  numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
1440  //
1441  numHomePatchesRecv++;
1442  if (numHomePatchesRecv == numHomePatches) {
1443  // Reset counter
1444  numHomePatchesRecv = 0;
1445  done = true;
1446  }
1447  CmiUnlock(lock_recvAtoms);
1448  // ----------------------------- lock end ---------------------------
1449 
1450  // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
1451 
1452  delete msg;
1453 
1454  if (done) {
1455  // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
1456  sendAtomsToNeighbors();
1457  }
1458 }
1459 
1460 //
1461 // Loop through pencils and send atoms to neighboring nodes
1462 //
1463 void ComputePmeCUDADevice::sendAtomsToNeighbors() {
1464  for (int z=zlo;z <= zhi;z++) {
1465  for (int y=ylo;y <= yhi;y++) {
1466  // Only send to neighbors, not self
1467  if (y != 0 || z != 0) {
1468  // NOTE: Must send the atomI value since this will change in spreadCharge(), which might occur
1469  // before these sends have been performed
1470  thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
1471  }
1472  }
1473  }
1474  // Register primary pencil
1475  registerRecvAtomsFromNeighbor();
1476 }
1477 
1478 void ComputePmeCUDADevice::sendAtomsToNeighbor(int y, int z, int atomIval) {
1479  // Pencil index
1480  int pp = y-ylo + (z-zlo)*yNBlocks;
1481  // This neighbor pencil is done, finish it up before accessing it
1482  pmeAtomStorage[atomIval][pp]->finish();
1483  // Compute destination neighbor pencil index (yt,zt)
1484  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1485  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1486  int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
1487  CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
1489  PmeAtomPencilMsg* msgPencil;
1490  if (simParams->alchOn) {
1491  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1492  msgPencil = new (numAtoms, numAtoms, numAtoms, 0, 0, 0, PRIORITY_SIZE) PmeAtomPencilMsg;
1493  float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1494  float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1495  memcpy(msgPencil->chargeFactors1, chargeFactors1, numAtoms*sizeof(float));
1496  memcpy(msgPencil->chargeFactors2, chargeFactors2, numAtoms*sizeof(float));
1497  }
1498  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1499  msgPencil = new (numAtoms, numAtoms, numAtoms, numAtoms, numAtoms, 0, PRIORITY_SIZE) PmeAtomPencilMsg;
1500  float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1501  float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1502  float* chargeFactors3 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(2);
1503  float* chargeFactors4 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(3);
1504  memcpy(msgPencil->chargeFactors1, chargeFactors1, numAtoms*sizeof(float));
1505  memcpy(msgPencil->chargeFactors2, chargeFactors2, numAtoms*sizeof(float));
1506  memcpy(msgPencil->chargeFactors3, chargeFactors3, numAtoms*sizeof(float));
1507  memcpy(msgPencil->chargeFactors4, chargeFactors4, numAtoms*sizeof(float));
1508  }
1509  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1510  msgPencil = new (numAtoms, numAtoms, numAtoms, numAtoms, numAtoms, numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
1511  float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1512  float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1513  float* chargeFactors3 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(2);
1514  float* chargeFactors4 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(3);
1515  float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1516  memcpy(msgPencil->chargeFactors1, chargeFactors1, numAtoms*sizeof(float));
1517  memcpy(msgPencil->chargeFactors2, chargeFactors2, numAtoms*sizeof(float));
1518  memcpy(msgPencil->chargeFactors3, chargeFactors3, numAtoms*sizeof(float));
1519  memcpy(msgPencil->chargeFactors4, chargeFactors4, numAtoms*sizeof(float));
1520  memcpy(msgPencil->chargeFactors5, chargeFactors5, numAtoms*sizeof(float));
1521  }
1522  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1523  msgPencil = new (numAtoms, numAtoms, numAtoms, 0, 0, numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
1524  float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1525  float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1526  float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1527  memcpy(msgPencil->chargeFactors1, chargeFactors1, numAtoms*sizeof(float));
1528  memcpy(msgPencil->chargeFactors2, chargeFactors2, numAtoms*sizeof(float));
1529  memcpy(msgPencil->chargeFactors5, chargeFactors5, numAtoms*sizeof(float));
1530  }
1531  if (simParams->alchThermIntOn && !simParams->alchDecouple) {
1532  msgPencil = new (numAtoms, numAtoms, numAtoms, 0, 0, numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
1533  float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1534  float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1535  float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1536  memcpy(msgPencil->chargeFactors1, chargeFactors1, numAtoms*sizeof(float));
1537  memcpy(msgPencil->chargeFactors2, chargeFactors2, numAtoms*sizeof(float));
1538  memcpy(msgPencil->chargeFactors5, chargeFactors5, numAtoms*sizeof(float));
1539  }
1540  if (simParams->alchThermIntOn && simParams->alchDecouple) {
1541  msgPencil = new (numAtoms, numAtoms, numAtoms, numAtoms, numAtoms, numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
1542  float* chargeFactors1 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(0);
1543  float* chargeFactors2 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(1);
1544  float* chargeFactors3 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(2);
1545  float* chargeFactors4 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(3);
1546  float* chargeFactors5 = pmeAtomStorage[atomIval][pp]->getAtomElecFactors(4);
1547  memcpy(msgPencil->chargeFactors1, chargeFactors1, numAtoms*sizeof(float));
1548  memcpy(msgPencil->chargeFactors2, chargeFactors2, numAtoms*sizeof(float));
1549  memcpy(msgPencil->chargeFactors3, chargeFactors3, numAtoms*sizeof(float));
1550  memcpy(msgPencil->chargeFactors4, chargeFactors4, numAtoms*sizeof(float));
1551  memcpy(msgPencil->chargeFactors5, chargeFactors5, numAtoms*sizeof(float));
1552  }
1553  } else {
1554  msgPencil = new (numAtoms, 0, 0, 0, 0, 0, PRIORITY_SIZE) PmeAtomPencilMsg;
1555  }
1556  memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
1557  msgPencil->numAtoms = numAtoms;
1558  // Store destination pencil index
1559  msgPencil->y = yt;
1560  msgPencil->z = zt;
1561  // Store source pencil index
1562  msgPencil->srcY = pencilIndexY;
1563  msgPencil->srcZ = pencilIndexZ;
1564  // Store energy and virial flags
1565  msgPencil->doEnergy = doEnergy;
1566  msgPencil->doVirial = doVirial;
1567  msgPencil->simulationStep = simulationStep;
1568  // Store lattice
1569  msgPencil->lattice = lattice;
1570  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1571  mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
1572 }
1573 
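//
// Receive atoms that a neighboring pencil forwarded to us: reconstruct the sender's
// relative (y,z) offset from srcY/srcZ, append the atoms to our primary pencil storage,
// and remember in neighborPatchIndex[pp] which patch slot they landed in so the matching
// forces can be returned later.
//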
1574 void ComputePmeCUDADevice::recvAtomsFromNeighbor(PmeAtomPencilMsg *msg) {
1575  // Store into primary pencil
1576  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1577  // Compute pencil index relative to primary pencil
1578  int y = msg->srcY - pencilIndexY;
1579  if (y < ylo) y += pmeGrid.yBlocks;
1580  if (y > yhi) y -= pmeGrid.yBlocks;
1581  int z = msg->srcZ - pencilIndexZ;
1582  if (z < zlo) z += pmeGrid.zBlocks;
1583  if (z > zhi) z -= pmeGrid.zBlocks;
1584  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1585  NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
1586  }
1587  // Read energy and virial flags
1588  doEnergy = msg->doEnergy;
1589  doVirial = msg->doVirial;
1590  simulationStep = msg->simulationStep;
1591  // Read lattice
1592  lattice = msg->lattice;
1593  // Pencil index where atoms came from
1594  int pp = y-ylo + (z-zlo)*yNBlocks;
1595  // Store atoms and mark down the patch index where these atoms were added
1596  SimParameters *simParams = Node::Object()->simParameters;
1597  if (simParams->alchOn) {
1598  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1599  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2});
1600  }
1601  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == false)) {
1602  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4});
1603  }
1604  if (simParams->alchFepOn && simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1605  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1606  }
1607  if (simParams->alchFepOn && !simParams->alchDecouple && (bool(simParams->alchElecLambdaStart) == true)) {
1608  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, 0, 0, msg->chargeFactors5});
1609  }
1610  if (simParams->alchThermIntOn && !simParams->alchDecouple) {
1611  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, 0, 0, msg->chargeFactors5});
1612  }
1613  if (simParams->alchThermIntOn && simParams->alchDecouple) {
1614  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{msg->chargeFactors1, msg->chargeFactors2, msg->chargeFactors3, msg->chargeFactors4, msg->chargeFactors5});
1615  }
1616  } else {
1617  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms, std::vector<float*>{});
1618  }
1619 
1620  delete msg;
1621 
1622  registerRecvAtomsFromNeighbor();
1623 }
1624 
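//
// Count contributions to the primary pencil: incremented once for the local pencil and
// once per neighbor arrival; when numNeighborsExpected contributions are in, the primary
// pencil holds all of its atoms and charge spreading can start.
//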
1625 void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor() {
1626  // Primary pencil
1627  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1628 
1629  bool done = false;
1630  // ----------------------------- lock start ---------------------------
1631  CmiLock(lock_numNeighborsRecv);
1632  numNeighborsRecv++;
1633  if (numNeighborsRecv == numNeighborsExpected) {
1634  // Reset counter
1635  numNeighborsRecv = 0;
1636  done = true;
1637  }
1638  CmiUnlock(lock_numNeighborsRecv);
1639  // ----------------------------- lock end ---------------------------
1640 
1641  if (done) {
1642  // Primary pencil has received all atoms and writing has finished => spread charge
1643  spreadCharge();
1644  }
1645 }
1646 
1647 void ComputePmeCUDADevice::spreadCharge() {
1648  // Spread charges in primary pencil
1649  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1650  // Primary pencil is done, finish it up before accessing it
1651  // (clearing is done in mergeForcesOnPatch)
1652  pmeAtomStorage[atomI][pp0]->finish();
1653  // Get the number of atoms and pointer to atoms
1654  int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
1655  CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
1656  SimParameters *simParams = Node::Object()->simParameters;
1657  CudaAtom* atoms2 = NULL;
1658  CudaAtom* atoms3 = NULL;
1659  CudaAtom* atoms4 = NULL;
1660  CudaAtom* atoms5 = NULL;
1661  float* chargeFactors1 = NULL;
1662  float* chargeFactors2 = NULL;
1663  float* chargeFactors3 = NULL;
1664  float* chargeFactors4 = NULL;
1665  float* chargeFactors5 = NULL;
1666  if (simParams->alchOn) {
1667  chargeFactors1 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(0);
1668  chargeFactors2 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(1);
1669  allocate_host<CudaAtom>(&atoms2, numAtoms);
1670  if (simParams->alchDecouple) {
1671  chargeFactors3 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(2);
1672  chargeFactors4 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(3);
1673  allocate_host<CudaAtom>(&atoms3, numAtoms);
1674  allocate_host<CudaAtom>(&atoms4, numAtoms);
1675  }
1676  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
1677  chargeFactors5 = pmeAtomStorage[atomI][pp0]->getAtomElecFactors(4);
1678  allocate_host<CudaAtom>(&atoms5, numAtoms);
1679  }
1680  }
1681  // Flip atomI <-> forceI
1682  std::swap(atomI, forceI);
1683  // Re-allocate force buffer if needed
1684  reallocate_host<CudaForce>(&forces[0], &forceCapacities[0], numAtoms, 1.5f);
1685  if (simParams->alchOn) {
1686  reallocate_host<CudaForce>(&forces[1], &forceCapacities[1], numAtoms, 1.5f);
1687  if (simParams->alchDecouple) {
1688  reallocate_host<CudaForce>(&forces[2], &forceCapacities[2], numAtoms, 1.5f);
1689  reallocate_host<CudaForce>(&forces[3], &forceCapacities[3], numAtoms, 1.5f);
1690  }
1691  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
1692  reallocate_host<CudaForce>(&forces[4], &forceCapacities[4], numAtoms, 1.5f);
1693  }
1694  }
1695  // Setup patches and atoms
1696  // Lattice lattice = simParams->lattice;
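  // For alchemical runs the atoms are duplicated into up to four extra buffers
  // (atoms2..atoms5), each with charges scaled by the factor array of the corresponding
  // grid; the original buffer is scaled in place by chargeFactors1 and feeds grid 0.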
1697  if (simParams->alchOn) {
1698  for (int i = 0; i < numAtoms; ++i) {
1699  // copy atoms and scale the charges with factors
1700  atoms2[i].x = atoms[i].x;
1701  atoms2[i].y = atoms[i].y;
1702  atoms2[i].z = atoms[i].z;
1703  atoms2[i].q = atoms[i].q * chargeFactors2[i];
1704  if (simParams->alchDecouple) {
1705  atoms3[i].x = atoms[i].x;
1706  atoms3[i].y = atoms[i].y;
1707  atoms3[i].z = atoms[i].z;
1708  atoms3[i].q = atoms[i].q * chargeFactors3[i];
1709  atoms4[i].x = atoms[i].x;
1710  atoms4[i].y = atoms[i].y;
1711  atoms4[i].z = atoms[i].z;
1712  atoms4[i].q = atoms[i].q * chargeFactors4[i];
1713  }
1714  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
1715  atoms5[i].x = atoms[i].x;
1716  atoms5[i].y = atoms[i].y;
1717  atoms5[i].z = atoms[i].z;
1718  atoms5[i].q = atoms[i].q * chargeFactors5[i];
1719  }
1720  atoms[i].q *= chargeFactors1[i];
1721  }
1722  pmeRealSpaceComputes[0]->copyAtoms(numAtoms, atoms);
1723  pmeRealSpaceComputes[1]->copyAtoms(numAtoms, atoms2);
1724  if (simParams->alchDecouple) {
1725  pmeRealSpaceComputes[2]->copyAtoms(numAtoms, atoms3);
1726  pmeRealSpaceComputes[3]->copyAtoms(numAtoms, atoms4);
1727  deallocate_host<CudaAtom>(&atoms4);
1728  deallocate_host<CudaAtom>(&atoms3);
1729  }
1730  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
1731  pmeRealSpaceComputes[4]->copyAtoms(numAtoms, atoms5);
1732  deallocate_host<CudaAtom>(&atoms5);
1733  }
1734  deallocate_host<CudaAtom>(&atoms2);
1735  } else {
1736  pmeRealSpaceComputes[0]->copyAtoms(numAtoms, atoms);
1737  }
1738  // Spread charge
1739  beforeWalltime = CmiWallTimer();
1740  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1741  if (enabledGrid[iGrid] == true) {
1742  pmeRealSpaceComputes[iGrid]->spreadCharge(lattice);
1743  }
1744  }
1745  // Send "charge grid ready" message to the PME solver
1746  PmeRunMsg *pmeRunMsg = new PmeRunMsg();
1747  pmeRunMsg->doVirial = doVirial;
1748  pmeRunMsg->doEnergy = doEnergy;
1749  pmeRunMsg->simulationStep = simulationStep;
1750  pmeRunMsg->lattice = lattice;
1751  pmeRunMsg->numStrayAtoms = numStrayAtoms;
1752  // Reset stray atom counter
1753  numStrayAtoms = 0;
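  // pmePencilType selects how the charge grid is handed to the FFT side: 1 = x-pencils
  // (addressed by this pencil's y/z indices), 2 = xy-slabs (addressed by z), 3 = a single
  // 3D FFT box.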
1754  switch(pmePencilType) {
1755  case 1:
1756  pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
1757  break;
1758  case 2:
1759  pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
1760  break;
1761  case 3:
1762  pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
1763  break;
1764  }
1765 }
1766 
1767 //
1768 // After PME solver is done, we return here
1769 //
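// For every enabled grid, launch the force gather (un-gridding) on that grid's real-space
// compute; the callback registered below makes completion invoke gatherForceDone(iGrid).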
1770 void ComputePmeCUDADevice::gatherForce() {
1771  traceUserBracketEvent(CUDA_PME_SPREADCHARGE_EVENT, beforeWalltime, CmiWallTimer());
1772  beforeWalltime = CmiWallTimer();
1773  // gather (i.e. un-grid) forces
1775 // if (simParameters->alchOn) {
1776 // pmeRealSpaceComputes[1]->gatherForce(lattice, force);
1777 // ((CudaPmeRealSpaceCompute*)(pmeRealSpaceComputes[1]))->gatherForceSetCallback(this);
1778 // } else {
1779  for (unsigned int iGrid = 0; iGrid < NUM_GRID_MAX; ++iGrid) {
1780  if (enabledGrid[iGrid]) {
1781 // fprintf(stdout, "gatherForce at grid %u\n", iGrid);
1782  pmeRealSpaceComputes[iGrid]->gatherForce(lattice, forces[iGrid]);
1783  // Set callback that will call gatherForceDone() once gatherForce is done
1784  ((CudaPmeRealSpaceCompute*)(pmeRealSpaceComputes[iGrid]))->gatherForceSetCallback(this);
1785  }
1786  }
1787  // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
1788  // gatherForceDone();
1789 // }
1790 }
1791 
1792 static inline void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param) {
1793  ComputePmeCUDADevice* c = (ComputePmeCUDADevice *)param;
1794  c->gatherForceDoneSubset(first, last);
1795 }
1796 
1797 void ComputePmeCUDADevice::gatherForceDoneSubset(int first, int last) {
1798  for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
1799  bool done = false;
1800  // ----------------------------- lock start ---------------------------
1801  // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
1802  // we really only need to lock each element; that would, however, require
1803  // numHomePatches separate locks.
1804  if (pmePencilType != 3) CmiLock(lock_numPencils);
1805  numPencils[forceI][homePatchIndex]--;
1806  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1807  if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1808  // ----------------------------- lock end ---------------------------
1809  if (done) {
1810  // This home patch is done, launch force merging
1811  mergeForcesOnPatch(homePatchIndex);
1812  }
1813  }
1814 }
1815 
1816 void ComputePmeCUDADevice::gatherForceDone(unsigned int iGrid) {
1817  // CHC: prevent race condition when there are multiple pmeRealSpaceCompute objects
1818  forceReady[iGrid] = 1;
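  // forceReady[i] is -1 for disabled grids, 0 while grid i's gather is pending, and 1 once
  // its forces have arrived; we proceed only when every enabled grid reports 1, after which
  // the flags are reset to 0 for the next step.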
1819  bool all_force_ready = true;
1820 // fprintf(stdout, "gatherForceDone at grid %u\n", iGrid);
1821  // loop over forceReady to check if all forces are gathered
1822  for (unsigned int i = 0; i < NUM_GRID_MAX; ++i) {
1823  if (forceReady[i] == -1) continue;
1824  if (forceReady[i] == 0) all_force_ready = false;
1825  }
1826  if (all_force_ready) {
1827  for (unsigned int i = 0; i < NUM_GRID_MAX; ++i) {
1828  if (forceReady[i] == -1) continue;
1829  if (forceReady[i] == 1) forceReady[i] = 0;
1830  }
1831 // fprintf(stdout, "all force ready\n");
1832  // Primary pencil has the forces
1833 
1834  traceUserBracketEvent(CUDA_PME_GATHERFORCE_EVENT, beforeWalltime, CmiWallTimer());
1835 
1836  // Send forces to neighbors
1837  sendForcesToNeighbors();
1838 
1839 #if CMK_SMP && USE_CKLOOP
1840  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1841  if (useCkLoop >= 1) {
1842  CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
1843  } else
1844 #endif
1845 
1846  {
1847  // Loop through home patches and mark the primary pencil as "done"
1848  for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
1849  bool done = false;
1850  // ----------------------------- lock start ---------------------------
1851  // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
1852  // we really only need to lock each element; that would, however, require
1853  // numHomePatches separate locks.
1854  if (pmePencilType != 3) CmiLock(lock_numPencils);
1855  numPencils[forceI][homePatchIndex]--;
1856  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1857  if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1858  // ----------------------------- lock end ---------------------------
1859  if (done) {
1860  // This home patch is done, launch force merging
1861  thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1862  }
1863  }
1864  }
1865 
1866  // In case we have no home patches, clear the primary pencil storage here
1867  if (numHomePatches == 0) {
1868  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1869  pmeAtomStorage[forceI][pp0]->clear();
1870  }
1871  }
1872 }
1873 
1874 //
1875 // After gatherForce is done, we end up here
1876 //
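// For each neighbor pencil, the slice of the primary pencil's force arrays that belongs
// to the atoms that neighbor contributed (located via neighborPatchIndex and patchPos) is
// copied into a PmeForcePencilMsg and sent back to the source pencil.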
1877 void ComputePmeCUDADevice::sendForcesToNeighbors() {
1878  // Primary pencil has the forces
1879  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1880  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1881  SimParameters *simParams = Node::Object()->simParameters;
1882  const int alchGrid = simParams->alchOn ? 1 : 0;
1883  const int alchDecoupleGrid = simParams->alchDecouple ? 1: 0;
1884  const int alchSoftCoreOrTI = (simParams->alchElecLambdaStart > 0 || simParams->alchThermIntOn) ? 1 : 0;
1885  // Loop through neighboring pencils
1886  for (int z=zlo;z <= zhi;z++) {
1887  for (int y=ylo;y <= yhi;y++) {
1888  // Only send to neighbors, not self
1889  if (y != 0 || z != 0) {
1890  int pp = y-ylo + (z-zlo)*yNBlocks;
1891  int patchIndex = neighborPatchIndex[pp];
1892  int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
1893  int atomEnd = patchPos[patchIndex];
1894  int natom = atomEnd-atomStart;
1895  // copy forces
1896  PmeForcePencilMsg *msg;
1897  msg = new (natom, alchGrid * natom, alchDecoupleGrid * natom,
1898  alchDecoupleGrid * natom, alchSoftCoreOrTI * natom,
1899  PRIORITY_SIZE) PmeForcePencilMsg;
1900  msg->numAtoms = natom;
1901  memcpy(msg->force, forces[0]+atomStart, natom*sizeof(CudaForce));
1902  if (simParams->alchOn) {
1903  memcpy(msg->force2, forces[1]+atomStart, natom*sizeof(CudaForce));
1904  if (simParams->alchDecouple) {
1905  memcpy(msg->force3, forces[2]+atomStart, natom*sizeof(CudaForce));
1906  memcpy(msg->force4, forces[3]+atomStart, natom*sizeof(CudaForce));
1907  }
1908  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
1909  memcpy(msg->force5, forces[4]+atomStart, natom*sizeof(CudaForce));
1910  }
1911  }
1912  // Calculate destination pencil index (dstY, dstZ) for this neighbor
1913  int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1914  int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1915  int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
1916  msg->y = dstY;
1917  msg->z = dstZ;
1918  // Store source pencil index
1919  msg->srcY = pencilIndexY;
1920  msg->srcZ = pencilIndexZ;
1921  mgrProxy[node].recvForcesFromNeighbor(msg);
1922  }
1923  }
1924  }
1925 }
1926 
1927 void ComputePmeCUDADevice::recvForcesFromNeighbor(PmeForcePencilMsg *msg) {
1928 
1929  // Source pencil index
1930  int y = msg->srcY - pencilIndexY;
1931  if (y < ylo) y += pmeGrid.yBlocks;
1932  if (y > yhi) y -= pmeGrid.yBlocks;
1933  int z = msg->srcZ - pencilIndexZ;
1934  if (z < zlo) z += pmeGrid.zBlocks;
1935  if (z > zhi) z -= pmeGrid.zBlocks;
1936 
1937  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1938  NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
1939  }
1940 
1941  // Source pencil
1942  int pp = y-ylo + (z-zlo)*yNBlocks;
1943 
1944  // Store message (deleted in mergeForcesOnPatch)
1945  neighborForcePencilMsgs[pp] = msg;
1946 
1947  // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
1948  // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
1949  // neighborForcePencils[pp].numAtoms = msg->numAtoms;
1950  // neighborForcePencils[pp].y = msg->y;
1951  // neighborForcePencils[pp].z = msg->z;
1952  // neighborForcePencils[pp].srcY = msg->srcY;
1953  // neighborForcePencils[pp].srcZ = msg->srcZ;
1954  // delete msg;
1955 
1956  // numPatches = number of home patches this pencil has
1957  int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
1958  if (numPatches != homePatchIndexList[forceI][pp].size()) {
1959  NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
1960  }
1961  for (int i=0;i < numPatches;i++) {
1962  // this pencil contributed to home patch with index "homePatchIndex"
1963  int homePatchIndex = homePatchIndexList[forceI][pp][i];
1964  // ----------------------------- lock start ---------------------------
1965  // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
1966  // we really only need to lock each element; that would, however, require
1967  // numHomePatches separate locks.
1968  bool done = false;
1969  CmiLock(lock_numPencils);
1970  numPencils[forceI][homePatchIndex]--;
1971  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1972  CmiUnlock(lock_numPencils);
1973  // ----------------------------- lock end ---------------------------
1974  if (done) {
1975  // This home patch is done, launch force merging
1976  thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1977  }
1978  }
1979 
1980 }
1981 
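//
// Merge all force contributions for one home patch and ship them to its compute.
// With a single 3D FFT box (pmePencilType == 3) the patch's forces are contiguous in the
// primary pencil and can be handed over zero-copy or with one memcpy; otherwise they are
// scattered over the primary pencil and the neighbor messages and are accumulated per
// atom through the stored index arrays.
//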
1982 void ComputePmeCUDADevice::mergeForcesOnPatch(int homePatchIndex) {
1983  // We have all the forces for this patch => merge on a single Pe
1984 
1985  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1986 
1987  // Message that goes out to the compute
1988  PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
1989 
1991  if (pmePencilType == 3) {
1992  // 3D box => simple memory copy will do
1993  // Location of forces in the force[] array
1994  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1995  // plList[forceI][homePatchIndex] tells you the location of pencils that are sharing this home patch
1996  int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1997  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1998  int atomEnd = patchPos[pencilPatchIndex];
1999  int numAtoms = atomEnd-atomStart;
2000  if (forceMsg->zeroCopy) {
2001  // Zero-copy, just pass the pointer
2002  forceMsg->force = forces[0]+atomStart;
2003  if (simParams->alchOn) {
2004  forceMsg->force2 = forces[1]+atomStart;
2005  if (simParams->alchDecouple) {
2006  forceMsg->force3 = forces[2]+atomStart;
2007  forceMsg->force4 = forces[3]+atomStart;
2008  }
2009  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
2010  forceMsg->force5 = forces[4]+atomStart;
2011  }
2012  }
2013  } else {
2014  memcpy(forceMsg->force, forces[0]+atomStart, numAtoms*sizeof(CudaForce));
2015  if (simParams->alchOn) {
2016  memcpy(forceMsg->force2, forces[1]+atomStart, numAtoms*sizeof(CudaForce));
2017  if (simParams->alchDecouple) {
2018  memcpy(forceMsg->force3, forces[2]+atomStart, numAtoms*sizeof(CudaForce));
2019  memcpy(forceMsg->force4, forces[3]+atomStart, numAtoms*sizeof(CudaForce));
2020  }
2021  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
2022  memcpy(forceMsg->force5, forces[4]+atomStart, numAtoms*sizeof(CudaForce));
2023  }
2024  }
2025  }
2026  } else {
2027 
2028  // Zero force array
2029  // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
2030  memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
2031  if (simParams->alchOn) {
2032  memset(forceMsg->force2, 0, forceMsg->numAtoms*sizeof(CudaForce));
2033  if (simParams->alchDecouple) {
2034  memset(forceMsg->force3, 0, forceMsg->numAtoms*sizeof(CudaForce));
2035  memset(forceMsg->force4, 0, forceMsg->numAtoms*sizeof(CudaForce));
2036  }
2037  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
2038  memset(forceMsg->force5, 0, forceMsg->numAtoms*sizeof(CudaForce));
2039  }
2040  }
2041 
2042  // Store forces from primary pencil
2043  {
2044  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
2045  int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
2046  int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
2047  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
2048  int atomEnd = patchPos[pencilPatchIndex];
2049  int numAtoms = atomEnd-atomStart;
2050 
2051  // Copy in local forces that are stored in the force[] array
2052  for (int i=0;i < numAtoms;i++) {
2053  forceMsg->force[index[atomStart + i]] = forces[0][atomStart + i];
2054  if (simParams->alchOn) {
2055  forceMsg->force2[index[atomStart + i]] = forces[1][atomStart + i];
2056  if (simParams->alchDecouple) {
2057  forceMsg->force3[index[atomStart + i]] = forces[2][atomStart + i];
2058  forceMsg->force4[index[atomStart + i]] = forces[3][atomStart + i];
2059  }
2060  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
2061  forceMsg->force5[index[atomStart + i]] = forces[4][atomStart + i];
2062  }
2063  }
2064  }
2065 
2066  }
2067 
2068  // Add forces from neighboring pencils
2069  for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
2070  int pp = plList[forceI][homePatchIndex][j].pp;
2071  int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
2072 
2073  int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
2074  int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
2075  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
2076  int atomEnd = patchPos[pencilPatchIndex];
2077  int numAtoms = atomEnd-atomStart;
2078  CudaForce *dstForce = forceMsg->force;
2079  // CudaForce *srcForce = neighborForcePencils[pp].force;
2080  CudaForce *dstForce2 = forceMsg->force2;
2081  CudaForce *dstForce3 = forceMsg->force3;
2082  CudaForce *dstForce4 = forceMsg->force4;
2083  CudaForce *dstForce5 = forceMsg->force5;
2084  CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
2085  CudaForce *srcForce2 = neighborForcePencilMsgs[pp]->force2;
2086  CudaForce *srcForce3 = neighborForcePencilMsgs[pp]->force3;
2087  CudaForce *srcForce4 = neighborForcePencilMsgs[pp]->force4;
2088  CudaForce *srcForce5 = neighborForcePencilMsgs[pp]->force5;
2089 
2090  for (int i=0;i < numAtoms;i++) {
2091  dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
2092  dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
2093  dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
2094  if (simParams->alchOn) {
2095  dstForce2[index[atomStart + i]].x += srcForce2[atomStart + i].x;
2096  dstForce2[index[atomStart + i]].y += srcForce2[atomStart + i].y;
2097  dstForce2[index[atomStart + i]].z += srcForce2[atomStart + i].z;
2098  if (simParams->alchDecouple) {
2099  dstForce3[index[atomStart + i]].x += srcForce3[atomStart + i].x;
2100  dstForce3[index[atomStart + i]].y += srcForce3[atomStart + i].y;
2101  dstForce3[index[atomStart + i]].z += srcForce3[atomStart + i].z;
2102  dstForce4[index[atomStart + i]].x += srcForce4[atomStart + i].x;
2103  dstForce4[index[atomStart + i]].y += srcForce4[atomStart + i].y;
2104  dstForce4[index[atomStart + i]].z += srcForce4[atomStart + i].z;
2105  }
2106  if (bool(simParams->alchElecLambdaStart) == true || simParams->alchThermIntOn) {
2107  dstForce5[index[atomStart + i]].x += srcForce5[atomStart + i].x;
2108  dstForce5[index[atomStart + i]].y += srcForce5[atomStart + i].y;
2109  dstForce5[index[atomStart + i]].z += srcForce5[atomStart + i].z;
2110  }
2111  }
2112  }
2113 
2114  }
2115  }
2116 
2117  // Clear storage
2118  plList[forceI][homePatchIndex].clear();
2119 
2120  // ----------------------------- lock start ---------------------------
2121  // bool done = false;
2122  CmiLock(lock_numHomePatchesMerged);
2123  numHomePatchesMerged++;
2124  if (numHomePatchesMerged == numHomePatches) {
2125  // Reset counter
2126  numHomePatchesMerged = 0;
2127 
2128  // Delete messages
2129  for (int i=0;i < neighborForcePencilMsgs.size();i++) {
2130  if (neighborForcePencilMsgs[i] != NULL) {
2131  delete neighborForcePencilMsgs[i];
2132  neighborForcePencilMsgs[i] = NULL;
2133  }
2134  }
2135 
2136  // Done merging and sending forces => clear storage
2137  for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
2138  homePatchIndexList[forceI][pp].clear();
2139  for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
2140  pmeAtomStorage[forceI][pp]->clear();
2141 
2142  }
2143  CmiUnlock(lock_numHomePatchesMerged);
2144  // ----------------------------- lock end ---------------------------
2145 
2146  // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
2147  // this node will then relay the message to the Pe that originally sent the atoms
2148  int pe = forceMsg->pe;
2149  if (CkNodeOf(pe) != CkMyNode())
2150  thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
2151  else
2152  sendForcesToPatch(forceMsg);
2153 
2154 }
2155 
2156 void ComputePmeCUDADevice::sendForcesToPatch(PmeForceMsg *forceMsg) {
2157  // Now we're on the node that has the Pe, so the "compute" pointer is valid
2158  int pe = forceMsg->pe;
2159  ComputePmeCUDA *compute = forceMsg->compute;
2160 
2161  // Store message for use in ComputePmeCUDA, where it'll also be deleted.
2162  if (compute->storePmeForceMsg(forceMsg)) {
2163  // Enqueue on the pe that sent the atoms in the first place
2164  LocalWorkMsg *lmsg = compute->localWorkMsg;
2165  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
2166  wdProxy[pe].enqueuePme(lmsg);
2167  }
2168 }
2169 #endif // NAMD_CUDA
2170 
2171 #include "ComputePmeCUDAMgr.def.h"