NAMD
ComputePmeCUDAMgr.h
Go to the documentation of this file.
1 #ifndef COMPUTEPMECUDAMGR_H
2 #define COMPUTEPMECUDAMGR_H
3 
4 #include <vector>
5 
6 #include "PmeBase.h"
7 #include "PmeSolver.h"
8 #include "PmeSolverUtil.h"
9 #include "ComputePmeCUDAMgr.decl.h"
10 #ifdef NAMD_CUDA
11 #include <cuda_runtime.h> // Needed for cudaStream_t that is used in ComputePmeCUDAMgr -class
12 #endif
13 #ifdef NAMD_HIP
14 #include "HipDefines.h"
15 #include <hip/hip_runtime.h>
16 #endif
17 
18 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
19 class ComputePmeCUDA;
20 
21 //
22 // Base class for thread safe atom storage
23 //
25 public:
26  PmeAtomStorage(const bool useIndex) : useIndex(useIndex) {
27  numAtoms = 0;
28  atomCapacity = 0;
29  atom = NULL;
30  atomIndexCapacity = 0;
31  atomIndex = NULL;
32  overflowStart = 0;
33  overflowEnd = 0;
34  overflowAtomCapacity = 0;
35  overflowAtom = NULL;
36  overflowAtomIndexCapacity = 0;
37  overflowAtomIndex = NULL;
38  lock_ = CmiCreateLock();
39  }
40  virtual ~PmeAtomStorage() {
41  CmiDestroyLock(lock_);
42  }
43 
44  int addAtoms(const int natom, const CudaAtom* src) {
45  return addAtoms_(natom, src, NULL);
46  }
47 
48  int addAtomsWithIndex(const int natom, const CudaAtom* src, const int* index) {
49  return addAtoms_(natom, src, index);
50  }
51 
52  // Finish up, must be called after "done" is returned by addAtoms.
53  // Only the last thread that gets the "done" signal from addAtoms can enter here.
54  void finish() {
55  if (overflowEnd-overflowStart > 0) {
56  resize_((void **)&atom, numAtoms, atomCapacity, sizeof(CudaAtom));
57  if (useIndex) resize_((void **)&atomIndex, numAtoms, atomIndexCapacity, sizeof(int));
58  memcpy_(atom+overflowStart, overflowAtom, (overflowEnd - overflowStart)*sizeof(CudaAtom));
59  if (useIndex) memcpy_(atomIndex+overflowStart, overflowAtomIndex, (overflowEnd - overflowStart)*sizeof(int));
60  overflowStart = 0;
61  overflowEnd = 0;
62  }
63  }
64 
65  // Clear and reset storage to initial stage.
66  // Only the last thread that gets the "done" signal from addAtoms can enter here.
67  void clear() {
68  patchPos.clear();
69  numAtoms = 0;
70  }
71 
72  // Return pointer to atom data
74  return atom;
75  }
76 
77  // Return pointer to patch positions
78  int* getPatchPos() {
79  return patchPos.data();
80  }
81 
82  int getNumPatches() {
83  return patchPos.size();
84  }
85 
86  int getNumAtoms() {
87  return numAtoms;
88  }
89 
90  int* getAtomIndex() {
91  if (!useIndex)
92  NAMD_bug("PmeAtomStorage::getAtomIndex, no indexing enabled");
93  return atomIndex;
94  }
95 
96 protected:
97  // Atom array
99  // Atom index array
100  int* atomIndex;
101  // Overflow atom array
103  // Overflow atom index array
105 
106 private:
107  // If true, uses indexed atom arrays
108  const bool useIndex;
109  // Node lock
110  CmiNodeLock lock_;
111  // Data overflow
112  int overflowAtomCapacity;
113  int overflowAtomIndexCapacity;
114  int overflowStart;
115  int overflowEnd;
116  // Number of atoms currently in storage
117  int numAtoms;
118  // Atom patch position
119  std::vector<int> patchPos;
120  // Atom array capacity
121  int atomCapacity;
122  // Atom index array capacity
123  int atomIndexCapacity;
124 
125  // Resize array with 1.5x extra storage
126  void resize_(void **array, int sizeRequested, int& arrayCapacity, const int sizeofType) {
127  // If array is not NULL and has enough capacity => we have nothing to do
128  if (*array != NULL && arrayCapacity >= sizeRequested) return;
129 
130  // Otherwise, allocate new array
131  int newArrayCapacity = (int)(sizeRequested*1.5);
132  void* newArray = alloc_(sizeofType*newArrayCapacity);
133 
134  if (*array != NULL) {
135  // We have old array => copy contents to new array
136  memcpy_(newArray, *array, arrayCapacity*sizeofType);
137  // De-allocate old array
138  dealloc_(*array);
139  }
140 
141  // Set new capacity and array pointer
142  arrayCapacity = newArrayCapacity;
143  *array = newArray;
144  }
145 
146  virtual void memcpy_(void *dst, const void* src, const int size) {
147  memcpy(dst, src, size);
148  }
149 
150  virtual void copyWithIndex_(CudaAtom* dst, const CudaAtom* src, const int natom, const int* indexSrc) {
151  for (int i=0;i < natom;i++) dst[i] = src[indexSrc[i]];
152  }
153 
154  // Allocate array of size bytes
155  virtual void* alloc_(const int size)=0;
156 
157  // Deallocate array
158  virtual void dealloc_(void *p)=0;
159 
160  // Add atoms in thread-safe manner.
161  // Returns the patch index where the atoms were added
162  int addAtoms_(const int natom, const CudaAtom* src, const int* index) {
163  CmiLock(lock_);
164  // Accumulate position for patches:
165  // atoms for patch i are in the range [ patchPos[i-1], patchPos[i]-1 ]
166  int patchInd = patchPos.size();
167  int ppos = (patchInd == 0) ? natom : patchPos[patchInd-1] + natom;
168  patchPos.push_back(ppos);
169  int pos = numAtoms;
170  bool overflow = false;
171  numAtoms += natom;
172  // Check for overflow
173  if (numAtoms > atomCapacity || (useIndex && numAtoms > atomIndexCapacity)) {
174  // number of atoms exceeds capacity, store into overflow buffer
175  // Note: storing to overflow should be very infrequent, most likely only
176  // in the initial call
177  if (overflowEnd-overflowStart == 0) {
178  overflowStart = pos;
179  overflowEnd = pos;
180  }
181  overflowEnd += natom;
182  if (overflowEnd-overflowStart > overflowAtomCapacity) {
183  resize_((void **)&overflowAtom, overflowEnd-overflowStart, overflowAtomCapacity, sizeof(CudaAtom));
184  }
185  if (useIndex && overflowEnd-overflowStart > overflowAtomIndexCapacity) {
186  resize_((void **)&overflowAtomIndex, overflowEnd-overflowStart, overflowAtomIndexCapacity, sizeof(int));
187  }
188  if (index != NULL) {
189  if (useIndex) memcpy_(overflowAtomIndex+overflowEnd-overflowStart-natom, index, natom*sizeof(int));
190  copyWithIndex_(overflowAtom+overflowEnd-overflowStart-natom, src, natom, index);
191  } else {
192  memcpy_(overflowAtom+overflowEnd-overflowStart-natom, src, natom*sizeof(CudaAtom));
193  }
194  overflow = true;
195  }
196  CmiUnlock(lock_);
197  // If no overflow, copy to final position
198  if (!overflow) {
199  if (index != NULL) {
200  if (useIndex) memcpy_(atomIndex+pos, index, natom*sizeof(int));
201  copyWithIndex_(atom+pos, src, natom, index);
202  } else {
203  memcpy_(atom+pos, src, natom*sizeof(CudaAtom));
204  }
205  }
206  return patchInd;
207  }
208 
209 };
210 
211 class PmeAtomMsg : public CMessage_PmeAtomMsg {
212 public:
214  int numAtoms;
215  int i, j;
217  int pe;
220  // int miny, minz;
221 };
222 
223 class PmeForceMsg : public CMessage_PmeForceMsg {
224 public:
226  int pe;
227  int numAtoms;
229  bool zeroCopy;
231 };
232 
233 class PmeLaunchMsg : public CMessage_PmeLaunchMsg {
234 public:
236  int natom;
237  int pe;
239 };
240 
241 class RegisterPatchMsg : public CMessage_RegisterPatchMsg {
242 public:
243  int i, j;
244 };
245 
246 class NumDevicesMsg : public CMessage_NumDevicesMsg {
247 public:
248  NumDevicesMsg(int numDevices) : numDevices(numDevices) {}
250 };
251 
252 class PmeAtomPencilMsg : public CMessage_PmeAtomPencilMsg {
253 public:
255  int numAtoms;
256  int y, z;
257  int srcY, srcZ;
260 };
261 
262 class PmeForcePencilMsg : public CMessage_PmeForcePencilMsg {
263 public:
265  int numAtoms;
266  int y, z;
267  int srcY, srcZ;
268 };
269 
270 class CProxy_ComputePmeCUDADevice;
271 class RecvDeviceMsg : public CMessage_RecvDeviceMsg {
272 public:
273  CProxy_ComputePmeCUDADevice* dev;
275 };
276 
277 class PmeAtomFiler : public CBase_PmeAtomFiler {
278 public:
279  PmeAtomFiler();
280  PmeAtomFiler(CkMigrateMessage *);
281  ~PmeAtomFiler();
282  void fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
283  const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi);
284  // static inline int yBlock(int p) {return p % 3;}
285  // static inline int zBlock(int p) {return p / 3;}
286  int getNumAtoms(int p) {return pencilSize[p];}
287  int* getAtomIndex(int p) {return pencil[p];}
288 private:
289  // 9 Pencils + 1 Stay atom pencil
290  int pencilSize[9+1];
291  int pencilCapacity[9+1];
292  int* pencil[9+1];
293 };
294 
295 
296 class CProxy_ComputePmeCUDAMgr;
297 class ComputePmeCUDADevice : public CBase_ComputePmeCUDADevice {
298 public:
299  // ComputePmeCUDADevice_SDAG_CODE;
301  ComputePmeCUDADevice(CkMigrateMessage *m);
303  void initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
304  int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
305  CProxy_PmeAtomFiler pmeAtomFiler_in);
306  int getDeviceID();
307  cudaStream_t getStream();
308  CProxy_ComputePmeCUDAMgr getMgrProxy();
309  void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in);
310  void setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in);
311  void setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in);
312  void activate_pencils();
313  void initializePatches(int numHomePatches_in);
314  void registerNeighbor();
315  void recvAtoms(PmeAtomMsg *msg);
316  void sendAtomsToNeighbors();
317  void sendAtomsToNeighbor(int y, int z, int atomIval);
320  void spreadCharge();
321  void gatherForce();
322  void gatherForceDone();
323  void sendForcesToNeighbors();
325  void mergeForcesOnPatch(int homePatchIndex);
326  void sendForcesToPatch(PmeForceMsg *forceMsg);
327 
328  void gatherForceDoneSubset(int first, int last);
329 
330 private:
331  // Store updated lattice
332  Lattice lattice;
333  // Store virial and energy flags
334  bool doVirial, doEnergy;
335  // PME grid definiton
336  PmeGrid pmeGrid;
337  // PME pencil type
338  int pmePencilType;
339  // Neighboring pencil bounds, [-1,1]
340  int ylo, yhi, zlo, zhi;
341  // Size of the neighboring pencil grid, maximum value 3. yNBlocks = yhi - ylo + 1
342  int yNBlocks, zNBlocks;
343  // Number of home patches for this device
344  int numHomePatches;
345  // Pencil location for this device
346  int pencilIndexY, pencilIndexZ;
347 
348  // Number of neighbors expected to provide atoms including self
349  int numNeighborsExpected;
350 
351  // Number of stray atoms
352  int numStrayAtoms;
353 
354  // Node locks
355  CmiNodeLock lock_numHomePatchesMerged;
356  CmiNodeLock lock_numPencils;
357  CmiNodeLock lock_numNeighborsRecv;
358  CmiNodeLock lock_recvAtoms;
359 
360  int atomI, forceI;
361 
362  //----------------------------------------------------------------------------------
363  // Book keeping
364  // NOTE: We keep two copies of pmeAtomStorage and homePatchIndexList so that forces can be
365  // merged while next patch of atoms is already being received
366  //----------------------------------------------------------------------------------
367  // Storage for each pencil on the yNBlocks x zNBlocks grid
368  std::vector< PmeAtomStorage* > pmeAtomStorage[2];
369  std::vector<bool> pmeAtomStorageAllocatedHere;
370 
371  // Size numHomePatches:
372  // Tells how many pencils have contributed to home patch
373  std::vector<int> numPencils[2];
374 
375  // Pencil location
376  struct PencilLocation {
377  // Pencil index
378  int pp;
379  // Patch location in the pencil
380  int pencilPatchIndex;
381  PencilLocation(int pp, int pencilPatchIndex) : pp(pp), pencilPatchIndex(pencilPatchIndex) {}
382  };
383 
384  // Size numHomePatches
385  std::vector< std::vector<PencilLocation> > plList[2];
386 
387  // Size numHomePatches
388  std::vector< PmeForceMsg* > homePatchForceMsgs[2];
389 
390  // // Size numHomePatches
391  // std::vector<int> numHomeAtoms[2];
392 
393  std::vector< std::vector<int> > homePatchIndexList[2];
394 
395  // Number of neighbors from which we have received atoms
396  int numNeighborsRecv;
397 
398  // Number of home patches we have received atom from
399  int numHomePatchesRecv;
400 
401  // Number of home patches we have merged forces for
402  int numHomePatchesMerged;
403 
404  // Size yNBlocks*zNBlocks
405  std::vector< PmeForcePencilMsg* > neighborForcePencilMsgs;
406  // std::vector< PmeForcePencil > neighborForcePencils;
407 
408  // Size yNBlocks*zNBlocks
409  std::vector<int> neighborPatchIndex;
410  //----------------------------------------------------------------------------------
411 
412  // CUDA stream
413  cudaStream_t stream;
414  bool streamCreated;
415  // Device ID
416  int deviceID;
417  // Charge spreading and force gathering
418  PmeRealSpaceCompute* pmeRealSpaceCompute;
419  // Host memory force array
420  int forceCapacity;
421  CudaForce* force;
422 
423  // Proxy for the manager
424  CProxy_ComputePmeCUDAMgr mgrProxy;
425 
426  // Atom filer proxy
427  CProxy_PmeAtomFiler pmeAtomFiler;
428 
429  // Pencil proxy
430  CProxy_CudaPmePencilXYZ pmePencilXYZ;
431  CProxy_CudaPmePencilXY pmePencilXY;
432  CProxy_CudaPmePencilX pmePencilX;
433 
434  // For event tracing
435  double beforeWalltime;
436 };
437 
438 class ComputePmeCUDAMgr : public CBase_ComputePmeCUDAMgr {
439 public:
442  ComputePmeCUDAMgr(CkMigrateMessage *);
444  void setupPencils();
445  void initialize(CkQdMsg *msg);
446  void initialize_pencils(CkQdMsg *msg);
447  void activate_pencils(CkQdMsg *msg);
448  PmeGrid getPmeGrid() {return pmeGrid;}
449  int getNode(int i, int j);
450  int getDevice(int i, int j);
451  int getDevicePencilY(int i, int j);
452  int getDevicePencilZ(int i, int j);
453  int getDeviceIDPencilX(int i, int j);
454  int getDeviceIDPencilY(int i, int j);
455  int getDeviceIDPencilZ(int i, int j);
456  void recvPencils(CProxy_CudaPmePencilXYZ xyz);
457  void recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z);
458  void recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z);
459 
461  void recvDevices(RecvDeviceMsg* msg);
462  void recvAtomFiler(CProxy_PmeAtomFiler filer);
463  void skip();
464  void recvAtoms(PmeAtomMsg *msg);
465  void getHomePencil(PatchID patchID, int& homey, int& homez);
466  int getHomeNode(PatchID patchID);
467 
468  bool isPmePe(int pe);
469  bool isPmeNode(int node);
470  bool isPmeDevice(int deviceID);
471 
473  CProxy_ComputePmeCUDAMgr mgrProxy(CkpvAccess(BOCclass_group).computePmeCUDAMgr);
474  return mgrProxy.ckLocalBranch();
475  }
476 protected:
477 
478 private:
479  void restrictToMaxPMEPencils();
480 
481  // ---------------------------------------------
482  // For .ci file
483  // Counter for determining numDevicesMax
484  int numNodesContributed;
485  int numDevicesMax;
486 
487  // Number of home patches for each device on this manager
488  std::vector<int> numHomePatchesList;
489 
490  // Counter for "registerPatchDone"
491  int numTotalPatches;
492  // ---------------------------------------------
493 
494  // PME pencil type: 1=column, 2=slab, 3=box
495  int pmePencilType;
496 
497  PmeGrid pmeGrid;
498 
499  // Number of CUDA devices on this node that are used for PME computation
500  int numDevices;
501 
502  std::vector<int> xPes;
503  std::vector<int> yPes;
504  std::vector<int> zPes;
505 
506  // List of pencil coordinates (i,j) for each device held by this node
507  struct IJ {
508  int i, j;
509  };
510  std::vector<IJ> ijPencilX;
511  std::vector<IJ> ijPencilY;
512  std::vector<IJ> ijPencilZ;
513 
514  struct NodeDevice {
515  int node;
516  int device;
517  };
518  std::vector<NodeDevice> nodeDeviceList;
519 
520  // Atom filer proxy
521  CProxy_PmeAtomFiler pmeAtomFiler;
522 
523  // Device proxies
524  std::vector<CProxy_ComputePmeCUDADevice> deviceProxy;
525 
526  // Extra devices
527  struct ExtraDevice {
528  int deviceID;
529  cudaStream_t stream;
530  };
531  std::vector<ExtraDevice> extraDevices;
532 
533  // Pencil proxies
534  CProxy_CudaPmePencilXYZ pmePencilXYZ;
535  CProxy_CudaPmePencilXY pmePencilXY;
536  CProxy_CudaPmePencilX pmePencilX;
537  CProxy_CudaPmePencilY pmePencilY;
538  CProxy_CudaPmePencilZ pmePencilZ;
539 
540 };
541 #else // NAMD_CUDA
542 class ComputePmeCUDAMgr : public CBase_ComputePmeCUDAMgr {
543 };
544 #endif // NAMD_CUDA
545 
546 #endif // COMPUTEPMECUDAMGR_H
int getHomeNode(PatchID patchID)
void sendAtomsToNeighbor(int y, int z, int atomIval)
void recvForcesFromNeighbor(PmeForcePencilMsg *msg)
CProxy_ComputePmeCUDAMgr getMgrProxy()
void initialize(CkQdMsg *msg)
virial xy
ComputePmeCUDA * compute
int getDeviceIDPencilX(int i, int j)
void sendForcesToPatch(PmeForceMsg *forceMsg)
void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void getHomePencil(PatchID patchID, int &homey, int &homez)
static __thread atom * atoms
void recvAtomFiler(CProxy_PmeAtomFiler filer)
void recvAtoms(PmeAtomMsg *msg)
void mergeForcesOnPatch(int homePatchIndex)
PmeAtomStorage(const bool useIndex)
void recvDevices(RecvDeviceMsg *msg)
int getDeviceIDPencilZ(int i, int j)
__thread cudaStream_t stream
void fileAtoms(const int numAtoms, const CudaAtom *atoms, Lattice &lattice, const PmeGrid &pmeGrid, const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi)
int getNumAtoms(int p)
CudaAtom * overflowAtom
void initialize(PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
static ComputePmeCUDAMgr * Object()
void NAMD_bug(const char *err_msg)
Definition: common.C:129
int getDevicePencilZ(int i, int j)
CudaAtom * atoms
void recvAtomsFromNeighbor(PmeAtomPencilMsg *msg)
gridSize z
virtual ~PmeAtomStorage()
void initializePatches(int numHomePatches_in)
int PatchID
Definition: NamdTypes.h:182
int addAtomsWithIndex(const int natom, const CudaAtom *src, const int *index)
CudaForce * force
int addAtoms(const int natom, const CudaAtom *src)
int getDevice(int i, int j)
int getDeviceIDPencilY(int i, int j)
void recvAtoms(PmeAtomMsg *msg)
void activate_pencils(CkQdMsg *msg)
CudaAtom * getAtoms()
bool isPmeNode(int node)
int getNode(int i, int j)
void initialize_pencils(CkQdMsg *msg)
gridSize y
ComputePmeCUDA * compute
int getDevicePencilY(int i, int j)
int * getAtomIndex(int p)
void recvPencils(CProxy_CudaPmePencilXYZ xyz)
CProxy_ComputePmeCUDADevice * dev
gridSize x
void gatherForceDoneSubset(int first, int last)
NumDevicesMsg(int numDevices)
ComputePmeCUDA * compute
CudaForce * force
bool isPmeDevice(int deviceID)