NAMD
CudaTileListKernel.hip.h
Go to the documentation of this file.
1 #ifndef CUDATILELISTKERNEL_HIP_H
2 #define CUDATILELISTKERNEL_HIP_H
3 #if defined(NAMD_HIP)
4 
5 // Exclusion mask: bit 1 = atom pair is included, 0 = atom pair is excluded
6 struct TileExcl {
8 };
9 
10 struct TileList {
11  int iatomStart;
12  int jtileStart;
13  int jtileEnd;
14  float3 offsetXYZ;
15  int2 patchInd; // Patch indices for this list
16  union {
17  int2 patchNumList; // Number of lists contributing to each patch
18  // int icompute;
19  };
20  int icompute;
21 };
22 
// Per-patch-pair atom counts for the i side and the j side.
// NOTE(review): "Size" / "FreeSize" presumably mean total and free atom
// counts — confirm against the code that fills this record.
struct PatchPairRecord {
  int iatomSize, iatomFreeSize;  // i side
  int jatomSize, jatomFreeSize;  // j side
};
29 
//
// Bounding box, stored as a center point plus a half-width per axis
//
struct BoundingBox {
  // Center
  float x;
  float y;
  float z;
  // Half-width
  float wx;
  float wy;
  float wz;
};
37 
38 //
39 // Stripped-down CUDA version of compute record
40 //
41 struct CudaComputeRecord {
42  int2 patchInd;
43  float3 offsetXYZ;
44 };
45 
//
// Stripped-down CUDA version of patch record
//
struct CudaPatchRecord {
  int numAtoms, numFreeAtoms;  // Atom counts for the patch
  int atomStart;               // Start of this patch's atoms
#ifdef NODEGROUP_FORCE_REGISTER
  // Extra trailing field carrying the patch ID; present only in builds
  // that define NODEGROUP_FORCE_REGISTER
  int patchID;
#endif
};
58 
//
// Tile list status. Used to communicate tile list sizes between GPU and CPU
//
// NOTE(review): the source listing skips one numbered line between
// numExcluded and outputOrderIndex; a member may have been dropped during
// extraction — verify against the upstream header before relying on layout.
//
struct TileListStat {
  int numTileLists, numTileListsGBIS;  // Tile list counts (regular / GBIS)
  int numJtiles;                       // Number of j-tiles
  int numExcluded;                     // Number of exclusions
  int outputOrderIndex;
  bool tilesSizeExceeded;              // Set when the tile storage was too small
};
71 
// Per-tile-list virial and energy accumulators.
//
// NOTE(review): the source listing skips one numbered line between the
// force components and energyVdw; a member may have been dropped during
// extraction — verify against the upstream header before relying on layout.
struct TileListVirialEnergy {
  float shx;
  float shy;
  float shz;
  float forcex;
  float forcey;
  float forcez;

  double energyVdw, energyElec, energySlow;

  /* FEP energies */
  double energyVdw_s, energyElec_s, energySlow_s;

  /* TI energies */
  double energyVdw_ti_1, energyVdw_ti_2;
  double energyElec_ti_1, energyElec_ti_2;
  double energySlow_ti_1, energySlow_ti_2;

  double energyGBIS;
};
95 
// Double-precision virial and energy totals: two 9-element virial arrays
// followed by the same set of energy terms as TileListVirialEnergy.
struct VirialEnergy {
  double virial[9];
  double virialSlow[9];

  double energyVdw, energyElec, energySlow;

  /* FEP energies */
  double energyVdw_s, energyElec_s, energySlow_s;

  /* TI energies */
  double energyVdw_ti_1, energyVdw_ti_2;
  double energyElec_ti_1, energyElec_ti_2;
  double energySlow_ti_1, energySlow_ti_2;

  double energyGBIS;
};
118 
119 class CudaTileListKernel {
120 private:
121 
122  template <typename T>
123  struct PtrSize {
124  PtrSize(T* ptr, int size) : ptr(ptr), size(size) {}
125  T* ptr;
126  size_t size;
127  };
128 
129  const int deviceID;
130 
131  // Events
132  cudaEvent_t tileListStatEvent;
133  bool tileListStatEventRecord;
134 
135  // Pair list cutoff squared
136  float plcutoff2;
137 
138  // Number of patches
139  int numPatches;
140 
141  // Number of computes
142  int numComputes;
143 
144  // Number of tile lists
145  int numTileLists;
146 
147  // Number of tile lists for GBIS
148  int numTileListsGBIS;
149 
150  // Number of tiles
151  int numJtiles;
152 
153  // Maximum number of tiles per tile list
154  int maxTileListLen;
155 
156  CudaPatchRecord* cudaPatches;
157  size_t cudaPatchesSize;
158 
159  CudaComputeRecord* cudaComputes;
160  size_t cudaComputesSize;
161 
162  // --- For Streaming ---
163  const bool doStreaming;
164  int* patchNumLists;
165  size_t patchNumListsSize;
166 
167  int* emptyPatches;
168  size_t emptyPatchesSize;
169  int* h_emptyPatches;
170  size_t h_emptyPatchesSize;
171  int numEmptyPatches;
172 
173  unsigned int* sortKeySrc;
174  size_t sortKeySrcSize;
175  unsigned int* sortKeyDst;
176  size_t sortKeyDstSize;
177 
178  int maxTileListLen_sortKeys;
179 
180  unsigned int* sortKeys;
181  size_t sortKeysSize;
182 
183  int2* minmaxListLen;
184  size_t minmaxListLenSize;
185 
186  int sortKeys_endbit;
187  // ---------------------
188 
189  // Single entry pinned host and device buffers for communicating tile list status
190  TileListStat* h_tileListStat;
191  TileListStat* d_tileListStat;
192 
193  // Atom coordinates and charge
194  float4* xyzq;
195  size_t xyzqSize;
196  // Atom coordinate storage size
197  size_t atomStorageSize;
198 
199  char *part;
200  size_t partSize;
201 
202  // Tile lists
203  TileList* tileLists1;
204  size_t tileLists1Size;
205  TileList* tileLists2;
206  size_t tileLists2Size;
207  TileList* tileListsGBIS;
208  size_t tileListsGBISSize;
209 
210  // Pair pairs
211  PatchPairRecord* patchPairs1;
212  size_t patchPairs1Size;
213  PatchPairRecord* patchPairs2;
214  size_t patchPairs2Size;
215 
216  // j-atom start for tiles
217  int* tileJatomStart1;
218  size_t tileJatomStart1Size;
219  int* tileJatomStart2;
220  size_t tileJatomStart2Size;
221  int* tileJatomStartGBIS;
222  size_t tileJatomStartGBISSize;
223 
224  // Bounding boxes
225  BoundingBox* boundingBoxes;
226  size_t boundingBoxesSize;
227 
228  // Depth of each tile list
229  unsigned int* tileListDepth1;
230  size_t tileListDepth1Size;
231  unsigned int* tileListDepth2;
232  size_t tileListDepth2Size;
233 
234  // Tile list order
235  int* tileListOrder1;
236  size_t tileListOrder1Size;
237  int* tileListOrder2;
238  size_t tileListOrder2Size;
239 
240  // Position of each tile list = ExclusiveSum(tileListDepths)
241  int* tileListPos;
242  size_t tileListPosSize;
243 
244  // jtile occupancy and position
245  int* jtiles;
246  size_t jtilesSize;
247 
248  // Temporary buffers used in buildTileLists
249  int* tilePos;
250  size_t tilePosSize;
251 
252  // Exclusions
253  TileExcl* tileExcls1;
254  size_t tileExcls1Size;
255  TileExcl* tileExcls2;
256  size_t tileExcls2Size;
257 
258  // Temporary storage for CUB
259  char* tempStorage;
260  size_t tempStorageSize;
261 
262  // Number of exclusions detected
263  int numExcluded;
264 
265  // Virials and energies for tile lists
266  TileListVirialEnergy* tileListVirialEnergy;
267  size_t tileListVirialEnergySize;
268 
269  int tileListVirialEnergyLength;
270  int tileListVirialEnergyGBISLength;
271 
272  int activeBuffer;
273 
274  void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}
275 
276  void sortTileLists(
277  const bool useJtiles,
278  const int begin_bit, const bool highDepthBitsSet,
279  // Source
280  const int numTileListsSrc, const int numJtilesSrc,
281  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
282  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
283  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
284  // Destination
285  const int numTileListsDst, const int numJtilesDst,
286  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
287  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
288  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
289  cudaStream_t stream);
290 
291  void writeTileList(const char* filename, const int numTileLists,
292  const TileList* d_tileLists, cudaStream_t stream);
293  void writeTileList(FILE* handle, const int numTileLists,
294  const TileList* d_tileLists, cudaStream_t stream);
295  void writeTileJatomStart(const char* filename, const int numJtiles,
296  const int* d_tileJatomStart, cudaStream_t stream);
297  void writeTileJatomStart(FILE* handle, const int numJtiles,
298  const int* d_tileJatomStart, cudaStream_t stream);
299  void writeTileExcls(FILE* handle, const int numJtiles,
300  const TileExcl* d_tileExcl, cudaStream_t stream);
301  // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
302  // const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);
303 
304  int* outputOrder;
305  size_t outputOrderSize;
306  bool doOutputOrder;
307 
308 public:
309 
310  CudaTileListKernel(int deviceID, bool doStreaming);
312 
313  int getNumEmptyPatches() {return numEmptyPatches;}
314  int* getEmptyPatches() {return h_emptyPatches;}
315 
316  int getNumExcluded() {return numExcluded;}
317 
318  float get_plcutoff2() {return plcutoff2;}
319  int getNumTileLists() {return numTileLists;}
320  int getNumTileListsGBIS() {return numTileListsGBIS;}
321  int getNumJtiles() {return numJtiles;}
322  BoundingBox* getBoundingBoxes() {return boundingBoxes;}
323  int* getJtiles() {return jtiles;}
324  float4* get_xyzq() {return xyzq;}
325  char* get_part() {return part;}
326 
327  TileListStat* getTileListStatDevPtr() {return d_tileListStat;}
328  void clearTileListStat(cudaStream_t stream);
329 
330  int* getTileJatomStart() {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
332  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
333  }
334  unsigned int* getTileListDepth() {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
335  int* getTileListOrder() {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
336  TileExcl* getTileExcls() {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
337  PatchPairRecord* getPatchPairs() {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
338 
339  int* getTileJatomStartGBIS() {return tileJatomStartGBIS;}
340  TileList* getTileListsGBIS() {return tileListsGBIS;}
341 
342  TileListVirialEnergy* getTileListVirialEnergy() {return tileListVirialEnergy;}
343 
344  CudaPatchRecord* getCudaPatches() {return cudaPatches;}
345  int getCudaPatchesSize() {return cudaPatchesSize;}
346 
347  void prepareTileList(cudaStream_t stream);
348  void finishTileList(cudaStream_t stream);
349 
350  void updateComputes(const int numComputesIn,
351  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);
352 
353  void prepareBuffers(
354  int atomStorageSizeIn, int numPatchesIn,
355  const CudaPatchRecord* h_cudaPatches,
356  cudaStream_t stream);
357 
358  void buildTileLists(const int numTileListsPrev,
359  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
360  const float3 lata, const float3 latb, const float3 latc,
361  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
362  const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged,
363  const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration);
364 
365 void reSortTileLists(const bool doGBIS, const bool CUDASOAIntegratorOn, cudaStream_t stream); // void applyOutputOrder(cudaStream_t stream);
366 
367  void setTileListVirialEnergyLength(int len);
368  void setTileListVirialEnergyGBISLength(int len);
369  int getTileListVirialEnergyLength() {return tileListVirialEnergyLength;}
370  int getTileListVirialEnergyGBISLength() {return tileListVirialEnergyGBISLength;}
371 
372  int getNumPatches() {return numPatches;}
373 
374  int getNumComputes() {return numComputes;}
375  int* getOutputOrder() {
376  if (!doStreaming) return NULL;
377  if (doOutputOrder) {
378  return outputOrder;
379  } else {
380  return NULL;
381  }
382  }
383 
384 };
#endif // NAMD_HIP
#endif // CUDATILELISTKERNEL_HIP_H
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
void setTileListVirialEnergyLength(int len)
PatchPairRecord * getPatchPairs()
void clearTileListStat(cudaStream_t stream)
void setTileListVirialEnergyGBISLength(int len)
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)
CudaPatchRecord * getCudaPatches()
unsigned int * getTileListDepth()
#define WARPSIZE
Definition: CudaUtils.h:17
BoundingBox * getBoundingBoxes()
unsigned int excl[32]
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
TileList * getTileListsGBIS()
unsigned int WarpMask
Definition: CudaUtils.h:19
TileListStat * getTileListStatDevPtr()
void finishTileList(cudaStream_t stream)
float3 offsetXYZ
double virialSlow[9]
TileListVirialEnergy * getTileListVirialEnergy()
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged, const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration)
void reSortTileLists(const bool doGBIS, cudaStream_t stream)