NAMD
CudaTileListKernel.h
Go to the documentation of this file.
1 #ifndef CUDATILELISTKERNEL_H
2 #define CUDATILELISTKERNEL_H
3 #ifdef NAMD_CUDA
4 
5 #include <cuda_runtime.h>
6 
7 // Exclusion mask: bit 1 = atom pair is included, 0 = atom pair is excluded
// Storage for one exclusion mask: an array of 32 unsigned int words.
// Bit meaning is stated in the comment preceding this struct.
// NOTE(review): presumably one TileExcl per tile - confirm against the kernels.
8 struct TileExcl {
9  unsigned int excl[32];
10 };
11 
// One tile-list record.
// NOTE(review): the embedded listing numbers jump from 12 to 15, so any
// members declared on the skipped lines are not shown here.
12 struct TileList {
15  int jtileEnd;
16  float3 offsetXYZ;
17  int2 patchInd; // Patch indices for this list
// Anonymous union: patchNumList is its only live member, because the
// alternative "int icompute" inside it is commented out.
18  union {
19  int2 patchNumList; // Number of lists contributing to each patch
20  // int icompute;
21  };
// icompute is declared as an ordinary member, outside the union above.
22  int icompute;
23 };
24 
26  int iatomSize;
28  int jatomSize;
30 };
31 
32 //
33 // Bounding box structure
34 //
// Box described by a center point (x, y, z) and a half-width per coordinate
// (wx, wy, wz); all six values are single-precision floats.
35 struct BoundingBox {
36  float x, y, z; // Center
37  float wx, wy, wz; // Half-width
38 };
39 
40 //
41 // Stripped-down CUDA version of compute record
42 //
44  int2 patchInd;
45  float3 offsetXYZ;
46 };
47 
48 //
49 // Stripped-down CUDA version of patch record
50 //
52  int numAtoms;
54  int atomStart;
55 #ifdef NODEGROUP_FORCE_REGISTER
56  // appending an additional field to hold the patchId
57  int patchID;
58 #endif
59 };
60 
61 //
62 // Tile list status. Used to communicate tile list sizes between GPU and CPU
63 //
// Status record for tile lists (see the comment preceding this struct).
// NOTE(review): the embedded listing numbers jump 64 -> 67 -> 72, so only
// numJtiles is visible here; other members are not shown in this listing.
64 struct TileListStat {
67  int numJtiles;
72 };
73 
75  float shx, shy, shz;
76  float forcex, forcey, forcez;
78  double energyVdw;
79  double energyElec;
80  double energySlow;
81 
82  /* FEP energies */
83  double energyVdw_s;
84  double energyElec_s;
85  double energySlow_s;
86 
87  /* TI energies */
94 
95  double energyGBIS;
96 };
97 
// Bundle of virial and energy values, all stored as double:
//  - virial and virialSlow, each a 9-element array
//    (presumably a flattened 3x3 tensor - TODO confirm element order);
//  - energyVdw / energyElec / energySlow;
//  - the same three with an _s suffix, grouped under "FEP energies";
//  - TI energies (only energyVdw_ti_1 is visible; the embedded listing
//    numbers jump from 111 to 117, so the remaining TI members are not shown);
//  - energyGBIS.
98 struct VirialEnergy {
99  double virial[9];
100  double virialSlow[9];
101  double energyVdw;
102  double energyElec;
103  double energySlow;
104 
105  /* FEP energies */
106  double energyVdw_s;
107  double energyElec_s;
108  double energySlow_s;
109 
110  /* TI energies */
111  double energyVdw_ti_1;
117 
118  double energyGBIS;
119 };
120 
122 private:
123 
// Pairs a raw pointer with a size_t so both travel as a single argument
// (used for the Source/Destination parameters of sortTileLists).
// No destructor is declared, so nothing here releases the pointee.
// NOTE(review): whether size counts elements or bytes is not visible here.
124  template <typename T>
125  struct PtrSize {
// The constructor parameters share names with the members; the initializer
// list copies each parameter into the member of the same name.
126  PtrSize(T* ptr, size_t size) : ptr(ptr), size(size) {}
127  T* ptr;
128  size_t size;
129  };
130 
131  const int deviceID;
132 
133  // Events
134  cudaEvent_t tileListStatEvent;
135  bool tileListStatEventRecord;
136 
137  // Pair list cutoff squared
138  float plcutoff2;
139 
140  // Number of patches
141  int numPatches;
142 
143  // Number of computes
144  int numComputes;
145 
146  // Number of tile lists
147  int numTileLists;
148 
149  // Number of tile lists for GBIS
150  int numTileListsGBIS;
151 
152  // Number of tiles
153  int numJtiles;
154 
155  // Maximum number of tiles per tile list
156  int maxTileListLen;
157 
158  CudaPatchRecord* cudaPatches;
159  size_t cudaPatchesSize;
160 
161  CudaComputeRecord* cudaComputes;
162  size_t cudaComputesSize;
163 
164  // --- For Streaming ---
165  const bool doStreaming;
166  int* patchNumLists;
167  size_t patchNumListsSize;
168 
169  int* emptyPatches;
170  size_t emptyPatchesSize;
171  int* h_emptyPatches;
172  size_t h_emptyPatchesSize;
173  int numEmptyPatches;
174 
175  unsigned int* sortKeySrc;
176  size_t sortKeySrcSize;
177  unsigned int* sortKeyDst;
178  size_t sortKeyDstSize;
179 
180  int maxTileListLen_sortKeys;
181 
182  unsigned int* sortKeys;
183  size_t sortKeysSize;
184 
185  int2* minmaxListLen;
186  size_t minmaxListLenSize;
187 
188  int sortKeys_endbit;
189  // ---------------------
190 
191  // Single entry pinned host and device buffers for communicating tile list status
192  TileListStat* h_tileListStat;
193  TileListStat* d_tileListStat;
194 
195  // Atom coordinates and charge
196  float4* xyzq;
197  size_t xyzqSize;
198  // Atom coordinate storage size
199  int atomStorageSize;
200 
201  char *part;
202  size_t partSize;
203 
204  // Tile lists
205  TileList* tileLists1;
206  size_t tileLists1Size;
207  TileList* tileLists2;
208  size_t tileLists2Size;
209  TileList* tileListsGBIS;
210  size_t tileListsGBISSize;
211 
212  // Patch pairs
213  PatchPairRecord* patchPairs1;
214  size_t patchPairs1Size;
215  PatchPairRecord* patchPairs2;
216  size_t patchPairs2Size;
217 
218  // j-atom start for tiles
219  int* tileJatomStart1;
220  size_t tileJatomStart1Size;
221  int* tileJatomStart2;
222  size_t tileJatomStart2Size;
223  int* tileJatomStartGBIS;
224  size_t tileJatomStartGBISSize;
225 
226  // Bounding boxes
227  BoundingBox* boundingBoxes;
228  size_t boundingBoxesSize;
229 
230  // Depth of each tile list
231  unsigned int* tileListDepth1;
232  size_t tileListDepth1Size;
233  unsigned int* tileListDepth2;
234  size_t tileListDepth2Size;
235 
236  // Tile list order
237  int* tileListOrder1;
238  size_t tileListOrder1Size;
239  int* tileListOrder2;
240  size_t tileListOrder2Size;
241 
242  // Position of each tile list = ExclusiveSum(tileListDepths)
243  int* tileListPos;
244  size_t tileListPosSize;
245 
246  // jtile occupancy and position
247  int* jtiles;
248  size_t jtilesSize;
249 
250  // Temporary buffers used in buildTileLists
251  int* tilePos;
252  size_t tilePosSize;
253 
254  // Exclusions
255  TileExcl* tileExcls1;
256  size_t tileExcls1Size;
257  TileExcl* tileExcls2;
258  size_t tileExcls2Size;
259 
260  // Temporary storage for CUB
261  char* tempStorage;
262  size_t tempStorageSize;
263 
264  // Number of exclusions detected
265  int numExcluded;
266 
267  // Virials and energies for tile lists
268  TileListVirialEnergy* tileListVirialEnergy;
269  size_t tileListVirialEnergySize;
270 
271  int tileListVirialEnergyLength;
272  int tileListVirialEnergyGBISLength;
273 
274  int activeBuffer;
275 
// Stores activeBufferIn in activeBuffer. The buffer-selecting accessors in
// this class test activeBuffer == 1 to choose the "1" buffers and fall back
// to the "2" buffers for any other value.
276  void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}
277 
// Declaration only; the definition is not part of this header listing.
// The parameters form two parallel groups, Source and Destination. Each group
// has a tile-list count, a j-tile count, and PtrSize wrappers for tile lists,
// j-atom starts, tile-list depths, tile-list order, patch pairs and exclusions.
// NOTE(review): the trailing cudaStream_t is presumably the stream the work is
// issued on, and begin_bit presumably feeds a radix sort - confirm both
// against the implementation. The meaning of useJtiles and highDepthBitsSet
// cannot be determined from this declaration.
278  void sortTileLists(
279  const bool useJtiles,
280  const int begin_bit, const bool highDepthBitsSet,
281  // Source
282  const int numTileListsSrc, const int numJtilesSrc,
283  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
284  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
285  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
286  // Destination
287  const int numTileListsDst, const int numJtilesDst,
288  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
289  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
290  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
291  cudaStream_t stream);
292 
293  void writeTileList(const char* filename, const int numTileLists,
294  const TileList* d_tileLists, cudaStream_t stream);
295  void writeTileJatomStart(const char* filename, const int numJtiles,
296  const int* d_tileJatomStart, cudaStream_t stream);
297  // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
298  // const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);
299 
300  int* outputOrder;
301  size_t outputOrderSize;
302  bool doOutputOrder;
303 
304 public:
305 
306  CudaTileListKernel(int deviceID, bool doStreaming);
308 
309  int getNumEmptyPatches() {return numEmptyPatches;}
// Returns h_emptyPatches, not the separate emptyPatches member.
// NOTE(review): the h_ prefix suggests this is the host-side copy - confirm.
310  int* getEmptyPatches() {return h_emptyPatches;}
311 
312  int getNumExcluded() {return numExcluded;}
313 
314  float get_plcutoff2() {return plcutoff2;}
315  int getNumTileLists() {return numTileLists;}
316  int getNumTileListsGBIS() {return numTileListsGBIS;}
317  int getNumJtiles() {return numJtiles;}
318  BoundingBox* getBoundingBoxes() {return boundingBoxes;}
319  int* getJtiles() {return jtiles;}
320  float4* get_xyzq() {return xyzq;}
321  char* get_part() {return part;}
322 
// Returns d_tileListStat, which the member comment describes as one of the
// single-entry pinned host / device buffers for communicating tile list status.
323  TileListStat* getTileListStatDevPtr() {return d_tileListStat;}
324  void clearTileListStat(cudaStream_t stream);
325 
// Returns tileJatomStart1 when activeBuffer == 1; any other value of
// activeBuffer yields tileJatomStart2.
326  int* getTileJatomStart() {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
328  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
329  }
// Returns tileListDepth1 when activeBuffer == 1; any other value of
// activeBuffer yields tileListDepth2.
330  unsigned int* getTileListDepth() {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
// Returns tileListOrder1 when activeBuffer == 1; any other value of
// activeBuffer yields tileListOrder2.
331  int* getTileListOrder() {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
// Returns tileExcls1 when activeBuffer == 1; any other value of
// activeBuffer yields tileExcls2.
332  TileExcl* getTileExcls() {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
// Returns patchPairs1 when activeBuffer == 1; any other value of
// activeBuffer yields patchPairs2.
333  PatchPairRecord* getPatchPairs() {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
334 
335  int* getTileJatomStartGBIS() {return tileJatomStartGBIS;}
336  TileList* getTileListsGBIS() {return tileListsGBIS;}
337 
338  TileListVirialEnergy* getTileListVirialEnergy() {return tileListVirialEnergy;}
339 
340  CudaPatchRecord* getCudaPatches() {return cudaPatches;}
// Returns cudaPatchesSize.
// NOTE(review): that member is declared size_t while this accessor returns
// int, so the value is implicitly narrowed on return. Whether the size counts
// elements or bytes is not visible in this header.
341  int getCudaPatchesSize() {return cudaPatchesSize;}
342 
343  void prepareTileList(cudaStream_t stream);
344  void finishTileList(cudaStream_t stream);
345 
346  void updateComputes(const int numComputesIn,
347  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);
348 
349  void prepareBuffers(
350  int atomStorageSizeIn, int numPatchesIn,
351  const CudaPatchRecord* h_cudaPatches,
352  cudaStream_t stream);
353 
354  void buildTileLists(const int numTileListsPrev,
355  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
356  const float3 lata, const float3 latb, const float3 latc,
357  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
358  const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged,
359  const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration);
360 
361  void reSortTileLists(const bool doGBIS, cudaStream_t stream);
362  // void applyOutputOrder(cudaStream_t stream);
363 
364  void setTileListVirialEnergyLength(int len);
365  void setTileListVirialEnergyGBISLength(int len);
366  int getTileListVirialEnergyLength() {return tileListVirialEnergyLength;}
367  int getTileListVirialEnergyGBISLength() {return tileListVirialEnergyGBISLength;}
368 
369  int getNumPatches() {return numPatches;}
370 
371  int getNumComputes() {return numComputes;}
// Returns outputOrder only when both doStreaming and doOutputOrder are true;
// returns NULL in every other case.
372  int* getOutputOrder() {
// Without streaming there is never an output order to hand out.
373  if (!doStreaming) return NULL;
374  if (doOutputOrder) {
375  return outputOrder;
376  } else {
377  return NULL;
378  }
379  }
380 
381 };
382 #endif // NAMD_CUDA
383 #endif // CUDATILELISTKERNEL_H
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
void setTileListVirialEnergyLength(int len)
PatchPairRecord * getPatchPairs()
void clearTileListStat(cudaStream_t stream)
void setTileListVirialEnergyGBISLength(int len)
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)
CudaPatchRecord * getCudaPatches()
unsigned int * getTileListDepth()
BoundingBox * getBoundingBoxes()
unsigned int excl[32]
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
TileList * getTileListsGBIS()
TileListStat * getTileListStatDevPtr()
void finishTileList(cudaStream_t stream)
float3 offsetXYZ
double virialSlow[9]
TileListVirialEnergy * getTileListVirialEnergy()
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged, const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration)
void reSortTileLists(const bool doGBIS, cudaStream_t stream)