NAMD
CudaTileListKernel.h
Go to the documentation of this file.
1 #ifndef CUDATILELISTKERNEL_H
2 #define CUDATILELISTKERNEL_H
3 #ifdef NAMD_CUDA
4 
5 // Exclusion mask: bit 1 = atom pair is included, 0 = atom pair is excluded
// Exclusion-mask record holding 32 unsigned int words. Per the bit convention
// documented with this struct, a set bit means the atom pair is included and a
// clear bit means it is excluded.
// NOTE(review): presumably one word per row of a 32x32 atom tile - confirm in the kernels.
6 struct TileExcl {
7  unsigned int excl[32];
8 };
9 
// Per-tile-list record.
// NOTE(review): the embedded listing numbers jump from 10 to 13, so two lines of
// this struct are not visible here; restore them from the upstream header before
// relying on the member layout.
10 struct TileList {
13  int jtileEnd; // presumably the last j-tile index of this list - TODO confirm
14  float3 offsetXYZ; // NOTE(review): looks like a coordinate shift applied to this list - confirm
15  int2 patchInd; // Patch indices for this list
16  union {
17  int2 patchNumList; // Number of lists contributing to each patch
18  // int icompute;
19  };
20  int icompute; // declared outside the union; the in-union declaration is commented out
21 };
22 
// NOTE(review): the opening line of this struct (listing line 23) and listing
// lines 25 and 27 are missing from this view. Presumably this is the
// PatchPairRecord type used for patchPairs1/patchPairs2 - confirm upstream.
24  int iatomSize;
26  int jatomSize;
28 };
29 
30 //
31 // Bounding box structure
32 //
// Box described by its center point and its half-width along each coordinate.
33 struct BoundingBox {
34  float x, y, z; // Center
35  float wx, wy, wz; // Half-width
36 };
37 
38 //
39 // Stripped-down CUDA version of compute record
40 //
// NOTE(review): the struct's opening line (listing line 41) is missing from this
// view. Presumably this is the CudaComputeRecord type passed to updateComputes() -
// confirm upstream.
42  int2 patchInd; // same member name and type as TileList::patchInd (patch indices)
43  float3 offsetXYZ;
44 };
45 
46 //
47 // Stripped-down CUDA version of patch record
48 //
// NOTE(review): the struct's opening line (listing line 49) and listing line 51
// are missing from this view. Presumably this is the CudaPatchRecord type -
// confirm upstream.
50  int numAtoms;
52  int atomStart;
53 #ifdef NODEGROUP_FORCE_REGISTER
54  // appending an additional field to hold the patchId
55  int patchID; // present only when NODEGROUP_FORCE_REGISTER is defined, so the struct layout depends on that macro
56 #endif
57 };
58 
59 //
60 // Tile list status. Used to communicate tile list sizes between GPU and CPU
61 //
// NOTE(review): listing lines 63-64 and 66-69 are missing from this view, so only
// one member is visible; restore the remaining members from the upstream header.
62 struct TileListStat {
65  int numJtiles;
70 };
71 
// NOTE(review): the struct's opening line (listing line 72) and listing lines 75
// and 86-91 are missing from this view (the gap after the "TI energies" comment
// presumably held the TI members). Presumably this is the TileListVirialEnergy
// type - confirm upstream.
73  float shx, shy, shz;
74  float forcex, forcey, forcez;
76  double energyVdw;
77  double energyElec;
78  double energySlow;
79 
80  /* FEP energies */
81  double energyVdw_s;
82  double energyElec_s;
83  double energySlow_s;
84 
85  /* TI energies */
92 
93  double energyGBIS;
94 };
95 
// Virial and energy record: two 9-element double arrays (virial, virialSlow)
// followed by double-precision energy terms grouped as plain, FEP, TI and GBIS.
// NOTE(review): listing lines 110-114 are missing from this view, so only the
// first TI member is visible; restore the rest from the upstream header.
96 struct VirialEnergy {
97  double virial[9];
98  double virialSlow[9];
99  double energyVdw;
100  double energyElec;
101  double energySlow;
102 
103  /* FEP energies */
104  double energyVdw_s;
105  double energyElec_s;
106  double energySlow_s;
107 
108  /* TI energies */
109  double energyVdw_ti_1;
115 
116  double energyGBIS;
117 };
118 
120 private:
121 
// Bundles a raw pointer with a size so a buffer can be passed as a single
// argument (used in the sortTileLists parameter list). It only stores the two
// values; nothing here allocates or frees the pointed-to memory.
// NOTE(review): whether size counts elements or bytes is not visible here - confirm at the call sites.
122  template <typename T>
123  struct PtrSize {
124  PtrSize(T* ptr, size_t size) : ptr(ptr), size(size) {}
125  T* ptr;
126  size_t size;
127  };
128 
// Scalar state: device ID, status event, cutoff and counters.
129  const int deviceID; // const member; the constructor takes a deviceID argument
130 
131  // Events
132  cudaEvent_t tileListStatEvent;
133  bool tileListStatEventRecord; // presumably true while tileListStatEvent has been recorded and not yet consumed - TODO confirm
134 
135  // Pair list cutoff squared
136  float plcutoff2;
137 
138  // Number of patches
139  int numPatches;
140 
141  // Number of computes
142  int numComputes;
143 
144  // Number of tile lists
145  int numTileLists;
146 
147  // Number of tile lists for GBIS
148  int numTileListsGBIS;
149 
150  // Number of tiles
151  int numJtiles;
152 
153  // Maximum number of tiles per tile list
154  int maxTileListLen;
155 
// Patch/compute record buffers and streaming-related buffers. Each buffer
// pointer is paired with a size_t "<name>Size" member.
// NOTE(review): whether the Size members count elements or bytes, and which
// pointers are device vs host memory, is not visible in this header - confirm in the .cu file.
156  CudaPatchRecord* cudaPatches;
157  size_t cudaPatchesSize;
158 
159  CudaComputeRecord* cudaComputes;
160  size_t cudaComputesSize;
161 
162  // --- For Streaming ---
163  const bool doStreaming; // const member; the constructor takes a doStreaming argument
164  int* patchNumLists;
165  size_t patchNumListsSize;
166 
167  int* emptyPatches;
168  size_t emptyPatchesSize;
169  int* h_emptyPatches; // returned by getEmptyPatches(); presumably the host-side copy of emptyPatches - confirm
170  size_t h_emptyPatchesSize;
171  int numEmptyPatches;
172 
173  unsigned int* sortKeySrc;
174  size_t sortKeySrcSize;
175  unsigned int* sortKeyDst;
176  size_t sortKeyDstSize;
177 
178  int maxTileListLen_sortKeys;
179 
180  unsigned int* sortKeys;
181  size_t sortKeysSize;
182 
183  int2* minmaxListLen;
184  size_t minmaxListLenSize;
185 
186  int sortKeys_endbit;
187  // ---------------------
188 
189  // Single entry pinned host and device buffers for communicating tile list status
// Tile-list buffers and bookkeeping. Several buffers come in "1"/"2" pairs;
// the public getters return the "1" member when activeBuffer == 1 and the "2"
// member otherwise. Each pointer is paired with a size_t "<name>Size" member.
190  TileListStat* h_tileListStat;
191  TileListStat* d_tileListStat;
192 
193  // Atom coordinates and charge
194  float4* xyzq;
195  size_t xyzqSize;
196  // Atom coordinate storage size
197  int atomStorageSize;
198 
199  char *part; // NOTE(review): purpose not documented here; buildTileLists() takes an allocatePart flag - confirm
200  size_t partSize;
201 
202  // Tile lists
203  TileList* tileLists1;
204  size_t tileLists1Size;
205  TileList* tileLists2;
206  size_t tileLists2Size;
207  TileList* tileListsGBIS;
208  size_t tileListsGBISSize;
209 
210  // Patch pairs
211  PatchPairRecord* patchPairs1;
212  size_t patchPairs1Size;
213  PatchPairRecord* patchPairs2;
214  size_t patchPairs2Size;
215 
216  // j-atom start for tiles
217  int* tileJatomStart1;
218  size_t tileJatomStart1Size;
219  int* tileJatomStart2;
220  size_t tileJatomStart2Size;
221  int* tileJatomStartGBIS;
222  size_t tileJatomStartGBISSize;
223 
224  // Bounding boxes
225  BoundingBox* boundingBoxes;
226  size_t boundingBoxesSize;
227 
228  // Depth of each tile list
229  unsigned int* tileListDepth1;
230  size_t tileListDepth1Size;
231  unsigned int* tileListDepth2;
232  size_t tileListDepth2Size;
233 
234  // Tile list order
235  int* tileListOrder1;
236  size_t tileListOrder1Size;
237  int* tileListOrder2;
238  size_t tileListOrder2Size;
239 
240  // Position of each tile list = ExclusiveSum(tileListDepths)
241  int* tileListPos;
242  size_t tileListPosSize;
243 
244  // jtile occupancy and position
245  int* jtiles;
246  size_t jtilesSize;
247 
248  // Temporary buffers used in buildTileLists
249  int* tilePos;
250  size_t tilePosSize;
251 
252  // Exclusions
253  TileExcl* tileExcls1;
254  size_t tileExcls1Size;
255  TileExcl* tileExcls2;
256  size_t tileExcls2Size;
257 
258  // Temporary storage for CUB
259  char* tempStorage;
260  size_t tempStorageSize;
261 
262  // Number of exclusions detected
263  int numExcluded;
264 
265  // Virials and energies for tile lists
266  TileListVirialEnergy* tileListVirialEnergy;
267  size_t tileListVirialEnergySize;
268 
269  int tileListVirialEnergyLength;
270  int tileListVirialEnergyGBISLength;
271 
272  int activeBuffer; // selects between the "1" and "2" buffers in the getters (1 => "1", anything else => "2")
273 
// Stores activeBufferIn in activeBuffer. The buffer getters return the "1"
// member when activeBuffer == 1 and the "2" member for any other value.
274  void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}
275 
// Declaration only; the definition is not in this header.
// The parameter list has three control arguments (useJtiles, begin_bit,
// highDepthBitsSet), then a Source group and a matching Destination group
// (list/j-tile counts plus PtrSize-wrapped buffers), and finally the CUDA
// stream argument.
// NOTE(review): begin_bit presumably feeds a radix sort over the depth keys -
// confirm in the implementation.
276  void sortTileLists(
277  const bool useJtiles,
278  const int begin_bit, const bool highDepthBitsSet,
279  // Source
280  const int numTileListsSrc, const int numJtilesSrc,
281  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
282  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
283  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
284  // Destination
285  const int numTileListsDst, const int numJtilesDst,
286  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
287  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
288  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
289  cudaStream_t stream);
290 
// Declarations only; definitions are not in this header. Each takes a file name,
// an element count, a const pointer (d_ prefix) and a CUDA stream.
// NOTE(review): presumably debug helpers that dump device buffers to a file - confirm.
291  void writeTileList(const char* filename, const int numTileLists,
292  const TileList* d_tileLists, cudaStream_t stream);
293  void writeTileJatomStart(const char* filename, const int numJtiles,
294  const int* d_tileJatomStart, cudaStream_t stream);
295  // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
296  // const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);
297 
// Output-order buffer and its enable flag. getOutputOrder() returns outputOrder
// only when both doStreaming and doOutputOrder are true.
298  int* outputOrder;
299  size_t outputOrderSize;
300  bool doOutputOrder;
301 
302 public:
303 
// Constructor declaration; deviceID and doStreaming correspond to the const
// members of the same names.
// NOTE(review): listing line 305 is missing from this view (presumably the
// destructor declaration) - confirm upstream.
304  CudaTileListKernel(int deviceID, bool doStreaming);
306 
// Trivial accessors: each inline body returns the named member directly (raw
// pointers are returned as-is, with no copy and no synchronization in the getter).
307  int getNumEmptyPatches() {return numEmptyPatches;}
308  int* getEmptyPatches() {return h_emptyPatches;} // returns the h_ member, not emptyPatches
309 
310  int getNumExcluded() {return numExcluded;}
311 
312  float get_plcutoff2() {return plcutoff2;}
313  int getNumTileLists() {return numTileLists;}
314  int getNumTileListsGBIS() {return numTileListsGBIS;}
315  int getNumJtiles() {return numJtiles;}
316  BoundingBox* getBoundingBoxes() {return boundingBoxes;}
317  int* getJtiles() {return jtiles;}
318  float4* get_xyzq() {return xyzq;}
319  char* get_part() {return part;}
320 
321  TileListStat* getTileListStatDevPtr() {return d_tileListStat;}
// Declaration only; definition is not in this header.
322  void clearTileListStat(cudaStream_t stream);
323 
// Accessors for the paired "1"/"2" buffers: each returns the "1" member when
// activeBuffer == 1 and the "2" member for any other value.
324  int* getTileJatomStart() {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
// NOTE(review): listing line 325 (the signature of the accessor whose body
// follows) is missing from this view; the body selects tileLists1/tileLists2.
326  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
327  }
328  unsigned int* getTileListDepth() {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
329  int* getTileListOrder() {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
330  TileExcl* getTileExcls() {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
331  PatchPairRecord* getPatchPairs() {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
332 
// The GBIS buffers are single (not paired), so these ignore activeBuffer.
333  int* getTileJatomStartGBIS() {return tileJatomStartGBIS;}
334  TileList* getTileListsGBIS() {return tileListsGBIS;}
335 
336  TileListVirialEnergy* getTileListVirialEnergy() {return tileListVirialEnergy;}
337 
338  CudaPatchRecord* getCudaPatches() {return cudaPatches;}
339  int getCudaPatchesSize() {return cudaPatchesSize;} // cudaPatchesSize is size_t; the return implicitly narrows it to int
340 
// Public operations. All except the inline getters at the end are declarations
// only; their definitions are not in this header.
341  void prepareTileList(cudaStream_t stream);
342  void finishTileList(cudaStream_t stream);
343 
// Takes a count and a const pointer to compute records (h_ prefix) plus a stream.
344  void updateComputes(const int numComputesIn,
345  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);
346 
// Takes the atom storage size, the patch count, a const pointer to patch
// records (h_ prefix) and a stream.
347  void prepareBuffers(
348  int atomStorageSizeIn, int numPatchesIn,
349  const CudaPatchRecord* h_cudaPatches,
350  cudaStream_t stream);
351 
// Inputs: previous tile-list count, patch/atom sizes, maximum list length, three
// float3 lattice arguments (lata/latb/latc), patch records and xyzq coordinates
// (h_ prefix), the squared pair-list cutoff, a shared-memory limit, the stream,
// and four boolean flags.
// NOTE(review): the exact meaning of the boolean flags is not visible here - confirm in the implementation.
352  void buildTileLists(const int numTileListsPrev,
353  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
354  const float3 lata, const float3 latb, const float3 latc,
355  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
356  const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged,
357  const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration);
358 
359  void reSortTileLists(const bool doGBIS, cudaStream_t stream);
360  // void applyOutputOrder(cudaStream_t stream);
361 
362  void setTileListVirialEnergyLength(int len);
363  void setTileListVirialEnergyGBISLength(int len);
// Inline getters returning the stored lengths and counts unchanged.
364  int getTileListVirialEnergyLength() {return tileListVirialEnergyLength;}
365  int getTileListVirialEnergyGBISLength() {return tileListVirialEnergyGBISLength;}
366 
367  int getNumPatches() {return numPatches;}
368 
369  int getNumComputes() {return numComputes;}
// Returns outputOrder only when both doStreaming and doOutputOrder are true;
// returns NULL in every other case, so callers must handle a NULL result.
370  int* getOutputOrder() {
371  if (!doStreaming) return NULL; // no output order without streaming
372  if (doOutputOrder) {
373  return outputOrder;
374  } else {
375  return NULL;
376  }
377  }
378 
379 };
380 #endif // NAMD_CUDA
381 #endif // CUDATILELISTKERNEL_H
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
void setTileListVirialEnergyLength(int len)
PatchPairRecord * getPatchPairs()
void clearTileListStat(cudaStream_t stream)
void setTileListVirialEnergyGBISLength(int len)
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)
CudaPatchRecord * getCudaPatches()
unsigned int * getTileListDepth()
BoundingBox * getBoundingBoxes()
unsigned int excl[32]
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
TileList * getTileListsGBIS()
TileListStat * getTileListStatDevPtr()
void finishTileList(cudaStream_t stream)
float3 offsetXYZ
double virialSlow[9]
TileListVirialEnergy * getTileListVirialEnergy()
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged, const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration)
void reSortTileLists(const bool doGBIS, cudaStream_t stream)