NAMD
CudaTileListKernel.h
Go to the documentation of this file.
1 #ifndef CUDATILELISTKERNEL_H
2 #define CUDATILELISTKERNEL_H
3 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4 
5 // Exclusion mask: bit 1 = atom pair is included, 0 = atom pair is excluded
6 struct TileExcl {
8 };
9 
10 struct TileList {
13  int jtileEnd;
14  float3 offsetXYZ;
15  int2 patchInd; // Patch indices for this list
16  union {
17  int2 patchNumList; // Number of lists contributing to each patch
18  // int icompute;
19  };
20  int icompute;
21 };
22 
24  int iatomSize;
26  int jatomSize;
28 };
29 
30 //
31 // Bounding box structure
32 //
33 struct BoundingBox {
34  float x, y, z; // Center
35  float wx, wy, wz; // Half-width
36 };
37 
38 //
39 // Stripped-down CUDA version of compute record
40 //
42  int2 patchInd;
43  float3 offsetXYZ;
44 };
45 
46 //
47 // Stripped-down CUDA version of patch record
48 //
50  int numAtoms;
52  int atomStart;
53 };
54 
55 //
56 // Tile list status. Used to communicate tile list sizes between GPU and CPU
57 //
58 struct TileListStat {
61  int numJtiles;
66 };
67 
69  float shx, shy, shz;
70  float forcex, forcey, forcez;
72  double energyVdw;
73  double energyElec;
74  double energySlow;
75  double energyGBIS;
76 };
77 
78 struct VirialEnergy {
79  double virial[9];
80  double virialSlow[9];
81  double energyVdw;
82  double energyElec;
83  double energySlow;
84  double energyGBIS;
85 };
86 
88 private:
89 
90  template <typename T>
91  struct PtrSize {
92  PtrSize(T* ptr, int size) : ptr(ptr), size(size) {}
93  T* ptr;
94  int size;
95  };
96 
97  const int deviceID;
98 
99  // Events
100  cudaEvent_t tileListStatEvent;
101  bool tileListStatEventRecord;
102 
103  // Pair list cutoff squared
104  float plcutoff2;
105 
106  // Number of patches
107  int numPatches;
108 
109  // Number of computes
110  int numComputes;
111 
112  // Number of tile lists
113  int numTileLists;
114 
115  // Number of tile lists for GBIS
116  int numTileListsGBIS;
117 
118  // Number of tiles
119  int numJtiles;
120 
121  // Maximum number of tiles per tile list
122  int maxTileListLen;
123 
124  CudaPatchRecord* cudaPatches;
125  int cudaPatchesSize;
126 
127  CudaComputeRecord* cudaComputes;
128  int cudaComputesSize;
129 
130  // --- For Streaming ---
131  const bool doStreaming;
132  int* patchNumLists;
133  int patchNumListsSize;
134 
135  int* emptyPatches;
136  int emptyPatchesSize;
137  int* h_emptyPatches;
138  int h_emptyPatchesSize;
139  int numEmptyPatches;
140 
141  unsigned int* sortKeySrc;
142  int sortKeySrcSize;
143  unsigned int* sortKeyDst;
144  int sortKeyDstSize;
145 
146  int maxTileListLen_sortKeys;
147 
148  unsigned int* sortKeys;
149  int sortKeysSize;
150 
151  int2* minmaxListLen;
152  int minmaxListLenSize;
153 
154  int sortKeys_endbit;
155  // ---------------------
156 
157  // Single entry pinned host and device buffers for communicating tile list status
158  TileListStat* h_tileListStat;
159  TileListStat* d_tileListStat;
160 
161  // Atom coordinates and charge
162  float4* xyzq;
163  int xyzqSize;
164  // Atom coordinate storage size
165  int atomStorageSize;
166 
167  // Tile lists
168  TileList* tileLists1;
169  int tileLists1Size;
170  TileList* tileLists2;
171  int tileLists2Size;
172  TileList* tileListsGBIS;
173  int tileListsGBISSize;
174 
175  // Pair pairs
176  PatchPairRecord* patchPairs1;
177  int patchPairs1Size;
178  PatchPairRecord* patchPairs2;
179  int patchPairs2Size;
180 
181  // j-atom start for tiles
182  int* tileJatomStart1;
183  int tileJatomStart1Size;
184  int* tileJatomStart2;
185  int tileJatomStart2Size;
186  int* tileJatomStartGBIS;
187  int tileJatomStartGBISSize;
188 
189  // Bounding boxes
190  BoundingBox* boundingBoxes;
191  int boundingBoxesSize;
192 
193  // Depth of each tile list
194  unsigned int* tileListDepth1;
195  int tileListDepth1Size;
196  unsigned int* tileListDepth2;
197  int tileListDepth2Size;
198 
199  // Tile list order
200  int* tileListOrder1;
201  int tileListOrder1Size;
202  int* tileListOrder2;
203  int tileListOrder2Size;
204 
205  // Position of each tile list = ExclusiveSum(tileListDepths)
206  int* tileListPos;
207  int tileListPosSize;
208 
209  // jtile occupancy and position
210  int* jtiles;
211  int jtilesSize;
212 
213  // Temporary buffers used in buildTileLists
214  int* tilePos;
215  int tilePosSize;
216 
217  // Exclusions
218  TileExcl* tileExcls1;
219  int tileExcls1Size;
220  TileExcl* tileExcls2;
221  int tileExcls2Size;
222 
223  // Temporary storage for CUB
224  char* tempStorage;
225  int tempStorageSize;
226 
227  // Number of exclusions detected
228  int numExcluded;
229 
230  // Virials and energies for tile lists
231  TileListVirialEnergy* tileListVirialEnergy;
232  int tileListVirialEnergySize;
233 
234  int tileListVirialEnergyLength;
235  int tileListVirialEnergyGBISLength;
236 
237  int activeBuffer;
238 
239  void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}
240 
241  void sortTileLists(
242  const bool useJtiles,
243  const int begin_bit, const bool highDepthBitsSet,
244  // Source
245  const int numTileListsSrc, const int numJtilesSrc,
246  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
247  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
248  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
249  // Destination
250  const int numTileListsDst, const int numJtilesDst,
251  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
252  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
253  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
254  cudaStream_t stream);
255 
256  void writeTileList(const char* filename, const int numTileLists,
257  const TileList* d_tileLists, cudaStream_t stream);
258  void writeTileJatomStart(const char* filename, const int numJtiles,
259  const int* d_tileJatomStart, cudaStream_t stream);
260  // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
261  // const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);
262 
263  int* outputOrder;
264  int outputOrderSize;
265  bool doOutputOrder;
266 
267 public:
268 
269  CudaTileListKernel(int deviceID, bool doStreaming);
271 
272  int getNumEmptyPatches() {return numEmptyPatches;}
273  int* getEmptyPatches() {return h_emptyPatches;}
274 
275  int getNumExcluded() {return numExcluded;}
276 
277  float get_plcutoff2() {return plcutoff2;}
278  int getNumTileLists() {return numTileLists;}
279  int getNumTileListsGBIS() {return numTileListsGBIS;}
280  int getNumJtiles() {return numJtiles;}
281  BoundingBox* getBoundingBoxes() {return boundingBoxes;}
282  int* getJtiles() {return jtiles;}
283  float4* get_xyzq() {return xyzq;}
284 
285  TileListStat* getTileListStatDevPtr() {return d_tileListStat;}
286  void clearTileListStat(cudaStream_t stream);
287 
288  int* getTileJatomStart() {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
290  return ((activeBuffer == 1) ? tileLists1 : tileLists2);
291  }
292  unsigned int* getTileListDepth() {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
293  int* getTileListOrder() {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
294  TileExcl* getTileExcls() {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
295  PatchPairRecord* getPatchPairs() {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}
296 
297  int* getTileJatomStartGBIS() {return tileJatomStartGBIS;}
298  TileList* getTileListsGBIS() {return tileListsGBIS;}
299 
300  TileListVirialEnergy* getTileListVirialEnergy() {return tileListVirialEnergy;}
301 
302  CudaPatchRecord* getCudaPatches() {return cudaPatches;}
303 
304  void prepareTileList(cudaStream_t stream);
305  void finishTileList(cudaStream_t stream);
306 
307  void updateComputes(const int numComputesIn,
308  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);
309 
310  void buildTileLists(const int numTileListsPrev,
311  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
312  const float3 lata, const float3 latb, const float3 latc,
313  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
314  const size_t maxShmemPerBlock, cudaStream_t stream);
315 
316  void reSortTileLists(const bool doGBIS, cudaStream_t stream);
317  // void applyOutputOrder(cudaStream_t stream);
318 
319  void setTileListVirialEnergyLength(int len);
320  void setTileListVirialEnergyGBISLength(int len);
321  int getTileListVirialEnergyLength() {return tileListVirialEnergyLength;}
322  int getTileListVirialEnergyGBISLength() {return tileListVirialEnergyGBISLength;}
323 
324  int getNumPatches() {return numPatches;}
325 
326  int getNumComputes() {return numComputes;}
327  int* getOutputOrder() {
328  if (!doStreaming) return NULL;
329  if (doOutputOrder) {
330  return outputOrder;
331  } else {
332  return NULL;
333  }
334  }
335 
336 };
337 #endif // NAMD_CUDA
338 #endif // CUDATILELISTKERNEL_H
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
void setTileListVirialEnergyLength(int len)
numTileListsSrc
const int const int begin_bit
PatchPairRecord * getPatchPairs()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
void clearTileListStat(cudaStream_t stream)
void setTileListVirialEnergyGBISLength(int len)
CudaPatchRecord * getCudaPatches()
unsigned int * getTileListDepth()
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ tileListOrderSrc
#define WARPSIZE
Definition: CudaUtils.h:10
BoundingBox * getBoundingBoxes()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__thread cudaStream_t stream
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
TileList * getTileListsGBIS()
unsigned int WarpMask
Definition: CudaUtils.h:11
TileListStat * getTileListStatDevPtr()
const int numTileListsDst
void finishTileList(cudaStream_t stream)
float3 offsetXYZ
WarpMask excl[WARPSIZE]
double virialSlow[9]
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ tileListDepthDst
TileListVirialEnergy * getTileListVirialEnergy()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)
const int const int const int const keyT keyT *__restrict__ tileListDepthSrc
void reSortTileLists(const bool doGBIS, cudaStream_t stream)