NAMD
CudaTileListKernelHip.cu
1 // #include <map>
2 // #include <algorithm>
3 #if defined(NAMD_HIP) //NAMD_HIP
4 #include <hip/hip_runtime.h>
5 #include <hipcub/hipcub.hpp>
6 #define cub hipcub
7 #endif // NAMD_HIP
8 
9 #include "CudaUtils.h"
10 #include "CudaTileListKernel.hip.h"
11 #include "CudaComputeNonbondedKernel.hip.h"
12 #include "DeviceCUDA.h"
13 
14 #if defined(NAMD_HIP) //NAMD_HIP
15 
16 #ifdef WIN32
17 #define __thread __declspec(thread)
18 #endif
19 extern __thread DeviceCUDA *deviceCUDA;
20 
21 #define OVERALLOC 1.2f
22 #define DEFAULTKERNEL_NUM_THREAD 256
23 #define UPDATEPATCHESKERNEL_NUM_THREAD 256
24 #define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 256
25 #define BOUNDINGBOXKERNEL_NUM_WARP 4
26 #define __ldg * // on HIP, map __ldg(&x) to a plain dereference *(&x)
27 
28 void NAMD_die(const char *);
29 
30 //
31 // Calculate the number of lists that contribute to each patch
32 //
33 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
34  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
35 
36  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
37  {
38  int2 patchInd = tileLists[i].patchInd;
39  atomicAdd(&patchNumLists[patchInd.x], 1);
40  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
41  }
42 
43 }
44 
45 //
46 // Write patchNumList back to the tile lists and
47 // collect empty patches into emptyPatches[0 ... numEmptyPatches-1]
48 //
49 __global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
50  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
51  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
52 
53  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
54  {
55  int2 patchInd = tileLists[i].patchInd;
56  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
57  tileLists[i].patchNumList = patchNumList;
58  }
59 
60  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
61  {
62  if (patchNumLists[i] == 0) {
63  int ind = atomicAdd(numEmptyPatches, 1);
64  emptyPatches[ind] = i;
65  }
66  }
67 
68 }
69 
70 //
71 // Builds a sort key that removes zeros but keeps the order otherwise the same
72 //
73 __global__ void buildRemoveZerosSortKey(const int numTileLists,
74  const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
75 
76  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
77  {
78  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
79  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
80  }
81 
82 }
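// A stable zero-removal trick: each nonzero-depth list keeps its own index
// as its key, while zero-depth lists all receive the out-of-range key
// numTileLists, so an ascending radix sort compacts the nonzero entries to
// the front in their original order. A minimal host-side sketch of the same
// idea (hypothetical helper, requires <algorithm>, for illustration only):
#if 0
static void removeZerosStableRef(const unsigned int* depth, const int n, int* order) {
  for (int i = 0; i < n; i++) order[i] = i;
  // Key: own index for nonzero depth, out-of-range n for zero depth;
  // a stable ascending sort by key reproduces the GPU radix-sort result
  std::stable_sort(order, order + n, [&](int a, int b) {
    unsigned int ka = (depth[a] == 0) ? (unsigned int)n : (unsigned int)a;
    unsigned int kb = (depth[b] == 0) ? (unsigned int)n : (unsigned int)b;
    return ka < kb;
  });
}
#endif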
83 
84 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
85  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
86  const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
87 
88  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
89  {
90  int icompute = tileLists[itileList].icompute;
91  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
92  int i = icompute*maxTileListLen + (depth - 1);
93  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
94  }
95 
96 }
97 
98 template <int width>
99 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
100  unsigned int* __restrict__ keys, int* __restrict__ vals) {
101 
102  // NOTE: blockDim.x = width
103 
104  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
105  {
106  int i = base + threadIdx.x;
107  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
108  __shared__ typename BlockRadixSort::TempStorage tempStorage;
109  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
110  int val[1] = {(i < n) ? vals[i] : 0};
111  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
112  if (i < n) {
113  keys[i] = key[0];
114  vals[i] = val[0];
115  }
116  BLOCK_SYNC;
117  }
118 
119 }
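// Each pass of the loop above sorts one block-sized chunk of (key, value)
// pairs in descending key order, one item per thread via cub::BlockRadixSort.
// For reference, this is how sortTileLists() launches it later in this file
// (WARPSIZE-wide blocks):
#if 0
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/WARPSIZE+1);
  localSort<WARPSIZE> <<< nblock, WARPSIZE, 0, stream >>> (
    numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
  cudaCheck(cudaGetLastError());
#endif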
120 
121 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
122  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
123  const int* __restrict__ tileListOrderSrc,
124  const unsigned int* __restrict__ tileListDepthSrc,
125  int* __restrict__ tileListOrderDst,
126  unsigned int* __restrict__ tileListDepthDst) {
127 
128  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
129  {
130  int j = outputOrder[numTileListsSrc - i - 1];
131  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
132  int k = tileListPos[i];
133  tileListDepthDst[k] = tileListDepthSrc[j];
134  tileListOrderDst[k] = j; //tileListOrderSrc[j];
135  }
136  }
137 }
138 
139 //
140 // Converts tileListDepth into a per-list 0/1 occupancy flag (1 if the 16-bit depth at begin_bit is nonzero), stored in reverse outputOrder
141 //
142 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
143  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
144  unsigned int* __restrict__ tileListDepthDst) {
145 
146  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
147  {
148  int j = outputOrder[numTileLists - i - 1];
149  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
150  }
151 
152 }
153 
154 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
155  int2* __restrict__ minmaxListLen) {
156 
157  int2 val;
158  val.x = maxTileListLen+1;
159  val.y = 0;
160  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
161  {
162  minmaxListLen[i] = val;
163  }
164 
165 }
166 
167 //
168 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
169 //
170 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
171  const TileList* __restrict__ tileListsSrc,
172  const int* __restrict__ tileListOrderDst,
173  const unsigned int* __restrict__ tileListDepthDst,
174  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
175 
176  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
177  {
178  int k = tileListOrderDst[i];
179  int icompute = tileListsSrc[k].icompute;
180  int depth = tileListDepthDst[i] & 65535;
181  // depth is in range [1 ... maxTileListLen]
182  int j = icompute*maxTileListLen + (depth-1);
183  sortKeys[j] = i;
184  int2 minmax = minmaxListLen[icompute];
185  int2 minmaxOrig = minmax;
186  if (minmax.x > depth) minmax.x = depth;
187  if (minmax.y < depth) minmax.y = depth;
188  if (minmax.x != minmaxOrig.x) {
189  atomicMin(&minmaxListLen[icompute].x, minmax.x);
190  }
191  if (minmax.y != minmaxOrig.y) {
192  atomicMax(&minmaxListLen[icompute].y, minmax.y);
193  }
194  }
195 
196 }
197 
198 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
199  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
200 
201  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
202  const int wid = threadIdx.x % WARPSIZE;
203  int2 minmax = minmaxListLen[i];
204  int minlen = minmax.x;
205  int maxlen = minmax.y;
206  // minlen, maxlen are in range [1 ... maxTileListLen]
207 // provided i appears as tileListsSrc[].icompute above
208  if ( maxlen < minlen ) {
209  minlen = 1;
210  maxlen = maxTileListLen;
211  }
212  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
213  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
214  unsigned int aveKey = (maxKey + minKey)/2;
215  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
216  sortKeys[i*maxTileListLen + j] = minKey;
217  }
218  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
219  sortKeys[i*maxTileListLen + j] = maxKey;
220  }
221  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
222  if (sortKeys[i*maxTileListLen + j] == 0) {
223  sortKeys[i*maxTileListLen + j] = aveKey;
224  }
225  }
226  }
227 
228 }
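// sortKeys[] acts as a numComputes x maxTileListLen lookup table: entry
// (icompute, depth-1) holds the position that a list of that depth received
// in the previous sorted order. buildSortKeys writes the observed entries
// and fillSortKeys patches the holes, as in this serial sketch of one table
// row (hypothetical helper, for illustration only):
#if 0
static void fillSortKeysRowRef(unsigned int* row, const int maxTileListLen,
    int minlen, int maxlen) {
  if (maxlen < minlen) { minlen = 1; maxlen = maxTileListLen; } // no lists observed
  unsigned int minKey = row[minlen-1];
  unsigned int maxKey = row[maxlen-1];
  unsigned int aveKey = (maxKey + minKey)/2;
  for (int j = 0; j < minlen-1; j++) row[j] = minKey;            // below observed range
  for (int j = maxlen; j < maxTileListLen; j++) row[j] = maxKey; // above observed range
  for (int j = 0; j < maxTileListLen; j++)
    if (row[j] == 0) row[j] = aveKey;                            // interior holes
}
#endif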
229 
230 //
231 // Calculate bounding boxes for sets of BOUNDINGBOXSIZE atoms
232 //
233 
234 __global__ void
235 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
236  BoundingBox* __restrict__ boundingBoxes) {
237 
238  const int warpId = threadIdx.x / BOUNDINGBOXSIZE;
239  const int wid = threadIdx.x % BOUNDINGBOXSIZE;
240 
241  // Loop with warp-aligned index to avoid warp-divergence
242  for (int iwarp = warpId*BOUNDINGBOXSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
243  // Full atom index
244  const int i = iwarp + wid;
245  // Bounding box index
246  const int ibb = i/BOUNDINGBOXSIZE;
247 
248  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
249 
250  volatile float3 minxyz, maxxyz;
251 
252  typedef cub::BlockReduce<float, BOUNDINGBOXSIZE> BlockReduce;
253  __shared__ typename BlockReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
254  minxyz.x = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
255  minxyz.y = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
256  minxyz.z = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
257  maxxyz.x = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
258  maxxyz.y = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
259  maxxyz.z = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
260 
261  if (wid == 0) {
262  BoundingBox boundingBox;
263  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
264  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
265  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
266  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
267  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
268  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
269  boundingBoxes[ibb] = boundingBox;
270  }
271  }
272 
273 }
274 
275 //
276 // Returns the lower estimate for the distance between two bounding boxes
277 //
278 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
279  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
280  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
281  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
282  float r2 = dx*dx + dy*dy + dz*dz;
283  return r2;
284 }
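// Worked example: a BoundingBox stores a center (x, y, z) plus half-widths
// (wx, wy, wz), so fabsf(a.x - b.x) - a.wx - b.wx is the gap between the two
// boxes along x, clamped to zero. Assuming that field order in the aggregate
// initializers below, two unit cubes whose centers are 3 apart in x have a
// gap of 3 - 0.5 - 0.5 = 2, so distsq returns 4, a lower bound on the squared
// distance between any atom in a and any atom in b:
#if 0
  BoundingBox a = {0.0f, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f};
  BoundingBox b = {3.0f, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f};
  float r2 = distsq(a, b); // == 4.0f
#endif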
285 
286 #if 0
287 //
288 // Performs warp-level exclusive sum on a shared memory array:
289 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
290 // sh_in and sh_out can point to same array
291 // Returns the total sum
292 //
293 template <typename T>
294 __device__ __forceinline__
295 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
296  const int wid = threadIdx.x % WARPSIZE;
297  volatile int blockOffset = 0;
298  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
299  // Size of the exclusive sum
300  int blockLen = min(WARPSIZE, n-iblock);
301  // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
302  typedef cub::WarpScan<int> WarpScan;
303  __shared__ typename WarpScan::TempStorage tempStorage;
304  int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
305  WarpScan(tempStorage).ExclusiveSum(data, data);
306  // Shift by block offset
307  data += blockOffset;
308  // Save last value
309  int last = (int)sh_in[iblock + blockLen-1];
310  // Write output
311  if (wid < blockLen) sh_out[iblock + wid] = data;
312  // block offset = last term of the exclusive sum + last value
313  blockOffset = sh_out[iblock + blockLen-1] + last;
314  }
315  return blockOffset;
316 }
317 #endif
318 
319 #define TILELISTKERNELNEW_NUM_WARP 4
320 
321 //
322 // NOTE: Executed on a single thread block
323 //
324 template<int nthread>
325 __global__ void calcTileListPosKernel(const int numComputes,
326  const CudaComputeRecord* __restrict__ computes,
327  const CudaPatchRecord* __restrict__ patches,
328  int* __restrict__ tilePos) {
329 
330  typedef cub::BlockScan<int, nthread> BlockScan;
331 
332  __shared__ typename BlockScan::TempStorage tempStorage;
333  __shared__ int shTilePos0;
334 
335  if (threadIdx.x == nthread-1) {
336  shTilePos0 = 0;
337  }
338 
339  for (int base=0;base < numComputes;base+=nthread) {
340  int k = base + threadIdx.x;
341 
342  int numTiles1 = (k < numComputes) ?
343  (CudaComputeNonbondedKernel::computeNumTiles(patches[computes[k].patchInd.x].numAtoms, BOUNDINGBOXSIZE)) : 0;
344 
345  // Calculate positions in tile list and jtile list
346  int tilePosVal;
347  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
348 
349  // Store into global memory
350  if (k < numComputes) {
351  tilePos[k] = shTilePos0 + tilePosVal;
352  }
353 
354  BLOCK_SYNC;
355  // Store block end position
356  if (threadIdx.x == nthread-1) {
357  shTilePos0 += tilePosVal + numTiles1;
358  }
359  }
360 }
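// tilePos[] ends up holding the exclusive prefix sum of tiles per compute:
// with per-compute tile counts {3, 1, 2}, tilePos = {0, 3, 4}, and compute k's
// tile lists occupy [tilePos[k], tilePos[k] + numTiles[k]). A serial reference
// of the scan above (hypothetical helper, for illustration only):
#if 0
static void calcTileListPosRef(const int* numTiles, const int numComputes, int* tilePos) {
  int sum = 0;
  for (int k = 0; k < numComputes; k++) {
    tilePos[k] = sum; // exclusive scan: total before this compute
    sum += numTiles[k];
  }
}
#endif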
361 
362 
363 template<int nthread>
364 __global__ void updatePatchesKernel(const int numComputes,
365  const int* __restrict__ tilePos,
366  const CudaComputeRecord* __restrict__ computes,
367  const CudaPatchRecord* __restrict__ patches,
368  TileList* __restrict__ tileLists) {
369 
370  const int tid = threadIdx.x % nthread;
371 
372 // nthread threads take care of one compute
373  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
374  {
375  CudaComputeRecord compute = computes[k];
376  float3 offsetXYZ = compute.offsetXYZ;
377  int2 patchInd = compute.patchInd;
378  int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(patches[patchInd.x].numAtoms, BOUNDINGBOXSIZE);
379  int itileList0 = tilePos[k];
380  for (int i=tid;i < numTiles1;i+=nthread) {
381  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
382  tileLists[itileList0 + i].patchInd = patchInd;
383  tileLists[itileList0 + i].icompute = k;
384  }
385  }
386 
387 }
388 
389 __host__ __device__ __forceinline__
390 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
391  // Size in bytes
392  int size = (
393  maxTileListLen*sizeof(char)
394  );
395  return size;
396 }
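// The dynamic shared memory request scales with the per-thread j-tile scratch:
// one byte (one sh_tile[] flag) per possible j tile. The launch site later in
// this file sizes and checks it as follows:
#if 0
  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
  if (shmem_size > maxShmemPerBlock) {
    NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
  }
#endif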
397 
398 __global__ void
399 buildTileListsBBKernel(const int numTileLists,
400  TileList* __restrict__ tileLists,
401  const CudaPatchRecord* __restrict__ patches,
402  const int* __restrict__ tileListPos,
403  const float3 lata, const float3 latb, const float3 latc,
404  const float cutoff2, const int maxTileListLen,
405  const BoundingBox* __restrict__ boundingBoxes,
406  int* __restrict__ tileJatomStart,
407  const int tileJatomStartSize,
408  unsigned int* __restrict__ tileListDepth,
409  int* __restrict__ tileListOrder,
410  PatchPairRecord* __restrict__ patchPairs,
411  TileListStat* __restrict__ tileListStat
412 ) {
413  HIP_DYNAMIC_SHARED( char, sh_buffer)
414  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
415  int pos = threadIdx.x*sizePerThread;
416  volatile char* sh_tile = (char*)&sh_buffer[pos];
417 
418  // Loop with warp-aligned index to avoid warp-divergence
419  for (int iwarp = (threadIdx.x/BOUNDINGBOXSIZE)*BOUNDINGBOXSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
420 
421  // Use one thread per tile list
422  const int wid = threadIdx.x % BOUNDINGBOXSIZE;
423  const int itileList = iwarp + wid;
424 
425  int i;
426  int itileListLen = 0;
427  CudaPatchRecord patch1;
428  CudaPatchRecord patch2;
429  float3 offsetXYZ;
430  int2 patchInd;
431  int numTiles2;
432  int icompute;
433 
434  if (itileList < numTileLists) {
435  offsetXYZ = tileLists[itileList].offsetXYZ;
436  patchInd = tileLists[itileList].patchInd;
437  icompute = tileLists[itileList].icompute;
438  // Get i-column
439  i = itileList - tileListPos[icompute];
440 
441  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
442  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
443  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
444 
445  // DH - set zeroShift flag if magnitude of shift vector is zero
446  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
447 
448  // Load patches
449  patch1 = patches[patchInd.x];
450  patch2 = patches[patchInd.y];
451  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
452  numTiles2 = CudaComputeNonbondedKernel::computeNumTiles(patch2.numAtoms, BOUNDINGBOXSIZE);
453  int tileStart1 = patch1.atomStart/BOUNDINGBOXSIZE;
454  int tileStart2 = patch2.atomStart/BOUNDINGBOXSIZE;
455 
456  // DH - self requires that zeroShift is also set
457  bool self = zeroShift && (tileStart1 == tileStart2);
458 
459  // Load i-atom data (and shift coordinates)
460  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
461  boundingBoxI.x += shx;
462  boundingBoxI.y += shy;
463  boundingBoxI.z += shz;
464 
465  for (int j=0;j < numTiles2;j++) {
466  sh_tile[j] = 0;
467  if (!self || j >= i) {
468  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
469  float r2bb = distsq(boundingBoxI, boundingBoxJ);
470  if (r2bb < cutoff2) {
471  sh_tile[j] = 1;
472  itileListLen++;
473  }
474  }
475  }
476 
477  tileListDepth[itileList] = (unsigned int)itileListLen;
478  tileListOrder[itileList] = itileList;
479  }
480 
481  typedef cub::BlockScan<int, BOUNDINGBOXSIZE> BlockScan;
482  __shared__ typename BlockScan::TempStorage tempStorage;
483  int active = (itileListLen > 0);
484  int activePos;
485 BlockScan(tempStorage).ExclusiveSum(active, activePos); BLOCK_SYNC; // barrier before tempStorage is reused
486  int itileListPos;
487  BlockScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
488  int jtileStart, numJtiles;
489  // Last thread in the warp knows the total number
490  if (wid == BOUNDINGBOXSIZE-1) {
491  atomicAdd(&tileListStat->numTileLists, activePos + active);
492  numJtiles = itileListPos + itileListLen;
493  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
494  }
495  numJtiles = cub::ShuffleIndex<BOUNDINGBOXSIZE>(numJtiles, BOUNDINGBOXSIZE-1, WARP_FULL_MASK);
496  jtileStart = cub::ShuffleIndex<BOUNDINGBOXSIZE>(jtileStart, BOUNDINGBOXSIZE-1, WARP_FULL_MASK);
497  if (jtileStart + numJtiles > tileJatomStartSize) {
498  // tileJatomStart out of memory, exit
499  if (wid == 0) tileListStat->tilesSizeExceeded = true;
500  return;
501  }
502 
503  int jStart = itileListPos;
504  int jEnd = cub::ShuffleDown<BOUNDINGBOXSIZE>(itileListPos, 1, BOUNDINGBOXSIZE-1, WARP_FULL_MASK);
505  if (wid == BOUNDINGBOXSIZE-1) jEnd = numJtiles;
506 
507  if (itileListLen > 0) {
508  // Setup tileLists[]
509  //TileList TLtmp;
510  tileLists[itileList].iatomStart = patch1.atomStart + i*BOUNDINGBOXSIZE;
511  tileLists[itileList].jtileStart = jtileStart + jStart;
512  tileLists[itileList].jtileEnd = jtileStart + jEnd - 1;
513  tileLists[itileList].patchInd = patchInd;
514  tileLists[itileList].offsetXYZ = offsetXYZ;
515  tileLists[itileList].icompute = icompute;
516  // TLtmp.patchNumList.x = 0;
517  // TLtmp.patchNumList.y = 0;
518  // tileLists[itileList] = TLtmp;
519  // PatchPair
520  PatchPairRecord patchPair;
521  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
522  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
523  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
524  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
525  patchPairs[itileList] = patchPair;
526 
527  // Write tiles
528  int jtile = jtileStart + jStart;
529  for (int j=0;j < numTiles2;j++) {
530  if (sh_tile[j]) {
531  tileJatomStart[jtile] = patch2.atomStart + j*BOUNDINGBOXSIZE;
532  jtile++;
533  }
534  }
535 
536  }
537 
538  }
539 
540 }
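// Serial reference for the per-list tile selection performed above: for tile
// list i of a (patch1, patch2) compute, mark every j tile of patch2 whose
// bounding box lies within the pairlist cutoff, skipping j < i for self
// interactions (hypothetical helper, for illustration only):
#if 0
static __device__ int selectJTilesRef(const BoundingBox boxI, const BoundingBox* boxJ,
    const int numTiles2, const bool self, const int i, const float cutoff2, char* sh_tile) {
  int depth = 0;
  for (int j = 0; j < numTiles2; j++) {
    sh_tile[j] = 0;
    if ((!self || j >= i) && distsq(boxI, boxJ[j]) < cutoff2) {
      sh_tile[j] = 1; // tile j survives
      depth++;
    }
  }
  return depth; // becomes tileListDepth[itileList]
}
#endif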
541 
542 #define REPACKTILELISTSKERNEL_NUM_WARP 1
543 __global__ void
544 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
545  const int* __restrict__ tileListOrder,
546  const int* __restrict__ jtiles,
547  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
548  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
549  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
550  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
551 
552  const int wid = threadIdx.x % BOUNDINGBOXSIZE;
553 
554  // One warp does one tile list
555  for (int i = threadIdx.x/BOUNDINGBOXSIZE + blockDim.x/BOUNDINGBOXSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/BOUNDINGBOXSIZE*gridDim.x)
556  {
557  int j = tileListOrder[i];
558  int start = tileListPos[i];
559  int end = tileListPos[i+1]-1;
560  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
561  // TileList
562  int startOld = __ldg(&tileListsSrc[j].jtileStart);
563  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
564  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
565  float3 offsetXYZ;
566  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
567  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
568  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
569  int2 patchInd = tileListsSrc[j].patchInd;
570  int icompute = __ldg(&tileListsSrc[j].icompute);
571  if (wid == 0) {
572  // TileList tileList;
573  tileListsDst[i].iatomStart = iatomStart;
574  tileListsDst[i].offsetXYZ = offsetXYZ;
575  tileListsDst[i].jtileStart = start;
576  tileListsDst[i].jtileEnd = end;
577  tileListsDst[i].patchInd = patchInd;
578  tileListsDst[i].icompute = icompute;
579  //tileListsDst[i] = tileList;
580  }
581 
582  if (jtiles == NULL) {
583  // No jtiles, simple copy will do
584  int jtile = start;
585  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=BOUNDINGBOXSIZE,jtile+=BOUNDINGBOXSIZE) {
586  if (jtileOld + wid <= endOld) {
587 // copy the surviving j-tile starts verbatim
588  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
589  }
590  }
591  if (tileExclsSrc != NULL) {
592  int jtile = start;
593  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
594  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
595  }
596  }
597  } else {
598  int jtile0 = start;
599  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=BOUNDINGBOXSIZE) {
600  int t = jtileOld + wid;
601  int jtile = (t <= endOld) ? jtiles[t] : 0;
602  jtile >>= begin_bit;
603  jtile &= 65535;
604  typedef cub::BlockScan<int, BOUNDINGBOXSIZE> BlockScan;
605  __shared__ typename BlockScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
606  int warpId = threadIdx.x / BOUNDINGBOXSIZE;
607  int jtilePos;
608  BlockScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
609 
610  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
611 
612  WarpMask b = NAMD_WARP_BALLOT(WARP_FULL_MASK, jtile);
613  if (tileExclsSrc != NULL) {
614  while (b != 0) {
615  // k = index of thread that has data
616 #if BOUNDINGBOXSIZE == 64
617  int k = __ffsll(b) - 1;
618 #else
619  int k = __ffs(b) - 1;
620 #endif
621  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
622  // remove 1 bit and advance jtile0
623  b ^= ((WarpMask)1 << k);
624  jtile0++;
625  }
626  } else {
627 #if BOUNDINGBOXSIZE == 64
628  jtile0 += __popcll(b);
629 #else
630  jtile0 += __popc(b);
631 #endif
632  }
633  }
634  }
635  }
636 
637 }
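// The ballot/popcount idiom above compacts per-lane survivor flags: each set
// bit of b marks a lane whose j tile survived, __ffs/__ffsll locates the next
// survivor, and jtile0 advances by the population count when no exclusions
// need copying. A single-wavefront sketch of the same compaction (hypothetical
// device helper, for illustration only):
#if 0
__device__ int compactSurvivors(WarpMask b, int outPos, int* out, const int laneValue) {
  const int lane = threadIdx.x % BOUNDINGBOXSIZE;
  while (b != 0) {
#if BOUNDINGBOXSIZE == 64
    int k = __ffsll(b) - 1; // lowest surviving lane
#else
    int k = __ffs(b) - 1;
#endif
    if (lane == k) out[outPos] = laneValue; // that lane writes its payload
    b ^= ((WarpMask)1 << k); // clear the bit and advance the output cursor
    outPos++;
  }
  return outPos;
}
#endif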
638 
639 //
640 // NOTE: Executed on a single thread block
641 // oobKey = out-of-bounds key value
642 //
643 #define SORTTILELISTSKERNEL_NUM_THREAD 256
644 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 15
645 
646 template <typename keyT, typename valT, bool ascend>
647 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
648 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
649  const int begin_bit, const int end_bit, const keyT oobKey,
650  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
651  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
652 
653  typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
654  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
655 
656  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
657  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
658 
659  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
660  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;
661 
662  __shared__ union {
663  typename BlockLoad::TempStorage load;
664  typename BlockLoadU::TempStorage loadU;
665  typename BlockRadixSort::TempStorage sort;
666  } tempStorage;
667 
668  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
669  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
670 
671  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
672  BLOCK_SYNC;
673  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
674  BLOCK_SYNC;
675 
676  if (ascend)
677  BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
678  else
679  BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
680 
681  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
682  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
683 }
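// This kernel sorts everything within one thread block (BlockLoad ->
// BlockRadixSort -> striped store), so callers use it only when
// numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD * SORTTILELISTSKERNEL_ITEMS_PER_THREAD
// (256 * 15 = 3840 lists). Launched by sortTileLists() below as:
#if 0
  sortTileListsKernel <unsigned int, int, true> <<< 1, SORTTILELISTSKERNEL_NUM_THREAD, 0, stream >>>
    (numTileListsSrc, numTileListsDst, begin_bit, end_bit, oobKey, sortKeySrc, sortKeyDst,
     tileListOrderSrc.ptr, tileListOrderDst.ptr);
  cudaCheck(cudaGetLastError());
#endif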
684 
685 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
686  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
687 
688  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
689  {
690  int j = tileListOrder[i];
691  tileListDepthDst[i] = tileListDepthSrc[j];
692  }
693 
694 }
695 
696 //
697 // Bit shift tileListDepth so that only lower 16 bits are used
698 //
699 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
700  unsigned int* __restrict__ tileListDepth) {
701 
702  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
703  {
704  unsigned int a = tileListDepth[itileList];
705  a >>= begin_bit;
706  a &= 65535;
707  tileListDepth[itileList] = a;
708  }
709 
710 }
711 
712 // ##############################################################################################
713 // ##############################################################################################
714 // ##############################################################################################
715 
716 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
717 deviceID(deviceID), doStreaming(doStreaming) {
718 
719  cudaCheck(cudaSetDevice(deviceID));
720 
721  activeBuffer = 1;
722 
723  numPatches = 0;
724  numComputes = 0;
725 
726  cudaPatches = NULL;
727  cudaPatchesSize = 0;
728 
729  cudaComputes = NULL;
730  cudaComputesSize = 0;
731 
732  patchNumLists = NULL;
733  patchNumListsSize = 0;
734 
735  emptyPatches = NULL;
736  emptyPatchesSize = 0;
737  h_emptyPatches = NULL;
738  h_emptyPatchesSize = 0;
739  numEmptyPatches = 0;
740 
741  sortKeySrc = NULL;
742  sortKeySrcSize = 0;
743  sortKeyDst = NULL;
744  sortKeyDstSize = 0;
745 
746  tileLists1 = NULL;
747  tileLists1Size = 0;
748  tileLists2 = NULL;
749  tileLists2Size = 0;
750 
751  patchPairs1 = NULL;
752  patchPairs1Size = 0;
753  patchPairs2 = NULL;
754  patchPairs2Size = 0;
755 
756  tileJatomStart1 = NULL;
757  tileJatomStart1Size = 0;
758  tileJatomStart2 = NULL;
759  tileJatomStart2Size = 0;
760 
761  boundingBoxes = NULL;
762  boundingBoxesSize = 0;
763 
764  tileListDepth1 = NULL;
765  tileListDepth1Size = 0;
766  tileListDepth2 = NULL;
767  tileListDepth2Size = 0;
768 
769  tileListOrder1 = NULL;
770  tileListOrder1Size = 0;
771  tileListOrder2 = NULL;
772  tileListOrder2Size = 0;
773 
774  tileExcls1 = NULL;
775  tileExcls1Size = 0;
776  tileExcls2 = NULL;
777  tileExcls2Size = 0;
778 
779  xyzq = NULL;
780  xyzqSize = 0;
781  part = NULL;
782  partSize = 0;
783 
784  allocate_device<TileListStat>(&d_tileListStat, 1);
785  allocate_host<TileListStat>(&h_tileListStat, 1);
786 
787  tileListPos = NULL;
788  tileListPosSize = 0;
789  tempStorage = NULL;
790  tempStorageSize = 0;
791 
792  jtiles = NULL;
793  jtilesSize = 0;
794 
795  tilePos = NULL;
796  tilePosSize = 0;
797 
798  tileListsGBIS = NULL;
799  tileListsGBISSize = 0;
800 
801  tileJatomStartGBIS = NULL;
802  tileJatomStartGBISSize = 0;
803 
804  tileListVirialEnergy = NULL;
805  tileListVirialEnergySize = 0;
806 
807  atomStorageSize = 0;
808  numTileLists = 0;
809  numTileListsGBIS = 0;
810  numJtiles = 1;
811 
812  outputOrder = NULL;
813  outputOrderSize = 0;
814  doOutputOrder = false;
815 
816  minmaxListLen = NULL;
817  minmaxListLenSize = 0;
818 
819  sortKeys = NULL;
820  sortKeysSize = 0;
821  sortKeys_endbit = 0;
822 
823  cudaCheck(cudaEventCreate(&tileListStatEvent));
824  tileListStatEventRecord = false;
825 }
826 
827 CudaTileListKernel::~CudaTileListKernel() {
828  cudaCheck(cudaSetDevice(deviceID));
829  deallocate_device<TileListStat>(&d_tileListStat);
830  deallocate_host<TileListStat>(&h_tileListStat);
831  //
832  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
833  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
834  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
835  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
836  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
837  //
838  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
839  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
840  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
841  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
842  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
843  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
844  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
845  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
846  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
847  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
848  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
849  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
850  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
851  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
852  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
853  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
854  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
855  if (jtiles != NULL) deallocate_device<int>(&jtiles);
856  if (tilePos != NULL) deallocate_device<int>(&tilePos);
857 
858  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
859  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
860 
861  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
862 
863  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
864  if (part != NULL) deallocate_device<char>(&part);
865 
866  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
867  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
868 
869  cudaCheck(cudaEventDestroy(tileListStatEvent));
870 }
871 
872 void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
873  clear_device_array<int>(jtiles, numJtiles, stream);
874 }
875 
876 void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
877 // Clear tileListStat; patchReadyQueueCount is set equal to the number of empty patches
878  memset(h_tileListStat, 0, sizeof(TileListStat));
879  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
880  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
881 }
882 
883 void CudaTileListKernel::finishTileList(cudaStream_t stream) {
884  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
885  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
886  tileListStatEventRecord = true;
887 }
888 
889 void CudaTileListKernel::updateComputes(const int numComputesIn,
890  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
891 
892  numComputes = numComputesIn;
893 
894  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
895  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
896 
897  if (doStreaming) doOutputOrder = true;
898 }
899 
900 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
901  const TileList* d_tileLists, cudaStream_t stream) {
902 
903  TileList* h_tileLists = new TileList[numTileLists];
904  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
905  cudaCheck(cudaStreamSynchronize(stream));
906  FILE* handle = fopen(filename,"wt");
907  for (int itileList=0;itileList < numTileLists;itileList++) {
908  TileList tmp = h_tileLists[itileList];
909  fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
910  tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
911  tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
912  }
913  fclose(handle);
914  delete [] h_tileLists;
915 }
916 
917 void CudaTileListKernel::writeTileList(FILE* handle, const int numTileLists,
918  const TileList* d_tileLists, cudaStream_t stream) {
919 
920  TileList* h_tileLists = new TileList[numTileLists];
921  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
922  cudaCheck(cudaStreamSynchronize(stream));
923  for (int itileList=0;itileList < numTileLists;itileList++) {
924  TileList tmp = h_tileLists[itileList];
925  fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
926  tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
927  tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
928  }
929  delete [] h_tileLists;
930 }
931 
932 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
933  const int* d_tileJatomStart, cudaStream_t stream) {
934 
935  int* h_tileJatomStart = new int[numJtiles];
936  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
937  cudaCheck(cudaStreamSynchronize(stream));
938  FILE* handle = fopen(filename,"wt");
939  for (int i=0;i < numJtiles;i++) {
940  fprintf(handle, "%d\n", h_tileJatomStart[i]);
941  }
942  fclose(handle);
943  delete [] h_tileJatomStart;
944 }
945 
946 void CudaTileListKernel::writeTileJatomStart(FILE* handle, const int numJtiles,
947  const int* d_tileJatomStart, cudaStream_t stream) {
948 
949  int* h_tileJatomStart = new int[numJtiles];
950  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
951  cudaCheck(cudaStreamSynchronize(stream));
952  for (int i=0;i < numJtiles;i++) {
953  fprintf(handle, "%d\n", h_tileJatomStart[i]);
954  }
955  delete [] h_tileJatomStart;
956 }
957 
958 void CudaTileListKernel::writeTileExcls(FILE* handle, const int numJtiles,
959  const TileExcl* d_tileExcl, cudaStream_t stream){
960  TileExcl* h_tileExcl = new TileExcl[numJtiles];
961  copy_DtoH<TileExcl>(d_tileExcl, h_tileExcl, numJtiles, stream);
962  cudaCheck(cudaStreamSynchronize(stream));
963  for (int i=0;i < numJtiles;i++) {
964  for(int j = 0; j < WARPSIZE; j++) fprintf(handle, " %u ", h_tileExcl[i].excl[j]);
965  fprintf(handle, "\n");
966  }
967  delete [] h_tileExcl;
968 }
969 
970 /*
971 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
972 {
973  return std::pair<int, int>(p.second, p.first);
974 }
975 
976 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
977  const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
978 
979  const int shCacheSize = 10;
980  TileList* h_tileLists = new TileList[numTileLists];
981  int* h_tileJatomStart = new int[numJtiles];
982  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
983  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
984  cudaCheck(cudaStreamSynchronize(stream));
985  int ntotal = 0;
986  int ncache = 0;
987  for (int i=0;i < numTileLists;i+=width) {
988  int jend = min(i + width, numTileLists);
989  std::map<int, int> atomStartMap;
990  std::map<int, int>::iterator it;
991  atomStartMap.clear();
992  for (int j=i;j < jend;j++) {
993  TileList tmp = h_tileLists[j];
994  int iatomStart = tmp.iatomStart;
995  it = atomStartMap.find(iatomStart);
996  if (it == atomStartMap.end()) {
997  // Insert new
998  atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
999  } else {
1000 // Decrement counter on repeat occurrence
1001  it->second--;
1002  }
1003  int jtileStart = tmp.jtileStart;
1004  int jtileEnd = tmp.jtileEnd;
1005  ntotal += (jtileEnd - jtileStart + 1) + 1;
1006  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
1007  int jatomStart = h_tileJatomStart[jtile];
1008  it = atomStartMap.find(jatomStart);
1009  if (it == atomStartMap.end()) {
1010  // Insert new
1011  atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
1012  } else {
1013 // Decrement counter on repeat occurrence
1014  it->second--;
1015  }
1016  jatomStart |= (65535 << 16);
1017  h_tileJatomStart[jtile] = jatomStart;
1018  }
1019  iatomStart |= (65535 << 16);
1020  tmp.iatomStart = iatomStart;
1021  h_tileLists[j] = tmp;
1022  }
1023  ncache += atomStartMap.size();
1024  std::multimap<int, int> imap;
1025  imap.clear();
1026  std::multimap<int, int>::iterator imap_it;
1027  std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
1028  if (i < 400) {
1029  printf("%d %d\n", ntotal, imap.size());
1030  for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
1031  if (imap_it->first != 0)
1032  printf("(%d %d)\n", imap_it->first, imap_it->second);
1033  }
1034  }
1035  }
1036  printf("ntotal %d ncache %d\n", ntotal, ncache);
1037  copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
1038  copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
1039  cudaCheck(cudaStreamSynchronize(stream));
1040  delete [] h_tileLists;
1041  delete [] h_tileJatomStart;
1042 }
1043 */
1044 
1045 void CudaTileListKernel::prepareBuffers(
1046  int atomStorageSizeIn, int numPatchesIn,
1047  const CudaPatchRecord* h_cudaPatches,
1048  cudaStream_t stream
1049 ) {
1050  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSizeIn, OVERALLOC);
1051 
1052  // Copy cudaPatches to device
1053  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatchesIn);
1054  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatchesIn, stream);
1055 }
1056 
1057 
1058 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
1059  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
1060  const float3 lata, const float3 latb, const float3 latc,
1061  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
1062  const float plcutoff2In, const size_t maxShmemPerBlock,
1063  cudaStream_t stream, const bool atomsChanged, const bool allocatePart,
1064  bool CUDASOAintegratorOn, bool deviceMigration) {
1065 
1066  numPatches = numPatchesIn;
1067  atomStorageSize = atomStorageSizeIn;
1068  maxTileListLen = maxTileListLenIn;
1069  plcutoff2 = plcutoff2In;
1070 
1071  if (doStreaming) {
1072  // Re-allocate patchNumLists
1073  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1074  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1075  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1076  }
1077 
1078 // Re-allocate tileLists1 and patchPairs1
1079  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1080  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1081 
1086  // Copy cudaPatches to device
1087  // If DeviceMigration is on then this happens in prepareBuffers()
1088  if (!CUDASOAintegratorOn || !deviceMigration) {
1089  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1090  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1091  }
1092 
1093  // Re-allocate temporary storage
1094  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1095  // Calculate tile list positions (tilePos)
1096  {
1097  int nthread = DEFAULTKERNEL_NUM_THREAD;
1098  int nblock = 1;
1099  calcTileListPosKernel<DEFAULTKERNEL_NUM_THREAD> <<< nblock, nthread, 0, stream >>> (
1100  numComputes, cudaComputes, cudaPatches, tilePos);
1101  cudaStreamSynchronize(stream);// for now
1102  cudaCheck(cudaGetLastError());
1103  }
1104 
1105  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1106  {
1107  int nthread = UPDATEPATCHESKERNEL_NUM_THREAD;
1108  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/BOUNDINGBOXSIZE)+1);
1109  updatePatchesKernel<BOUNDINGBOXSIZE> <<< nblock, BOUNDINGBOXSIZE, 0, stream >>> (
1110  numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1111  // cudaCheck(cudaGetLastError());
1112  }
1113 
1114  // ---------------------------------------------------------------------------------------------
1115 
1116 
1117 // NOTE: tileListDepth2 and tileListOrder2 must be at least the same size as
1118 // tileListDepth1 and tileListOrder1 since they're used in sorting
1119  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1120  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1121 
1122  // Allocate with +1 to include last term in the exclusive sum
1123  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1124 
1125  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1126 
1127  if (!deviceMigration) {
1128  if(atomsChanged) reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1129  if(!CUDASOAintegratorOn || atomsChanged) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1130  }
1131 
1132 // This is allocated for FEP only
1133  if(allocatePart) reallocate_device<char>(&part, &partSize, atomStorageSize, OVERALLOC);
1134 
1135  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1136  {
1137  int numBoundingBoxes = atomStorageSize/BOUNDINGBOXSIZE;
1138  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1139 
1140  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1141  // int nthread = BOUNDINGBOXSIZE*nwarp;
1142 int nthread = BOUNDINGBOXSIZE*1; // one bounding box per block, since BlockReduce spans the whole thread block
1143  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1144  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1145  cudaCheck(cudaGetLastError());
1146  }
1147 
1148  {
1149  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1150  // int nthread = BOUNDINGBOXSIZE*nwarp;
1151 int nthread = BOUNDINGBOXSIZE*1; // BOUNDINGBOXSIZE threads per block, since BlockScan spans the whole thread block
1152  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1153 
1154  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1155  if(shmem_size > maxShmemPerBlock){
1156  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1157  }
1158 
1159 // NOTE: In the first call numJtiles = 1, so buildTileListsBBKernel overflows and
1160 // reports the required size in h_tileListStat->numJtiles. In subsequent calls,
1161 // re-allocation only happens when the size is exceeded.
1162  h_tileListStat->tilesSizeExceeded = true;
1163  int reallocCount = 0;
1164  while (h_tileListStat->tilesSizeExceeded) {
1165  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1166 
1167  clearTileListStat(stream);
1168  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1169 
1170  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1171  (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1172  lata, latb, latc, plcutoff2, maxTileListLen,
1173  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1174  tileListDepth1, tileListOrder1, patchPairs1,
1175  d_tileListStat
1176  );
1177  cudaCheck(cudaGetLastError());
1178 
1179  // get (numATileLists, numJtiles, tilesSizeExceeded)
1180  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1181  cudaCheck(cudaStreamSynchronize(stream));
1182  numJtiles = h_tileListStat->numJtiles;
1183 
1184  if (h_tileListStat->tilesSizeExceeded) {
1185  reallocCount++;
1186  if (reallocCount > 1) {
1187  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1188  }
1189  }
1190 
1191  }
1192 
1193  numTileLists = h_tileListStat->numTileLists;
1194 
1195  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1196  }
1197 
1198  // Re-allocate tileListVirialEnergy.
1199 // NOTE: Since numTileLists here is an upper estimate (it's based on bounding boxes),
1200 // we're guaranteed to have enough space
1201  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1202 
1203  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1204  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1205  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1206  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1207  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1208 
1209  int numTileListsSrc = numTileListsPrev;
1210  int numJtilesSrc = numJtiles;
1211  int numTileListsDst = numTileLists;
1212  int numJtilesDst = numJtiles;
1213 
1214 #if 0
1215  writeTileList("tileListBefore_HIP.txt", numTileLists, tileLists1, stream);
1216  writeTileJatomStart("tileJatomStartBefore_HIP.txt", numJtiles, tileJatomStart1, stream);
1217 #endif
1218 
1219  // Sort tiles
1220  sortTileLists(
1221  false,
1222  0, false,
1223  numTileListsSrc, numJtilesSrc,
1224  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1225  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1226  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1227  numTileListsDst, numJtilesDst,
1228  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1229  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1230  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1231  stream);
1232 
1233 #if 0
1234  writeTileList("tileListAfter_HIP.txt", numTileLists, tileLists1, stream);
1235  writeTileJatomStart("tileJatomStartAfter_HIP.txt", numJtiles, tileJatomStart1, stream);
1236 #endif
1237 
1238 
1239  // Set active buffer to 2
1240  setActiveBuffer(2);
1241 
1242  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1243 }
1244 
1245 //
1246 // Returns the number of bits needed to represent a, i.e. floor(log2(a)) + 1 (e.g. ilog2(4) = 3, ilog2(7) = 3)
1247 //
1248 int ilog2(int a) {
1249  // if (a < 0)
1250  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1251  int k = 1;
1252  while (a >>= 1) k++;
1253  return k;
1254 }
1255 
1256 //
1257 // Sort tile lists
1258 //
1259 void CudaTileListKernel::sortTileLists(
1260  const bool useJtiles,
1261  const int begin_bit, const bool highDepthBitsSetIn,
1262  // Source
1263  const int numTileListsSrc, const int numJtilesSrc,
1264  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1265  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1266  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1267  // Destination
1268  const int numTileListsDst, const int numJtilesDst,
1269  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1270  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1271  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1272  cudaStream_t stream) {
1273 
1274  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1275 
1276  // if (numTileListsDst == 0)
1277  // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1278 
1279  // Check that the array sizes are adequate
1280  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1281  numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1282  (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1283  (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1284  NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1285 
1286  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1287  numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1288  (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1289  (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1290  NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1291 
1292  if (begin_bit != 0 && begin_bit != 16)
1293  NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1294 
1295  // Number of bits needed in the sort
1296  int num_bit = ilog2(maxTileListLen);
1297  if (num_bit > 16)
1298  NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1299  int end_bit = begin_bit + num_bit;
1300 
1301  if (doStreaming)
1302  {
1303  // ----------------------------------------------------------------------------------------
1304  if (doOutputOrder && useJtiles) {
1305  // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1306  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1307 
1308  // Calculate position from depth
1309  {
1310  // -----------------------------------------------------------------------------
1311  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1312  // -----------------------------------------------------------------------------
1313  if (doShiftDown)
1314  {
1315  int nthread = DEFAULTKERNEL_NUM_THREAD;
1316  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1317  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1318  (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1319  cudaCheck(cudaGetLastError());
1320  }
1321 
1322  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1323 
1324  // --------------------------------------------------------------------
1325  // Compute itileList positions to store tileLists
1326  // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
1327  // --------------------------------------------------------------------
1328  {
1329  size_t size = 0;
1330  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1331  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1332  // Make sure tempStorage doesn't remain NULL
1333  if (size == 0) size = 128;
1334  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1335  size = tempStorageSize;
1336  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1337  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1338  }
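// CUB device-wide algorithms follow a two-phase protocol: a first call with a
// NULL workspace only reports the required temporary storage size, and a
// second call with the allocated workspace does the actual work. The idiom in
// isolation (illustrative, with a one-off allocation instead of the reusable
// tempStorage buffer used above):
#if 0
  int n = 1024;
  int *d_in, *d_out;
  cudaCheck(cudaMalloc((void **)&d_in, n*sizeof(int)));
  cudaCheck(cudaMalloc((void **)&d_out, n*sizeof(int)));
  size_t tempBytes = 0;
  cub::DeviceScan::ExclusiveSum(NULL, tempBytes, d_in, d_out, n, stream); // size query only
  void* d_temp = NULL;
  cudaCheck(cudaMalloc(&d_temp, tempBytes));
  cub::DeviceScan::ExclusiveSum(d_temp, tempBytes, d_in, d_out, n, stream); // actual scan
  cudaCheck(cudaFree(d_temp));
#endif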
1339  }
1340 
1341  // Store in reverse order from outputOrder
1342  {
1343  int nthread = DEFAULTKERNEL_NUM_THREAD;
1344  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1345  storeInReverse <<< nblock, nthread, 0, stream >>>
1346  (numTileListsSrc, begin_bit, outputOrder, tileListPos,
1347  tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1348  tileListOrderDst.ptr, tileListDepthDst.ptr);
1349  cudaCheck(cudaGetLastError());
1350  }
1351 
1352  // Build sortKeys
1353  {
1354  maxTileListLen_sortKeys = maxTileListLen;
1355 
1356  reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1357  clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1358 
1359  // Re-allocate and initialize minmaxListLen
1360  {
1361  reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1362 
1363  int nthread = DEFAULTKERNEL_NUM_THREAD;
1364  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1365  initMinMaxListLen <<< nblock, nthread, 0, stream >>>
1366  (numComputes, maxTileListLen, minmaxListLen);
1367  cudaCheck(cudaGetLastError());
1368  }
1369 
1370  // Build sortKeys and calculate minmaxListLen
1371  {
1372  int nthread = DEFAULTKERNEL_NUM_THREAD;
1373  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1374  buildSortKeys <<< nblock, nthread, 0, stream >>>
1375  (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1376  tileListDepthDst.ptr, minmaxListLen, sortKeys);
1377  cudaCheck(cudaGetLastError());
1378 
1379  // Maximum value in sortKeys[] is numTileListsDst - 1
1380  sortKeys_endbit = ilog2(numTileListsDst);
1381  }
1382 
1383  // Fill in missing sortKeys using minmaxListLen
1384  {
1385  int nthread = DEFAULTKERNEL_NUM_THREAD;
1386  int nwarp = nthread/WARPSIZE;
1387  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1388  fillSortKeys <<< nblock, nthread, 0, stream >>>
1389  (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1390  cudaCheck(cudaGetLastError());
1391  }
1392 
1393  }
1394 
1395  doOutputOrder = false;
1396 
1397  } else if (doOutputOrder) {
1398 // outputOrder will be produced by the next pairlist non-bonded kernel.
1399 // This time just remove zero-length lists
1400  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1401 
1402  int endbit_tmp = ilog2(numTileListsSrc);
1403 
1404  // Remove zeros
1405  {
1406  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1407  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1408 
1409  int nthread = DEFAULTKERNEL_NUM_THREAD;
1410  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1411  buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
1412  (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1413  cudaCheck(cudaGetLastError());
1414  }
1415 
1416  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1417  {
1418 // Short list, sort within a single thread block
1419  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1420  int nblock = 1;
1421 
1422  unsigned int oobKey = numTileListsSrc;
1423  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1424  (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1425  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1426  cudaCheck(cudaGetLastError());
1427  }
1428  else
1429  {
1430  // Long list, sort on multiple thread blocks
1431  size_t size = 0;
1432  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1433  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1434  numTileListsSrc, 0, endbit_tmp, stream));
1435  // Make sure tempStorage doesn't remain NULL
1436  if (size == 0) size = 128;
1437  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1438  size = tempStorageSize;
1439  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1440  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1441  numTileListsSrc, 0, endbit_tmp, stream));
1442  }
1443 
1444  // Re-order tileListDepth using tileListOrderDst
1445  {
1446  int nthread = DEFAULTKERNEL_NUM_THREAD;
1447  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1448  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1449  (numTileListsDst, tileListOrderDst.ptr,
1450  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1451  cudaCheck(cudaGetLastError());
1452  }
1453 
1454  } else {
1455  // This is done during regular MD cycle
1456 
1457  if (sortKeys_endbit <= 0)
1458  NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1459 
1460  // Setup sort keys
1461  {
1462  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1463  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1464 
1465  int nthread = DEFAULTKERNEL_NUM_THREAD;
1466  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1467  setupSortKey <<< nblock, nthread, 0, stream >>>
1468  (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1469  cudaCheck(cudaGetLastError());
1470 
1471  }
1472 
1473  // Global sort
1474  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1475  // if (false)
1476  {
1477  // Short list, sort within a single thread block
1478  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1479  int nblock = 1;
1480 
1481  unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1482  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1483  (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1484  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1485  cudaCheck(cudaGetLastError());
1486  }
1487  else
1488  {
1489  // Long list, sort on multiple thread blocks
1490  size_t size = 0;
1491  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1492  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1493  numTileListsSrc, 0, sortKeys_endbit, stream));
1494  // Make sure tempStorage doesn't remain NULL
1495  if (size == 0) size = 128;
1496  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1497  size = tempStorageSize;
1498  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1499  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1500  numTileListsSrc, 0, sortKeys_endbit, stream));
1501  }
1502 
1503  // Re-order tileListDepth using tileListOrderDst
1504  {
1505  int nthread = DEFAULTKERNEL_NUM_THREAD;
1506  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1507  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1508  (numTileListsDst, tileListOrderDst.ptr,
1509  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1510  cudaCheck(cudaGetLastError());
1511  }
1512 
1513  // Local sort
1514  {
1515  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/WARPSIZE+1);
1516  localSort<WARPSIZE> <<< nblock, WARPSIZE, 0, stream >>> (
1517  numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1518  cudaCheck(cudaGetLastError());
1519 
1520  // No need to shift any more
1521  doShiftDown = false;
1522  }
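  // Note (inferred from doShiftDown being cleared here, not stated in the
  // original): localSort evidently leaves tileListDepthDst already shifted
  // down into the low 16 bits as it reorders, so the separate
  // bitshiftTileListDepth pass guarded by doShiftDown below can be skipped
  // on this path.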
1523 
1524  }
1525  // ----------------------------------------------------------------------------------------
1526 
1527  } // (doStreaming)
1528  else
1529  {
1530  // --------------------------------------------------------------------
1531  // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1532  // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1533  // --------------------------------------------------------------------
1534  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1535  {
1536  // Short list, sort within a single thread block
1537  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1538  int nblock = 1;
1539 
1540  sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
1541  (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1542  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1543  cudaCheck(cudaGetLastError());
1544 
1545  }
1546  else
1547  {
1548  // Long list, sort on multiple thread blocks
1549  size_t size = 0;
1550  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1551  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1552  numTileListsSrc, begin_bit, end_bit, stream));
1553  // Make sure tempStorage doesn't remain NULL
1554  if (size == 0) size = 128;
1555  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1556  size = tempStorageSize;
1557  cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1558  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1559  numTileListsSrc, begin_bit, end_bit, stream));
1560  }
1561  }
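  // Note (an interpretation, not stated in the original): sorting in
  // descending order of tileListDepth places the longest tile lists first,
  // which tends to improve load balance when thread blocks pick up work in
  // order during the non-bonded kernel.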
1562 
1563  // -----------------------------------------------------------------------------
1564  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1565  // -----------------------------------------------------------------------------
1566  if (doShiftDown)
1567  {
1568  int nthread = DEFAULTKERNEL_NUM_THREAD;
1569  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1570  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1571  (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1572  cudaCheck(cudaGetLastError());
1573  }
1574 
1575  // Allocate with +1 to include last term in the exclusive sum
1576  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1577 
1578  // --------------------------------------------------------------------
1579  // Compute itileList positions to store tileLists
1580  // ExclusiveSum(tileListDepthDst[0 ... numTileListsDst])
1581  // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1582  // since this is an exclusive sum. But with this trick,
1583  // tileListPos[numTileListsDst] will contain the total number
1584  // of tile lists
1585  // --------------------------------------------------------------------
1586  {
1587  size_t size = 0;
1588  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1589  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1590  // Make sure tempStorage doesn't remain NULL
1591  if (size == 0) size = 128;
1592  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1593  size = tempStorageSize;
1594  // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
1595  // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1596  cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1597  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1598  }
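  // Worked example of the +1 trick above (illustrative numbers): with
  // numTileListsDst == 3 and depths {4, 1, 2}, the exclusive sum over the
  // four elements {4, 1, 2, x} (x is never read) yields
  // tileListPos = {0, 4, 5, 7}; entries 0..2 are the write offsets and
  // tileListPos[3] == 7 holds the full sum, with no extra reduction needed.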
1599 
1600  // --------------------------------------------------------------------
1601  // Re-package to
1602  // tileListsDst[0 ... numTileListsDst-1]
1603  // patchPairsDst[]
1604  // tileJatomStartDst[0 ... numJtilesDst-1]
1605  // tileExclsDst[0 ... numJtilesDst-1]
1606  // --------------------------------------------------------------------
1607  {
1608  // int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1609  int nthread = BOUNDINGBOXSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1610  int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1611  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1612 
1613  repackTileListsKernel <<< nblock, nthread, 0, stream >>>
1614  (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1615  (useJtiles) ? jtiles : NULL,
1616  tileListsSrc.ptr, tileListsDst.ptr,
1617  patchPairsSrc.ptr, patchPairsDst.ptr,
1618  tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1619  tileExclsSrc.ptr, tileExclsDst.ptr);
1620  }
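  // Note (inferred from the grid computation above): the block count is
  // derived from nwarp rather than nthread, which suggests
  // repackTileListsKernel assigns one BOUNDINGBOXSIZE-wide group (one warp
  // on CUDA) per tile list and lets the lanes copy that list's tiles in
  // parallel.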
1621 
1622  // Count the number of tileLists that contribute to each patch
1623  if (doStreaming)
1624  {
1625  clear_device_array<int>(patchNumLists, numPatches, stream);
1626 
1627  // Fill in patchNumLists[0...numPatches-1]
1628  int nthread = CALCPATCHNUMLISTSKERNEL_NUM_THREAD;
1629  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1630  calcPatchNumLists <<< nblock, nthread, 0, stream >>>
1631  (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1632  cudaCheck(cudaGetLastError());
1633 
1634  // Use emptyPatches[numPatches] as the count variable
1635  clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1636 
1637  // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1638  // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1639  setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
1640  (numTileListsDst, tileListsDst.ptr, patchNumLists,
1641  numPatches, &emptyPatches[numPatches], emptyPatches);
1642  cudaCheck(cudaGetLastError());
1643 
1644  // Copy emptyPatches[0 ... numPatches] to host
1645  copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1646  cudaCheck(cudaStreamSynchronize(stream));
1647  numEmptyPatches = h_emptyPatches[numPatches];
1648  }
1649 
1650 }
1651 
1652 //
1653 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
1654 //
1655 void CudaTileListKernel::reSortTileLists(const bool doGBIS, const bool CUDASOAIntegratorOn, cudaStream_t stream) {
1656  // Store previous number of active lists
1657  int numTileListsPrev = numTileLists;
1658 
1659  // Wait for finishTileList() to stop copying
1660  if (!tileListStatEventRecord)
1661  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1662  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1663 
1664  // Get numTileLists, numTileListsGBIS, and numExcluded
1665  {
1666  numTileLists = h_tileListStat->numTileLists;
1667  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1668  numExcluded = h_tileListStat->numExcluded;
1669  }
1670 
1671  // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1672  // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1673  sortTileLists(true, 0, true,
1674  numTileListsPrev, numJtiles,
1675  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1676  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1677  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1678  numTileLists, numJtiles,
1679  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1680  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1681  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1682  stream);
1683 
1684  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1685  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1686  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1687  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1688 
1689  // NOTE:
1690  // Only {tileLists1, tileJatomStart1, tileExcls1} are used from here on;
1691  // the rest, {tileListDepth1, tileListOrder1, patchPairs1}, may be re-used by the GBIS sorting
1692 
1693  if (doGBIS) {
1694  // GBIS is used => produce a second tile list
1695  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1696  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1697  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1698 
1699  sortTileLists(true, 16, true,
1700  numTileListsPrev, numJtiles,
1701  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1702  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1703  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1704  numTileListsGBIS, numJtiles,
1705  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1706  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1707  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1708  stream);
1709  }
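  // Note (inferred from the begin_bit arguments 0 and 16 above and from the
  // "lower 16 bits" shift/mask step inside sortTileLists): the per-list
  // depth word apparently packs the VdW tile count in its low 16 bits and
  // the GBIS tile count in its high 16 bits, so the same source buffers can
  // be sorted twice, once for each half.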
1710 
1711  // Set active buffer to be 1
1712  setActiveBuffer(1);
1713 
1714 }
1715 
1716 /*
1717 //
1718 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1719 //
1720 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1721  return;
1722 
1723  if (!doStreaming || !doOutputOrder)
1724  return;
1725 
1726  // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1727  // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1728  sortTileLists(false, 0, true,
1729  numTileLists, numJtiles,
1730  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1731  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1732  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1733  numTileLists, numJtiles,
1734  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1735  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1736  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1737  stream);
1738 
1739  // Set active buffer to be 2
1740  setActiveBuffer(2);
1741 
1742 }
1743 */
1744 
1745 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
1746  if (len > tileListVirialEnergySize) {
1747  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1748  }
1749  tileListVirialEnergyLength = len;
1750 }
1751 
1752 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
1753  if (len > tileListVirialEnergySize) {
1754  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1755  }
1756  tileListVirialEnergyGBISLength = len;
1757 }
1758 
1759 #endif //NAMD_HIP