NAMD
CudaTileListKernel.cu
1 // #include <map>
2 // #include <algorithm>
3 
4 #if defined(NAMD_CUDA)
5 #include <cuda.h>
6 #if __CUDACC_VER_MAJOR__ >= 11
7 #include <cub/device/device_radix_sort.cuh>
8 #include <cub/device/device_scan.cuh>
9 #include <cub/cub.cuh>
10 #else
11 #include <namd_cub/device/device_radix_sort.cuh>
12 #include <namd_cub/device/device_scan.cuh>
13 #include <namd_cub/cub.cuh>
14 #endif
15 #endif // NAMD_CUDA
16 
17 #include "CudaUtils.h"
18 #include "CudaTileListKernel.h"
19 #include "CudaComputeNonbondedKernel.h"
20 #include "DeviceCUDA.h"
21 
22 #if defined(NAMD_CUDA)
23 
24 #ifdef WIN32
25 #define __thread __declspec(thread)
26 #endif
27 
28 extern __thread DeviceCUDA *deviceCUDA;
29 
30 #define OVERALLOC 2.0f
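// Illustration: OVERALLOC is passed as the growth factor to the
// reallocate_device()/reallocate_host() helpers used below, e.g.
//   reallocate_device<TileList>(&tileLists1, &tileLists1Size, n, OVERALLOC);
// presumably padding the requested capacity so that slowly growing arrays are
// not re-allocated on every pairlist build.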
31 
32 #if __CUDA_ARCH__ < 350
33 #define __ldg *
34 #endif
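// Illustration: on architectures without the __ldg() intrinsic the macro above
// turns a read-only cache load into a plain dereference, e.g.
//   __ldg(&tileListsSrc[j].icompute)  -->  *(&tileListsSrc[j].icompute)
// so the same source still compiles on pre-sm_35 devices.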
35 
36 void NAMD_die(const char *);
37 
38 //
39 // Calculate the number of lists that contribute to each patch
40 //
41 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
42  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
43 
44  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
45  {
46  int2 patchInd = tileLists[i].patchInd;
47  atomicAdd(&patchNumLists[patchInd.x], 1);
48  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
49  }
50 
51 }
52 
53 //
54 // Write patchNumList back to the tile lists and
55 // store empty patches into emptyPatches[0 ... numEmptyPatches-1]
56 //
57 __global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
58  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
59  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
60 
61  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
62  {
63  int2 patchInd = tileLists[i].patchInd;
64  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
65  tileLists[i].patchNumList = patchNumList;
66  }
67 
68  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
69  {
70  if (patchNumLists[i] == 0) {
71  int ind = atomicAdd(numEmptyPatches, 1);
72  emptyPatches[ind] = i;
73  }
74  }
75 
76 }
77 
78 //
79 // Builds a sort key that removes zeros but keeps the order otherwise the same
80 //
81 __global__ void buildRemoveZerosSortKey(const int numTileLists,
82  const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
83 
84  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
85  {
86  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
87  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
88  }
89 
90 }
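// Worked example (sketch): with numTileLists = 4 and depths {3, 0, 5, 0} at
// begin_bit = 0, the keys become {0, 4, 2, 4}. Sorting ascending by these keys
// keeps the non-empty lists in their original order and pushes the empty ones
// to the end, where they can be dropped.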
91 
92 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
93  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
94  const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
95 
96  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
97  {
98  int icompute = tileLists[itileList].icompute;
99  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
100  int i = icompute*maxTileListLen + (depth - 1);
101  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
102  }
103 
104 }
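// Sketch: each surviving list is keyed by its (icompute, depth) slot in the
// precomputed sortKeys[] table, e.g. a list from compute 2 with depth 7 and
// maxTileListLen = 64 reads sortKeys[2*64 + 6]. A zero-depth list gets
// 0x7fffffff so it sorts to the very end.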
105 
106 template <int width>
107 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
108  unsigned int* __restrict__ keys, int* __restrict__ vals) {
109 
110  // NOTE: blockDim.x = width
111 
112  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
113  {
114  int i = base + threadIdx.x;
115  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
116  __shared__ typename BlockRadixSort::TempStorage tempStorage;
117  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
118  int val[1] = {(i < n) ? vals[i] : 0};
119  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
120  if (i < n) {
121  keys[i] = key[0];
122  vals[i] = val[0];
123  }
124  BLOCK_SYNC;
125  }
126 
127 }
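// NOTE (illustration): the template parameter width must equal blockDim.x for
// the BlockRadixSort above to be valid; the only instantiation in this file is
// localSort<32>, launched with 32-thread blocks from sortTileLists() below.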
128 
129 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
130  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
131  const int* __restrict__ tileListOrderSrc,
132  const unsigned int* __restrict__ tileListDepthSrc,
133  int* __restrict__ tileListOrderDst,
134  unsigned int* __restrict__ tileListDepthDst) {
135 
136  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
137  {
138  int j = outputOrder[numTileListsSrc - i - 1];
139  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
140  int k = tileListPos[i];
141  tileListDepthDst[k] = tileListDepthSrc[j];
142  tileListOrderDst[k] = j; //tileListOrderSrc[j];
143  }
144  }
145 }
146 
147 //
148 // Mark non-empty tile lists: writes a 0/1 flag per list, traversing outputOrder in reverse
149 //
150 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
151  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
152  unsigned int* __restrict__ tileListDepthDst) {
153 
154  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
155  {
156  int j = outputOrder[numTileLists - i - 1];
157  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
158  }
159 
160 }
161 
162 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
163  int2* __restrict__ minmaxListLen) {
164 
165  int2 val;
166  val.x = maxTileListLen+1;
167  val.y = 0;
168  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
169  {
170  minmaxListLen[i] = val;
171  }
172 
173 }
174 
175 //
176 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
177 //
178 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
179  const TileList* __restrict__ tileListsSrc,
180  const int* __restrict__ tileListOrderDst,
181  const unsigned int* __restrict__ tileListDepthDst,
182  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
183 
184  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
185  {
186  int k = tileListOrderDst[i];
187  int icompute = tileListsSrc[k].icompute;
188  int depth = tileListDepthDst[i] & 65535;
189  // depth is in range [1 ... maxTileListLen]
190  int j = icompute*maxTileListLen + (depth-1);
191  sortKeys[j] = i;
192  int2 minmax = minmaxListLen[icompute];
193  int2 minmaxOrig = minmax;
194  if (minmax.x > depth) minmax.x = depth;
195  if (minmax.y < depth) minmax.y = depth;
196  if (minmax.x != minmaxOrig.x) {
197  atomicMin(&minmaxListLen[icompute].x, minmax.x);
198  }
199  if (minmax.y != minmaxOrig.y) {
200  atomicMax(&minmaxListLen[icompute].y, minmax.y);
201  }
202  }
203 
204 }
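// Illustration: the atomicMin/atomicMax above are only issued when this
// thread's depth actually extends the per-compute [min, max] range it read,
// which appears to be a cheap way of keeping atomic traffic low when most
// lists of a compute have similar depths.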
205 
206 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
207  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
208 
209  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
210  const int wid = threadIdx.x % WARPSIZE;
211  int2 minmax = minmaxListLen[i];
212  int minlen = minmax.x;
213  int maxlen = minmax.y;
214  // minlen, maxlen are in range [1 ... maxTileListLen]
215  // as long as compute i appears as some tileListsSrc[].icompute in buildSortKeys() above
216  if ( maxlen < minlen ) {
217  minlen = 1;
218  maxlen = maxTileListLen;
219  }
220  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
221  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
222  unsigned int aveKey = (maxKey + minKey)/2;
223  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
224  sortKeys[i*maxTileListLen + j] = minKey;
225  }
226  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
227  sortKeys[i*maxTileListLen + j] = maxKey;
228  }
229  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
230  if (sortKeys[i*maxTileListLen + j] == 0) {
231  sortKeys[i*maxTileListLen + j] = aveKey;
232  }
233  }
234  }
235 
236 }
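// Worked example (sketch): suppose a compute only produced lists with depths
// 5..8 (minlen = 5, maxlen = 8). Slots [0..3] are filled with the key of
// depth 5, slots [8..maxTileListLen-1] with the key of depth 8, and any slot
// in between that buildSortKeys() left at zero gets the average of those two
// keys, so every possible depth maps to a usable sort key.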
237 
238 //
239 // Calculate bounding boxes for sets of WARPSIZE=32 atoms
240 //
241 #define BOUNDINGBOXKERNEL_NUM_WARP 8
242 __global__ void
243 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
244  BoundingBox* __restrict__ boundingBoxes) {
245 
246  const int warpId = threadIdx.x / WARPSIZE;
247  const int wid = threadIdx.x % WARPSIZE;
248 
249  // Loop with warp-aligned index to avoid warp-divergence
250  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
251  // Full atom index
252  const int i = iwarp + wid;
253  // Bounding box index
254  const int ibb = i/WARPSIZE;
255 
256  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
257 
258  volatile float3 minxyz, maxxyz;
259 
260  typedef cub::WarpReduce<float> WarpReduce;
261  __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
262  minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
263  minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
264  minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
265  maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
266  maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
267  maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
268 
269  if (wid == 0) {
270  BoundingBox boundingBox;
271  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
272  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
273  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
274  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
275  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
276  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
277  boundingBoxes[ibb] = boundingBox;
278  }
279  }
280 
281 }
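// Illustration: each BoundingBox stores the center (x, y, z) and half-widths
// (wx, wy, wz) of one WARPSIZE-atom group, i.e. the group fits inside
// [x - wx, x + wx] x [y - wy, y + wy] x [z - wz, z + wz]. This is what the
// distsq() lower bound below relies on.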
282 
283 //
284 // Returns the lower estimate for the distance between two bounding boxes
285 //
286 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
287  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
288  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
289  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
290  float r2 = dx*dx + dy*dy + dz*dz;
291  return r2;
292 }
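// Worked example: two boxes whose centers are 10.0 apart along x, with
// half-widths 2.0 and 3.0, give dx = max(0, 10 - 2 - 3) = 5 and dy = dz = 0,
// so distsq() returns 25.0. Overlapping boxes return 0, so the bound never
// rejects a pair that could actually be within the cutoff.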
293 
294 #if 0
295 //
296 // Performs warp-level exclusive sum on a shared memory array:
297 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
298 // sh_in and sh_out can point to same array
299 // Returns the total sum
300 //
301 template <typename T>
302 __device__ __forceinline__
303 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
304  const int wid = threadIdx.x % WARPSIZE;
305  volatile int blockOffset = 0;
306  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
307  // Size of the exclusive sum
308  int blockLen = min(WARPSIZE, n-iblock);
309  // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
310  typedef cub::WarpScan<int> WarpScan;
311  __shared__ typename WarpScan::TempStorage tempStorage;
312  int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
313  WarpScan(tempStorage).ExclusiveSum(data, data);
314  // Shift by block offset
315  data += blockOffset;
316  // Save last value
317  int last = (int)sh_in[iblock + blockLen-1];
318  // Write output
319  if (wid < blockLen) sh_out[iblock + wid] = data;
320  // block offset = last term of the exclusive sum + last value
321  blockOffset = sh_out[iblock + blockLen-1] + last;
322  }
323  return blockOffset;
324 }
325 #endif
326 
327 #define TILELISTKERNELNEW_NUM_WARP 4
328 
329 //
330 // NOTE: Executed on a single thread block
331 //
332 template<int nthread>
333 __global__ void calcTileListPosKernel(const int numComputes,
334  const CudaComputeRecord* __restrict__ computes,
335  const CudaPatchRecord* __restrict__ patches,
336  int* __restrict__ tilePos) {
337 
338  typedef cub::BlockScan<int, nthread> BlockScan;
339 
340  __shared__ typename BlockScan::TempStorage tempStorage;
341  __shared__ int shTilePos0;
342 
343  if (threadIdx.x == nthread-1) {
344  shTilePos0 = 0;
345  }
346 
347  for (int base=0;base < numComputes;base+=nthread) {
348  int k = base + threadIdx.x;
349 
350  int numTiles1 = (k < numComputes) ?
351  (CudaComputeNonbondedKernel::computeNumTiles(patches[computes[k].patchInd.x].numAtoms, WARPSIZE)) : 0;
352  // Calculate positions in tile list and jtile list
353  int tilePosVal;
354  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
355 
356  // Store into global memory
357  if (k < numComputes) {
358  tilePos[k] = shTilePos0 + tilePosVal;
359  }
360 
361  BLOCK_SYNC;
362  // Store block end position
363  if (threadIdx.x == nthread-1) {
364  shTilePos0 += tilePosVal + numTiles1;
365  }
366  }
367 }
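// Illustration: the kernel above runs as a single thread block and produces an
// exclusive prefix sum of the per-compute i-tile counts, so tilePos[k] is the
// first tile-list slot owned by compute k. E.g. tile counts {3, 1, 2} give
// tilePos = {0, 3, 4}.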
368 
369 
370 template<int nthread>
371 __global__ void updatePatchesKernel(const int numComputes,
372  const int* __restrict__ tilePos,
373  const CudaComputeRecord* __restrict__ computes,
374  const CudaPatchRecord* __restrict__ patches,
375  TileList* __restrict__ tileLists) {
376 
377  const int tid = threadIdx.x % nthread;
378 
379  // nthread threads take care of one compute
380  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
381  {
382  CudaComputeRecord compute = computes[k];
383  float3 offsetXYZ = compute.offsetXYZ;
384  int2 patchInd = compute.patchInd;
385  int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(patches[patchInd.x].numAtoms);
386  int itileList0 = tilePos[k];
387  for (int i=tid;i < numTiles1;i+=nthread) {
388  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
389  tileLists[itileList0 + i].patchInd = patchInd;
390  tileLists[itileList0 + i].icompute = k;
391  }
392  }
393 
394 }
395 
396 __host__ __device__ __forceinline__
397 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
398  // Size in bytes
399  int size = (
400  maxTileListLen*sizeof(char)
401  );
402  return size;
403 }
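// Illustration: the per-thread shared-memory budget is one char flag per
// candidate j-tile, i.e. maxTileListLen bytes. The launch in buildTileLists()
// below sizes the dynamic shared memory as sizePerThread * nthread, e.g.
// maxTileListLen = 64 with 128 threads would request 8192 bytes.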
404 
405 __global__ void
406 buildTileListsBBKernel(const int numTileLists,
407  TileList* __restrict__ tileLists,
408  const CudaPatchRecord* __restrict__ patches,
409  const int* __restrict__ tileListPos,
410  const float3 lata, const float3 latb, const float3 latc,
411  const float cutoff2, const int maxTileListLen,
412  const BoundingBox* __restrict__ boundingBoxes,
413  int* __restrict__ tileJatomStart,
414  const int tileJatomStartSize,
415  unsigned int* __restrict__ tileListDepth,
416  int* __restrict__ tileListOrder,
417  PatchPairRecord* __restrict__ patchPairs,
418  TileListStat* __restrict__ tileListStat) {
419 
420  extern __shared__ char sh_buffer[];
421  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
422  int pos = threadIdx.x*sizePerThread;
423  volatile char* sh_tile = (char*)&sh_buffer[pos];
424 
425  // Loop with warp-aligned index to avoid warp-divergence
426  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
427 
428  // Use one thread per tile list
429  const int wid = threadIdx.x % WARPSIZE;
430  const int itileList = iwarp + wid;
431 
432  int i;
433  int itileListLen = 0;
434  CudaPatchRecord patch1;
435  CudaPatchRecord patch2;
436  float3 offsetXYZ;
437  int2 patchInd;
438  int numTiles2;
439  int icompute;
440 
441  if (itileList < numTileLists) {
442  offsetXYZ = tileLists[itileList].offsetXYZ;
443  patchInd = tileLists[itileList].patchInd;
444  icompute = tileLists[itileList].icompute;
445  // Get i-column
446  i = itileList - tileListPos[icompute];
447 
448  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
449  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
450  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
451 
452  // DH - set zeroShift flag if magnitude of shift vector is zero
453  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
454 
455  // Load patches
456  patch1 = patches[patchInd.x];
457  patch2 = patches[patchInd.y];
458  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
459  numTiles2 = CudaComputeNonbondedKernel::computeNumTiles(patch2.numAtoms);
460  int tileStart1 = patch1.atomStart/WARPSIZE;
461  int tileStart2 = patch2.atomStart/WARPSIZE;
462 
463  // DH - self requires that zeroShift is also set
464  bool self = zeroShift && (tileStart1 == tileStart2);
465 
466  // Load i-atom data (and shift coordinates)
467  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
468  boundingBoxI.x += shx;
469  boundingBoxI.y += shy;
470  boundingBoxI.z += shz;
471 
472  for (int j=0;j < numTiles2;j++) {
473  sh_tile[j] = 0;
474  if (!self || j >= i) {
475  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
476  float r2bb = distsq(boundingBoxI, boundingBoxJ);
477  if (r2bb < cutoff2) {
478  sh_tile[j] = 1;
479  itileListLen++;
480  }
481  }
482  }
483 
484  tileListDepth[itileList] = (unsigned int)itileListLen;
485  tileListOrder[itileList] = itileList;
486  }
487 
488  typedef cub::WarpScan<int> WarpScan;
489  __shared__ typename WarpScan::TempStorage tempStorage;
490  int active = (itileListLen > 0);
491  int activePos;
492  WarpScan(tempStorage).ExclusiveSum(active, activePos);
493  int itileListPos;
494  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
495 
496  int jtileStart, numJtiles;
497  // Last thread in the warp knows the total number
498  if (wid == WARPSIZE-1) {
499  atomicAdd(&tileListStat->numTileLists, activePos + active);
500  numJtiles = itileListPos + itileListLen;
501  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
502  }
503  numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
504  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
505  if (jtileStart + numJtiles > tileJatomStartSize) {
506  // tileJatomStart out of memory, exit
507  if (wid == 0) tileListStat->tilesSizeExceeded = true;
508  return;
509  }
510 
511  int jStart = itileListPos;
512  int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
513  if (wid == WARPSIZE-1) jEnd = numJtiles;
514 
515  if (itileListLen > 0) {
516  // Setup tileLists[]
517  TileList TLtmp;
518  TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
519  TLtmp.jtileStart = jtileStart + jStart;
520  TLtmp.jtileEnd = jtileStart + jEnd - 1;
521  TLtmp.patchInd = patchInd;
522  TLtmp.offsetXYZ = offsetXYZ;
523  TLtmp.icompute = icompute;
524  // TLtmp.patchNumList.x = 0;
525  // TLtmp.patchNumList.y = 0;
526  tileLists[itileList] = TLtmp;
527  // PatchPair
528  PatchPairRecord patchPair;
529  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
530  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
531  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
532  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
533  patchPairs[itileList] = patchPair;
534 
535  // Write tiles
536  int jtile = jtileStart + jStart;
537  for (int j=0;j < numTiles2;j++) {
538  if (sh_tile[j]) {
539  tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
540  jtile++;
541  }
542  }
543 
544  }
545 
546  }
547 
548 }
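// Illustration of the warp-level bookkeeping above: each lane owns one tile
// list; ExclusiveSum(active) numbers the non-empty lists within the warp,
// ExclusiveSum(itileListLen) gives each list its offset into the warp's block
// of j-tiles, and lane WARPSIZE-1 atomically reserves the warp's totals in
// tileListStat before numJtiles/jtileStart are broadcast back with ShuffleIndex.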
549 
550 #define REPACKTILELISTSKERNEL_NUM_WARP 32
551 __global__ void
552 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
553  const int* __restrict__ tileListOrder,
554  const int* __restrict__ jtiles,
555  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
556  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
557  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
558  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
559 
560  const int wid = threadIdx.x % WARPSIZE;
561 
562  // One warp does one tile list
563  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
564  {
565  int j = tileListOrder[i];
566  int start = tileListPos[i];
567  int end = tileListPos[i+1]-1;
568  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
569  // TileList
570  int startOld = __ldg(&tileListsSrc[j].jtileStart);
571  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
572  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
573  float3 offsetXYZ;
574  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
575  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
576  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
577  int2 patchInd = tileListsSrc[j].patchInd;
578  int icompute = __ldg(&tileListsSrc[j].icompute);
579  if (wid == 0) {
580  TileList tileList;
581  tileList.iatomStart = iatomStart;
582  tileList.offsetXYZ = offsetXYZ;
583  tileList.jtileStart = start;
584  tileList.jtileEnd = end;
585  tileList.patchInd = patchInd;
586  tileList.icompute = icompute;
587  tileListsDst[i] = tileList;
588  }
589 
590  if (jtiles == NULL) {
591  // No jtiles, simple copy will do
592  int jtile = start;
593  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
594  if (jtileOld + wid <= endOld) {
595  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
596  }
597  }
598  if (tileExclsSrc != NULL) {
599  int jtile = start;
600  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
601  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
602  }
603  }
604  } else {
605  int jtile0 = start;
606  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
607  int t = jtileOld + wid;
608  int jtile = (t <= endOld) ? jtiles[t] : 0;
609  jtile >>= begin_bit;
610  jtile &= 65535;
611  typedef cub::WarpScan<int> WarpScan;
612  __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
613  int warpId = threadIdx.x / WARPSIZE;
614  int jtilePos;
615  WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
616 
617  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
618 
619  if (tileExclsSrc != NULL) {
620  unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
621  while (b != 0) {
622  // k = index of thread that has data
623  // Promote k to 64-bit integer. DMC doesn't think this is necessary
624  // but it resolves issues for CUDA 11.6
625 #if 1
626  int64_t k = __ffs(b) - 1;
627 #else
628  int k = __ffs(b) - 1;
629 #endif
630  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
631  // remove 1 bit and advance jtile0
632  b ^= ((unsigned int)1 << k);
633  jtile0++;
634  }
635  } else {
636  jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
637  }
638  }
639  }
640  }
641 
642 }
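// Illustration of the compaction path above (jtiles != NULL): each 32-tile
// chunk reads a keep flag per j-tile from jtiles[] (nonzero = keep), a
// warp-wide ExclusiveSum turns those flags into packed offsets from jtile0,
// and WARP_BALLOT/__popc (or the per-bit loop when exclusions are copied)
// advances jtile0 by the number of surviving tiles before the next chunk.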
643 
644 //
645 // NOTE: Executed on a single thread block
646 // oobKey = out-of-bounds key value
647 //
648 #define SORTTILELISTSKERNEL_NUM_THREAD 512
649 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
650 template <typename keyT, typename valT, bool ascend>
651 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
652 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
653  const int begin_bit, const int end_bit, const keyT oobKey,
654  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
655  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
656 
657  typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
658  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
659 
660  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
661  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
662 
663  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
664  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;
665 
666  __shared__ union {
667  typename BlockLoad::TempStorage load;
668  typename BlockLoadU::TempStorage loadU;
669  typename BlockRadixSort::TempStorage sort;
670  } tempStorage;
671 
672  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
673  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
674 
675  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
676  BLOCK_SYNC;
677  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
678  BLOCK_SYNC;
679 
680  if (ascend)
681  BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
682  else
683  BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
684 
685  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
686  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
687 }
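// Illustration: the single-block sort above can handle at most
// SORTTILELISTSKERNEL_NUM_THREAD * SORTTILELISTSKERNEL_ITEMS_PER_THREAD
// = 512 * 22 = 11264 entries; sortTileLists() checks exactly this limit and
// falls back to cub::DeviceRadixSort for longer lists.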
688 
689 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
690  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
691 
692  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
693  {
694  int j = tileListOrder[i];
695  tileListDepthDst[i] = tileListDepthSrc[j];
696  }
697 
698 }
699 
700 //
701 // Bit shift tileListDepth so that only lower 16 bits are used
702 //
703 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
704  unsigned int* __restrict__ tileListDepth) {
705 
706  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
707  {
708  unsigned int a = tileListDepth[itileList];
709  a >>= begin_bit;
710  a &= 65535;
711  tileListDepth[itileList] = a;
712  }
713 
714 }
715 
716 // ##############################################################################################
717 // ##############################################################################################
718 // ##############################################################################################
719 
720 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
721 deviceID(deviceID), doStreaming(doStreaming) {
722 
723  cudaCheck(cudaSetDevice(deviceID));
724 
725  activeBuffer = 1;
726 
727  numPatches = 0;
728  numComputes = 0;
729 
730  cudaPatches = NULL;
731  cudaPatchesSize = 0;
732 
733  cudaComputes = NULL;
734  cudaComputesSize = 0;
735 
736  patchNumLists = NULL;
737  patchNumListsSize = 0;
738 
739  emptyPatches = NULL;
740  emptyPatchesSize = 0;
741  h_emptyPatches = NULL;
742  h_emptyPatchesSize = 0;
743  numEmptyPatches = 0;
744 
745  sortKeySrc = NULL;
746  sortKeySrcSize = 0;
747  sortKeyDst = NULL;
748  sortKeyDstSize = 0;
749 
750  tileLists1 = NULL;
751  tileLists1Size = 0;
752  tileLists2 = NULL;
753  tileLists2Size = 0;
754 
755  patchPairs1 = NULL;
756  patchPairs1Size = 0;
757  patchPairs2 = NULL;
758  patchPairs2Size = 0;
759 
760  tileJatomStart1 = NULL;
761  tileJatomStart1Size = 0;
762  tileJatomStart2 = NULL;
763  tileJatomStart2Size = 0;
764 
765  boundingBoxes = NULL;
766  boundingBoxesSize = 0;
767 
768  tileListDepth1 = NULL;
769  tileListDepth1Size = 0;
770  tileListDepth2 = NULL;
771  tileListDepth2Size = 0;
772 
773  tileListOrder1 = NULL;
774  tileListOrder1Size = 0;
775  tileListOrder2 = NULL;
776  tileListOrder2Size = 0;
777 
778  tileExcls1 = NULL;
779  tileExcls1Size = 0;
780  tileExcls2 = NULL;
781  tileExcls2Size = 0;
782 
783  xyzq = NULL;
784  xyzqSize = 0;
785  part = NULL;
786  partSize = 0;
787 
788  allocate_device<TileListStat>(&d_tileListStat, 1);
789  allocate_host<TileListStat>(&h_tileListStat, 1);
790 
791  tileListPos = NULL;
792  tileListPosSize = 0;
793  tempStorage = NULL;
794  tempStorageSize = 0;
795 
796  jtiles = NULL;
797  jtilesSize = 0;
798 
799  tilePos = NULL;
800  tilePosSize = 0;
801 
802  tileListsGBIS = NULL;
803  tileListsGBISSize = 0;
804 
805  tileJatomStartGBIS = NULL;
806  tileJatomStartGBISSize = 0;
807 
808  tileListVirialEnergy = NULL;
809  tileListVirialEnergySize = 0;
810 
811  atomStorageSize = 0;
812  numTileLists = 0;
813  numTileListsGBIS = 0;
814  numJtiles = 1;
815 
816  outputOrder = NULL;
817  outputOrderSize = 0;
818  doOutputOrder = false;
819 
820  minmaxListLen = NULL;
821  minmaxListLenSize = 0;
822 
823  sortKeys = NULL;
824  sortKeysSize = 0;
825  sortKeys_endbit = 0;
826 
827  cudaCheck(cudaEventCreate(&tileListStatEvent));
828  tileListStatEventRecord = false;
829 }
830 
831 CudaTileListKernel::~CudaTileListKernel() {
832  cudaCheck(cudaSetDevice(deviceID));
833  deallocate_device<TileListStat>(&d_tileListStat);
834  deallocate_host<TileListStat>(&h_tileListStat);
835  //
836  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
837  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
838  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
839  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
840  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
841  //
842  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
843  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
844  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
845  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
846  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
847  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
848  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
849  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
850  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
851  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
852  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
853  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
854  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
855  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
856  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
857  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
858  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
859  if (jtiles != NULL) deallocate_device<int>(&jtiles);
860  if (tilePos != NULL) deallocate_device<int>(&tilePos);
861 
862  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
863  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
864 
865  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
866 
867  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
868  if (part != NULL) deallocate_device<char>(&part);
869 
870  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
871  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
872 
873  cudaCheck(cudaEventDestroy(tileListStatEvent));
874 }
875 
876 void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
877  clear_device_array<int>(jtiles, numJtiles, stream);
878 }
879 
880 void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
881  // Clear tileListStat; patchReadyQueueCount is set equal to the number of empty patches
882  memset(h_tileListStat, 0, sizeof(TileListStat));
883  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
884  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
885 }
886 
887 void CudaTileListKernel::finishTileList(cudaStream_t stream) {
888  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
889  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
890  tileListStatEventRecord = true;
891 }
892 
893 void CudaTileListKernel::updateComputes(const int numComputesIn,
894  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
895 
896  numComputes = numComputesIn;
897 
898  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
899  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
900 
901  if (doStreaming) doOutputOrder = true;
902 }
903 
904 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
905  const TileList* d_tileLists, cudaStream_t stream) {
906 
907  TileList* h_tileLists = new TileList[numTileLists];
908  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
909  cudaCheck(cudaStreamSynchronize(stream));
910  FILE* handle = fopen(filename,"wt");
911  for (int itileList=0;itileList < numTileLists;itileList++) {
912  TileList tmp = h_tileLists[itileList];
913  fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
914  tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
915  tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
916  }
917  fclose(handle);
918  delete [] h_tileLists;
919 }
920 
921 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
922  const int* d_tileJatomStart, cudaStream_t stream) {
923 
924  int* h_tileJatomStart = new int[numJtiles];
925  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
926  cudaCheck(cudaStreamSynchronize(stream));
927  FILE* handle = fopen(filename,"wt");
928  for (int i=0;i < numJtiles;i++) {
929  fprintf(handle, "%d\n", h_tileJatomStart[i]);
930  }
931  fclose(handle);
932  delete [] h_tileJatomStart;
933 }
934 
935 /*
936 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
937 {
938  return std::pair<int, int>(p.second, p.first);
939 }
940 
941 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
942  const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
943 
944  const int shCacheSize = 10;
945  TileList* h_tileLists = new TileList[numTileLists];
946  int* h_tileJatomStart = new int[numJtiles];
947  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
948  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
949  cudaCheck(cudaStreamSynchronize(stream));
950  int ntotal = 0;
951  int ncache = 0;
952  for (int i=0;i < numTileLists;i+=width) {
953  int jend = min(i + width, numTileLists);
954  std::map<int, int> atomStartMap;
955  std::map<int, int>::iterator it;
956  atomStartMap.clear();
957  for (int j=i;j < jend;j++) {
958  TileList tmp = h_tileLists[j];
959  int iatomStart = tmp.iatomStart;
960  it = atomStartMap.find(iatomStart);
961  if (it == atomStartMap.end()) {
962  // Insert new
963  atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
964  } else {
965  // Decrease counter
966  it->second--;
967  }
968  int jtileStart = tmp.jtileStart;
969  int jtileEnd = tmp.jtileEnd;
970  ntotal += (jtileEnd - jtileStart + 1) + 1;
971  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
972  int jatomStart = h_tileJatomStart[jtile];
973  it = atomStartMap.find(jatomStart);
974  if (it == atomStartMap.end()) {
975  // Insert new
976  atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
977  } else {
978  // Decrease counter
979  it->second--;
980  }
981  jatomStart |= (65535 << 16);
982  h_tileJatomStart[jtile] = jatomStart;
983  }
984  iatomStart |= (65535 << 16);
985  tmp.iatomStart = iatomStart;
986  h_tileLists[j] = tmp;
987  }
988  ncache += atomStartMap.size();
989  std::multimap<int, int> imap;
990  imap.clear();
991  std::multimap<int, int>::iterator imap_it;
992  std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
993  if (i < 400) {
994  printf("%d %d\n", ntotal, imap.size());
995  for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
996  if (imap_it->first != 0)
997  printf("(%d %d)\n", imap_it->first, imap_it->second);
998  }
999  }
1000  }
1001  printf("ntotal %d ncache %d\n", ntotal, ncache);
1002  copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
1003  copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
1004  cudaCheck(cudaStreamSynchronize(stream));
1005  delete [] h_tileLists;
1006  delete [] h_tileJatomStart;
1007 }
1008 */
1009 
1010 void CudaTileListKernel::prepareBuffers(
1011  int atomStorageSizeIn, int numPatchesIn,
1012  const CudaPatchRecord* h_cudaPatches,
1013  cudaStream_t stream
1014 ) {
1015  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSizeIn, OVERALLOC);
1016 
1017  // Copy cudaPatches to device
1018  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatchesIn);
1019  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatchesIn, stream);
1020 }
1021 
1022 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
1023  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
1024  const float3 lata, const float3 latb, const float3 latc,
1025  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
1026  const float plcutoff2In, const size_t maxShmemPerBlock,
1027  cudaStream_t stream, const bool atomsChanged, const bool allocatePart,
1028  bool CUDASOAintegrator, bool deviceMigration) {
1029 
1030  numPatches = numPatchesIn;
1031  atomStorageSize = atomStorageSizeIn;
1032  maxTileListLen = maxTileListLenIn;
1033  plcutoff2 = plcutoff2In;
1034 
1035  if (doStreaming) {
1036  // Re-allocate patchNumLists
1037  reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1038  reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1039  reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1040  }
1041 
1042  // Re-allocate (tileLists1, patchPairs1)
1043  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1044  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1045 
1046  // Copy cudaPatches to device
1047  // If DeviceMigration is on then this happens in prepareBuffers()
1048  if (!CUDASOAintegrator || !deviceMigration) {
1049  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1050  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1051  }
1052 
1053  // Re-allocate temporary storage
1054  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1055  // Calculate tile list positions (tilePos)
1056  {
1057  int nthread = 1024;
1058  int nblock = 1;
1059  calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
1060  (numComputes, cudaComputes, cudaPatches, tilePos);
1061  cudaCheck(cudaGetLastError());
1062  }
1063 
1064  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1065  {
1066  int nthread = 512;
1067  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/32)+1);
1068  updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
1069  (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1070  cudaCheck(cudaGetLastError());
1071  }
1072 
1073  // ---------------------------------------------------------------------------------------------
1074 
1075 
1076  // NOTE: tileListDepth2 and tileListOrder2 must be at least the same size as
1077  // tileListDepth1 and tileListOrder1 since they're used in sorting
1078  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1079  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1080 
1081  // Allocate with +1 to include last term in the exclusive sum
1082  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1083 
1084  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1085 
1086  // deviceMigration can be true while CUDASOAintegrator is false when minimization is being
1087  // performed, so this copy must happen if either CUDASOAintegrator or deviceMigration
1088  // is false
1089  if (!CUDASOAintegrator || !deviceMigration) {
1090  if(atomsChanged) reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1091  if(!CUDASOAintegrator || atomsChanged) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1092  }
1093 
1094  // This should be allocated for FEP only
1095  if(allocatePart) reallocate_device<char>(&part, &partSize, atomStorageSize, OVERALLOC);
1096 
1097  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1098  {
1099  int numBoundingBoxes = atomStorageSize/WARPSIZE;
1100  reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1101 
1102  int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1103  int nthread = WARPSIZE*nwarp;
1104  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1105  buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1106  cudaCheck(cudaGetLastError());
1107  }
1108 
1109  {
1110  int nwarp = TILELISTKERNELNEW_NUM_WARP;
1111  int nthread = WARPSIZE*nwarp;
1112  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1113 
1114  int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1115  if(shmem_size > maxShmemPerBlock){
1116  NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1117  }
1118 
1119  // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
1120  // tell the required size in h_tileListStat->numJtiles. In subsequent calls,
1121  // re-allocation only happens when the size is exceeded.
1122  h_tileListStat->tilesSizeExceeded = true;
1123  int reallocCount = 0;
1124  while (h_tileListStat->tilesSizeExceeded) {
1125  reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1126 
1127  clearTileListStat(stream);
1128  // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1129 
1130  buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1131  (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1132  lata, latb, latc, plcutoff2, maxTileListLen,
1133  boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1134  tileListDepth1, tileListOrder1, patchPairs1,
1135  d_tileListStat);
1136 
1137  cudaCheck(cudaGetLastError());
1138 
1139  // get (numATileLists, numJtiles, tilesSizeExceeded)
1140  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1141  cudaCheck(cudaStreamSynchronize(stream));
1142  numJtiles = h_tileListStat->numJtiles;
1143 
1144  if (h_tileListStat->tilesSizeExceeded) {
1145  reallocCount++;
1146  if (reallocCount > 1) {
1147  NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1148  }
1149  }
1150 
1151  }
1152 
1153  numTileLists = h_tileListStat->numTileLists;
1154 
1155  reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1156  }
1157 
1158  // Re-allocate tileListVirialEnergy.
1159  // NOTE: Since numTileLists here is an upper estimate (it's based on bounding boxes),
1160  // we're guaranteed to have enough space
1161  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1162 
1163  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1164  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1165  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1166  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1167  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1168 
1169  int numTileListsSrc = numTileListsPrev;
1170  int numJtilesSrc = numJtiles;
1171  int numTileListsDst = numTileLists;
1172  int numJtilesDst = numJtiles;
1173 
1174  // Sort tiles
1175  sortTileLists(
1176  false,
1177  0, false,
1178  numTileListsSrc, numJtilesSrc,
1179  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1180  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1181  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1182  numTileListsDst, numJtilesDst,
1183  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1184  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1185  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1186  stream);
1187 
1188  // Set active buffer to 2
1189  setActiveBuffer(2);
1190 
1191  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1192 }
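// Illustration of the double buffering above: the "1" buffers (tileLists1,
// tileJatomStart1, tileListDepth1, ...) hold the raw lists produced from the
// bounding boxes, sortTileLists() writes the compacted and sorted result into
// the "2" buffers, and setActiveBuffer(2) presumably points the subsequent
// non-bonded kernels at those sorted buffers.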
1193 
1194 //
1195 // Returns the number of bits needed to represent a (floor(log2(a)) + 1 for a >= 1)
1196 //
1197 int ilog2(int a) {
1198  // if (a < 0)
1199  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1200  int k = 1;
1201  while (a >>= 1) k++;
1202  return k;
1203 }
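// Illustration: the loop above effectively counts the bits used by a, e.g.
// ilog2(1) = 1, ilog2(3) = 2, ilog2(4) = 3. Callers use it to size radix-sort
// bit ranges (num_bit = ilog2(maxTileListLen), sortKeys_endbit =
// ilog2(numTileListsDst)), where one extra bit is harmless.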
1204 
1205 //
1206 // Sort tile lists
1207 //
1208 void CudaTileListKernel::sortTileLists(
1209  const bool useJtiles,
1210  const int begin_bit, const bool highDepthBitsSetIn,
1211  // Source
1212  const int numTileListsSrc, const int numJtilesSrc,
1213  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1214  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1215  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1216  // Destination
1217  const int numTileListsDst, const int numJtilesDst,
1218  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1219  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1220  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1221  cudaStream_t stream) {
1222 
1223  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1224 
1225  // if (numTileListsDst == 0)
1226  // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1227 
1228  // Check that the array sizes are adequate
1229  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1230  numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1231  (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1232  (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1233  NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1234 
1235  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1236  numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1237  (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1238  (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1239  NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1240 
1241  if (begin_bit != 0 && begin_bit != 16)
1242  NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1243 
1244  // Number of bits needed in the sort
1245  int num_bit = ilog2(maxTileListLen);
1246  if (num_bit > 16)
1247  NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1248  int end_bit = begin_bit + num_bit;
1249 
1250  if (doStreaming)
1251  {
1252  // ----------------------------------------------------------------------------------------
1253  if (doOutputOrder && useJtiles) {
1254  // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1255  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1256 
1257  // Calculate position from depth
1258  {
1259  // -----------------------------------------------------------------------------
1260  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1261  // -----------------------------------------------------------------------------
1262  if (doShiftDown)
1263  {
1264  int nthread = 1024;
1265  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1266  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1267  (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1268  cudaCheck(cudaGetLastError());
1269  }
1270 
1271  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1272 
1273  // --------------------------------------------------------------------
1274  // Compute itileList positions to store tileLists
1275  // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
1276  // --------------------------------------------------------------------
1277  {
1278  size_t size = 0;
1279  cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1280  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1281  // Make sure tempStorage doesn't remain NULL
1282  if (size == 0) size = 128;
1283  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1284  size = tempStorageSize;
1285  cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1286  (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1287  }
1288  }
1289 
1290  // Store in reverse order from outputOrder
1291  {
1292  int nthread = 1024;
1293  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1294  storeInReverse <<< nblock, nthread, 0, stream >>>
1295  (numTileListsSrc, begin_bit, outputOrder, tileListPos,
1296  tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1297  tileListOrderDst.ptr, tileListDepthDst.ptr);
1298  cudaCheck(cudaGetLastError());
1299  }
1300 
1301  // Build sortKeys
1302  {
1303  maxTileListLen_sortKeys = maxTileListLen;
1304 
1305  reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1306  clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1307 
1308  // Re-allocate and initialize minmaxListLen
1309  {
1310  reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1311 
1312  int nthread = 1024;
1313  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1314  initMinMaxListLen <<< nblock, nthread, 0, stream >>>
1315  (numComputes, maxTileListLen, minmaxListLen);
1316  cudaCheck(cudaGetLastError());
1317  }
1318 
1319  // Build sortKeys and calculate minmaxListLen
1320  {
1321  int nthread = 1024;
1322  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1323  buildSortKeys <<< nblock, nthread, 0, stream >>>
1324  (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1325  tileListDepthDst.ptr, minmaxListLen, sortKeys);
1326  cudaCheck(cudaGetLastError());
1327 
1328  // Maximum value in sortKeys[] is numTileListsDst - 1
1329  sortKeys_endbit = ilog2(numTileListsDst);
1330  }
1331 
1332  // Fill in missing sortKeys using minmaxListLen
1333  {
1334  int nthread = 1024;
1335  int nwarp = nthread/WARPSIZE;
1336  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1337  fillSortKeys <<< nblock, nthread, 0, stream >>>
1338  (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1339  cudaCheck(cudaGetLastError());
1340  }
1341 
1342  }
1343 
1344  doOutputOrder = false;
1345 
1346  } else if (doOutputOrder) {
1347  // outputOrder will be produced by the next pairlist non-bonded kernel.
1348  // This time just remove zero-length lists
1349  // NOTE: This is done very infrequently, typically only once when the MD run starts.
1350 
1351  int endbit_tmp = ilog2(numTileListsSrc);
1352 
1353  // Remove zeros
1354  {
1355  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1356  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1357 
1358  int nthread = 1024;
1359  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1360  buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
1361  (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1362  cudaCheck(cudaGetLastError());
1363  }
1364 
1365  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1366  {
1367  // Short list, sort within a single thread block
1368  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1369  int nblock = 1;
1370 
1371  unsigned int oobKey = numTileListsSrc;
1372  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1373  (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1374  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1375  cudaCheck(cudaGetLastError());
1376  }
1377  else
1378  {
1379  // Long list, sort on multiple thread blocks
1380  size_t size = 0;
1381  cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1382  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1383  numTileListsSrc, 0, endbit_tmp, stream));
1384  // Make sure tempStorage doesn't remain NULL
1385  if (size == 0) size = 128;
1386  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1387  size = tempStorageSize;
1388  cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1389  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1390  numTileListsSrc, 0, endbit_tmp, stream));
1391  }
1392 
1393  // Re-order tileListDepth using tileListOrderDst
1394  {
1395  int nthread = 1024;
1396  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1397  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1398  (numTileListsDst, tileListOrderDst.ptr,
1399  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1400  cudaCheck(cudaGetLastError());
1401  }
1402 
1403  } else {
1404  // This is done during regular MD cycle
1405 
1406  if (sortKeys_endbit <= 0)
1407  NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1408 
1409  // Setup sort keys
1410  {
1411  reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1412  reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1413 
1414  int nthread = 1024;
1415  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1416  setupSortKey <<< nblock, nthread, 0, stream >>>
1417  (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1418  cudaCheck(cudaGetLastError());
1419 
1420  }
1421 
1422  // Global sort
1423  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1424  // if (false)
1425  {
1426  // Short list, sort within a single thread block
1427  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1428  int nblock = 1;
1429 
1430  unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1431  sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1432  (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1433  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1434  cudaCheck(cudaGetLastError());
1435  }
1436  else
1437  {
1438  // Long list, sort on multiple thread blocks
1439  size_t size = 0;
1440  cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
1441  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1442  numTileListsSrc, 0, sortKeys_endbit, stream));
1443  // Make sure tempStorage doesn't remain NULL
1444  if (size == 0) size = 128;
1445  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1446  size = tempStorageSize;
1447  cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1448  sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1449  numTileListsSrc, 0, sortKeys_endbit, stream));
1450  }
1451 
1452  // Re-order tileListDepth using tileListOrderDst
1453  {
1454  int nthread = 1024;
1455  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1456  reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1457  (numTileListsDst, tileListOrderDst.ptr,
1458  tileListDepthSrc.ptr, tileListDepthDst.ptr);
1459  cudaCheck(cudaGetLastError());
1460  }
1461 
1462  // Local sort
1463  {
1464  int nthread = 32;
1465  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1466  localSort<32> <<< nblock, nthread, 0, stream >>>
1467  (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1468  cudaCheck(cudaGetLastError());
1469 
1470  // No need to shift any more
1471  doShiftDown = false;
1472  }
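  // NOTE: localSort<32> (defined earlier in this file) appears to refine the
  // ordering within 32-entry windows using the depth bits
  // [begin_bit, begin_bit+num_bit). Setting doShiftDown = false here implies
  // that the kernel also writes the depths back already shifted into the low
  // 16 bits, which is why the bitshiftTileListDepth pass below is skipped in
  // this path.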
1473 
1474  }
1475  // ----------------------------------------------------------------------------------------
1476 
1477  } // (doStreaming)
1478  else
1479  {
1480  // --------------------------------------------------------------------
1481  // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1482  // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1483  // --------------------------------------------------------------------
1484  if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1485  {
1486  // Short list, sort within a single thread block
1487  int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1488  int nblock = 1;
1489 
1490  sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
1491  (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1492  tileListOrderSrc.ptr, tileListOrderDst.ptr);
1493  cudaCheck(cudaGetLastError());
1494 
1495  }
1496  else
1497  {
1498  // Long list, sort on multiple thread blocks
1499  size_t size = 0;
1500  cudaCheck(cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1501  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1502  numTileListsSrc, begin_bit, end_bit, stream));
1503  // Make sure tempStorage doesn't remain NULL
1504  if (size == 0) size = 128;
1505  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1506  size = tempStorageSize;
1507  cudaCheck(cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1508  tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1509  numTileListsSrc, begin_bit, end_bit, stream));
1510  }
1511  }
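  // NOTE: in the non-streaming path the lists are sorted directly by depth,
  // in descending order over key bits [begin_bit, end_bit), so the deepest
  // (longest) tile lists come first in tileListOrderDst. Presumably this
  // schedules the most expensive lists early for better load balance.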
1512 
1513  // -----------------------------------------------------------------------------
1514  // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1515  // -----------------------------------------------------------------------------
1516  if (doShiftDown)
1517  {
1518  int nthread = 1024;
1519  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1520  bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1521  (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1522  cudaCheck(cudaGetLastError());
1523  }
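  // NOTE: bitshiftTileListDepth (defined earlier in this file) presumably
  // reduces each packed entry to a plain tile count, roughly
  //
  //   tileListDepthDst[i] = (tileListDepthDst[i] >> begin_bit) & 65535;
  //
  // so that the exclusive sum below operates on tile counts only.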
1524 
1525  // Allocate with +1 to include last term in the exclusive sum
1526  reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1527 
1528  // --------------------------------------------------------------------
1529  // Compute itileList positions to store tileLists
1530  // ExclusiveSum(tileListDepthDst[0...numTileListsDst+1])
1531  // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1532  // since this is an exclusive sum. But with this trick,
1533  // tileListPos[numTileListsDst] will contain the total number
1534  // of tile lists
1535  // --------------------------------------------------------------------
1536  {
1537  size_t size = 0;
1538  cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
1539  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1540  // Make sure tempStorage doesn't remain NULL
1541  if (size == 0) size = 128;
1542  reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1543  size = tempStorageSize;
1544  // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
1545  // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1546  cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1547  (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1548  }
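  // Worked example of the exclusive sum above: with numTileListsDst = 3 and
  // depths {3, 1, 2}, scanning numTileListsDst+1 = 4 elements yields
  // tileListPos = {0, 3, 4, 6}; the extra entry tileListPos[3] = 6 equals the
  // sum of all depths and marks the end of the compacted tile storage.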
1549 
1550  // --------------------------------------------------------------------
1551  // Re-package to
1552  //   tileListsDst[0 ... numTileListsDst-1]
1553  //   patchPairsDst[]
1554  //   tileJatomStartDst[0 ... numJtilesDst-1]
1555  //   tileExclsDst[0 ... numJtilesDst-1]
1556  // --------------------------------------------------------------------
1557  {
1558  int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1559  int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1560  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1561 
1562  repackTileListsKernel <<< nblock, nthread, 0, stream >>>
1563  (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1564  (useJtiles) ? jtiles : NULL,
1565  tileListsSrc.ptr, tileListsDst.ptr,
1566  patchPairsSrc.ptr, patchPairsDst.ptr,
1567  tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1568  tileExclsSrc.ptr, tileExclsDst.ptr);
1569  cudaCheck(cudaGetLastError());
1570  }
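  // NOTE: the launch geometry above (WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP
  // threads per block, one block per REPACKTILELISTSKERNEL_NUM_WARP lists)
  // suggests repackTileListsKernel assigns one warp per surviving tile list:
  // each warp looks up the list's destination offset in tileListPos and
  // copies its TileList entry, patch pair, tileJatomStart indices and
  // exclusion masks into the compacted destination arrays. The jtiles array
  // is consulted only when useJtiles is set (it is passed as NULL otherwise).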
1571 
1572  // Count the number of tileLists that contribute to each patch
1573  if (doStreaming)
1574  {
1575  clear_device_array<int>(patchNumLists, numPatches, stream);
1576 
1577  // Fill in patchNumLists[0...numPatches-1]
1578  int nthread = 512;
1579  int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1580  calcPatchNumLists <<< nblock, nthread, 0, stream >>>
1581  (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1582  cudaCheck(cudaGetLastError());
1583 
1584  // Use emptyPatches[numPatches] as the count variable
1585  clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1586 
1587  // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1588  // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1589  setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
1590  (numTileListsDst, tileListsDst.ptr, patchNumLists,
1591  numPatches, &emptyPatches[numPatches], emptyPatches);
1592  cudaCheck(cudaGetLastError());
1593 
1594  // Copy emptyPatches[0 ... numPatches] to host
1595  copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1596  cudaCheck(cudaStreamSynchronize(stream));
1597  numEmptyPatches = h_emptyPatches[numPatches];
1598  }
1599 
1600 }
1601 
1602 //
1603 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
1604 //
1605 void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
1606  // Store previous number of active lists
1607  int numTileListsPrev = numTileLists;
1608 
1609  // Wait for finishTileList() to stop copying
1610  if (!tileListStatEventRecord)
1611  NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1612  cudaCheck(cudaEventSynchronize(tileListStatEvent));
1613 
1614  // Get numTileLists, numTileListsGBIS, and numExcluded
1615  {
1616  numTileLists = h_tileListStat->numTileLists;
1617  numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1618  numExcluded = h_tileListStat->numExcluded;
1619  }
1620 
1621  // Sort {tileLists2, tileJatomStart2, tileExcls2} => {tileLists1, tileJatomStart1, tileExcls1}
1622  // VdW tile list in {tileLists1, tileJatomStart1, tileExcls1}
1623  sortTileLists(true, 0, true,
1624  numTileListsPrev, numJtiles,
1625  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1626  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1627  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1628  numTileLists, numJtiles,
1629  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1630  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1631  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1632  stream);
1633 
1634  // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1635  // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1636  // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1637 
1638  // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1639 
1640  // NOTE:
1641  // Only {tileLists1, tileJatomStart1, tileExcls1} are used from here on;
1642  // {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1643 
1644  if (doGBIS) {
1645  // GBIS is used => produce a second tile list
1646  // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1647  reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1648  reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1649 
1650  sortTileLists(true, 16, true,
1651  numTileListsPrev, numJtiles,
1652  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1653  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1654  PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1655  numTileListsGBIS, numJtiles,
1656  PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1657  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1658  PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1659  stream);
1660  }
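  // NOTE: the only difference between the GBIS sort above and the VdW sort
  // earlier in this function is begin_bit = 16 instead of 0, which indicates
  // that the VdW depth and the GBIS depth are packed into the low and high
  // 16 bits of tileListDepth2, respectively; each sortTileLists call
  // extracts its own 16-bit field.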
1661 
1662  // Set active buffer to be 1
1663  setActiveBuffer(1);
1664 
1665 }
1666 
1667 /*
1668 //
1669 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1670 //
1671 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1672  return;
1673 
1674  if (!doStreaming || !doOutputOrder)
1675  return;
1676 
1677  // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1678  // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1679  sortTileLists(false, 0, true,
1680  numTileLists, numJtiles,
1681  PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1682  PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1683  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1684  numTileLists, numJtiles,
1685  PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1686  PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1687  PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1688  stream);
1689 
1690  // Set active buffer to be 2
1691  setActiveBuffer(2);
1692 
1693 }
1694 */
1695 
1696 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
1697  if (len > tileListVirialEnergySize) {
1698  NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1699  }
1700  tileListVirialEnergyLength = len;
1701 }
1702 
1703 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
1704  if (len > tileListVirialEnergySize) {
1705  NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1706  }
1707  tileListVirialEnergyGBISLength = len;
1708 }
1709 
1710 #endif // NAMD_CUDA