2 // #include <algorithm>
3 #if defined(NAMD_HIP) //NAMD_HIP
4 #include <hip/hip_runtime.h>
5 #include <hipcub/hipcub.hpp>
10 #include "CudaTileListKernel.hip.h"
11 #include "CudaComputeNonbondedKernel.hip.h"
12 #include "DeviceCUDA.h"
14 #if defined(NAMD_HIP) //NAMD_HIP
17 #define __thread __declspec(thread)
19 extern __thread DeviceCUDA *deviceCUDA;
21 #define OVERALLOC 1.2f
22 #define DEFAULTKERNEL_NUM_THREAD 256
23 #define UPDATEPATCHESKERNEL_NUM_THREAD 256
24 #define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 256
25 #define BOUNDINGBOXKERNEL_NUM_WARP 4
28 void NAMD_die(const char *);
31 // Calculate the number of lists that contribute to each patch
33 __global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
34 const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {
36 for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
38 int2 patchInd = tileLists[i].patchInd;
39 atomicAdd(&patchNumLists[patchInd.x], 1);
40 if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
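// Worked example: a tile list whose patchInd is (p, q) increments
// patchNumLists[p] and, when q != p, patchNumLists[q] as well, so after the
// kernel patchNumLists[p] holds the number of tile lists touching patch p.
// The grid-stride loop (i += blockDim.x*gridDim.x) lets any launch size cover
// all numTileLists entries.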
46 // Write patchNumList back to tile list and
47 // store the indices of empty patches in emptyPatches[0 ... numEmptyPatches-1]
49 __global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
50 TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
51 const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {
53 for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
55 int2 patchInd = tileLists[i].patchInd;
56 int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
57 tileLists[i].patchNumList = patchNumList;
60 for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
62 if (patchNumLists[i] == 0) {
63 int ind = atomicAdd(numEmptyPatches, 1);
64 emptyPatches[ind] = i;
71 // Builds a sort key that removes zeros but keeps the order otherwise the same
73 __global__ void buildRemoveZerosSortKey(const int numTileLists,
74 const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {
76 for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
78 int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
79 sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
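// Worked example (numTileLists = 4, begin_bit = 0):
//   depths = {2, 0, 5, 1}  ->  sortKey = {0, 4, 2, 3}
// Zero-depth lists get the key numTileLists (= 4), so an ascending sort moves
// them to the back while the relative order of the non-zero lists is preserved.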
84 __global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
85 const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
86 const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {
88 for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
90 int icompute = tileLists[itileList].icompute;
91 int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
92 int i = icompute*maxTileListLen + (depth - 1);
93 sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
99 __global__ void localSort(const int n, const int begin_bit, const int num_bit,
100 unsigned int* __restrict__ keys, int* __restrict__ vals) {
102 // NOTE: blockDim.x = width
104 for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
106 int i = base + threadIdx.x;
107 typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
108 __shared__ typename BlockRadixSort::TempStorage tempStorage;
109 unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
110 int val[1] = {(i < n) ? vals[i] : 0};
111 BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
121 __global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
122 const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
123 const int* __restrict__ tileListOrderSrc,
124 const unsigned int* __restrict__ tileListDepthSrc,
125 int* __restrict__ tileListOrderDst,
126 unsigned int* __restrict__ tileListDepthDst) {
128 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
130 int j = outputOrder[numTileListsSrc - i - 1];
131 if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
132 int k = tileListPos[i];
133 tileListDepthDst[k] = tileListDepthSrc[j];
134 tileListOrderDst[k] = j; //tileListOrderSrc[j];
140 // Bit shift tileListDepth so that only lower 16 bits are used
142 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
143 const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
144 unsigned int* __restrict__ tileListDepthDst) {
146 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
148 int j = outputOrder[numTileLists - i - 1];
149 tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
154 __global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
155 int2* __restrict__ minmaxListLen) {
158 val.x = maxTileListLen+1;
160 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
162 minmaxListLen[i] = val;
168 // Build sortKeys[], values are in range 0 ... numTileListsDst-1
170 __global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
171 const TileList* __restrict__ tileListsSrc,
172 const int* __restrict__ tileListOrderDst,
173 const unsigned int* __restrict__ tileListDepthDst,
174 int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
176 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
178 int k = tileListOrderDst[i];
179 int icompute = tileListsSrc[k].icompute;
180 int depth = tileListDepthDst[i] & 65535;
181 // depth is in range [1 ... maxTileListLen]
182 int j = icompute*maxTileListLen + (depth-1);
184 int2 minmax = minmaxListLen[icompute];
185 int2 minmaxOrig = minmax;
186 if (minmax.x > depth) minmax.x = depth;
187 if (minmax.y < depth) minmax.y = depth;
188 if (minmax.x != minmaxOrig.x) {
189 atomicMin(&minmaxListLen[icompute].x, minmax.x);
191 if (minmax.y != minmaxOrig.y) {
192 atomicMax(&minmaxListLen[icompute].y, minmax.y);
198 __global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
199 const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {
201 for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
202 const int wid = threadIdx.x % WARPSIZE;
203 int2 minmax = minmaxListLen[i];
204 int minlen = minmax.x;
205 int maxlen = minmax.y;
206 // minlen, maxlen are in range [1 ... maxTileListLen]
207 // as long as compute i appears in tileListsSrc[].icompute above
208 if ( maxlen < minlen ) {
210 maxlen = maxTileListLen;
212 unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
213 unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
214 unsigned int aveKey = (maxKey + minKey)/2;
215 for (int j=wid;j < minlen-1;j+=WARPSIZE) {
216 sortKeys[i*maxTileListLen + j] = minKey;
218 for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
219 sortKeys[i*maxTileListLen + j] = maxKey;
221 for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
222 if (sortKeys[i*maxTileListLen + j] == 0) {
223 sortKeys[i*maxTileListLen + j] = aveKey;
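// Worked example for one compute (illustrative numbers): with maxTileListLen = 8,
// minlen = 3 and maxlen = 6, keys already exist somewhere in slots [2..5] of this
// compute's sortKeys segment. fillSortKeys copies minKey (slot 2) into slots [0..1],
// maxKey (slot 5) into slots [6..7], and writes aveKey = (minKey + maxKey)/2 into
// any slot still zero, so every possible list length maps to a defined key.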
231 // Calculate bounding boxes for sets of BOUNDINGBOXSIZE atoms
235 buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
236 BoundingBox* __restrict__ boundingBoxes) {
238 const int warpId = threadIdx.x / BOUNDINGBOXSIZE;
239 const int wid = threadIdx.x % BOUNDINGBOXSIZE;
241 // Loop with warp-aligned index to avoid warp-divergence
242 for (int iwarp = warpId*BOUNDINGBOXSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
244 const int i = iwarp + wid;
245 // Bounding box index
246 const int ibb = i/BOUNDINGBOXSIZE;
248 float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
250 volatile float3 minxyz, maxxyz;
252 typedef cub::BlockReduce<float, BOUNDINGBOXSIZE> BlockReduce;
253 __shared__ typename BlockReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
254 minxyz.x = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
255 minxyz.y = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
256 minxyz.z = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
257 maxxyz.x = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
258 maxxyz.y = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
259 maxxyz.z = BlockReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
262 BoundingBox boundingBox;
263 boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
264 boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
265 boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
266 boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
267 boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
268 boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
269 boundingBoxes[ibb] = boundingBox;
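// The box is stored in center/half-width form: every atom of the tile satisfies
// |x - boundingBox.x| <= boundingBox.wx (and likewise for y, z). Minimal host-side
// sketch of the same computation for one coordinate (illustrative only; xs[] and n
// are assumed names, and the block is not compiled):
#if 0
float xmin = xs[0], xmax = xs[0];
for (int k = 1; k < n; k++) { xmin = fminf(xmin, xs[k]); xmax = fmaxf(xmax, xs[k]); }
float cx = 0.5f*(xmin + xmax);   // corresponds to boundingBox.x
float wx = 0.5f*(xmax - xmin);   // corresponds to boundingBox.wx
#endif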
276 // Returns the lower estimate for the distance between two bounding boxes
278 __device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
279 float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
280 float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
281 float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
282 float r2 = dx*dx + dy*dy + dz*dz;
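// Worked example: boxes centered 10.0 apart in x with half-widths 2.0 and 3.0,
// overlapping in y and z: dx = max(0, 10.0 - 2.0 - 3.0) = 5.0, dy = dz = 0,
// so distsq = 25.0. Overlapping boxes clamp every term to zero, hence the result
// is a lower bound on the squared distance between any atom pair from the two
// tiles and can be compared directly against cutoff2 (plcutoff2).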
288 // Performs warp-level exclusive sum on a shared memory array:
289 // sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
290 // sh_in and sh_out can point to same array
291 // Returns the total sum
293 template <typename T>
294 __device__ __forceinline__
295 int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
296 const int wid = threadIdx.x % WARPSIZE;
297 volatile int blockOffset = 0;
298 for (int iblock=0;iblock < n;iblock += WARPSIZE) {
299 // Size of the exclusive sum
300 int blockLen = min(WARPSIZE, n-iblock);
301 // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
302 typedef cub::WarpScan<int> WarpScan;
303 __shared__ typename WarpScan::TempStorage tempStorage;
304 int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
305 WarpScan(tempStorage).ExclusiveSum(data, data);
306 // Shift by block offset
309 int last = (int)sh_in[iblock + blockLen-1];
311 if (wid < blockLen) sh_out[iblock + wid] = data;
312 // block offset = last term of the exclusive sum + last value
313 blockOffset = sh_out[iblock + blockLen-1] + last;
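// Worked example (n = 3, a single WARPSIZE block of the loop): sh_in = {3, 1, 4}
//   ExclusiveSum over the block  ->  data = {0, 3, 4}
//   sh_out[0..2] = {0, 3, 4},  blockOffset = sh_out[2] + sh_in[2] = 4 + 4 = 8
// The total sum 8 is returned; for n > WARPSIZE the next block of entries is
// shifted by this running blockOffset (see the comment above).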
319 #define TILELISTKERNELNEW_NUM_WARP 4
322 // NOTE: Executed on a single thread block
324 template<int nthread>
325 __global__ void calcTileListPosKernel(const int numComputes,
326 const CudaComputeRecord* __restrict__ computes,
327 const CudaPatchRecord* __restrict__ patches,
328 int* __restrict__ tilePos) {
330 typedef cub::BlockScan<int, nthread> BlockScan;
332 __shared__ typename BlockScan::TempStorage tempStorage;
333 __shared__ int shTilePos0;
335 if (threadIdx.x == nthread-1) {
339 for (int base=0;base < numComputes;base+=nthread) {
340 int k = base + threadIdx.x;
342 int numTiles1 = (k < numComputes) ?
343 (CudaComputeNonbondedKernel::computeNumTiles(patches[computes[k].patchInd.x].numAtoms, BOUNDINGBOXSIZE)) : 0;
345 // Calculate positions in tile list and jtile list
347 BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
349 // Store into global memory
350 if (k < numComputes) {
351 tilePos[k] = shTilePos0 + tilePosVal;
355 // Store block end position
356 if (threadIdx.x == nthread-1) {
357 shTilePos0 += tilePosVal + numTiles1;
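// Worked example: with per-compute tile counts numTiles1 = {4, 2, 3} the
// exclusive sum gives tilePos = {0, 4, 6}, so compute k owns tile lists
// [tilePos[k] .. tilePos[k] + numTiles1[k] - 1]; shTilePos0 carries the running
// total across successive chunks of nthread computes.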
363 template<int nthread>
364 __global__ void updatePatchesKernel(const int numComputes,
365 const int* __restrict__ tilePos,
366 const CudaComputeRecord* __restrict__ computes,
367 const CudaPatchRecord* __restrict__ patches,
368 TileList* __restrict__ tileLists) {
370 const int tid = threadIdx.x % nthread;
372 // nthread threads take care of one compute
373 for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
375 CudaComputeRecord compute = computes[k];
376 float3 offsetXYZ = compute.offsetXYZ;
377 int2 patchInd = compute.patchInd;
378 int numTiles1 = CudaComputeNonbondedKernel::computeNumTiles(patches[patchInd.x].numAtoms, BOUNDINGBOXSIZE);
379 int itileList0 = tilePos[k];
380 for (int i=tid;i < numTiles1;i+=nthread) {
381 tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
382 tileLists[itileList0 + i].patchInd = patchInd;
383 tileLists[itileList0 + i].icompute = k;
389 __host__ __device__ __forceinline__
390 int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
393 maxTileListLen*sizeof(char)
399 buildTileListsBBKernel(const int numTileLists,
400 TileList* __restrict__ tileLists,
401 const CudaPatchRecord* __restrict__ patches,
402 const int* __restrict__ tileListPos,
403 const float3 lata, const float3 latb, const float3 latc,
404 const float cutoff2, const int maxTileListLen,
405 const BoundingBox* __restrict__ boundingBoxes,
406 int* __restrict__ tileJatomStart,
407 const int tileJatomStartSize,
408 unsigned int* __restrict__ tileListDepth,
409 int* __restrict__ tileListOrder,
410 PatchPairRecord* __restrict__ patchPairs,
411 TileListStat* __restrict__ tileListStat
413 HIP_DYNAMIC_SHARED( char, sh_buffer)
414 int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
415 int pos = threadIdx.x*sizePerThread;
416 volatile char* sh_tile = (char*)&sh_buffer[pos];
418 // Loop with warp-aligned index to avoid warp-divergence
419 for (int iwarp = (threadIdx.x/BOUNDINGBOXSIZE)*BOUNDINGBOXSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
421 // Use one thread per tile list
422 const int wid = threadIdx.x % BOUNDINGBOXSIZE;
423 const int itileList = iwarp + wid;
426 int itileListLen = 0;
427 CudaPatchRecord patch1;
428 CudaPatchRecord patch2;
434 if (itileList < numTileLists) {
435 offsetXYZ = tileLists[itileList].offsetXYZ;
436 patchInd = tileLists[itileList].patchInd;
437 icompute = tileLists[itileList].icompute;
439 i = itileList - tileListPos[icompute];
441 float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
442 float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
443 float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
445 // DH - set zeroShift flag if magnitude of shift vector is zero
446 bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
449 patch1 = patches[patchInd.x];
450 patch2 = patches[patchInd.y];
451 // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
452 numTiles2 = CudaComputeNonbondedKernel::computeNumTiles(patch2.numAtoms, BOUNDINGBOXSIZE);
453 int tileStart1 = patch1.atomStart/BOUNDINGBOXSIZE;
454 int tileStart2 = patch2.atomStart/BOUNDINGBOXSIZE;
456 // DH - self requires that zeroShift is also set
457 bool self = zeroShift && (tileStart1 == tileStart2);
459 // Load i-atom data (and shift coordinates)
460 BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
461 boundingBoxI.x += shx;
462 boundingBoxI.y += shy;
463 boundingBoxI.z += shz;
465 for (int j=0;j < numTiles2;j++) {
467 if (!self || j >= i) {
468 BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
469 float r2bb = distsq(boundingBoxI, boundingBoxJ);
470 if (r2bb < cutoff2) {
477 tileListDepth[itileList] = (unsigned int)itileListLen;
478 tileListOrder[itileList] = itileList;
481 typedef cub::BlockScan<int, BOUNDINGBOXSIZE> BlockScan;
482 __shared__ typename BlockScan::TempStorage tempStorage;
483 int active = (itileListLen > 0);
485 BlockScan(tempStorage).ExclusiveSum(active, activePos);
487 BlockScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
488 int jtileStart, numJtiles;
489 // Last thread in the warp knows the total number
490 if (wid == BOUNDINGBOXSIZE-1) {
491 atomicAdd(&tileListStat->numTileLists, activePos + active);
492 numJtiles = itileListPos + itileListLen;
493 jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
495 numJtiles = cub::ShuffleIndex<BOUNDINGBOXSIZE>(numJtiles, BOUNDINGBOXSIZE-1, WARP_FULL_MASK);
496 jtileStart = cub::ShuffleIndex<BOUNDINGBOXSIZE>(jtileStart, BOUNDINGBOXSIZE-1, WARP_FULL_MASK);
497 if (jtileStart + numJtiles > tileJatomStartSize) {
498 // tileJatomStart out of memory, exit
499 if (wid == 0) tileListStat->tilesSizeExceeded = true;
503 int jStart = itileListPos;
504 int jEnd = cub::ShuffleDown<BOUNDINGBOXSIZE>(itileListPos, 1, BOUNDINGBOXSIZE-1, WARP_FULL_MASK);
505 if (wid == BOUNDINGBOXSIZE-1) jEnd = numJtiles;
507 if (itileListLen > 0) {
510 tileLists[itileList].iatomStart = patch1.atomStart + i*BOUNDINGBOXSIZE;
511 tileLists[itileList].jtileStart = jtileStart + jStart;
512 tileLists[itileList].jtileEnd = jtileStart + jEnd - 1;
513 tileLists[itileList].patchInd = patchInd;
514 tileLists[itileList].offsetXYZ = offsetXYZ;
515 tileLists[itileList].icompute = icompute;
516 // TLtmp.patchNumList.x = 0;
517 // TLtmp.patchNumList.y = 0;
518 // tileLists[itileList] = TLtmp;
520 PatchPairRecord patchPair;
521 patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
522 patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
523 patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
524 patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
525 patchPairs[itileList] = patchPair;
528 int jtile = jtileStart + jStart;
529 for (int j=0;j < numTiles2;j++) {
531 tileJatomStart[jtile] = patch2.atomStart + j*BOUNDINGBOXSIZE;
542 #define REPACKTILELISTSKERNEL_NUM_WARP 1
544 repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
545 const int* __restrict__ tileListOrder,
546 const int* __restrict__ jtiles,
547 const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
548 const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
549 const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
550 const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {
552 const int wid = threadIdx.x % BOUNDINGBOXSIZE;
554 // One warp does one tile list
555 for (int i = threadIdx.x/BOUNDINGBOXSIZE + blockDim.x/BOUNDINGBOXSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/BOUNDINGBOXSIZE*gridDim.x)
557 int j = tileListOrder[i];
558 int start = tileListPos[i];
559 int end = tileListPos[i+1]-1;
560 if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
562 int startOld = __ldg(&tileListsSrc[j].jtileStart);
563 int endOld = __ldg(&tileListsSrc[j].jtileEnd);
564 int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
566 offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
567 offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
568 offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
569 int2 patchInd = tileListsSrc[j].patchInd;
570 int icompute = __ldg(&tileListsSrc[j].icompute);
572 // TileList tileList;
573 tileListsDst[i].iatomStart = iatomStart;
574 tileListsDst[i].offsetXYZ = offsetXYZ;
575 tileListsDst[i].jtileStart = start;
576 tileListsDst[i].jtileEnd = end;
577 tileListsDst[i].patchInd = patchInd;
578 tileListsDst[i].icompute = icompute;
579 //tileListsDst[i] = tileList;
582 if (jtiles == NULL) {
583 // No jtiles, simple copy will do
585 for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=BOUNDINGBOXSIZE,jtile+=BOUNDINGBOXSIZE) {
586 if (jtileOld + wid <= endOld) {
587 // this seems to change the values of tileListsDst. Why?
588 tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
591 if (tileExclsSrc != NULL) {
593 for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
594 tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
599 for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=BOUNDINGBOXSIZE) {
600 int t = jtileOld + wid;
601 int jtile = (t <= endOld) ? jtiles[t] : 0;
604 typedef cub::BlockScan<int, BOUNDINGBOXSIZE> BlockScan;
605 __shared__ typename BlockScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
606 int warpId = threadIdx.x / BOUNDINGBOXSIZE;
608 BlockScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
610 if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
612 WarpMask b = NAMD_WARP_BALLOT(WARP_FULL_MASK, jtile);
613 if (tileExclsSrc != NULL) {
615 // k = index of thread that has data
616 #if BOUNDINGBOXSIZE == 64
617 int k = __ffsll(b) - 1;
619 int k = __ffs(b) - 1;
621 tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
622 // remove 1 bit and advance jtile0
623 b ^= ((WarpMask)1 << k);
627 #if BOUNDINGBOXSIZE == 64
628 jtile0 += __popcll(b);
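// Worked example of the ballot/ffs compaction above (four low lanes shown):
// jtile nonzero in lanes {0, 2, 3} gives b = 0b1101. __ffsll/__ffs(b) - 1
// selects the lowest active lane, its exclusion data is copied to slot jtile0,
// the bit is cleared with b ^= ((WarpMask)1 << k), and jtile0 advances; when no
// exclusion data needs copying, jtile0 simply advances by __popcll/__popc(b),
// the number of active lanes remaining.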
640 // NOTE: Executed on a single thread block
641 // oobKey = out-of-bounds key value
643 #define SORTTILELISTSKERNEL_NUM_THREAD 256
644 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 15
646 template <typename keyT, typename valT, bool ascend>
647 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
648 void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
649 const int begin_bit, const int end_bit, const keyT oobKey,
650 keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
651 valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {
653 typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
654 SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;
656 typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
657 SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
659 typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
660 SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;
663 typename BlockLoad::TempStorage load;
664 typename BlockLoadU::TempStorage loadU;
665 typename BlockRadixSort::TempStorage sort;
668 keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
669 valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
671 BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
673 BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
677 BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
679 BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
681 cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
682 cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
685 __global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
686 unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {
688 for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
690 int j = tileListOrder[i];
691 tileListDepthDst[i] = tileListDepthSrc[j];
697 // Bit shift tileListDepth so that only lower 16 bits are used
699 __global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
700 unsigned int* __restrict__ tileListDepth) {
702 for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
704 unsigned int a = tileListDepth[itileList];
707 tileListDepth[itileList] = a;
712 // ##############################################################################################
713 // ##############################################################################################
714 // ##############################################################################################
716 CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
717 deviceID(deviceID), doStreaming(doStreaming) {
719 cudaCheck(cudaSetDevice(deviceID));
730 cudaComputesSize = 0;
732 patchNumLists = NULL;
733 patchNumListsSize = 0;
736 emptyPatchesSize = 0;
737 h_emptyPatches = NULL;
738 h_emptyPatchesSize = 0;
756 tileJatomStart1 = NULL;
757 tileJatomStart1Size = 0;
758 tileJatomStart2 = NULL;
759 tileJatomStart2Size = 0;
761 boundingBoxes = NULL;
762 boundingBoxesSize = 0;
764 tileListDepth1 = NULL;
765 tileListDepth1Size = 0;
766 tileListDepth2 = NULL;
767 tileListDepth2Size = 0;
769 tileListOrder1 = NULL;
770 tileListOrder1Size = 0;
771 tileListOrder2 = NULL;
772 tileListOrder2Size = 0;
784 allocate_device<TileListStat>(&d_tileListStat, 1);
785 allocate_host<TileListStat>(&h_tileListStat, 1);
798 tileListsGBIS = NULL;
799 tileListsGBISSize = 0;
801 tileJatomStartGBIS = NULL;
802 tileJatomStartGBISSize = 0;
804 tileListVirialEnergy = NULL;
805 tileListVirialEnergySize = 0;
809 numTileListsGBIS = 0;
814 doOutputOrder = false;
816 minmaxListLen = NULL;
817 minmaxListLenSize = 0;
823 cudaCheck(cudaEventCreate(&tileListStatEvent));
824 tileListStatEventRecord = false;
827 CudaTileListKernel::~CudaTileListKernel() {
828 cudaCheck(cudaSetDevice(deviceID));
829 deallocate_device<TileListStat>(&d_tileListStat);
830 deallocate_host<TileListStat>(&h_tileListStat);
832 if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
833 if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
834 if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
835 if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
836 if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
838 if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
839 if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
840 if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
841 if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
842 if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
843 if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
844 if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
845 if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
846 if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
847 if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
848 if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
849 if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
850 if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
851 if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
852 if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
853 if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
854 if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
855 if (jtiles != NULL) deallocate_device<int>(&jtiles);
856 if (tilePos != NULL) deallocate_device<int>(&tilePos);
858 if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
859 if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
861 if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
863 if (xyzq != NULL) deallocate_device<float4>(&xyzq);
864 if (part != NULL) deallocate_device<char>(&part);
866 if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
867 if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
869 cudaCheck(cudaEventDestroy(tileListStatEvent));
872 void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
873 clear_device_array<int>(jtiles, numJtiles, stream);
876 void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
877 // Clear tileListStat; patchReadyQueueCount is set equal to the number of empty patches
878 memset(h_tileListStat, 0, sizeof(TileListStat));
879 h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
880 copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
883 void CudaTileListKernel::finishTileList(cudaStream_t stream) {
884 copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
885 cudaCheck(cudaEventRecord(tileListStatEvent, stream));
886 tileListStatEventRecord = true;
889 void CudaTileListKernel::updateComputes(const int numComputesIn,
890 const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {
892 numComputes = numComputesIn;
894 reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
895 copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
897 if (doStreaming) doOutputOrder = true;
900 void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
901 const TileList* d_tileLists, cudaStream_t stream) {
903 TileList* h_tileLists = new TileList[numTileLists];
904 copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
905 cudaCheck(cudaStreamSynchronize(stream));
906 FILE* handle = fopen(filename,"wt");
907 for (int itileList=0;itileList < numTileLists;itileList++) {
908 TileList tmp = h_tileLists[itileList];
909 fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
910 tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
911 tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
914 delete [] h_tileLists;
917 void CudaTileListKernel::writeTileList(FILE* handle, const int numTileLists,
918 const TileList* d_tileLists, cudaStream_t stream) {
920 TileList* h_tileLists = new TileList[numTileLists];
921 copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
922 cudaCheck(cudaStreamSynchronize(stream));
923 for (int itileList=0;itileList < numTileLists;itileList++) {
924 TileList tmp = h_tileLists[itileList];
925 fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
926 tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
927 tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
929 delete [] h_tileLists;
932 void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
933 const int* d_tileJatomStart, cudaStream_t stream) {
935 int* h_tileJatomStart = new int[numJtiles];
936 copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
937 cudaCheck(cudaStreamSynchronize(stream));
938 FILE* handle = fopen(filename,"wt");
939 for (int i=0;i < numJtiles;i++) {
940 fprintf(handle, "%d\n", h_tileJatomStart[i]);
943 delete [] h_tileJatomStart;
946 void CudaTileListKernel::writeTileJatomStart(FILE* handle, const int numJtiles,
947 const int* d_tileJatomStart, cudaStream_t stream) {
949 int* h_tileJatomStart = new int[numJtiles];
950 copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
951 cudaCheck(cudaStreamSynchronize(stream));
952 for (int i=0;i < numJtiles;i++) {
953 fprintf(handle, "%d\n", h_tileJatomStart[i]);
955 delete [] h_tileJatomStart;
958 void CudaTileListKernel::writeTileExcls(FILE* handle, const int numJTiles,
959 const TileExcl* d_tileExcl, cudaStream_t stream){
960 TileExcl* h_tileExcl = new TileExcl[numJtiles];
961 copy_DtoH<TileExcl>(d_tileExcl, h_tileExcl, numJtiles, stream);
962 cudaCheck(cudaStreamSynchronize(stream));
963 for (int i=0;i < numJtiles;i++) {
964 for(int j = 0; j < WARPSIZE; j++) fprintf(handle, " %u ", h_tileExcl[i].excl[j]);
965 fprintf(handle, "\n");
967 delete [] h_tileExcl;
971 std::pair<int, int> flip_pair(const std::pair<int, int> &p)
973 return std::pair<int, int>(p.second, p.first);
976 void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
977 const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {
979 const int shCacheSize = 10;
980 TileList* h_tileLists = new TileList[numTileLists];
981 int* h_tileJatomStart = new int[numJtiles];
982 copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
983 copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
984 cudaCheck(cudaStreamSynchronize(stream));
987 for (int i=0;i < numTileLists;i+=width) {
988 int jend = min(i + width, numTileLists);
989 std::map<int, int> atomStartMap;
990 std::map<int, int>::iterator it;
991 atomStartMap.clear();
992 for (int j=i;j < jend;j++) {
993 TileList tmp = h_tileLists[j];
994 int iatomStart = tmp.iatomStart;
995 it = atomStartMap.find(iatomStart);
996 if (it == atomStartMap.end()) {
998 atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
1003 int jtileStart = tmp.jtileStart;
1004 int jtileEnd = tmp.jtileEnd;
1005 ntotal += (jtileEnd - jtileStart + 1) + 1;
1006 for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
1007 int jatomStart = h_tileJatomStart[jtile];
1008 it = atomStartMap.find(jatomStart);
1009 if (it == atomStartMap.end()) {
1011 atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
1016 jatomStart |= (65535 << 16);
1017 h_tileJatomStart[jtile] = jatomStart;
1019 iatomStart |= (65535 << 16);
1020 tmp.iatomStart = iatomStart;
1021 h_tileLists[j] = tmp;
1023 ncache += atomStartMap.size();
1024 std::multimap<int, int> imap;
1026 std::multimap<int, int>::iterator imap_it;
1027 std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
1029 printf("%d %d\n", ntotal, (int)imap.size());
1030 for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
1031 if (imap_it->first != 0)
1032 printf("(%d %d)\n", imap_it->first, imap_it->second);
1036 printf("ntotal %d ncache %d\n", ntotal, ncache);
1037 copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
1038 copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
1039 cudaCheck(cudaStreamSynchronize(stream));
1040 delete [] h_tileLists;
1041 delete [] h_tileJatomStart;
1045 void CudaTileListKernel::prepareBuffers(
1046 int atomStorageSizeIn, int numPatchesIn,
1047 const CudaPatchRecord* h_cudaPatches,
1050 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSizeIn, OVERALLOC);
1052 // Copy cudaPatches to device
1053 reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatchesIn);
1054 copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatchesIn, stream);
1058 void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
1059 const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
1060 const float3 lata, const float3 latb, const float3 latc,
1061 const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq,
1062 const float plcutoff2In, const size_t maxShmemPerBlock,
1063 cudaStream_t stream, const bool atomsChanged, const bool allocatePart,
1064 bool CUDASOAintegratorOn, bool deviceMigration) {
1066 numPatches = numPatchesIn;
1067 atomStorageSize = atomStorageSizeIn;
1068 maxTileListLen = maxTileListLenIn;
1069 plcutoff2 = plcutoff2In;
1072 // Re-allocate patchNumLists
1073 reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1074 reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1075 reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1078 // Re-allocate (tileLists1, patchPairs1
1079 reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
1080 reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
1082 // Copy cudaPatches to device
1083 reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1084 copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1086 // Copy cudaPatches to device
1087 // If DeviceMigration is on then this happens in prepareBuffers()
1088 if (!CUDASOAintegratorOn || !deviceMigration) {
1089 reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1090 copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
1093 // Re-allocate temporary storage
1094 reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
1095 // Calculate tile list positions (tilePos)
1097 int nthread = DEFAULTKERNEL_NUM_THREAD;
1099 calcTileListPosKernel<DEFAULTKERNEL_NUM_THREAD> <<< nblock, nthread, 0, stream >>> (
1100 numComputes, cudaComputes, cudaPatches, tilePos);
1101 cudaStreamSynchronize(stream);// for now
1102 cudaCheck(cudaGetLastError());
1105 // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
1107 int nthread = UPDATEPATCHESKERNEL_NUM_THREAD;
1108 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/BOUNDINGBOXSIZE)+1);
1109 updatePatchesKernel<BOUNDINGBOXSIZE> <<< nblock, BOUNDINGBOXSIZE, 0, stream >>> (
1110 numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1111 // cudaCheck(cudaGetLastError());
1114 // ---------------------------------------------------------------------------------------------
1117 // NOTE: tileListDepth2 and tileListOrder2 must have at least the same size as
1118 // tileListDepth1 and tileListOrder1 since they're used in sorting
1119 reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
1120 reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
1122 // Allocate with +1 to include last term in the exclusive sum
1123 reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
1125 reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
1127 if (!deviceMigration) {
1128 if(atomsChanged) reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
1129 if(!CUDASOAintegratorOn || atomsChanged) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
1132 //This should be allocated for FEP only
1133 if(allocatePart) reallocate_device<char>(&part, &partSize, atomStorageSize, OVERALLOC);
1135 // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
1137 int numBoundingBoxes = atomStorageSize/BOUNDINGBOXSIZE;
1138 reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
1140 int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
1141 // int nthread = BOUNDINGBOXSIZE*nwarp;
1142 int nthread = BOUNDINGBOXSIZE*1; // multiplier kept at 1 because the kernel uses BlockReduce primitives
1143 int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1144 buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1145 cudaCheck(cudaGetLastError());
1149 int nwarp = TILELISTKERNELNEW_NUM_WARP;
1150 // int nthread = BOUNDINGBOXSIZE*nwarp;
1151 int nthread = BOUNDINGBOXSIZE*1; // 1 for now to use BlockReduce.
1152 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
1154 int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
1155 if(shmem_size > maxShmemPerBlock){
1156 NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1159 // NOTE: In the first call numJtiles = 1; buildTileListsBBKernel then reports the
1160 // required size in h_tileListStat->numJtiles. In subsequent calls,
1161 // re-allocation only happens when the size is exceeded.
1162 h_tileListStat->tilesSizeExceeded = true;
1163 int reallocCount = 0;
1164 while (h_tileListStat->tilesSizeExceeded) {
1165 reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
1167 clearTileListStat(stream);
1168 // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
1170 buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
1171 (numTileListsPrev, tileLists1, cudaPatches, tilePos,
1172 lata, latb, latc, plcutoff2, maxTileListLen,
1173 boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1174 tileListDepth1, tileListOrder1, patchPairs1,
1177 cudaCheck(cudaGetLastError());
1179 // get (numATileLists, numJtiles, tilesSizeExceeded)
1180 copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
1181 cudaCheck(cudaStreamSynchronize(stream));
1182 numJtiles = h_tileListStat->numJtiles;
1184 if (h_tileListStat->tilesSizeExceeded) {
1186 if (reallocCount > 1) {
1187 NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
1193 numTileLists = h_tileListStat->numTileLists;
1195 reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
1198 // Re-allocate tileListVirialEnergy.
1199 // NOTE: numTileLists here is an upper estimate (it is based on bounding boxes),
1200 // so we're guaranteed to have enough space
1201 reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
1203 reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
1204 reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
1205 reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
1206 reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
1207 reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
1209 int numTileListsSrc = numTileListsPrev;
1210 int numJtilesSrc = numJtiles;
1211 int numTileListsDst = numTileLists;
1212 int numJtilesDst = numJtiles;
1215 writeTileList("tileListBefore_HIP.txt", numTileLists, tileLists1, stream);
1216 writeTileJatomStart("tileJatomStartBefore_HIP.txt", numJtiles, tileJatomStart1, stream);
1223 numTileListsSrc, numJtilesSrc,
1224 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1225 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1226 PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1227 numTileListsDst, numJtilesDst,
1228 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1229 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1230 PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1234 writeTileList("tileListAfter_HIP.txt", numTileLists, tileLists1, stream);
1235 writeTileJatomStart("tileJatomStartAfter_HIP.txt", numJtiles, tileJatomStart1, stream);
1239 // Set active buffer to 2
1242 if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
1246 // Returns integer log2(a) rounded up
1250 // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1252 while (a >>= 1) k++;
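// Worked example, assuming the (elided) counter starts at k = 1: ilog2(5) = 3
// and ilog2(8) = 4, i.e. the number of bits needed to hold the value, which is
// how sortTileLists() below uses it for num_bit/end_bit.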
1259 void CudaTileListKernel::sortTileLists(
1260 const bool useJtiles,
1261 const int begin_bit, const bool highDepthBitsSetIn,
1263 const int numTileListsSrc, const int numJtilesSrc,
1264 PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1265 PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1266 PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1268 const int numTileListsDst, const int numJtilesDst,
1269 PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1270 PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1271 PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1272 cudaStream_t stream) {
1274 bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1276 // if (numTileListsDst == 0)
1277 // NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");
1279 // Check that the array sizes are adequate
1280 if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1281 numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1282 (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1283 (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1284 NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");
1286 if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1287 numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1288 (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1289 (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1290 NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");
1292 if (begin_bit != 0 && begin_bit != 16)
1293 NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1295 // Number of bits needed in the sort
1296 int num_bit = ilog2(maxTileListLen);
1298 NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
1299 int end_bit = begin_bit + num_bit;
1303 // ----------------------------------------------------------------------------------------
1304 if (doOutputOrder && useJtiles) {
1305 // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
1306 // NOTE: This is done very infrequently, typically only once when the MD run starts.
1308 // Calculate position from depth
1310 // -----------------------------------------------------------------------------
1311 // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1312 // -----------------------------------------------------------------------------
1315 int nthread = DEFAULTKERNEL_NUM_THREAD;
1316 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1317 bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1318 (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1319 cudaCheck(cudaGetLastError());
1322 reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);
1324 // --------------------------------------------------------------------
1325 // Compute itileList positions to store tileLists
1326 // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
1327 // --------------------------------------------------------------------
1330 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1331 (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1332 // Make sure tempStorage doesn't remain NULL
1333 if (size == 0) size = 128;
1334 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1335 size = tempStorageSize;
1336 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1337 (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
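// The two DeviceScan calls above follow CUB's standard two-phase pattern: the
// first call, with a NULL temp-storage pointer, only writes the required byte
// count into 'size'; the workspace is then (re)allocated and the identical call
// is repeated to run the actual scan. Stand-alone sketch of the pattern
// (illustrative names d_in, d_out, n; not part of this file):
#if 0
void* d_temp = NULL;
size_t tempBytes = 0;
cub::DeviceScan::ExclusiveSum(NULL, tempBytes, d_in, d_out, n, stream);    // query size only
cudaMalloc(&d_temp, tempBytes);                                            // allocate workspace
cub::DeviceScan::ExclusiveSum(d_temp, tempBytes, d_in, d_out, n, stream);  // run the scan
#endif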
1341 // Store in reverse order from outputOrder
1343 int nthread = DEFAULTKERNEL_NUM_THREAD;
1344 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1345 storeInReverse <<< nblock, nthread, 0, stream >>>
1346 (numTileListsSrc, begin_bit, outputOrder, tileListPos,
1347 tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1348 tileListOrderDst.ptr, tileListDepthDst.ptr);
1349 cudaCheck(cudaGetLastError());
1354 maxTileListLen_sortKeys = maxTileListLen;
1356 reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1357 clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);
1359 // Re-allocate and initialize minmaxListLen
1361 reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1363 int nthread = DEFAULTKERNEL_NUM_THREAD;
1364 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
1365 initMinMaxListLen <<< nblock, nthread, 0, stream >>>
1366 (numComputes, maxTileListLen, minmaxListLen);
1367 cudaCheck(cudaGetLastError());
1370 // Build sortKeys and calculate minmaxListLen
1372 int nthread = DEFAULTKERNEL_NUM_THREAD;
1373 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1374 buildSortKeys <<< nblock, nthread, 0, stream >>>
1375 (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1376 tileListDepthDst.ptr, minmaxListLen, sortKeys);
1377 cudaCheck(cudaGetLastError());
1379 // Maximum value in sortKeys[] is numTileListsDst - 1
1380 sortKeys_endbit = ilog2(numTileListsDst);
1383 // Fill in missing sortKeys using minmaxListLen
1385 int nthread = DEFAULTKERNEL_NUM_THREAD;
1386 int nwarp = nthread/WARPSIZE;
1387 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
1388 fillSortKeys <<< nblock, nthread, 0, stream >>>
1389 (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1390 cudaCheck(cudaGetLastError());
1395 doOutputOrder = false;
1397 } else if (doOutputOrder) {
1398 // OutputOrder will be produced in next pairlist non-bond kernel.
1399 // This time just remove zero length lists
1400 // NOTE: This is done very infrequently, typically only once when the MD run starts.
1402 int endbit_tmp = ilog2(numTileListsSrc);
1406 reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1407 reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1409 int nthread = DEFAULTKERNEL_NUM_THREAD;
1410 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1411 buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
1412 (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
1413 cudaCheck(cudaGetLastError());
1416 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1418 // Short list, sort within a single thread block
1419 int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1422 unsigned int oobKey = numTileListsSrc;
1423 sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1424 (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
1425 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1426 cudaCheck(cudaGetLastError());
1430 // Long list, sort on multiple thread blocks
1432 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1433 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1434 numTileListsSrc, 0, endbit_tmp, stream));
1435 // Make sure tempStorage doesn't remain NULL
1436 if (size == 0) size = 128;
1437 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1438 size = tempStorageSize;
1439 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1440 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1441 numTileListsSrc, 0, endbit_tmp, stream));
1444 // Re-order tileListDepth using tileListOrderDst
1446 int nthread = DEFAULTKERNEL_NUM_THREAD;
1447 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1448 reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1449 (numTileListsDst, tileListOrderDst.ptr,
1450 tileListDepthSrc.ptr, tileListDepthDst.ptr);
1451 cudaCheck(cudaGetLastError());
1455 // This is done during regular MD cycle
1457 if (sortKeys_endbit <= 0)
1458 NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1462 reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
1463 reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
1465 int nthread = DEFAULTKERNEL_NUM_THREAD;
1466 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
1467 setupSortKey <<< nblock, nthread, 0, stream >>>
1468 (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
1469 cudaCheck(cudaGetLastError());
1474 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1477 // Short list, sort within a single thread block
1478 int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1481 unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1482 sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
1483 (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
1484 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1485 cudaCheck(cudaGetLastError());
1489 // Long list, sort on multiple thread blocks
1491 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1492 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1493 numTileListsSrc, 0, sortKeys_endbit, stream));
1494 // Make sure tempStorage doesn't remain NULL
1495 if (size == 0) size = 128;
1496 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1497 size = tempStorageSize;
1498 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
1499 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1500 numTileListsSrc, 0, sortKeys_endbit, stream));
1503 // Re-order tileListDepth using tileListOrderDst
1505 int nthread = DEFAULTKERNEL_NUM_THREAD;
1506 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1507 reOrderTileListDepth <<< nblock, nthread, 0, stream >>>
1508 (numTileListsDst, tileListOrderDst.ptr,
1509 tileListDepthSrc.ptr, tileListDepthDst.ptr);
1510 cudaCheck(cudaGetLastError());
1515 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/WARPSIZE+1);
1516 localSort<WARPSIZE> <<< nblock, WARPSIZE, 0, stream >>> (
1517 numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
1518 cudaCheck(cudaGetLastError());
1520 // No need to shift any more
1521 doShiftDown = false;
1525 // ----------------------------------------------------------------------------------------
1530 // --------------------------------------------------------------------
1531 // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
1532 // => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
1533 // --------------------------------------------------------------------
1534 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1536 // Short list, sort within a single thread block
1537 int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
1540 sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
1541 (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
1542 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1543 cudaCheck(cudaGetLastError());
1548 // Long list, sort on multiple thread blocks
1550 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1551 tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1552 numTileListsSrc, begin_bit, end_bit, stream));
1553 // Make sure tempStorage doesn't remain NULL
1554 if (size == 0) size = 128;
1555 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1556 size = tempStorageSize;
1557 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
1558 tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1559 numTileListsSrc, begin_bit, end_bit, stream));
1563 // -----------------------------------------------------------------------------
1564 // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
1565 // -----------------------------------------------------------------------------
1568 int nthread = DEFAULTKERNEL_NUM_THREAD;
1569 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1570 bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
1571 (numTileListsDst, begin_bit, tileListDepthDst.ptr);
1572 // cudaCheck(cudaGetLastError());
1575 // Allocate with +1 to include last term in the exclusive sum
1576 reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
1578 // --------------------------------------------------------------------
1579 // Compute itileList positions to store tileLists
1580 // ExclusiveSum(tileListDepthDst[0...numTileListsDst+1])
1581 // NOTE: tileListDepthDst[numTileListsDst] is not accessed
1582 // since this is an exclusive sum. But with this trick,
1583 // tileListPos[numTileListsDst] will contain the total number
1585 // --------------------------------------------------------------------
1588 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1589 (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1590 // Make sure tempStorage doesn't remain NULL
1591 if (size == 0) size = 128;
1592 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1593 size = tempStorageSize;
1594 // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
1595 // Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
1596 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
1597 (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
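// Worked example of the +1 trick: with numTileListsDst = 4 and masked depths
// {3, 2, 4, 1}, scanning 5 elements yields tileListPos = {0, 3, 5, 9, 10};
// tileListPos[numTileListsDst] = 10 is the total number of j-tiles, while the
// extra input element never affects the result.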
1600 // --------------------------------------------------------------------
1602 // tileListsDst[0 ... numTileListsDst-1], tileJatomStartDst[0 ... numJtilesDst-1]
1604 // tileJatomStartDst[]
1605 // tileExclsDst[0 ... numJtilesDst-1]
1606 // --------------------------------------------------------------------
1608 // int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1609 int nthread = BOUNDINGBOXSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
1610 int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
1611 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
1613 repackTileListsKernel <<< nblock, nthread, 0, stream >>>
1614 (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
1615 (useJtiles) ? jtiles : NULL,
1616 tileListsSrc.ptr, tileListsDst.ptr,
1617 patchPairsSrc.ptr, patchPairsDst.ptr,
1618 tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1619 tileExclsSrc.ptr, tileExclsDst.ptr);
1622 // Count the number of tileLists that contribute to each patch
1625 clear_device_array<int>(patchNumLists, numPatches, stream);
1627 // Fill in patchNumLists[0...numPatches-1]
1628 int nthread = CALCPATCHNUMLISTSKERNEL_NUM_THREAD;
1629 int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
1630 calcPatchNumLists <<< nblock, nthread, 0, stream >>>
1631 (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1632 cudaCheck(cudaGetLastError());
1634 // Use emptyPatches[numPatches] as the count variable
1635 clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
1637 // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
1638 // and find empty patches into emptyPatches[0 ... numEmptyPatches - 1]
1639 setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>>
1640 (numTileListsDst, tileListsDst.ptr, patchNumLists,
1641 numPatches, &emptyPatches[numPatches], emptyPatches);
1642 cudaCheck(cudaGetLastError());
1644 // // Copy emptyPatches[0 ... numPatches] to host
1645 copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
1646 cudaCheck(cudaStreamSynchronize(stream));
1647 numEmptyPatches = h_emptyPatches[numPatches];
1653 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
1655 void CudaTileListKernel::reSortTileLists(const bool doGBIS, const bool CUDASOAIntegratorOn, cudaStream_t stream) {
1656 // Store previous number of active lists
1657 int numTileListsPrev = numTileLists;
1659 // Wait for finishTileList() to stop copying
1660 if (!tileListStatEventRecord)
1661 NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1662 cudaCheck(cudaEventSynchronize(tileListStatEvent));
1664 // Get numTileLists, numTileListsGBIS, and numExcluded
1666 numTileLists = h_tileListStat->numTileLists;
1667 numTileListsGBIS = h_tileListStat->numTileListsGBIS;
1668 numExcluded = h_tileListStat->numExcluded;
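// The depth word of each tile list packs two 16-bit counts: the low half
// (begin_bit = 0) is used for the VdW list and the high half (begin_bit = 16)
// for the GBIS list, which is why the two sortTileLists() calls below differ
// essentially only in begin_bit and in their destination buffers.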
1671 // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
1672 // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
1673 sortTileLists(true, 0, true,
1674 numTileListsPrev, numJtiles,
1675 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1676 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1677 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1678 numTileLists, numJtiles,
1679 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1680 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1681 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1684 // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
1685 // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
1686 // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
1687 // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
1690 // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
1691 // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
1694 // GBIS is used => produce a second tile list
1695 // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
1696 reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
1697 reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
1699 sortTileLists(true, 16, true,
1700 numTileListsPrev, numJtiles,
1701 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1702 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1703 PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1704 numTileListsGBIS, numJtiles,
1705 PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1706 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1707 PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1711 // Set active buffer to be 1
1718 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
1720 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
1723 if (!doStreaming || !doOutputOrder)
1726 // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
1727 // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
1728 sortTileLists(false, 0, true,
1729 numTileLists, numJtiles,
1730 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1731 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1732 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1733 numTileLists, numJtiles,
1734 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1735 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1736 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1739 // Set active buffer to be 2
1745 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
1746 if (len > tileListVirialEnergySize) {
1747 NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1749 tileListVirialEnergyLength = len;
1752 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
1753 if (len > tileListVirialEnergySize) {
1754 NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1756 tileListVirialEnergyGBISLength = len;