3 #if defined(NAMD_HIP) && !defined(NAMD_CUDA)//NAMD_HIP
4 #include "hip/hip_runtime.h"
5 #include <hipcub/hipcub.hpp>
10 #if __CUDACC_VER_MAJOR__ >= 11
11 #include <cub/device/device_radix_sort.cuh>
12 #include <cub/device/device_scan.cuh>
13 #include <cub/cub.cuh>
15 #include <namd_cub/device/device_radix_sort.cuh>
16 #include <namd_cub/device/device_scan.cuh>
17 #include <namd_cub/cub.cuh>
18 #endif //CUDACC version
25 #define __thread __declspec(thread)
29 #define OVERALLOC 1.2f
31 #define DEFAULTKERNEL_NUM_THREAD 1024
32 #define UPDATEPATCHESKERNEL_NUM_THREAD 512
33 #define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 512
34 #define BOUNDINGBOXKERNEL_NUM_WARP 8
36 #define DEFAULTKERNEL_NUM_THREAD 256
37 #define UPDATEPATCHESKERNEL_NUM_THREAD 256
38 #define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 256
39 #define BOUNDINGBOXKERNEL_NUM_WARP 4
41 #if __CUDA_ARCH__ < 350
53 for (
int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
55 int2 patchInd = tileLists[i].patchInd;
56 atomicAdd(&patchNumLists[patchInd.x], 1);
57 if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
//
// Copies the per-patch tile-list counts (patchNumLists) into each tile list's
// patchNumList field, then collects the indices of patches that have zero tile
// lists into emptyPatches (their count accumulated into *numEmptyPatches).
//
// NOTE(review): the parameter-list head of this kernel is elided in this chunk;
// reconstructed from the call site
//   setPatchNumLists_findEmptyPatches(numTileListsDst, tileListsDst.ptr,
//     patchNumLists, numPatches, &emptyPatches[numPatches], emptyPatches)
// — confirm against upstream.
//
__global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
  const int numPatches, int* __restrict__ numEmptyPatches,
  int* __restrict__ emptyPatches) {

  // Grid-stride loop: store both patches' list counts into each tile list
  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
  {
    int2 patchInd = tileLists[i].patchInd;
    int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
    tileLists[i].patchNumList = patchNumList;
  }

  // Grid-stride loop: record patches with no tile lists
  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
  {
    if (patchNumLists[i] == 0) {
      int ind = atomicAdd(numEmptyPatches, 1);
      emptyPatches[ind] = i;
    }
  }
}
//
// Computes a sort key per tile list from the precomputed sortKeys table:
// key = sortKeys[icompute*maxTileListLen + depth-1], where depth is the
// 16-bit depth field extracted at begin_bit and clamped to maxTileListLen.
// Empty lists (depth == 0) get the maximum key 0x7fffffff so they sort last.
//
// NOTE(review): the parameter-list head and the loop header are elided in this
// chunk; reconstructed from the call site
//   setupSortKey(numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr,
//     tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc)
// — confirm against upstream.
//
__global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
  const TileList* __restrict__ tileLists,
  const unsigned int* __restrict__ tileListDepth,
  const int begin_bit, const unsigned int* __restrict__ sortKeys,
  unsigned int* __restrict__ sortKey) {

  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
  {
    int icompute = tileLists[itileList].icompute;
    // Depth stored in 16 bits starting at begin_bit; clamp to table width
    int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
    int i = icompute*maxTileListLen + (depth - 1);
    sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
  }
}
117 unsigned int* __restrict__
keys,
int* __restrict__ vals) {
121 for (
int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
123 int i = base + threadIdx.x;
124 typedef cub::BlockRadixSort<unsigned int, width, 1, int>
BlockRadixSort;
125 __shared__
typename BlockRadixSort::TempStorage
tempStorage;
126 unsigned int key[1] = {(i < n) ? ((keys[i] >>
begin_bit) & 65535) : 0};
127 int val[1] = {(i < n) ? vals[i] : 0};
//
// Walks outputOrder in reverse and compacts the non-empty tile lists
// (16-bit depth field at begin_bit > 0) into the destination arrays at the
// positions given by tileListPos.
//
// NOTE(review): the parameter-list head is elided in this chunk; reconstructed
// from the call site
//   storeInReverse(..., tileListOrderSrc.ptr, tileListDepthSrc.ptr,
//     tileListOrderDst.ptr, tileListDepthDst.ptr)
// — confirm against upstream. tileListOrderSrc is not referenced in the
// visible body (the source index j itself is stored).
//
__global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
  const int* __restrict__ outputOrder,
  const int* __restrict__ tileListPos,
  const int* __restrict__ tileListOrderSrc,
  const unsigned int* __restrict__ tileListDepthSrc,
  int* __restrict__ tileListOrderDst,
  unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
  {
    // Reverse traversal of the output order
    int j = outputOrder[numTileListsSrc - i - 1];
    if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
      int k = tileListPos[i];
      tileListDepthDst[k] = tileListDepthSrc[j];
      tileListOrderDst[k] = j;
    }
  }
}
//
// Builds a 0/1 flag array from the source depths, traversed through
// outputOrder in reverse: dst[i] = 1 if the 16-bit depth field (at begin_bit)
// of the reordered source entry is non-zero, else 0. The result feeds the
// exclusive-sum that computes compaction positions.
//
// NOTE(review): this kernel's signature is elided in this chunk; reconstructed
// from the call site
//   bitshiftTileListDepth(numTileListsSrc, begin_bit, outputOrder,
//     tileListDepthSrc.ptr, tileListDepthDst.ptr)
// — confirm against upstream.
//
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
  const int* __restrict__ outputOrder,
  const unsigned int* __restrict__ tileListDepthSrc,
  unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
  {
    int j = outputOrder[numTileLists - i - 1];
    tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
  }
}
//
// Initializes the per-compute (min, max) tile-list-length records so that the
// subsequent atomicMin/atomicMax updates in buildSortKeys work:
// min starts above any real depth, max starts below any real depth.
//
// NOTE(review): the parameter head and the max-field initialization are elided
// in this chunk; reconstructed (y = 0 is required for atomicMax to record any
// depth >= 1) — confirm against upstream.
//
__global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
  int2* __restrict__ minmaxListLen) {

  int2 val;
  val.x = maxTileListLen+1;  // min sentinel: larger than any valid depth
  val.y = 0;                 // max sentinel: smaller than any valid depth

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
  {
    minmaxListLen[i] = val;
  }
}
//
// Builds the per-compute sort-key table and the per-compute (min, max) depth
// records. For sorted output position i with depth d and compute c, the table
// slot sortKeys[c*maxTileListLen + d-1] receives the sorted position, and
// minmaxListLen[c] is tightened with atomicMin/atomicMax.
//
// NOTE(review): the parameter-list head and the key-store statement are elided
// in this chunk; the signature is reconstructed from the call site
//   buildSortKeys(numTileListsDst, maxTileListLen, tileListsSrc.ptr,
//     tileListOrderDst.ptr, tileListDepthDst.ptr, minmaxListLen, sortKeys)
// and the store is reconstructed as sortKeys[j] = i (sorted position as key,
// consistent with setupSortKey's later lookup) — confirm against upstream.
//
__global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
  const TileList* __restrict__ tileListsSrc,
  const int* __restrict__ tileListOrderDst,
  const unsigned int* __restrict__ tileListDepthDst,
  int2* __restrict__ minmaxListLen,
  unsigned int* __restrict__ sortKeys) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
  {
    int k = tileListOrderDst[i];
    int icompute = tileListsSrc[k].icompute;
    // Low 16 bits hold the depth; valid depths are in [1, maxTileListLen]
    int depth = tileListDepthDst[i] & 65535;
    int j = icompute*maxTileListLen + (depth-1);
    sortKeys[j] = i;  // reconstructed store: sorted position becomes the key

    // Tighten the per-compute depth range; the pre-check avoids issuing an
    // atomic when this thread's depth cannot change the stored value.
    int2 minmax = minmaxListLen[icompute];
    int2 minmaxOrig = minmax;
    if (minmax.x > depth) minmax.x = depth;
    if (minmax.y < depth) minmax.y = depth;
    if (minmax.x != minmaxOrig.x) {
      atomicMin(&minmaxListLen[icompute].x, minmax.x);
    }
    if (minmax.y != minmaxOrig.y) {
      atomicMax(&minmaxListLen[icompute].y, minmax.y);
    }
  }
}
//
// Fills the gaps in the per-compute sort-key table so every slot
// sortKeys[i*maxTileListLen + 0 .. maxTileListLen-1] is usable:
//   - slots below the observed minimum depth receive the minimum's key,
//   - slots above the observed maximum depth receive the maximum's key,
//   - remaining untouched interior slots (still 0) receive the average key.
// One warp processes one compute.
//
// NOTE(review): the braces and the minlen reset statement inside the
// maxlen < minlen branch are elided in this chunk; reconstructed as resetting
// to the full range [1, maxTileListLen] — confirm against upstream.
//
__global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {

  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
    const int wid = threadIdx.x % WARPSIZE;
    int2 minmax = minmaxListLen[i];
    int minlen = minmax.x;
    int maxlen = minmax.y;
    // If no tile list updated this compute, the init sentinels
    // (maxTileListLen+1, 0) survive and maxlen < minlen; fall back to the
    // full depth range.
    if ( maxlen < minlen ) {
      minlen = 1;
      maxlen = maxTileListLen;
    }
    unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
    unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
    unsigned int aveKey = (maxKey + minKey)/2;
    // Below the minimum depth: clamp to the minimum's key
    for (int j=wid;j < minlen-1;j+=WARPSIZE) {
      sortKeys[i*maxTileListLen + j] = minKey;
    }
    // Above the maximum depth: clamp to the maximum's key
    for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
      sortKeys[i*maxTileListLen + j] = maxKey;
    }
    // Interior slots never written by buildSortKeys get the average
    for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
      if (sortKeys[i*maxTileListLen + j] == 0) {
        sortKeys[i*maxTileListLen + j] = aveKey;
      }
    }
  }
}
255 const int warpId = threadIdx.x /
WARPSIZE;
256 const int wid = threadIdx.x %
WARPSIZE;
259 for (
int iwarp = warpId*
WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
261 const int i = iwarp + wid;
265 float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
267 volatile float3 minxyz, maxxyz;
269 typedef cub::WarpReduce<float> WarpReduce;
271 minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
272 minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
273 minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
274 maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
275 maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
276 maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
280 boundingBox.
x = 0.5f*(minxyz.x + maxxyz.x);
281 boundingBox.
y = 0.5f*(minxyz.y + maxxyz.y);
282 boundingBox.
z = 0.5f*(minxyz.z + maxxyz.z);
283 boundingBox.
wx = 0.5f*(maxxyz.x - minxyz.x);
284 boundingBox.
wy = 0.5f*(maxxyz.y - minxyz.y);
285 boundingBox.
wz = 0.5f*(maxxyz.z - minxyz.z);
286 boundingBoxes[ibb] = boundingBox;
//
// Squared distance between two bounding boxes. Per axis, the gap between the
// box extents is computed (clamped at zero, so overlapping boxes contribute
// nothing) and the gaps are summed in quadrature.
//
// NOTE(review): the signature and return statement are elided in this chunk;
// reconstructed from the call site r2bb = distsq(boundingBoxI, boundingBoxJ)
// where the result is compared against a float cutoff2 — confirm against
// upstream.
//
__device__ __forceinline__
float distsq(const BoundingBox a, const BoundingBox b) {
  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
  float r2 = dx*dx + dy*dy + dz*dz;
  return r2;
}
//
// Warp-cooperative exclusive prefix sum over n values held in shared memory.
// Processes sh_in in WARPSIZE-sized chunks with cub::WarpScan, writes the
// running exclusive sum into sh_out, and returns the total of all n inputs.
// Must be called by all lanes of one warp.
//
// NOTE(review): several lines of this definition (the add of the running
// chunk offset, closing braces, return statement) are elided in this chunk
// and reconstructed; the exclusive-sum recurrence requires adding the running
// blockOffset to each chunk's local scan result — confirm against upstream.
// NOTE(review): tempStorage is a function-scope __shared__ object; safe only
// if a single warp of the block executes this at a time — verify callers.
//
template <typename T>
__device__ __forceinline__
int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
  const int wid = threadIdx.x % WARPSIZE;
  volatile int blockOffset = 0;
  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
    // Number of valid elements in this chunk
    int blockLen = min(WARPSIZE, n-iblock);
    // Warp-wide exclusive scan of the chunk
    typedef cub::WarpScan<int> WarpScan;
    __shared__ typename WarpScan::TempStorage tempStorage;
    int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
    WarpScan(tempStorage).ExclusiveSum(data, data);
    // Shift by the sum of all previous chunks (reconstructed line)
    data += blockOffset;
    // Last raw input of this chunk, needed to advance the running offset
    int last = (int)sh_in[iblock + blockLen-1];
    if (wid < blockLen) sh_out[iblock + wid] = data;
    // New offset = exclusive sum at the last slot + the last input value
    blockOffset = sh_out[iblock + blockLen-1] + last;
  }
  return blockOffset;
}
336 #define TILELISTKERNELNEW_NUM_WARP 4
//
// Computes the starting tile-list index for each compute:
// an exclusive prefix sum over numTiles(k) = ceil(numAtoms(patch x of
// compute k) / WARPSIZE). Launched as a single block of nthread threads
// (see host call with nblock = 1); computes are processed in nthread-sized
// batches with the running total carried in shared memory.
//
// NOTE(review): the parameter-list head, the shTilePos0 initialization, the
// tilePosVal declaration, the block barriers and closing braces are elided in
// this chunk and reconstructed. Barrier placement follows the required
// ordering: every thread must read shTilePos0 before the last thread advances
// it, and the update must complete before the next batch reads it — confirm
// against upstream.
//
template<int nthread>
__global__ void calcTileListPosKernel(const int numComputes,
  const CudaComputeRecord* __restrict__ computes,
  const CudaPatchRecord* __restrict__ patches,
  int* __restrict__ tilePos) {

  typedef cub::BlockScan<int, nthread> BlockScan;

  __shared__ typename BlockScan::TempStorage tempStorage;
  __shared__ int shTilePos0;  // running offset carried across batches

  if (threadIdx.x == nthread-1) {
    shTilePos0 = 0;
  }
  __syncthreads();

  for (int base=0;base < numComputes;base+=nthread) {
    int k = base + threadIdx.x;

    // Number of i-tiles for this compute; 0 for out-of-range lanes so the
    // block-wide scan stays well-defined
    int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;

    // Exclusive scan of this batch across the block
    int tilePosVal;
    BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);

    // Global position = running offset + in-batch exclusive sum
    if (k < numComputes) {
      tilePos[k] = shTilePos0 + tilePosVal;
    }

    // All threads must have consumed shTilePos0 before it is advanced
    __syncthreads();
    if (threadIdx.x == nthread-1) {
      shTilePos0 += tilePosVal + numTiles1;
    }
    __syncthreads();
  }
}
//
// Stamps each compute's tile lists with the compute's offsetXYZ, patch pair
// and compute index. A group of nthread threads handles one compute; within a
// group, threads stride over that compute's numTiles1 tile lists starting at
// tilePos[k].
//
// NOTE(review): the parameter-list head and the loads of offsetXYZ/patchInd
// are elided in this chunk; the signature is reconstructed from the call site
//   updatePatchesKernel<WARPSIZE>(numComputes, tilePos, cudaComputes,
//     cudaPatches, tileLists1)
// and the loads are reconstructed as reads from the compute record — confirm
// field access against upstream.
//
template<int nthread>
__global__ void updatePatchesKernel(const int numComputes,
  const int* __restrict__ tilePos,
  const CudaComputeRecord* __restrict__ computes,
  const CudaPatchRecord* __restrict__ patches,
  TileList* __restrict__ tileLists) {

  // Lane index within the nthread-sized group handling one compute
  const int tid = threadIdx.x % nthread;

  // One group of nthread threads per compute
  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
  {
    CudaComputeRecord compute = computes[k];      // reconstructed load
    float3 offsetXYZ = compute.offsetXYZ;         // reconstructed load
    int2 patchInd = compute.patchInd;             // reconstructed load
    int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
    int itileList0 = tilePos[k];
    for (int i=tid;i < numTiles1;i+=nthread) {
      tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
      tileLists[itileList0 + i].patchInd  = patchInd;
      tileLists[itileList0 + i].icompute  = k;
    }
  }
}
404 __host__ __device__ __forceinline__
408 maxTileListLen*
sizeof(char)
417 const int* __restrict__ tileListPos,
418 const float3
lata,
const float3
latb,
const float3
latc,
419 const float cutoff2,
const int maxTileListLen,
422 const int tileJatomStartSize,
428 extern __shared__
char sh_buffer[];
430 HIP_DYNAMIC_SHARED(
char, sh_buffer)
433 int pos = threadIdx.x*sizePerThread;
434 volatile char* sh_tile = (
char*)&sh_buffer[pos];
440 const int wid = threadIdx.x %
WARPSIZE;
444 int itileListLen = 0;
452 if (itileList < numTileLists) {
453 offsetXYZ = tileLists[
itileList].offsetXYZ;
454 patchInd = tileLists[
itileList].patchInd;
455 icompute = tileLists[
itileList].icompute;
457 i = itileList - tileListPos[icompute];
459 float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
460 float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
461 float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
464 bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
467 patch1 = patches[patchInd.x];
468 patch2 = patches[patchInd.y];
475 bool self = zeroShift && (tileStart1 == tileStart2);
478 BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
479 boundingBoxI.
x += shx;
480 boundingBoxI.y += shy;
481 boundingBoxI.z += shz;
483 for (
int j=0;j < numTiles2;j++) {
485 if (!
self || j >= i) {
486 BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
487 float r2bb =
distsq(boundingBoxI, boundingBoxJ);
488 if (r2bb < cutoff2) {
495 tileListDepth[
itileList] = (
unsigned int)itileListLen;
499 typedef cub::WarpScan<int> WarpScan;
500 __shared__
typename WarpScan::TempStorage
tempStorage;
501 int active = (itileListLen > 0);
503 WarpScan(tempStorage).ExclusiveSum(active, activePos);
505 WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
507 int jtileStart, numJtiles;
510 atomicAdd(&tileListStat->numTileLists, activePos + active);
511 numJtiles = itileListPos + itileListLen;
512 jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
516 jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1,
WARP_FULL_MASK);
518 if (jtileStart + numJtiles > tileJatomStartSize) {
520 if (wid == 0) tileListStat->tilesSizeExceeded =
true;
524 int jStart = itileListPos;
525 int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1,
WARP_FULL_MASK);
527 if (wid == WARPSIZE-1) jEnd = numJtiles;
529 if (itileListLen > 0) {
533 tileLists[
itileList].jtileStart = jtileStart + jStart;
534 tileLists[
itileList].jtileEnd = jtileStart + jEnd - 1;
535 tileLists[
itileList].patchInd = patchInd;
536 tileLists[
itileList].offsetXYZ = offsetXYZ;
537 tileLists[
itileList].icompute = icompute;
550 int jtile = jtileStart + jStart;
551 for (
int j=0;j < numTiles2;j++) {
564 #define REPACKTILELISTSKERNEL_NUM_WARP 32
566 #define REPACKTILELISTSKERNEL_NUM_WARP 4
571 const int* __restrict__
jtiles,
572 const TileList* __restrict__ tileListsSrc,
TileList* __restrict__ tileListsDst,
574 const int* __restrict__ tileJatomStartSrc,
int* __restrict__ tileJatomStartDst,
575 const TileExcl* __restrict__ tileExclsSrc,
TileExcl* __restrict__ tileExclsDst) {
577 const int wid = threadIdx.x %
WARPSIZE;
580 for (
int i = threadIdx.x/
WARPSIZE + blockDim.x/
WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/
WARPSIZE*gridDim.x)
582 int j = tileListOrder[i];
583 int start = tileListPos[i];
584 int end = tileListPos[i+1]-1;
585 if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
587 int startOld =
__ldg(&tileListsSrc[j].jtileStart);
588 int endOld =
__ldg(&tileListsSrc[j].jtileEnd);
589 int iatomStart =
__ldg(&tileListsSrc[j].iatomStart);
591 offsetXYZ.x =
__ldg(&tileListsSrc[j].offsetXYZ.x);
592 offsetXYZ.y =
__ldg(&tileListsSrc[j].offsetXYZ.y);
593 offsetXYZ.z =
__ldg(&tileListsSrc[j].offsetXYZ.z);
594 int2 patchInd = tileListsSrc[j].patchInd;
595 int icompute =
__ldg(&tileListsSrc[j].icompute);
598 tileListsDst[i].iatomStart = iatomStart;
599 tileListsDst[i].offsetXYZ = offsetXYZ;
600 tileListsDst[i].jtileStart = start;
601 tileListsDst[i].jtileEnd = end;
602 tileListsDst[i].patchInd = patchInd;
603 tileListsDst[i].icompute = icompute;
607 if (jtiles == NULL) {
610 for (
int jtileOld=startOld;jtileOld <= endOld;jtileOld+=
WARPSIZE,jtile+=
WARPSIZE) {
611 if (jtileOld + wid <= endOld) {
612 tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
615 if (tileExclsSrc != NULL) {
617 for (
int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
618 tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
623 for (
int jtileOld=startOld;jtileOld <= endOld;jtileOld+=
WARPSIZE) {
624 int t = jtileOld + wid;
625 int jtile = (t <= endOld) ? jtiles[t] : 0;
628 typedef cub::WarpScan<int> WarpScan;
630 int warpId = threadIdx.x /
WARPSIZE;
632 WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
634 if (jtile) tileJatomStartDst[jtile0+jtilePos] =
__ldg(&tileJatomStartSrc[t]);
637 if (tileExclsSrc != NULL) {
641 int k = __ffsll(b) - 1;
643 int k = __ffs(b) - 1;
645 tileExclsDst[jtile0].excl[wid] =
__ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
652 jtile0 += __popcll(b);
668 #define SORTTILELISTSKERNEL_NUM_THREAD 512
669 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
671 #define SORTTILELISTSKERNEL_NUM_THREAD 256
672 #define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 15
674 template <
typename keyT,
typename valT,
bool ascend>
691 typename BlockLoad::TempStorage
load;
692 typename BlockLoadU::TempStorage
loadU;
693 typename BlockRadixSort::TempStorage
sort;
699 BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
701 BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
705 BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
707 BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
710 cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst,
values,
numTileListsDst);
//
// Gathers tile-list depths into sorted order:
// tileListDepthDst[i] = tileListDepthSrc[tileListOrder[i]].
//
// NOTE(review): the parameter-list head is elided in this chunk; reconstructed
// from the call sites
//   reOrderTileListDepth(numTileListsDst, tileListOrderDst.ptr,
//     tileListDepthSrc.ptr, tileListDepthDst.ptr)
// — confirm against upstream.
//
__global__ void reOrderTileListDepth(const int numTileLists,
  const int* __restrict__ tileListOrder,
  unsigned int* __restrict__ tileListDepthSrc,
  unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
  {
    int j = tileListOrder[i];
    tileListDepthDst[i] = tileListDepthSrc[j];
  }
}
732 unsigned int a = tileListDepth[
itileList];
745 deviceID(deviceID), doStreaming(doStreaming) {
758 cudaComputesSize = 0;
760 patchNumLists = NULL;
761 patchNumListsSize = 0;
764 emptyPatchesSize = 0;
765 h_emptyPatches = NULL;
766 h_emptyPatchesSize = 0;
784 tileJatomStart1 = NULL;
785 tileJatomStart1Size = 0;
786 tileJatomStart2 = NULL;
787 tileJatomStart2Size = 0;
789 boundingBoxes = NULL;
790 boundingBoxesSize = 0;
792 tileListDepth1 = NULL;
793 tileListDepth1Size = 0;
794 tileListDepth2 = NULL;
795 tileListDepth2Size = 0;
797 tileListOrder1 = NULL;
798 tileListOrder1Size = 0;
799 tileListOrder2 = NULL;
800 tileListOrder2Size = 0;
810 allocate_device<TileListStat>(&d_tileListStat, 1);
811 allocate_host<TileListStat>(&h_tileListStat, 1);
824 tileListsGBIS = NULL;
825 tileListsGBISSize = 0;
827 tileJatomStartGBIS = NULL;
828 tileJatomStartGBISSize = 0;
830 tileListVirialEnergy = NULL;
831 tileListVirialEnergySize = 0;
835 numTileListsGBIS = 0;
840 doOutputOrder =
false;
842 minmaxListLen = NULL;
843 minmaxListLenSize = 0;
849 cudaCheck(cudaEventCreate(&tileListStatEvent));
850 tileListStatEventRecord =
false;
//
// Destructor: releases every device/host buffer and the CUDA event owned by
// this object. Lazily-allocated buffers are freed only if they were created
// (non-NULL check mirrors the lazy reallocate_* pattern used elsewhere).
//
// NOTE(review): the destructor's opening line and a few interior lines are
// elided in this chunk; reconstructed from the visible deallocations. If the
// upstream file frees additional buffers (e.g. outputOrder) on the elided
// lines, those frees must be retained — confirm against upstream.
//
CudaTileListKernel::~CudaTileListKernel() {
  // Always-allocated in the constructor
  deallocate_device<TileListStat>(&d_tileListStat);
  deallocate_host<TileListStat>(&h_tileListStat);

  // Lazily-allocated buffers
  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);

  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
  if (jtiles != NULL) deallocate_device<int>(&jtiles);
  if (tilePos != NULL) deallocate_device<int>(&tilePos);

  // GBIS buffers
  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);

  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);

  if (xyzq != NULL) deallocate_device<float4>(&xyzq);

  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);

  cudaCheck(cudaEventDestroy(tileListStatEvent));
}
898 clear_device_array<int>(jtiles, numJtiles,
stream);
905 copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1,
stream);
909 copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1,
stream);
910 cudaCheck(cudaEventRecord(tileListStatEvent, stream));
911 tileListStatEventRecord =
true;
917 numComputes = numComputesIn;
919 reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
920 copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes,
stream);
922 if (doStreaming) doOutputOrder =
true;
925 void CudaTileListKernel::writeTileList(
const char* filename,
const int numTileLists,
929 copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists,
stream);
930 cudaCheck(cudaStreamSynchronize(stream));
931 FILE* handle = fopen(filename,
"wt");
934 fprintf(handle,
"%d %d %d %f %f %f %d %d %d %d\n",
939 delete [] h_tileLists;
//
// Debugging aid: dumps the tileJatomStart array to a text file, one integer
// per line. Copies numJtiles ints from device memory, synchronizes the
// stream so the host buffer is valid, then writes and releases all resources.
//
// Fixes over the visible code: the FILE handle is now closed (it was leaked),
// and a failed fopen no longer leads to fprintf on a NULL handle.
//
void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
  const int* d_tileJatomStart, cudaStream_t stream) {

  int* h_tileJatomStart = new int[numJtiles];
  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE* handle = fopen(filename, "wt");
  if (handle != NULL) {
    for (int i=0;i < numJtiles;i++) {
      fprintf(handle, "%d\n", h_tileJatomStart[i]);
    }
    fclose(handle);
  }
  delete [] h_tileJatomStart;
}
1032 const int numPatchesIn,
const int atomStorageSizeIn,
const int maxTileListLenIn,
1033 const float3
lata,
const float3
latb,
const float3
latc,
1035 const float plcutoff2In,
const size_t maxShmemPerBlock,
1036 cudaStream_t stream) {
1038 numPatches = numPatchesIn;
1039 atomStorageSize = atomStorageSizeIn;
1040 maxTileListLen = maxTileListLenIn;
1041 plcutoff2 = plcutoff2In;
1045 reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
1046 reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
1047 reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
1051 reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev,
OVERALLOC);
1052 reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev,
OVERALLOC);
1055 reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
1056 copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches,
stream);
1059 reallocate_device<int>(&tilePos, &tilePosSize, numComputes,
OVERALLOC);
1064 calcTileListPosKernel<DEFAULTKERNEL_NUM_THREAD> <<< nblock, nthread, 0, stream >>> (numComputes, cudaComputes, cudaPatches, tilePos);
1072 updatePatchesKernel<WARPSIZE> <<< nblock, nthread, 0, stream >>> (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
1081 reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1,
OVERALLOC);
1082 reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev,
OVERALLOC);
1085 reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1,
OVERALLOC);
1087 reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev,
OVERALLOC);
1089 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize,
OVERALLOC);
1091 copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize,
stream);
1095 int numBoundingBoxes = atomStorageSize/
WARPSIZE;
1096 reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes,
OVERALLOC);
1101 buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
1111 if(shmem_size > maxShmemPerBlock){
1112 NAMD_die(
"CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
1119 int reallocCount = 0;
1121 reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles,
OVERALLOC);
1125 buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>> (
1126 numTileListsPrev, tileLists1, cudaPatches, tilePos,
1128 boundingBoxes, tileJatomStart1, tileJatomStart1Size,
1129 tileListDepth1, tileListOrder1, patchPairs1,
1135 copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1,
stream);
1136 cudaCheck(cudaStreamSynchronize(stream));
1137 numJtiles = h_tileListStat->numJtiles;
1139 if (h_tileListStat->tilesSizeExceeded) {
1141 if (reallocCount > 1) {
1142 NAMD_die(
"CudaTileListKernel::buildTileLists, multiple reallocations detected");
1150 reallocate_device<int>(&jtiles, &jtilesSize, numJtiles,
OVERALLOC);
1156 reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists,
OVERALLOC);
1158 reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists,
OVERALLOC);
1159 reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists,
OVERALLOC);
1160 reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles,
OVERALLOC);
1161 reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles,
OVERALLOC);
1162 reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles,
OVERALLOC);
1164 int numTileListsSrc = numTileListsPrev;
1165 int numJtilesSrc = numJtiles;
1166 int numTileListsDst = numTileLists;
1167 int numJtilesDst = numJtiles;
1173 numTileListsSrc, numJtilesSrc,
1174 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1175 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1176 PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1177 numTileListsDst, numJtilesDst,
1178 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1179 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1180 PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1186 if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists,
OVERALLOC);
1196 while (a >>= 1) k++;
1203 void CudaTileListKernel::sortTileLists(
1204 const bool useJtiles,
1205 const int begin_bit,
const bool highDepthBitsSetIn,
1207 const int numTileListsSrc,
const int numJtilesSrc,
1208 PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
1209 PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
1210 PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
1212 const int numTileListsDst,
const int numJtilesDst,
1213 PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
1214 PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
1215 PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
1216 cudaStream_t stream) {
1218 bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);
1224 if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
1225 numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
1226 (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
1227 (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
1228 NAMD_die(
"CudaTileListKernel::sortTileLists, Src allocated too small");
1230 if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
1231 numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
1232 (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
1233 (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
1234 NAMD_die(
"CudaTileListKernel::sortTileLists, Dst allocated too small");
1236 if (begin_bit != 0 && begin_bit != 16)
1237 NAMD_die(
"CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");
1240 int num_bit =
ilog2(maxTileListLen);
1242 NAMD_die(
"CudaTileListKernel::sortTileLists, num_bit overflow");
1243 int end_bit = begin_bit + num_bit;
1248 if (doOutputOrder && useJtiles) {
1261 bitshiftTileListDepth <<< nblock, nthread, 0, stream >>> (
numTileListsSrc,
begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1273 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1274 (
int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1276 if (size == 0) size = 128;
1277 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1278 size = tempStorageSize;
1279 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((
void *)tempStorage, size,
1280 (
int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
1288 storeInReverse <<< nblock, nthread, 0, stream >>> (
1290 tileListOrderSrc.ptr, tileListDepthSrc.ptr,
1291 tileListOrderDst.ptr, tileListDepthDst.ptr);
1297 maxTileListLen_sortKeys = maxTileListLen;
1299 reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
1300 clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen,
stream);
1304 reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);
1308 initMinMaxListLen <<< nblock, nthread, 0, stream >>> (numComputes, maxTileListLen, minmaxListLen);
1316 buildSortKeys <<< nblock, nthread, 0, stream >>> (
1317 numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
1318 tileListDepthDst.ptr, minmaxListLen, sortKeys);
1322 sortKeys_endbit =
ilog2(numTileListsDst);
1330 fillSortKeys <<< nblock, nthread, 0, stream >>> (numComputes, maxTileListLen, minmaxListLen, sortKeys);
1336 doOutputOrder =
false;
1338 }
else if (doOutputOrder) {
1343 int endbit_tmp =
ilog2(numTileListsSrc);
1352 buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>> (
numTileListsSrc, tileListDepthSrc.ptr,
begin_bit, sortKeySrc);
1356 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1363 sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>> (
1365 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1372 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1373 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1374 numTileListsSrc, 0, endbit_tmp, stream));
1376 if (size == 0) size = 128;
1377 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1378 size = tempStorageSize;
1379 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((
void *)tempStorage, size,
1380 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1381 numTileListsSrc, 0, endbit_tmp, stream));
1388 reOrderTileListDepth <<< nblock, nthread, 0, stream >>> (
numTileListsDst, tileListOrderDst.ptr,
1389 tileListDepthSrc.ptr, tileListDepthDst.ptr);
1396 if (sortKeys_endbit <= 0)
1397 NAMD_die(
"CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
1406 setupSortKey <<< nblock, nthread, 0, stream >>> (
numTileListsSrc, maxTileListLen_sortKeys,
1407 tileListsSrc.ptr, tileListDepthSrc.ptr,
begin_bit, sortKeys, sortKeySrc);
1413 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1420 unsigned int oobKey = (2 << sortKeys_endbit) - 1;
1421 sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>> (
1423 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1430 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs(NULL, size,
1431 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1432 numTileListsSrc, 0, sortKeys_endbit, stream));
1434 if (size == 0) size = 128;
1435 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1436 size = tempStorageSize;
1437 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairs((
void *)tempStorage, size,
1438 sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1439 numTileListsSrc, 0, sortKeys_endbit, stream));
1446 reOrderTileListDepth <<< nblock, nthread, 0, stream >>> (
numTileListsDst, tileListOrderDst.ptr, tileListDepthSrc.ptr, tileListDepthDst.ptr);
1457 doShiftDown =
false;
1470 if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
1475 sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>> (
1477 tileListOrderSrc.ptr, tileListOrderDst.ptr);
1485 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending(NULL, size,
1486 tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1487 numTileListsSrc, begin_bit, end_bit, stream));
1489 if (size == 0) size = 128;
1490 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1491 size = tempStorageSize;
1492 cudaCheck((cudaError_t)cub::DeviceRadixSort::SortPairsDescending((
void *)tempStorage, size,
1493 tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
1494 numTileListsSrc, begin_bit, end_bit, stream));
1510 reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1,
OVERALLOC);
1522 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum(NULL, size,
1523 (
int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1525 if (size == 0) size = 128;
1526 reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
1527 size = tempStorageSize;
1530 cudaCheck((cudaError_t)cub::DeviceScan::ExclusiveSum((
void *)tempStorage, size,
1531 (
int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
1545 repackTileListsKernel <<< nblock, nthread, 0, stream >>> (
1547 (useJtiles) ? jtiles : NULL,
1548 tileListsSrc.ptr, tileListsDst.ptr,
1549 patchPairsSrc.ptr, patchPairsDst.ptr,
1550 tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
1551 tileExclsSrc.ptr, tileExclsDst.ptr);
1558 clear_device_array<int>(patchNumLists, numPatches,
stream);
1563 calcPatchNumLists <<< nblock, nthread, 0, stream >>> (
numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
1567 clear_device_array<int>(&emptyPatches[numPatches], 1,
stream);
1571 setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>> (
numTileListsDst, tileListsDst.ptr, patchNumLists,
1572 numPatches, &emptyPatches[numPatches], emptyPatches);
1576 copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1,
stream);
1577 cudaCheck(cudaStreamSynchronize(stream));
1578 numEmptyPatches = h_emptyPatches[numPatches];
1588 int numTileListsPrev = numTileLists;
1591 if (!tileListStatEventRecord)
1592 NAMD_die(
"CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
1593 cudaCheck(cudaEventSynchronize(tileListStatEvent));
1604 sortTileLists(
true, 0,
true,
1605 numTileListsPrev, numJtiles,
1606 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1607 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1608 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
1609 numTileLists, numJtiles,
1610 PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
1611 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1612 PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
1628 reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS,
OVERALLOC);
1629 reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles,
OVERALLOC);
1631 sortTileLists(
true, 16,
true,
1632 numTileListsPrev, numJtiles,
1633 PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
1634 PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
1635 PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
1636 numTileListsGBIS, numJtiles,
1637 PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
1638 PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
1639 PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
1678 if (len > tileListVirialEnergySize) {
1679 NAMD_die(
"CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
1681 tileListVirialEnergyLength = len;
1685 if (len > tileListVirialEnergySize) {
1686 NAMD_die(
"CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
1688 tileListVirialEnergyGBISLength = len;
__global__ void reOrderTileListDepth(const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
BlockLoad::TempStorage load
__global__ void repackTileListsKernel(const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
__global__ void storeInReverse(const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
CudaTileListKernel(int deviceID, bool doStreaming)
void prepareTileList(cudaStream_t stream)
__global__ void setupSortKey(const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
void setTileListVirialEnergyLength(int len)
const int const int begin_bit
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ patchPairs
#define CALCPATCHNUMLISTSKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
void clearTileListStat(cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ tileListStat
__global__ void buildTileListsBBKernel(const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
void setTileListVirialEnergyGBISLength(int len)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__global__ void setPatchNumLists_findEmptyPatches(const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
#define REPACKTILELISTSKERNEL_NUM_WARP
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ tileListOrderSrc
cub::BlockRadixSort< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > BlockRadixSort
#define UPDATEPATCHESKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__thread cudaStream_t stream
__global__ void buildRemoveZerosSortKey(const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
__global__ void calcPatchNumLists(const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
#define SORTTILELISTSKERNEL_NUM_THREAD
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__global__ void const int const TileList *__restrict__ tileLists
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ tileListOrder
__global__ void updatePatchesKernel(const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoad
keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
__device__ __forceinline__ float distsq(const BoundingBox a, const float4 b)
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD
const int const int const int const keyT oobKey
__global__ void calcTileListPosKernel(const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
#define DEFAULTKERNEL_NUM_THREAD
#define WARP_BALLOT(MASK, P)
const int numTileListsDst
void finishTileList(cudaStream_t stream)
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void NAMD_die(const char *err_msg)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
BlockLoadU::TempStorage loadU
const int const int const int end_bit
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__global__ void buildBoundingBoxesKernel(const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
__global__ void const int numTileLists
#define BOUNDINGBOXKERNEL_NUM_WARP
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ tileListDepthDst
__shared__ union @43 tempStorage
BlockRadixSort::TempStorage sort
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ tileListDepth
__global__ void localSort(const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__thread DeviceCUDA * deviceCUDA
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ valT *__restrict__ tileListOrderDst typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU
#define TILELISTKERNELNEW_NUM_WARP
valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)
const int const int const int const keyT keyT *__restrict__ tileListDepthSrc
void reSortTileLists(const bool doGBIS, cudaStream_t stream)
__global__ void fillSortKeys(const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start