CudaTileListKernel.cu

// #include <map>
// #include <algorithm>
#include <cuda.h>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;

#define OVERALLOC 1.2f
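// Over-allocation factor for device arrays: the reallocate_device() calls
// below request 20% extra capacity (e.g. a request for 1000 elements reserves
// 1200) so that small growth between pairlist builds does not force a fresh
// allocation every time.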

#if __CUDA_ARCH__ < 350
#define __ldg *
#endif

void NAMD_die(const char *);

//
// Calculate the number of lists that contribute to each patch
//
__global__ void calcPatchNumLists(const int numTileLists, const int numPatches,
  const TileList* __restrict__ tileLists, int* __restrict__ patchNumLists) {

  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
  {
    int2 patchInd = tileLists[i].patchInd;
    atomicAdd(&patchNumLists[patchInd.x], 1);
    if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
  }

}

//
// Write patchNumList back to the tile lists and
// store the indices of empty patches into emptyPatches[0 ... numEmptyPatches-1]
//
__global__ void setPatchNumLists_findEmptyPatches(const int numTileLists,
  TileList* __restrict__ tileLists, const int* __restrict__ patchNumLists,
  const int numPatches, int* __restrict__ numEmptyPatches, int* __restrict__ emptyPatches) {

  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
  {
    int2 patchInd = tileLists[i].patchInd;
    int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
    tileLists[i].patchNumList = patchNumList;
  }

  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
  {
    if (patchNumLists[i] == 0) {
      int ind = atomicAdd(numEmptyPatches, 1);
      emptyPatches[ind] = i;
    }
  }

}

//
// Builds a sort key that removes zeros but keeps the order otherwise the same
//
__global__ void buildRemoveZerosSortKey(const int numTileLists,
  const unsigned int* __restrict__ tileListDepth, const int begin_bit, unsigned int* __restrict__ sortKey) {

  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
  {
    int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
    sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
  }

}
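// Example: with numTileLists = 6 and depths {2,0,5,0,1,3}, the keys above are
// {0,6,2,6,4,5}. A stable ascending radix sort on these keys packs the four
// non-empty lists to the front in their original relative order and pushes
// the two empty lists to the end.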

__global__ void setupSortKey(const int numTileLists, const int maxTileListLen,
  const TileList* __restrict__ tileLists, const unsigned int* __restrict__ tileListDepth,
  const int begin_bit, const unsigned int* __restrict__ sortKeys, unsigned int* __restrict__ sortKey) {

  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
  {
    int icompute = tileLists[itileList].icompute;
    int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
    int i = icompute*maxTileListLen + (depth - 1);
    sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
  }

}
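// The sortKeys[] lookup table used above is indexed by (icompute, depth):
// entry icompute*maxTileListLen + (depth-1) holds the sort position assigned
// to lists of that depth within that compute (built in buildSortKeys below).
// Empty lists get 0x7fffffff so they sort to the very end.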

template <int width>
__global__ void localSort(const int n, const int begin_bit, const int num_bit,
  unsigned int* __restrict__ keys, int* __restrict__ vals) {

  // NOTE: blockDim.x = width

  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
  {
    int i = base + threadIdx.x;
    typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage tempStorage;
    unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
    int val[1] = {(i < n) ? vals[i] : 0};
    BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
    if (i < n) {
      keys[i] = key[0];
      vals[i] = val[0];
    }
    BLOCK_SYNC;
  }

}

__global__ void storeInReverse(const int numTileListsSrc, const int begin_bit,
  const int* __restrict__ outputOrder, const int* __restrict__ tileListPos,
  const int* __restrict__ tileListOrderSrc,
  const unsigned int* __restrict__ tileListDepthSrc,
  int* __restrict__ tileListOrderDst,
  unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
  {
    int j = outputOrder[numTileListsSrc - i - 1];
    if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
      int k = tileListPos[i];
      tileListDepthDst[k] = tileListDepthSrc[j];
      tileListOrderDst[k] = j; //tileListOrderSrc[j];
    }
  }
}

//
// Store a 0/1 flag, in reverse outputOrder, that indicates whether the tile
// list is non-empty (i.e. whether its depth in bits [begin_bit, begin_bit+16) is non-zero)
//
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
  const int* __restrict__ outputOrder, const unsigned int* __restrict__ tileListDepthSrc,
  unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
  {
    int j = outputOrder[numTileLists - i - 1];
    tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
  }

}

__global__ void initMinMaxListLen(const int numComputes, const int maxTileListLen,
  int2* __restrict__ minmaxListLen) {

  int2 val;
  val.x = maxTileListLen+1;
  val.y = 0;
  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
  {
    minmaxListLen[i] = val;
  }

}

//
// Build sortKeys[], values are in range 0 ... numTileListsDst-1
//
__global__ void buildSortKeys(const int numTileListsDst, const int maxTileListLen,
  const TileList* __restrict__ tileListsSrc,
  const int* __restrict__ tileListOrderDst,
  const unsigned int* __restrict__ tileListDepthDst,
  int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
  {
    int k = tileListOrderDst[i];
    int icompute = tileListsSrc[k].icompute;
    int depth    = tileListDepthDst[i] & 65535;
    // depth is in range [1 ... maxTileListLen]
    int j        = icompute*maxTileListLen + (depth-1);
    sortKeys[j] = i;
    int2 minmax = minmaxListLen[icompute];
    int2 minmaxOrig = minmax;
    if (minmax.x > depth) minmax.x = depth;
    if (minmax.y < depth) minmax.y = depth;
    if (minmax.x != minmaxOrig.x) {
      atomicMin(&minmaxListLen[icompute].x, minmax.x);
    }
    if (minmax.y != minmaxOrig.y) {
      atomicMax(&minmaxListLen[icompute].y, minmax.y);
    }
  }

}

__global__ void fillSortKeys(const int numComputes, const int maxTileListLen,
  const int2* __restrict__ minmaxListLen, unsigned int* __restrict__ sortKeys) {

  int i = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;
  if (i < numComputes) {
    const int wid = threadIdx.x % WARPSIZE;
    int2 minmax = minmaxListLen[i];
    int minlen = minmax.x;
    int maxlen = minmax.y;
    // minlen, maxlen are in range [1 ... maxTileListLen]
    unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
    unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
    unsigned int aveKey = (maxKey + minKey)/2;
    for (int j=wid;j < minlen-1;j+=WARPSIZE) {
      sortKeys[i*maxTileListLen + j] = minKey;
    }
    for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
      sortKeys[i*maxTileListLen + j] = maxKey;
    }
    for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
      if (sortKeys[i*maxTileListLen + j] == 0) {
        sortKeys[i*maxTileListLen + j] = aveKey;
      }
    }
  }

}
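// Example: if a compute only produced lists of depth 3..5 (minlen = 3,
// maxlen = 5), the loops above fill the entries for depths 1..2 with the key
// of depth 3, the entries for depths 6..maxTileListLen with the key of depth
// 5, and any zero entries left in between with the average of the two, so
// that list lengths never seen in this build still map to a sensible position.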

//
// Calculate bounding boxes for sets of WARPSIZE=32 atoms
//
#define BOUNDINGBOXKERNEL_NUM_WARP 8
__global__ void
buildBoundingBoxesKernel(const int atomStorageSize, const float4* __restrict__ xyzq,
  BoundingBox* __restrict__ boundingBoxes) {

  const int warpId = threadIdx.x / WARPSIZE;
  const int wid = threadIdx.x % WARPSIZE;

  // Loop with warp-aligned index to avoid warp-divergence
  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
    // Full atom index
    const int i = iwarp + wid;
    // Bounding box index
    const int ibb = i/WARPSIZE;

    float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];

    volatile float3 minxyz, maxxyz;

    typedef cub::WarpReduce<float> WarpReduce;
    __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
    minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
    minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
    minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
    maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
    maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
    maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());

    if (wid == 0) {
      BoundingBox boundingBox;
      boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
      boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
      boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
      boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
      boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
      boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
      boundingBoxes[ibb] = boundingBox;
    }
  }

}
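// Example: if the 32 atoms of a warp span x in [1.0, 3.0], the box stores
// center x = 2.0 and half-width wx = 1.0 (and likewise for y and z).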

//
// Returns a lower bound for the squared distance between two bounding boxes
//
__device__ __forceinline__ float distsq(const BoundingBox a, const BoundingBox b) {
  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
  float r2 = dx*dx + dy*dy + dz*dz;
  return r2;
}
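// Worked example (hypothetical values): boxes centered at x = 0 and x = 5
// with half-widths wx = 1 give dx = max(0, |0-5| - 1 - 1) = 3, so distsq
// returns 9; the tile pair can be culled whenever cutoff2 <= 9. Overlapping
// boxes return 0, which is why this is only a lower bound.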

#if 0
//
// Performs warp-level exclusive sum on a shared memory array:
// sh_out[0 ... n-1] = exclusiveSum( sh_in[0 ... n-1] )
// sh_in and sh_out can point to same array
// Returns the total sum
//
template <typename T>
__device__ __forceinline__
int shWarpExclusiveSum(const int n, volatile T* sh_in, volatile int* sh_out) {
  const int wid = threadIdx.x % WARPSIZE;
  volatile int blockOffset = 0;
  for (int iblock=0;iblock < n;iblock += WARPSIZE) {
    // Size of the exclusive sum
    int blockLen = min(WARPSIZE, n-iblock);
    // Perform exclusive sum on sh_in[iblock ... iblock + blockLen-1]
    typedef cub::WarpScan<int> WarpScan;
    __shared__ typename WarpScan::TempStorage tempStorage;
    int data = (wid < blockLen) ? (int)sh_in[iblock + wid] : 0;
    WarpScan(tempStorage).ExclusiveSum(data, data);
    // Shift by block offset
    data += blockOffset;
    // Save last value
    int last = (int)sh_in[iblock + blockLen-1];
    // Write output
    if (wid < blockLen) sh_out[iblock + wid] = data;
    // block offset = last term of the exclusive sum + last value
    blockOffset = sh_out[iblock + blockLen-1] + last;
  }
  return blockOffset;
}
#endif

#define TILELISTKERNELNEW_NUM_WARP 4

//
// NOTE: Executed on a single thread block
//
template<int nthread>
__global__ void calcTileListPosKernel(const int numComputes,
  const CudaComputeRecord* __restrict__ computes,
  const CudaPatchRecord* __restrict__ patches,
  int* __restrict__ tilePos) {

  typedef cub::BlockScan<int, nthread> BlockScan;

  __shared__ typename BlockScan::TempStorage tempStorage;
  __shared__ int shTilePos0;

  if (threadIdx.x == nthread-1) {
    shTilePos0 = 0;
  }

  for (int base=0;base < numComputes;base+=nthread) {
    int k = base + threadIdx.x;

    int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;

    // Calculate positions in tile list and jtile list
    int tilePosVal;
    BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);

    // Store into global memory
    if (k < numComputes) {
      tilePos[k] = shTilePos0 + tilePosVal;
    }

    BLOCK_SYNC;
    // Store block end position
    if (threadIdx.x == nthread-1) {
      shTilePos0 += tilePosVal + numTiles1;
    }
  }
}
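// Example of the chunked scan above: with nthread = 4 and per-compute tile
// counts {3,1,2,2, 4,...}, the first pass writes tilePos = {0,3,4,6} and the
// last thread advances shTilePos0 to 8, so the second pass starts numbering
// at 8.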


template<int nthread>
__global__ void updatePatchesKernel(const int numComputes,
  const int* __restrict__ tilePos,
  const CudaComputeRecord* __restrict__ computes,
  const CudaPatchRecord* __restrict__ patches,
  TileList* __restrict__ tileLists) {

  const int tid = threadIdx.x % nthread;

  // nthread threads take care of one compute
  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
  {
    CudaComputeRecord compute = computes[k];
    float3 offsetXYZ = compute.offsetXYZ;
    int2 patchInd = compute.patchInd;
    int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
    int itileList0 = tilePos[k];
    for (int i=tid;i < numTiles1;i+=nthread) {
      tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
      tileLists[itileList0 + i].patchInd  = patchInd;
      tileLists[itileList0 + i].icompute  = k;
    }
  }

}

__host__ __device__ __forceinline__
int buildTileListsBBKernel_shmem_sizePerThread(const int maxTileListLen) {
  // Size in bytes
  int size = (
    maxTileListLen*sizeof(char)
    );
  return size;
}
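// Example: with maxTileListLen = 64 each thread needs 64 bytes for its
// sh_tile[] flags, i.e. 8 KB of dynamic shared memory for the 128-thread
// (4-warp, WARPSIZE = 32) block configuration used below.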

__global__ void
buildTileListsBBKernel(const int numTileLists,
  TileList* __restrict__ tileLists,
  const CudaPatchRecord* __restrict__ patches,
  const int* __restrict__ tileListPos,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2, const int maxTileListLen,
  const BoundingBox* __restrict__ boundingBoxes,
  int* __restrict__ tileJatomStart,
  const int tileJatomStartSize,
  unsigned int* __restrict__ tileListDepth,
  int* __restrict__ tileListOrder,
  PatchPairRecord* __restrict__ patchPairs,
  TileListStat* __restrict__ tileListStat) {

  extern __shared__ char sh_buffer[];
  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
  int pos = threadIdx.x*sizePerThread;
  volatile char* sh_tile = (char*)&sh_buffer[pos];

  // Loop with warp-aligned index to avoid warp-divergence
  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {

    // Use one thread per tile list
    const int wid = threadIdx.x % WARPSIZE;
    const int itileList = iwarp + wid;

    int i;
    int itileListLen = 0;
    CudaPatchRecord patch1;
    CudaPatchRecord patch2;
    float3 offsetXYZ;
    int2 patchInd;
    int numTiles2;
    int icompute;

    if (itileList < numTileLists) {
      offsetXYZ = tileLists[itileList].offsetXYZ;
      patchInd  = tileLists[itileList].patchInd;
      icompute  = tileLists[itileList].icompute;
      // Get i-column
      i = itileList - tileListPos[icompute];

      float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
      float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
      float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;

      // DH - set zeroShift flag if magnitude of shift vector is zero
      bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);

      // Load patches
      patch1 = patches[patchInd.x];
      patch2 = patches[patchInd.y];
      // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
      numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
      int tileStart1 = patch1.atomStart/WARPSIZE;
      int tileStart2 = patch2.atomStart/WARPSIZE;

      // DH - self requires that zeroShift is also set
      bool self = zeroShift && (tileStart1 == tileStart2);

      // Load i-atom data (and shift coordinates)
      BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
      boundingBoxI.x += shx;
      boundingBoxI.y += shy;
      boundingBoxI.z += shz;

      for (int j=0;j < numTiles2;j++) {
        sh_tile[j] = 0;
        if (!self || j >= i) {
          BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
          float r2bb = distsq(boundingBoxI, boundingBoxJ);
          if (r2bb < cutoff2) {
            sh_tile[j] = 1;
            itileListLen++;
          }
        }
      }

      tileListDepth[itileList] = (unsigned int)itileListLen;
      tileListOrder[itileList] = itileList;
    }

    typedef cub::WarpScan<int> WarpScan;
    __shared__ typename WarpScan::TempStorage tempStorage;
    int active = (itileListLen > 0);
    int activePos;
    WarpScan(tempStorage).ExclusiveSum(active, activePos);
    int itileListPos;
    WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);

    int jtileStart, numJtiles;
    // Last thread in the warp knows the total number
    if (wid == WARPSIZE-1) {
      atomicAdd(&tileListStat->numTileLists, activePos + active);
      numJtiles = itileListPos + itileListLen;
      jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
    }
    numJtiles  = cub::ShuffleIndex(numJtiles,  WARPSIZE-1, WARPSIZE, WARP_FULL_MASK);
    jtileStart = cub::ShuffleIndex(jtileStart, WARPSIZE-1, WARPSIZE, WARP_FULL_MASK);
    if (jtileStart + numJtiles > tileJatomStartSize) {
      // tileJatomStart out of memory, exit
      if (wid == 0) tileListStat->tilesSizeExceeded = true;
      return;
    }

    int jStart = itileListPos;
    int jEnd   = cub::ShuffleDown(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
    if (wid == WARPSIZE-1) jEnd = numJtiles;

    if (itileListLen > 0) {
      // Setup tileLists[]
      TileList TLtmp;
      TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
      TLtmp.jtileStart = jtileStart + jStart;
      TLtmp.jtileEnd   = jtileStart + jEnd - 1;
      TLtmp.patchInd   = patchInd;
      TLtmp.offsetXYZ  = offsetXYZ;
      TLtmp.icompute   = icompute;
      // TLtmp.patchNumList.x = 0;
      // TLtmp.patchNumList.y = 0;
      tileLists[itileList] = TLtmp;
      // PatchPair
      PatchPairRecord patchPair;
      patchPair.iatomSize     = patch1.atomStart + patch1.numAtoms;
      patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
      patchPair.jatomSize     = patch2.atomStart + patch2.numAtoms;
      patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
      patchPairs[itileList] = patchPair;

      // Write tiles
      int jtile = jtileStart + jStart;
      for (int j=0;j < numTiles2;j++) {
        if (sh_tile[j]) {
          tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
          jtile++;
        }
      }

    }

  }

}
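// Note on the two warp scans in buildTileListsBBKernel: the exclusive sum over
// `active` lets lane 31 add the warp's count of non-empty lists to
// tileListStat->numTileLists with a single atomic, while the exclusive sum
// over itileListLen gives each lane its contiguous slice [jStart, jEnd) of
// the warp's jtile output; the global base jtileStart is reserved with one
// atomicAdd and broadcast to all lanes via ShuffleIndex.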

#define REPACKTILELISTSKERNEL_NUM_WARP 32
__global__ void
repackTileListsKernel(const int numTileLists, const int begin_bit, const int* __restrict__ tileListPos,
  const int* __restrict__ tileListOrder,
  const int* __restrict__ jtiles,
  const TileList* __restrict__ tileListsSrc, TileList* __restrict__ tileListsDst,
  const PatchPairRecord* __restrict__ patchPairsSrc, PatchPairRecord* __restrict__ patchPairsDst,
  const int* __restrict__ tileJatomStartSrc, int* __restrict__ tileJatomStartDst,
  const TileExcl* __restrict__ tileExclsSrc, TileExcl* __restrict__ tileExclsDst) {

  const int wid = threadIdx.x % WARPSIZE;

  // One warp does one tile list
  for (int i = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;i < numTileLists;i+=blockDim.x*gridDim.x/WARPSIZE)
  {
    int j = tileListOrder[i];
    int start = tileListPos[i];
    int end   = tileListPos[i+1]-1;
    if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
    // TileList
    int startOld   = __ldg(&tileListsSrc[j].jtileStart);
    int endOld     = __ldg(&tileListsSrc[j].jtileEnd);
    int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
    float3 offsetXYZ;
    offsetXYZ.x  = __ldg(&tileListsSrc[j].offsetXYZ.x);
    offsetXYZ.y  = __ldg(&tileListsSrc[j].offsetXYZ.y);
    offsetXYZ.z  = __ldg(&tileListsSrc[j].offsetXYZ.z);
    int2 patchInd = tileListsSrc[j].patchInd;
    int icompute = __ldg(&tileListsSrc[j].icompute);
    if (wid == 0) {
      TileList tileList;
      tileList.iatomStart = iatomStart;
      tileList.offsetXYZ  = offsetXYZ;
      tileList.jtileStart = start;
      tileList.jtileEnd   = end;
      tileList.patchInd   = patchInd;
      tileList.icompute   = icompute;
      tileListsDst[i] = tileList;
    }

    if (jtiles == NULL) {
      // No jtiles, simple copy will do
      int jtile = start;
      for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
        if (jtileOld + wid <= endOld) {
          tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
        }
      }
      if (tileExclsSrc != NULL) {
        int jtile = start;
        for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
          tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
        }
      }
    } else {
      int jtile0 = start;
      for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
        int t = jtileOld + wid;
        int jtile = (t <= endOld) ? jtiles[t] : 0;
        jtile >>= begin_bit;
        jtile &= 65535;
        typedef cub::WarpScan<int> WarpScan;
        __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
        int warpId = threadIdx.x / WARPSIZE;
        int jtilePos;
        WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);

        if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);

        if (tileExclsSrc != NULL) {
          unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
          while (b != 0) {
            // k = index of thread that has data
            int k = __ffs(b) - 1;
            tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
            // remove 1 bit and advance jtile0
            b ^= ((unsigned int)1 << k);
            jtile0++;
          }
        } else {
          jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
        }
      }
    }
  }

}
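// Example of the ballot/popc compaction above (assuming jtiles[] holds 0/1
// keep-flags): if in one 32-wide pass only lanes 3 and 7 keep their jtiles,
// WARP_BALLOT returns 0x88, the warp scan assigns them output offsets 0 and 1
// relative to jtile0, and jtile0 then advances by __popc(0x88) = 2.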

//
// NOTE: Executed on a single thread block
// oobKey = out-of-bounds key value
//
#define SORTTILELISTSKERNEL_NUM_THREAD 512
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22
template <typename keyT, typename valT, bool ascend>
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__
void sortTileListsKernel(const int numTileListsSrc, const int numTileListsDst,
  const int begin_bit, const int end_bit, const keyT oobKey,
  keyT* __restrict__ tileListDepthSrc, keyT* __restrict__ tileListDepthDst,
  valT* __restrict__ tileListOrderSrc, valT* __restrict__ tileListOrderDst) {

  typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU;

  typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD,
  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;

  typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD,
  SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort;

  __shared__ union {
    typename BlockLoad::TempStorage      load;
    typename BlockLoadU::TempStorage     loadU;
    typename BlockRadixSort::TempStorage sort;
  } tempStorage;

  keyT keys[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];
  valT values[SORTTILELISTSKERNEL_ITEMS_PER_THREAD];

  BlockLoadU(tempStorage.loadU).Load(tileListDepthSrc, keys, numTileListsSrc, oobKey);
  BLOCK_SYNC;
  BlockLoad(tempStorage.load).Load(tileListOrderSrc, values, numTileListsSrc);
  BLOCK_SYNC;

  if (ascend)
    BlockRadixSort(tempStorage.sort).SortBlockedToStriped(keys, values, begin_bit, end_bit);
  else
    BlockRadixSort(tempStorage.sort).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);

  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListDepthDst, keys, numTileListsDst);
  cub::StoreDirectStriped<SORTTILELISTSKERNEL_NUM_THREAD>(threadIdx.x, tileListOrderDst, values, numTileListsDst);
}

__global__ void reOrderTileListDepth(const int numTileLists, const int* __restrict__ tileListOrder,
  unsigned int* __restrict__ tileListDepthSrc, unsigned int* __restrict__ tileListDepthDst) {

  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
  {
    int j = tileListOrder[i];
    tileListDepthDst[i] = tileListDepthSrc[j];
  }

}

//
// Bit shift tileListDepth so that only lower 16 bits are used
//
__global__ void bitshiftTileListDepth(const int numTileLists, const int begin_bit,
  unsigned int* __restrict__ tileListDepth) {

  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
  {
    unsigned int a = tileListDepth[itileList];
    a >>= begin_bit;
    a &= 65535;
    tileListDepth[itileList] = a;
  }

}

// ##############################################################################################
// ##############################################################################################
// ##############################################################################################

CudaTileListKernel::CudaTileListKernel(int deviceID, bool doStreaming) :
deviceID(deviceID), doStreaming(doStreaming) {

  cudaCheck(cudaSetDevice(deviceID));

  activeBuffer = 1;

  numPatches = 0;
  numComputes = 0;

  cudaPatches = NULL;
  cudaPatchesSize = 0;

  cudaComputes = NULL;
  cudaComputesSize = 0;

  patchNumLists = NULL;
  patchNumListsSize = 0;

  emptyPatches = NULL;
  emptyPatchesSize = 0;
  h_emptyPatches = NULL;
  h_emptyPatchesSize = 0;
  numEmptyPatches = 0;

  sortKeySrc = NULL;
  sortKeySrcSize = 0;
  sortKeyDst = NULL;
  sortKeyDstSize = 0;

  tileLists1 = NULL;
  tileLists1Size = 0;
  tileLists2 = NULL;
  tileLists2Size = 0;

  patchPairs1 = NULL;
  patchPairs1Size = 0;
  patchPairs2 = NULL;
  patchPairs2Size = 0;

  tileJatomStart1 = NULL;
  tileJatomStart1Size = 0;
  tileJatomStart2 = NULL;
  tileJatomStart2Size = 0;

  boundingBoxes = NULL;
  boundingBoxesSize = 0;

  tileListDepth1 = NULL;
  tileListDepth1Size = 0;
  tileListDepth2 = NULL;
  tileListDepth2Size = 0;

  tileListOrder1 = NULL;
  tileListOrder1Size = 0;
  tileListOrder2 = NULL;
  tileListOrder2Size = 0;

  tileExcls1 = NULL;
  tileExcls1Size = 0;
  tileExcls2 = NULL;
  tileExcls2Size = 0;

  xyzq = NULL;
  xyzqSize = 0;

  allocate_device<TileListStat>(&d_tileListStat, 1);
  allocate_host<TileListStat>(&h_tileListStat, 1);

  tileListPos = NULL;
  tileListPosSize = 0;
  tempStorage = NULL;
  tempStorageSize = 0;

  jtiles = NULL;
  jtilesSize = 0;

  tilePos = NULL;
  tilePosSize = 0;

  tileListsGBIS = NULL;
  tileListsGBISSize = 0;

  tileJatomStartGBIS = NULL;
  tileJatomStartGBISSize = 0;

  tileListVirialEnergy = NULL;
  tileListVirialEnergySize = 0;

  atomStorageSize = 0;
  numTileLists = 0;
  numTileListsGBIS = 0;
  numJtiles = 1;

  outputOrder = NULL;
  outputOrderSize = 0;
  doOutputOrder = false;

  minmaxListLen = NULL;
  minmaxListLenSize = 0;

  sortKeys = NULL;
  sortKeysSize = 0;
  sortKeys_endbit = 0;

  cudaCheck(cudaEventCreate(&tileListStatEvent));
  tileListStatEventRecord = false;
}

CudaTileListKernel::~CudaTileListKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<TileListStat>(&d_tileListStat);
  deallocate_host<TileListStat>(&h_tileListStat);
  //
  if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
  if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
  if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
  if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
  if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
  //
  if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
  if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
  if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
  if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
  if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
  if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
  if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
  if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
  if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
  if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
  if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
  if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
  if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
  if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
  if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
  if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
  if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
  if (jtiles != NULL) deallocate_device<int>(&jtiles);
  if (tilePos != NULL) deallocate_device<int>(&tilePos);

  if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
  if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);

  if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);

  if (xyzq != NULL) deallocate_device<float4>(&xyzq);

  if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
  if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);

  cudaCheck(cudaEventDestroy(tileListStatEvent));
}

void CudaTileListKernel::prepareTileList(cudaStream_t stream) {
  clear_device_array<int>(jtiles, numJtiles, stream);
}

void CudaTileListKernel::clearTileListStat(cudaStream_t stream) {
  // Clear tileListStat; patchReadyQueueCount is set equal to the number of empty patches
  memset(h_tileListStat, 0, sizeof(TileListStat));
  h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
  copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
}

void CudaTileListKernel::finishTileList(cudaStream_t stream) {
  copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
  cudaCheck(cudaEventRecord(tileListStatEvent, stream));
  tileListStatEventRecord = true;
}

void CudaTileListKernel::updateComputes(const int numComputesIn,
  const CudaComputeRecord* h_cudaComputes, cudaStream_t stream) {

  numComputes = numComputesIn;

  reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
  copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);

  if (doStreaming) doOutputOrder = true;
}

void CudaTileListKernel::writeTileList(const char* filename, const int numTileLists,
  const TileList* d_tileLists, cudaStream_t stream) {

  TileList* h_tileLists = new TileList[numTileLists];
  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE* handle = fopen(filename,"wt");
  for (int itileList=0;itileList < numTileLists;itileList++) {
    TileList tmp = h_tileLists[itileList];
    fprintf(handle, "%d %d %d %f %f %f %d %d %d %d\n",
      tmp.iatomStart, tmp.jtileStart, tmp.jtileEnd, tmp.offsetXYZ.x, tmp.offsetXYZ.y,
      tmp.offsetXYZ.z, tmp.patchInd.x, tmp.patchInd.y, tmp.patchNumList.x, tmp.patchNumList.y);
  }
  fclose(handle);
  delete [] h_tileLists;
}

void CudaTileListKernel::writeTileJatomStart(const char* filename, const int numJtiles,
  const int* d_tileJatomStart, cudaStream_t stream) {

  int* h_tileJatomStart = new int[numJtiles];
  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE* handle = fopen(filename,"wt");
  for (int i=0;i < numJtiles;i++) {
    fprintf(handle, "%d\n", h_tileJatomStart[i]);
  }
  fclose(handle);
  delete [] h_tileJatomStart;
}

/*
std::pair<int, int> flip_pair(const std::pair<int, int> &p)
{
    return std::pair<int, int>(p.second, p.first);
}

void CudaTileListKernel::markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
  const int numJtiles, int* d_tileJatomStart, cudaStream_t stream) {

  const int shCacheSize = 10;
  TileList* h_tileLists = new TileList[numTileLists];
  int* h_tileJatomStart = new int[numJtiles];
  copy_DtoH<TileList>(d_tileLists, h_tileLists, numTileLists, stream);
  copy_DtoH<int>(d_tileJatomStart, h_tileJatomStart, numJtiles, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  int ntotal = 0;
  int ncache = 0;
  for (int i=0;i < numTileLists;i+=width) {
    int jend = min(i + width, numTileLists);
    std::map<int, int> atomStartMap;
    std::map<int, int>::iterator it;
    atomStartMap.clear();
    for (int j=i;j < jend;j++) {
      TileList tmp = h_tileLists[j];
      int iatomStart = tmp.iatomStart;
      it = atomStartMap.find(iatomStart);
      if (it == atomStartMap.end()) {
        // Insert new
        atomStartMap.insert( std::pair<int, int>(iatomStart, 0) );
      } else {
        // Decrement counter
        it->second--;
      }
      int jtileStart = tmp.jtileStart;
      int jtileEnd   = tmp.jtileEnd;
      ntotal += (jtileEnd - jtileStart + 1) + 1;
      for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
        int jatomStart = h_tileJatomStart[jtile];
        it = atomStartMap.find(jatomStart);
        if (it == atomStartMap.end()) {
          // Insert new
          atomStartMap.insert( std::pair<int, int>(jatomStart, 0) );
        } else {
          // Decrement counter
          it->second--;
        }
        jatomStart |= (65535 << 16);
        h_tileJatomStart[jtile] = jatomStart;
      }
      iatomStart |= (65535 << 16);
      tmp.iatomStart = iatomStart;
      h_tileLists[j] = tmp;
    }
    ncache += atomStartMap.size();
    std::multimap<int, int> imap;
    imap.clear();
    std::multimap<int, int>::iterator imap_it;
    std::transform(atomStartMap.begin(), atomStartMap.end(), std::inserter(imap, imap.begin()), flip_pair);
    if (i < 400) {
      printf("%d %d\n", ntotal, imap.size());
      for (imap_it = imap.begin();imap_it != imap.end();imap_it++) {
        if (imap_it->first != 0)
          printf("(%d %d)\n", imap_it->first, imap_it->second);
      }
    }
  }
  printf("ntotal %d ncache %d\n", ntotal, ncache);
  copy_HtoD<TileList>(h_tileLists, d_tileLists, numTileLists, stream);
  copy_HtoD<int>(h_tileJatomStart, d_tileJatomStart, numJtiles, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_tileLists;
  delete [] h_tileJatomStart;
}
*/

void CudaTileListKernel::buildTileLists(const int numTileListsPrev,
  const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
  const float3 lata, const float3 latb, const float3 latc,
  const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
  cudaStream_t stream) {

  numPatches = numPatchesIn;
  atomStorageSize = atomStorageSizeIn;
  maxTileListLen = maxTileListLenIn;
  plcutoff2 = plcutoff2In;

  if (doStreaming) {
    // Re-allocate patchNumLists
    reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
    reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
    reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
  }

  // Re-allocate (tileLists1, patchPairs1)
  reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
  reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);

  // Copy cudaPatches to device
  reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
  copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);

  // Re-allocate temporary storage
  reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
  // Calculate tile list positions (tilePos)
  {
    int nthread = 1024;
    int nblock = 1;
    calcTileListPosKernel<1024> <<< nblock, nthread, 0, stream >>>
    (numComputes, cudaComputes, cudaPatches, tilePos);
    cudaCheck(cudaGetLastError());
  }

  // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
  {
    int nthread = 512;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/32)+1);
    updatePatchesKernel<32> <<< nblock, nthread, 0, stream >>>
    (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
    cudaCheck(cudaGetLastError());
  }

  // ---------------------------------------------------------------------------------------------


  // NOTE: tileListDepth2 and tileListOrder2 must be at least as large as
  // tileListDepth1 and tileListOrder1 since they're used in sorting
  reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
  reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);

  // Allocate with +1 to include last term in the exclusive sum
  reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);

  reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);

  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);

  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);

  // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
  {
    int numBoundingBoxes = atomStorageSize/WARPSIZE;
    reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);

    int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
    int nthread = WARPSIZE*nwarp;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
    buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
    cudaCheck(cudaGetLastError());
  }

  {
    int nwarp = TILELISTKERNELNEW_NUM_WARP;
    int nthread = WARPSIZE*nwarp;
    int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);

    int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;

    // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
    //       tell the required size in h_tileListStat->numJtiles. In subsequent calls,
    //       re-allocation only happens when the size is exceeded.
    h_tileListStat->tilesSizeExceeded = true;
    int reallocCount = 0;
    while (h_tileListStat->tilesSizeExceeded) {
      reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);

      clearTileListStat(stream);
      // clear_device_array<TileListStat>(d_tileListStat, 1, stream);

      buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>>
      (numTileListsPrev, tileLists1, cudaPatches, tilePos,
        lata, latb, latc, plcutoff2, maxTileListLen,
        boundingBoxes, tileJatomStart1, tileJatomStart1Size,
        tileListDepth1, tileListOrder1, patchPairs1,
        d_tileListStat);

      cudaCheck(cudaGetLastError());

      // get (numATileLists, numJtiles, tilesSizeExceeded)
      copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
      cudaCheck(cudaStreamSynchronize(stream));
      numJtiles = h_tileListStat->numJtiles;

      if (h_tileListStat->tilesSizeExceeded) {
        reallocCount++;
        if (reallocCount > 1) {
          NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
        }
      }

    }

    numTileLists = h_tileListStat->numTileLists;

    reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
  }

  // Re-allocate tileListVirialEnergy.
  // NOTE: Since numTileLists here is an upper estimate (it's based on bounding boxes),
  //       we're guaranteed to have enough space
  reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);

  reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
  reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
  reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
  reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
  reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);

  int numTileListsSrc = numTileListsPrev;
  int numJtilesSrc    = numJtiles;
  int numTileListsDst = numTileLists;
  int numJtilesDst    = numJtiles;

  // Sort tiles
  sortTileLists(
    false,
    0, false,
    numTileListsSrc, numJtilesSrc,
    PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
    PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
    PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
    numTileListsDst, numJtilesDst,
    PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
    PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
    PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
    stream);

  // Set active buffer to 2
  setActiveBuffer(2);

  if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
}

//
// Returns the number of bits needed to represent a, i.e. floor(log2(a)) + 1
//
int ilog2(int a) {
  // if (a < 0)
  //   NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
  int k = 1;
  while (a >>= 1) k++;
  return k;
}
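// Example: ilog2(1) = 1, ilog2(5) = 3, ilog2(8) = 4; the result is the bit
// width of a, which is what the radix sorts below use as their end bit.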

//
// Sort tile lists
//
void CudaTileListKernel::sortTileLists(
  const bool useJtiles,
  const int begin_bit, const bool highDepthBitsSetIn,
  // Source
  const int numTileListsSrc, const int numJtilesSrc,
  PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
  PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
  PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
  // Destination
  const int numTileListsDst, const int numJtilesDst,
  PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
  PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
  PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
  cudaStream_t stream) {

  bool doShiftDown = (begin_bit != 0 || highDepthBitsSetIn);

  // if (numTileListsDst == 0)
  //   NAMD_die("CudaTileListKernel::sortTileLists, numTileListsDst = 0");

  // Check that the array sizes are adequate
  if (numTileListsSrc > tileListsSrc.size || numJtilesSrc > tileJatomStartSrc.size ||
    numTileListsSrc > tileListDepthSrc.size || numTileListsSrc > tileListOrderSrc.size ||
    (patchPairsSrc.ptr != NULL && numTileListsSrc > patchPairsSrc.size) ||
    (tileExclsSrc.ptr != NULL && numJtilesSrc > tileExclsSrc.size))
    NAMD_die("CudaTileListKernel::sortTileLists, Src allocated too small");

  if (numTileListsDst > tileListsDst.size || numJtilesDst > tileJatomStartDst.size ||
    numTileListsSrc > tileListDepthDst.size || numTileListsSrc > tileListOrderDst.size ||
    (patchPairsDst.ptr != NULL && numTileListsDst > patchPairsDst.size) ||
    (tileExclsDst.ptr != NULL && numJtilesDst > tileExclsDst.size))
    NAMD_die("CudaTileListKernel::sortTileLists, Dst allocated too small");

  if (begin_bit != 0 && begin_bit != 16)
    NAMD_die("CudaTileListKernel::sortTileLists, begin_bit must be 0 or 16");

  // Number of bits needed in the sort
  int num_bit = ilog2(maxTileListLen);
  if (num_bit > 16)
    NAMD_die("CudaTileListKernel::sortTileLists, num_bit overflow");
  int end_bit = begin_bit + num_bit;

  if (doStreaming)
  {
    // ----------------------------------------------------------------------------------------
    if (doOutputOrder && useJtiles) {
      // outputOrder has been produced, put tile lists back in reverse order and produce sortKeys
      // NOTE: This is done very infrequently, typically only once when the MD run starts.

      // Calculate position from depth
      {
        // -----------------------------------------------------------------------------
        // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
        // -----------------------------------------------------------------------------
        if (doShiftDown)
        {
          int nthread = 1024;
          int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
          bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
          (numTileListsSrc, begin_bit, outputOrder, tileListDepthSrc.ptr, tileListDepthDst.ptr);
          cudaCheck(cudaGetLastError());
        }

        reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsSrc, OVERALLOC);

        // --------------------------------------------------------------------
        // Compute itileList positions to store tileLists
        // ExclusiveSum(tileListDepthDst[0...numTileListsDst-1])
        // --------------------------------------------------------------------
        {
          size_t size = 0;
          cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
            (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
          // Make sure tempStorage doesn't remain NULL
          if (size == 0) size = 128;
          reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
          size = tempStorageSize;
          cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
            (int *)tileListDepthDst.ptr, tileListPos, numTileListsSrc, stream));
        }
      }

      // Store in reverse order from outputOrder
      {
        int nthread = 1024;
        int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
        storeInReverse <<< nblock, nthread, 0, stream >>>
        (numTileListsSrc, begin_bit, outputOrder, tileListPos,
          tileListOrderSrc.ptr, tileListDepthSrc.ptr,
          tileListOrderDst.ptr, tileListDepthDst.ptr);
        cudaCheck(cudaGetLastError());
      }

      // Build sortKeys
      {
        maxTileListLen_sortKeys = maxTileListLen;

        reallocate_device<unsigned int>(&sortKeys, &sortKeysSize, numComputes*maxTileListLen);
        clear_device_array<unsigned int>(sortKeys, numComputes*maxTileListLen, stream);

        // Re-allocate and initialize minmaxListLen
        {
          reallocate_device<int2>(&minmaxListLen, &minmaxListLenSize, numComputes);

          int nthread = 1024;
          int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nthread+1);
          initMinMaxListLen <<< nblock, nthread, 0, stream >>>
          (numComputes, maxTileListLen, minmaxListLen);
          cudaCheck(cudaGetLastError());
        }

        // Build sortKeys and calculate minmaxListLen
        {
          int nthread = 1024;
          int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
          buildSortKeys <<< nblock, nthread, 0, stream >>>
          (numTileListsDst, maxTileListLen, tileListsSrc.ptr, tileListOrderDst.ptr,
            tileListDepthDst.ptr, minmaxListLen, sortKeys);
          cudaCheck(cudaGetLastError());

          // Maximum value in sortKeys[] is numTileListsDst - 1
          sortKeys_endbit = ilog2(numTileListsDst);
        }

        // Fill in missing sortKeys using minmaxListLen
        {
          int nthread = 1024;
          int nwarp = nthread/WARPSIZE;
          int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/nwarp+1);
          fillSortKeys <<< nblock, nthread, 0, stream >>>
          (numComputes, maxTileListLen, minmaxListLen, sortKeys);
          cudaCheck(cudaGetLastError());
        }

      }

      doOutputOrder = false;

    } else if (doOutputOrder) {
      // outputOrder will be produced in the next pairlist non-bonded kernel.
      // This time just remove zero-length lists
      // NOTE: This is done very infrequently, typically only once when the MD run starts.

      int endbit_tmp = ilog2(numTileListsSrc);

      // Remove zeros
      {
        reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
        reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);

        int nthread = 1024;
        int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
        buildRemoveZerosSortKey <<< nblock, nthread, 0, stream >>>
        (numTileListsSrc, tileListDepthSrc.ptr, begin_bit, sortKeySrc);
        cudaCheck(cudaGetLastError());
      }

      if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
      {
        // Short list, sort within a single thread block
01313         int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
01314         int nblock = 1;
01315 
01316         unsigned int oobKey = numTileListsSrc;
01317         sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
01318         (numTileListsSrc, numTileListsDst, 0, endbit_tmp, oobKey, sortKeySrc, sortKeyDst,
01319           tileListOrderSrc.ptr, tileListOrderDst.ptr);
01320         cudaCheck(cudaGetLastError());
01321       }
01322       else
01323       {
01324         // Long list, sort on multiple thread blocks
01325         size_t size = 0;
01326         cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
01327           sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
01328           numTileListsSrc, 0, endbit_tmp, stream));
01329         // Make sure tempStorage doesn't remain NULL
01330         if (size == 0) size = 128;
01331         reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
01332         size = tempStorageSize;
01333         cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
01334           sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
01335           numTileListsSrc, 0, endbit_tmp, stream));
01336       }
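      // [Illustrative sketch, not part of the original control flow] CUB
      // device-wide primitives are called twice: a first call with a NULL
      // temp-storage pointer only reports the workspace size, and a second
      // call with the allocated workspace does the work. The standalone
      // example below (guarded out, hypothetical array names) shows the same
      // pattern without the reallocation bookkeeping used above.
#if 0
#include <cub/device/device_radix_sort.cuh>

void exampleSortPairs(unsigned int* d_keysIn, unsigned int* d_keysOut,
  int* d_valsIn, int* d_valsOut, int n, int endBit, cudaStream_t stream) {
  // Phase 1: query the required temporary storage size (no sorting done)
  size_t tempBytes = 0;
  cub::DeviceRadixSort::SortPairs(NULL, tempBytes,
    d_keysIn, d_keysOut, d_valsIn, d_valsOut, n, 0, endBit, stream);
  // Phase 2: allocate the workspace and perform the actual sort
  void* d_temp = NULL;
  cudaMalloc(&d_temp, tempBytes);
  cub::DeviceRadixSort::SortPairs(d_temp, tempBytes,
    d_keysIn, d_keysOut, d_valsIn, d_valsOut, n, 0, endBit, stream);
  cudaFree(d_temp);
}
#endif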
01337 
01338       // Re-order tileListDepth using tileListOrderDst
01339       {
01340         int nthread = 1024;
01341         int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
01342         reOrderTileListDepth <<< nblock, nthread, 0, stream >>> 
01343         (numTileListsDst, tileListOrderDst.ptr,
01344           tileListDepthSrc.ptr, tileListDepthDst.ptr);
01345         cudaCheck(cudaGetLastError());
01346       }
01347 
01348     } else {
01349       // This is done during the regular MD cycle
01350 
01351       if (sortKeys_endbit <= 0)
01352         NAMD_die("CudaTileListKernel::sortTileLists, sortKeys not produced or invalid sortKeys_endbit");
01353 
01354       // Setup sort keys
01355       {
01356         reallocate_device<unsigned int>(&sortKeySrc, &sortKeySrcSize, numTileListsSrc, OVERALLOC);
01357         reallocate_device<unsigned int>(&sortKeyDst, &sortKeyDstSize, numTileListsSrc, OVERALLOC);
01358 
01359         int nthread = 1024;
01360         int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsSrc-1)/nthread+1);
01361         setupSortKey <<< nblock, nthread, 0, stream >>> 
01362         (numTileListsSrc, maxTileListLen_sortKeys, tileListsSrc.ptr, tileListDepthSrc.ptr, begin_bit, sortKeys, sortKeySrc);
01363         cudaCheck(cudaGetLastError());
01364 
01365       }
01366 
01367       // Global sort
01368       if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
01369       // if (false)
01370       {
01371         // Short list, sort within a single thread block
01372         int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
01373         int nblock = 1;
01374 
01375         unsigned int oobKey = (2 << sortKeys_endbit) - 1;
01376         sortTileListsKernel <unsigned int, int, true> <<< nblock, nthread, 0, stream >>>
01377         (numTileListsSrc, numTileListsDst, 0, sortKeys_endbit, oobKey, sortKeySrc, sortKeyDst,
01378           tileListOrderSrc.ptr, tileListOrderDst.ptr);
01379         cudaCheck(cudaGetLastError());
01380       }
01381       else
01382       {
01383         // Long list, sort on multiple thread blocks
01384         size_t size = 0;
01385         cudaCheck(cub::DeviceRadixSort::SortPairs(NULL, size,
01386           sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
01387           numTileListsSrc, 0, sortKeys_endbit, stream));
01388         // Make sure tempStorage doesn't remain NULL
01389         if (size == 0) size = 128;
01390         reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
01391         size = tempStorageSize;
01392         cudaCheck(cub::DeviceRadixSort::SortPairs((void *)tempStorage, size,
01393           sortKeySrc, sortKeyDst, tileListOrderSrc.ptr, tileListOrderDst.ptr,
01394           numTileListsSrc, 0, sortKeys_endbit, stream));
01395       }
01396 
01397       // Re-order tileListDepth using tileListOrderDst
01398       {
01399         int nthread = 1024;
01400         int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
01401         reOrderTileListDepth <<< nblock, nthread, 0, stream >>> 
01402         (numTileListsDst, tileListOrderDst.ptr,
01403           tileListDepthSrc.ptr, tileListDepthDst.ptr);
01404         cudaCheck(cudaGetLastError());
01405       }
01406 
01407       // Local sort
01408       {
01409         int nthread = 32;
01410         int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
01411         localSort<32> <<< nblock, nthread, 0, stream >>> 
01412         (numTileListsDst, begin_bit, num_bit, tileListDepthDst.ptr, tileListOrderDst.ptr);
01413         cudaCheck(cudaGetLastError());
01414 
01415         // No need to shift any more
01416         doShiftDown = false;
01417       }
01418 
01419     }
01420     // ----------------------------------------------------------------------------------------
01421 
01422   } // (doStreaming)
01423   else
01424   {
01425     // --------------------------------------------------------------------
01426     // Sort {tileListDepthSrc, tileListOrderSrc}[0 ... numTileListsSrc-1]
01427     //   => {tileListDepthDst, tileListOrderDst}[0 ... numTileListsSrc-1]
01428     // --------------------------------------------------------------------
01429     if (numTileListsSrc <= SORTTILELISTSKERNEL_NUM_THREAD*SORTTILELISTSKERNEL_ITEMS_PER_THREAD)
01430     {
01431       // Short list, sort within a single thread block
01432       int nthread = SORTTILELISTSKERNEL_NUM_THREAD;
01433       int nblock = 1;
01434 
01435       sortTileListsKernel<unsigned int, int, false> <<< nblock, nthread, 0, stream >>>
01436       (numTileListsSrc, numTileListsDst, begin_bit, end_bit, 0, tileListDepthSrc.ptr, tileListDepthDst.ptr,
01437         tileListOrderSrc.ptr, tileListOrderDst.ptr);
01438       cudaCheck(cudaGetLastError());
01439 
01440     }
01441     else
01442     {
01443       // Long list, sort on multiple thread blocks
01444       size_t size = 0;
01445       cudaCheck(cub::DeviceRadixSort::SortPairsDescending(NULL, size,
01446         tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
01447         numTileListsSrc, begin_bit, end_bit, stream));
01448       // Make sure tempStorage doesn't remain NULL
01449       if (size == 0) size = 128;
01450       reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
01451       size = tempStorageSize;
01452       cudaCheck(cub::DeviceRadixSort::SortPairsDescending((void *)tempStorage, size,
01453         tileListDepthSrc.ptr, tileListDepthDst.ptr, tileListOrderSrc.ptr, tileListOrderDst.ptr,
01454         numTileListsSrc, begin_bit, end_bit, stream));
01455     }
01456   }
01457 
01458   // -----------------------------------------------------------------------------
01459   // Bit shift & mask tileListDepthDst such that only lower 16 bits are occupied
01460   // -----------------------------------------------------------------------------
01461   if (doShiftDown)
01462   {
01463     int nthread = 1024;
01464     int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
01465     bitshiftTileListDepth <<< nblock, nthread, 0, stream >>>
01466     (numTileListsDst, begin_bit, tileListDepthDst.ptr);
01467     cudaCheck(cudaGetLastError());
01468   }
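  // [Worked example, not in the original source] With begin_bit = 16, a depth
  // of 42 is packed in the upper half-word as 42 << 16 = 0x002A0000; shifting
  // down and masking, (0x002A0000 >> 16) & 65535 = 42, leaves the depth in
  // the low 16 bits expected by the code below.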
01469 
01470   // Allocate with +1 to include the last term in the exclusive sum
01471   reallocate_device<int>(&tileListPos, &tileListPosSize, numTileListsDst+1, OVERALLOC);
01472 
01473   // --------------------------------------------------------------------
01474   // Compute itileList positions to store tileLists
01475   // ExclusiveSum over tileListDepthDst[0 ... numTileListsDst] (numTileListsDst+1 elements)
01476   // NOTE: tileListDepthDst[numTileListsDst] is not accessed
01477   //       since this is an exclusive sum. But with this trick,
01478   //       tileListPos[numTileListsDst] will contain the total number
01479   //       of tile lists
01480   // --------------------------------------------------------------------
01481   {
01482     size_t size = 0;
01483     cudaCheck(cub::DeviceScan::ExclusiveSum(NULL, size,
01484       (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
01485     // Make sure tempStorage doesn't remain NULL
01486     if (size == 0) size = 128;
01487     reallocate_device<char>(&tempStorage, &tempStorageSize, size, 1.5f);
01488     size = tempStorageSize;
01489     // NOTE: Bug in CUB 1.4.1, stalls here with GeForce GTX Titan X.
01490     //       Tested on "manila" node at UIUC. Works OK with CUB 1.5.2
01491     cudaCheck(cub::DeviceScan::ExclusiveSum((void *)tempStorage, size,
01492       (int *)tileListDepthDst.ptr, tileListPos, numTileListsDst+1, stream));
01493   }
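  // [Worked example, not in the original source] For tileListDepthDst =
  // {3, 1, 2} and numTileListsDst = 3, scanning numTileListsDst+1 = 4
  // elements yields tileListPos = {0, 3, 4, 6}: each entry is a list's
  // output offset, and tileListPos[3] = 6 is the total tile count even
  // though the exclusive scan never reads tileListDepthDst[3].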
01494 
01495   // --------------------------------------------------------------------
01496   // Re-package to
01497   // tileListsDst[0 ... numTileListsDst-1]
01498   // patchPairsDst[]
01499   // tileJatomStartDst[0 ... numJtilesDst-1]
01500   // tileExclsDst[0 ... numJtilesDst-1]
01501   // --------------------------------------------------------------------
01502   {
01503     int nthread = WARPSIZE*REPACKTILELISTSKERNEL_NUM_WARP;
01504     int nwarp = REPACKTILELISTSKERNEL_NUM_WARP;
01505     int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nwarp+1);
01506 
01507     repackTileListsKernel <<< nblock, nthread, 0, stream >>>
01508     (numTileListsDst, begin_bit, tileListPos, tileListOrderDst.ptr,
01509       (useJtiles) ? jtiles : NULL,
01510       tileListsSrc.ptr, tileListsDst.ptr,
01511       patchPairsSrc.ptr, patchPairsDst.ptr,
01512       tileJatomStartSrc.ptr, tileJatomStartDst.ptr,
01513       tileExclsSrc.ptr, tileExclsDst.ptr);
01514     cudaCheck(cudaGetLastError());
01515   }
01516 
01517   // Count the number of tileLists that contribute to each patch
01518   if (doStreaming)
01519   {
01520     clear_device_array<int>(patchNumLists, numPatches, stream);
01521 
01522     // Fill in patchNumLists[0...numPatches-1]
01523     int nthread = 512;
01524     int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsDst-1)/nthread+1);
01525     calcPatchNumLists <<< nblock, nthread, 0, stream >>> 
01526     (numTileListsDst, numPatches, tileListsDst.ptr, patchNumLists);
01527     cudaCheck(cudaGetLastError());
01528 
01529     // Use emptyPatches[numPatches] as the count variable
01530     clear_device_array<int>(&emptyPatches[numPatches], 1, stream);
01531 
01532     // Fill in tileListsDst[0...numTileListsDst-1].patchNumLists
01533     // and store the indices of empty patches into emptyPatches[0 ... numEmptyPatches - 1]
01534     setPatchNumLists_findEmptyPatches <<< nblock, nthread, 0, stream >>> 
01535     (numTileListsDst, tileListsDst.ptr, patchNumLists,
01536       numPatches, &emptyPatches[numPatches], emptyPatches);
01537     cudaCheck(cudaGetLastError());
01538 
01539     // Copy emptyPatches[0 ... numPatches] to host
01540     copy_DtoH<int>(emptyPatches, h_emptyPatches, numPatches+1, stream);
01541     cudaCheck(cudaStreamSynchronize(stream));
01542     numEmptyPatches = h_emptyPatches[numPatches];
01543   }
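  // [Worked example, not in the original source] emptyPatches doubles as
  // output array and counter: slots [0 ... numPatches-1] collect the indices
  // of empty patches (in no guaranteed order), while slot [numPatches] holds
  // the atomically incremented count. E.g. with numPatches = 4 and patches 1
  // and 3 empty, the host might read back {1, 3, ?, ?, 2}, giving
  // numEmptyPatches = 2.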
01544 
01545 }
01546 
01547 //
01548 // Re-sort tile lists after pairlist refinement. Can only be called after finishTileList() has finished copying
01549 //
01550 void CudaTileListKernel::reSortTileLists(const bool doGBIS, cudaStream_t stream) {
01551   // Store previous number of active lists
01552   int numTileListsPrev = numTileLists;
01553 
01554   // Wait for finishTileList() to stop copying
01555   if (!tileListStatEventRecord)
01556     NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
01557   cudaCheck(cudaEventSynchronize(tileListStatEvent));
01558 
01559   // Get numTileLists, numTileListsGBIS, and numExcluded
01560   {
01561     numTileLists     = h_tileListStat->numTileLists;
01562     numTileListsGBIS = h_tileListStat->numTileListsGBIS;
01563     numExcluded      = h_tileListStat->numExcluded;
01564   }
01565 
01566   // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
01567   // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
01568   sortTileLists(true, 0, true,
01569     numTileListsPrev, numJtiles,
01570     PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
01571     PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
01572     PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
01573     numTileLists, numJtiles,
01574     PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
01575     PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
01576     PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
01577     stream);
01578 
01579   // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
01580   // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
01581   // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
01582 
01583   // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
01584 
01585   // NOTE:
01586   // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
01587   // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
01588 
01589   if (doGBIS) {
01590     // GBIS is used => produce a second tile list
01591     // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
01592     reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
01593     reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
01594 
01595     sortTileLists(true, 16, true,
01596       numTileListsPrev, numJtiles,
01597       PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
01598       PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
01599       PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
01600       numTileListsGBIS, numJtiles,
01601       PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
01602       PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
01603       PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
01604       stream);
01605   }
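  // [Editorial note, not in the original source] The two sortTileLists calls
  // in this function appear to split one packed depth field via their second
  // argument: the VdW pass reads the low 16 bits (begin_bit = 0) and the
  // GBIS pass the high 16 bits (begin_bit = 16), so a single depth array
  // drives both output lists.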
01606 
01607   // Set active buffer to be 1
01608   setActiveBuffer(1);
01609 
01610 }
01611 
01612 /*
01613 //
01614 // Apply outputOrder after regular (non-pairlist, non-energy) non-bonded kernel
01615 //
01616 void CudaTileListKernel::applyOutputOrder(cudaStream_t stream) {
01617   return;
01618 
01619   if (!doStreaming || !doOutputOrder)
01620     return;
01621 
01622   // Sort {tileList1, tileJatomStart1, tileExcl1} => {tileList2, tileJatomStart2, tileExcl2}
01623   // VdW tile list in {tileList1, tileJatomStart1, tileExcl1}
01624   sortTileLists(false, 0, true,
01625     numTileLists, numJtiles,
01626     PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
01627     PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
01628     PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
01629     numTileLists, numJtiles,
01630     PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
01631     PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
01632     PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
01633     stream);
01634 
01635   // Set active buffer to be 2
01636   setActiveBuffer(2);
01637 
01638 }
01639 */
01640 
01641 void CudaTileListKernel::setTileListVirialEnergyLength(int len) {
01642   if (len > tileListVirialEnergySize) {
01643     NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
01644   }
01645   tileListVirialEnergyLength = len;
01646 }
01647 
01648 void CudaTileListKernel::setTileListVirialEnergyGBISLength(int len) {
01649   if (len > tileListVirialEnergySize) {
01650     NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
01651   }
01652   tileListVirialEnergyGBISLength = len;
01653 }
