CudaTileListKernel.cu File Reference

#include <cuda.h>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"


Defines

#define OVERALLOC   1.2f
#define __ldg   *
#define BOUNDINGBOXKERNEL_NUM_WARP   8
#define TILELISTKERNELNEW_NUM_WARP   4
#define REPACKTILELISTSKERNEL_NUM_WARP   32
#define SORTTILELISTSKERNEL_NUM_THREAD   512
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Functions

void NAMD_die (const char *)
__global__ void calcPatchNumLists (const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
__global__ void setPatchNumLists_findEmptyPatches (const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
__global__ void buildRemoveZerosSortKey (const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
__global__ void setupSortKey (const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
template<int width>
__global__ void localSort (const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
__global__ void storeInReverse (const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
__global__ void initMinMaxListLen (const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
__global__ void buildSortKeys (const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void fillSortKeys (const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
__global__ void buildBoundingBoxesKernel (const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
__device__ __forceinline__ float distsq (const BoundingBox a, const BoundingBox b)
template<int nthread>
__global__ void calcTileListPosKernel (const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
template<int nthread>
__global__ void updatePatchesKernel (const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread (const int maxTileListLen)
__global__ void buildTileListsBBKernel (const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
__global__ void repackTileListsKernel (const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
template<typename keyT, typename valT, bool ascend>
 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel (const int numTileListsSrc, const int numTileListsDst, const int begin_bit, const int end_bit, const keyT oobKey, keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst, valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst)
CudaTileListKernel::CudaTileListKernel (int deviceID, bool doStreaming)
int ilog2 (int a)

Variables

__thread DeviceCUDA * deviceCUDA


Define Documentation

#define __ldg   *

Definition at line 18 of file CudaTileListKernel.cu.

Referenced by calcForceEnergy(), exclusionForce(), gather_force(), modifiedExclusionForce(), and repackTileListsKernel().
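
On devices of compute capability 3.5 and newer, __ldg() performs a load through the read-only data cache; defining __ldg as the plain dereference operator lets the same call sites compile for older architectures. A minimal sketch of such a guard (the exact condition used in the source is not shown on this page and is an assumption):

  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 350
  // Assumed compatibility fallback: __ldg(&x) degrades to *(&x),
  // an ordinary global-memory load, on pre-sm_35 devices.
  #define __ldg *
  #endif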

#define BOUNDINGBOXKERNEL_NUM_WARP   8

Definition at line 222 of file CudaTileListKernel.cu.

Referenced by buildBoundingBoxesKernel(), and CudaTileListKernel::buildTileLists().

#define OVERALLOC   1.2f

Definition at line 15 of file CudaTileListKernel.cu.

#define REPACKTILELISTSKERNEL_NUM_WARP   32

Definition at line 531 of file CudaTileListKernel.cu.

Referenced by repackTileListsKernel().

#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Definition at line 624 of file CudaTileListKernel.cu.

#define SORTTILELISTSKERNEL_NUM_THREAD   512

Definition at line 623 of file CudaTileListKernel.cu.

#define TILELISTKERNELNEW_NUM_WARP   4

Definition at line 308 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().


Function Documentation

template<typename keyT, typename valT, bool ascend>
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel ( const int  numTileListsSrc,
const int  numTileListsDst,
const int  begin_bit,
const int  end_bit,
const keyT  oobKey,
keyT *__restrict__  tileListDepthSrc,
keyT *__restrict__  tileListDepthDst,
valT *__restrict__  tileListOrderSrc,
valT *__restrict__  tileListOrderDst 
)

Definition at line 627 of file CudaTileListKernel.cu.

__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  outputOrder,
const unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 135 of file CudaTileListKernel.cu.


00137                                                {
00138 
00139   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
00140   {
00141     int j = outputOrder[numTileLists - i - 1];
00142     tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
00143   }
00144 
00145 }
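
The kernel above compacts the 16-bit depth field stored at begin_bit into a 0/1 occupancy flag, reading the tile lists in the reverse of outputOrder. A hypothetical launch sketch (block and grid sizes are illustrative, not taken from this page):

  const int nthread = 256;
  const int nblock = min(65535, (numTileLists - 1)/nthread + 1);
  bitshiftTileListDepth<<< nblock, nthread, 0, stream >>>(
    numTileLists, begin_bit, outputOrder, tileListDepthSrc, tileListDepthDst);
  cudaCheck(cudaGetLastError());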

__global__ void buildBoundingBoxesKernel ( const int  atomStorageSize,
const float4 *__restrict__  xyzq,
BoundingBox *__restrict__  boundingBoxes 
)

Definition at line 224 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, WARPSIZE, BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00225                                            {
00226 
00227   const int warpId = threadIdx.x / WARPSIZE;
00228   const int wid = threadIdx.x % WARPSIZE;
00229 
00230   // Loop with warp-aligned index to avoid warp-divergence
00231   for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
00232     // Full atom index
00233     const int i = iwarp + wid;
00234     // Bounding box index
00235     const int ibb = i/WARPSIZE;
00236 
00237     float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
00238 
00239     volatile float3 minxyz, maxxyz;
00240 
00241     typedef cub::WarpReduce<float> WarpReduce;
00242     __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
00243     minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
00244     minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
00245     minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
00246     maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
00247     maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
00248     maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
00249 
00250     if (wid == 0) {
00251       BoundingBox boundingBox;
00252       boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
00253       boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
00254       boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
00255       boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
00256       boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
00257       boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
00258       boundingBoxes[ibb] = boundingBox;
00259     }
00260   }
00261 
00262 }
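
Each warp reduces the coordinates of one WARPSIZE-atom tile to per-axis minima and maxima, and lane 0 stores the box as a center (x, y, z) plus half-widths (wx, wy, wz); trailing lanes past atomStorageSize simply replicate the last atom. A host-side reference of the same construction can be useful for validating kernel output; this helper is hypothetical, not part of the source:

  // Hypothetical host-side reference: box of one tile as center + half-widths.
  BoundingBox buildBoxHost(const float4* xyzq, int tileStart, int tileLen) {
    float3 mn = make_float3(xyzq[tileStart].x, xyzq[tileStart].y, xyzq[tileStart].z);
    float3 mx = mn;
    for (int i = 1; i < tileLen; i++) {
      const float4 a = xyzq[tileStart + i];
      mn.x = fminf(mn.x, a.x); mn.y = fminf(mn.y, a.y); mn.z = fminf(mn.z, a.z);
      mx.x = fmaxf(mx.x, a.x); mx.y = fmaxf(mx.y, a.y); mx.z = fmaxf(mx.z, a.z);
    }
    BoundingBox bb;
    bb.x  = 0.5f*(mn.x + mx.x); bb.y  = 0.5f*(mn.y + mx.y); bb.z  = 0.5f*(mn.z + mx.z);
    bb.wx = 0.5f*(mx.x - mn.x); bb.wy = 0.5f*(mx.y - mn.y); bb.wz = 0.5f*(mx.z - mn.z);
    return bb;
  }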

__global__ void buildRemoveZerosSortKey ( const int  numTileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
unsigned int *__restrict__  sortKey 
)

Definition at line 66 of file CudaTileListKernel.cu.

00067                                                                                                            {
00068 
00069   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
00070   {
00071     int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
00072     sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
00073   }
00074 
00075 }

__global__ void buildSortKeys ( const int  numTileListsDst,
const int  maxTileListLen,
const TileList *__restrict__  tileListsSrc,
const int *__restrict__  tileListOrderDst,
const unsigned int *__restrict__  tileListDepthDst,
int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 163 of file CudaTileListKernel.cu.


00167                                                                          {
00168 
00169   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
00170   {
00171     int k = tileListOrderDst[i];
00172     int icompute = tileListsSrc[k].icompute;
00173     int depth    = tileListDepthDst[i] & 65535;
00174     // depth is in range [1 ... maxTileListLen]
00175     int j        = icompute*maxTileListLen + (depth-1);
00176     sortKeys[j] = i;
00177     int2 minmax = minmaxListLen[icompute];
00178     int2 minmaxOrig = minmax;
00179     if (minmax.x > depth) minmax.x = depth;
00180     if (minmax.y < depth) minmax.y = depth;
00181     if (minmax.x != minmaxOrig.x) {
00182       atomicMin(&minmaxListLen[icompute].x, minmax.x);
00183     }
00184     if (minmax.y != minmaxOrig.y) {
00185       atomicMax(&minmaxListLen[icompute].y, minmax.y);
00186     }
00187   }
00188 
00189 }
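
Note the atomic-avoidance pattern above: atomicMin/atomicMax are only issued when this thread's depth actually tightens the stored bound, which cuts atomic traffic when most depths already fall inside [minmax.x, minmax.y]. The idiom in isolation (an illustrative device helper, not part of the source); the unsynchronized read is safe because the bounds only move monotonically and the read merely gates the atomics:

  __device__ void updateMinMax(int2* minmax, int depth) {
    int2 cur = *minmax;
    if (depth < cur.x) atomicMin(&minmax->x, depth);
    if (depth > cur.y) atomicMax(&minmax->y, depth);
  }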

__global__ void buildTileListsBBKernel ( const int  numTileLists,
TileList *__restrict__  tileLists,
const CudaPatchRecord *__restrict__  patches,
const int *__restrict__  tileListPos,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const int  maxTileListLen,
const BoundingBox *__restrict__  boundingBoxes,
int *__restrict__  tileJatomStart,
const int  tileJatomStartSize,
unsigned int *__restrict__  tileListDepth,
int *__restrict__  tileListOrder,
PatchPairRecord *__restrict__  patchPairs,
TileListStat *__restrict__  tileListStat 
)

Definition at line 387 of file CudaTileListKernel.cu.

References CudaPatchRecord::atomStart, buildTileListsBBKernel_shmem_sizePerThread(), distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, TileList::iatomStart, TileList::icompute, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, CudaPatchRecord::numAtoms, CudaPatchRecord::numFreeAtoms, TileList::offsetXYZ, TileList::patchInd, WARP_FULL_MASK, and WARPSIZE.

00399                                            {
00400 
00401   extern __shared__ char sh_buffer[];
00402   int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
00403   int pos = threadIdx.x*sizePerThread;
00404   volatile char* sh_tile = (char*)&sh_buffer[pos];
00405 
00406   // Loop with warp-aligned index to avoid warp-divergence
00407   for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
00408 
00409     // Use one thread per tile list
00410     const int wid = threadIdx.x % WARPSIZE;
00411     const int itileList = iwarp + wid;
00412 
00413     int i;
00414     int itileListLen = 0;
00415     CudaPatchRecord patch1;
00416     CudaPatchRecord patch2;
00417     float3 offsetXYZ;
00418     int2 patchInd;
00419     int numTiles2;
00420     int icompute;
00421 
00422     if (itileList < numTileLists) {
00423       offsetXYZ = tileLists[itileList].offsetXYZ;
00424       patchInd  = tileLists[itileList].patchInd;
00425       icompute  = tileLists[itileList].icompute;
00426       // Get i-column
00427       i = itileList - tileListPos[icompute];
00428 
00429       float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
00430       float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
00431       float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
00432 
00433       // DH - set zeroShift flag if magnitude of shift vector is zero
00434       bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
00435 
00436       // Load patches
00437       patch1 = patches[patchInd.x];
00438       patch2 = patches[patchInd.y];
00439       // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
00440       numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
00441       int tileStart1 = patch1.atomStart/WARPSIZE;
00442       int tileStart2 = patch2.atomStart/WARPSIZE;
00443 
00444       // DH - self requires that zeroShift is also set
00445       bool self = zeroShift && (tileStart1 == tileStart2);
00446 
00447       // Load i-atom data (and shift coordinates)
00448       BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
00449       boundingBoxI.x += shx;
00450       boundingBoxI.y += shy;
00451       boundingBoxI.z += shz;
00452 
00453       for (int j=0;j < numTiles2;j++) {
00454         sh_tile[j] = 0;
00455         if (!self || j >= i) {
00456           BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
00457           float r2bb = distsq(boundingBoxI, boundingBoxJ);
00458           if (r2bb < cutoff2) {
00459             sh_tile[j] = 1;
00460             itileListLen++;
00461           }
00462         }
00463       }
00464 
00465       tileListDepth[itileList] = (unsigned int)itileListLen;
00466       tileListOrder[itileList] = itileList;
00467     }
00468 
00469     typedef cub::WarpScan<int> WarpScan;
00470     __shared__ typename WarpScan::TempStorage tempStorage;
00471     int active = (itileListLen > 0);
00472     int activePos;
00473     WarpScan(tempStorage).ExclusiveSum(active, activePos);
00474     int itileListPos;
00475     WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
00476 
00477     int jtileStart, numJtiles;
00478     // Last thread in the warp knows the total number
00479     if (wid == WARPSIZE-1) {
00480       atomicAdd(&tileListStat->numTileLists, activePos + active);
00481       numJtiles = itileListPos + itileListLen;
00482       jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
00483     }
00484     numJtiles  = cub::ShuffleIndex(numJtiles,  WARPSIZE-1, WARPSIZE, WARP_FULL_MASK);
00485     jtileStart = cub::ShuffleIndex(jtileStart, WARPSIZE-1, WARPSIZE, WARP_FULL_MASK);
00486     if (jtileStart + numJtiles > tileJatomStartSize) {
00487       // tileJatomStart out of memory, exit 
00488       if (wid == 0) tileListStat->tilesSizeExceeded = true;
00489       return;
00490     }
00491 
00492     int jStart = itileListPos;
00493     int jEnd   = cub::ShuffleDown(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
00494     if (wid == WARPSIZE-1) jEnd = numJtiles;
00495 
00496     if (itileListLen > 0) {
00497       // Setup tileLists[]
00498       TileList TLtmp;
00499       TLtmp.iatomStart = patch1.atomStart + i*WARPSIZE;
00500       TLtmp.jtileStart = jtileStart + jStart;
00501       TLtmp.jtileEnd   = jtileStart + jEnd - 1;
00502       TLtmp.patchInd   = patchInd;
00503       TLtmp.offsetXYZ  = offsetXYZ;
00504       TLtmp.icompute   = icompute;
00505       // TLtmp.patchNumList.x = 0;
00506       // TLtmp.patchNumList.y = 0;
00507       tileLists[itileList] = TLtmp;
00508       // PatchPair
00509       PatchPairRecord patchPair;
00510       patchPair.iatomSize     = patch1.atomStart + patch1.numAtoms;
00511       patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
00512       patchPair.jatomSize     = patch2.atomStart + patch2.numAtoms;
00513       patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
00514       patchPairs[itileList] = patchPair;
00515 
00516       // Write tiles
00517       int jtile = jtileStart + jStart;
00518       for (int j=0;j < numTiles2;j++) {
00519         if (sh_tile[j]) {
00520           tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
00521           jtile++;
00522         }
00523       }
00524 
00525     }
00526 
00527   }
00528 
00529 }

__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread ( const int  maxTileListLen  ) 

Definition at line 378 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists(), and buildTileListsBBKernel().

00378                                                                          {
00379   // Size in bytes
00380   int size = (
00381     maxTileListLen*sizeof(char)
00382     );
00383   return size;
00384 }
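
buildTileListsBBKernel() keeps one byte per candidate j-tile per thread in dynamic shared memory, so a launch must pass blockDim.x times this per-thread size. A hypothetical launch sketch (the block size is illustrative, not taken from this page):

  const int nthread = 128;
  const int shmemSize = nthread*buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
  const int nblock = min(65535, (numTileLists - 1)/nthread + 1);
  buildTileListsBBKernel<<< nblock, nthread, shmemSize, stream >>>(
    numTileLists, tileLists, patches, tileListPos, lata, latb, latc, cutoff2,
    maxTileListLen, boundingBoxes, tileJatomStart, tileJatomStartSize,
    tileListDepth, tileListOrder, patchPairs, tileListStat);
  cudaCheck(cudaGetLastError());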

__global__ void calcPatchNumLists ( const int  numTileLists,
const int  numPatches,
const TileList *__restrict__  tileLists,
int *__restrict__  patchNumLists 
)

Definition at line 26 of file CudaTileListKernel.cu.

00027                                                                            {
00028 
00029   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
00030   {
00031     int2 patchInd = tileLists[i].patchInd;
00032     atomicAdd(&patchNumLists[patchInd.x], 1);
00033     if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
00034   }
00035 
00036 }

template<int nthread>
__global__ void calcTileListPosKernel ( const int  numComputes,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
int *__restrict__  tilePos 
)

Definition at line 314 of file CudaTileListKernel.cu.

References BLOCK_SYNC, and WARPSIZE.

00317                              {
00318 
00319   typedef cub::BlockScan<int, nthread> BlockScan;
00320 
00321   __shared__ typename BlockScan::TempStorage tempStorage;
00322   __shared__ int shTilePos0;
00323 
00324   if (threadIdx.x == nthread-1) {
00325     shTilePos0 = 0;
00326   }
00327 
00328   for (int base=0;base < numComputes;base+=nthread) {
00329     int k = base + threadIdx.x;
00330 
00331     int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
00332 
00333     // Calculate positions in tile list and jtile list
00334     int tilePosVal;
00335     BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
00336 
00337     // Store into global memory
00338     if (k < numComputes) {      
00339       tilePos[k] = shTilePos0 + tilePosVal;
00340     }
00341 
00342     BLOCK_SYNC;
00343     // Store block end position
00344     if (threadIdx.x == nthread-1) {
00345       shTilePos0 += tilePosVal + numTiles1;
00346     }
00347   }
00348 }
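
The running offset shTilePos0 lives in shared memory, so the scan carry is block-local; because the base loop itself walks every compute, a single block of nthread threads produces the complete prefix sum. A hypothetical instantiation (block size illustrative; it must match the template parameter):

  const int nthread = 1024;
  calcTileListPosKernel<nthread><<< 1, nthread, 0, stream >>>(
    numComputes, computes, patches, tilePos);
  cudaCheck(cudaGetLastError());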

__device__ __forceinline__ float distsq ( const BoundingBox  a,
const BoundingBox  b 
)

Definition at line 267 of file CudaTileListKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

00267                                                                                   {
00268   float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
00269   float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
00270   float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
00271   float r2 = dx*dx + dy*dy + dz*dz;
00272   return r2;
00273 }
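
Per axis, the separation is the center distance minus both half-widths, clamped at zero, so distsq() returns the squared gap between the closest points of the two boxes, and 0 when they overlap. Traced on hypothetical values:

  // Two boxes with half-width 1, centers 3 apart along x:
  // dx = max(0, |0 - 3| - 1 - 1) = 1, dy = dz = 0  =>  distsq == 1.
  BoundingBox a; a.x = 0.f; a.y = 0.f; a.z = 0.f; a.wx = 1.f; a.wy = 1.f; a.wz = 1.f;
  BoundingBox b = a; b.x = 3.f;
  // distsq(a, b) == 1.0f; with b.x = 1.5f the boxes overlap and distsq == 0.0f.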

CudaTileListKernel::CudaTileListKernel ( int  deviceID,
bool  doStreaming 
)

Definition at line 696 of file CudaTileListKernel.cu.

References atomStorageSize, boundingBoxes, cudaCheck, cudaPatches, jtiles, numPatches, outputOrder, and xyzq.

00696                                              {
00697 
00698   cudaCheck(cudaSetDevice(deviceID));
00699 
00700   activeBuffer = 1;
00701 
00702   numPatches = 0;
00703   numComputes = 0;
00704 
00705   cudaPatches = NULL;
00706   cudaPatchesSize = 0;
00707 
00708   cudaComputes = NULL;
00709   cudaComputesSize = 0;
00710 
00711   patchNumLists = NULL;
00712   patchNumListsSize = 0;
00713 
00714   emptyPatches = NULL;
00715   emptyPatchesSize = 0;
00716   h_emptyPatches = NULL;
00717   h_emptyPatchesSize = 0;
00718   numEmptyPatches = 0;
00719 
00720   sortKeySrc = NULL;
00721   sortKeySrcSize = 0;
00722   sortKeyDst = NULL;
00723   sortKeyDstSize = 0;
00724 
00725   tileLists1 = NULL;
00726   tileLists1Size = 0;
00727   tileLists2 = NULL;
00728   tileLists2Size = 0;
00729 
00730   patchPairs1 = NULL;
00731   patchPairs1Size = 0;
00732   patchPairs2 = NULL;
00733   patchPairs2Size = 0;
00734 
00735   tileJatomStart1 = NULL;
00736   tileJatomStart1Size = 0;
00737   tileJatomStart2 = NULL;
00738   tileJatomStart2Size = 0;
00739 
00740   boundingBoxes = NULL;
00741   boundingBoxesSize = 0;
00742 
00743   tileListDepth1 = NULL;
00744   tileListDepth1Size = 0;
00745   tileListDepth2 = NULL;
00746   tileListDepth2Size = 0;
00747 
00748   tileListOrder1 = NULL;
00749   tileListOrder1Size = 0;
00750   tileListOrder2 = NULL;
00751   tileListOrder2Size = 0;
00752 
00753   tileExcls1 = NULL;
00754   tileExcls1Size = 0;
00755   tileExcls2 = NULL;
00756   tileExcls2Size = 0;
00757 
00758   xyzq = NULL;
00759   xyzqSize = 0;
00760 
00761   allocate_device<TileListStat>(&d_tileListStat, 1);
00762   allocate_host<TileListStat>(&h_tileListStat, 1);
00763 
00764   tileListPos = NULL;
00765   tileListPosSize = 0;
00766   tempStorage = NULL;
00767   tempStorageSize = 0;
00768 
00769   jtiles = NULL;
00770   jtilesSize = 0;
00771 
00772   tilePos = NULL;
00773   tilePosSize = 0;
00774 
00775   tileListsGBIS = NULL;
00776   tileListsGBISSize = 0;
00777 
00778   tileJatomStartGBIS = NULL;
00779   tileJatomStartGBISSize = 0;
00780 
00781   tileListVirialEnergy = NULL;
00782   tileListVirialEnergySize = 0;
00783 
00784   atomStorageSize = 0;
00785   numTileLists = 0;
00786   numTileListsGBIS = 0;
00787   numJtiles = 1;
00788 
00789   outputOrder = NULL;
00790   outputOrderSize = 0;
00791   doOutputOrder = false;
00792 
00793   minmaxListLen = NULL;
00794   minmaxListLenSize = 0;
00795 
00796   sortKeys = NULL;
00797   sortKeysSize = 0;
00798   sortKeys_endbit = 0;
00799 
00800   cudaCheck(cudaEventCreate(&tileListStatEvent));
00801   tileListStatEventRecord = false;
00802 }

__global__ void fillSortKeys ( const int  numComputes,
const int  maxTileListLen,
const int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 191 of file CudaTileListKernel.cu.

References WARPSIZE.

00192                                                                                {
00193 
00194   int i = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;
00195   if (i < numComputes) {
00196     const int wid = threadIdx.x % WARPSIZE;
00197     int2 minmax = minmaxListLen[i];
00198     int minlen = minmax.x;
00199     int maxlen = minmax.y;
00200     // minlen, maxlen are in range [1 ... maxTileListLen]
00201     unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
00202     unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
00203     unsigned int aveKey = (maxKey + minKey)/2;
00204     for (int j=wid;j < minlen-1;j+=WARPSIZE) {
00205       sortKeys[i*maxTileListLen + j] = minKey;
00206     }
00207     for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
00208       sortKeys[i*maxTileListLen + j] = maxKey;
00209     }
00210     for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
00211       if (sortKeys[i*maxTileListLen + j] == 0) {
00212         sortKeys[i*maxTileListLen + j] = aveKey;
00213       }
00214     }
00215   }
00216 
00217 }
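
buildSortKeys() only writes sortKeys entries for (icompute, depth) pairs that actually occur; this kernel pads the rest of each compute's row of maxTileListLen keys: slots below the observed minimum depth get minKey, slots above the maximum get maxKey, and remaining zero holes get the midpoint key. A host-side model of the rule for one row (illustrative only, not part of the source):

  void fillRowHost(unsigned int* row, int maxTileListLen, int minlen, int maxlen) {
    unsigned int minKey = row[minlen-1], maxKey = row[maxlen-1];
    unsigned int aveKey = (minKey + maxKey)/2;
    for (int j = 0; j < minlen-1; j++) row[j] = minKey;
    for (int j = maxlen; j < maxTileListLen; j++) row[j] = maxKey;
    for (int j = 0; j < maxTileListLen; j++) if (row[j] == 0) row[j] = aveKey;
  }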

int ilog2 ( int  a  ) 

Definition at line 1142 of file CudaTileListKernel.cu.

01142                  {
01143   // if (a < 0)
01144   //   NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
01145   int k = 1;
01146   while (a >>= 1) k++;
01147   return k;
01148 }
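
For a > 0 the function returns the bit width of a, that is floor(log2(a)) + 1; for a == 0 it returns 1. A quick host-side check (assumes linking against this file):

  #include <cassert>
  int main() {
    assert(ilog2(1) == 1);
    assert(ilog2(7) == 3);   // 0b111 needs 3 bits
    assert(ilog2(8) == 4);   // 0b1000 needs 4 bits
    return 0;
  }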

__global__ void initMinMaxListLen ( const int  numComputes,
const int  maxTileListLen,
int2 *__restrict__  minmaxListLen 
)

Definition at line 147 of file CudaTileListKernel.cu.

00148                                     {
00149 
00150   int2 val;
00151   val.x = maxTileListLen+1;
00152   val.y = 0;
00153   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
00154   {
00155     minmaxListLen[i] = val;
00156   }
00157 
00158 }

template<int width>
__global__ void localSort ( const int  n,
const int  begin_bit,
const int  num_bit,
unsigned int *__restrict__  keys,
int *__restrict__  vals 
)

Definition at line 92 of file CudaTileListKernel.cu.

References BLOCK_SYNC.

00093                                                            {
00094 
00095   // NOTE: blockDim.x = width
00096 
00097   for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
00098   {
00099     int i = base + threadIdx.x;
00100     typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
00101     __shared__ typename BlockRadixSort::TempStorage tempStorage;
00102     unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
00103     int val[1] = {(i < n) ? vals[i] : 0};
00104     BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
00105     if (i < n) {
00106       keys[i] = key[0];
00107       vals[i] = val[0];
00108     }
00109     BLOCK_SYNC;
00110   }
00111 
00112 }
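
The width template parameter must equal the launch block size: BlockRadixSort is instantiated for width threads holding one key/value pair each, and the kernel sorts each block-sized chunk descending on the 16-bit depth field starting at begin_bit. A hypothetical instantiation:

  const int width = 512;
  const int nblock = min(65535, (n - 1)/width + 1);
  localSort<width><<< nblock, width, 0, stream >>>(n, begin_bit, num_bit, keys, vals);
  cudaCheck(cudaGetLastError());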

void NAMD_die ( const char *   ) 

Definition at line 81 of file common.C.

Referenced by Controller::adaptTempInit(), ConfigList::add_element(), ComputeMsmMgr::addPotential(), after_backend_init(), ALCHPAIR(), Controller::algorithm(), AnisoElem::AnisoElem(), msm::PatchData::anterpolation(), msm::PatchData::anterpolationC1Hermite(), Parameters::assign_angle_index(), Parameters::assign_bond_index(), Parameters::assign_crossterm_index(), Parameters::assign_dihedral_index(), Parameters::assign_improper_index(), Parameters::assign_vdw_index(), WorkDistrib::assignNodeToPatch(), ComputeNonbondedCUDA::build_exclusions(), Molecule::build_go_arrays(), Molecule::build_go_params(), Molecule::build_go_sigmas(), Molecule::build_go_sigmas2(), Molecule::build_gridforce_params(), buildBondData(), ReductionMgr::buildSpanTree(), CudaTileListKernel::buildTileLists(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), GlobalMaster::calculate(), GlobalMasterTcl::calculate(), GlobalMasterTest::calculate(), ComputeQMMgr::calcUSR(), GlobalMaster::check(), ComputeMsmMgr::compute(), PmeRealSpace::compute_forces(), AngleElem::computeForce(), ComputeMgr::ComputeMgr(), ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeTclBC::ComputeTclBC(), ConfigList::ConfigList(), NamdState::configListInit(), ConfigList::ConfigListNode::ConfigListNode(), Controller::correctMomentum(), Sequencer::correctMomentum(), WorkDistrib::createAtomLists(), ComputeMgr::createComputes(), cuda_check_local_progress(), cuda_check_pme_charges(), cuda_check_pme_forces(), cuda_check_progress(), cuda_check_remote_progress(), cuda_errcheck(), cuda_getargs(), cuda_init_bspline_coeffs(), CudaComputeNonbonded::CudaComputeNonbonded(), cudaDie(), ComputeMsmMgr::d_stencil_1d(), Molecule::delete_qm_bonded(), ComputeGridForce::doForce(), ComputeNonbondedSelf::doForce(), HomePatch::doMarginCheck(), Parameters::done_reading_files(), ComputeFullDirect::doWork(), ComputeTclBC::doWork(), ARestraint::EarlyExit(), GlobalMasterMisc::easy_calc(), msm::Array< msm::PatchDiagram >::elem(), msm::GridFixed< T, N >::elem(), msm::Grid< Vtype >::elem(), ScriptTcl::eval(), colvarproxy_namd::fatal_error(), PmeZPencil::fft_init(), PmeYPencil::fft_init(), PmeXPencil::fft_init(), OptPmeZPencil::fft_init(), OptPmeYPencil::fft_init(), OptPmeXPencil::fft_init(), PmeRealSpace::fill_charges(), parm::get(), Molecule::get_atom_from_index_in_residue(), Molecule::get_atom_from_name(), Molecule::get_atomtype(), Parameters::get_dihedral_params(), Molecule::get_fep_bonded_type(), Parameters::get_improper_params(), Molecule::get_residue_size(), Parameters::get_vdw_params(), AngleElem::getMoleculePointers(), AnisoElem::getMoleculePointers(), BondElem::getMoleculePointers(), CrosstermElem::getMoleculePointers(), DihedralElem::getMoleculePointers(), GromacsPairElem::getMoleculePointers(), ImproperElem::getMoleculePointers(), ExclElem::getMoleculePointers(), TholeElem::getMoleculePointers(), CudaComputeNonbondedKernel::getPatchReadyQueue(), AnisoElem::getTupleInfo(), TholeElem::getTupleInfo(), GlobalMasterIMD::GlobalMasterIMD(), GlobalMasterSymmetry::GlobalMasterSymmetry(), GromacsTopFile::GromacsTopFile(), BackEnd::init(), ComputeMsmMgr::initialize(), ComputePmeMgr::initialize(), DeviceCUDA::initialize(), GridforceFullMainGrid::initialize(), GridforceFullSubGrid::initialize(), GridforceLiteGrid::initialize(), LdbCoordinator::initialize(), OptPmeMgr::initialize(), SimParameters::initialize_config_data(), msm::PatchData::interpolation(), msm::PatchData::interpolationC1Hermite(), ScriptTcl::load(), NamdState::loadStructure(), ludcmp(), main::main(), Node::mallocTest(), 
WorkDistrib::mapComputes(), HomePatch::minimize_rattle2(), Molecule::Molecule(), HomePatch::mollyAverage(), HomePatch::mollyMollify(), NAMD_new_handler(), NAMD_read_int(), NAMD_read_line(), NAMD_seek(), NAMD_write(), ComputeMsmMgr::ndsplitting(), CudaComputeNonbondedKernel::nonbondedForce(), Vector::operator[](), Node::outputPatchComputeMaps(), MGridforceParamsList::pack_data(), WorkDistrib::patchMapInit(), PDB::PDB(), PDBUnknown::PDBUnknown(), parm::preadln(), Molecule::prepare_qm(), ProblemParsing(), GlobalMaster::processData(), MsmBlockProxyMsg::put(), MsmC1HermiteBlockProxyMsg::put(), HomePatch::rattle1old(), HomePatch::rattle2(), read_binary_file(), Parameters::read_charmm_parameter_file(), Parameters::read_ener_table(), Parameters::read_energy_type(), Parameters::read_energy_type_bothcubspline(), Parameters::read_energy_type_cubspline(), Molecule::read_go_file(), Parameters::read_parameter_file(), Parameters::read_parm(), SimParameters::readExtendedSystem(), GridforceFullBaseGrid::readSubgridHierarchy(), RecBisection::RecBisection(), Molecule::receive_GoMolecule(), Parameters::receive_Parameters(), CollectionMaster::receiveDataStream(), ParallelIOMgr::recvAtomsCntPerPatch(), Controller::recvCheckpointReq(), HomePatch::recvCheckpointReq(), ComputeMgr::recvComputeDPMEData(), ComputeMgr::recvComputeDPMEResults(), ComputeMgr::recvComputeEwaldData(), ComputeMgr::recvComputeEwaldResults(), ComputeMgr::recvComputeGlobalData(), ComputeMgr::recvComputeGlobalResults(), ComputeExtMgr::recvCoord(), ComputeFmmSerialMgr::recvCoord(), ComputeMsmSerialMgr::recvCoord(), ComputeQMMgr::recvPartQM(), ComputeQMMgr::recvPntChrg(), Output::recvReplicaDcdData(), Node::reloadCharges(), Node::reloadGridforceGrid(), ReductionMgr::remoteRegister(), ReductionMgr::remoteUnregister(), ReductionSet::removeData(), Controller::rescaleaccelMD(), Sequencer::rescaleaccelMD(), CudaTileListKernel::reSortTileLists(), ScriptTcl::run(), ComputeMsm::saveResults(), SimParameters::scriptSet(), ComputeNonbondedUtil::select(), Molecule::send_GoMolecule(), Parameters::send_Parameters(), ComputeMgr::sendComputeDPMEData(), ComputeMgr::sendComputeEwaldData(), StringList::set(), msm::Array< T >::setmax(), CudaTileListKernel::setTileListVirialEnergyGBISLength(), CudaTileListKernel::setTileListVirialEnergyLength(), ComputeMsmMgr::setup_hgrid_1d(), ComputeMsmMgr::setup_periodic_blocksize(), PatchMap::sizeGrid(), ComputeMsmMgr::splitting(), ComputeMsmMgr::stencil_1d(), NamdCentLB::Strategy(), StringList::StringList(), ScriptTcl::tclmain(), TholeElem::TholeElem(), Node::updateGridScale(), ReductionMgr::willRequire(), and ReductionMgr::willSubmit().

00083 {
00084    if ( ! err_msg ) err_msg = "(unknown error)";
00085    char *new_err_msg = new char[strlen(err_msg) + 40];
00086    sprintf(new_err_msg,"FATAL ERROR: %s\n",err_msg);
00087    CkPrintf(new_err_msg);
00088    fflush(stdout);
00089    if ( CmiNumPartitions() > 1 ) {
00090      sprintf(new_err_msg,"REPLICA %d FATAL ERROR: %s\n", CmiMyPartition(), err_msg);
00091    }
00092    CmiAbort(new_err_msg);
00093    delete [] new_err_msg;
00094 }

__global__ void repackTileListsKernel ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrder,
const int *__restrict__  jtiles,
const TileList *__restrict__  tileListsSrc,
TileList *__restrict__  tileListsDst,
const PatchPairRecord *__restrict__  patchPairsSrc,
PatchPairRecord *__restrict__  patchPairsDst,
const int *__restrict__  tileJatomStartSrc,
int *__restrict__  tileJatomStartDst,
const TileExcl *__restrict__  tileExclsSrc,
TileExcl *__restrict__  tileExclsDst 
)

Definition at line 533 of file CudaTileListKernel.cu.

References __ldg, TileList::iatomStart, TileList::icompute, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, TileList::patchInd, REPACKTILELISTSKERNEL_NUM_WARP, WARP_BALLOT, WARP_FULL_MASK, and WARPSIZE.

00539                                                                                   {
00540 
00541   const int wid = threadIdx.x % WARPSIZE;
00542 
00543   // One warp does one tile list
00544   for (int i = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;i < numTileLists;i+=blockDim.x*gridDim.x/WARPSIZE) 
00545   {
00546     int j = tileListOrder[i];
00547     int start = tileListPos[i];
00548     int end   = tileListPos[i+1]-1;
00549     if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
00550     // TileList
00551     int startOld   = __ldg(&tileListsSrc[j].jtileStart);
00552     int endOld     = __ldg(&tileListsSrc[j].jtileEnd);
00553     int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
00554     float3 offsetXYZ;
00555     offsetXYZ.x  = __ldg(&tileListsSrc[j].offsetXYZ.x);
00556     offsetXYZ.y  = __ldg(&tileListsSrc[j].offsetXYZ.y);
00557     offsetXYZ.z  = __ldg(&tileListsSrc[j].offsetXYZ.z);
00558     int2 patchInd = tileListsSrc[j].patchInd;
00559     int icompute = __ldg(&tileListsSrc[j].icompute);
00560     if (wid == 0) {
00561       TileList tileList;
00562       tileList.iatomStart = iatomStart;
00563       tileList.offsetXYZ  = offsetXYZ;
00564       tileList.jtileStart = start;
00565       tileList.jtileEnd   = end;
00566       tileList.patchInd   = patchInd;
00567       tileList.icompute   = icompute;
00568       tileListsDst[i] = tileList;
00569     }
00570 
00571     if (jtiles == NULL) {
00572       // No jtiles, simple copy will do
00573       int jtile = start;
00574       for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
00575         if (jtileOld + wid <= endOld) {
00576           tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
00577         }
00578       }
00579       if (tileExclsSrc != NULL) {
00580         int jtile = start;
00581         for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
00582           tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
00583         }
00584       }
00585     } else {
00586       int jtile0 = start;
00587       for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
00588         int t = jtileOld + wid;
00589         int jtile = (t <= endOld) ? jtiles[t] : 0;
00590         jtile >>= begin_bit;
00591         jtile &= 65535;
00592         typedef cub::WarpScan<int> WarpScan;
00593         __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
00594         int warpId = threadIdx.x / WARPSIZE;
00595         int jtilePos;
00596         WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
00597 
00598         if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
00599 
00600         if (tileExclsSrc != NULL) {
00601           unsigned int b = WARP_BALLOT(WARP_FULL_MASK, jtile);
00602           while (b != 0) {
00603             // k = index of thread that has data
00604             int k = __ffs(b) - 1;
00605             tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
00606             // remove 1 bit and advance jtile0
00607             b ^= ((unsigned int)1 << k);
00608             jtile0++;
00609           }
00610         } else {
00611           jtile0 += __popc(WARP_BALLOT(WARP_FULL_MASK, jtile));
00612         }
00613       }
00614     }
00615   }
00616 
00617 }
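
The jtiles branch above is a warp-level stream compaction: each lane flags whether its j-tile survived, WARP_BALLOT gathers the flags into a mask, and the __ffs loop walks the set bits so surviving tiles land contiguously in the destination. The idiom in isolation, written with plain CUDA intrinsics (a sketch, assuming WARP_BALLOT wraps __ballot_sync-style functionality):

  // Each lane holds a flag; survivors are written densely to out[].
  __device__ int compactWarp(int flag, int value, int* out, int outPos) {
    const unsigned int mask = __ballot_sync(0xffffffffu, flag != 0);
    const int lane = threadIdx.x % 32;
    // Rank among survivors = number of set bits below this lane.
    const int rank = __popc(mask & ((1u << lane) - 1));
    if (flag) out[outPos + rank] = value;
    return outPos + __popc(mask);  // advance past all survivors
  }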

__global__ void setPatchNumLists_findEmptyPatches ( const int  numTileLists,
TileList *__restrict__  tileLists,
const int *__restrict__  patchNumLists,
const int  numPatches,
int *__restrict__  numEmptyPatches,
int *__restrict__  emptyPatches 
)

Definition at line 42 of file CudaTileListKernel.cu.

00044                                                                                            {
00045 
00046   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
00047   {
00048     int2 patchInd = tileLists[i].patchInd;
00049     int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
00050     tileLists[i].patchNumList = patchNumList;
00051   }
00052 
00053   for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
00054   {
00055     if (patchNumLists[i] == 0) {
00056       int ind = atomicAdd(numEmptyPatches, 1);
00057       emptyPatches[ind] = i;
00058     }
00059   }
00060 
00061 }

__global__ void setupSortKey ( const int  numTileLists,
const int  maxTileListLen,
const TileList *__restrict__  tileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
const unsigned int *__restrict__  sortKeys,
unsigned int *__restrict__  sortKey 
)

Definition at line 77 of file CudaTileListKernel.cu.

00079                                                                                                       {
00080 
00081   for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
00082   {
00083     int icompute = tileLists[itileList].icompute;
00084     int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
00085     int i = icompute*maxTileListLen + (depth - 1);
00086     sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
00087   }
00088 
00089 }

__global__ void storeInReverse ( const int  numTileListsSrc,
const int  begin_bit,
const int *__restrict__  outputOrder,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrderSrc,
const unsigned int *__restrict__  tileListDepthSrc,
int *__restrict__  tileListOrderDst,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 114 of file CudaTileListKernel.cu.


00119                                                {
00120 
00121   for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
00122   {
00123     int j = outputOrder[numTileListsSrc - i - 1];
00124     if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
00125       int k = tileListPos[i];
00126       tileListDepthDst[k] = tileListDepthSrc[j];
00127       tileListOrderDst[k] = j; //tileListOrderSrc[j];
00128     }
00129   }
00130 }

template<int nthread>
__global__ void updatePatchesKernel ( const int  numComputes,
const int *__restrict__  tilePos,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
TileList *__restrict__  tileLists 
)

Definition at line 352 of file CudaTileListKernel.cu.

References CudaComputeRecord::offsetXYZ, CudaComputeRecord::patchInd, and WARPSIZE.

00356                                     {
00357 
00358   const int tid = threadIdx.x % nthread;
00359 
00360   // nthread threads takes care of one compute
00361   for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
00362   {
00363     CudaComputeRecord compute = computes[k];
00364     float3 offsetXYZ = compute.offsetXYZ;
00365     int2 patchInd = compute.patchInd;
00366     int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
00367     int itileList0 = tilePos[k];
00368     for (int i=tid;i < numTiles1;i+=nthread) {
00369       tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
00370       tileLists[itileList0 + i].patchInd  = patchInd;
00371       tileLists[itileList0 + i].icompute  = k;
00372     }
00373   }
00374 
00375 }
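
Here nthread is a sub-block group size: each group of nthread consecutive threads stamps one compute's offsetXYZ, patchInd, and compute index into its run of tile lists. A hypothetical instantiation with warp-sized groups (block size is illustrative):

  const int nthread = WARPSIZE*TILELISTKERNELNEW_NUM_WARP;
  const int nblock = min(65535, (numComputes*WARPSIZE - 1)/nthread + 1);
  updatePatchesKernel<WARPSIZE><<< nblock, nthread, 0, stream >>>(
    numComputes, tilePos, computes, patches, tileLists);
  cudaCheck(cudaGetLastError());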


Variable Documentation

__thread DeviceCUDA* deviceCUDA

Definition at line 18 of file DeviceCUDA.C.



Generated on Mon Nov 20 01:17:15 2017 for NAMD by  doxygen 1.4.7