NAMD
CudaTileListKernel.cu File Reference
#include <cuda.h>
#include <namd_cub/device/device_radix_sort.cuh>
#include <namd_cub/device/device_scan.cuh>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"


Macros

#define OVERALLOC   1.2f
 
#define DEFAULTKERNEL_NUM_THREAD   1024
 
#define UPDATEPATCHESKERNEL_NUM_THREAD   512
 
#define CALCPATCHNUMLISTSKERNEL_NUM_THREAD   512
 
#define BOUNDINGBOXKERNEL_NUM_WARP   8
 
#define __ldg   *
 
#define TILELISTKERNELNEW_NUM_WARP   4
 
#define REPACKTILELISTSKERNEL_NUM_WARP   32
 
#define SORTTILELISTSKERNEL_NUM_THREAD   512
 
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22
 

Typedefs

typedef cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoad
 
typedef cub::BlockRadixSort< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > BlockRadixSort
 

Functions

void NAMD_die (const char *)
 
__global__ void calcPatchNumLists (const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists)
 
__global__ void setPatchNumLists_findEmptyPatches (const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches)
 
__global__ void buildRemoveZerosSortKey (const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey)
 
__global__ void setupSortKey (const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey)
 
template<int width>
__global__ void localSort (const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals)
 
__global__ void storeInReverse (const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst)
 
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
 
__global__ void initMinMaxListLen (const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen)
 
__global__ void buildSortKeys (const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
 
__global__ void fillSortKeys (const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys)
 
__global__ void buildBoundingBoxesKernel (const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes)
 
__device__ __forceinline__ float distsq (const BoundingBox a, const BoundingBox b)
 
template<int nthread>
__global__ void calcTileListPosKernel (const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos)
 
template<int nthread>
__global__ void updatePatchesKernel (const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists)
 
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread (const int maxTileListLen)
 
__global__ void buildTileListsBBKernel (const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat)
 
__global__ void repackTileListsKernel (const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst)
 
template<typename keyT , typename valT , bool ascend>
 __launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel (const int numTileListsSrc, const int numTileListsDst, const int begin_bit, const int end_bit, const keyT oobKey, keyT *__restrict__ tileListDepthSrc, keyT *__restrict__ tileListDepthDst, valT *__restrict__ tileListOrderSrc, valT *__restrict__ tileListOrderDst)
 
__global__ void reOrderTileListDepth (const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst)
 
__global__ void bitshiftTileListDepth (const int numTileLists, const int begin_bit, unsigned int *__restrict__ tileListDepth)
 
int ilog2 (int a)
 

Variables

typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > BlockLoadU
 
__thread DeviceCUDA * deviceCUDA
 
const int numTileListsSrc
 
const int numTileListsDst
 
const int begin_bit
 
const int end_bit
 
const keyT oobKey
 
keyT *__restrict__ tileListDepthSrc
 
keyT *__restrict__ tileListDepthDst
 
valT *__restrict__ tileListOrderSrc
 
valT *__restrict__ tileListOrderDst
 
union {
   BlockLoad::TempStorage   load
 
   BlockLoadU::TempStorage   loadU
 
   BlockRadixSort::TempStorage   sort
 
} tempStorage
 
keyT keys [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
 
valT values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]
 
BLOCK_SYNC
 

Macro Definition Documentation

#define __ldg   *

#define BOUNDINGBOXKERNEL_NUM_WARP   8

#define CALCPATCHNUMLISTSKERNEL_NUM_THREAD   512

Definition at line 33 of file CudaTileListKernel.cu.

#define DEFAULTKERNEL_NUM_THREAD   1024

Definition at line 31 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().

#define OVERALLOC   1.2f

#define REPACKTILELISTSKERNEL_NUM_WARP   32

Definition at line 564 of file CudaTileListKernel.cu.

Referenced by repackTileListsKernel().

#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD   22

Definition at line 669 of file CudaTileListKernel.cu.

#define SORTTILELISTSKERNEL_NUM_THREAD   512

Definition at line 668 of file CudaTileListKernel.cu.

#define TILELISTKERNELNEW_NUM_WARP   4

Definition at line 336 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().

#define UPDATEPATCHESKERNEL_NUM_THREAD   512

Definition at line 32 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists().

Typedef Documentation

typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad

Definition at line 685 of file CudaTileListKernel.cu.

typedef cub::BlockRadixSort<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT> BlockRadixSort

Definition at line 688 of file CudaTileListKernel.cu.
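
For orientation, the sketch below shows how a BlockLoad / BlockRadixSort pair of this shape typically composes inside a block-wide sort kernel such as sortTileListsKernel below. It is a minimal, self-contained illustration: the kernel name, the 128-thread / 4-item sizing, and the single-block assumption are stand-ins, not values from this file.

#include <cub/cub.cuh>

#define NUM_THREAD 128      // stand-in for SORTTILELISTSKERNEL_NUM_THREAD
#define ITEMS_PER_THREAD 4  // stand-in for SORTTILELISTSKERNEL_ITEMS_PER_THREAD

// Illustrative single-block sort (valid for n <= NUM_THREAD*ITEMS_PER_THREAD):
// load keys and values cooperatively, radix-sort them, store the result.
// The shared memory of the CUB primitives is overlapped in a union, as in
// sortTileListsKernel.
__global__ void blockSortSketch(const unsigned int* keysIn, const int* valsIn,
                                unsigned int* keysOut, int* valsOut, int n) {
  typedef cub::BlockLoad<unsigned int, NUM_THREAD, ITEMS_PER_THREAD,
                         cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadK;
  typedef cub::BlockLoad<int, NUM_THREAD, ITEMS_PER_THREAD,
                         cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadV;
  typedef cub::BlockRadixSort<unsigned int, NUM_THREAD, ITEMS_PER_THREAD, int>
      BlockRadixSortT;

  __shared__ union {
    typename BlockLoadK::TempStorage loadK;
    typename BlockLoadV::TempStorage loadV;
    typename BlockRadixSortT::TempStorage sort;
  } tempStorage;

  unsigned int keys[ITEMS_PER_THREAD];
  int vals[ITEMS_PER_THREAD];
  // Out-of-bounds keys get 0xffffffff so they sort to the end (cf. oobKey).
  BlockLoadK(tempStorage.loadK).Load(keysIn, keys, n, 0xffffffffu);
  __syncthreads();
  BlockLoadV(tempStorage.loadV).Load(valsIn, vals, n);
  __syncthreads();
  BlockRadixSortT(tempStorage.sort).SortBlockedToStriped(keys, vals);
  // After SortBlockedToStriped, item i of thread t has global rank
  // t + i*NUM_THREAD, so the store below is coalesced.
  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
    int idx = threadIdx.x + i*NUM_THREAD;
    if (idx < n) {
      keysOut[idx] = keys[i];
      valsOut[idx] = vals[i];
    }
  }
}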

Function Documentation

template<typename keyT , typename valT , bool ascend>
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel ( const int  numTileListsSrc,
const int  numTileListsDst,
const int  begin_bit,
const int  end_bit,
const keyT  oobKey,
keyT *__restrict__  tileListDepthSrc,
keyT *__restrict__  tileListDepthDst,
valT *__restrict__  tileListOrderSrc,
valT *__restrict__  tileListOrderDst 
)

Definition at line 676 of file CudaTileListKernel.cu.
__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  outputOrder,
const unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 159 of file CudaTileListKernel.cu.

References begin_bit.

161  {
162 
163  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
164  {
165  int j = outputOrder[numTileLists - i - 1];
166  tileListDepthDst[i] = ((tileListDepthSrc[j] >> begin_bit) & 65535) == 0 ? 0 : 1;
167  }
168 
169 }
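
Nearly every kernel on this page iterates with the same grid-stride idiom, so correctness does not depend on the grid covering the whole index range. A minimal, self-contained sketch of the pattern (the kernel and sizes are illustrative, not from this file):

#include <cuda_runtime.h>

// Grid-stride loop: each thread starts at its global index and strides by the
// total number of threads, so any grid size covers any problem size.
__global__ void scaleKernel(const int n, float* __restrict__ x, const float a) {
  for (int i = threadIdx.x + blockDim.x*blockIdx.x; i < n; i += blockDim.x*gridDim.x) {
    x[i] *= a;
  }
}

int main() {
  const int n = 1 << 20;
  float* x;
  cudaMalloc(&x, n*sizeof(float));
  cudaMemset(x, 0, n*sizeof(float));
  const int nthread = 512;                  // cf. the *_NUM_THREAD macros above
  const int nblock = (n - 1)/nthread + 1;   // any smaller grid also works
  scaleKernel<<<nblock, nthread>>>(n, x, 2.0f);
  cudaDeviceSynchronize();
  cudaFree(x);
  return 0;
}
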
__global__ void bitshiftTileListDepth ( const int  numTileLists,
const int  begin_bit,
unsigned int *__restrict__  tileListDepth 
)

Definition at line 727 of file CudaTileListKernel.cu.

References begin_bit, and itileList.

728  {
729 
730  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList+=blockDim.x*gridDim.x)
731  {
732  unsigned int a = tileListDepth[itileList];
733  a >>= begin_bit;
734  a &= 65535;
735  tileListDepth[itileList] = a;
736  }
737 
738 }
__global__ void buildBoundingBoxesKernel ( const int  atomStorageSize,
const float4 *__restrict__  xyzq,
BoundingBox *__restrict__  boundingBoxes 
)

Definition at line 252 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, tempStorage, WARPSIZE, BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

253  {
254 
255  const int warpId = threadIdx.x / WARPSIZE;
256  const int wid = threadIdx.x % WARPSIZE;
257 
258  // Loop with warp-aligned index to avoid warp-divergence
259  for (int iwarp = warpId*WARPSIZE + blockIdx.x*blockDim.x;iwarp < atomStorageSize;iwarp += blockDim.x*gridDim.x) {
260  // Full atom index
261  const int i = iwarp + wid;
262  // Bounding box index
263  const int ibb = i/WARPSIZE;
264 
265  float4 xyzq_i = xyzq[min(atomStorageSize-1, i)];
266 
267  volatile float3 minxyz, maxxyz;
268 
269  typedef cub::WarpReduce<float> WarpReduce;
270  __shared__ typename WarpReduce::TempStorage tempStorage[BOUNDINGBOXKERNEL_NUM_WARP];
271  minxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Min());
272  minxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Min());
273  minxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Min());
274  maxxyz.x = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.x, cub::Max());
275  maxxyz.y = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.y, cub::Max());
276  maxxyz.z = WarpReduce(tempStorage[warpId]).Reduce(xyzq_i.z, cub::Max());
277 
278  if (wid == 0) {
279  BoundingBox boundingBox;
280  boundingBox.x = 0.5f*(minxyz.x + maxxyz.x);
281  boundingBox.y = 0.5f*(minxyz.y + maxxyz.y);
282  boundingBox.z = 0.5f*(minxyz.z + maxxyz.z);
283  boundingBox.wx = 0.5f*(maxxyz.x - minxyz.x);
284  boundingBox.wy = 0.5f*(maxxyz.y - minxyz.y);
285  boundingBox.wz = 0.5f*(maxxyz.z - minxyz.z);
286  boundingBoxes[ibb] = boundingBox;
287  }
288  }
289 
290 }
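
Each warp reduces one WARPSIZE-atom tile to a single bounding box (center plus half-widths), so a natural launch uses WARPSIZE*BOUNDINGBOXKERNEL_NUM_WARP threads per block. A hedged sketch of such a call site (the real one lives in CudaTileListKernel; the wrapper below is an illustration):

// Illustrative launch: one warp per WARPSIZE-atom tile, eight warps per block.
void launchBuildBoundingBoxes(const int atomStorageSize, const float4* xyzq,
                              BoundingBox* boundingBoxes, cudaStream_t stream) {
  const int nthread = WARPSIZE*BOUNDINGBOXKERNEL_NUM_WARP;
  const int nblock = (atomStorageSize - 1)/nthread + 1;
  buildBoundingBoxesKernel<<<nblock, nthread, 0, stream>>>(
      atomStorageSize, xyzq, boundingBoxes);
}
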
__global__ void buildRemoveZerosSortKey ( const int  numTileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
unsigned int *__restrict__  sortKey 
)

Definition at line 90 of file CudaTileListKernel.cu.

References begin_bit, and itileList.

91  {
92 
93  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
94  {
95  int depth = (tileListDepth[itileList] >> begin_bit) & 65535;
96  sortKey[itileList] = (depth == 0) ? numTileLists : itileList;
97  }
98 
99 }
__global__ void buildSortKeys ( const int  numTileListsDst,
const int  maxTileListLen,
const TileList *__restrict__  tileListsSrc,
const int *__restrict__  tileListOrderDst,
const unsigned int *__restrict__  tileListDepthDst,
int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 187 of file CudaTileListKernel.cu.


191  {
192 
193  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsDst;i += blockDim.x*gridDim.x)
194  {
195  int k = tileListOrderDst[i];
196  int icompute = tileListsSrc[k].icompute;
197  int depth = tileListDepthDst[i] & 65535;
198  // depth is in range [1 ... maxTileListLen]
199  int j = icompute*maxTileListLen + (depth-1);
200  sortKeys[j] = i;
201  int2 minmax = minmaxListLen[icompute];
202  int2 minmaxOrig = minmax;
203  if (minmax.x > depth) minmax.x = depth;
204  if (minmax.y < depth) minmax.y = depth;
205  if (minmax.x != minmaxOrig.x) {
206  atomicMin(&minmaxListLen[icompute].x, minmax.x);
207  }
208  if (minmax.y != minmaxOrig.y) {
209  atomicMax(&minmaxListLen[icompute].y, minmax.y);
210  }
211  }
212 
213 }
__global__ void buildTileListsBBKernel ( const int  numTileLists,
TileList *__restrict__  tileLists,
const CudaPatchRecord *__restrict__  patches,
const int *__restrict__  tileListPos,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const int  maxTileListLen,
const BoundingBox *__restrict__  boundingBoxes,
int *__restrict__  tileJatomStart,
const int  tileJatomStartSize,
unsigned int *__restrict__  tileListDepth,
int *__restrict__  tileListOrder,
PatchPairRecord *__restrict__  patchPairs,
TileListStat *__restrict__  tileListStat 
)

Definition at line 414 of file CudaTileListKernel.cu.

References CudaPatchRecord::atomStart, buildTileListsBBKernel_shmem_sizePerThread(), distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, itileList, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, CudaPatchRecord::numAtoms, CudaPatchRecord::numFreeAtoms, numTileLists, tempStorage, WARP_FULL_MASK, WARPSIZE, and BoundingBox::x.

426  {
427  #ifdef NAMD_CUDA
428  extern __shared__ char sh_buffer[];
429  #else
430  HIP_DYNAMIC_SHARED( char, sh_buffer)
431  #endif
432  int sizePerThread = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
433  int pos = threadIdx.x*sizePerThread;
434  volatile char* sh_tile = (char*)&sh_buffer[pos];
435 
436  // Loop with warp-aligned index to avoid warp-divergence
437  for (int iwarp = (threadIdx.x/WARPSIZE)*WARPSIZE + blockIdx.x*blockDim.x;iwarp < numTileLists;iwarp += blockDim.x*gridDim.x) {
438 
439  // Use one thread per tile list
440  const int wid = threadIdx.x % WARPSIZE;
441  const int itileList = iwarp + wid;
442 
443  int i;
444  int itileListLen = 0;
445  CudaPatchRecord patch1;
446  CudaPatchRecord patch2;
447  float3 offsetXYZ;
448  int2 patchInd;
449  int numTiles2;
450  int icompute;
451 
452  if (itileList < numTileLists) {
453  offsetXYZ = tileLists[itileList].offsetXYZ;
454  patchInd = tileLists[itileList].patchInd;
455  icompute = tileLists[itileList].icompute;
456  // Get i-column
457  i = itileList - tileListPos[icompute];
458 
459  float shx = offsetXYZ.x*lata.x + offsetXYZ.y*latb.x + offsetXYZ.z*latc.x;
460  float shy = offsetXYZ.x*lata.y + offsetXYZ.y*latb.y + offsetXYZ.z*latc.y;
461  float shz = offsetXYZ.x*lata.z + offsetXYZ.y*latb.z + offsetXYZ.z*latc.z;
462 
463  // DH - set zeroShift flag if magnitude of shift vector is zero
464  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
465 
466  // Load patches
467  patch1 = patches[patchInd.x];
468  patch2 = patches[patchInd.y];
469  // int numTiles1 = (patch1.numAtoms-1)/WARPSIZE+1;
470  numTiles2 = (patch2.numAtoms-1)/WARPSIZE+1;
471  int tileStart1 = patch1.atomStart/WARPSIZE;
472  int tileStart2 = patch2.atomStart/WARPSIZE;
473 
474  // DH - self requires that zeroShift is also set
475  bool self = zeroShift && (tileStart1 == tileStart2);
476 
477  // Load i-atom data (and shift coordinates)
478  BoundingBox boundingBoxI = boundingBoxes[i + tileStart1];
479  boundingBoxI.x += shx;
480  boundingBoxI.y += shy;
481  boundingBoxI.z += shz;
482 
483  for (int j=0;j < numTiles2;j++) {
484  sh_tile[j] = 0;
485  if (!self || j >= i) {
486  BoundingBox boundingBoxJ = boundingBoxes[j + tileStart2];
487  float r2bb = distsq(boundingBoxI, boundingBoxJ);
488  if (r2bb < cutoff2) {
489  sh_tile[j] = 1;
490  itileListLen++;
491  }
492  }
493  }
494 
495  tileListDepth[itileList] = (unsigned int)itileListLen;
496  tileListOrder[itileList] = itileList;
497  }
498 
499  typedef cub::WarpScan<int> WarpScan;
500  __shared__ typename WarpScan::TempStorage tempStorage;
501  int active = (itileListLen > 0);
502  int activePos;
503  WarpScan(tempStorage).ExclusiveSum(active, activePos);
504  int itileListPos;
505  WarpScan(tempStorage).ExclusiveSum(itileListLen, itileListPos);
506 
507  int jtileStart, numJtiles;
508  // Last thread in the warp knows the total number
509  if (wid == WARPSIZE-1) {
510  atomicAdd(&tileListStat->numTileLists, activePos + active);
511  numJtiles = itileListPos + itileListLen;
512  jtileStart = atomicAdd(&tileListStat->numJtiles, numJtiles);
513  }
514 
515  numJtiles = cub::ShuffleIndex<WARPSIZE>(numJtiles, WARPSIZE-1, WARP_FULL_MASK);
516  jtileStart = cub::ShuffleIndex<WARPSIZE>(jtileStart, WARPSIZE-1, WARP_FULL_MASK);
517 
518  if (jtileStart + numJtiles > tileJatomStartSize) {
519  // tileJatomStart out of memory, exit
520  if (wid == 0) tileListStat->tilesSizeExceeded = true;
521  return;
522  }
523 
524  int jStart = itileListPos;
525  int jEnd = cub::ShuffleDown<WARPSIZE>(itileListPos, 1, WARPSIZE-1, WARP_FULL_MASK);
526 
527  if (wid == WARPSIZE-1) jEnd = numJtiles;
528 
529  if (itileListLen > 0) {
530  // Setup tileLists[]
531  //TileList TLtmp;
532  tileLists[itileList].iatomStart = patch1.atomStart + i*WARPSIZE;
533  tileLists[itileList].jtileStart = jtileStart + jStart;
534  tileLists[itileList].jtileEnd = jtileStart + jEnd - 1;
535  tileLists[itileList].patchInd = patchInd;
536  tileLists[itileList].offsetXYZ = offsetXYZ;
537  tileLists[itileList].icompute = icompute;
538  // TLtmp.patchNumList.x = 0;
539  // TLtmp.patchNumList.y = 0;
540  // tileLists[itileList] = TLtmp;
541  // PatchPair
542  PatchPairRecord patchPair;
543  patchPair.iatomSize = patch1.atomStart + patch1.numAtoms;
544  patchPair.iatomFreeSize = patch1.atomStart + patch1.numFreeAtoms;
545  patchPair.jatomSize = patch2.atomStart + patch2.numAtoms;
546  patchPair.jatomFreeSize = patch2.atomStart + patch2.numFreeAtoms;
547  patchPairs[itileList] = patchPair;
548 
549  // Write tiles
550  int jtile = jtileStart + jStart;
551  for (int j=0;j < numTiles2;j++) {
552  if (sh_tile[j]) {
553  tileJatomStart[jtile] = patch2.atomStart + j*WARPSIZE;
554  jtile++;
555  }
556  }
557 
558  }
559 
560  }
561 
562 }
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread ( const int  maxTileListLen)

Definition at line 405 of file CudaTileListKernel.cu.

Referenced by CudaTileListKernel::buildTileLists(), and buildTileListsBBKernel().

405  {
406  // Size in bytes
407  int size = (
408  maxTileListLen*sizeof(char)
409  );
410  return size;
411 }
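
The returned byte count feeds the dynamic shared-memory argument of the launch: buildTileListsBBKernel keeps one char per candidate j-tile per thread. A hedged sketch of the sizing (argument names follow the kernel above; the TILELISTKERNELNEW_NUM_WARP-based block size is an assumption about the call site in CudaTileListKernel::buildTileLists):

// Sketch: dynamic shared memory = threads per block * bytes per thread.
const int nthread = WARPSIZE*TILELISTKERNELNEW_NUM_WARP;
const int nblock = (numTileLists - 1)/nthread + 1;
const size_t shmemSize =
    nthread*buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen);
buildTileListsBBKernel<<<nblock, nthread, shmemSize, stream>>>(
    numTileLists, tileLists, patches, tileListPos, lata, latb, latc, cutoff2,
    maxTileListLen, boundingBoxes, tileJatomStart, tileJatomStartSize,
    tileListDepth, tileListOrder, patchPairs, tileListStat);
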
__global__ void calcPatchNumLists ( const int  numTileLists,
const int  numPatches,
const TileList *__restrict__  tileLists,
int *__restrict__  patchNumLists 
)

Definition at line 50 of file CudaTileListKernel.cu.

51  {
52 
53  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
54  {
55  int2 patchInd = tileLists[i].patchInd;
56  atomicAdd(&patchNumLists[patchInd.x], 1);
57  if (patchInd.x != patchInd.y) atomicAdd(&patchNumLists[patchInd.y], 1);
58  }
59 
60 }
template<int nthread>
__global__ void calcTileListPosKernel ( const int  numComputes,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
int *__restrict__  tilePos 
)

Definition at line 342 of file CudaTileListKernel.cu.

References BLOCK_SYNC, tempStorage, and WARPSIZE.

345  {
346 
347  typedef cub::BlockScan<int, nthread> BlockScan;
348 
349  __shared__ typename BlockScan::TempStorage tempStorage;
350  __shared__ int shTilePos0;
351 
352  if (threadIdx.x == nthread-1) {
353  shTilePos0 = 0;
354  }
355  for (int base=0;base < numComputes;base+=nthread) {
356  int k = base + threadIdx.x;
357 
358  int numTiles1 = (k < numComputes) ? (patches[computes[k].patchInd.x].numAtoms-1)/WARPSIZE+1 : 0;
359 
360  // Calculate positions in tile list and jtile list
361  int tilePosVal;
362  BlockScan(tempStorage).ExclusiveSum(numTiles1, tilePosVal);
363 
364  // Store into global memory
365  if (k < numComputes) {
366  tilePos[k] = shTilePos0 + tilePosVal;
367  }
368 
369  BLOCK_SYNC;
370  // Store block end position
371  if (threadIdx.x == nthread-1) {
372  shTilePos0 += tilePosVal + numTiles1;
373  }
374  }
375 }
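
The kernel walks the computes in nthread-sized chunks, exclusive-scanning each chunk with cub::BlockScan and carrying the running total between chunks in shared memory. A minimal, self-contained sketch of that scan-with-carry idiom (an illustration, not the kernel above):

#include <cub/cub.cuh>

// Exclusive prefix sum over n inputs using one block: scan each
// nthread-sized chunk, then carry the chunk total forward in shared memory.
template <int nthread>
__global__ void exclusiveSumChunked(const int n, const int* __restrict__ in,
                                    int* __restrict__ out) {
  typedef cub::BlockScan<int, nthread> BlockScan;
  __shared__ typename BlockScan::TempStorage tempStorage;
  __shared__ int carry;
  if (threadIdx.x == 0) carry = 0;
  __syncthreads();
  for (int base = 0; base < n; base += nthread) {
    const int k = base + threadIdx.x;
    const int v = (k < n) ? in[k] : 0;
    int pos;
    BlockScan(tempStorage).ExclusiveSum(v, pos);
    if (k < n) out[k] = carry + pos;
    __syncthreads();   // everyone has read carry; tempStorage reusable
    if (threadIdx.x == nthread - 1) carry += pos + v;  // add chunk total
    __syncthreads();   // carry updated before the next chunk reads it
  }
}
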
__device__ __forceinline__ float distsq ( const BoundingBox  a,
const BoundingBox  b 
)

Definition at line 295 of file CudaTileListKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

295  {
296  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
297  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
298  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
299  float r2 = dx*dx + dy*dy + dz*dz;
300  return r2;
301 }
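
Per axis, the box centers are |a.x - b.x| apart and the boxes jointly extend a.wx + b.wx, so the clamped difference is the gap between the nearest faces; distsq is therefore zero for overlapping boxes, and buildTileListsBBKernel compares it against cutoff2. A host-side analogue for testing the arithmetic (a sketch; the Box struct stands in for BoundingBox):

#include <cassert>
#include <cmath>

struct Box { float x, y, z, wx, wy, wz; };  // center and half-widths

// Host analogue of distsq: squared gap between the closest faces of two
// axis-aligned boxes; zero when they overlap.
static float distsqHost(const Box a, const Box b) {
  float dx = fmaxf(0.0f, fabsf(a.x - b.x) - a.wx - b.wx);
  float dy = fmaxf(0.0f, fabsf(a.y - b.y) - a.wy - b.wy);
  float dz = fmaxf(0.0f, fabsf(a.z - b.z) - a.wz - b.wz);
  return dx*dx + dy*dy + dz*dz;
}

int main() {
  Box a = {0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f};
  Box b = {3.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f};   // face-to-face gap of 1 in x
  assert(distsqHost(a, b) == 1.0f);
  Box c = {1.5f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f};   // overlaps a
  assert(distsqHost(a, c) == 0.0f);
  return 0;
}
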
__global__ void fillSortKeys ( const int  numComputes,
const int  maxTileListLen,
const int2 *__restrict__  minmaxListLen,
unsigned int *__restrict__  sortKeys 
)

Definition at line 215 of file CudaTileListKernel.cu.

References WARPSIZE.

216  {
217 
218  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numComputes;i+=blockDim.x/WARPSIZE*gridDim.x) {
219  const int wid = threadIdx.x % WARPSIZE;
220  int2 minmax = minmaxListLen[i];
221  int minlen = minmax.x;
222  int maxlen = minmax.y;
223  // minlen, maxlen are in range [1 ... maxTileListLen]
224  // as long as i is in tileListsSrc[].icompute above
225  if ( maxlen < minlen ) {
226  minlen = 1;
227  maxlen = maxTileListLen;
228  }
229  unsigned int minKey = sortKeys[i*maxTileListLen + minlen-1];
230  unsigned int maxKey = sortKeys[i*maxTileListLen + maxlen-1];
231  unsigned int aveKey = (maxKey + minKey)/2;
232  for (int j=wid;j < minlen-1;j+=WARPSIZE) {
233  sortKeys[i*maxTileListLen + j] = minKey;
234  }
235  for (int j=maxlen+wid;j < maxTileListLen;j+=WARPSIZE) {
236  sortKeys[i*maxTileListLen + j] = maxKey;
237  }
238  for (int j=wid;j < maxTileListLen;j+=WARPSIZE) {
239  if (sortKeys[i*maxTileListLen + j] == 0) {
240  sortKeys[i*maxTileListLen + j] = aveKey;
241  }
242  }
243  }
244 
245 }
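
In effect, fillSortKeys backfills the per-compute key table so that every depth in [1, maxTileListLen] maps to a usable position: slots below the observed minimum list length inherit the minimum's key, slots above the maximum inherit the maximum's key, and any interior slot still zero gets the average of the two. The maxlen < minlen case catches computes that produced no lists at all, since initMinMaxListLen seeds the minimum to maxTileListLen+1 and the maximum to 0.
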
int ilog2 ( int  a)

Definition at line 1192 of file CudaTileListKernel.cu.

1192  {
1193  // if (a < 0)
1194  // NAMD_die("CudaTileListKernel, ilog2: negative input value not valid");
1195  int k = 1;
1196  while (a >>= 1) k++;
1197  return k;
1198 }
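
Note that despite its name, ilog2 returns the bit length of a, i.e. floor(log2 a) + 1 for positive a: ilog2(1) == 1, ilog2(2) == 2, ilog2(65535) == 16. That is the convenient form for sizing the begin_bit/end_bit windows consumed by the sorting kernels above.
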
__global__ void initMinMaxListLen ( const int  numComputes,
const int  maxTileListLen,
int2 *__restrict__  minmaxListLen 
)

Definition at line 171 of file CudaTileListKernel.cu.

172  {
173 
174  int2 val;
175  val.x = maxTileListLen+1;
176  val.y = 0;
177  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numComputes;i += blockDim.x*gridDim.x)
178  {
179  minmaxListLen[i] = val;
180  }
181 
182 }
template<int width>
__global__ void localSort ( const int  n,
const int  begin_bit,
const int  num_bit,
unsigned int *__restrict__  keys,
int *__restrict__  vals 
)

Definition at line 116 of file CudaTileListKernel.cu.

References begin_bit, BLOCK_SYNC, and tempStorage.

117  {
118 
119  // NOTE: blockDim.x = width
120 
121  for (int base = blockDim.x*blockIdx.x;base < n;base += blockDim.x*gridDim.x)
122  {
123  int i = base + threadIdx.x;
124  typedef cub::BlockRadixSort<unsigned int, width, 1, int> BlockRadixSort;
125  __shared__ typename BlockRadixSort::TempStorage tempStorage;
126  unsigned int key[1] = {(i < n) ? ((keys[i] >> begin_bit) & 65535) : 0};
127  int val[1] = {(i < n) ? vals[i] : 0};
128  BlockRadixSort(tempStorage).SortDescending(key, val, 0, num_bit);
129  if (i < n) {
130  keys[i] = key[0];
131  vals[i] = val[0];
132  }
133  BLOCK_SYNC;
134  }
135 
136 }
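
Because cub::BlockRadixSort is specialized on the thread count, the width template argument must equal the block size at launch. A hedged sketch of a matching call (the 512 width and the grid size are assumptions, not this file's values):

// Illustrative: the <width> template parameter must equal blockDim.x.
const int sortWidth = 512;
const int nblock = (n - 1)/sortWidth + 1;
localSort<sortWidth><<<nblock, sortWidth, 0, stream>>>(n, begin_bit, num_bit, keys, vals);
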
void NAMD_die ( const char *  )

Definition at line 85 of file common.C.

86 {
87  if ( ! err_msg ) err_msg = "(unknown error)";
88  CkPrintf("FATAL ERROR: %s\n", err_msg);
89  fflush(stdout);
90  char repstr[24] = "";
91  if (CmiNumPartitions() > 1) {
92  sprintf(repstr,"REPLICA %d ", CmiMyPartition());
93  // CkAbort ensures that all replicas die
94  CkAbort("%sFATAL ERROR: %s\n", repstr, err_msg);
95  }
96  CkError("%sFATAL ERROR: %s\n", repstr, err_msg);
97 #if CHARM_VERSION < 61000
98  CkExit();
99 #else
100  CkExit(1);
101 #endif
102 }
__global__ void reOrderTileListDepth ( const int  numTileLists,
const int *__restrict__  tileListOrder,
unsigned int *__restrict__  tileListDepthSrc,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 713 of file CudaTileListKernel.cu.

714  {
715 
716  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileLists;i+=blockDim.x*gridDim.x)
717  {
718  int j = tileListOrder[i];
719  tileListDepthDst[i] = tileListDepthSrc[j];
720  }
721 
722 }
__global__ void repackTileListsKernel ( const int  numTileLists,
const int  begin_bit,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrder,
const int *__restrict__  jtiles,
const TileList *__restrict__  tileListsSrc,
TileList *__restrict__  tileListsDst,
const PatchPairRecord *__restrict__  patchPairsSrc,
PatchPairRecord *__restrict__  patchPairsDst,
const int *__restrict__  tileJatomStartSrc,
int *__restrict__  tileJatomStartDst,
const TileExcl *__restrict__  tileExclsSrc,
TileExcl *__restrict__  tileExclsDst 
)

Definition at line 569 of file CudaTileListKernel.cu.

References __ldg, begin_bit, REPACKTILELISTSKERNEL_NUM_WARP, WARP_BALLOT, WARP_FULL_MASK, and WARPSIZE.

575  {
576 
577  const int wid = threadIdx.x % WARPSIZE;
578 
579  // One warp does one tile list
580  for (int i = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;i < numTileLists;i+=blockDim.x/WARPSIZE*gridDim.x)
581  {
582  int j = tileListOrder[i];
583  int start = tileListPos[i];
584  int end = tileListPos[i+1]-1;
585  if (wid == 0 && patchPairsSrc != NULL) patchPairsDst[i] = patchPairsSrc[j];
586  // TileList
587  int startOld = __ldg(&tileListsSrc[j].jtileStart);
588  int endOld = __ldg(&tileListsSrc[j].jtileEnd);
589  int iatomStart = __ldg(&tileListsSrc[j].iatomStart);
590  float3 offsetXYZ;
591  offsetXYZ.x = __ldg(&tileListsSrc[j].offsetXYZ.x);
592  offsetXYZ.y = __ldg(&tileListsSrc[j].offsetXYZ.y);
593  offsetXYZ.z = __ldg(&tileListsSrc[j].offsetXYZ.z);
594  int2 patchInd = tileListsSrc[j].patchInd;
595  int icompute = __ldg(&tileListsSrc[j].icompute);
596  if (wid == 0) {
597  // TileList tileList;
598  tileListsDst[i].iatomStart = iatomStart;
599  tileListsDst[i].offsetXYZ = offsetXYZ;
600  tileListsDst[i].jtileStart = start;
601  tileListsDst[i].jtileEnd = end;
602  tileListsDst[i].patchInd = patchInd;
603  tileListsDst[i].icompute = icompute;
604  //tileListsDst[i] = tileList;
605  }
606 
607  if (jtiles == NULL) {
608  // No jtiles, simple copy will do
609  int jtile = start;
610  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE,jtile+=WARPSIZE) {
611  if (jtileOld + wid <= endOld) {
612  tileJatomStartDst[jtile + wid] = tileJatomStartSrc[jtileOld + wid];
613  }
614  }
615  if (tileExclsSrc != NULL) {
616  int jtile = start;
617  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld++,jtile++) {
618  tileExclsDst[jtile].excl[wid] = tileExclsSrc[jtileOld].excl[wid];
619  }
620  }
621  } else {
622  int jtile0 = start;
623  for (int jtileOld=startOld;jtileOld <= endOld;jtileOld+=WARPSIZE) {
624  int t = jtileOld + wid;
625  int jtile = (t <= endOld) ? jtiles[t] : 0;
626  jtile >>= begin_bit;
627  jtile &= 65535;
628  typedef cub::WarpScan<int> WarpScan;
629  __shared__ typename WarpScan::TempStorage tempStorage[REPACKTILELISTSKERNEL_NUM_WARP];
630  int warpId = threadIdx.x / WARPSIZE;
631  int jtilePos;
632  WarpScan(tempStorage[warpId]).ExclusiveSum(jtile, jtilePos);
633 
634  if (jtile) tileJatomStartDst[jtile0+jtilePos] = __ldg(&tileJatomStartSrc[t]);
635 
636  WarpMask b = WARP_BALLOT(WARP_FULL_MASK, jtile);
637  if (tileExclsSrc != NULL) {
638  while (b != 0) {
639  // k = index of thread that has data
640 #if WARPSIZE == 64
641  int k = __ffsll(b) - 1;
642 #else
643  int k = __ffs(b) - 1;
644 #endif
645  tileExclsDst[jtile0].excl[wid] = __ldg(&tileExclsSrc[jtileOld + k].excl[wid]);
646  // remove 1 bit and advance jtile0
647  b ^= ((WarpMask)1 << k);
648  jtile0++;
649  }
650  } else {
651 #if WARPSIZE == 64
652  jtile0 += __popcll(b);
653 #else
654  jtile0 += __popc(b);
655 #endif
656  }
657  }
658  }
659  }
660 
661 }
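
The jtiles branch compacts surviving tiles with a warp ballot: each lane's rank among the set bits below it gives its dense output slot. A minimal sketch of that idiom for 32-lane CUDA warps (a generic illustration, not the kernel's exact code):

// Warp-level stream compaction: each lane holds one value and a keep flag;
// kept values are written densely, in lane order.
__device__ void warpCompact(const int value, const bool keep,
                            int* __restrict__ out, int* __restrict__ outCount) {
  const unsigned int ballot = __ballot_sync(0xffffffffu, keep);
  const int lane = threadIdx.x & 31;
  const int rank = __popc(ballot & ((1u << lane) - 1)); // kept lanes below me
  int base = 0;
  if (lane == 0) base = atomicAdd(outCount, __popc(ballot));
  base = __shfl_sync(0xffffffffu, base, 0);             // broadcast lane 0's base
  if (keep) out[base + rank] = value;
}
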
__global__ void setPatchNumLists_findEmptyPatches ( const int  numTileLists,
TileList *__restrict__  tileLists,
const int *__restrict__  patchNumLists,
const int  numPatches,
int *__restrict__  numEmptyPatches,
int *__restrict__  emptyPatches 
)

Definition at line 66 of file CudaTileListKernel.cu.

68  {
69 
70  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numTileLists;i += blockDim.x*gridDim.x)
71  {
72  int2 patchInd = tileLists[i].patchInd;
73  int2 patchNumList = make_int2(patchNumLists[patchInd.x], patchNumLists[patchInd.y]);
74  tileLists[i].patchNumList = patchNumList;
75  }
76 
77  for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < numPatches;i += blockDim.x*gridDim.x)
78  {
79  if (patchNumLists[i] == 0) {
80  int ind = atomicAdd(numEmptyPatches, 1);
81  emptyPatches[ind] = i;
82  }
83  }
84 
85 }
__global__ void setupSortKey ( const int  numTileLists,
const int  maxTileListLen,
const TileList *__restrict__  tileLists,
const unsigned int *__restrict__  tileListDepth,
const int  begin_bit,
const unsigned int *__restrict__  sortKeys,
unsigned int *__restrict__  sortKey 
)

Definition at line 101 of file CudaTileListKernel.cu.

References itileList.

103  {
104 
105  for (int itileList = threadIdx.x + blockDim.x*blockIdx.x;itileList < numTileLists;itileList += blockDim.x*gridDim.x)
106  {
107  int icompute = tileLists[itileList].icompute;
108  int depth = min((tileListDepth[itileList] >> begin_bit) & 65535, maxTileListLen);
109  int i = icompute*maxTileListLen + (depth - 1);
110  sortKey[itileList] = (depth == 0) ? 0x7fffffff : sortKeys[i];
111  }
112 
113 }
__global__ void storeInReverse ( const int  numTileListsSrc,
const int  begin_bit,
const int *__restrict__  outputOrder,
const int *__restrict__  tileListPos,
const int *__restrict__  tileListOrderSrc,
const unsigned int *__restrict__  tileListDepthSrc,
int *__restrict__  tileListOrderDst,
unsigned int *__restrict__  tileListDepthDst 
)

Definition at line 138 of file CudaTileListKernel.cu.

143  {
144 
145  for (int i = threadIdx.x + blockDim.x*blockIdx.x;i < numTileListsSrc;i += blockDim.x*gridDim.x)
146  {
147  int j = outputOrder[numTileListsSrc - i - 1];
148  if ( ((tileListDepthSrc[j] >> begin_bit) & 65535) > 0 ) {
149  int k = tileListPos[i];
150  tileListDepthDst[k] = tileListDepthSrc[j];
151  tileListOrderDst[k] = j; //tileListOrderSrc[j];
152  }
153  }
154 }
template<int nthread>
__global__ void updatePatchesKernel ( const int  numComputes,
const int *__restrict__  tilePos,
const CudaComputeRecord *__restrict__  computes,
const CudaPatchRecord *__restrict__  patches,
TileList *__restrict__  tileLists 
)

Definition at line 379 of file CudaTileListKernel.cu.

References CudaComputeRecord::offsetXYZ, CudaComputeRecord::patchInd, and WARPSIZE.

383  {
384 
385  const int tid = threadIdx.x % nthread;
386 
387  // nthread threads takes care of one compute
388  for (int k = (threadIdx.x + blockIdx.x*blockDim.x)/nthread;k < numComputes;k+=blockDim.x*gridDim.x/nthread)
389  {
390  CudaComputeRecord compute = computes[k];
391  float3 offsetXYZ = compute.offsetXYZ;
392  int2 patchInd = compute.patchInd;
393  int numTiles1 = (patches[patchInd.x].numAtoms-1)/WARPSIZE+1;
394  int itileList0 = tilePos[k];
395  for (int i=tid;i < numTiles1;i+=nthread) {
396  tileLists[itileList0 + i].offsetXYZ = offsetXYZ;
397  tileLists[itileList0 + i].patchInd = patchInd;
398  tileLists[itileList0 + i].icompute = k;
399  }
400  }
401 
402 }

Variable Documentation

const int begin_bit

BLOCK_SYNC

valT *__restrict__ tileListOrderDst

typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU

Definition at line 676 of file CudaTileListKernel.cu.

__thread DeviceCUDA* deviceCUDA

Definition at line 22 of file DeviceCUDA.C.

const int end_bit

Definition at line 676 of file CudaTileListKernel.cu.

keyT keys [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]

Definition at line 696 of file CudaTileListKernel.cu.

BlockLoad::TempStorage load

BlockLoadU::TempStorage loadU

Definition at line 692 of file CudaTileListKernel.cu.

const int numTileListsDst

Definition at line 676 of file CudaTileListKernel.cu.

const int numTileListsSrc

Definition at line 699 of file CudaTileListKernel.cu.

const keyT oobKey

Definition at line 676 of file CudaTileListKernel.cu.

BlockRadixSort::TempStorage sort

__shared__ { ... } tempStorage

keyT *__restrict__ tileListDepthDst

Definition at line 676 of file CudaTileListKernel.cu.

keyT *__restrict__ tileListDepthSrc

Definition at line 676 of file CudaTileListKernel.cu.

valT *__restrict__ tileListOrderSrc

Definition at line 676 of file CudaTileListKernel.cu.

valT values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD]

Definition at line 697 of file CudaTileListKernel.cu.

Referenced by Controller::printEnergies(), and Parameters::read_parm().