NAMD
|
#include <cuda.h>
#include <namd_cub/device/device_radix_sort.cuh>
#include <namd_cub/device/device_scan.cuh>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"
Go to the source code of this file.
Macros | |
#define | OVERALLOC 1.2f |
#define | DEFAULTKERNEL_NUM_THREAD 1024 |
#define | UPDATEPATCHESKERNEL_NUM_THREAD 512 |
#define | CALCPATCHNUMLISTSKERNEL_NUM_THREAD 512 |
#define | BOUNDINGBOXKERNEL_NUM_WARP 8 |
#define | __ldg * |
#define | TILELISTKERNELNEW_NUM_WARP 4 |
#define | REPACKTILELISTSKERNEL_NUM_WARP 32 |
#define | SORTTILELISTSKERNEL_NUM_THREAD 512 |
#define | SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22 |
Typedefs | |
typedef cub::BlockLoad< valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > | BlockLoad |
typedef cub::BlockRadixSort < keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, valT > | BlockRadixSort |
Functions | |
void | NAMD_die (const char *) |
__global__ void | calcPatchNumLists (const int numTileLists, const int numPatches, const TileList *__restrict__ tileLists, int *__restrict__ patchNumLists) |
__global__ void | setPatchNumLists_findEmptyPatches (const int numTileLists, TileList *__restrict__ tileLists, const int *__restrict__ patchNumLists, const int numPatches, int *__restrict__ numEmptyPatches, int *__restrict__ emptyPatches) |
__global__ void | buildRemoveZerosSortKey (const int numTileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, unsigned int *__restrict__ sortKey) |
__global__ void | setupSortKey (const int numTileLists, const int maxTileListLen, const TileList *__restrict__ tileLists, const unsigned int *__restrict__ tileListDepth, const int begin_bit, const unsigned int *__restrict__ sortKeys, unsigned int *__restrict__ sortKey) |
template<int width> | |
__global__ void | localSort (const int n, const int begin_bit, const int num_bit, unsigned int *__restrict__ keys, int *__restrict__ vals) |
__global__ void | storeInReverse (const int numTileListsSrc, const int begin_bit, const int *__restrict__ outputOrder, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrderSrc, const unsigned int *__restrict__ tileListDepthSrc, int *__restrict__ tileListOrderDst, unsigned int *__restrict__ tileListDepthDst) |
__global__ void | bitshiftTileListDepth (const int numTileLists, const int begin_bit, const int *__restrict__ outputOrder, const unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst) |
__global__ void | initMinMaxListLen (const int numComputes, const int maxTileListLen, int2 *__restrict__ minmaxListLen) |
__global__ void | buildSortKeys (const int numTileListsDst, const int maxTileListLen, const TileList *__restrict__ tileListsSrc, const int *__restrict__ tileListOrderDst, const unsigned int *__restrict__ tileListDepthDst, int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys) |
__global__ void | fillSortKeys (const int numComputes, const int maxTileListLen, const int2 *__restrict__ minmaxListLen, unsigned int *__restrict__ sortKeys) |
__global__ void | buildBoundingBoxesKernel (const int atomStorageSize, const float4 *__restrict__ xyzq, BoundingBox *__restrict__ boundingBoxes) |
__device__ __forceinline__ float | distsq (const BoundingBox a, const BoundingBox b) |
template<int nthread> | |
__global__ void | calcTileListPosKernel (const int numComputes, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, int *__restrict__ tilePos) |
template<int nthread> | |
__global__ void | updatePatchesKernel (const int numComputes, const int *__restrict__ tilePos, const CudaComputeRecord *__restrict__ computes, const CudaPatchRecord *__restrict__ patches, TileList *__restrict__ tileLists) |
__host__ __device__ __forceinline__ int | buildTileListsBBKernel_shmem_sizePerThread (const int maxTileListLen) |
__global__ void | buildTileListsBBKernel (const int numTileLists, TileList *__restrict__ tileLists, const CudaPatchRecord *__restrict__ patches, const int *__restrict__ tileListPos, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const int maxTileListLen, const BoundingBox *__restrict__ boundingBoxes, int *__restrict__ tileJatomStart, const int tileJatomStartSize, unsigned int *__restrict__ tileListDepth, int *__restrict__ tileListOrder, PatchPairRecord *__restrict__ patchPairs, TileListStat *__restrict__ tileListStat) |
__global__ void | repackTileListsKernel (const int numTileLists, const int begin_bit, const int *__restrict__ tileListPos, const int *__restrict__ tileListOrder, const int *__restrict__ jtiles, const TileList *__restrict__ tileListsSrc, TileList *__restrict__ tileListsDst, const PatchPairRecord *__restrict__ patchPairsSrc, PatchPairRecord *__restrict__ patchPairsDst, const int *__restrict__ tileJatomStartSrc, int *__restrict__ tileJatomStartDst, const TileExcl *__restrict__ tileExclsSrc, TileExcl *__restrict__ tileExclsDst) |
template<typename keyT , typename valT , bool ascend> | |
__launch_bounds__ (SORTTILELISTSKERNEL_NUM_THREAD, 1) __global__ void sortTileListsKernel(const int numTileListsSrc | |
BlockLoadU (tempStorage.loadU).Load(tileListDepthSrc | |
BlockLoad (tempStorage.load).Load(tileListOrderSrc | |
BlockRadixSort (tempStorage.sort).SortBlockedToStriped(keys | |
__global__ void | reOrderTileListDepth (const int numTileLists, const int *__restrict__ tileListOrder, unsigned int *__restrict__ tileListDepthSrc, unsigned int *__restrict__ tileListDepthDst) |
__global__ void | bitshiftTileListDepth (const int numTileLists, const int begin_bit, unsigned int *__restrict__ tileListDepth) |
int | ilog2 (int a) |
Variables | |
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ valT *__restrict__ tileListOrderDst typedef cub::BlockLoad< keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE > | BlockLoadU |
__thread DeviceCUDA * | deviceCUDA |
const int | numTileListsDst |
const int const int | begin_bit |
const int const int const int | end_bit |
const int const int const int const keyT | oobKey |
const int const int const int const keyT keyT *__restrict__ | tileListDepthSrc |
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ | tileListDepthDst |
const int const int const int const keyT keyT *__restrict__ keyT *__restrict__ valT *__restrict__ | tileListOrderSrc |
union { | |
BlockLoad::TempStorage load | |
BlockLoadU::TempStorage loadU | |
BlockRadixSort::TempStorage sort | |
} | tempStorage |
keyT | keys [SORTTILELISTSKERNEL_ITEMS_PER_THREAD] |
valT | values [SORTTILELISTSKERNEL_ITEMS_PER_THREAD] |
numTileListsSrc | |
BLOCK_SYNC | |
#define __ldg * |
Definition at line 42 of file CudaTileListKernel.cu.
Referenced by calcForceEnergy(), exclusionForce(), gather_force(), modifiedExclusionForce(), and repackTileListsKernel().
#define BOUNDINGBOXKERNEL_NUM_WARP 8 |
Definition at line 34 of file CudaTileListKernel.cu.
Referenced by buildBoundingBoxesKernel(), and CudaTileListKernel::buildTileLists().
#define CALCPATCHNUMLISTSKERNEL_NUM_THREAD 512 |
Definition at line 33 of file CudaTileListKernel.cu.
#define DEFAULTKERNEL_NUM_THREAD 1024 |
Definition at line 31 of file CudaTileListKernel.cu.
Referenced by CudaTileListKernel::buildTileLists().
#define OVERALLOC 1.2f |
Definition at line 29 of file CudaTileListKernel.cu.
Referenced by CudaTileListKernel::buildTileLists(), and CudaTileListKernel::reSortTileLists().
#define REPACKTILELISTSKERNEL_NUM_WARP 32 |
Definition at line 564 of file CudaTileListKernel.cu.
Referenced by repackTileListsKernel().
#define SORTTILELISTSKERNEL_ITEMS_PER_THREAD 22 |
Definition at line 669 of file CudaTileListKernel.cu.
#define SORTTILELISTSKERNEL_NUM_THREAD 512 |
Definition at line 668 of file CudaTileListKernel.cu.
#define TILELISTKERNELNEW_NUM_WARP 4 |
Definition at line 336 of file CudaTileListKernel.cu.
Referenced by CudaTileListKernel::buildTileLists().
#define UPDATEPATCHESKERNEL_NUM_THREAD 512 |
Definition at line 32 of file CudaTileListKernel.cu.
Referenced by CudaTileListKernel::buildTileLists().
typedef cub::BlockLoad<valT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad |
Definition at line 685 of file CudaTileListKernel.cu.
Definition at line 688 of file CudaTileListKernel.cu.
__launch_bounds__ | ( | SORTTILELISTSKERNEL_NUM_THREAD | , |
1 | |||
) | const |
__global__ void bitshiftTileListDepth | ( | const int | numTileLists, |
const int | begin_bit, | ||
const int *__restrict__ | outputOrder, | ||
const unsigned int *__restrict__ | tileListDepthSrc, | ||
unsigned int *__restrict__ | tileListDepthDst | ||
) |
Definition at line 159 of file CudaTileListKernel.cu.
References begin_bit.
__global__ void bitshiftTileListDepth | ( | const int | numTileLists, |
const int | begin_bit, | ||
unsigned int *__restrict__ | tileListDepth | ||
) |
Definition at line 727 of file CudaTileListKernel.cu.
References begin_bit, and itileList.
BlockLoad | ( | tempStorage. | load | ) |
BlockLoadU | ( | tempStorage. | loadU | ) |
BlockRadixSort | ( | tempStorage. | sort | ) |
__global__ void buildBoundingBoxesKernel | ( | const int | atomStorageSize, |
const float4 *__restrict__ | xyzq, | ||
BoundingBox *__restrict__ | boundingBoxes | ||
) |
Definition at line 252 of file CudaTileListKernel.cu.
References BOUNDINGBOXKERNEL_NUM_WARP, tempStorage, WARPSIZE, BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.
__global__ void buildRemoveZerosSortKey | ( | const int | numTileLists, |
const unsigned int *__restrict__ | tileListDepth, | ||
const int | begin_bit, | ||
unsigned int *__restrict__ | sortKey | ||
) |
Definition at line 90 of file CudaTileListKernel.cu.
References begin_bit, and itileList.
__global__ void buildSortKeys | ( | const int | numTileListsDst, |
const int | maxTileListLen, | ||
const TileList *__restrict__ | tileListsSrc, | ||
const int *__restrict__ | tileListOrderDst, | ||
const unsigned int *__restrict__ | tileListDepthDst, | ||
int2 *__restrict__ | minmaxListLen, | ||
unsigned int *__restrict__ | sortKeys | ||
) |
Definition at line 187 of file CudaTileListKernel.cu.
__global__ void buildTileListsBBKernel | ( | const int | numTileLists, |
TileList *__restrict__ | tileLists, | ||
const CudaPatchRecord *__restrict__ | patches, | ||
const int *__restrict__ | tileListPos, | ||
const float3 | lata, | ||
const float3 | latb, | ||
const float3 | latc, | ||
const float | cutoff2, | ||
const int | maxTileListLen, | ||
const BoundingBox *__restrict__ | boundingBoxes, | ||
int *__restrict__ | tileJatomStart, | ||
const int | tileJatomStartSize, | ||
unsigned int *__restrict__ | tileListDepth, | ||
int *__restrict__ | tileListOrder, | ||
PatchPairRecord *__restrict__ | patchPairs, | ||
TileListStat *__restrict__ | tileListStat | ||
) |
Definition at line 414 of file CudaTileListKernel.cu.
References CudaPatchRecord::atomStart, buildTileListsBBKernel_shmem_sizePerThread(), distsq(), PatchPairRecord::iatomFreeSize, PatchPairRecord::iatomSize, itileList, PatchPairRecord::jatomFreeSize, PatchPairRecord::jatomSize, CudaPatchRecord::numAtoms, CudaPatchRecord::numFreeAtoms, numTileLists, tempStorage, WARP_FULL_MASK, WARPSIZE, and BoundingBox::x.
__host__ __device__ __forceinline__ int buildTileListsBBKernel_shmem_sizePerThread | ( | const int | maxTileListLen | ) |
Definition at line 405 of file CudaTileListKernel.cu.
Referenced by CudaTileListKernel::buildTileLists(), and buildTileListsBBKernel().
__global__ void calcPatchNumLists | ( | const int | numTileLists, |
const int | numPatches, | ||
const TileList *__restrict__ | tileLists, | ||
int *__restrict__ | patchNumLists | ||
) |
Definition at line 50 of file CudaTileListKernel.cu.
__global__ void calcTileListPosKernel | ( | const int | numComputes, |
const CudaComputeRecord *__restrict__ | computes, | ||
const CudaPatchRecord *__restrict__ | patches, | ||
int *__restrict__ | tilePos | ||
) |
Definition at line 342 of file CudaTileListKernel.cu.
References BLOCK_SYNC, tempStorage, and WARPSIZE.
__device__ __forceinline__ float distsq | ( | const BoundingBox | a, |
const BoundingBox | b | ||
) |
Definition at line 295 of file CudaTileListKernel.cu.
References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.
__global__ void fillSortKeys | ( | const int | numComputes, |
const int | maxTileListLen, | ||
const int2 *__restrict__ | minmaxListLen, | ||
unsigned int *__restrict__ | sortKeys | ||
) |
int ilog2 | ( | int | a | ) |
Definition at line 1192 of file CudaTileListKernel.cu.
__global__ void initMinMaxListLen | ( | const int | numComputes, |
const int | maxTileListLen, | ||
int2 *__restrict__ | minmaxListLen | ||
) |
Definition at line 171 of file CudaTileListKernel.cu.
__global__ void localSort | ( | const int | n, |
const int | begin_bit, | ||
const int | num_bit, | ||
unsigned int *__restrict__ | keys, | ||
int *__restrict__ | vals | ||
) |
Definition at line 116 of file CudaTileListKernel.cu.
References begin_bit, BLOCK_SYNC, and tempStorage.
__global__ void reOrderTileListDepth | ( | const int | numTileLists, |
const int *__restrict__ | tileListOrder, | ||
unsigned int *__restrict__ | tileListDepthSrc, | ||
unsigned int *__restrict__ | tileListDepthDst | ||
) |
Definition at line 713 of file CudaTileListKernel.cu.
__global__ void repackTileListsKernel | ( | const int | numTileLists, |
const int | begin_bit, | ||
const int *__restrict__ | tileListPos, | ||
const int *__restrict__ | tileListOrder, | ||
const int *__restrict__ | jtiles, | ||
const TileList *__restrict__ | tileListsSrc, | ||
TileList *__restrict__ | tileListsDst, | ||
const PatchPairRecord *__restrict__ | patchPairsSrc, | ||
PatchPairRecord *__restrict__ | patchPairsDst, | ||
const int *__restrict__ | tileJatomStartSrc, | ||
int *__restrict__ | tileJatomStartDst, | ||
const TileExcl *__restrict__ | tileExclsSrc, | ||
TileExcl *__restrict__ | tileExclsDst | ||
) |
Definition at line 569 of file CudaTileListKernel.cu.
References __ldg, begin_bit, REPACKTILELISTSKERNEL_NUM_WARP, WARP_BALLOT, WARP_FULL_MASK, and WARPSIZE.
__global__ void setPatchNumLists_findEmptyPatches | ( | const int | numTileLists, |
TileList *__restrict__ | tileLists, | ||
const int *__restrict__ | patchNumLists, | ||
const int | numPatches, | ||
int *__restrict__ | numEmptyPatches, | ||
int *__restrict__ | emptyPatches | ||
) |
Definition at line 66 of file CudaTileListKernel.cu.
__global__ void setupSortKey | ( | const int | numTileLists, |
const int | maxTileListLen, | ||
const TileList *__restrict__ | tileLists, | ||
const unsigned int *__restrict__ | tileListDepth, | ||
const int | begin_bit, | ||
const unsigned int *__restrict__ | sortKeys, | ||
unsigned int *__restrict__ | sortKey | ||
) |
Definition at line 101 of file CudaTileListKernel.cu.
References itileList.
__global__ void storeInReverse | ( | const int | numTileListsSrc, |
const int | begin_bit, | ||
const int *__restrict__ | outputOrder, | ||
const int *__restrict__ | tileListPos, | ||
const int *__restrict__ | tileListOrderSrc, | ||
const unsigned int *__restrict__ | tileListDepthSrc, | ||
int *__restrict__ | tileListOrderDst, | ||
unsigned int *__restrict__ | tileListDepthDst | ||
) |
Definition at line 138 of file CudaTileListKernel.cu.
__global__ void updatePatchesKernel | ( | const int | numComputes, |
const int *__restrict__ | tilePos, | ||
const CudaComputeRecord *__restrict__ | computes, | ||
const CudaPatchRecord *__restrict__ | patches, | ||
TileList *__restrict__ | tileLists | ||
) |
Definition at line 379 of file CudaTileListKernel.cu.
References CudaComputeRecord::offsetXYZ, CudaComputeRecord::patchInd, and WARPSIZE.
else begin_bit |
Definition at line 676 of file CudaTileListKernel.cu.
Referenced by bitshiftTileListDepth(), buildRemoveZerosSortKey(), localSort(), and repackTileListsKernel().
BLOCK_SYNC |
Definition at line 700 of file CudaTileListKernel.cu.
Referenced by bondedForcesKernel(), calcTileListPosKernel(), gather_force(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), localSort(), modifiedExclusionForcesKernel(), reduceGBISEnergyKernel(), reduceNonbondedVirialKernel(), reduceVariables(), reduceVirialEnergyKernel(), scalar_sum_kernel(), spread_charge_kernel(), transpose_xyz_yzx_device(), transpose_xyz_zxy_device(), and void().
const int const int const int const keyT keyT* __restrict__ keyT* __restrict__ valT* __restrict__ valT* __restrict__ tileListOrderDst typedef cub::BlockLoad<keyT, SORTTILELISTSKERNEL_NUM_THREAD, SORTTILELISTSKERNEL_ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadU |
Definition at line 676 of file CudaTileListKernel.cu.
__thread DeviceCUDA* deviceCUDA |
Definition at line 22 of file DeviceCUDA.C.
else end_bit |
Definition at line 676 of file CudaTileListKernel.cu.
keys |
Definition at line 696 of file CudaTileListKernel.cu.
BlockLoad::TempStorage load |
Definition at line 691 of file CudaTileListKernel.cu.
Referenced by Rebalancer::computeAverage(), Rebalancer::computeMax(), Rebalancer::decrSTLoad(), RefineTorusLB::newRefine(), Rebalancer::printSummary(), recursive_bisect_with_curve(), Rebalancer::refine(), and NamdCentLB::Strategy().
BlockLoadU::TempStorage loadU |
Definition at line 692 of file CudaTileListKernel.cu.
const int numTileListsDst |
Definition at line 676 of file CudaTileListKernel.cu.
numTileListsSrc |
Definition at line 699 of file CudaTileListKernel.cu.
oobKey |
Definition at line 676 of file CudaTileListKernel.cu.
BlockRadixSort::TempStorage sort |
Definition at line 693 of file CudaTileListKernel.cu.
Referenced by ComputeNonbondedCUDA::assignPatches(), CudaComputeNonbonded::assignPatches(), CudaComputeNonbonded::assignPatchesOnPe(), Molecule::build_go_sigmas2(), WorkDistrib::buildNodeAwarePeOrdering(), ParallelIOMgr::createHomePatches(), generatePmePeList2(), PairTable::getPairGaussArrays2(), PairTable::getPairLJArrays2(), ParallelIOMgr::initialize(), ComputePmeMgr::initialize(), ComputePmeMgr::initialize_pencils(), ParallelIOMgr::integrateMigratedAtoms(), PmePencil< CBase_PmeZPencil >::order_init(), outputProxyTree(), Patch::positionsReady(), recursive_bisect_coord(), recursive_bisect_with_curve(), ComputeQMMgr::recvPntChrg(), TopoManagerWrapper::sortAndSplit(), sortAtomsForPatches(), ExclSigInfo::sortExclOffset(), AtomSigInfo::sortTupleSigIndices(), and TopoManagerWrapper::TopoManagerWrapper().
__shared__ { ... } tempStorage |
const int const int const int const keyT keyT* __restrict__ keyT* __restrict__ tileListDepthDst |
Definition at line 676 of file CudaTileListKernel.cu.
const int const int const int const keyT keyT* __restrict__ tileListDepthSrc |
Definition at line 676 of file CudaTileListKernel.cu.
const int const int const int const keyT keyT* __restrict__ keyT* __restrict__ valT* __restrict__ tileListOrderSrc |
Definition at line 676 of file CudaTileListKernel.cu.
else values |
Definition at line 697 of file CudaTileListKernel.cu.
Referenced by Controller::printEnergies(), and Parameters::read_parm().