#include <CudaTileListKernel.h>

Public Member Functions
	CudaTileListKernel (int deviceID, bool doStreaming)

	~CudaTileListKernel ()

int	getNumEmptyPatches ()

int *	getEmptyPatches ()

int	getNumExcluded ()

float	get_plcutoff2 ()

int	getNumTileLists ()

int	getNumTileListsGBIS ()

int	getNumJtiles ()

BoundingBox *	getBoundingBoxes ()

int *	getJtiles ()

float4 *	get_xyzq ()

TileListStat *	getTileListStatDevPtr ()

void	clearTileListStat (cudaStream_t stream)

int *	getTileJatomStart ()

TileList *	getTileLists ()

unsigned int *	getTileListDepth ()

int *	getTileListOrder ()

TileExcl *	getTileExcls ()

PatchPairRecord *	getPatchPairs ()

int *	getTileJatomStartGBIS ()

TileList *	getTileListsGBIS ()

TileListVirialEnergy *	getTileListVirialEnergy ()

CudaPatchRecord *	getCudaPatches ()

void	prepareTileList (cudaStream_t stream)

void	finishTileList (cudaStream_t stream)

void	updateComputes (const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)

void	buildTileLists (const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream)

void	reSortTileLists (const bool doGBIS, cudaStream_t stream)

void	setTileListVirialEnergyLength (int len)

void	setTileListVirialEnergyGBISLength (int len)

int	getTileListVirialEnergyLength ()

int	getTileListVirialEnergyGBISLength ()

int	getNumPatches ()

int	getNumComputes ()

int *	getOutputOrder ()

Detailed Description

Definition at line 87 of file CudaTileListKernel.h.

Constructor & Destructor Documentation

CudaTileListKernel::CudaTileListKernel	(	int	deviceID,
		bool	doStreaming
	)

Definition at line 744 of file CudaTileListKernel.cu.

References cudaCheck.

                                                                      :
 deviceID(deviceID), doStreaming(doStreaming) {
 
   cudaCheck(cudaSetDevice(deviceID));
 
   activeBuffer = 1;
 
   numPatches = 0;
   numComputes = 0;
 
   cudaPatches = NULL;
   cudaPatchesSize = 0;
 
   cudaComputes = NULL;
   cudaComputesSize = 0;
 
   patchNumLists = NULL;
   patchNumListsSize = 0;
 
   emptyPatches = NULL;
   emptyPatchesSize = 0;
   h_emptyPatches = NULL;
   h_emptyPatchesSize = 0;
   numEmptyPatches = 0;
 
   sortKeySrc = NULL;
   sortKeySrcSize = 0;
   sortKeyDst = NULL;
   sortKeyDstSize = 0;
 
   tileLists1 = NULL;
   tileLists1Size = 0;
   tileLists2 = NULL;
   tileLists2Size = 0;
 
   patchPairs1 = NULL;
   patchPairs1Size = 0;
   patchPairs2 = NULL;
   patchPairs2Size = 0;
 
   tileJatomStart1 = NULL;
   tileJatomStart1Size = 0;
   tileJatomStart2 = NULL;
   tileJatomStart2Size = 0;
 
   boundingBoxes = NULL;
   boundingBoxesSize = 0;
 
   tileListDepth1 = NULL;
   tileListDepth1Size = 0;
   tileListDepth2 = NULL;
   tileListDepth2Size = 0;
 
   tileListOrder1 = NULL;
   tileListOrder1Size = 0;
   tileListOrder2 = NULL;
   tileListOrder2Size = 0;
 
   tileExcls1 = NULL;
   tileExcls1Size = 0;
   tileExcls2 = NULL;
   tileExcls2Size = 0;
 
   xyzq = NULL;
   xyzqSize = 0;
 
   allocate_device<TileListStat>(&d_tileListStat, 1);
   allocate_host<TileListStat>(&h_tileListStat, 1);
 
   tileListPos = NULL;
   tileListPosSize = 0;
   tempStorage = NULL;
   tempStorageSize = 0;
 
   jtiles = NULL;
   jtilesSize = 0;
 
   tilePos = NULL;
   tilePosSize = 0;
 
   tileListsGBIS = NULL;
   tileListsGBISSize = 0;
 
   tileJatomStartGBIS = NULL;
   tileJatomStartGBISSize = 0;
 
   tileListVirialEnergy = NULL;
   tileListVirialEnergySize = 0;
 
   atomStorageSize = 0;
   numTileLists = 0;
   numTileListsGBIS = 0;
   numJtiles = 1;
 
   outputOrder = NULL;
   outputOrderSize = 0;
   doOutputOrder = false;
 
   minmaxListLen = NULL;
   minmaxListLenSize = 0;
 
   sortKeys = NULL;
   sortKeysSize = 0;
   sortKeys_endbit = 0;
 
   cudaCheck(cudaEventCreate(&tileListStatEvent));
   tileListStatEventRecord = false;
 }

CudaTileListKernel::~CudaTileListKernel ( )

Definition at line 853 of file CudaTileListKernel.cu.

References cudaCheck.

                                         {
   cudaCheck(cudaSetDevice(deviceID));
   deallocate_device<TileListStat>(&d_tileListStat);
   deallocate_host<TileListStat>(&h_tileListStat);
   //
   if (patchNumLists != NULL) deallocate_device<int>(&patchNumLists);
   if (emptyPatches != NULL) deallocate_device<int>(&emptyPatches);
   if (h_emptyPatches != NULL) deallocate_host<int>(&h_emptyPatches);
   if (sortKeySrc != NULL) deallocate_device<unsigned int>(&sortKeySrc);
   if (sortKeyDst != NULL) deallocate_device<unsigned int>(&sortKeyDst);
   //
   if (cudaPatches != NULL) deallocate_device<CudaPatchRecord>(&cudaPatches);
   if (cudaComputes != NULL) deallocate_device<CudaComputeRecord>(&cudaComputes);
   if (patchPairs1 != NULL) deallocate_device<PatchPairRecord>(&patchPairs1);
   if (patchPairs2 != NULL) deallocate_device<PatchPairRecord>(&patchPairs2);
   if (tileLists1 != NULL) deallocate_device<TileList>(&tileLists1);
   if (tileLists2 != NULL) deallocate_device<TileList>(&tileLists2);
   if (tileJatomStart1 != NULL) deallocate_device<int>(&tileJatomStart1);
   if (tileJatomStart2 != NULL) deallocate_device<int>(&tileJatomStart2);
   if (boundingBoxes != NULL) deallocate_device<BoundingBox>(&boundingBoxes);
   if (tileListDepth1 != NULL) deallocate_device<unsigned int>(&tileListDepth1);
   if (tileListDepth2 != NULL) deallocate_device<unsigned int>(&tileListDepth2);
   if (tileListOrder1 != NULL) deallocate_device<int>(&tileListOrder1);
   if (tileListOrder2 != NULL) deallocate_device<int>(&tileListOrder2);
   if (tileListPos != NULL) deallocate_device<int>(&tileListPos);
   if (tileExcls1 != NULL) deallocate_device<TileExcl>(&tileExcls1);
   if (tileExcls2 != NULL) deallocate_device<TileExcl>(&tileExcls2);
   if (tempStorage != NULL) deallocate_device<char>(&tempStorage);
   if (jtiles != NULL) deallocate_device<int>(&jtiles);
   if (tilePos != NULL) deallocate_device<int>(&tilePos);
 
   if (tileListsGBIS != NULL) deallocate_device<TileList>(&tileListsGBIS);
   if (tileJatomStartGBIS != NULL) deallocate_device<int>(&tileJatomStartGBIS);
 
   if (tileListVirialEnergy != NULL) deallocate_device<TileListVirialEnergy>(&tileListVirialEnergy);
 
   if (xyzq != NULL) deallocate_device<float4>(&xyzq);
 
   if (sortKeys != NULL) deallocate_device<unsigned int>(&sortKeys);
   if (minmaxListLen != NULL) deallocate_device<int2>(&minmaxListLen);
 
   cudaCheck(cudaEventDestroy(tileListStatEvent));
 }

Member Function Documentation

void CudaTileListKernel::buildTileLists	(	const int	numTileListsPrev,
		const int	numPatchesIn,
		const int	atomStorageSizeIn,
		const int	maxTileListLenIn,
		const float3	lata,
		const float3	latb,
		const float3	latc,
		const CudaPatchRecord *	h_cudaPatches,
		const float4 *	h_xyzq,
		const float	plcutoff2In,
		const size_t	maxShmemPerBlock,
		cudaStream_t	stream
	)

Definition at line 1031 of file CudaTileListKernel.cu.

References BOUNDINGBOXKERNEL_NUM_WARP, buildTileListsBBKernel_shmem_sizePerThread(), clearTileListStat(), cudaCheck, DEFAULTKERNEL_NUM_THREAD, deviceCUDA, DeviceCUDA::getMaxNumBlocks(), lata, latb, latc, NAMD_die(), TileListStat::numTileLists, OVERALLOC, stream, TILELISTKERNELNEW_NUM_WARP, TileListStat::tilesSizeExceeded, UPDATEPATCHESKERNEL_NUM_THREAD, and WARPSIZE.

                        {
   // Builds the tile lists for the given patches and coordinates on `stream`.
   // In order: stores the sizing inputs, uploads patch records and xyzq,
   // computes tile list positions, fills per-list patch data into tileLists1,
   // builds bounding boxes, runs buildTileListsBBKernel (re-running it once if
   // the j-tile storage was too small), sizes the second buffer set, and sorts
   // buffer set 1 into buffer set 2, which then becomes the active set.
   // The host blocks on `stream` (cudaStreamSynchronize) inside the retry loop.
 
   numPatches = numPatchesIn;
   atomStorageSize = atomStorageSizeIn;
   maxTileListLen = maxTileListLenIn;
   plcutoff2 = plcutoff2In;
 
   if (doStreaming) {
     // Re-allocate patchNumLists and the device/host empty-patch arrays
     // (the latter two hold numPatches+1 entries)
     reallocate_device<int>(&patchNumLists, &patchNumListsSize, numPatches);
     reallocate_device<int>(&emptyPatches, &emptyPatchesSize, numPatches+1);
     reallocate_host<int>(&h_emptyPatches, &h_emptyPatchesSize, numPatches+1);
   }
 
   // Re-allocate (tileLists1, patchPairs1)
   reallocate_device<TileList>(&tileLists1, &tileLists1Size, numTileListsPrev, OVERALLOC);
   reallocate_device<PatchPairRecord>(&patchPairs1, &patchPairs1Size, numTileListsPrev, OVERALLOC);
 
   // Copy cudaPatches to device
   reallocate_device<CudaPatchRecord>(&cudaPatches, &cudaPatchesSize, numPatches);
   copy_HtoD<CudaPatchRecord>(h_cudaPatches, cudaPatches, numPatches, stream);
 
   // Re-allocate temporary storage
   reallocate_device<int>(&tilePos, &tilePosSize, numComputes, OVERALLOC);
   // Calculate tile list positions (tilePos)
   {
     // Launched as a single block of DEFAULTKERNEL_NUM_THREAD threads
     int nthread = DEFAULTKERNEL_NUM_THREAD;
     int nblock = 1;
     calcTileListPosKernel<DEFAULTKERNEL_NUM_THREAD> <<< nblock, nthread, 0, stream >>> (numComputes, cudaComputes, cudaPatches, tilePos);
     cudaCheck(cudaGetLastError());
   }
 
   // Build (tileLists1.patchInd, tileLists1.offsetXYZ)
   {
     int nthread = UPDATEPATCHESKERNEL_NUM_THREAD;
     // Grid sized from numComputes and the warps per block (nthread/WARPSIZE), capped by the device limit
     int nblock = min(deviceCUDA->getMaxNumBlocks(), (numComputes-1)/(nthread/WARPSIZE)+1);
     updatePatchesKernel<WARPSIZE> <<< nblock, nthread, 0, stream >>> (numComputes, tilePos, cudaComputes, cudaPatches, tileLists1);
     cudaCheck(cudaGetLastError());
   }
 
   // ---------------------------------------------------------------------------------------------
 
 
   // NOTE: tileListDepth2 and tileListOrder2 must have at least the same size as
   // tileListDepth1 and tileListOrder1 since they're used in sorting
   reallocate_device<unsigned int>(&tileListDepth2, &tileListDepth2Size, numTileListsPrev + 1, OVERALLOC);
   reallocate_device<int>(&tileListOrder2, &tileListOrder2Size, numTileListsPrev, OVERALLOC);
 
   // Allocate with +1 to include last term in the exclusive sum
   reallocate_device<unsigned int>(&tileListDepth1, &tileListDepth1Size, numTileListsPrev + 1, OVERALLOC);
 
   reallocate_device<int>(&tileListOrder1, &tileListOrder1Size, numTileListsPrev, OVERALLOC);
 
   // Upload the coordinate/charge array for atomStorageSize atoms
   reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, OVERALLOC);
 
   copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
 
   // Fills in boundingBoxes[0 ... numBoundingBoxes-1]
   {
     // NOTE(review): integer division; assumes atomStorageSize is a multiple of WARPSIZE - confirm with caller
     int numBoundingBoxes = atomStorageSize/WARPSIZE;
     reallocate_device<BoundingBox>(&boundingBoxes, &boundingBoxesSize, numBoundingBoxes, OVERALLOC);
 
     int nwarp = BOUNDINGBOXKERNEL_NUM_WARP;
     int nthread = WARPSIZE*nwarp;
     int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
     buildBoundingBoxesKernel <<< nblock, nthread, 0, stream >>> (atomStorageSize, xyzq, boundingBoxes);
     cudaCheck(cudaGetLastError());
   }
 
   {
     int nwarp = TILELISTKERNELNEW_NUM_WARP;
     int nthread = WARPSIZE*nwarp;
     int nblock = min(deviceCUDA->getMaxNumBlocks(), (numTileListsPrev-1)/nthread+1);
 
     // Dynamic shared memory for the kernel scales with maxTileListLen and the block size.
     // NOTE(review): shmem_size is int while maxShmemPerBlock is size_t, so the comparison is signed/unsigned
     int shmem_size = buildTileListsBBKernel_shmem_sizePerThread(maxTileListLen)*nthread;
     if(shmem_size > maxShmemPerBlock){
       NAMD_die("CudaTileListKernel::buildTileLists, maximum shared memory allocation exceeded. Too many atoms in a patch");
     }
 
     // NOTE: In the first call numJtiles = 1. buildTileListsBBKernel will return and
     //       tell the required size in h_tileListStat->numJtiles. In subsequent calls,
     //       re-allocation only happens when the size is exceeded.
     // Force at least one pass through the loop
     h_tileListStat->tilesSizeExceeded = true;
     int reallocCount = 0;
     while (h_tileListStat->tilesSizeExceeded) {
       reallocate_device<int>(&tileJatomStart1, &tileJatomStart1Size, numJtiles, OVERALLOC);
 
       // Reset the host and device statistics records before each kernel run
       clearTileListStat(stream);
       // clear_device_array<TileListStat>(d_tileListStat, 1, stream);
       buildTileListsBBKernel <<< nblock, nthread, shmem_size, stream >>> (
         numTileListsPrev, tileLists1, cudaPatches, tilePos,
         lata, latb, latc, plcutoff2, maxTileListLen,
         boundingBoxes, tileJatomStart1, tileJatomStart1Size,
         tileListDepth1, tileListOrder1, patchPairs1,
         d_tileListStat);
 
       cudaCheck(cudaGetLastError());
 
       // get (numTileLists, numJtiles, tilesSizeExceeded)
       copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
       // The host needs the statistics before deciding whether to loop again
       cudaCheck(cudaStreamSynchronize(stream));
       numJtiles = h_tileListStat->numJtiles;
 
       if (h_tileListStat->tilesSizeExceeded) {
         // A single overflow is expected (see NOTE above); a second one is treated as fatal
         reallocCount++;
         if (reallocCount > 1) {
           NAMD_die("CudaTileListKernel::buildTileLists, multiple reallocations detected");
         }
       }
 
     }
 
     numTileLists = h_tileListStat->numTileLists;
 
     reallocate_device<int>(&jtiles, &jtilesSize, numJtiles, OVERALLOC);
   }
 
   // Re-allocate tileListVirialEnergy.
   // NOTE: Since numTileLists here is an upper estimate (since it's based on bounding boxes),
   //       we're guaranteed to have enough space
   reallocate_device<TileListVirialEnergy>(&tileListVirialEnergy, &tileListVirialEnergySize, numTileLists, OVERALLOC);
 
   // Size the second buffer set (destination of the sort) and both exclusion arrays
   reallocate_device<TileList>(&tileLists2, &tileLists2Size, numTileLists, OVERALLOC);
   reallocate_device<PatchPairRecord>(&patchPairs2, &patchPairs2Size, numTileLists, OVERALLOC);
   reallocate_device<int>(&tileJatomStart2, &tileJatomStart2Size, numJtiles, OVERALLOC);
   reallocate_device<TileExcl>(&tileExcls1, &tileExcls1Size, numJtiles, OVERALLOC);
   reallocate_device<TileExcl>(&tileExcls2, &tileExcls2Size, numJtiles, OVERALLOC);
 
   // Source counts come from the previous list count; destination counts from the kernel statistics
   int numTileListsSrc = numTileListsPrev;
   int numJtilesSrc    = numJtiles;
   int numTileListsDst = numTileLists;
   int numJtilesDst    = numJtiles;
 
   // Sort tiles: buffer set 1 => buffer set 2 (no TileExcl arrays are passed here)
   // NOTE(review): the meaning of the leading (false, 0, false) arguments is defined by sortTileLists, not visible here
   sortTileLists(
     false,
     0, false,
     numTileListsSrc, numJtilesSrc,
     PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
     PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
     PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
     numTileListsDst, numJtilesDst,
     PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
     PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
     PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
     stream);
 
   // Set active buffer to 2
   setActiveBuffer(2);
 
   // outputOrder is only allocated once updateComputes() has enabled doOutputOrder
   if (doOutputOrder) reallocate_device<int>(&outputOrder, &outputOrderSize, numTileLists, OVERALLOC);
 }

void CudaTileListKernel::clearTileListStat ( cudaStream_t stream )

Definition at line 901 of file CudaTileListKernel.cu.

References getNumEmptyPatches(), TileListStat::patchReadyQueueCount, and stream.

Referenced by buildTileLists(), and CudaComputeNonbondedKernel::nonbondedForce().

                                                               {
   // clear tileListStat, for patchReadyQueueCount, which is set equal to the number of empty patches
   memset(h_tileListStat, 0, sizeof(TileListStat));
   h_tileListStat->patchReadyQueueCount = getNumEmptyPatches();
   copy_HtoD<TileListStat>(h_tileListStat, d_tileListStat, 1, stream);
 }

void CudaTileListKernel::finishTileList ( cudaStream_t stream )

Definition at line 908 of file CudaTileListKernel.cu.

References cudaCheck, and stream.

                                                              {
   // Copies the device TileListStat record back into h_tileListStat on
   // `stream`, then records tileListStatEvent on the same stream so that
   // reSortTileLists() can wait for the copy before reading the host record.
   copy_DtoH<TileListStat>(d_tileListStat, h_tileListStat, 1, stream);
   cudaCheck(cudaEventRecord(tileListStatEvent, stream));
   // Lets reSortTileLists() detect that the event has actually been recorded
   tileListStatEventRecord = true;
 }

float CudaTileListKernel::get_plcutoff2 ( )

inline

Definition at line 277 of file CudaTileListKernel.h.

277 {return plcutoff2;}

plcutoff2

__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2

Definition: CudaComputeNonbondedKernel.cu:254

float4* CudaTileListKernel::get_xyzq ( )

inline

Definition at line 283 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase3(), CudaComputeNonbondedKernel::nonbondedForce(), and CudaComputeNonbondedKernel::reduceVirialEnergy().

283 {return xyzq;}

xyzq

__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq

Definition: CudaComputeNonbondedKernel.cu:254

BoundingBox* CudaTileListKernel::getBoundingBoxes ( )

inline

Definition at line 281 of file CudaTileListKernel.h.

281 {return boundingBoxes;}

boundingBoxes

__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes

Definition: CudaComputeNonbondedKernel.cu:254

CudaPatchRecord* CudaTileListKernel::getCudaPatches ( )

inline

Definition at line 302 of file CudaTileListKernel.h.

302 {return cudaPatches;}

cudaPatches

__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches

Definition: CudaComputeNonbondedKernel.cu:254

int* CudaTileListKernel::getEmptyPatches ( )

inline

Definition at line 273 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::launchWork().

273 {return h_emptyPatches;}

int* CudaTileListKernel::getJtiles ( )

inline

Definition at line 282 of file CudaTileListKernel.h.

282 {return jtiles;}

jtiles

__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles

Definition: CudaComputeNonbondedKernel.cu:254

int CudaTileListKernel::getNumComputes ( )

inline

Definition at line 326 of file CudaTileListKernel.h.

326 {return numComputes;}

int CudaTileListKernel::getNumEmptyPatches ( )

inline

Definition at line 272 of file CudaTileListKernel.h.

Referenced by clearTileListStat(), and CudaComputeNonbonded::launchWork().

272 {return numEmptyPatches;}

int CudaTileListKernel::getNumExcluded ( )

inline

Definition at line 275 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::finishReductions().

275 {return numExcluded;}

int CudaTileListKernel::getNumJtiles ( )

inline

Definition at line 280 of file CudaTileListKernel.h.

280 {return numJtiles;}

int CudaTileListKernel::getNumPatches ( )

inline

Definition at line 324 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbonded::launchWork(), and CudaComputeNonbondedKernel::nonbondedForce().

324 {return numPatches;}

numPatches

__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches

Definition: CudaComputeNonbondedKernel.cu:254

int CudaTileListKernel::getNumTileLists ( )

inline

Definition at line 278 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

278 {return numTileLists;}

numTileLists

__global__ void const int numTileLists

Definition: CudaComputeNonbondedKernel.cu:254

int CudaTileListKernel::getNumTileListsGBIS ( )

inline

Definition at line 279 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase2(), and CudaComputeGBISKernel::GBISphase3().

279 {return numTileListsGBIS;}

int* CudaTileListKernel::getOutputOrder ( )

inline

Definition at line 327 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

                         {
     if (!doStreaming) return NULL;
     if (doOutputOrder) {
       return outputOrder;
     } else {
       return NULL;
     }
   }

PatchPairRecord* CudaTileListKernel::getPatchPairs ( )

inline

Definition at line 295 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

295 {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}

TileExcl* CudaTileListKernel::getTileExcls ( )

inline

Definition at line 294 of file CudaTileListKernel.h.

294 {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}

int* CudaTileListKernel::getTileJatomStart ( )

inline

Definition at line 288 of file CudaTileListKernel.h.

288 {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}

int* CudaTileListKernel::getTileJatomStartGBIS ( )

inline

Definition at line 297 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

297 {return tileJatomStartGBIS;}

unsigned int* CudaTileListKernel::getTileListDepth ( )

inline

Definition at line 292 of file CudaTileListKernel.h.

292 {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}

int* CudaTileListKernel::getTileListOrder ( )

inline

Definition at line 293 of file CudaTileListKernel.h.

293 {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}

TileList* CudaTileListKernel::getTileLists ( )

inline

Definition at line 289 of file CudaTileListKernel.h.

                            {
     return ((activeBuffer == 1) ? tileLists1 : tileLists2);
   }

TileList* CudaTileListKernel::getTileListsGBIS ( )

inline

Definition at line 298 of file CudaTileListKernel.h.

Referenced by CudaComputeGBISKernel::GBISphase1(), and CudaComputeGBISKernel::GBISphase3().

298 {return tileListsGBIS;}

TileListStat* CudaTileListKernel::getTileListStatDevPtr ( )

inline

Definition at line 285 of file CudaTileListKernel.h.

285 {return d_tileListStat;}

TileListVirialEnergy* CudaTileListKernel::getTileListVirialEnergy ( )

inline

Definition at line 300 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

300 {return tileListVirialEnergy;}

int CudaTileListKernel::getTileListVirialEnergyGBISLength ( )

inline

Definition at line 322 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

322 {return tileListVirialEnergyGBISLength;}

int CudaTileListKernel::getTileListVirialEnergyLength ( )

inline

Definition at line 321 of file CudaTileListKernel.h.

Referenced by CudaComputeNonbondedKernel::reduceVirialEnergy().

321 {return tileListVirialEnergyLength;}

void CudaTileListKernel::prepareTileList ( cudaStream_t stream )

Definition at line 897 of file CudaTileListKernel.cu.

References stream.

                                                               {
   // Clears the first numJtiles entries of the device array jtiles on `stream`.
   // NOTE(review): jtiles is NULL (with numJtiles == 1) until buildTileLists()
   // has run - presumably callers always build the lists first; confirm.
   clear_device_array<int>(jtiles, numJtiles, stream);
 }

void CudaTileListKernel::reSortTileLists	(	const bool	doGBIS,
		cudaStream_t	stream
	)

Definition at line 1586 of file CudaTileListKernel.cu.

References cudaCheck, NAMD_die(), TileListStat::numExcluded, TileListStat::numTileLists, TileListStat::numTileListsGBIS, and OVERALLOC.

                                                                                  {
   // Re-sorts the tile lists using the statistics copied back by
   // finishTileList(): waits for that copy, reads the new list counts from
   // h_tileListStat, sorts buffer set 2 into buffer set 1 and, when doGBIS is
   // set, produces a second (GBIS) tile list. Buffer set 1 becomes active.
   // Dies if finishTileList() has not recorded tileListStatEvent.
 
   // Store previous number of active lists
   int numTileListsPrev = numTileLists;
 
   // Wait for finishTileList() to stop copying
   if (!tileListStatEventRecord)
     NAMD_die("CudaTileListKernel::reSortTileLists, tileListStatEvent not recorded");
   cudaCheck(cudaEventSynchronize(tileListStatEvent));
 
   // Get numTileLists, numTileListsGBIS, and numExcluded
   {
     numTileLists     = h_tileListStat->numTileLists;
     numTileListsGBIS = h_tileListStat->numTileListsGBIS;
     numExcluded      = h_tileListStat->numExcluded;
   }
 
   // Sort {tileLists2, tileJatomStart2, tileExcl2} => {tileLists1, tileJatomStart1, tileExcl1}
   // VdW tile list in {tileLists1, tileJatomStart1, tileExcl1}
   // NOTE(review): the meaning of the leading (true, 0, true) arguments is defined by sortTileLists, not visible here
   sortTileLists(true, 0, true,
     numTileListsPrev, numJtiles,
     PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
     PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
     PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls2, tileExcls2Size),
     numTileLists, numJtiles,
     PtrSize<TileList>(tileLists1, tileLists1Size), PtrSize<int>(tileJatomStart1, tileJatomStart1Size),
     PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
     PtrSize<PatchPairRecord>(NULL, 0), PtrSize<TileExcl>(tileExcls1, tileExcls1Size),
     stream);
 
   // fprintf(stderr, "reSortTileLists, writing tile lists to disk...\n");
   // writeTileList("tileList.txt", numTileLists, tileLists1, stream);
   // writeTileJatomStart("tileJatomStart.txt", numJtiles, tileJatomStart1, stream);
 
   // markJtileOverlap(4, numTileLists, tileLists1, numJtiles, tileJatomStart1, stream);
 
   // NOTE:
   // Only {tileList1, tileJatomStart1, tileExcl1} are used from here on,
   // the rest {tileListDepth1, tileListOrder1, patchPairs1} may be re-used by the GBIS sorting
 
   if (doGBIS) {
     // GBIS is used => produce a second tile list
     // GBIS tile list in {tileListGBIS, tileJatomStartGBIS, patchPairs1}
     reallocate_device<TileList>(&tileListsGBIS, &tileListsGBISSize, numTileListsGBIS, OVERALLOC);
     reallocate_device<int>(&tileJatomStartGBIS, &tileJatomStartGBISSize, numJtiles, OVERALLOC);
 
     // Same source buffers as above, but with patchPairs2 instead of TileExcl arrays;
     // NOTE(review): the second argument is 16 here versus 0 above - meaning defined by sortTileLists
     sortTileLists(true, 16, true,
       numTileListsPrev, numJtiles,
       PtrSize<TileList>(tileLists2, tileLists2Size), PtrSize<int>(tileJatomStart2, tileJatomStart2Size),
       PtrSize<unsigned int>(tileListDepth2, tileListDepth2Size), PtrSize<int>(tileListOrder2, tileListOrder2Size),
       PtrSize<PatchPairRecord>(patchPairs2, patchPairs2Size), PtrSize<TileExcl>(NULL, 0),
       numTileListsGBIS, numJtiles,
       PtrSize<TileList>(tileListsGBIS, tileListsGBISSize), PtrSize<int>(tileJatomStartGBIS, tileJatomStartGBISSize),
       PtrSize<unsigned int>(tileListDepth1, tileListDepth1Size), PtrSize<int>(tileListOrder1, tileListOrder1Size),
       PtrSize<PatchPairRecord>(patchPairs1, patchPairs1Size), PtrSize<TileExcl>(NULL, 0),
       stream);
   }
 
   // Set active buffer to be 1
   setActiveBuffer(1);
 
 }

void CudaTileListKernel::setTileListVirialEnergyGBISLength ( int len )

Definition at line 1684 of file CudaTileListKernel.cu.

References NAMD_die().

Referenced by CudaComputeGBISKernel::GBISphase2().

                                                                   {
   if (len > tileListVirialEnergySize) {
     NAMD_die("CudaTileListKernel::setTileListVirialEnergyGBISLength, size overflow");
   }
   tileListVirialEnergyGBISLength = len;
 }

void CudaTileListKernel::setTileListVirialEnergyLength ( int len )

Definition at line 1677 of file CudaTileListKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbondedKernel::nonbondedForce().

                                                               {
   if (len > tileListVirialEnergySize) {
     NAMD_die("CudaTileListKernel::setTileListVirialEnergyLength, size overflow");
   }
   tileListVirialEnergyLength = len;
 }

void CudaTileListKernel::updateComputes	(	const int	numComputesIn,
		const CudaComputeRecord *	h_cudaComputes,
		cudaStream_t	stream
	)

Definition at line 914 of file CudaTileListKernel.cu.

References stream.

                                                                 {
 
   numComputes = numComputesIn;
 
   reallocate_device<CudaComputeRecord>(&cudaComputes, &cudaComputesSize, numComputes);
   copy_HtoD<CudaComputeRecord>(h_cudaComputes, cudaComputes, numComputes, stream);
 
   if (doStreaming) doOutputOrder = true;
 }

The documentation for this class was generated from the following files: CudaTileListKernel.h and CudaTileListKernel.cu

Public Member Functions

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation