CudaComputeNonbondedKernel Class Reference

#include <CudaComputeNonbondedKernel.h>

List of all members.

Public Member Functions

 CudaComputeNonbondedKernel (int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbondedKernel ()
void updateVdwTypesExcl (const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
void nonbondedForce (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
void reduceVirialEnergy (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
void getVirialEnergy (VirialEnergy *h_virialEnergy, cudaStream_t stream)
void bindExclusions (int numExclusions, unsigned int *exclusion_bits)
int * getPatchReadyQueue ()


Detailed Description

Definition at line 8 of file CudaComputeNonbondedKernel.h.


Constructor & Destructor Documentation

CudaComputeNonbondedKernel::CudaComputeNonbondedKernel ( int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel (  ) 

Definition at line 959 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

// Destructor: releases every device buffer (and the host-side patch-ready
// queue) owned by this object.  Each pointer is NULL until first allocated,
// so every free is guarded by a NULL check.
00959                                                         {
// Select the owning device so the deallocations hit the right context.
00960   cudaCheck(cudaSetDevice(deviceID));
00961   if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
00962   if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
00963   if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
00964   if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
00965   if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
// patchReadyQueue lives in (mapped) host memory, hence deallocate_host.
00966   if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
00967 }


Member Function Documentation

void CudaComputeNonbondedKernel::bindExclusions ( int  numExclusions,
unsigned int *  exclusion_bits 
)

Definition at line 1152 of file CudaComputeNonbondedKernel.cu.

References constExclusions, cudaCheck, and MAX_CONST_EXCLUSIONS.

// Uploads the exclusion bit array to the GPU.  The first MAX_CONST_EXCLUSIONS
// words are copied to the constExclusions device symbol (presumably
// __constant__ memory, given the name and the cap — confirm in the .cu file);
// the full array is mirrored in the overflowExclusions global-memory buffer
// so entries beyond the constant-memory window remain reachable.
01152                                                                                                {
// Clamp the symbol copy to the capacity of constExclusions.
01153         int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
01154         cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
01155 
// Grow the overflow buffer if needed, then copy the whole array synchronously.
01156   reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
01157   copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
01158 }

int * CudaComputeNonbondedKernel::getPatchReadyQueue (  ) 

Definition at line 981 of file CudaComputeNonbondedKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbonded::launchWork().

// Returns the host-side patch-ready queue used to signal finished patches.
// Only meaningful in streaming mode; calling it on a non-streaming kernel
// is a fatal error (NAMD_die) rather than a silent NULL return.
00981                                                     {
00982   if (!doStreaming) {
00983     NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
00984   }
00985   return patchReadyQueue;
00986 }

void CudaComputeNonbondedKernel::getVirialEnergy ( VirialEnergy *  h_virialEnergy,
cudaStream_t  stream 
)

void CudaComputeNonbondedKernel::nonbondedForce ( CudaTileListKernel &  tlKernel,
const int  atomStorageSize,
const bool  doPairlist,
const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const float3  lata,
const float3  latb,
const float3  latc,
const float4 *  h_xyzq,
const float  cutoff2,
float4 *  d_forces,
float4 *  d_forcesSlow,
float4 *  h_forces,
float4 *  h_forcesSlow,
cudaStream_t  stream 
)

Definition at line 988 of file CudaComputeNonbondedKernel.cu.

References CALL, CudaTileListKernel::clearTileListStat(), cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getNumPatches(), CudaTileListKernel::getNumTileLists(), CudaTileListKernel::getOutputOrder(), NAMD_die(), CudaTileListKernel::setTileListVirialEnergyLength(), and WARPSIZE.

// Launches the templated nonbondedForceKernel over all tile lists, selecting
// exactly one template instantiation per (doEnergy, doVirial, doSlow,
// doPairlist, doStreaming) flag combination.  Work is issued in chunks of at
// most deviceCUDA->getMaxNumBlocks() blocks until every tile list is covered.
00995                        {
00996 
// When the pairlist is reused (no rebuild), atom positions must still be
// refreshed on the device for this step.
00997   if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
00998 
// Zero the force accumulators before the kernel adds into them.
00999   clear_device_array<float4>(d_forces, atomStorageSize, stream);
01000   if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
01001 
01002   tlKernel.clearTileListStat(stream);
01003   // clear_device_array<TileListStat>(tlKernel.getTileListStatDevPtr(), 1, stream);
01004 
01005   // --- streaming ----
// In streaming mode the kernel writes forces straight into mapped (zero-copy)
// host memory and publishes finished patches through patchReadyQueue.
01006   float4* m_forces = NULL;
01007   float4* m_forcesSlow = NULL;
01008   int* m_patchReadyQueue = NULL;
01009   int numPatches = 0;
01010   unsigned int* patchNumCountPtr = NULL;
01011   if (doStreaming) {
01012     numPatches = tlKernel.getNumPatches();
01013     if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
01014       // If re-allocated, clear array
01015       clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
01016     }
01017     patchNumCountPtr = patchNumCount;
// patchReadyQueue is allocated cudaHostAllocMapped so the GPU can write it.
01018     bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
01019     if (re) {
01020       // If re-allocated, re-set to "-1"
01021       for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
01022     }
// Translate the mapped host pointers into device-visible aliases.
01023     cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
01024     cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
01025     cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
01026   }
01027   // -----------------
01028 
01029   if (doVirial || doEnergy) {
01030     tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
01031   }
01032 
01033   int shMemSize = 0;
01034 
01035   int* outputOrderPtr = tlKernel.getOutputOrder();
01036 
// One warp per tile list; chunk launches so nblock never exceeds the
// device's maximum grid size.
01037   int nwarp = NONBONDKERNEL_NUM_WARP;
01038   int nthread = WARPSIZE*nwarp;
01039   int start = 0;
01040   while (start < tlKernel.getNumTileLists())
01041   {
01042 
01043     int nleft = tlKernel.getNumTileLists() - start;
01044     int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
01045 
// The macro both launches the matching template instantiation and sets
// 'called', so a flag combination with no matching branch is detectable.
01046 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
01047     nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
01048   <<< nblock, nthread, shMemSize, stream >>>  \
01049   (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
01050     cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
01051     vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
01052     cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
01053     atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
01054     tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
01055     tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
01056     numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
01057     outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
01058 
01059     bool called = false;
01060 
// Exhaustive dispatch: exactly one of the 32 branches fires per call.
01061     if (doStreaming) {
01062       if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
01063       if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
01064       if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
01065       if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
01066       if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
01067       if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
01068       if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
01069       if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
01070 
01071       if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 1);
01072       if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 1);
01073       if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 1);
01074       if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 1);
01075       if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 1);
01076       if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 1);
01077       if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 1);
01078       if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 1);
01079     } else {
01080       if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
01081       if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
01082       if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
01083       if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
01084       if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
01085       if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
01086       if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
01087       if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
01088 
01089       if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 0);
01090       if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 0);
01091       if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 0);
01092       if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 0);
01093       if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 0);
01094       if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 0);
01095       if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 0);
01096       if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 0);
01097     }
01098 
01099     if (!called) {
01100       NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
01101     }
01102 
01103 #undef CALL
// Catch launch-configuration errors from the just-issued kernel.
01104     cudaCheck(cudaGetLastError());
01105 
// Advance by the number of tile lists this launch covered (one per warp).
01106     start += nblock*nwarp;
01107   }
01108 
01109 }

void CudaComputeNonbondedKernel::reduceVirialEnergy ( CudaTileListKernel &  tlKernel,
const int  atomStorageSize,
const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const bool  doGBIS,
float4 *  d_forces,
float4 *  d_forcesSlow,
VirialEnergy *  d_virialEnergy,
cudaStream_t  stream 
)

Definition at line 1114 of file CudaComputeNonbondedKernel.cu.

References cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getTileListVirialEnergy(), CudaTileListKernel::getTileListVirialEnergyGBISLength(), CudaTileListKernel::getTileListVirialEnergyLength(), REDUCEGBISENERGYKERNEL_NUM_WARP, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP, REDUCEVIRIALENERGYKERNEL_NUM_WARP, and WARPSIZE.

Referenced by CudaComputeNonbonded::launchWork().

// Reduces per-atom forces and per-tile-list virial/energy partials into the
// single device-side VirialEnergy record, launching only the reduction
// kernels that the doEnergy/doVirial/doGBIS flags request.
01117                                                      {
01118 
// Zero the accumulator record before any reduction adds into it.
01119   if (doEnergy || doVirial) {
01120     clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
01121   }
01122 
// Nonbonded virial: reduced over atomStorageSize force/position entries.
01123   if (doVirial)
01124   {
01125     int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
01126     int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
01127     reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
01128     (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
01129     cudaCheck(cudaGetLastError());
01130   }
01131 
// Virial/energy partials: reduced over the tile-list virial-energy array.
01132   if (doVirial || doEnergy)
01133   {
01134     int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
01135     int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
01136     reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
01137     (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
01138     cudaCheck(cudaGetLastError());
01139   }  
01140 
// GBIS energy: separate reduction over the GBIS-specific length.
01141   if (doGBIS && doEnergy)
01142   {
01143     int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
01144     int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
01145     reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
01146     (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
01147     cudaCheck(cudaGetLastError());
01148   }
01149 
01150 }

void CudaComputeNonbondedKernel::updateVdwTypesExcl ( const int  atomStorageSize,
const int *  h_vdwTypes,
const int2 *  h_exclIndexMaxDiff,
const int *  h_atomIndex,
cudaStream_t  stream 
)

Definition at line 969 of file CudaComputeNonbondedKernel.cu.

References OVERALLOC.

// Uploads per-atom VdW types, exclusion (index, max-diff) pairs, and atom
// indices to the device.  Buffers grow on demand with OVERALLOC slack to
// avoid reallocating every step; copies are asynchronous on 'stream'.
00970                                                                                {
00971 
00972   reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
00973   reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
00974   reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
00975 
// Host arrays must stay valid until the stream completes these copies.
00976   copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
00977   copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
00978   copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
00979 }


The documentation for this class was generated from the following files:
Generated on Thu Nov 23 01:17:18 2017 for NAMD by  doxygen 1.4.7