NAMD
Macros | Functions | Variables
CudaComputeNonbondedKernel.cu File Reference
#include <cuda.h>
#include <namd_cub/cub.cuh>
#include "CudaComputeNonbondedKernel.h"
#include "CudaTileListKernel.h"
#include "DeviceCUDA.h"

Go to the source code of this file.

Macros

#define NONBONDKERNEL_NUM_WARP   4
 
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP   32
 
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP   32
 
#define REDUCEGBISENERGYKERNEL_NUM_WARP   32
 
#define OVERALLOC   1.2f
 
#define MAX_CONST_EXCLUSIONS   2048
 
#define LARGE_FLOAT   (float)(1.0e10)
 
#define TABLE_PARAMS   cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
 
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING)
 

Functions

void NAMD_die (const char *)
 
__device__ __forceinline__ float4 sampleTableTex (cudaTextureObject_t tex, float k)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcForceEnergy (const float r2, const float qi, const float qj, const float dx, const float dy, const float dz, const int vdwtypei, const int vdwtypej, const float2 *__restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, float3 &iforce, float3 &iforceSlow, float3 &jforce, float3 &jforceSlow, float &energyVdw, float &energyElec, float &energySlow)
 
template<bool doSlow>
__device__ __forceinline__ void storeForces (const int pos, const float3 force, const float3 forceSlow, float *__restrict__ devForces_x, float *__restrict__ devForces_y, float *__restrict__ devForces_z, float *__restrict__ devForcesSlow_x, float *__restrict__ devForcesSlow_y, float *__restrict__ devForcesSlow_z)
 
template<bool doPairlist>
__device__ __forceinline__ void shuffleNext (float &xyzq_j_w, int &vdwtypej, int &jatomIndex, int &jexclMaxdiff, int &jexclIndex)
 
template<bool doPairlist>
__device__ __forceinline__ void shuffleNext (float &xyzq_j_w, int &vdwtypej, int &jatomIndex)
 
template<bool doSlow>
__device__ __forceinline__ void shuffleNext (float3 &jforce, float3 &jforceSlow)
 
__device__ __forceinline__ float distsq (const BoundingBox a, const float4 b)
 
template<bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
__global__ void __launch_bounds__ (WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start
 
 if (itileList< numTileLists)
 
__global__ void reduceNonbondedVirialKernel (const bool doSlow, const int atomStorageSize, const float4 *__restrict__ xyzq, const float4 *__restrict__ devForces, const float4 *__restrict__ devForcesSlow, VirialEnergy *__restrict__ virialEnergy)
 
__global__ void reduceVirialEnergyKernel (const bool doEnergy, const bool doVirial, const bool doSlow, const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
 
__global__ void reduceGBISEnergyKernel (const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
 
__global__ void reduceNonbondedBinsKernel (const bool doVirial, const bool doEnergy, const bool doSlow, const bool doGBIS, VirialEnergy *__restrict__ virialEnergy)
 
template<int doSlow>
__global__ void transposeForcesKernel (float4 *f, float4 *fSlow, float *fx, float *fy, float *fz, float *fw, float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww, int n)
 

Variables

__thread DeviceCUDAdeviceCUDA
 
__constant__ unsigned int constExclusions [MAX_CONST_EXCLUSIONS]
 
__global__ void const int numTileLists
 
__global__ void const int
const TileList *__restrict__ 
tileLists
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ 
tileExcls
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ 
tileJatomStart
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int 
vdwCoefTableWidth
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__ 
vdwCoefTable
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t 
vdwCoefTableTex
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ 
vdwTypes
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3 
lata
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 
latb
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3 
latc
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__ 
xyzq
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float 
cutoff2
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t 
forceTableTex
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t 
energyTableTex
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float 
plcutoff2
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ 
patchPairs
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ 
atomIndex
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ 
exclIndexMaxDiff
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ 
overflowExclusions
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ 
tileListDepth
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ 
tileListOrder
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ 
jtiles
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ 
tileListStat
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__ 
boundingBoxes
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ 
devForce_x
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ 
devForce_y
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ 
devForce_z
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ 
devForce_w
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ 
devForceSlow_x
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ 
devForceSlow_y
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ 
devForceSlow_z
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ 
devForceSlow_w
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int 
numPatches
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__ 
patchNumCount
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__
const CudaPatchRecord
*__restrict__ 
cudaPatches
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__
const CudaPatchRecord
*__restrict__ float4
*__restrict__ 
mapForces
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__
const CudaPatchRecord
*__restrict__ float4
*__restrict__ float4
*__restrict__ 
mapForcesSlow
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__
const CudaPatchRecord
*__restrict__ float4
*__restrict__ float4
*__restrict__ int
*__restrict__ 
mapPatchReadyQueue
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__
const CudaPatchRecord
*__restrict__ float4
*__restrict__ float4
*__restrict__ int
*__restrict__ int
*__restrict__ 
outputOrder
 
__global__ void const int
const TileList *__restrict__
TileExcl *__restrict__ const
int *__restrict__ const int
const float2 *__restrict__
cudaTextureObject_t const int
*__restrict__ const float3
const float3 const float3
const float4 *__restrict__
const float
cudaTextureObject_t
cudaTextureObject_t float
const PatchPairRecord
*__restrict__ const int
*__restrict__ const int2
*__restrict__ const unsigned
int *__restrict__ unsigned int
*__restrict__ int
*__restrict__ int
*__restrict__ TileListStat
*__restrict__ const
BoundingBox *__restrict__
float *__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ float
*__restrict__ const int
unsigned int *__restrict__
const CudaPatchRecord
*__restrict__ float4
*__restrict__ float4
*__restrict__ int
*__restrict__ int
*__restrict__
TileListVirialEnergy
*__restrict__ virialEnergy int 
itileList = start + (NONBONDKERNEL_NUM_WARP == 1 ? blockIdx.x : (threadIdx.x/WARPSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x))
 

Macro Definition Documentation

#define CALL (   DOENERGY,
  DOVIRIAL,
  DOSLOW,
  DOPAIRLIST,
  DOSTREAMING 
)
Value:
nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
<<< nblock, nthread, shMemSize, stream >>> (\
start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), cudaNonbondedTables.getVdwCoefTableTex(), \
vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
tlKernel.getBoundingBoxes(), \
force_x, force_y, force_z, force_w, \
forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ vdwTypes
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ overflowExclusions
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ exclIndexMaxDiff
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ atomIndex
#define TABLE_PARAMS
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
#define LARGE_FLOAT   (float)(1.0e10)

Definition at line 238 of file CudaComputeNonbondedKernel.cu.

#define MAX_CONST_EXCLUSIONS   2048
#define NONBONDKERNEL_NUM_WARP   4
#define OVERALLOC   1.2f
#define REDUCEGBISENERGYKERNEL_NUM_WARP   32
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP   32
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP   32
#define TABLE_PARAMS   cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()

Function Documentation

template<bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
__global__ void __launch_bounds__ ( WARPSIZE NONBONDKERNEL_NUM_WARP,
doPairlist?  10):(doEnergy?(10):(10) 
) const
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcForceEnergy ( const float  r2,
const float  qi,
const float  qj,
const float  dx,
const float  dy,
const float  dz,
const int  vdwtypei,
const int  vdwtypej,
const float2 *__restrict__  vdwCoefTable,
cudaTextureObject_t  vdwCoefTableTex,
cudaTextureObject_t  forceTableTex,
cudaTextureObject_t  energyTableTex,
float3 &  iforce,
float3 &  iforceSlow,
float3 &  jforce,
float3 &  jforceSlow,
float &  energyVdw,
float &  energyElec,
float &  energySlow 
)

Definition at line 91 of file CudaComputeNonbondedKernel.cu.

References __ldg, energyTableTex, forceTableTex, sampleTableTex(), vdwCoefTableTex, float2::x, and float2::y.

101  {
102 
103  int vdwIndex = vdwtypej + vdwtypei;
104 #if __CUDA_ARCH__ >= 350
105  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
106 #else
107  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
108 #endif
109 
110  float rinv = __frsqrt_rn(r2);
111  float4 ei;
112 
113 #ifdef NAMD_HIP
114  float4 fi = sampleTableTex(forceTableTex, rinv);
115  if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
116 #else
117  float4 fi = tex1D<float4>(forceTableTex, rinv);
118  if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
119 #endif
120 
121  float fSlow = qi * qj;
122  float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
123 
124  if (doEnergy) {
125  energyVdw += ljab.x * ei.z + ljab.y * ei.y;
126  energyElec += fSlow * ei.x;
127  if (doSlow) energySlow += fSlow * ei.w;
128  }
129  if (doSlow) fSlow *= fi.w;
130 
131  float fx = dx * f;
132  float fy = dy * f;
133  float fz = dz * f;
134  iforce.x += fx;
135  iforce.y += fy;
136  iforce.z += fz;
137  jforce.x -= fx;
138  jforce.y -= fy;
139  jforce.z -= fz;
140 
141  if (doSlow) {
142  float fxSlow = dx * fSlow;
143  float fySlow = dy * fSlow;
144  float fzSlow = dz * fSlow;
145  iforceSlow.x += fxSlow;
146  iforceSlow.y += fySlow;
147  iforceSlow.z += fzSlow;
148  jforceSlow.x -= fxSlow;
149  jforceSlow.y -= fySlow;
150  jforceSlow.z -= fzSlow;
151  }
152 }
float x
Definition: PmeSolver.C:4
__device__ __forceinline__ float4 sampleTableTex(cudaTextureObject_t tex, float k)
float y
Definition: PmeSolver.C:4
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t forceTableTex
#define __ldg
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t energyTableTex
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t vdwCoefTableTex
__device__ __forceinline__ float distsq ( const BoundingBox  a,
const float4  b 
)

Definition at line 230 of file CudaComputeNonbondedKernel.cu.

References BoundingBox::wx, BoundingBox::wy, BoundingBox::wz, BoundingBox::x, BoundingBox::y, and BoundingBox::z.

Referenced by buildTileListsBBKernel().

230  {
231  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
232  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
233  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
234  float r2 = dx*dx + dy*dy + dz*dz;
235  return r2;
236 }
if ( )

Definition at line 300 of file CudaComputeNonbondedKernel.cu.

300  : (threadIdx.x/WARPSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x)); if (itileList < numTileLists)
301  {
302 
303  float3 iforce;
304  float3 iforceSlow;
305  float energyVdw, energyElec, energySlow;
306  int nexcluded;
307  unsigned int itileListLen;
308  int2 patchInd;
309  int2 patchNumList;
310  __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][WARPSIZE];
311  __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][WARPSIZE];
312  __shared__ float3 s_jforce[NONBONDKERNEL_NUM_WARP][WARPSIZE];
313  __shared__ float3 s_jforceSlow[NONBONDKERNEL_NUM_WARP][WARPSIZE];
314  __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][WARPSIZE];
315 
316  // Warp index (0...warpsize-1)
317  const int wid = threadIdx.x % WARPSIZE;
318  const int iwarp = threadIdx.x / WARPSIZE;
319 
320  // Start computation
321  {
322 
323 
325  int iatomStart = tmp.iatomStart;
326  int jtileStart = tmp.jtileStart;
327  int jtileEnd = tmp.jtileEnd;
328  patchInd = tmp.patchInd;
329  patchNumList = tmp.patchNumList;
330 
331  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
332  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
333  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
334 
335  // DH - set zeroShift flag if magnitude of shift vector is zero
336  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
337 
338  int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
339  if (doPairlist) {
341  iatomSize = PPStmp.iatomSize;
342  iatomFreeSize = PPStmp.iatomFreeSize;
343  jatomSize = PPStmp.jatomSize;
344  jatomFreeSize = PPStmp.jatomFreeSize;
345  }
346 
347  // Write to global memory here to avoid register spilling
348  if (doVirial) {
349  if (wid == 0) {
350  virialEnergy[itileList].shx = shx;
351  virialEnergy[itileList].shy = shy;
352  virialEnergy[itileList].shz = shz;
353  }
354  }
355 
356  // Load i-atom data (and shift coordinates)
357  float4 xyzq_i = xyzq[iatomStart + wid];
358  xyzq_i.x += shx;
359  xyzq_i.y += shy;
360  xyzq_i.z += shz;
361  int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
362 
363  // Load i-atom data (and shift coordinates)
364  BoundingBox boundingBoxI;
365  if (doPairlist) {
366  boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
367  boundingBoxI.x += shx;
368  boundingBoxI.y += shy;
369  boundingBoxI.z += shz;
370  }
371 
372  // Get i-atom global index
373 #ifdef USE_NEW_EXCL_METHOD
374  int iatomIndex, minExclAtom, maxExclAtom;
375 #else
376  int iatomIndex;
377 #endif
378  if (doPairlist) {
379 #ifdef USE_NEW_EXCL_METHOD
380  iatomIndex = atomIndex[iatomStart + wid];
381  int2 tmp = minmaxExclAtom[iatomStart + wid];
382  minExclAtom = tmp.x;
383  maxExclAtom = tmp.y;
384 #else
385  iatomIndex = atomIndex[iatomStart + wid];
386 #endif
387  }
388 
389  // i-forces in registers
390  // float3 iforce;
391  iforce.x = 0.0f;
392  iforce.y = 0.0f;
393  iforce.z = 0.0f;
394 
395  // float3 iforceSlow;
396  if (doSlow) {
397  iforceSlow.x = 0.0f;
398  iforceSlow.y = 0.0f;
399  iforceSlow.z = 0.0f;
400  }
401 
402  // float energyVdw, energyElec, energySlow;
403  if (doEnergy) {
404  energyVdw = 0.0f;
405  energyElec = 0.0f;
406  if (doSlow) energySlow = 0.0f;
407  }
408 
409  // Number of exclusions
410  // NOTE: Lowest bit is used as indicator bit for tile pairs:
411  // bit 0 tile has no atoms within pairlist cutoff
412  // bit 1 tile has atoms within pairlist cutoff
413  // int nexcluded;
414  if (doPairlist) nexcluded = 0;
415 
416  // Number of i loops and free atoms
417  int nfreei;
418  if (doPairlist) {
419  int nloopi = min(iatomSize - iatomStart, WARPSIZE);
420  nfreei = max(iatomFreeSize - iatomStart, 0);
421  if (wid >= nloopi) {
422  xyzq_i.x = -LARGE_FLOAT;
423  xyzq_i.y = -LARGE_FLOAT;
424  xyzq_i.z = -LARGE_FLOAT;
425  }
426  }
427 
428  // tile list stuff
429  // int itileListLen;
430  // int minJatomStart;
431  if (doPairlist) {
432  // minJatomStart = tileJatomStart[jtileStart];
433  itileListLen = 0;
434  }
435 
436  // Exclusion index and maxdiff
437  int iexclIndex, iexclMaxdiff;
438  if (doPairlist) {
439  int2 tmp = exclIndexMaxDiff[iatomStart + wid];
440  iexclIndex = tmp.x;
441  iexclMaxdiff = tmp.y;
442  }
443 
444  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
445 
446  // Load j-atom starting index and exclusion mask
447  int jatomStart = tileJatomStart[jtile];
448 
449  float4 xyzq_j = xyzq[jatomStart + wid];
451 
452  // Check for early bail
453  if (doPairlist) {
454  float r2bb = distsq(boundingBoxI, xyzq_j);
455  if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
456  }
457  WarpMask excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
458  int vdwtypej = vdwTypes[jatomStart + wid];
459  s_vdwtypej[iwarp][wid] = vdwtypej;
460 
461  // Get i-atom global index
462  if (doPairlist) {
463  s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
464  }
465 
466  // Number of j loops and free atoms
467  int nfreej;
468  if (doPairlist) {
469  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
470  nfreej = max(jatomFreeSize - jatomStart, 0);
471  //if (nfreei == 0 && nfreej == 0) continue;
472  if (wid >= nloopj) {
473  xyzq_j.x = LARGE_FLOAT;
474  xyzq_j.y = LARGE_FLOAT;
475  xyzq_j.z = LARGE_FLOAT;
476  }
477  }
478 
479  s_xyzq[iwarp][wid] = xyzq_j;
480 
481  // DH - self requires that zeroShift is also set
482  const bool self = zeroShift && (iatomStart == jatomStart);
483  const int modval = (self) ? 2*WARPSIZE-1 : WARPSIZE-1;
484 
485  s_jforce[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
486  if (doSlow)
487  s_jforceSlow[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
489 
490 
491  int t = (self) ? 1 : 0;
492 
493  if (doPairlist) {
494  // Build pair list
495  // NOTE: Pairlist update, we must also include the diagonal since this is used
496  // in GBIS phase 2.
497  // Clear the lowest (indicator) bit
498  nexcluded &= (~1);
499 
500  // For self tiles, do the diagonal term (t=0).
501  // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
502  if (self) {
503  int j = (0 + wid) & modval;
504  xyzq_j = s_xyzq[iwarp][j];
505  float dx = xyzq_j.x - xyzq_i.x;
506  float dy = xyzq_j.y - xyzq_i.y;
507  float dz = xyzq_j.z - xyzq_i.z;
508 
509  float r2 = dx*dx + dy*dy + dz*dz;
510 
511  if (j < WARPSIZE && r2 < plcutoff2) {
512  // We have atom pair within the pairlist cutoff => Set indicator bit
513  nexcluded |= 1;
514  }
515  }
516 
517  for (;t < WARPSIZE;t++) {
518  int j = (t + wid) & modval;
519 
520  excl >>= 1;
521  if (j < WARPSIZE ) {
522  xyzq_j = s_xyzq[iwarp][j];
523  float dx = xyzq_j.x - xyzq_i.x;
524  float dy = xyzq_j.y - xyzq_i.y;
525  float dz = xyzq_j.z - xyzq_i.z;
526  float r2 = dx*dx + dy*dy + dz*dz;
527  // We have atom pair within the pairlist cutoff => Set indicator bit
528  if(r2 < plcutoff2){
529  nexcluded |= 1;
530  if (j < nfreej || wid < nfreei) {
531  bool excluded = false;
532  int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
533  if ( abs(indexdiff) <= iexclMaxdiff) {
534  indexdiff += iexclIndex;
535  int indexword = ((unsigned int) indexdiff) >> 5;
536 #ifdef NAMD_CUDA
537  if ( indexword < MAX_CONST_EXCLUSIONS ) {
538  indexword = constExclusions[indexword];
539  } else
540 #endif
541  {
542  indexword = overflowExclusions[indexword];
543  }
544 
545  excluded = ((indexword & (1<<(indexdiff&31))) != 0); // WARPSIZE-1?
546  }
547  if (excluded) nexcluded += 2;
548  if (!excluded) excl |= (WarpMask)1 << (WARPSIZE-1);
549  if (!excluded && r2 < cutoff2) {
550  calcForceEnergy<doEnergy, doSlow>(
551  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
552  vdwtypei, s_vdwtypej[iwarp][j],
554 #ifdef USE_TABLE_ARRAYS
555  forceTable, energyTable,
556 #else
558 #endif
559  iforce, iforceSlow,
560  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
561  energyVdw,
562  energyElec, energySlow);
563  }
564  }
565  }
566  }
568  } // t
569  } else {
570  // Just compute forces
571  if (self) {
572  // Clear the first bit
573  excl = excl & (~(WarpMask)1);
574  }
575  for (int t = 0;t < WARPSIZE;t++) {
576  if ((excl & 1)) {
577  int j = (wid+t) & (WARPSIZE-1);
578  xyzq_j = s_xyzq[iwarp][j];
579  float dx = xyzq_j.x - xyzq_i.x;
580  float dy = xyzq_j.y - xyzq_i.y;
581  float dz = xyzq_j.z - xyzq_i.z;
582 
583  float r2 = dx*dx + dy*dy + dz*dz;
584 
585  if (r2 < cutoff2) {
586  calcForceEnergy<doEnergy, doSlow>(
587  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
588  vdwtypei, s_vdwtypej[iwarp][j],
590 #ifdef USE_TABLE_ARRAYS
591  forceTable, energyTable,
592 #else
594 #endif
595  iforce, iforceSlow,
596  s_jforce[iwarp][j],
597  s_jforceSlow[iwarp][j],
598  energyVdw, energyElec, energySlow);
599  } // (r2 < cutoff2)
600  } // (excl & 1)
601  excl >>= 1;
603  } // t
604  }
605 
606  // Write j-forces
607  storeForces<doSlow>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
610 
611  // Write exclusions
612  if (doPairlist && WARP_ANY(WARP_FULL_MASK, nexcluded & 1)) {
613  int anyexcl = (65536 | WARP_ANY(WARP_FULL_MASK, excl != 0));
614  // Mark this jtile as non-empty:
615  // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
616  // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
617  if (wid == 0) jtiles[jtile] = anyexcl;
618  // Store exclusions
619  tileExcls[jtile].excl[wid] = excl;
620  // itileListLen:
621  // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
622  // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
623  itileListLen += anyexcl;
624  // NOTE, this minJatomStart is only stored once for the first tile list entry
625  // minJatomStart = min(minJatomStart, jatomStart);
626  }
627 
628  } // jtile
629 
630  // Write i-forces
631  storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow,
634  }
635  // Done with computation
636 
637  // Save pairlist stuff
638  if (doPairlist) {
639 
640  // Warp index (0...warpsize-1)
641  // const int wid = threadIdx.x % WARPSIZE;
642 
643  if (wid == 0) {
644  // minJatomStart is in range [0 ... atomStorageSize-1]
645  //int atom0 = (minJatomStart)/WARPSIZE;
646  // int atom0 = 0;
647  // int storageOffset = atomStorageSize/WARPSIZE;
648  // int itileListLen = 0;
649  // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
650  // Store 0 if itileListLen == 0
651  // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
652  tileListDepth[itileList] = itileListLen;
654  // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
655  if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
656  // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
657  if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
658  // NOTE: always numTileListsGBIS >= numTileLists
659  }
660 
661  typedef cub::WarpReduce<int> WarpReduceInt;
662  __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
663  int warpId = threadIdx.x / WARPSIZE;
664  // Remove indicator bit
665  nexcluded >>= 1;
666  volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
667  if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
668 
669  }
670 
671  if (doVirial) {
672  // Warp index (0...warpsize-1)
673  // const int wid = threadIdx.x % WARPSIZE;
674 
675  typedef cub::WarpReduce<float> WarpReduce;
676  __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
677  int warpId = threadIdx.x / WARPSIZE;
678  volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
680  volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
682  volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
684  if (wid == 0) {
685  virialEnergy[itileList].forcex = iforcexSum;
686  virialEnergy[itileList].forcey = iforceySum;
687  virialEnergy[itileList].forcez = iforcezSum;
688  }
689 
690  if (doSlow) {
691  iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
693  iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
695  iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
697  if (wid == 0) {
698  virialEnergy[itileList].forceSlowx = iforcexSum;
699  virialEnergy[itileList].forceSlowy = iforceySum;
700  virialEnergy[itileList].forceSlowz = iforcezSum;
701  }
702  }
703  }
704 
705  // Reduce energy
706  if (doEnergy) {
707  // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
708  // (Why does CUB suck here?)
709 #pragma unroll
710  for (int i=WARPSIZE/2;i >= 1;i/=2) {
711  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
712  energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, WARPSIZE);
713  if (doSlow) energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
714  }
715 
716  if (threadIdx.x % WARPSIZE == 0) {
717  virialEnergy[itileList].energyVdw = energyVdw;
718  virialEnergy[itileList].energyElec = energyElec;
719  if (doSlow) virialEnergy[itileList].energySlow = energySlow;
720  }
721  }
722 
723  if (doStreaming) {
724  // Make sure devForces and devForcesSlow have been written into device memory
726  __threadfence();
727 
728  int patchDone[2] = {false, false};
729  const int wid = threadIdx.x % WARPSIZE;
730  if (wid == 0) {
731  int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
732  patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
733  if (patchInd.x != patchInd.y) {
734  int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
735  patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
736  }
737  }
738 
739  patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
740  patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
741 
742  if (patchDone[0]) {
743  // Patch 1 is done, write onto host-mapped memory
744  CudaPatchRecord patch = cudaPatches[patchInd.x];
745  int start = patch.atomStart;
746  int end = start + patch.numAtoms;
747  for (int i=start+wid;i < end;i+=WARPSIZE) {
748  mapForces[i] = make_float4(devForce_x[i],
749  devForce_y[i], devForce_z[i], devForce_w[i]);
750  if (doSlow){
751  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
752  devForceSlow_y[i],
753  devForceSlow_z[i],
754  devForceSlow_w[i]);
755  }
756  }
757  }
758  if (patchDone[1]) {
759  // Patch 2 is done
760  CudaPatchRecord patch = cudaPatches[patchInd.y];
761  int start = patch.atomStart;
762  int end = start + patch.numAtoms;
763  for (int i=start+wid;i < end;i+=WARPSIZE) {
764  mapForces[i] = make_float4(devForce_x[i], devForce_y[i], devForce_z[i], devForce_w[i]);
765  if (doSlow){
766  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
767  devForceSlow_y[i],
768  devForceSlow_z[i],
769  devForceSlow_w[i]);
770  }
771  }
772  }
773 
774  if (patchDone[0] || patchDone[1]) {
775  // Make sure mapForces and mapForcesSlow are up-to-date
777  __threadfence_system();
778  // Add patch into "patchReadyQueue"
779  if (wid == 0) {
780  if (patchDone[0]) {
781  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
782  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
783  mapPatchReadyQueue[ind] = patchInd.x;
784  }
785  if (patchDone[1]) {
786  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
787  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
788  mapPatchReadyQueue[ind] = patchInd.y;
789  }
790  }
791  }
792  }
793 
794  if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
795  int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
796  outputOrder[index] = itileList;
797  }
798  } // if (itileList < numTileLists)
#define WARP_ALL(MASK, P)
Definition: CudaUtils.h:56
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ patchPairs
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_x
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ vdwTypes
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ tileListStat
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ mapForces
#define WARPSIZE
Definition: CudaUtils.h:10
#define LARGE_FLOAT
#define NONBONDKERNEL_NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__global__ void const int const TileList *__restrict__ tileLists
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
unsigned int WarpMask
Definition: CudaUtils.h:11
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_w
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ tileListOrder
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ patchNumCount
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t forceTableTex
__device__ __forceinline__ float distsq(const BoundingBox a, const float4 b)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int vdwCoefTableWidth
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ overflowExclusions
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ vdwCoefTable
#define MAX_CONST_EXCLUSIONS
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ devForce_y
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ exclIndexMaxDiff
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
float3 offsetXYZ
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_y
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t energyTableTex
__global__ void const int numTileLists
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ tileListDepth
#define WARP_SYNC(MASK)
Definition: CudaUtils.h:59
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ atomIndex
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:48
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ mapForcesSlow
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ tileExcls
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t vdwCoefTableTex
#define WARP_ANY(MASK, P)
Definition: CudaUtils.h:57
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_z
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ mapPatchReadyQueue
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ devForce_x
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForce_w
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForce_z
void NAMD_die ( const char *  )

Definition at line 85 of file common.C.

86 {
87  if ( ! err_msg ) err_msg = "(unknown error)";
88  CkPrintf("FATAL ERROR: %s\n", err_msg);
89  fflush(stdout);
90  char repstr[24] = "";
91  if (CmiNumPartitions() > 1) {
92  sprintf(repstr,"REPLICA %d ", CmiMyPartition());
93  // CkAbort ensures that all replicas die
94  CkAbort("%sFATAL ERROR: %s\n", repstr, err_msg);
95  }
96  CkError("%sFATAL ERROR: %s\n", repstr, err_msg);
97 #if CHARM_VERSION < 61000
98  CkExit();
99 #else
100  CkExit(1);
101 #endif
102 }
__global__ void reduceGBISEnergyKernel ( const int  numTileLists,
const TileListVirialEnergy *__restrict__  tileListVirialEnergy,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 1049 of file CudaComputeNonbondedKernel.cu.

References ATOMIC_BINS, BLOCK_SYNC, itileList, and tempStorage.

1051  {
1052 
1053  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1054  {
1055  int itileList = ibase + threadIdx.x;
1056  double energyGBISt = 0.0;
1057  if (itileList < numTileLists) {
1058  energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
1059  }
1060 
1061  const int bin = blockIdx.x % ATOMIC_BINS;
1062 
1063  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1064  __shared__ typename BlockReduce::TempStorage tempStorage;
1065  double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
1066  if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energyGBIS, energyGBIS);
1067  }
1068 
1069 }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
#define ATOMIC_BINS
Definition: CudaUtils.h:24
__global__ void const int numTileLists
__shared__ union @43 tempStorage
__global__ void reduceNonbondedBinsKernel ( const bool  doVirial,
const bool  doEnergy,
const bool  doSlow,
const bool  doGBIS,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 1071 of file CudaComputeNonbondedKernel.cu.

References ATOMIC_BINS, and tempStorage.

1076  {
1077 
1078  const int bin = threadIdx.x;
1079 
1080  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
1081  __shared__ typename WarpReduce::TempStorage tempStorage;
1082 
1083  if (doVirial) {
1084  double vxx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[0]);
1085  double vxy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[1]);
1086  double vxz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[2]);
1087  double vyx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[3]);
1088  double vyy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[4]);
1089  double vyz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[5]);
1090  double vzx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[6]);
1091  double vzy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[7]);
1092  double vzz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[8]);
1093  if (threadIdx.x == 0) {
1094  virialEnergy->virial[0] = vxx;
1095  virialEnergy->virial[1] = vxy;
1096  virialEnergy->virial[2] = vxz;
1097  virialEnergy->virial[3] = vyx;
1098  virialEnergy->virial[4] = vyy;
1099  virialEnergy->virial[5] = vyz;
1100  virialEnergy->virial[6] = vzx;
1101  virialEnergy->virial[7] = vzy;
1102  virialEnergy->virial[8] = vzz;
1103  }
1104 
1105  if (doSlow) {
1106  double vxxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[0]);
1107  double vxySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[1]);
1108  double vxzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[2]);
1109  double vyxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[3]);
1110  double vyySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[4]);
1111  double vyzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[5]);
1112  double vzxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[6]);
1113  double vzySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[7]);
1114  double vzzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[8]);
1115  if (threadIdx.x == 0) {
1116  virialEnergy->virialSlow[0] = vxxSlow;
1117  virialEnergy->virialSlow[1] = vxySlow;
1118  virialEnergy->virialSlow[2] = vxzSlow;
1119  virialEnergy->virialSlow[3] = vyxSlow;
1120  virialEnergy->virialSlow[4] = vyySlow;
1121  virialEnergy->virialSlow[5] = vyzSlow;
1122  virialEnergy->virialSlow[6] = vzxSlow;
1123  virialEnergy->virialSlow[7] = vzySlow;
1124  virialEnergy->virialSlow[8] = vzzSlow;
1125  }
1126  }
1127  }
1128 
1129  if (doEnergy) {
1130  double energyVdw = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw);
1131  double energyElec = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec);
1132  if (threadIdx.x == 0) {
1133  virialEnergy->energyVdw = energyVdw;
1134  virialEnergy->energyElec = energyElec;
1135  }
1136  if (doSlow) {
1137  double energySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow);
1138  if (threadIdx.x == 0) {
1139  virialEnergy->energySlow = energySlow;
1140  }
1141  }
1142  if (doGBIS) {
1143  double energyGBIS = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyGBIS);
1144  if (threadIdx.x == 0) {
1145  virialEnergy->energyGBIS = energyGBIS;
1146  }
1147  }
1148  }
1149 }
#define ATOMIC_BINS
Definition: CudaUtils.h:24
double virialSlow[9]
__shared__ union @43 tempStorage
__global__ void reduceNonbondedVirialKernel ( const bool  doSlow,
const int  atomStorageSize,
const float4 *__restrict__  xyzq,
const float4 *__restrict__  devForces,
const float4 *__restrict__  devForcesSlow,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 804 of file CudaComputeNonbondedKernel.cu.

References ATOMIC_BINS, BLOCK_SYNC, and tempStorage.

808  {
809 
810  for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
811  {
812  int i = ibase + threadIdx.x;
813 
814  // Set to zero to avoid nan*0
815  float4 pos;
816  pos.x = 0.0f;
817  pos.y = 0.0f;
818  pos.z = 0.0f;
819  float4 force, forceSlow;
820  force.x = 0.0f;
821  force.y = 0.0f;
822  force.z = 0.0f;
823  forceSlow.x = 0.0f;
824  forceSlow.y = 0.0f;
825  forceSlow.z = 0.0f;
826  if (i < atomStorageSize) {
827  pos = xyzq[i];
828  force = devForces[i];
829  if (doSlow) forceSlow = devForcesSlow[i];
830  }
831  // Reduce across the entire thread block
832  float vxxt = force.x*pos.x;
833  float vxyt = force.x*pos.y;
834  float vxzt = force.x*pos.z;
835  float vyxt = force.y*pos.x;
836  float vyyt = force.y*pos.y;
837  float vyzt = force.y*pos.z;
838  float vzxt = force.z*pos.x;
839  float vzyt = force.z*pos.y;
840  float vzzt = force.z*pos.z;
841 
842  const int bin = blockIdx.x % ATOMIC_BINS;
843 
844  // atomicAdd(&virialEnergy->virial[0], (double)vxx);
845  // atomicAdd(&virialEnergy->virial[1], (double)vxy);
846  // atomicAdd(&virialEnergy->virial[2], (double)vxz);
847  // atomicAdd(&virialEnergy->virial[3], (double)vyx);
848  // atomicAdd(&virialEnergy->virial[4], (double)vyy);
849  // atomicAdd(&virialEnergy->virial[5], (double)vyz);
850  // atomicAdd(&virialEnergy->virial[6], (double)vzx);
851  // atomicAdd(&virialEnergy->virial[7], (double)vzy);
852  // atomicAdd(&virialEnergy->virial[8], (double)vzz);
853 
854  typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
855  __shared__ typename BlockReduce::TempStorage tempStorage;
856  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
857  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
858  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
859  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
860  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
861  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
862  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
863  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
864  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
865  if (threadIdx.x == 0) {
866  atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
867  atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
868  atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
869  atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
870  atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
871  atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
872  atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
873  atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
874  atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
875  }
876 
877  if (doSlow) {
878  // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
879  float vxxSlowt = forceSlow.x*pos.x;
880  float vxySlowt = forceSlow.x*pos.y;
881  float vxzSlowt = forceSlow.x*pos.z;
882  float vyxSlowt = forceSlow.y*pos.x;
883  float vyySlowt = forceSlow.y*pos.y;
884  float vyzSlowt = forceSlow.y*pos.z;
885  float vzxSlowt = forceSlow.z*pos.x;
886  float vzySlowt = forceSlow.z*pos.y;
887  float vzzSlowt = forceSlow.z*pos.z;
888  // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
889  // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
890  // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
891  // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
892  // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
893  // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
894  // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
895  // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
896  // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
897  float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
898  float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
899  float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
900  float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
901  float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
902  float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
903  float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
904  float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
905  float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
906  if (threadIdx.x == 0) {
907  atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxxSlow);
908  atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxySlow);
909  atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxzSlow);
910  atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyxSlow);
911  atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyySlow);
912  atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyzSlow);
913  atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzxSlow);
914  atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzySlow);
915  atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzzSlow);
916  }
917  }
918 
919  }
920 
921 }
#define ATOMIC_BINS
Definition: CudaUtils.h:24
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__shared__ union @43 tempStorage
__global__ void reduceVirialEnergyKernel ( const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const int  numTileLists,
const TileListVirialEnergy *__restrict__  tileListVirialEnergy,
VirialEnergy *__restrict__  virialEnergy 
)

Definition at line 923 of file CudaComputeNonbondedKernel.cu.

References ATOMIC_BINS, BLOCK_SYNC, TileListVirialEnergy::energyElec, TileListVirialEnergy::energySlow, TileListVirialEnergy::energyVdw, TileListVirialEnergy::forceSlowx, TileListVirialEnergy::forceSlowy, TileListVirialEnergy::forceSlowz, TileListVirialEnergy::forcex, TileListVirialEnergy::forcey, TileListVirialEnergy::forcez, itileList, TileListVirialEnergy::shx, TileListVirialEnergy::shy, TileListVirialEnergy::shz, and tempStorage.

927  {
928 
929  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
930  {
931  int itileList = ibase + threadIdx.x;
933  if (itileList < numTileLists) {
934  ve = tileListVirialEnergy[itileList];
935  } else {
936  // Set to zero to avoid nan*0
937  if (doVirial) {
938  ve.shx = 0.0f;
939  ve.shy = 0.0f;
940  ve.shz = 0.0f;
941  ve.forcex = 0.0f;
942  ve.forcey = 0.0f;
943  ve.forcez = 0.0f;
944  ve.forceSlowx = 0.0f;
945  ve.forceSlowy = 0.0f;
946  ve.forceSlowz = 0.0f;
947  }
948  if (doEnergy) {
949  ve.energyVdw = 0.0;
950  ve.energyElec = 0.0;
951  ve.energySlow = 0.0;
952  // ve.energyGBIS = 0.0;
953  }
954  }
955 
956  const int bin = blockIdx.x % ATOMIC_BINS;
957 
958  if (doVirial) {
959  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
960  __shared__ typename BlockReduce::TempStorage tempStorage;
961  float vxxt = ve.forcex*ve.shx;
962  float vxyt = ve.forcex*ve.shy;
963  float vxzt = ve.forcex*ve.shz;
964  float vyxt = ve.forcey*ve.shx;
965  float vyyt = ve.forcey*ve.shy;
966  float vyzt = ve.forcey*ve.shz;
967  float vzxt = ve.forcez*ve.shx;
968  float vzyt = ve.forcez*ve.shy;
969  float vzzt = ve.forcez*ve.shz;
970  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
971  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
972  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
973  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
974  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
975  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
976  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
977  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
978  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
979  if (threadIdx.x == 0) {
980  atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
981  atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
982  atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
983  atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
984  atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
985  atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
986  atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
987  atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
988  atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
989  }
990 
991  if (doSlow) {
992  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
993  __shared__ typename BlockReduce::TempStorage tempStorage;
994  float vxxt = ve.forceSlowx*ve.shx;
995  float vxyt = ve.forceSlowx*ve.shy;
996  float vxzt = ve.forceSlowx*ve.shz;
997  float vyxt = ve.forceSlowy*ve.shx;
998  float vyyt = ve.forceSlowy*ve.shy;
999  float vyzt = ve.forceSlowy*ve.shz;
1000  float vzxt = ve.forceSlowz*ve.shx;
1001  float vzyt = ve.forceSlowz*ve.shy;
1002  float vzzt = ve.forceSlowz*ve.shz;
1003  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1004  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1005  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1006  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1007  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1008  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1009  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1010  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1011  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1012  if (threadIdx.x == 0) {
1013  atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxx);
1014  atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxy);
1015  atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxz);
1016  atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyx);
1017  atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyy);
1018  atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyz);
1019  atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzx);
1020  atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzy);
1021  atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzz);
1022  }
1023  }
1024  }
1025 
1026  if (doEnergy) {
1027  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1028  __shared__ typename BlockReduce::TempStorage tempStorage;
1029  double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
1030  double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
1031  if (threadIdx.x == 0) {
1032  atomicAdd(&virialEnergy[bin].energyVdw, energyVdw);
1033  atomicAdd(&virialEnergy[bin].energyElec, energyElec);
1034  }
1035  if (doSlow) {
1036  double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
1037  if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energySlow, energySlow);
1038  }
1039  // if (doGBIS) {
1040  // double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
1041  // if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
1042  // }
1043  }
1044 
1045  }
1046 
1047 }
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
#define ATOMIC_BINS
Definition: CudaUtils.h:24
__global__ void const int numTileLists
__shared__ union @43 tempStorage
__device__ __forceinline__ float4 sampleTableTex ( cudaTextureObject_t  tex,
float  k 
)

Definition at line 44 of file CudaComputeNonbondedKernel.cu.

References FORCE_ENERGY_TABLE_SIZE, and x.

44  {
45  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
46  const float x = k * (float)tableSize - 0.5f;
47  const float f = floorf(x);
48  const float a = x - f;
49  const unsigned int i = (unsigned int)f;
50  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
51  const int i1 = i0 + 1;
52  const float4 t0 = tex1Dfetch<float4>(tex, i0);
53  const float4 t1 = tex1Dfetch<float4>(tex, i1);
54  return make_float4(
55  a * (t1.x - t0.x) + t0.x,
56  a * (t1.y - t0.y) + t0.y,
57  a * (t1.z - t0.z) + t0.z,
58  a * (t1.w - t0.w) + t0.w);
59 }
#define FORCE_ENERGY_TABLE_SIZE
Definition: CudaUtils.h:19
gridSize x
template<bool doPairlist>
__device__ __forceinline__ void shuffleNext ( float &  xyzq_j_w,
int &  vdwtypej,
int &  jatomIndex,
int &  jexclMaxdiff,
int &  jexclIndex 
)

Definition at line 192 of file CudaComputeNonbondedKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

192  {
193  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
194  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
195  if (doPairlist) {
196  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
197  jexclIndex = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
198  jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
199  }
200 }
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
#define WARPSIZE
Definition: CudaUtils.h:10
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:54
template<bool doPairlist>
__device__ __forceinline__ void shuffleNext ( float &  xyzq_j_w,
int &  vdwtypej,
int &  jatomIndex 
)

Definition at line 204 of file CudaComputeNonbondedKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

204  {
205  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
206  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
207  if (doPairlist) {
208  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
209  }
210 }
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
#define WARPSIZE
Definition: CudaUtils.h:10
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:54
template<bool doSlow>
__device__ __forceinline__ void shuffleNext ( float3 &  jforce,
float3 &  jforceSlow 
)

Definition at line 214 of file CudaComputeNonbondedKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

214  {
215  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
216  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
217  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
218  if (doSlow) {
219  jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
220  jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
221  jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
222  }
223 }
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
#define WARPSIZE
Definition: CudaUtils.h:10
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:54
template<bool doSlow>
__device__ __forceinline__ void storeForces ( const int  pos,
const float3  force,
const float3  forceSlow,
float *__restrict__  devForces_x,
float *__restrict__  devForces_y,
float *__restrict__  devForces_z,
float *__restrict__  devForcesSlow_x,
float *__restrict__  devForcesSlow_y,
float *__restrict__  devForcesSlow_z 
)

Definition at line 156 of file CudaComputeNonbondedKernel.cu.

163 {
164 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
165  if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
166  atomicAddNoRet(&devForces_x[pos], force.x);
167  atomicAddNoRet(&devForces_y[pos], force.y);
168  atomicAddNoRet(&devForces_z[pos], force.z);
169  }
170  if (doSlow) {
171  if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
172  atomicAddNoRet(&devForcesSlow_x[pos], forceSlow.x);
173  atomicAddNoRet(&devForcesSlow_y[pos], forceSlow.y);
174  atomicAddNoRet(&devForcesSlow_z[pos], forceSlow.z);
175  }
176  }
177 #else
178  atomicAdd(&devForces_x[pos], force.x);
179  atomicAdd(&devForces_y[pos], force.y);
180  atomicAdd(&devForces_z[pos], force.z);
181  if (doSlow) {
182  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
183  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
184  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
185  }
186 #endif
187 }
template<int doSlow>
__global__ void transposeForcesKernel ( float4 *  f,
float4 *  fSlow,
float *  fx,
float *  fy,
float *  fz,
float *  fw,
float *  fSlowx,
float *  fSlowy,
float *  fSlowz,
float *  fSloww,
int  n 
)

Definition at line 1234 of file CudaComputeNonbondedKernel.cu.

1238 {
1239  int tid = blockIdx.x*blockDim.x + threadIdx.x;
1240  if (tid < n) {
1241  f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
1242  if (doSlow) {
1243  fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
1244  }
1245  }
1246 }

Variable Documentation

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ atomIndex
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ boundingBoxes

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ const CudaPatchRecord* __restrict__ cudaPatches

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cutoff2
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ devForce_w

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ devForce_x

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ devForce_y

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ devForce_z

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ devForceSlow_w

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ devForceSlow_x

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ devForceSlow_y

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ devForceSlow_z

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__thread DeviceCUDA* deviceCUDA

Definition at line 22 of file DeviceCUDA.C.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t energyTableTex

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

Referenced by calcForceEnergy(), and modifiedExclusionForce().

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ exclIndexMaxDiff

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t forceTableTex

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

Referenced by calcForceEnergy(), and modifiedExclusionForce().

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ const CudaPatchRecord* __restrict__ float4* __restrict__ float4* __restrict__ int* __restrict__ int* __restrict__ TileListVirialEnergy* __restrict__ virialEnergy int itileList = start + (NONBONDKERNEL_NUM_WARP == 1 ? blockIdx.x : (threadIdx.x/WARPSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x))
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ jtiles

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 lata
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 latb
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 latc
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ const CudaPatchRecord* __restrict__ float4* __restrict__ mapForces

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ const CudaPatchRecord* __restrict__ float4* __restrict__ float4* __restrict__ mapForcesSlow

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ const CudaPatchRecord* __restrict__ float4* __restrict__ float4* __restrict__ int* __restrict__ mapPatchReadyQueue

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int numPatches
__global__ void const int numTileLists

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

Referenced by buildTileListsBBKernel().

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ const CudaPatchRecord* __restrict__ float4* __restrict__ float4* __restrict__ int* __restrict__ int* __restrict__ outputOrder

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ overflowExclusions

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ const BoundingBox* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ float* __restrict__ const int unsigned int* __restrict__ patchNumCount

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ patchPairs

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ tileExcls

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ tileJatomStart

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ tileListDepth

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ tileListOrder

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ tileLists

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord* __restrict__ const int* __restrict__ const int2* __restrict__ const unsigned int* __restrict__ unsigned int* __restrict__ int* __restrict__ int* __restrict__ TileListStat* __restrict__ tileListStat

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ vdwCoefTable

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

Referenced by modifiedExclusionForcesKernel().

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t vdwCoefTableTex
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int vdwCoefTableWidth
__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ vdwTypes

Definition at line 254 of file CudaComputeNonbondedKernel.cu.

__global__ void const int const TileList* __restrict__ TileExcl* __restrict__ const int* __restrict__ const int const float2* __restrict__ cudaTextureObject_t const int* __restrict__ const float3 const float3 const float3 const float4* __restrict__ xyzq