NAMD
Classes | Macros | Functions | Variables
CudaComputeGBISKernel.cu File Reference
#include <cuda.h>
#include <namd_cub/cub.cuh>
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#include "ComputeGBIS.inl"
#include "DeviceCUDA.h"

Go to the source code of this file.

Classes

struct  GBISParam< phase >
 
struct  GBISInput< phase >
 
struct  GBISResults< phase >
 
struct  GBISParam< 1 >
 
struct  GBISInput< 1 >
 
struct  GBISResults< 1 >
 
struct  GBISParam< 2 >
 
struct  GBISInput< 2 >
 
struct  GBISResults< 2 >
 
struct  GBISParam< 3 >
 
struct  GBISInput< 3 >
 
struct  GBISResults< 3 >
 

Macros

#define GBIS_CUDA
 
#define LARGE_FLOAT   (float)(1.0e10)
 
#define GBISKERNEL_NUM_WARP   4
 
#define CALL(DOENERGY, DOSLOW)
 

Functions

__device__ __forceinline__ void shuffleNext (float &w)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 1 > param, const GBISInput< 1 > inp, GBISResults< 1 > &resI, GBISResults< 1 > &resJ, float &energy)
 
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 1 > res, float *out, float4 *forces)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 2 > param, const GBISInput< 2 > inp, GBISResults< 2 > &resI, GBISResults< 2 > &resJ, float &energyT)
 
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 2 > res, float *out, float4 *forces)
 
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase (const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 3 > param, const GBISInput< 3 > inp, GBISResults< 3 > &resI, GBISResults< 3 > &resJ, float &energy)
 
__device__ __forceinline__ void writeResult (const int i, const GBISResults< 3 > res, float *out, float4 *forces)
 
template<bool doEnergy, bool doSlow, int phase>
__global__ void GBIS_Kernel (const int numTileLists, const TileList *__restrict__ tileLists, const int *__restrict__ tileJatomStart, const PatchPairRecord *__restrict__ patchPairs, const float3 lata, const float3 latb, const float3 latc, const float4 *__restrict__ xyzq, const float cutoff2, const GBISParam< phase > param, const float *__restrict__ inp1, const float *__restrict__ inp2, const float *__restrict__ inp3, float *__restrict__ out, float4 *__restrict__ forces, TileListVirialEnergy *__restrict__ virialEnergy)
 

Variables

__thread DeviceCUDAdeviceCUDA
 

Macro Definition Documentation

#define CALL (   DOENERGY,
  DOSLOW 
)
Value:
GBIS_Kernel<DOENERGY, DOSLOW, 2> <<< nblock, nthread, 0, stream >>> (\
tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
#define GBIS_CUDA

Definition at line 18 of file CudaComputeGBISKernel.cu.

#define GBISKERNEL_NUM_WARP   4
#define LARGE_FLOAT   (float)(1.0e10)

Definition at line 26 of file CudaComputeGBISKernel.cu.

Function Documentation

template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 1 >  param,
const GBISInput< 1 >  inp,
GBISResults< 1 > &  resI,
GBISResults< 1 > &  resJ,
float &  energy 
)

Definition at line 78 of file CudaComputeGBISKernel.cu.

References GBISParam< 1 >::a_cut, CalcH(), GBISInput< 1 >::intRad0j, GBISInput< 1 >::intRadSi, GBISResults< 1 >::psiSum, GBISInput< 1 >::qi, and GBISInput< 1 >::qj.

80  {
81 
82  if (r2 < cutoff2 && r2 > 0.01f) {
83  float r_i = rsqrtf(r2);
84  float r = r2 * r_i;
85  float hij;
86  int dij;
87  CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
88  resI.psiSum += hij;
89  float hji;
90  int dji;
91  CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
92  resJ.psiSum += hji;
93  }
94 
95 }
static void CalcH(float r, float r2, float ri, float rc, float r0, float rs, float &h, int &d)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 2 >  param,
const GBISInput< 2 >  inp,
GBISResults< 2 > &  resI,
GBISResults< 2 > &  resJ,
float &  energyT 
)

Definition at line 147 of file CudaComputeGBISKernel.cu.

References GBISInput< 2 >::bornRadI, GBISInput< 2 >::bornRadJ, GBISResults< 2 >::dEdaSum, GBISParam< 2 >::epsilon_p_i, GBISParam< 2 >::epsilon_s_i, GBISResults< 2 >::force, if(), GBISParam< 2 >::kappa, GBISInput< 2 >::qi, GBISInput< 2 >::qj, GBISParam< 2 >::r_cut2, GBISParam< 2 >::r_cut_2, GBISParam< 2 >::r_cut_4, GBISParam< 2 >::scaling, and GBISParam< 2 >::smoothDist.

149  {
150 
151  if (r2 < cutoff2 && r2 > 0.01f) {
152  float r_i = rsqrtf(r2);
153  float r = r2 * r_i;
154  //float bornRadJ = sh_jBornRad[j];
155 
156  //calculate GB energy
157  float qiqj = inp.qi*inp.qj;
158  float aiaj = inp.bornRadI*inp.bornRadJ;
159  float aiaj4 = 4*aiaj;
160  float expr2aiaj4 = exp(-r2/aiaj4);
161  float fij = sqrt(r2 + aiaj*expr2aiaj4);
162  float f_i = 1/fij;
163  float expkappa = exp(-param.kappa*fij);
164  float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
165  float gbEij = qiqj*Dij*f_i;
166 
167  //calculate energy derivatives
168  float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
169  float ddrf_i = -ddrfij*f_i*f_i;
170  float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
171  float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
172 
173  //NAMD smoothing function
174  float scale = 1.f;
175  float ddrScale = 0.f;
176  float forcedEdr;
177  if (param.smoothDist > 0.f) {
178  scale = r2 * param.r_cut_2 - 1.f;
179  scale *= scale;
180  ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
181  if (doEnergy) energyT += gbEij * scale;
182  forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
183  } else {
184  if (doEnergy) energyT += gbEij;
185  forcedEdr = ddrGbEij;
186  }
187 
188  //add dEda
189  if (doSlow) {
190  float dEdai = 0.5f*qiqj*f_i*f_i
191  *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
192  *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
193  resI.dEdaSum += dEdai;
194  float dEdaj = 0.5f*qiqj*f_i*f_i
195  *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
196  *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
197  resJ.dEdaSum += dEdaj;
198  }
199 
200  forcedEdr *= r_i;
201  float tmpx = dx*forcedEdr;
202  float tmpy = dy*forcedEdr;
203  float tmpz = dz*forcedEdr;
204  resI.force.x += tmpx;
205  resI.force.y += tmpy;
206  resI.force.z += tmpz;
207  resJ.force.x -= tmpx;
208  resJ.force.y -= tmpy;
209  resJ.force.z -= tmpz;
210  }
211 
212  // GB Self Energy
213  if (doEnergy) {
214  if (r2 < 0.01f) {
215  float fij = inp.bornRadI;//inf
216  float expkappa = exp(-param.kappa*fij);//0
217  float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
218  float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
219  energyT += 0.5f*gbEij;
220  } //same atom or within cutoff
221  }
222 }
if(ComputeNonbondedUtil::goMethod==2)
template<bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase ( const float  r2,
const float  dx,
const float  dy,
const float  dz,
const float  cutoff2,
const GBISParam< 3 >  param,
const GBISInput< 3 >  inp,
GBISResults< 3 > &  resI,
GBISResults< 3 > &  resJ,
float &  energy 
)

Definition at line 274 of file CudaComputeGBISKernel.cu.

References GBISParam< 3 >::a_cut, CalcDH(), GBISInput< 3 >::dHdrPrefixI, GBISInput< 3 >::dHdrPrefixJ, GBISResults< 3 >::force, GBISInput< 3 >::intRadIS, GBISInput< 3 >::intRadJ0, GBISInput< 3 >::intRadSJ, and GBISInput< 3 >::qi.

276  {
277 
278  if (r2 < cutoff2 && r2 > 0.01f) {
279  float r_i = rsqrtf(r2);
280  float r = r2 * r_i;
281  float dhij, dhji;
282  int dij, dji;
283  CalcDH(r, r2, r_i, param.a_cut, inp.qi, inp.intRadSJ, dhij, dij);
284  CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);
285 
286  float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
287  float tmpx = dx * forceAlpha;
288  float tmpy = dy * forceAlpha;
289  float tmpz = dz * forceAlpha;
290  resI.force.x += tmpx;
291  resI.force.y += tmpy;
292  resI.force.z += tmpz;
293  resJ.force.x -= tmpx;
294  resJ.force.y -= tmpy;
295  resJ.force.z -= tmpz;
296  }
297 
298 }
static void CalcDH(float r, float r2, float ri, float rc, float r0, float rs, float &dh, int &d)
template<bool doEnergy, bool doSlow, int phase>
__global__ void GBIS_Kernel ( const int  numTileLists,
const TileList *__restrict__  tileLists,
const int *__restrict__  tileJatomStart,
const PatchPairRecord *__restrict__  patchPairs,
const float3  lata,
const float3  latb,
const float3  latc,
const float4 *__restrict__  xyzq,
const float  cutoff2,
const GBISParam< phase >  param,
const float *__restrict__  inp1,
const float *__restrict__  inp2,
const float *__restrict__  inp3,
float *__restrict__  out,
float4 *__restrict__  forces,
TileListVirialEnergy *__restrict__  virialEnergy 
)

Definition at line 314 of file CudaComputeGBISKernel.cu.

References cutoff2, GBISKERNEL_NUM_WARP, PatchPairRecord::iatomSize, TileList::iatomStart, itileList, PatchPairRecord::jatomSize, TileList::jtileEnd, TileList::jtileStart, TileList::offsetXYZ, tempStorage, WARP_FULL_MASK, WARP_SHUFFLE, WARPSIZE, and writeResult().

327  {
328 
329  // Single warp takes care of one list of tiles
330  for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;itileList < numTileLists;itileList += blockDim.x/WARPSIZE*gridDim.x)
331  {
332  TileList tmp = tileLists[itileList];
333  int iatomStart = tmp.iatomStart;
334  int jtileStart = tmp.jtileStart;
335  int jtileEnd = tmp.jtileEnd;
336  PatchPairRecord PPStmp = patchPairs[itileList];
337  int iatomSize = PPStmp.iatomSize;
338  int jatomSize = PPStmp.jatomSize;
339 
340  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
341  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
342  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
343 
344  // Warp index (0...warpsize-1)
345  const int wid = threadIdx.x % WARPSIZE;
346 
347  // Load i-atom data (and shift coordinates)
348  float4 xyzq_i = xyzq[iatomStart + wid];
349  xyzq_i.x += shx;
350  xyzq_i.y += shy;
351  xyzq_i.z += shz;
352 
353  GBISInput<phase> inp;
354  inp.loadI(iatomStart + wid, inp1, inp2, inp3);
355  if (phase == 2) inp.initQi(param, xyzq_i.w);
356 
357  // Number of i loops
358  int nloopi = min(iatomSize - iatomStart, WARPSIZE);
359 
360  GBISResults<phase> resI;
361  resI.init();
362  float energyT;
363  if (doEnergy) energyT = 0.0f;
364 
365  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
366 
367  // Load j-atom starting index and exclusion mask
368  int jatomStart = tileJatomStart[jtile];
369 
370  // Load coordinates and charge
371  float4 xyzq_j = xyzq[jatomStart + wid];
372 
373  inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
374  if (phase == 2) inp.initQj(xyzq_j.w);
375 
376  // Number of j loops
377  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
378 
379  const bool diag_tile = (iatomStart == jatomStart);
380  const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
381  int t = (phase != 2 && diag_tile) ? 1 : 0;
382  if (phase != 2 && diag_tile) {
383  inp.shuffleNext();
384  }
385 
386  GBISResults<phase> resJ;
387  resJ.init();
388 
389  for (;t < WARPSIZE;t++) {
390  int j = (t + wid) & modval;
391 
392  float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
393  float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
394  float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
395 
396  float r2 = dx*dx + dy*dy + dz*dz;
397 
398  if (j < nloopj && wid < nloopi) {
399  calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
400  }
401 
402  inp.shuffleNext();
403  resJ.shuffleNext();
404  } // t
405 
406  // Write j
407  writeResult(jatomStart + wid, resJ, out, forces);
408 
409  } // jtile
410 
411  // Write i
412  writeResult(iatomStart + wid, resI, out, forces);
413 
414  // Reduce energy
415  if (doEnergy) {
416  typedef cub::WarpReduce<double> WarpReduceDouble;
417  __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
418  int warpId = threadIdx.x / WARPSIZE;
419  volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
420  if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
421  }
422 
423  }
424 }
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread float4 * forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
#define WARPSIZE
Definition: CudaUtils.h:10
__device__ __forceinline__ void writeResult(const int i, const GBISResults< 1 > res, float *out, float4 *forces)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
float3 offsetXYZ
__global__ void const int numTileLists
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
#define GBISKERNEL_NUM_WARP
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:54
__device__ __forceinline__ void shuffleNext ( float &  w)

Definition at line 31 of file CudaComputeGBISKernel.cu.

References WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

31  {
32  w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
33 }
#define WARP_FULL_MASK
Definition: CudaUtils.h:21
#define WARPSIZE
Definition: CudaUtils.h:10
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
Definition: CudaUtils.h:54
__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 1 >  res,
float *  out,
float4 *  forces 
)

Definition at line 97 of file CudaComputeGBISKernel.cu.

References GBISResults< 1 >::psiSum.

Referenced by GBIS_Kernel().

97  {
98  atomicAdd(&out[i], res.psiSum);
99 }
__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 2 >  res,
float *  out,
float4 *  forces 
)

Definition at line 224 of file CudaComputeGBISKernel.cu.

References GBISResults< 2 >::dEdaSum, GBISResults< 2 >::force, x, y, and z.

224  {
225  atomicAdd(&out[i], res.dEdaSum);
226  atomicAdd(&forces[i].x, res.force.x);
227  atomicAdd(&forces[i].y, res.force.y);
228  atomicAdd(&forces[i].z, res.force.z);
229 }
static __thread float4 * forces
gridSize z
gridSize y
gridSize x
__device__ __forceinline__ void writeResult ( const int  i,
const GBISResults< 3 >  res,
float *  out,
float4 *  forces 
)

Definition at line 300 of file CudaComputeGBISKernel.cu.

References GBISResults< 3 >::force, x, y, and z.

300  {
301  atomicAdd(&forces[i].x, res.force.x);
302  atomicAdd(&forces[i].y, res.force.y);
303  atomicAdd(&forces[i].z, res.force.z);
304 }
static __thread float4 * forces
gridSize z
gridSize y
gridSize x

Variable Documentation

__thread DeviceCUDA* deviceCUDA

Definition at line 22 of file DeviceCUDA.C.