3 #if __CUDACC_VER_MAJOR__ >= 11
6 #include <namd_cub/cub.cuh>
7 #endif //CUDACC version
9 #ifdef NAMD_HIP //NAMD_HIP
10 #include <hip/hip_runtime.h>
11 #include <hipcub/hipcub.hpp>
22 #define __thread __declspec(thread)
26 #define LARGE_FLOAT (float)(1.0e10)
28 #define GBISKERNEL_NUM_WARP 4
30 __device__ __forceinline__
52 __device__ __forceinline__
void loadI(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
56 __device__ __forceinline__
void loadJ(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
61 __device__ __forceinline__
void initQj(
const float q) {}
70 __device__ __forceinline__
void init() {psiSum = 0.0f;}
77 template <
bool doEnergy,
bool doSlow>
78 __device__ __forceinline__
void calcGBISPhase(
const float r2,
const float dx,
const float dy,
const float dz,
82 if (r2 < cutoff2 && r2 > 0.01f) {
83 float r_i = rsqrtf(r2);
98 atomicAdd(&out[i], res.
psiSum);
116 __device__ __forceinline__
void loadI(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
119 __device__ __forceinline__
void loadJ(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
125 __device__ __forceinline__
void initQj(
const float q) {
137 __device__ __forceinline__
void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
146 template <
bool doEnergy,
bool doSlow>
147 __device__ __forceinline__
void calcGBISPhase(
const float r2,
const float dx,
const float dy,
const float dz,
151 if (r2 < cutoff2 && r2 > 0.01f) {
152 float r_i = rsqrtf(r2);
157 float qiqj = inp.
qi*inp.
qj;
159 float aiaj4 = 4*aiaj;
160 float expr2aiaj4 = exp(-r2/aiaj4);
161 float fij = sqrt(r2 + aiaj*expr2aiaj4);
163 float expkappa = exp(-param.
kappa*fij);
165 float gbEij = qiqj*Dij*f_i;
168 float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
169 float ddrf_i = -ddrfij*f_i*f_i;
171 float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
175 float ddrScale = 0.f;
178 scale = r2 * param.
r_cut_2 - 1.f;
181 if (doEnergy) energyT += gbEij * scale;
182 forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
184 if (doEnergy) energyT += gbEij;
185 forcedEdr = ddrGbEij;
190 float dEdai = 0.5f*qiqj*f_i*f_i
192 *(aiaj+0.25f*r2)*expr2aiaj4/inp.
bornRadI*scale;
194 float dEdaj = 0.5f*qiqj*f_i*f_i
196 *(aiaj+0.25f*r2)*expr2aiaj4/inp.
bornRadJ*scale;
201 float tmpx = dx*forcedEdr;
202 float tmpy = dy*forcedEdr;
203 float tmpz = dz*forcedEdr;
204 resI.
force.x += tmpx;
205 resI.
force.y += tmpy;
206 resI.
force.z += tmpz;
207 resJ.
force.x -= tmpx;
208 resJ.
force.y -= tmpy;
209 resJ.
force.z -= tmpz;
216 float expkappa = exp(-param.
kappa*fij);
218 float gbEij = inp.
qi*(inp.
qi / (-param.
scaling) )*Dij/fij;
219 energyT += 0.5f*gbEij;
225 atomicAdd(&out[i], res.
dEdaSum);
226 atomicAdd(&forces[i].
x, res.
force.x);
227 atomicAdd(&forces[i].
y, res.
force.y);
228 atomicAdd(&forces[i].
z, res.
force.z);
244 __device__ __forceinline__
void loadI(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
247 dHdrPrefixI = inp3[i];
249 __device__ __forceinline__
void loadJ(
const int i,
const float* inp1,
const float* inp2,
const float* inp3) {
252 dHdrPrefixJ = inp3[i];
255 __device__ __forceinline__
void initQj(
const float q) {}
265 __device__ __forceinline__
void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
273 template <
bool doEnergy,
bool doSlow>
274 __device__ __forceinline__
void calcGBISPhase(
const float r2,
const float dx,
const float dy,
const float dz,
278 if (r2 < cutoff2 && r2 > 0.01f) {
279 float r_i = rsqrtf(r2);
287 float tmpx = dx * forceAlpha;
288 float tmpy = dy * forceAlpha;
289 float tmpz = dz * forceAlpha;
290 resI.
force.x += tmpx;
291 resI.
force.y += tmpy;
292 resI.
force.z += tmpz;
293 resJ.
force.x -= tmpx;
294 resJ.
force.y -= tmpy;
295 resJ.
force.z -= tmpz;
301 atomicAdd(&forces[i].
x, res.
force.x);
302 atomicAdd(&forces[i].
y, res.
force.y);
303 atomicAdd(&forces[i].
z, res.
force.z);
311 template <
bool doEnergy,
bool doSlow,
int phase>
318 const float3
lata, const float3
latb, const float3
latc,
319 const float4* __restrict__
xyzq,
322 const
float* __restrict__ inp1,
323 const
float* __restrict__ inp2,
324 const
float* __restrict__ inp3,
325 float* __restrict__ out,
326 float4* __restrict__
forces,
345 const int wid = threadIdx.x %
WARPSIZE;
348 float4 xyzq_i = xyzq[iatomStart + wid];
354 inp.loadI(iatomStart + wid, inp1, inp2, inp3);
355 if (phase == 2) inp.initQi(param, xyzq_i.w);
358 int nloopi = min(iatomSize - iatomStart,
WARPSIZE);
363 if (doEnergy) energyT = 0.0f;
365 for (
int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
368 int jatomStart = tileJatomStart[jtile];
371 float4 xyzq_j = xyzq[jatomStart + wid];
373 inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
374 if (phase == 2) inp.initQj(xyzq_j.w);
377 int nloopj = min(jatomSize - jatomStart,
WARPSIZE);
379 const bool diag_tile = (iatomStart == jatomStart);
381 int t = (phase != 2 && diag_tile) ? 1 : 0;
382 if (phase != 2 && diag_tile) {
390 int j = (t + wid) & modval;
396 float r2 = dx*dx + dy*dy + dz*dz;
398 if (j < nloopj && wid < nloopi) {
399 calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz,
cutoff2, param, inp, resI, resJ, energyT);
416 typedef cub::WarpReduce<double> WarpReduceDouble;
418 int warpId = threadIdx.x /
WARPSIZE;
419 volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((
double)energyT);
420 if (wid == 0) virialEnergy[
itileList].energyGBIS = energyTot;
463 if (intRad0 != NULL) deallocate_device<float>(&intRad0);
464 if (intRadS != NULL) deallocate_device<float>(&intRadS);
465 if (psiSum != NULL) deallocate_device<float>(&psiSum);
466 if (bornRad != NULL) deallocate_device<float>(&bornRad);
467 if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
468 if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
477 reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
478 reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);
488 reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
496 reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
504 const float3
lata,
const float3
latb,
const float3
latc,
const float a_cut,
float* h_psiSum,
507 reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
508 clear_device_array<float>(psiSum, atomStorageSize,
stream);
518 GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>> (
521 param, intRad0, intRadS, NULL, psiSum, NULL, NULL);
525 copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize,
stream);
532 const bool doEnergy,
const bool doSlow,
533 const float3
lata,
const float3
latb,
const float3
latc,
534 const float r_cut,
const float scaling,
const float kappa,
const float smoothDist,
535 const float epsilon_p,
const float epsilon_s,
536 float4* d_forces,
float* h_dEdaSum, cudaStream_t
stream) {
538 reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
539 clear_device_array<float>(dEdaSum, atomStorageSize,
stream);
551 param.
r_cut2 = r_cut*r_cut;
560 #define CALL(DOENERGY, DOSLOW) \
561 GBIS_Kernel<DOENERGY, DOSLOW, 2> <<< nblock, nthread, 0, stream >>> (\
562 tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
563 tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
564 param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())
566 if (!doEnergy && !doSlow)
CALL(
false,
false);
567 if (!doEnergy && doSlow)
CALL(
false,
true);
568 if ( doEnergy && !doSlow)
CALL(
true,
false);
569 if ( doEnergy && doSlow)
CALL(
true,
true);
573 copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize,
stream);
580 const float3
lata,
const float3
latb,
const float3
latc,
const float a_cut,
581 float4* d_forces, cudaStream_t
stream) {
591 GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>> (
594 param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);
__global__ void GBIS_Kernel(const int numTileLists, const TileList *__restrict__ tileLists, const int *__restrict__ tileJatomStart, const PatchPairRecord *__restrict__ patchPairs, const float3 lata, const float3 latb, const float3 latc, const float4 *__restrict__ xyzq, const float cutoff2, const GBISParam< phase > param, const float *__restrict__ inp1, const float *__restrict__ inp2, const float *__restrict__ inp3, float *__restrict__ out, float4 *__restrict__ forces, TileListVirialEnergy *__restrict__ virialEnergy)
void updateIntRad(const int atomStorageSize, float *intRad0H, float *intRadSH, cudaStream_t stream)
void GBISphase2(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float r_cut, const float scaling, const float kappa, const float smoothDist, const float epsilon_p, const float epsilon_s, float4 *d_forces, float *h_dEdaSum, cudaStream_t stream)
void GBISphase3(CudaTileListKernel &tlKernel, const int atomStorageSize, const float3 lata, const float3 latb, const float3 latc, const float a_cut, float4 *d_forces, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ patchPairs
PatchPairRecord * getPatchPairs()
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz, const float cutoff2, const GBISParam< 1 > param, const GBISInput< 1 > inp, GBISResults< 1 > &resI, GBISResults< 1 > &resJ, float &energy)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread float4 * forces
static __thread float * bornRadH
void setTileListVirialEnergyGBISLength(int len)
static __thread float * dHdrPrefixH
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__device__ __forceinline__ void shuffleNext()
if(ComputeNonbondedUtil::goMethod==2)
__device__ __forceinline__ void shuffleNext()
void updateBornRad(const int atomStorageSize, float *bornRadH, cudaStream_t stream)
__device__ __forceinline__ void shuffleNext(float &w)
__device__ __forceinline__ void init()
__device__ __forceinline__ void writeResult(const int i, const GBISResults< 1 > res, float *out, float4 *forces)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__global__ void const int const TileList *__restrict__ tileLists
TileList * getTileListsGBIS()
static __thread float * intRadSH
static void CalcDH(float r, float r2, float ri, float rc, float r0, float rs, float &dh, int &d)
__device__ __forceinline__ void init()
int getNumTileListsGBIS()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
int * getTileJatomStartGBIS()
__global__ void const int numTileLists
__device__ __forceinline__ void init()
CudaComputeGBISKernel(int deviceID)
__shared__ union @43 tempStorage
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__thread DeviceCUDA * deviceCUDA
static void CalcH(float r, float r2, float ri, float rc, float r0, float rs, float &h, int &d)
void update_dHdrPrefix(const int atomStorageSize, float *dHdrPrefixH, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
__device__ __forceinline__ void shuffleNext()
#define CALL(DOENERGY, DOVIRIAL)
#define GBISKERNEL_NUM_WARP
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start
void GBISphase1(CudaTileListKernel &tlKernel, const int atomStorageSize, const float3 lata, const float3 latb, const float3 latc, const float a_cut, float *h_psiSum, cudaStream_t stream)
static __thread float * intRad0H
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)