#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif //CUDACC version
#endif //NAMD_CUDA

#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif //NAMD_HIP
#include "HipDefines.h"
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#include "DeviceCUDA.h"

#define GBIS_CUDA
#include "ComputeGBIS.inl"
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;
#define LARGE_FLOAT (float)(1.0e10)

#define GBISKERNEL_NUM_WARP 4
__device__ __forceinline__
void shuffleNext(float& w) {
  w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
}
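
// shuffleNext() rotates a per-lane value one lane down the warp
// (source lane (threadIdx.x+1) & (WARPSIZE-1)), so WARPSIZE consecutive calls
// cycle every value past every lane exactly once. A minimal standalone sketch
// of the resulting all-pairs pattern (illustrative only; warp_all_pairs is not
// part of this file, and a full warp with no inactive lanes is assumed):
//
//   __device__ float warp_all_pairs(float xi, float xj) {
//     float acc = 0.f;
//     for (int t = 0; t < WARPSIZE; t++) {
//       // lane w sees the xj originally owned by lane (w+t) % WARPSIZE
//       acc += xi * xj;
//       xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
//     }
//     return acc;  // xi has interacted with all WARPSIZE xj values
//   }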
template <int phase> struct GBISParam {};
template <int phase> struct GBISInput {};
template <int phase> struct GBISResults {};
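
// The GBIS computation runs in three passes (phase 1: psi-sum screening
// integrals, phase 2: generalized Born energy/forces and dEda sums,
// phase 3: forces from the Born-radius chain rule). The empty primary
// templates above are specialized per phase below, so the single GBIS_Kernel
// body at the end of this file is instantiated three times with the
// phase-specific parameters, per-atom inputs, and accumulators resolved at
// compile time.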
// ------------------------------------------------------------------------------------------------
// Phase 1 specialization

template <> struct GBISParam<1> {
  float a_cut;
};
template <> struct GBISInput<1> {
  float qi, qj;
  float intRad0j, intRadSi;
  // inp1 = intRad0, inp2 = intRadS (see the GBISphase1 launch below).
  // Despite the names, qi/qj carry intrinsic radii in phase 1, not charges.
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi       = inp1[i];
    intRadSi = inp2[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qj       = inp2[i];
    intRad0j = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<1> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    qj = WARP_SHUFFLE(WARP_FULL_MASK, qj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRad0j = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};
template <> struct GBISResults<1> {
  float psiSum;
  __device__ __forceinline__ void init() {psiSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    psiSum = WARP_SHUFFLE(WARP_FULL_MASK, psiSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};
// calculate H(i,j) [and not H(j,i)]
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<1> param, const GBISInput<1> inp,
  GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r   = r2 * r_i;
    float hij;
    int dij;
    CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
    resI.psiSum += hij;
    float hji;
    int dji;
    CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
    resJ.psiSum += hji;
  }

}
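
// CalcH comes from ComputeGBIS.inl: it evaluates the pairwise descreening
// integral H(i,j) of the GBIS model from the distance and the two atoms'
// intrinsic radii, cut off at a_cut. Both orderings are computed here so a
// single pass over each pair updates the psi sums of both atoms; the trailing
// int output (the piecewise-domain index of H) is unused in this kernel.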
__device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.psiSum);
}
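
// atomicAdd is required here: different warps, and different tiles within one
// warp's tile list, can accumulate into the same atom's slot in out[].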
// ------------------------------------------------------------------------------------------------
// Phase 2 specialization

template <> struct GBISParam<2> {
  float kappa;
  float epsilon_p_i, epsilon_s_i;
  float smoothDist;
  float r_cut2, r_cut_2, r_cut_4;
  float scaling;
};
template <> struct GBISInput<2> {
  float qi, qj;
  float bornRadI, bornRadJ;
  // inp1 = bornRad (see the GBISphase2 launch below); inp2/inp3 are unused
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadI = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadJ = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<2> param, const float q) {
    qi = -q*param.scaling;
  }
  __device__ __forceinline__ void initQj(const float q) {
    qj = q;
  }
  __device__ __forceinline__ void shuffleNext() {
    qj = WARP_SHUFFLE(WARP_FULL_MASK, qj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};
template <> struct GBISResults<2> {
  float3 force;
  float dEdaSum;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dEdaSum = WARP_SHUFFLE(WARP_FULL_MASK, dEdaSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<2> param, const GBISInput<2> inp,
  GBISResults<2>& resI, GBISResults<2>& resJ, float& energyT) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r   = r2 * r_i;
    //float bornRadJ = sh_jBornRad[j];

    //calculate GB energy
    float qiqj = inp.qi*inp.qj;
    float aiaj = inp.bornRadI*inp.bornRadJ;
    float aiaj4 = 4*aiaj;
    float expr2aiaj4 = exp(-r2/aiaj4);
    float fij = sqrt(r2 + aiaj*expr2aiaj4);
    float f_i = 1.f/fij;
    float expkappa = exp(-param.kappa*fij);
    float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
    float gbEij = qiqj*Dij*f_i;
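
    // The block above is the generalized Born (Still) pair energy
    //   E_ij = -q_i q_j D_ij / f_ij,  with
    //   f_ij = sqrt(r^2 + a_i a_j exp(-r^2/(4 a_i a_j)))   (a = Born radius)
    //   D_ij = 1/eps_p - exp(-kappa*f_ij)/eps_s
    // The minus sign and the overall scaling are folded into qi, which
    // initQi() set to -q*param.scaling.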

    //calculate energy derivatives
    float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
    float ddrf_i = -ddrfij*f_i*f_i;
    float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
    float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);

    //NAMD smoothing function
    float scale = 1.f;
    float ddrScale = 0.f;
    float forcedEdr;
    if (param.smoothDist > 0.f) {
      scale = r2 * param.r_cut_2 - 1.f;
      scale *= scale;
      ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
      if (doEnergy) energyT += gbEij * scale;
      forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
    } else {
      if (doEnergy) energyT += gbEij;
      forcedEdr = ddrGbEij;
    }
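
    // The smoothing switch is S(r) = (r^2/r_cut^2 - 1)^2, which is 1 at r = 0
    // and reaches 0 at r = r_cut with zero slope. Its derivative,
    //   dS/dr = 4 r (r^2 - r_cut^2) / r_cut^4,
    // is exactly ddrScale above: the host precomputes param.r_cut_4 = 4/r_cut^4
    // and param.r_cut_2 = 1/r_cut^2 in GBISphase2().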

    //add dEda
    if (doSlow) {
      float dEdai = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
      resI.dEdaSum += dEdai;
      float dEdaj = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
      resJ.dEdaSum += dEdaj;
    }

    // Project the radial derivative onto the components: F = (dE/dr)(delta/r)
    forcedEdr *= r_i;
    float tmpx = dx*forcedEdr;
    float tmpy = dy*forcedEdr;
    float tmpz = dz*forcedEdr;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  } // within cutoff

  //GB Self Energy
  if (doEnergy) {
    if (r2 < 0.01f) {
      float fij = inp.bornRadI;//inf
      float expkappa = exp(-param.kappa*fij);//0
      float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
      float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
      energyT += 0.5f*gbEij;
    } //same atom or within cutoff
  }
}
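
// Self-energy note: at r ~ 0 the "pair" is the atom with itself, f_ij reduces
// to the Born radius a_i, and the 0.5f prefactor yields the Born self energy
// (1/2) q_i^2 D_ii / a_i. Dividing qi by -param.scaling undoes the charge
// pre-scaling applied in initQi().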
__device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.dEdaSum);
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
// ------------------------------------------------------------------------------------------------
// Phase 3 specialization

template <> struct GBISParam<3> {
  float a_cut;
};
template <> struct GBISInput<3> {
  float qi;
  float intRadSJ, intRadJ0, intRadIS;
  float dHdrPrefixI, dHdrPrefixJ;
  // inp1 = intRad0, inp2 = intRadS, inp3 = dHdrPrefix (see GBISphase3 below)
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi          = inp1[i];
    intRadIS    = inp2[i];
    dHdrPrefixI = inp3[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    intRadJ0    = inp1[i];
    intRadSJ    = inp2[i];
    dHdrPrefixJ = inp3[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<3> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};
template <> struct GBISResults<3> {
  float3 force;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<3> param, const GBISInput<3> inp,
  GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r   = r2 * r_i;
    float dhij, dhji;
    int dij, dji;
    CalcDH(r, r2, r_i, param.a_cut, inp.qi, inp.intRadSJ, dhij, dij);
    CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);

    float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
    float tmpx = dx * forceAlpha;
    float tmpy = dy * forceAlpha;
    float tmpz = dz * forceAlpha;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

}
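
// Phase 3 applies the chain rule dE/dr = sum (dE/da)(da/dH)(dH/dr): CalcDH
// (from ComputeGBIS.inl) supplies dH/dr for both orderings of the pair, the
// per-atom dHdrPrefix factors uploaded between phases fold together the
// remaining dE/da * da/dH terms, and the r_i factor turns the radial
// derivative into per-component forces via dx/dy/dz.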
__device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res, float* out, float4* forces) {
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
// ------------------------------------------------------------------------------------------------

//
// GBIS kernel: one warp processes one tile list; the phase template parameter
// selects the GBISParam/GBISInput/GBISResults specializations above
//
template <bool doEnergy, bool doSlow, int phase>
__global__ void
__launch_bounds__(WARPSIZE*GBISKERNEL_NUM_WARP, 6)
GBIS_Kernel(const int numTileLists,
  const TileList* __restrict__ tileLists,
  const int* __restrict__ tileJatomStart,
  const PatchPairRecord* __restrict__ patchPairs,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* __restrict__ xyzq,
  const float cutoff2,
  const GBISParam<phase> param,
  const float* __restrict__ inp1,
  const float* __restrict__ inp2,
  const float* __restrict__ inp3,
  float* __restrict__ out,
  float4* __restrict__ forces,
  TileListVirialEnergy* __restrict__ virialEnergy) {

  // Single warp takes care of one list of tiles
  for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;
       itileList < numTileLists;
       itileList += blockDim.x/WARPSIZE*gridDim.x)
  {
    TileList tmp = tileLists[itileList];
    int iatomStart = tmp.iatomStart;
    int jtileStart = tmp.jtileStart;
    int jtileEnd   = tmp.jtileEnd;
    PatchPairRecord PPStmp = patchPairs[itileList];
    int iatomSize = PPStmp.iatomSize;
    int jatomSize = PPStmp.jatomSize;

    float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
    float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
    float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
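
    // shx/shy/shz is the periodic-image shift for this tile list: the image
    // offset tmp.offsetXYZ, expressed in Cartesian coordinates through the
    // lattice vectors lata/latb/latc.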

    // Warp index (0...warpsize-1)
    const int wid = threadIdx.x % WARPSIZE;

    // Load i-atom data (and shift coordinates)
    float4 xyzq_i = xyzq[iatomStart + wid];
    xyzq_i.x += shx;
    xyzq_i.y += shy;
    xyzq_i.z += shz;

    GBISInput<phase> inp;
    inp.loadI(iatomStart + wid, inp1, inp2, inp3);
    if (phase == 2) inp.initQi(param, xyzq_i.w);

    // Number of i loops
    int nloopi = min(iatomSize - iatomStart, WARPSIZE);

    GBISResults<phase> resI;
    resI.init();
    float energyT;
    if (doEnergy) energyT = 0.0f;

    for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {

      // Load j-atom starting index
      int jatomStart = tileJatomStart[jtile];

      // Load coordinates and charge
      float4 xyzq_j = xyzq[jatomStart + wid];

      inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
      if (phase == 2) inp.initQj(xyzq_j.w);

      // Number of j loops
      int nloopj = min(jatomSize - jatomStart, WARPSIZE);

      const bool diag_tile = (iatomStart == jatomStart);
      const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
      int t = (phase != 2 && diag_tile) ? 1 : 0;
      if (phase != 2 && diag_tile) {
        inp.shuffleNext();
      }

      GBISResults<phase> resJ;
      resJ.init();
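
      // Diagonal tiles (i-block == j-block) would otherwise visit each pair
      // twice. For phases 1 and 3, t starts at 1 (skipping the self pair, with
      // one pre-rotation of the j inputs to match), and modval = 2*WARPSIZE-1
      // keeps j = t+wid from wrapping, so j >= WARPSIZE fails the j < nloopj
      // test and each unordered pair is processed exactly once; calcGBISPhase
      // updates both resI and resJ, so both atoms still receive their
      // contributions. Phase 2 keeps the self pair (t = 0) because it
      // evaluates the Born self-energy at r2 < 0.01f.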

      for (;t < WARPSIZE;t++) {
        int j = (t + wid) & modval;

        float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
        float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
        float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;

        float r2 = dx*dx + dy*dy + dz*dz;

        if (j < nloopj && wid < nloopi) {
          calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
        }

        // Rotate j-side inputs and partial j results to the next lane
        inp.shuffleNext();
        resJ.shuffleNext();
      } // t

      // Write j
      writeResult(jatomStart + wid, resJ, out, forces);

    } // jtile

    // Write i
    writeResult(iatomStart + wid, resI, out, forces);

    // Reduce energy across the warp in double precision, one entry per tile list
    if (doEnergy) {
      typedef cub::WarpReduce<double> WarpReduceDouble;
      __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
      int warpId = threadIdx.x / WARPSIZE;
      volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
      if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
    }

  } // itileList
}
// ##############################################################################################
// ##############################################################################################
// ##############################################################################################
//
// Class constructor
//
CudaComputeGBISKernel::CudaComputeGBISKernel(int deviceID) : deviceID(deviceID) {
  cudaCheck(cudaSetDevice(deviceID));
  // Device buffers are allocated lazily; start empty
  intRad0 = NULL;     intRad0Size = 0;
  intRadS = NULL;     intRadSSize = 0;
  psiSum  = NULL;     psiSumSize  = 0;
  bornRad = NULL;     bornRadSize = 0;
  dEdaSum = NULL;     dEdaSumSize = 0;
  dHdrPrefix = NULL;  dHdrPrefixSize = 0;
}
//
// Class destructor
//
CudaComputeGBISKernel::~CudaComputeGBISKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  if (intRad0 != NULL) deallocate_device<float>(&intRad0);
  if (intRadS != NULL) deallocate_device<float>(&intRadS);
  if (psiSum != NULL) deallocate_device<float>(&psiSum);
  if (bornRad != NULL) deallocate_device<float>(&bornRad);
  if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
  if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
}
//
// Update (copy Host->Device) intRad0, intRadS
//
void CudaComputeGBISKernel::updateIntRad(const int atomStorageSize, float* intRad0H, float* intRadSH,
  cudaStream_t stream) {

  reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
  reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);

  copy_HtoD<float>(intRad0H, intRad0, atomStorageSize, stream);
  copy_HtoD<float>(intRadSH, intRadS, atomStorageSize, stream);
}
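
// reallocate_device grows a buffer only when atomStorageSize exceeds the
// tracked capacity, overallocating by the 1.2f factor so small increases in
// atom count do not force a reallocation every step.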
//
// Update (copy Host->Device) bornRad
//
void CudaComputeGBISKernel::updateBornRad(const int atomStorageSize, float* bornRadH, cudaStream_t stream) {
  reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(bornRadH, bornRad, atomStorageSize, stream);
}
//
// Update (copy Host->Device) dHdrPrefix
//
void CudaComputeGBISKernel::update_dHdrPrefix(const int atomStorageSize, float* dHdrPrefixH, cudaStream_t stream) {
  reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(dHdrPrefixH, dHdrPrefix, atomStorageSize, stream);
}
//
// Phase 1
//
void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum,
  cudaStream_t stream) {

  reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(psiSum, atomStorageSize, stream);

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<1> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>>
  (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, NULL, psiSum, NULL, NULL);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize, stream);
}
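
// The three phases are driven from the host with CPU-side reductions in
// between: psi sums from phase 1 become Born radii, and dEda sums from phase 2
// become the dHdrPrefix factors consumed by phase 3. A schematic of the
// sequence the owning compute object performs (illustrative pseudocode only;
// computeBornRadii and computeDHdrPrefix are hypothetical stand-ins for the
// CPU-side GBIS steps and are not part of this class):
//
//   kernel.GBISphase1(tlKernel, n, lata, latb, latc, a_cut, h_psiSum, stream);
//   // ... synchronize, then on the CPU:
//   computeBornRadii(h_psiSum, h_bornRad);        // hypothetical helper
//   kernel.updateBornRad(n, h_bornRad, stream);
//   kernel.GBISphase2(tlKernel, n, doEnergy, doSlow, lata, latb, latc,
//                     r_cut, scaling, kappa, smoothDist, eps_p, eps_s,
//                     d_forces, h_dEdaSum, stream);
//   computeDHdrPrefix(h_dEdaSum, h_dHdrPrefix);   // hypothetical helper
//   kernel.update_dHdrPrefix(n, h_dHdrPrefix, stream);
//   kernel.GBISphase3(tlKernel, n, lata, latb, latc, a_cut, d_forces, stream);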
//
// Phase 2
//
void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const bool doEnergy, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float r_cut, const float scaling, const float kappa, const float smoothDist,
  const float epsilon_p, const float epsilon_s,
  float4* d_forces, float* h_dEdaSum, cudaStream_t stream) {

  reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(dEdaSum, atomStorageSize, stream);

  if (doEnergy) {
    tlKernel.setTileListVirialEnergyGBISLength(tlKernel.getNumTileListsGBIS());
  }

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;

  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<2> param;
  param.r_cut2 = r_cut*r_cut;
  param.r_cut_2 = 1.f / param.r_cut2;
  param.r_cut_4 = 4.f*param.r_cut_2*param.r_cut_2;
  param.epsilon_s_i = 1.f / epsilon_s;
  param.epsilon_p_i = 1.f / epsilon_p;
  param.scaling = scaling;
  param.kappa = kappa;
  param.smoothDist = smoothDist;
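
  // doEnergy/doSlow are template parameters of GBIS_Kernel, so the four
  // combinations below each select a fully specialized kernel at compile time
  // instead of branching per thread at run time; the CALL macro just keeps the
  // four launch statements identical.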
#define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \
    <<< nblock, nthread, 0, stream >>> \
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
      tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
      param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())

  if (!doEnergy && !doSlow) CALL(false, false);
  if (!doEnergy &&  doSlow) CALL(false, true);
  if ( doEnergy && !doSlow) CALL(true, false);
  if ( doEnergy &&  doSlow) CALL(true, true);

#undef CALL

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize, stream);
}
//
// Phase 3
//
void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut,
  float4* d_forces, cudaStream_t stream) {

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<3> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>>
  (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);

  cudaCheck(cudaGetLastError());
}