NAMD
CudaComputeGBISKernel.cu
#ifdef NAMD_CUDA
#include <cuda.h>
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif //CUDACC version
#endif //NAMD_CUDA
#ifdef NAMD_HIP //NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include "HipDefines.h"
#define cub hipcub
#endif
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#define GBIS_CUDA
#include "ComputeGBIS.inl"
#include "DeviceCUDA.h"
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;

#define LARGE_FLOAT (float)(1.0e10)

#define GBISKERNEL_NUM_WARP 4

__device__ __forceinline__
void shuffleNext(float& w) {
  w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
}
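// Note: shuffleNext() above rotates w one lane to the left within the warp
// (lane k receives lane k+1's value), so WARPSIZE applications cycle every
// value past every lane. The per-phase GBISInput/GBISResults shuffleNext()
// methods below apply the same rotation to their own registers.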

// Generic
template <int phase> struct GBISParam {};
template <int phase> struct GBISInput {};
template <int phase> struct GBISResults {};

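// GBIS runs in three phases, each a specialization of the templates above:
//   phase 1 accumulates the pairwise descreening sums (psiSum),
//   phase 2 computes GB forces, the Born-radius derivative sums (dEdaSum),
//           and optionally the GB energy,
//   phase 3 applies the force contributions that act through the Born radii.
// A single GBIS_Kernel template (below) drives all three phases.
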
// ------------------------------------------------------------------------------------------------
// Phase 1 specialization

template <> struct GBISParam<1> {
  float a_cut;
};

template <> struct GBISInput<1> {
  float qi, qj;
  float intRad0j, intRadSi;
  // inp1 = intRad0
  // inp2 = intRadS
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi = inp1[i];
    intRadSi = inp2[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qj = inp2[i];
    intRad0j = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<1> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    qj = WARP_SHUFFLE(WARP_FULL_MASK, qj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRad0j = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<1> {
  float psiSum;
  __device__ __forceinline__ void init() {psiSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    psiSum = WARP_SHUFFLE(WARP_FULL_MASK, psiSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

// calculate H(i,j) [and not H(j,i)]
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<1> param, const GBISInput<1> inp,
  GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    float hij;
    int dij;
    CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
    resI.psiSum += hij;
    float hji;
    int dji;
    CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
    resJ.psiSum += hji;
  }

}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.psiSum);
}
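// Note: in phase 1 the qi/qj fields are reused to hold intRad0[i] and
// intRadS[j]; no charges enter the psi sums. Results are flushed with
// atomicAdd because several tile lists may update the same atom concurrently.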

// ------------------------------------------------------------------------------------------------
// Phase 2

template <> struct GBISParam<2> {
  float kappa;
  float epsilon_p_i, epsilon_s_i;
  float smoothDist;
  float r_cut2, r_cut_2, r_cut_4;
  float scaling;
};

template <> struct GBISInput<2> {
  float qi, qj;
  float bornRadI, bornRadJ;
  // inp1 = bornRad
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadI = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadJ = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<2> param, const float q) {
    qi = -q*param.scaling;
  }
  __device__ __forceinline__ void initQj(const float q) {
    qj = q;
  }
  __device__ __forceinline__ void shuffleNext() {
    qj = WARP_SHUFFLE(WARP_FULL_MASK, qj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<2> {
  float3 force;
  float dEdaSum;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dEdaSum = WARP_SHUFFLE(WARP_FULL_MASK, dEdaSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<2> param, const GBISInput<2> inp,
  GBISResults<2>& resI, GBISResults<2>& resJ, float& energyT) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    //float bornRadJ = sh_jBornRad[j];

    //calculate GB energy
    float qiqj = inp.qi*inp.qj;
    float aiaj = inp.bornRadI*inp.bornRadJ;
    float aiaj4 = 4*aiaj;
    float expr2aiaj4 = exp(-r2/aiaj4);
    float fij = sqrt(r2 + aiaj*expr2aiaj4);
    float f_i = 1/fij;
    float expkappa = exp(-param.kappa*fij);
    float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
    float gbEij = qiqj*Dij*f_i;

    //calculate energy derivatives
    float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
    float ddrf_i = -ddrfij*f_i*f_i;
    float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
    float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);

    //NAMD smoothing function
    float scale = 1.f;
    float ddrScale = 0.f;
    float forcedEdr;
    if (param.smoothDist > 0.f) {
      scale = r2 * param.r_cut_2 - 1.f;
      scale *= scale;
      ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
      if (doEnergy) energyT += gbEij * scale;
      forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
    } else {
      if (doEnergy) energyT += gbEij;
      forcedEdr = ddrGbEij;
    }

    //add dEda
    if (doSlow) {
      float dEdai = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;//0
      resI.dEdaSum += dEdai;
      float dEdaj = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;//0
      resJ.dEdaSum += dEdaj;
    }

    forcedEdr *= r_i;
    float tmpx = dx*forcedEdr;
    float tmpy = dy*forcedEdr;
    float tmpz = dz*forcedEdr;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

  // GB Self Energy
  if (doEnergy) {
    if (r2 < 0.01f) {
      float fij = inp.bornRadI;//inf
      float expkappa = exp(-param.kappa*fij);//0
      float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
      float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
      energyT += 0.5f*gbEij;
    } //same atom or within cutoff
  }
}
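// The pair term above is the standard GB interaction energy,
//   fij = sqrt(r^2 + ai*aj*exp(-r^2/(4*ai*aj))),
//   Eij = qi*qj*(1/eps_p - exp(-kappa*fij)/eps_s)/fij,
// with qi pre-multiplied by -scaling in initQi(). When smoothDist > 0 the
// energy and force are tapered by scale = (r^2/r_cut^2 - 1)^2, which goes
// smoothly to zero at the cutoff.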

__device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.dEdaSum);
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}

// ------------------------------------------------------------------------------------------------
// Phase 3
template <> struct GBISParam<3> {
  float a_cut;
};

template <> struct GBISInput<3> {
  float qi;
  float intRadSJ, intRadJ0, intRadIS;
  float dHdrPrefixI, dHdrPrefixJ;
  // inp1 = intRad0
  // inp2 = intRadS
  // inp3 = dHdrPrefix
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi = inp1[i];
    intRadIS = inp2[i];
    dHdrPrefixI = inp3[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    intRadJ0 = inp1[i];
    intRadSJ = inp2[i];
    dHdrPrefixJ = inp3[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<3> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<3> {
  float3 force;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<3> param, const GBISInput<3> inp,
  GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    float dhij, dhji;
    int dij, dji;
    CalcDH(r, r2, r_i, param.a_cut, inp.qi, inp.intRadSJ, dhij, dij);
    CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);

    float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
    float tmpx = dx * forceAlpha;
    float tmpy = dy * forceAlpha;
    float tmpz = dz * forceAlpha;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res, float* out, float4* forces) {
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
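// Phase 3 applies the chain-rule force that acts through the Born radii: the
// per-atom prefactor dHdrPrefix (supplied from the host between phases 2 and
// 3 via update_dHdrPrefix below) multiplies dH/dr for both the (i,j) and
// (j,i) orderings, and the combined magnitude acts along the i-j separation.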

// ------------------------------------------------------------------------------------------------

//
// GBIS kernel
//
template <bool doEnergy, bool doSlow, int phase>
__global__ void
GBIS_Kernel(const int numTileLists,
  const TileList* __restrict__ tileLists,
  const int* __restrict__ tileJatomStart,
  const PatchPairRecord* __restrict__ patchPairs,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* __restrict__ xyzq,
  const float cutoff2,
  const GBISParam<phase> param,
  const float* __restrict__ inp1,
  const float* __restrict__ inp2,
  const float* __restrict__ inp3,
  float* __restrict__ out,
  float4* __restrict__ forces,
  TileListVirialEnergy* __restrict__ virialEnergy) {

  // Single warp takes care of one list of tiles
  for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;itileList < numTileLists;itileList += blockDim.x/WARPSIZE*gridDim.x)
  {
    TileList tmp = tileLists[itileList];
    int iatomStart = tmp.iatomStart;
    int jtileStart = tmp.jtileStart;
    int jtileEnd   = tmp.jtileEnd;
    PatchPairRecord PPStmp = patchPairs[itileList];
    int iatomSize = PPStmp.iatomSize;
    int jatomSize = PPStmp.jatomSize;

    float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
    float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
    float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;

    // Thread's lane index within the warp (0...WARPSIZE-1)
    const int wid = threadIdx.x % WARPSIZE;

    // Load i-atom data (and shift coordinates)
    float4 xyzq_i = xyzq[iatomStart + wid];
    xyzq_i.x += shx;
    xyzq_i.y += shy;
    xyzq_i.z += shz;

    GBISInput<phase> inp;
    inp.loadI(iatomStart + wid, inp1, inp2, inp3);
    if (phase == 2) inp.initQi(param, xyzq_i.w);

    // Number of i loops
    int nloopi = min(iatomSize - iatomStart, WARPSIZE);

    GBISResults<phase> resI;
    resI.init();
    float energyT;
    if (doEnergy) energyT = 0.0f;

    for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {

      // Load j-atom starting index
      int jatomStart = tileJatomStart[jtile];

      // Load coordinates and charge
      float4 xyzq_j = xyzq[jatomStart + wid];

      inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
      if (phase == 2) inp.initQj(xyzq_j.w);

      // Number of j loops
      int nloopj = min(jatomSize - jatomStart, WARPSIZE);

      const bool diag_tile = (iatomStart == jatomStart);
      const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
      int t = (phase != 2 && diag_tile) ? 1 : 0;
      if (phase != 2 && diag_tile) {
        inp.shuffleNext();
      }
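      // For self-tiles (iatomStart == jatomStart) the 2*WARPSIZE-1 modulus
      // lets j run past nloopj during the second half of the rotation, so the
      // "j < nloopj" test below evaluates each i,j pair exactly once. Phases
      // 1 and 3 also start at t = 1 to skip the self pair, with one extra
      // inp.shuffleNext() to keep the j-side registers aligned; phase 2 keeps
      // t = 0 because its r2 < 0.01f branch accumulates the GB self energy.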

      GBISResults<phase> resJ;
      resJ.init();

      for (;t < WARPSIZE;t++) {
        int j = (t + wid) & modval;

        float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
        float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
        float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;

        float r2 = dx*dx + dy*dy + dz*dz;

        if (j < nloopj && wid < nloopi) {
          calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
        }

        inp.shuffleNext();
        resJ.shuffleNext();
      } // t

      // Write j
      writeResult(jatomStart + wid, resJ, out, forces);

    } // jtile

    // Write i
    writeResult(iatomStart + wid, resI, out, forces);

    // Reduce energy
    if (doEnergy) {
      typedef cub::WarpReduce<double> WarpReduceDouble;
      __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
      int warpId = threadIdx.x / WARPSIZE;
      volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
      if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
    }

  }
}

#undef GBIS_CUDA

// ##############################################################################################
// ##############################################################################################
// ##############################################################################################

//
// Class constructor
//
CudaComputeGBISKernel::CudaComputeGBISKernel(int deviceID) : deviceID(deviceID) {
  cudaCheck(cudaSetDevice(deviceID));

  intRad0 = NULL;
  intRad0Size = 0;

  intRadS = NULL;
  intRadSSize = 0;

  psiSum = NULL;
  psiSumSize = 0;

  bornRad = NULL;
  bornRadSize = 0;

  dEdaSum = NULL;
  dEdaSumSize = 0;

  dHdrPrefix = NULL;
  dHdrPrefixSize = 0;

}

//
// Class destructor
//
CudaComputeGBISKernel::~CudaComputeGBISKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  if (intRad0 != NULL) deallocate_device<float>(&intRad0);
  if (intRadS != NULL) deallocate_device<float>(&intRadS);
  if (psiSum != NULL) deallocate_device<float>(&psiSum);
  if (bornRad != NULL) deallocate_device<float>(&bornRad);
  if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
  if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
}

//
// Update (copy Host->Device) intRad0, intRadS
//
void CudaComputeGBISKernel::updateIntRad(const int atomStorageSize, float* intRad0H, float* intRadSH,
  cudaStream_t stream) {

  reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
  reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);

  copy_HtoD<float>(intRad0H, intRad0, atomStorageSize, stream);
  copy_HtoD<float>(intRadSH, intRadS, atomStorageSize, stream);
}

//
// Update (copy Host->Device) bornRad
//
void CudaComputeGBISKernel::updateBornRad(const int atomStorageSize, float* bornRadH, cudaStream_t stream) {
  reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(bornRadH, bornRad, atomStorageSize, stream);
}

//
// Update (copy Host->Device) dHdrPrefix
//
void CudaComputeGBISKernel::update_dHdrPrefix(const int atomStorageSize, float* dHdrPrefixH, cudaStream_t stream) {
  reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(dHdrPrefixH, dHdrPrefix, atomStorageSize, stream);
}

//
// Phase 1
//
void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum,
  cudaStream_t stream) {

  reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(psiSum, atomStorageSize, stream);

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<1> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);
  GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>> (
    tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, NULL, psiSum, NULL, NULL);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize, stream);
}

//
// Phase 2
//
void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const bool doEnergy, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float r_cut, const float scaling, const float kappa, const float smoothDist,
  const float epsilon_p, const float epsilon_s,
  float4* d_forces, float* h_dEdaSum, cudaStream_t stream) {

  reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(dEdaSum, atomStorageSize, stream);

  if (doEnergy) {
    tlKernel.setTileListVirialEnergyGBISLength(tlKernel.getNumTileListsGBIS());
  }

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;

  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<2> param;
  param.r_cut2 = r_cut*r_cut;
  param.r_cut_2 = 1.f / param.r_cut2;
  param.r_cut_4 = 4.f*param.r_cut_2*param.r_cut_2;
  param.epsilon_s_i = 1.f / epsilon_s;
  param.epsilon_p_i = 1.f / epsilon_p;
  param.scaling = scaling;
  param.kappa = kappa;
  param.smoothDist = smoothDist;

#define CALL(DOENERGY, DOSLOW) \
  GBIS_Kernel<DOENERGY, DOSLOW, 2> <<< nblock, nthread, 0, stream >>> (\
  tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
  tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
  param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())

  if (!doEnergy && !doSlow) CALL(false, false);
  if (!doEnergy &&  doSlow) CALL(false, true);
  if ( doEnergy && !doSlow) CALL(true, false);
  if ( doEnergy &&  doSlow) CALL(true, true);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize, stream);
}

//
// Phase 3
//
void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut,
  float4* d_forces, cudaStream_t stream) {

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<3> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);
  GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>> (
    tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);

  cudaCheck(cudaGetLastError());
}