NAMD
CudaComputeGBISKernel.cu
#ifdef NAMD_CUDA
#include <cuda.h>
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif // CUDACC version
#endif // NAMD_CUDA
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // NAMD_HIP

#include "HipDefines.h"
#include "CudaUtils.h"
#include "CudaTileListKernel.h"
#include "CudaComputeGBISKernel.h"
#include "DeviceCUDA.h"

#define GBIS_CUDA
#include "ComputeGBIS.inl"
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;

#define LARGE_FLOAT (float)(1.0e10)

#define GBISKERNEL_NUM_WARP 4

__device__ __forceinline__
void shuffleNext(float& w) {
  w = WARP_SHUFFLE(WARP_FULL_MASK, w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
}
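// The rotation above is the kernel's core data-exchange trick: repeatedly
// shuffling from lane (threadIdx.x+1) & (WARPSIZE-1) walks every lane's data
// past every other lane without touching shared memory. A minimal sketch of
// the pattern (a hypothetical demo kernel, not part of NAMD; compiled out;
// launch with a single warp):
#if 0
__global__ void ringShuffleDemo(const float* in, float* sums) {
  float rotating = in[threadIdx.x];   // one value per lane
  float sum = 0.0f;
  // After WARPSIZE rotations each lane has accumulated every lane's value.
  for (int t = 0; t < WARPSIZE; t++) {
    sum += rotating;
    rotating = WARP_SHUFFLE(WARP_FULL_MASK, rotating,
                            (threadIdx.x + 1) & (WARPSIZE - 1), WARPSIZE);
  }
  sums[threadIdx.x] = sum;            // every lane holds the warp-wide total
}
#endif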

// Generic
template <int phase> struct GBISParam {};
template <int phase> struct GBISInput {};
template <int phase> struct GBISResults {};

// ------------------------------------------------------------------------------------------------
// Phase 1 specialization

template <> struct GBISParam<1> {
  float a_cut;
};

template <> struct GBISInput<1> {
  float qi, qj;
  float intRad0j, intRadSi;
  // inp1 = intRad0
  // inp2 = intRadS
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi       = inp1[i];
    intRadSi = inp2[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qj       = inp2[i];
    intRad0j = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<1> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    qj       = WARP_SHUFFLE(WARP_FULL_MASK, qj,       (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRad0j = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<1> {
  float psiSum;
  __device__ __forceinline__ void init() {psiSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    psiSum = WARP_SHUFFLE(WARP_FULL_MASK, psiSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

// CalcH evaluates the one-directional H(i,j) term [and not H(j,i)],
// so it is called twice per pair below (hij and hji).
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<1> param, const GBISInput<1> inp,
  GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    float hij;
    int dij;
    CalcH(r, r2, r_i, param.a_cut, inp.qi, inp.qj, hij, dij);
    resI.psiSum += hij;
    float hji;
    int dji;
    CalcH(r, r2, r_i, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
    resJ.psiSum += hji;
  }

}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.psiSum);
}
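// Summary of the phase 1 computation above: psiSum accumulates the GBIS
// descreening sum
//
//   psi_i = sum_j H(r_ij, a_cut, rho_i, rho_j)
//
// where CalcH (ComputeGBIS.inl) evaluates one direction only, so it is
// called twice per pair: hij pairs intRad0[i] with intRadS[j], and hji
// pairs intRad0[j] with intRadS[i].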

// ------------------------------------------------------------------------------------------------
// Phase 2

template <> struct GBISParam<2> {
  float kappa;
  float epsilon_p_i, epsilon_s_i;
  float smoothDist;
  float r_cut2, r_cut_2, r_cut_4;
  float scaling;
};

template <> struct GBISInput<2> {
  float qi, qj;
  float bornRadI, bornRadJ;
  // inp1 = bornRad
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadI = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadJ = inp1[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<2> param, const float q) {
    qi = -q*param.scaling;
  }
  __device__ __forceinline__ void initQj(const float q) {
    qj = q;
  }
  __device__ __forceinline__ void shuffleNext() {
    qj       = WARP_SHUFFLE(WARP_FULL_MASK, qj,       (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<2> {
  float3 force;
  float dEdaSum;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f; dEdaSum = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dEdaSum = WARP_SHUFFLE(WARP_FULL_MASK, dEdaSum, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<2> param, const GBISInput<2> inp,
  GBISResults<2>& resI, GBISResults<2>& resJ, float& energyT) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    //float bornRadJ = sh_jBornRad[j];

    // calculate GB energy
    float qiqj = inp.qi*inp.qj;
    float aiaj = inp.bornRadI*inp.bornRadJ;
    float aiaj4 = 4*aiaj;
    float expr2aiaj4 = exp(-r2/aiaj4);
    float fij = sqrt(r2 + aiaj*expr2aiaj4);
    float f_i = 1/fij;
    float expkappa = exp(-param.kappa*fij);
    float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
    float gbEij = qiqj*Dij*f_i;

    // calculate energy derivatives
    float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
    float ddrf_i = -ddrfij*f_i*f_i;
    float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
    float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);

    // NAMD smoothing function
    float scale = 1.f;
    float ddrScale = 0.f;
    float forcedEdr;
    if (param.smoothDist > 0.f) {
      scale = r2 * param.r_cut_2 - 1.f;
      scale *= scale;
      ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
      if (doEnergy) energyT += gbEij * scale;
      forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
    } else {
      if (doEnergy) energyT += gbEij;
      forcedEdr = ddrGbEij;
    }

    // add dEda
    if (doSlow) {
      float dEdai = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;
      resI.dEdaSum += dEdai;
      float dEdaj = 0.5f*qiqj*f_i*f_i
        *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
        *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;
      resJ.dEdaSum += dEdaj;
    }

    forcedEdr *= r_i;
    float tmpx = dx*forcedEdr;
    float tmpy = dy*forcedEdr;
    float tmpz = dz*forcedEdr;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

  // GB Self Energy
  if (doEnergy) {
    if (r2 < 0.01f) {
      float fij = inp.bornRadI;               // in the self-limit fij reduces to the Born radius
      float expkappa = exp(-param.kappa*fij);
      float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
      float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
      energyT += 0.5f*gbEij;
    } // same atom (r2 < 0.01f)
  }
}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.dEdaSum);
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
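// Summary of the phase 2 inner loop above, written out (this matches the
// code: epsilon_p_i and epsilon_s_i hold the reciprocals 1/eps_p and
// 1/eps_s, and initQi folds -scaling into qi):
//
//   f_ij = sqrt( r^2 + a_i a_j exp( -r^2 / (4 a_i a_j) ) )
//   E_ij = q_i q_j ( 1/eps_p - exp(-kappa f_ij)/eps_s ) / f_ij
//
// With smoothing enabled (smoothDist > 0), E_ij is further scaled by
// (r^2/r_cut^2 - 1)^2, and the doSlow branch accumulates dE/da_i and
// dE/da_j for the Born-radius chain rule completed in phase 3.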

// ------------------------------------------------------------------------------------------------
// Phase 3

template <> struct GBISParam<3> {
  float a_cut;
};

template <> struct GBISInput<3> {
  float qi;
  float intRadSJ, intRadJ0, intRadIS;
  float dHdrPrefixI, dHdrPrefixJ;
  // inp1 = intRad0
  // inp2 = intRadS
  // inp3 = dHdrPrefix
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    qi          = inp1[i];
    intRadIS    = inp2[i];
    dHdrPrefixI = inp3[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    intRadJ0    = inp1[i];
    intRadSJ    = inp2[i];
    dHdrPrefixJ = inp3[i];
  }
  __device__ __forceinline__ void initQi(const GBISParam<3> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  __device__ __forceinline__ void shuffleNext() {
    intRadSJ    = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ,    (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    intRadJ0    = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0,    (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <> struct GBISResults<3> {
  float3 force;
  __device__ __forceinline__ void init() {force.x = 0.0f; force.y = 0.0f; force.z = 0.0f;}
  __device__ __forceinline__ void shuffleNext() {
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
  }
};

template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<3> param, const GBISInput<3> inp,
  GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r = r2 * r_i;
    float dhij, dhji;
    int dij, dji;
    CalcDH(r, r2, r_i, param.a_cut, inp.qi, inp.intRadSJ, dhij, dij);
    CalcDH(r, r2, r_i, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);

    float forceAlpha = r_i*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
    float tmpx = dx * forceAlpha;
    float tmpy = dy * forceAlpha;
    float tmpz = dz * forceAlpha;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

}

__device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res, float* out, float4* forces) {
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
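// Summary of the phase 3 force above: CalcDH supplies dH/dr for each
// direction, and the host-computed prefixes convert them into the
// Born-radius contribution to the force,
//
//   F_i += ( dHdrPrefix_i * dHij/dr + dHdrPrefix_j * dHji/dr ) * (dx,dy,dz) / r
//
// with the equal and opposite contribution subtracted from atom j.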

// ------------------------------------------------------------------------------------------------

//
// GBIS kernel
//
template <bool doEnergy, bool doSlow, int phase>
__global__ void
__launch_bounds__(WARPSIZE*GBISKERNEL_NUM_WARP, 6)
GBIS_Kernel(const int numTileLists,
  const TileList* __restrict__ tileLists,
  const int* __restrict__ tileJatomStart,
  const PatchPairRecord* __restrict__ patchPairs,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* __restrict__ xyzq,
  const float cutoff2,
  const GBISParam<phase> param,
  const float* __restrict__ inp1,
  const float* __restrict__ inp2,
  const float* __restrict__ inp3,
  float* __restrict__ out,
  float4* __restrict__ forces,
  TileListVirialEnergy* __restrict__ virialEnergy) {

  // A single warp takes care of one list of tiles
  for (int itileList = threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x; itileList < numTileLists;
       itileList += blockDim.x/WARPSIZE*gridDim.x)
  {
    TileList tmp = tileLists[itileList];
    int iatomStart = tmp.iatomStart;
    int jtileStart = tmp.jtileStart;
    int jtileEnd   = tmp.jtileEnd;
    PatchPairRecord PPStmp = patchPairs[itileList];
    int iatomSize  = PPStmp.iatomSize;
    int jatomSize  = PPStmp.jatomSize;

    float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
    float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
    float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;

    // Lane index within the warp (0...WARPSIZE-1)
    const int wid = threadIdx.x % WARPSIZE;

    // Load i-atom data (and shift coordinates)
    float4 xyzq_i = xyzq[iatomStart + wid];
    xyzq_i.x += shx;
    xyzq_i.y += shy;
    xyzq_i.z += shz;

    GBISInput<phase> inp;
    inp.loadI(iatomStart + wid, inp1, inp2, inp3);
    if (phase == 2) inp.initQi(param, xyzq_i.w);

    // Number of i loops
    int nloopi = min(iatomSize - iatomStart, WARPSIZE);

    GBISResults<phase> resI;
    resI.init();
    float energyT;
    if (doEnergy) energyT = 0.0f;

    for (int jtile=jtileStart; jtile <= jtileEnd; jtile++) {

      // Load j-atom starting index
      int jatomStart = tileJatomStart[jtile];

      // Load coordinates and charge
      float4 xyzq_j = xyzq[jatomStart + wid];

      inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
      if (phase == 2) inp.initQj(xyzq_j.w);

      // Number of j loops
      int nloopj = min(jatomSize - jatomStart, WARPSIZE);

      // Diagonal tile (same i and j tile): use a 2*WARPSIZE index range so
      // each unordered pair is evaluated only once; phases 1 and 3 also skip
      // the t = 0 self term, which phase 2 keeps for the GB self energy.
      const bool diag_tile = (iatomStart == jatomStart);
      const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
      int t = (phase != 2 && diag_tile) ? 1 : 0;
      if (phase != 2 && diag_tile) {
        inp.shuffleNext();
      }

      GBISResults<phase> resJ;
      resJ.init();

      for (; t < WARPSIZE; t++) {
        int j = (t + wid) & modval;

        float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
        float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
        float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;

        float r2 = dx*dx + dy*dy + dz*dz;

        if (j < nloopj && wid < nloopi) {
          calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
        }

        inp.shuffleNext();
        resJ.shuffleNext();
      } // t

      // Write j
      writeResult(jatomStart + wid, resJ, out, forces);

    } // jtile

    // Write i
    writeResult(iatomStart + wid, resI, out, forces);

    // Reduce energy over the warp
    if (doEnergy) {
      typedef cub::WarpReduce<double> WarpReduceDouble;
      __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
      int warpId = threadIdx.x / WARPSIZE;
      volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
      if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
    }

  }
}
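// Work decomposition of GBIS_Kernel: each warp owns one tile list and walks
// its j-tiles serially; GBISKERNEL_NUM_WARP warps share a thread block, and
// __launch_bounds__(WARPSIZE*GBISKERNEL_NUM_WARP, 6) asks the compiler to
// keep register use low enough for six resident blocks per multiprocessor.
// The grid-stride loop over itileList lets a fixed-size grid cover any
// number of tile lists.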

#undef GBIS_CUDA

// ##############################################################################################
// ##############################################################################################
// ##############################################################################################

//
// Class constructor
//
CudaComputeGBISKernel::CudaComputeGBISKernel(int deviceID) : deviceID(deviceID) {
  cudaCheck(cudaSetDevice(deviceID));

  intRad0 = NULL;
  intRad0Size = 0;

  intRadS = NULL;
  intRadSSize = 0;

  psiSum = NULL;
  psiSumSize = 0;

  bornRad = NULL;
  bornRadSize = 0;

  dEdaSum = NULL;
  dEdaSumSize = 0;

  dHdrPrefix = NULL;
  dHdrPrefixSize = 0;

}

//
// Class destructor
//
CudaComputeGBISKernel::~CudaComputeGBISKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  if (intRad0 != NULL) deallocate_device<float>(&intRad0);
  if (intRadS != NULL) deallocate_device<float>(&intRadS);
  if (psiSum != NULL) deallocate_device<float>(&psiSum);
  if (bornRad != NULL) deallocate_device<float>(&bornRad);
  if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
  if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
}

//
// Update (copy Host->Device) intRad0, intRadS
//
void CudaComputeGBISKernel::updateIntRad(const int atomStorageSize, float* intRad0H, float* intRadSH,
  cudaStream_t stream) {

  reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
  reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);

  copy_HtoD<float>(intRad0H, intRad0, atomStorageSize, stream);
  copy_HtoD<float>(intRadSH, intRadS, atomStorageSize, stream);
}

//
// Update (copy Host->Device) bornRad
//
void CudaComputeGBISKernel::updateBornRad(const int atomStorageSize, float* bornRadH, cudaStream_t stream) {
  reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(bornRadH, bornRad, atomStorageSize, stream);
}

//
// Update (copy Host->Device) dHdrPrefix
//
void CudaComputeGBISKernel::update_dHdrPrefix(const int atomStorageSize, float* dHdrPrefixH, cudaStream_t stream) {
  reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(dHdrPrefixH, dHdrPrefix, atomStorageSize, stream);
}
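// The three update methods above share one idiom: grow the device buffer,
// then issue an asynchronous H->D copy on the caller's stream (the 1.2f
// factor passed to reallocate_device overallocates the new capacity so that
// repeated size increases do not reallocate every step). A hypothetical
// sketch of adding another per-atom buffer in the same style (placeholder
// names, compiled out):
#if 0
void CudaComputeGBISKernel::updateFoo(const int atomStorageSize, float* fooH,
  cudaStream_t stream) {
  reallocate_device<float>(&foo, &fooSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(fooH, foo, atomStorageSize, stream);
}
#endif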

//
// Phase 1
//
void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum,
  cudaStream_t stream) {

  reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(psiSum, atomStorageSize, stream);

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<1> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>>
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, NULL, psiSum, NULL, NULL);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize, stream);
}
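// Host-side ordering note: GBIS is a three-pass method. GBISphase1 returns
// h_psiSum, from which the host derives Born radii for updateBornRad()
// before GBISphase2; phase 2 returns h_dEdaSum, from which the host builds
// dHdrPrefix for update_dHdrPrefix() ahead of GBISphase3. The D->H copy
// above is asynchronous, so the caller must synchronize on `stream` before
// reading h_psiSum.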

//
// Phase 2
//
void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const bool doEnergy, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float r_cut, const float scaling, const float kappa, const float smoothDist,
  const float epsilon_p, const float epsilon_s,
  float4* d_forces, float* h_dEdaSum, cudaStream_t stream) {

  reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(dEdaSum, atomStorageSize, stream);

  if (doEnergy) {
    tlKernel.setTileListVirialEnergyGBISLength(tlKernel.getNumTileListsGBIS());
  }

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;

  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<2> param;
  param.r_cut2 = r_cut*r_cut;
  param.r_cut_2 = 1.f / param.r_cut2;
  param.r_cut_4 = 4.f*param.r_cut_2*param.r_cut_2;
  param.epsilon_s_i = 1.f / epsilon_s;
  param.epsilon_p_i = 1.f / epsilon_p;
  param.scaling = scaling;
  param.kappa = kappa;
  param.smoothDist = smoothDist;

#define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \
  <<< nblock, nthread, 0, stream >>> \
  (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
  tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
  param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())

  if (!doEnergy && !doSlow) CALL(false, false);
  if (!doEnergy &&  doSlow) CALL(false, true);
  if ( doEnergy && !doSlow) CALL(true, false);
  if ( doEnergy &&  doSlow) CALL(true, true);

#undef CALL

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize, stream);
}

//
// Phase 3
//
void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut,
  float4* d_forces, cudaStream_t stream) {

  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;
  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  GBISParam<3> param;
  param.a_cut = a_cut;

  float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>>
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);

  cudaCheck(cudaGetLastError());
}