NAMD
ComputeGridForceCUDAKernel.cu
#ifdef NAMD_HIP
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // NAMD_HIP

#ifdef NAMD_CUDA
#include <cuda.h>
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif // NAMD_CUDA

#include "ComputeGridForceCUDAKernel.h"

// CUDA needs everything in one compilation unit.
#include "GridforceGridCUDAKernel.h"


#ifdef NODEGROUP_FORCE_REGISTER

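// Computes external grid forces for the atoms assigned to a gridforce grid.
// Template flags select work at compile time:
//   T_DOENERGY - accumulate the grid potential energy
//   T_DOVIRIAL - accumulate the net force and virial contributions
//   BLOCKS     - thread-block size, required by cub::BlockReduce
// Each block reduces its partial sums, atomically adds them to the device
// totals, and the last block to finish copies the totals to host-mapped memory.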
template<int T_DOENERGY, int T_DOVIRIAL, int BLOCKS>
__global__ void computeGridForceKernel(
  const GridforceGridCUDA& grid,
  const Lattice lat,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const char3* __restrict d_transform,
  const int* __restrict gridForcedAtomIdxArr,
  const int* __restrict gridForcedAtomIdxMap,
  const float* __restrict d_gridded_charge,
  const float* __restrict d_gridded_scale,
  double* __restrict d_f_normal_x,
  double* __restrict d_f_normal_y,
  double* __restrict d_f_normal_z,
  double3* __restrict h_netForce,
  double3* __restrict d_netForce,
  double* __restrict h_netEnergy,
  double* __restrict d_netEnergy,
  cudaTensor* __restrict h_virial,
  cudaTensor* __restrict d_virial,
  const int numGriddedAtoms,
  unsigned int* __restrict d_tbcatomic
  )
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  // Shared so that the completion flag written by thread 0 is visible to the
  // whole block after the barrier; as a plain local it would be read
  // uninitialized by the other threads.
  __shared__ bool isLastBlockDone;
  if(threadIdx.x == 0){
    isLastBlockDone = false;
  }
  __syncthreads();

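  // Per-thread partial sums, zeroed here and reduced across the block below.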
  double energy;
  if (T_DOENERGY) {
    energy = 0.0;
  }
  cudaTensor g_virial;
  double3 g_netForce;
  if (T_DOVIRIAL) {
    g_virial.xx = 0.0; g_virial.xy = 0.0; g_virial.xz = 0.0;
    g_virial.yx = 0.0; g_virial.yy = 0.0; g_virial.yz = 0.0;
    g_virial.zx = 0.0; g_virial.zy = 0.0; g_virial.zz = 0.0;
    g_netForce.x = 0.0;
    g_netForce.y = 0.0;
    g_netForce.z = 0.0;
  }
  double f_x, f_y, f_z;
  Vector gfScale = grid.get_scale();

  // Wrap coordinates using grid center

  // FIXME: going back to struct form for the position is
  // suboptimal, but this is a shortcut to getting something we
  // can use without rewriting all the gridforce code; otherwise
  // we would have to SoAify both wrap_position and the chain of
  // functions in compute_VdV, which is a rat's nest of
  // computational code that would likely be better rewritten
  // entirely from scratch, because performance does not seem to
  // have been the overriding priority in the initial
  // implementation. This modest violation of best practices
  // at least lets us compute all the grid updates inside the
  // device, without having to touch the host side between
  // migration steps. Even using the GPU badly is likely to
  // perform orders of magnitude faster than syncing back to the
  // host every step to use the current host-side scheme.
  if(tid < numGriddedAtoms)
  {
    int indexG = gridForcedAtomIdxMap[tid];
    int soaID = gridForcedAtomIdxArr[indexG];
    Position pos_i = Position(d_pos_x[soaID], d_pos_y[soaID], d_pos_z[soaID]);

// about 5x faster, but the energy and forces differ slightly, likely due to
// missing handling of the gapscale factor around the periodic boundary
//#define USE_INTERPOLATE_FORCE_D
#ifdef USE_INTERPOLATE_FORCE_D
    Position pos_wrapped = grid.wrap_position(pos_i, lat);
    ForceEnergy gridForceEnergy = grid.interpolateForceD(pos_wrapped);
    f_x = gridForceEnergy.force.x * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x;
    f_y = gridForceEnergy.force.y * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.y;
    f_z = gridForceEnergy.force.z * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.z;
    if(T_DOENERGY && gfScale.x == gfScale.y && gfScale.x == gfScale.z)
    {
      energy = gridForceEnergy.energy * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x;
    }
    if (1){ // matches the closing brace of "if (!err)" in the #else branch
#else
    // Here's where the action happens
    float Vval;
    Vector dV;
    Position pos = grid.wrap_position(pos_i, lat);
    int err = grid.compute_VdV(pos, Vval, dV);
    if (!err) {
      //Force force = scale * Tensor::diagonal(gfScale) * (-charge * dV);
      f_x = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x * dV.x;
      f_y = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.y * dV.y;
      f_z = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.z * dV.z;
      if(T_DOENERGY && gfScale.x == gfScale.y && gfScale.x == gfScale.z)
      {
        // only makes sense when scaling is isotropic
        energy += d_gridded_scale[indexG] * gfScale.x *
          (d_gridded_charge[indexG] * Vval);
      }
#endif // use force interpolation
#ifdef ATOMIC_ADD_GRID_FORCES
      // If run in different streams, each grid could race over a
      // specific atom's force if any atoms are in multiple grids.
      // In practice we run the grids in the same stream and they
      // are usually non-overlapping, so this is not necessary.

      // FYI: the performance difference is barely measurable
      atomicAdd(&d_f_normal_x[soaID], f_x);
      atomicAdd(&d_f_normal_y[soaID], f_y);
      atomicAdd(&d_f_normal_z[soaID], f_z);
#else
      d_f_normal_x[soaID] += f_x;
      d_f_normal_y[soaID] += f_y;
      d_f_normal_z[soaID] += f_z;
#endif

      if(T_DOVIRIAL){
        char3 t = d_transform[soaID];
        Position vpos = lat.reverse_transform(pos_i, t);
        g_netForce.x = f_x;
        g_netForce.y = f_y;
        g_netForce.z = f_z;
        g_virial.xx = f_x * vpos.x;
        g_virial.xy = f_x * vpos.y;
        g_virial.xz = f_x * vpos.z;
        g_virial.yx = f_y * vpos.x;
        g_virial.yy = f_y * vpos.y;
        g_virial.yz = f_y * vpos.z;
        g_virial.zx = f_z * vpos.x;
        g_virial.zy = f_z * vpos.y;
        g_virial.zz = f_z * vpos.z;
      }
    }
  }
#if 1
  // Reduce energy and virials
  typedef cub::BlockReduce<double, BLOCKS> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
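  // The same temp_storage is reused for every reduction below, so a
  // __syncthreads() barrier is required between successive BlockReduce calls.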

  if(T_DOENERGY){
    energy = BlockReduce(temp_storage).Sum(energy);
    __syncthreads();
  }

  if(T_DOVIRIAL){
    g_netForce.x = BlockReduce(temp_storage).Sum(g_netForce.x);
    __syncthreads();
    g_netForce.y = BlockReduce(temp_storage).Sum(g_netForce.y);
    __syncthreads();
    g_netForce.z = BlockReduce(temp_storage).Sum(g_netForce.z);
    __syncthreads();
    g_virial.xx = BlockReduce(temp_storage).Sum(g_virial.xx);
    __syncthreads();
    g_virial.xy = BlockReduce(temp_storage).Sum(g_virial.xy);
    __syncthreads();
    g_virial.xz = BlockReduce(temp_storage).Sum(g_virial.xz);
    __syncthreads();
    g_virial.yx = BlockReduce(temp_storage).Sum(g_virial.yx);
    __syncthreads();
    g_virial.yy = BlockReduce(temp_storage).Sum(g_virial.yy);
    __syncthreads();
    g_virial.yz = BlockReduce(temp_storage).Sum(g_virial.yz);
    __syncthreads();
    g_virial.zx = BlockReduce(temp_storage).Sum(g_virial.zx);
    __syncthreads();
    g_virial.zy = BlockReduce(temp_storage).Sum(g_virial.zy);
    __syncthreads();
    g_virial.zz = BlockReduce(temp_storage).Sum(g_virial.zz);
    __syncthreads();
  }

  if(threadIdx.x == 0){
    if(T_DOENERGY){
      atomicAdd(d_netEnergy, energy);
    }
    if(T_DOVIRIAL){
      atomicAdd(&(d_netForce->x), g_netForce.x);
      atomicAdd(&(d_netForce->y), g_netForce.y);
      atomicAdd(&(d_netForce->z), g_netForce.z);
      atomicAdd(&(d_virial->xx), g_virial.xx);
      atomicAdd(&(d_virial->xy), g_virial.xy);
      atomicAdd(&(d_virial->xz), g_virial.xz);
      atomicAdd(&(d_virial->yx), g_virial.yx);
      atomicAdd(&(d_virial->yy), g_virial.yy);
      atomicAdd(&(d_virial->yz), g_virial.yz);
      atomicAdd(&(d_virial->zx), g_virial.zx);
      atomicAdd(&(d_virial->zy), g_virial.zy);
      atomicAdd(&(d_virial->zz), g_virial.zz);
    }
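    // Make the block's atomic updates visible device-wide, then count
    // finished blocks; the block whose increment completes the count is the
    // last one done.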
    __threadfence();
    unsigned int value = atomicInc(d_tbcatomic, totaltb);
    isLastBlockDone = (value == (totaltb - 1));
  }
#endif


  __syncthreads();

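  // Only the last block to finish publishes the device totals to host-mapped
  // memory and resets the device accumulators for the next invocation.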
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      //updates to host-mapped mem
      if(T_DOENERGY){
        h_netEnergy[0] = d_netEnergy[0];
      }
      if(T_DOVIRIAL){
        h_netForce->x = d_netForce->x;
        h_netForce->y = d_netForce->y;
        h_netForce->z = d_netForce->z;
        h_virial->xx = d_virial->xx;
        h_virial->xy = d_virial->xy;
        h_virial->xz = d_virial->xz;
        h_virial->yx = d_virial->yx;
        h_virial->yy = d_virial->yy;
        h_virial->yz = d_virial->yz;
        h_virial->zx = d_virial->zx;
        h_virial->zy = d_virial->zy;
        h_virial->zz = d_virial->zz;
      }
      if(T_DOENERGY){
        d_netEnergy[0] = 0;
      }
      if(T_DOVIRIAL){
        d_netForce->x = 0;
        d_netForce->y = 0;
        d_netForce->z = 0;
        d_virial->xx = 0;
        d_virial->xy = 0;
        d_virial->xz = 0;
        d_virial->yx = 0;
        d_virial->yy = 0;
        d_virial->yz = 0;
        d_virial->zx = 0;
        d_virial->zy = 0;
        d_virial->zz = 0;
      }
      d_tbcatomic[0] = 0;
      __threadfence();
    }
  }
}

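// Host-side launcher: packs the doEnergy/doVirial flags into a template
// dispatch, picks a launch configuration based on the number of gridded
// atoms, and enqueues computeGridForceKernel on the given stream.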
void computeGridForce(
  const int doEnergy,
  const int doVirial,
  const GridforceGridCUDA& theGrid,
  const Lattice lat,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const char3* __restrict d_transform,
  const int* gridForcedAtomIdxArr,
  const int* gridForcedAtomIdxMap,
  const float* d_gridded_charge,
  const float* d_gridded_scale,
  double* __restrict d_f_normal_x,
  double* __restrict d_f_normal_y,
  double* __restrict d_f_normal_z,
  double3* __restrict h_netForce,
  double3* __restrict d_netForce,
  double* __restrict h_netEnergy,
  double* __restrict d_netEnergy,
  cudaTensor* __restrict h_virial,
  cudaTensor* __restrict d_virial,
  const int numGriddedAtoms,
  unsigned int* d_tbcatomic,
  cudaStream_t stream
  )
{
  // most grids are not that big; put them in one block

  // for (int idx = 0; idx < numGriddedAtoms; idx++)
  // TODO: use the block to replace the loop
  int options = doEnergy + (doVirial << 1);
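  // options encodes the flags as a bitmask (bit 0 = doEnergy, bit 1 = doVirial),
  // matching the cases of the switch statements below.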

  if(numGriddedAtoms > 1024)
  {
    const int blocks = 128;
    const int grid = (numGriddedAtoms + blocks - 1)/blocks;

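    // CALL_DO_CALC instantiates and launches the kernel with compile-time
    // DOENERGY/DOVIRIAL flags so that unused code paths are compiled out.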
#define CALL_DO_CALC(DOENERGY, DOVIRIAL) \
    computeGridForceKernel<DOENERGY, DOVIRIAL, blocks> \
      <<<grid, blocks, 0, stream>>>( theGrid, \
        lat, \
        d_pos_x, \
        d_pos_y, \
        d_pos_z, \
        d_transform, \
        gridForcedAtomIdxArr, \
        gridForcedAtomIdxMap, \
        d_gridded_charge, \
        d_gridded_scale, \
        d_f_normal_x, \
        d_f_normal_y, \
        d_f_normal_z, \
        h_netForce, \
        d_netForce, \
        h_netEnergy, \
        d_netEnergy, \
        h_virial, \
        d_virial, \
        numGriddedAtoms, \
        d_tbcatomic \
      );
    switch(options) {
      case 0: CALL_DO_CALC(0, 0); break;
      case 1: CALL_DO_CALC(1, 0); break;
      case 2: CALL_DO_CALC(0, 1); break;
      case 3: CALL_DO_CALC(1, 1); break;
    }
#undef CALL_DO_CALC
  }
  else
  { // relatively few gridded atoms can go in one threadblock
    const int blocks = 1024;
    const int grid = 1;
#define CALL_DO_CALC(DOENERGY, DOVIRIAL) \
    computeGridForceKernel<DOENERGY, DOVIRIAL, blocks> \
      <<<grid, blocks, 0, stream>>>( theGrid, \
        lat, \
        d_pos_x, \
        d_pos_y, \
        d_pos_z, \
        d_transform, \
        gridForcedAtomIdxArr, \
        gridForcedAtomIdxMap, \
        d_gridded_charge, \
        d_gridded_scale, \
        d_f_normal_x, \
        d_f_normal_y, \
        d_f_normal_z, \
        h_netForce, \
        d_netForce, \
        h_netEnergy, \
        d_netEnergy, \
        h_virial, \
        d_virial, \
        numGriddedAtoms, \
        d_tbcatomic \
      );
    switch(options) {
      case 0: CALL_DO_CALC(0, 0); break;
      case 1: CALL_DO_CALC(1, 0); break;
      case 2: CALL_DO_CALC(0, 1); break;
      case 3: CALL_DO_CALC(1, 1); break;
    }
#undef CALL_DO_CALC
  }
}


#endif // NODEGROUP_FORCE_REGISTER