#include <hipcub/hipcub.hpp>
#if __CUDACC_VER_MAJOR__ >= 11
#include <namd_cub/cub.cuh>
#include "ComputeGridForceCUDAKernel.h"
// CUDA needs everything in one compilation unit.
#include "GridforceGridCUDAKernel.h"

#ifdef NODEGROUP_FORCE_REGISTER

template<int T_DOENERGY, int T_DOVIRIAL, int BLOCKS>
__global__ void computeGridForceKernel(
  const GridforceGridCUDA& grid,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const char3* __restrict d_transform,
  const int* __restrict gridForcedAtomIdxArr,
  const int* __restrict gridForcedAtomIdxMap,
  const float* __restrict d_gridded_charge,
  const float* __restrict d_gridded_scale,
  double* __restrict d_f_normal_x,
  double* __restrict d_f_normal_y,
  double* __restrict d_f_normal_z,
  double3* __restrict h_netForce,
  double3* __restrict d_netForce,
  double* __restrict h_netEnergy,
  double* __restrict d_netEnergy,
  cudaTensor* __restrict h_virial,
  cudaTensor* __restrict d_virial,
  const int numGriddedAtoms,
  unsigned int* __restrict d_tbcatomic
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
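  // One thread handles one gridded atom; totaltb (the number of thread blocks
  // in this launch) is kept for the last-block check on d_tbcatomic below.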
  g_virial.xx = 0.0; g_virial.xy = 0.0; g_virial.xz = 0.0;
  g_virial.yx = 0.0; g_virial.yy = 0.0; g_virial.yz = 0.0;
  g_virial.zx = 0.0; g_virial.zy = 0.0; g_virial.zz = 0.0;

  Vector gfScale = grid.get_scale();
  // Wrap coordinates using the grid center.
  // FIXME: Going back to struct form for the position is suboptimal, but it
  // is a shortcut to something usable without rewriting all of the gridforce
  // code; otherwise we would have to SoA-ify both wrap_position and the
  // chain of functions under compute_VdV. That is a rat's nest of
  // computational code that would probably be better rewritten from scratch,
  // since performance does not appear to have been the overriding priority
  // in the original implementation. This modest violation of best practices
  // at least lets us compute all of the grid updates on the device, without
  // touching the host side between migration steps. Even using the GPU badly
  // is likely to be orders of magnitude faster than syncing back to the host
  // every step, as the current host-side scheme does.
  if(tid < numGriddedAtoms)
    int indexG = gridForcedAtomIdxMap[tid];
    int soaID = gridForcedAtomIdxArr[indexG];
    Position pos_i = Position(d_pos_x[soaID], d_pos_y[soaID], d_pos_z[soaID]);
    // about 5x faster, but the energy and forces differ slightly, likely
    // because the gapscale factor around the periodic boundary is not handled
    //#define USE_INTERPOLATE_FORCE_D
#ifdef USE_INTERPOLATE_FORCE_D
    Position pos_wrapped = grid.wrap_position(pos_i, lat);
    ForceEnergy gridForceEnergy = grid.interpolateForceD(pos_wrapped);
    f_x = gridForceEnergy.force.x * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x;
    f_y = gridForceEnergy.force.y * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.y;
    f_z = gridForceEnergy.force.z * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.z;
    if(T_DOENERGY && gfScale.x == gfScale.y && gfScale.x == gfScale.z)
      energy = gridForceEnergy.energy * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x;
    // Evaluate the grid potential V and its gradient dV at the wrapped
    // position, then derive the force from them.
    Position pos = grid.wrap_position(pos_i, lat);
    int err = grid.compute_VdV(pos, Vval, dV);
    // Force force = scale * Tensor::diagonal(gfScale) * (-charge * dV);
    f_x = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x * dV.x;
    f_y = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.y * dV.y;
    f_z = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.z * dV.z;
    if(T_DOENERGY && gfScale.x == gfScale.y && gfScale.x == gfScale.z)
      // only makes sense when scaling is isotropic
      energy += d_gridded_scale[indexG] * gfScale.x *
        (d_gridded_charge[indexG] * Vval);
#endif // use force interpolation
#ifdef ATOMIC_ADD_GRID_FORCES
    // If the grids run in different streams, they could race on a given
    // atom's force whenever an atom belongs to multiple grids. In practice
    // we run them in the same stream and grids usually do not overlap, so
    // the atomics are not necessary.
    // FYI: the performance difference is barely measurable.
    atomicAdd(&d_f_normal_x[soaID], f_x);
    atomicAdd(&d_f_normal_y[soaID], f_y);
    atomicAdd(&d_f_normal_z[soaID], f_z);
#else
    d_f_normal_x[soaID] += f_x;
    d_f_normal_y[soaID] += f_y;
    d_f_normal_z[soaID] += f_z;
#endif // ATOMIC_ADD_GRID_FORCES

    char3 t = d_transform[soaID];
    Position vpos = lat.reverse_transform(pos_i, t);
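    // Per-atom virial contribution: the outer product of the applied force
    // with the unwrapped (reverse-transformed) position, g_virial_ab = f_a * r_b.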
    g_virial.xx = f_x * vpos.x;
    g_virial.xy = f_x * vpos.y;
    g_virial.xz = f_x * vpos.z;
    g_virial.yx = f_y * vpos.x;
    g_virial.yy = f_y * vpos.y;
    g_virial.yz = f_y * vpos.z;
    g_virial.zx = f_z * vpos.x;
    g_virial.zy = f_z * vpos.y;
    g_virial.zz = f_z * vpos.z;

  // Reduce energy and virials
  typedef cub::BlockReduce<double, BLOCKS> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
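  // The same temp_storage is reused by each successive Sum() below; CUB
  // requires a __syncthreads() barrier between reductions that reuse it.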
  energy = BlockReduce(temp_storage).Sum(energy);
  g_netForce.x = BlockReduce(temp_storage).Sum(g_netForce.x);
  g_netForce.y = BlockReduce(temp_storage).Sum(g_netForce.y);
  g_netForce.z = BlockReduce(temp_storage).Sum(g_netForce.z);
  g_virial.xx = BlockReduce(temp_storage).Sum(g_virial.xx);
  g_virial.xy = BlockReduce(temp_storage).Sum(g_virial.xy);
  g_virial.xz = BlockReduce(temp_storage).Sum(g_virial.xz);
  g_virial.yx = BlockReduce(temp_storage).Sum(g_virial.yx);
  g_virial.yy = BlockReduce(temp_storage).Sum(g_virial.yy);
  g_virial.yz = BlockReduce(temp_storage).Sum(g_virial.yz);
  g_virial.zx = BlockReduce(temp_storage).Sum(g_virial.zx);
  g_virial.zy = BlockReduce(temp_storage).Sum(g_virial.zy);
  g_virial.zz = BlockReduce(temp_storage).Sum(g_virial.zz);
  if(threadIdx.x == 0){
    atomicAdd(d_netEnergy, energy);
    atomicAdd(&(d_netForce->x), g_netForce.x);
    atomicAdd(&(d_netForce->y), g_netForce.y);
    atomicAdd(&(d_netForce->z), g_netForce.z);
    atomicAdd(&(d_virial->xx), g_virial.xx);
    atomicAdd(&(d_virial->xy), g_virial.xy);
    atomicAdd(&(d_virial->xz), g_virial.xz);
    atomicAdd(&(d_virial->yx), g_virial.yx);
    atomicAdd(&(d_virial->yy), g_virial.yy);
    atomicAdd(&(d_virial->yz), g_virial.yz);
    atomicAdd(&(d_virial->zx), g_virial.zx);
    atomicAdd(&(d_virial->zy), g_virial.zy);
    atomicAdd(&(d_virial->zz), g_virial.zz);
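    // Last-block detection: atomicInc returns the previous counter value, so
    // the block that observes totaltb - 1 knows every other block has finished
    // its atomics and takes the branch below that publishes the totals.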
    unsigned int value = atomicInc(d_tbcatomic, totaltb);
    isLastBlockDone = (value == (totaltb - 1));
    if(threadIdx.x == 0){
      // Publish the final totals to host-mapped memory so the host can read
      // them without a separate device-to-host copy.
      h_netEnergy[0] = d_netEnergy[0];
      h_netForce->x = d_netForce->x;
      h_netForce->y = d_netForce->y;
      h_netForce->z = d_netForce->z;
      h_virial->xx = d_virial->xx;
      h_virial->xy = d_virial->xy;
      h_virial->xz = d_virial->xz;
      h_virial->yx = d_virial->yx;
      h_virial->yy = d_virial->yy;
      h_virial->yz = d_virial->yz;
      h_virial->zx = d_virial->zx;
      h_virial->zy = d_virial->zy;
      h_virial->zz = d_virial->zz;
void computeGridForce(
  const GridforceGridCUDA& theGrid,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const char3* __restrict d_transform,
  const int* gridForcedAtomIdxArr,
  const int* gridForcedAtomIdxMap,
  const float* d_gridded_charge,
  const float* d_gridded_scale,
  double* __restrict d_f_normal_x,
  double* __restrict d_f_normal_y,
  double* __restrict d_f_normal_z,
  double3* __restrict h_netForce,
  double3* __restrict d_netForce,
  double* __restrict h_netEnergy,
  double* __restrict d_netEnergy,
  cudaTensor* __restrict h_virial,
  cudaTensor* __restrict d_virial,
  const int numGriddedAtoms,
  unsigned int* d_tbcatomic,
  // Most grids are not that big, so their atoms can go in one block.
  // for (int idx = 0; idx < numGriddedAtoms; idx++)
  // TODO: use the block to replace the loop
  int options = doEnergy + (doVirial << 1);
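  // options packs the runtime flags into a template selector: bit 0 = doEnergy,
  // bit 1 = doVirial, so the switch below picks one of the four
  // computeGridForceKernel<doEnergy, doVirial, blocks> instantiations.
  // For illustration, options == 3 dispatches roughly as
  //   computeGridForceKernel<1, 1, blocks><<<grid, blocks, 0, stream>>>(theGrid, ...);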
  if(numGriddedAtoms > 1024)
    const int blocks = 128;
    const int grid = (numGriddedAtoms + blocks - 1) / blocks;

#define CALL_DO_CALC(DOENERGY, DOVIRIAL) \
    computeGridForceKernel<DOENERGY, DOVIRIAL, blocks> \
      <<<grid, blocks, 0, stream>>>(theGrid, \
        gridForcedAtomIdxArr, \
        gridForcedAtomIdxMap, \

      case 0: CALL_DO_CALC(0, 0); break;
      case 1: CALL_DO_CALC(1, 0); break;
      case 2: CALL_DO_CALC(0, 1); break;
      case 3: CALL_DO_CALC(1, 1); break;
  { // relatively few gridded atoms, so they all fit in one thread block
    const int blocks = 1024;
#define CALL_DO_CALC(DOENERGY, DOVIRIAL) \
    computeGridForceKernel<DOENERGY, DOVIRIAL, blocks> \
      <<<grid, blocks, 0, stream>>>(theGrid, \
        gridForcedAtomIdxArr, \
        gridForcedAtomIdxMap, \

      case 0: CALL_DO_CALC(0, 0); break;
      case 1: CALL_DO_CALC(1, 0); break;
      case 2: CALL_DO_CALC(0, 1); break;
      case 3: CALL_DO_CALC(1, 1); break;

#endif // NODEGROUP_FORCE_REGISTER