// NAMD: ComputeConsForceCUDAKernel.cu
#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#else // NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // end NAMD_CUDA vs. NAMD_HIP

#include "HipDefines.h"

#include "ComputeConsForceCUDAKernel.h"

#ifdef NODEGROUP_FORCE_REGISTER

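/*
 * Applies user-specified constant forces ("consforce") to the selected atoms.
 * Each thread handles one entry of the constant-force list: it scales the
 * stored force by `scaling` and atomically adds it into the SOA force arrays.
 * When T_DOVIRIAL is set, the kernel also accumulates the net external force
 * and the external virial: per-thread contributions are summed across the
 * block with cub::BlockReduce, folded into global accumulators with atomics,
 * and the last block to finish copies the totals to the host-visible buffers.
 */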
template<bool T_DOVIRIAL>
__global__ void computeConsForceKernel(
  const int nConsForceAtoms,
  const int* __restrict d_consForceSOA,
  const int* __restrict d_consForceID,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const double3* __restrict d_consForce,
  const char3* __restrict d_transform,
  double* __restrict f_normal_x,
  double* __restrict f_normal_y,
  double* __restrict f_normal_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_extVirial,
  const double scaling,
  const Lattice lat,
  unsigned int* __restrict tbcatomic,
  cudaStream_t stream
)
{
  int tid = threadIdx.x + (blockIdx.x * blockDim.x);

  int totaltb = gridDim.x;
  // Shared flag: written by thread 0, read by the whole block after the
  // barrier below. It must be __shared__, not a per-thread local.
  __shared__ bool isLastBlockDone;

  if(threadIdx.x == 0){
    isLastBlockDone = false;
  }

  __syncthreads();
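  // Per-thread partial sums; threads past nConsForceAtoms keep these at zero
  // so they contribute nothing to the block reduction below.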
  double3 pos = {0, 0, 0};
  double3 pos_i = {0, 0, 0};
  double3 r_netForce = {0, 0, 0};
  double3 scale;
  cudaTensor r_virial;
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;

  if(tid < nConsForceAtoms){
    int forceID = d_consForceID[tid];
    int soaID = d_consForceSOA[forceID];
    scale.x = scaling * d_consForce[forceID].x;
    scale.y = scaling * d_consForce[forceID].y;
    scale.z = scaling * d_consForce[forceID].z;
    atomicAdd(&(f_normal_x[soaID]), scale.x);
    atomicAdd(&(f_normal_y[soaID]), scale.y);
    atomicAdd(&(f_normal_z[soaID]), scale.z);
    r_netForce.x += scale.x;
    r_netForce.y += scale.y;
    r_netForce.z += scale.z;
    if(T_DOVIRIAL)
    {
      char3 t = d_transform[soaID];
      pos.x = d_pos_x[soaID];
      pos.y = d_pos_y[soaID];
      pos.z = d_pos_z[soaID];
      pos_i = lat.reverse_transform(pos, t);
      r_virial.xx = scale.x * pos_i.x;
      r_virial.xy = scale.x * pos_i.y;
      r_virial.xz = scale.x * pos_i.z;
      r_virial.yx = scale.y * pos_i.x;
      r_virial.yy = scale.y * pos_i.y;
      r_virial.yz = scale.y * pos_i.z;
      r_virial.zx = scale.z * pos_i.x;
      r_virial.zy = scale.z * pos_i.y;
      r_virial.zz = scale.z * pos_i.z;
    }
  }
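  // Block-level reduction of the per-thread contributions. A single shared
  // TempStorage is reused for all twelve sums, hence the __syncthreads()
  // between consecutive BlockReduce calls.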
  __syncthreads();
  if(T_DOVIRIAL){
    typedef cub::BlockReduce<double, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    r_netForce.x = BlockReduce(temp_storage).Sum(r_netForce.x);
    __syncthreads();
    r_netForce.y = BlockReduce(temp_storage).Sum(r_netForce.y);
    __syncthreads();
    r_netForce.z = BlockReduce(temp_storage).Sum(r_netForce.z);
    __syncthreads();

    r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
    __syncthreads();
    r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
    __syncthreads();
    r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
    __syncthreads();

    r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
    __syncthreads();
    r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
    __syncthreads();
    r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
    __syncthreads();

    r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
    __syncthreads();
    r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
    __syncthreads();
    r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
    __syncthreads();
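    // Thread 0 folds this block's partial sums into the global accumulators
    // and bumps the arrival counter; the last block to arrive publishes the
    // totals to the host.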
    if(threadIdx.x == 0){
      atomicAdd(&(d_virial->xx), r_virial.xx);
      atomicAdd(&(d_virial->xy), r_virial.xy);
      atomicAdd(&(d_virial->xz), r_virial.xz);

      atomicAdd(&(d_virial->yx), r_virial.yx);
      atomicAdd(&(d_virial->yy), r_virial.yy);
      atomicAdd(&(d_virial->yz), r_virial.yz);

      atomicAdd(&(d_virial->zx), r_virial.zx);
      atomicAdd(&(d_virial->zy), r_virial.zy);
      atomicAdd(&(d_virial->zz), r_virial.zz);

      atomicAdd(&(d_extForce->x), r_netForce.x);
      atomicAdd(&(d_extForce->y), r_netForce.y);
      atomicAdd(&(d_extForce->z), r_netForce.z);

      // Make this block's global-memory updates visible before signaling.
      __threadfence();
      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
      isLastBlockDone = (value == (totaltb - 1));
    }
    __syncthreads();
    // Last block will set the host values

    if(isLastBlockDone){
      if(threadIdx.x == 0){
        h_extVirial->xx = d_virial->xx;
        h_extVirial->xy = d_virial->xy;
        h_extVirial->xz = d_virial->xz;
        h_extVirial->yx = d_virial->yx;
        h_extVirial->yy = d_virial->yy;
        h_extVirial->yz = d_virial->yz;
        h_extVirial->zx = d_virial->zx;
        h_extVirial->zy = d_virial->zy;
        h_extVirial->zz = d_virial->zz;

        // reset the device virial value
        d_virial->xx = 0;
        d_virial->xy = 0;
        d_virial->xz = 0;
        d_virial->yx = 0;
        d_virial->yy = 0;
        d_virial->yz = 0;
        d_virial->zx = 0;
        d_virial->zy = 0;
        d_virial->zz = 0;

        h_extForce->x = d_extForce->x;
        h_extForce->y = d_extForce->y;
        h_extForce->z = d_extForce->z;
        d_extForce->x = 0;
        d_extForce->y = 0;
        d_extForce->z = 0;
        // reset the atomic counter
        tbcatomic[0] = 0;
        __threadfence();
      }
    }
  }
}

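/*
 * Host-side launcher: resolves the doVirial flag at runtime into the
 * corresponding templated kernel instantiation and enqueues it on `stream`.
 */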
void computeConsForce(
  const bool doVirial,
  const int nConsForceAtoms,
  const int* __restrict d_consForceSOA,
  const int* __restrict d_consForceID,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const double3* __restrict d_consForce,
  const char3* __restrict d_transform,
  double* __restrict f_normal_x,
  double* __restrict f_normal_y,
  double* __restrict f_normal_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_virial,
  const double scaling,
  const Lattice lat,
  unsigned int* __restrict d_tbcatomic,
  cudaStream_t stream
)
{
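  // 128 threads per block; this must match the BlockReduce<double, 128>
  // template parameter hard-coded in the kernel.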
  const int blocks = 128;
  const int grid = (nConsForceAtoms + blocks - 1) / blocks;

  if(doVirial){
    computeConsForceKernel<true><<<grid, blocks, 0, stream>>>(
      nConsForceAtoms,
      d_consForceSOA,
      d_consForceID,
      d_pos_x,
      d_pos_y,
      d_pos_z,
      d_consForce,
      d_transform,
      f_normal_x,
      f_normal_y,
      f_normal_z,
      d_extForce,
      h_extForce,
      d_virial,
      h_virial,
      scaling,
      lat,
      d_tbcatomic,
      stream);
  }
  else
  {
    computeConsForceKernel<false><<<grid, blocks, 0, stream>>>(
      nConsForceAtoms,
      d_consForceSOA,
      d_consForceID,
      d_pos_x,
      d_pos_y,
      d_pos_z,
      d_consForce,
      d_transform,
      f_normal_x,
      f_normal_y,
      f_normal_z,
      d_extForce,
      h_extForce,
      d_virial,
      h_virial,
      scaling,
      lat,
      d_tbcatomic,
      stream);
  }
}
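/*
 * Example call site -- a minimal, hypothetical sketch for illustration, not
 * NAMD's actual driver code. The buffer names mirror the parameters above.
 * h_extForce / h_virial are expected to be host-mapped (pinned) allocations,
 * since the last kernel block writes the reduced totals to them directly,
 * and d_tbcatomic should be zero-initialized before the first launch:
 *
 *   computeConsForce(
 *       doVirial, nConsForceAtoms,
 *       d_consForceSOA, d_consForceID,
 *       d_pos_x, d_pos_y, d_pos_z,
 *       d_consForce, d_transform,
 *       f_normal_x, f_normal_y, f_normal_z,
 *       d_extForce, h_extForce,
 *       d_virial, h_virial,
 *       scaling, lat, d_tbcatomic, stream);
 *
 *   cudaStreamSynchronize(stream);  // totals now in *h_extForce / *h_virial
 */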

#endif // NODEGROUP_FORCE_REGISTER