NAMD
ComputeCOMCudaKernel.h
#ifndef COM_CUDA_KERNEL_H
#define COM_CUDA_KERNEL_H

#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif // NAMD_CUDA

//JVV No reason not to do this, no?
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include "HipDefines.h"
#define cub hipcub
#endif

#include "CudaUtils.h"
#include "CudaRecord.h"
#include "Lattice.h"

#ifdef NODEGROUP_FORCE_REGISTER
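
// Computes the center of mass (COM) of a single atom group. Each thread loads
// one atom, unwraps its coordinate with the lattice transform, and accumulates
// the mass-weighted position. cub::BlockReduce sums within each block,
// atomicAdd combines the blocks into d_curCM, and the last block to finish
// scales the total by inv_group_mass, writes the COM to the host-visible
// curCM, and resets the device accumulator and the tbcatomic counter.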
template <int T_BLOCK_SIZE>
__global__ static void computeCOMKernel(
  const int numAtoms,
  const double inv_group_mass,
  const Lattice lat,
  const float * __restrict mass,
  const double* __restrict pos_x,
  const double* __restrict pos_y,
  const double* __restrict pos_z,
  const char3* __restrict transform,
  const int* __restrict atomsSOAIndex,
  double3* __restrict d_curCM,
  double3* __restrict curCM,
  unsigned int* __restrict tbcatomic){
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  bool isLastBlockDone = false;
  double3 cm = {0, 0, 0};

  if(tid < numAtoms){
    int SOAindex = atomsSOAIndex[tid];
    // uncoalesced memory access: too bad!
    double m = mass[SOAindex]; // Cast from float to double here
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    cm.x = pos.x * m;
    cm.y = pos.y * m;
    cm.z = pos.z * m;
  }

  // now reduce the values and add it to thread zero
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  cm.x = BlockReduce(temp_storage).Sum(cm.x);
  __syncthreads();
  cm.y = BlockReduce(temp_storage).Sum(cm.y);
  __syncthreads();
  cm.z = BlockReduce(temp_storage).Sum(cm.z);
  __syncthreads();

  // add calculated (pos * mass) in each block
  if(threadIdx.x == 0){
    atomicAdd(&(d_curCM->x), cm.x);
    atomicAdd(&(d_curCM->y), cm.y);
    atomicAdd(&(d_curCM->z), cm.z);

    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb -1));
  }
  __syncthreads();

  // Last block will set the host COM values
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      // thread zero updates the host COM value
      curCM->x = d_curCM->x * inv_group_mass; // We calculate COM here
      curCM->y = d_curCM->y * inv_group_mass; // We calculate COM here
      curCM->z = d_curCM->z * inv_group_mass; // We calculate COM here
      // set the device values to zero
      d_curCM->x = 0.0;
      d_curCM->y = 0.0;
      d_curCM->z = 0.0;
      // resets atomic counter
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
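
// A minimal host-side launch sketch for computeCOMKernel, kept out of
// compilation. The helper name, the block size, the stream handling, and the
// assumptions that curCM points to host-mapped (or managed) memory and that
// d_curCM and tbcatomic are zero-initialized before the first call are
// assumptions here, not NAMD's actual call site.
#if 0
static void launchComputeCOMSketch(
  const int numAtoms, const double inv_group_mass, const Lattice lat,
  const float* d_mass, const double* d_pos_x, const double* d_pos_y,
  const double* d_pos_z, const char3* d_transform, const int* d_atomsSOAIndex,
  double3* d_curCM, double3* curCM, unsigned int* d_tbcatomic,
  cudaStream_t stream)
{
  constexpr int blockSize = 128;  // assumed block size
  const int grid = (numAtoms + blockSize - 1) / blockSize;
  computeCOMKernel<blockSize><<<grid, blockSize, 0, stream>>>(
    numAtoms, inv_group_mass, lat, d_mass, d_pos_x, d_pos_y, d_pos_z,
    d_transform, d_atomsSOAIndex, d_curCM, curCM, d_tbcatomic);
}
#endif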
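
// Multi-GPU variant: accumulates this device's partial mass-weighted position
// sum into its slot of the peer-accessible array, d_peer_curCM[deviceIndex].
// The cross-device sum and the division by the group mass are done later
// (see computeDistCOMKernelMgpu and copyDistCOMKernel below).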
template <int T_BLOCK_SIZE>
__global__ static void computeCOMKernelMgpu(
  const int numAtoms,
  const Lattice lat,
  const float * __restrict mass,
  const double* __restrict pos_x,
  const double* __restrict pos_y,
  const double* __restrict pos_z,
  const char3* __restrict transform,
  const int* __restrict atomsSOAIndex,
  double3** __restrict d_peer_curCM,
  const int numDevices,
  const int deviceIndex,
  unsigned int* __restrict tbcatomic){
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  double3 cm = {0, 0, 0};
  if(tid < numAtoms){
    int SOAindex = atomsSOAIndex[tid];
    // uncoalesced memory access: too bad!
    double m = mass[SOAindex]; // Cast from float to double here
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    cm.x = pos.x * m;
    cm.y = pos.y * m;
    cm.z = pos.z * m;
  }

  // now reduce the values and add it to thread zero
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  cm.x = BlockReduce(temp_storage).Sum(cm.x);
  __syncthreads();
  cm.y = BlockReduce(temp_storage).Sum(cm.y);
  __syncthreads();
  cm.z = BlockReduce(temp_storage).Sum(cm.z);
  __syncthreads();

  // add calculated (pos * mass) in each block
  if(threadIdx.x == 0){
    atomicAdd(&(d_peer_curCM[deviceIndex][0].x), cm.x);
    atomicAdd(&(d_peer_curCM[deviceIndex][0].y), cm.y);
    atomicAdd(&(d_peer_curCM[deviceIndex][0].z), cm.z);
  }
}
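
// Sums the per-device partial COM contributions stored in the peer pool
// d_peerCOM into d_curCOM. Launch with at least numDevices threads; thread
// tid adds device tid's contribution.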
__global__ static void computeDistCOMKernelMgpu(
  double3** __restrict d_peerCOM,
  double3* __restrict d_curCOM,
  const int numDevices){
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < numDevices) {
    atomicAdd(&(d_curCOM->x), d_peerCOM[tid][0].x);
    atomicAdd(&(d_curCOM->y), d_peerCOM[tid][0].y);
    atomicAdd(&(d_curCOM->z), d_peerCOM[tid][0].z);
  }
}
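
// Copies each device's partial COM from the peer pool d_peerCOM into the
// host-accessible output buffer h_peerCOM, one device per thread.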
__global__ static void copyDistCOMKernel(
  double3** __restrict d_peerCOM,
  double3* __restrict h_peerCOM,
  const int numDevices){
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < numDevices) {
    h_peerCOM[tid].x = d_peerCOM[tid][0].x;
    h_peerCOM[tid].y = d_peerCOM[tid][0].y;
    h_peerCOM[tid].z = d_peerCOM[tid][0].z;
  }
}
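
// Computes the centers of mass of two atom groups in a single pass.
// groupAtomsSOAIndex is expected to hold the group-1 atoms in its first
// numGroup1Atoms entries and the group-2 atoms in the remaining
// (totalNumRestrained - numGroup1Atoms) entries, which the branch on tid
// below relies on. The reduction and last-block finalization mirror
// computeCOMKernel, with separate accumulators d_curCM1/d_curCM2 and
// host-visible results curCM1/curCM2.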
template <int T_BLOCK_SIZE>
__global__ static void compute2COMKernel(
  const int numGroup1Atoms,
  const int totalNumRestrained,
  const double inv_group1_mass,
  const double inv_group2_mass,
  const Lattice lat,
  const float * __restrict mass,
  const double* __restrict pos_x,
  const double* __restrict pos_y,
  const double* __restrict pos_z,
  const char3* __restrict transform,
  const int* __restrict groupAtomsSOAIndex,
  double3* __restrict d_curCM1,
  double3* __restrict d_curCM2,
  double3* __restrict curCM1,
  double3* __restrict curCM2,
  unsigned int* __restrict tbcatomic){

  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  bool isLastBlockDone = false;
  double3 com1 = {0, 0, 0};
  double3 com2 = {0, 0, 0};

  if (tid < totalNumRestrained) {
    int SOAindex = groupAtomsSOAIndex[tid];
    // uncoalesced memory access: too bad!
    double m = mass[SOAindex]; // Cast from float to double here
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    if(tid < numGroup1Atoms){
      // threads [0 , numGroup1Atoms) calculate COM of group 1
      // we initialized the com2 to zero
      com1.x = pos.x * m;
      com1.y = pos.y * m;
      com1.z = pos.z * m;
    } else {
      // threads [numGroup1Atoms , totalNumRestrained) calculate COM of group 2
      // we initialized the com1 to zero
      com2.x = pos.x * m;
      com2.y = pos.y * m;
      com2.z = pos.z * m;
    }
  }
  __syncthreads();

  // now reduce the values and add it to thread zero
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  com1.x = BlockReduce(temp_storage).Sum(com1.x);
  __syncthreads();
  com1.y = BlockReduce(temp_storage).Sum(com1.y);
  __syncthreads();
  com1.z = BlockReduce(temp_storage).Sum(com1.z);
  __syncthreads();
  com2.x = BlockReduce(temp_storage).Sum(com2.x);
  __syncthreads();
  com2.y = BlockReduce(temp_storage).Sum(com2.y);
  __syncthreads();
  com2.z = BlockReduce(temp_storage).Sum(com2.z);
  __syncthreads();

  // add calculated (pos * mass) in each block
  if(threadIdx.x == 0){
    atomicAdd(&(d_curCM1->x), com1.x);
    atomicAdd(&(d_curCM1->y), com1.y);
    atomicAdd(&(d_curCM1->z), com1.z);
    atomicAdd(&(d_curCM2->x), com2.x);
    atomicAdd(&(d_curCM2->y), com2.y);
    atomicAdd(&(d_curCM2->z), com2.z);

    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb -1));
  }
  __syncthreads();

  // Last block will set the host COM values
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      // thread zero updates the host COM value
      curCM1->x = d_curCM1->x * inv_group1_mass; // We calculate COM here
      curCM1->y = d_curCM1->y * inv_group1_mass; // We calculate COM here
      curCM1->z = d_curCM1->z * inv_group1_mass; // We calculate COM here
      curCM2->x = d_curCM2->x * inv_group2_mass; // We calculate COM here
      curCM2->y = d_curCM2->y * inv_group2_mass; // We calculate COM here
      curCM2->z = d_curCM2->z * inv_group2_mass; // We calculate COM here
      // set the device values to zero
      d_curCM1->x = 0.0;
      d_curCM1->y = 0.0;
      d_curCM1->z = 0.0;
      d_curCM2->x = 0.0;
      d_curCM2->y = 0.0;
      d_curCM2->z = 0.0;
      // resets atomic counter
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
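
// Registers this device's d_peerCOM buffer in the shared pool
// (d_peerPool[deviceIndex]) and zeroes its first numDevices entries so the
// multi-GPU COM kernels start from a clean accumulator.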
__global__ static void initPeerCOMKernel(
  const int numDevices,
  const int deviceIndex,
  double3** d_peerPool,
  double3* d_peerCOM)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if(tid==0)
    d_peerPool[deviceIndex]=d_peerCOM;
  if(tid < numDevices){
    d_peerCOM[tid].x = 0.0;
    d_peerCOM[tid].y = 0.0;
    d_peerCOM[tid].z = 0.0;
  }
}
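
// A sketch of the multi-GPU call sequence these kernels support (illustrative
// only; grid sizes, streams, and the inter-device barrier are assumptions,
// not NAMD's actual driver code):
//   1. On every device i: initPeerCOMKernel<<<1, numDevices>>>(numDevices, i,
//      d_peerPool, d_peerCOM_i) to publish and zero that device's peer buffer.
//   2. On every device i: computeCOMKernelMgpu<BLOCK><<<grid_i, BLOCK>>>(...)
//      to accumulate that device's partial mass-weighted position sum.
//   3. After all devices have finished, on one device:
//      computeDistCOMKernelMgpu<<<1, numDevices>>>(d_peerPool, d_curCOM,
//      numDevices) to combine the partial sums, or
//      copyDistCOMKernel<<<1, numDevices>>>(d_peerPool, h_peerCOM, numDevices)
//      to expose the per-device partials to the host.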

#endif // NODEGROUP_FORCE_REGISTER
#endif // COM_CUDA_KERNEL_H