NAMD
ComputeCOMCudaKernel.h
Go to the documentation of this file.
1 #ifndef COM_CUDA_KERNEL_H
2 #define COM_CUDA_KERNEL_H
3 
4 #ifdef NAMD_CUDA
5 #if __CUDACC_VER_MAJOR__ >= 11
6 #include <cub/cub.cuh>
7 #else
8 #include <namd_cub/cub.cuh>
9 #endif
10 #endif // NAMD_CUDA
11 
// For HIP builds, pull in hipcub and alias the cub namespace so the
// cub::BlockReduce usage in the kernels below compiles unchanged.
13 #ifdef NAMD_HIP
14 //#if 0
15 #include <hip/hip_runtime.h>
16 #include <hipcub/hipcub.hpp>
17 #include "HipDefines.h"
18 #define cub hipcub
19 #endif
20 
21 #include "CudaUtils.h"
22 #include "CudaRecord.h"
23 #include "Lattice.h"
24 
25 #ifdef NODEGROUP_FORCE_REGISTER
26 
/**
 * Computes the center of mass (COM) of a single atom group.
 *
 * Launch layout: 1D grid, blockDim.x must equal T_BLOCK_SIZE
 * (precondition of cub::BlockReduce). The grid covers numAtoms threads.
 *
 * Each thread loads one atom (via the SOA index indirection), unwraps its
 * coordinate with the lattice transform, and forms pos * mass. A block-wide
 * CUB reduction folds the partial products into thread 0, which atomically
 * accumulates them into d_curCM. The last block to finish (tracked with
 * tbcatomic[0]) scales the accumulated sum by inv_group_mass, publishes the
 * result to the (host-mapped) curCM, and resets the device accumulators so
 * the kernel can be re-launched without a separate clearing pass.
 *
 * Requires SM60+ for atomicAdd on double.
 *
 * @param numAtoms        number of atoms in the group
 * @param inv_group_mass  1 / (total mass of the group)
 * @param lat             lattice used to unwrap wrapped coordinates
 * @param mass            per-atom masses (float, widened to double here)
 * @param pos_x/y/z       SOA atom positions
 * @param transform       per-atom periodic-image transform codes
 * @param atomsSOAIndex   maps group-local index -> SOA index
 * @param d_curCM         device scratch accumulator; must be zero on entry,
 *                        reset to zero on exit
 * @param curCM           output COM (written once, by the last block)
 * @param tbcatomic       thread-block counter; tbcatomic[0] must be zero on
 *                        entry, reset to zero on exit
 */
template <int T_BLOCK_SIZE>
__global__ static void computeCOMKernel(
  const int numAtoms,
  const double inv_group_mass,
  const Lattice lat,
  const float * __restrict mass,
  const double* __restrict pos_x,
  const double* __restrict pos_y,
  const double* __restrict pos_z,
  const char3* __restrict transform,
  const int* __restrict atomsSOAIndex,
  double3* __restrict d_curCM,
  double3* __restrict curCM,
  unsigned int* __restrict tbcatomic){
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  bool isLastBlockDone = false;
  double3 cm = {0, 0, 0};

  if(tid < numAtoms){
    int SOAindex = atomsSOAIndex[tid];
    // uncoalesced memory access: too bad!
    double m = mass[SOAindex]; // Cast from float to double here
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    cm.x = pos.x * m;
    cm.y = pos.y * m;
    cm.z = pos.z * m;
  }

  // Block-wide reduction so that thread 0 holds this block's full partial
  // sum. BUG FIX: this reduction was previously disabled (#if 0) while only
  // thread 0 performed the atomicAdd below, so each block contributed just
  // one atom's pos*mass and the resulting COM was wrong.
  // (BlockReduce::Sum returns the valid total only on thread 0.)
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  cm.x = BlockReduce(temp_storage).Sum(cm.x);
  __syncthreads(); // temp_storage is reused between the three reductions
  cm.y = BlockReduce(temp_storage).Sum(cm.y);
  __syncthreads();
  cm.z = BlockReduce(temp_storage).Sum(cm.z);
  __syncthreads();

  // add calculated (pos * mass) in each block
  if(threadIdx.x == 0){
    atomicAdd(&(d_curCM->x), cm.x);
    atomicAdd(&(d_curCM->y), cm.y);
    atomicAdd(&(d_curCM->z), cm.z);

    // Publish the accumulator updates before incrementing the block counter,
    // so the last block observes every other block's contribution.
    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb -1));
  }
  __syncthreads();

  // Last block will set the host COM values
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      // thread zero updates the host COM value
      curCM->x = d_curCM->x * inv_group_mass; // We calculate COM here
      curCM->y = d_curCM->y * inv_group_mass; // We calculate COM here
      curCM->z = d_curCM->z * inv_group_mass; // We calculate COM here
      // set the device values to zero so the next launch starts clean
      d_curCM->x = 0.0;
      d_curCM->y = 0.0;
      d_curCM->z = 0.0;
      // resets atomic counter
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
107 
/**
 * Computes the centers of mass of two atom groups in a single pass.
 *
 * Launch layout: 1D grid, blockDim.x must equal T_BLOCK_SIZE
 * (precondition of cub::BlockReduce). The grid covers totalNumRestrained
 * threads; groupAtomsSOAIndex holds group 1's atoms in slots
 * [0, numGroup1Atoms) followed by group 2's atoms in
 * [numGroup1Atoms, totalNumRestrained).
 *
 * Each thread loads one atom, unwraps its coordinate, and forms pos * mass
 * into either com1 or com2 depending on which group it belongs to (the other
 * stays zero, so both block reductions are valid for every thread). Thread 0
 * atomically accumulates the block sums; the last block to finish scales by
 * the inverse group masses, publishes curCM1/curCM2, and zeroes the device
 * accumulators and counter for the next launch.
 *
 * Requires SM60+ for atomicAdd on double.
 *
 * @param numGroup1Atoms      number of atoms in group 1
 * @param totalNumRestrained  atoms in group 1 + group 2
 * @param inv_group1_mass     1 / (total mass of group 1)
 * @param inv_group2_mass     1 / (total mass of group 2)
 * @param lat                 lattice used to unwrap wrapped coordinates
 * @param mass                per-atom masses (float, widened to double)
 * @param pos_x/y/z           SOA atom positions
 * @param transform           per-atom periodic-image transform codes
 * @param groupAtomsSOAIndex  group-local index -> SOA index (group 1 first)
 * @param d_curCM1, d_curCM2  device scratch accumulators; zero on entry,
 *                            reset to zero on exit
 * @param curCM1, curCM2      output COMs (written once, by the last block)
 * @param tbcatomic           thread-block counter; tbcatomic[0] must be zero
 *                            on entry, reset to zero on exit
 */
template <int T_BLOCK_SIZE>
__global__ static void compute2COMKernel(
  const int numGroup1Atoms,
  const int totalNumRestrained,
  const double inv_group1_mass,
  const double inv_group2_mass,
  const Lattice lat,
  const float * __restrict mass,
  const double* __restrict pos_x,
  const double* __restrict pos_y,
  const double* __restrict pos_z,
  const char3* __restrict transform,
  const int* __restrict groupAtomsSOAIndex,
  double3* __restrict d_curCM1,
  double3* __restrict d_curCM2,
  double3* __restrict curCM1,
  double3* __restrict curCM2,
  unsigned int* __restrict tbcatomic){

  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  bool isLastBlockDone = false;
  double3 com1 = {0, 0, 0};
  double3 com2 = {0, 0, 0};

  if (tid < totalNumRestrained) {
    int SOAindex = groupAtomsSOAIndex[tid];
    // uncoalesced memory access: too bad!
    double m = mass[SOAindex]; // Cast from float to double here
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    if(tid < numGroup1Atoms){
      // threads [0 , numGroup1Atoms) calculate COM of group 1
      // we initialized the com2 to zero
      com1.x = pos.x * m;
      com1.y = pos.y * m;
      com1.z = pos.z * m;
    } else {
      // threads [numGroup1Atoms , totalNumRestrained) calculate COM of group 2
      // we initialized the com1 to zero
      com2.x = pos.x * m;
      com2.y = pos.y * m;
      com2.z = pos.z * m;
    }
  }
  __syncthreads();

  // Block-wide reductions so that thread 0 holds this block's full partial
  // sums for both groups. BUG FIX: these reductions were previously disabled
  // (#if 0) while only thread 0 performed the atomicAdds below, so each block
  // contributed a single atom's pos*mass and both COMs were wrong.
  // (BlockReduce::Sum returns the valid total only on thread 0.)
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  com1.x = BlockReduce(temp_storage).Sum(com1.x);
  __syncthreads(); // temp_storage is reused between the six reductions
  com1.y = BlockReduce(temp_storage).Sum(com1.y);
  __syncthreads();
  com1.z = BlockReduce(temp_storage).Sum(com1.z);
  __syncthreads();
  com2.x = BlockReduce(temp_storage).Sum(com2.x);
  __syncthreads();
  com2.y = BlockReduce(temp_storage).Sum(com2.y);
  __syncthreads();
  com2.z = BlockReduce(temp_storage).Sum(com2.z);
  __syncthreads();

  // add calculated (pos * mass) in each block
  if(threadIdx.x == 0){
    atomicAdd(&(d_curCM1->x), com1.x);
    atomicAdd(&(d_curCM1->y), com1.y);
    atomicAdd(&(d_curCM1->z), com1.z);
    atomicAdd(&(d_curCM2->x), com2.x);
    atomicAdd(&(d_curCM2->y), com2.y);
    atomicAdd(&(d_curCM2->z), com2.z);

    // Publish the accumulator updates before incrementing the block counter,
    // so the last block observes every other block's contribution.
    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb -1));
  }
  __syncthreads();

  // Last block will set the host COM values
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      // thread zero updates the host COM value
      curCM1->x = d_curCM1->x * inv_group1_mass; // We calculate COM here
      curCM1->y = d_curCM1->y * inv_group1_mass; // We calculate COM here
      curCM1->z = d_curCM1->z * inv_group1_mass; // We calculate COM here
      curCM2->x = d_curCM2->x * inv_group2_mass; // We calculate COM here
      curCM2->y = d_curCM2->y * inv_group2_mass; // We calculate COM here
      curCM2->z = d_curCM2->z * inv_group2_mass; // We calculate COM here
      // set the device values to zero so the next launch starts clean
      d_curCM1->x = 0.0;
      d_curCM1->y = 0.0;
      d_curCM1->z = 0.0;
      d_curCM2->x = 0.0;
      d_curCM2->y = 0.0;
      d_curCM2->z = 0.0;
      // resets atomic counter
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
223 
224 #endif // NODEGROUP_FORCE_REGISTER
225 #endif // COM_CUDA_KERNEL_H
NAMD_HOST_DEVICE Position reverse_transform(Position data, const Transform &t) const
Definition: Lattice.h:143
BigReal x
Definition: Vector.h:74