NAMD
ComputeGlobalMasterVirialCUDAKernel.cu
#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#else // NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // end NAMD_CUDA vs. NAMD_HIP

#include "HipDefines.h"

#include "ComputeGlobalMasterVirialCUDAKernel.h"

#ifdef NODEGROUP_FORCE_REGISTER

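/*
 * Accumulates the external ("global master") force and virial contributions
 * from the per-atom global force arrays. Each thread block walks its share of
 * patches, unwraps the atom positions, and accumulates the net force and the
 * f * r outer product per thread. The partial sums are reduced across the
 * block with cub::BlockReduce and added to the device accumulators with
 * atomics; the last block to finish copies the totals to the host-mapped
 * buffers and resets the device accumulators and the block counter.
 */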
__global__ void computeGlobalMasterVirialKernel(
  const int numPatches,
  CudaLocalRecord* localRecords,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const char3* __restrict d_transform,
  double* __restrict f_global_x,
  double* __restrict f_global_y,
  double* __restrict f_global_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_extVirial,
  const Lattice lat,
  unsigned int* __restrict tbcatomic,
  cudaStream_t stream
)
{
  __shared__ CudaLocalRecord s_record;
  using AccessType = int32_t;
  AccessType* s_record_buffer = (AccessType*) &s_record;

  int totaltb = gridDim.x;
  // Shared so that the flag set by thread 0 after the atomicInc below is
  // visible to the whole block following the next __syncthreads().
  __shared__ bool isLastBlockDone;

  if(threadIdx.x == 0){
    isLastBlockDone = false;
  }

  __syncthreads();
  double3 pos = {0, 0, 0};
  double3 pos_i = {0, 0, 0};
  double3 r_netForce = {0, 0, 0};
  cudaTensor r_virial;
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;

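  // Loop over the patches assigned to this thread block; each patch's
  // CudaLocalRecord supplies the atom count and the offset into the
  // per-coordinate position and force buffers.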
  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
    // Read the CudaLocalRecord into shared memory using multiple threads.
#pragma unroll 1
    for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
      s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
    }
    __syncthreads();

    const int numAtoms = s_record.numAtoms;
    const int offset = s_record.bufferOffset;

    for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
      char3 t = d_transform[offset + i];
      pos.x = d_pos_x[offset + i];
      pos.y = d_pos_y[offset + i];
      pos.z = d_pos_z[offset + i];
      // Unwrap the position so the virial is computed in consistent coordinates.
      pos_i = lat.reverse_transform(pos, t);
      // Net external force; reduced below and accumulated into d_extForce.
      r_netForce.x += f_global_x[offset + i];
      r_netForce.y += f_global_y[offset + i];
      r_netForce.z += f_global_z[offset + i];
      // Accumulate the outer product f * r into the per-thread virial.
      r_virial.xx += f_global_x[offset + i] * pos_i.x;
      r_virial.xy += f_global_x[offset + i] * pos_i.y;
      r_virial.xz += f_global_x[offset + i] * pos_i.z;
      r_virial.yx += f_global_y[offset + i] * pos_i.x;
      r_virial.yy += f_global_y[offset + i] * pos_i.y;
      r_virial.yz += f_global_y[offset + i] * pos_i.z;
      r_virial.zx += f_global_z[offset + i] * pos_i.x;
      r_virial.zy += f_global_z[offset + i] * pos_i.y;
      r_virial.zz += f_global_z[offset + i] * pos_i.z;
    }
    // Keep s_record intact until every thread is done with this patch.
    __syncthreads();
  }

  // The BlockReduce block size must match the kernel launch configuration
  // (512 threads per block; see computeGlobalMasterVirial below).
  typedef cub::BlockReduce<double, 512> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  r_netForce.x = BlockReduce(temp_storage).Sum(r_netForce.x);
  __syncthreads();
  r_netForce.y = BlockReduce(temp_storage).Sum(r_netForce.y);
  __syncthreads();
  r_netForce.z = BlockReduce(temp_storage).Sum(r_netForce.z);
  __syncthreads();

  r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
  __syncthreads();
  r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
  __syncthreads();
  r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
  __syncthreads();

  r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
  __syncthreads();
  r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
  __syncthreads();
  r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
  __syncthreads();

  r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
  __syncthreads();
  r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
  __syncthreads();
  r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
  __syncthreads();
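
  // Thread 0 adds this block's totals to the device-wide accumulators and
  // uses tbcatomic to detect whether this is the last block to finish.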
  if(threadIdx.x == 0){
    atomicAdd(&(d_virial->xx), r_virial.xx);
    atomicAdd(&(d_virial->xy), r_virial.xy);
    atomicAdd(&(d_virial->xz), r_virial.xz);

    atomicAdd(&(d_virial->yx), r_virial.yx);
    atomicAdd(&(d_virial->yy), r_virial.yy);
    atomicAdd(&(d_virial->yz), r_virial.yz);

    atomicAdd(&(d_virial->zx), r_virial.zx);
    atomicAdd(&(d_virial->zy), r_virial.zy);
    atomicAdd(&(d_virial->zz), r_virial.zz);

    atomicAdd(&(d_extForce->x), r_netForce.x);
    atomicAdd(&(d_extForce->y), r_netForce.y);
    atomicAdd(&(d_extForce->z), r_netForce.z);

    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb - 1));
  }
  __syncthreads();
  // Last block will set the host values

  if(isLastBlockDone){
    if(threadIdx.x == 0){
      h_extVirial->xx = d_virial->xx;
      h_extVirial->xy = d_virial->xy;
      h_extVirial->xz = d_virial->xz;
      h_extVirial->yx = d_virial->yx;
      h_extVirial->yy = d_virial->yy;
      h_extVirial->yz = d_virial->yz;
      h_extVirial->zx = d_virial->zx;
      h_extVirial->zy = d_virial->zy;
      h_extVirial->zz = d_virial->zz;

      // Reset the device virial accumulator.
      d_virial->xx = 0;
      d_virial->xy = 0;
      d_virial->xz = 0;
      d_virial->yx = 0;
      d_virial->yy = 0;
      d_virial->yz = 0;
      d_virial->zx = 0;
      d_virial->zy = 0;
      d_virial->zz = 0;

      h_extForce->x = d_extForce->x;
      h_extForce->y = d_extForce->y;
      h_extForce->z = d_extForce->z;
      d_extForce->x = 0;
      d_extForce->y = 0;
      d_extForce->z = 0;
      // Reset the atomic thread-block counter.
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}

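/*
 * Host-side wrapper: launches computeGlobalMasterVirialKernel on the given
 * stream with one thread block per patch.
 */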
void computeGlobalMasterVirial(
  const int numPatches,
  CudaLocalRecord* localRecords,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const char3* __restrict d_transform,
  double* __restrict f_global_x,
  double* __restrict f_global_y,
  double* __restrict f_global_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_virial,
  const Lattice lat,
  unsigned int* __restrict d_tbcatomic,
  cudaStream_t stream
)
{
  // One thread block per patch, 512 threads per block; the block size must
  // match the cub::BlockReduce template parameter used in the kernel.
  computeGlobalMasterVirialKernel<<<numPatches, 512, 0, stream>>>(
    numPatches,
    localRecords,
    d_pos_x,
    d_pos_y,
    d_pos_z,
    d_transform,
    f_global_x,
    f_global_y,
    f_global_z,
    d_extForce,
    h_extForce,
    d_virial,
    h_virial,
    lat,
    d_tbcatomic,
    stream);
}

#endif // NODEGROUP_FORCE_REGISTER