#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#else // NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // end NAMD_CUDA vs. NAMD_HIP

#include "HipDefines.h"
#include "ComputeGlobalMasterVirialCUDAKernel.h"
#ifdef NODEGROUP_FORCE_REGISTER
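
/*
 * computeGlobalMasterVirialKernel: accumulates the net external force and
 * virial produced by global-master (external) forces. Each thread takes one
 * atom, unwraps its position with the lattice transform, and forms the outer
 * product of the applied force with that position as its virial contribution.
 * Contributions are reduced per block with cub::BlockReduce, thread 0 of each
 * block adds the block totals to d_extForce / d_virial with atomicAdd, and
 * the last block to finish copies the results to the host-visible
 * h_extForce / h_extVirial buffers and clears the device accumulators and the
 * tbcatomic completion counter.
 */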
template <int BLOCK_SIZE>
__global__ void computeGlobalMasterVirialKernel(
  const int numAtoms,          // number of atoms covered by f_global_* (assumed parameter)
  CudaLocalRecord* localRecords,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const char3* __restrict d_transform,
  double* __restrict f_global_x,
  double* __restrict f_global_y,
  double* __restrict f_global_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_extVirial,
  const Lattice lat,           // lattice used by reverse_transform below (assumed parameter)
  unsigned int* __restrict tbcatomic)
{
  double3 r_netForce = {0, 0, 0};
  cudaTensor r_virial;
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;
  int totaltb = gridDim.x;
  int i = threadIdx.x + blockIdx.x*blockDim.x;
  __shared__ bool isLastBlockDone;

  if(i < numAtoms){
    // load and unwrap this atom's position
    double3 pos;
    pos.x = d_pos_x[i];
    pos.y = d_pos_y[i];
    pos.z = d_pos_z[i];
    const char3 t = d_transform[i];
    double3 pos_i = lat.reverse_transform(pos, t);
    // virial contribution: outer product of the applied force and the unwrapped position
    r_virial.xx = f_global_x[i] * pos_i.x;
    r_virial.xy = f_global_x[i] * pos_i.y;
    r_virial.xz = f_global_x[i] * pos_i.z;
    r_virial.yx = f_global_y[i] * pos_i.x;
    r_virial.yy = f_global_y[i] * pos_i.y;
    r_virial.yz = f_global_y[i] * pos_i.z;
    r_virial.zx = f_global_z[i] * pos_i.x;
    r_virial.zy = f_global_z[i] * pos_i.y;
    r_virial.zz = f_global_z[i] * pos_i.z;
    // contribution to the net external force
    r_netForce.x = f_global_x[i];
    r_netForce.y = f_global_y[i];
    r_netForce.z = f_global_z[i];
  }
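
  // Block-wide reductions of the per-thread contributions. The shared
  // temp_storage is reused by successive BlockReduce calls, so each call is
  // separated by __syncthreads().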
  typedef cub::BlockReduce<double, BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  r_netForce.x = BlockReduce(temp_storage).Sum(r_netForce.x);
  __syncthreads();
  r_netForce.y = BlockReduce(temp_storage).Sum(r_netForce.y);
  __syncthreads();
  r_netForce.z = BlockReduce(temp_storage).Sum(r_netForce.z);
  __syncthreads();
  r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
  __syncthreads();
  r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
  __syncthreads();
  r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
  __syncthreads();
  r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
  __syncthreads();
  r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
  __syncthreads();
  r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
  __syncthreads();
  r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
  __syncthreads();
  r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
  __syncthreads();
  r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
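
  // After the reductions, thread 0 holds the block totals; fold them into the
  // device-wide accumulators shared by all blocks.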
  if(threadIdx.x == 0){
    atomicAdd(&(d_virial->xx), r_virial.xx);
    atomicAdd(&(d_virial->xy), r_virial.xy);
    atomicAdd(&(d_virial->xz), r_virial.xz);

    atomicAdd(&(d_virial->yx), r_virial.yx);
    atomicAdd(&(d_virial->yy), r_virial.yy);
    atomicAdd(&(d_virial->yz), r_virial.yz);

    atomicAdd(&(d_virial->zx), r_virial.zx);
    atomicAdd(&(d_virial->zy), r_virial.zy);
    atomicAdd(&(d_virial->zz), r_virial.zz);

    atomicAdd(&(d_extForce->x), r_netForce.x);
    atomicAdd(&(d_extForce->y), r_netForce.y);
    atomicAdd(&(d_extForce->z), r_netForce.z);

    // make the accumulated totals visible to other blocks before taking a ticket
    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb - 1));
  }
  __syncthreads();
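
  // Threadfence-reduction pattern: the block that takes the last ticket from
  // tbcatomic sees every other block's contribution and finalizes the result.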
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      h_extVirial->xx = d_virial->xx;
      h_extVirial->xy = d_virial->xy;
      h_extVirial->xz = d_virial->xz;
      h_extVirial->yx = d_virial->yx;
      h_extVirial->yy = d_virial->yy;
      h_extVirial->yz = d_virial->yz;
      h_extVirial->zx = d_virial->zx;
      h_extVirial->zy = d_virial->zy;
      h_extVirial->zz = d_virial->zz;

      //reset the device virial value
      d_virial->xx = 0.0; d_virial->xy = 0.0; d_virial->xz = 0.0;
      d_virial->yx = 0.0; d_virial->yy = 0.0; d_virial->yz = 0.0;
      d_virial->zx = 0.0; d_virial->zy = 0.0; d_virial->zz = 0.0;

      h_extForce->x = d_extForce->x;
      h_extForce->y = d_extForce->y;
      h_extForce->z = d_extForce->z;

      //reset the device external force value
      d_extForce->x = 0.0; d_extForce->y = 0.0; d_extForce->z = 0.0;

      //resets atomic counter
      tbcatomic[0] = 0;
    }
  }
}
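
// Host-side launch wrapper: one thread per atom in 128-thread blocks, issued
// on the caller's stream.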
void computeGlobalMasterVirial(
  const int numPatches,
  const int numAtoms,
  CudaLocalRecord* localRecords,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const char3* __restrict d_transform,
  double* __restrict f_global_x,
  double* __restrict f_global_y,
  double* __restrict f_global_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_virial,
  const Lattice lat,           // assumed parameter, forwarded to the kernel for position unwrapping
  unsigned int* __restrict d_tbcatomic,
  cudaStream_t stream)
{
  const int atom_blocks = 128;
  const int grid = (numAtoms + atom_blocks - 1) / atom_blocks;
  computeGlobalMasterVirialKernel<atom_blocks><<<grid, atom_blocks, 0, stream>>>(
    numAtoms, localRecords,
    d_pos_x, d_pos_y, d_pos_z,
    d_transform,
    f_global_x, f_global_y, f_global_z,
    d_extForce, h_extForce,
    d_virial, h_virial,
    lat, d_tbcatomic);
}

#endif // NODEGROUP_FORCE_REGISTER