#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#else // NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub  // use the cub:: spelling for both backends
#endif // end NAMD_CUDA vs. NAMD_HIP

#include "HipDefines.h"

#include "ComputeConsForceCUDAKernel.h"
#ifdef NODEGROUP_FORCE_REGISTER
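
// Applies user-specified constant forces to the selected atoms: one thread
// per restrained atom, indexed through the consForce ID -> SOA maps. When
// T_DOVIRIAL is set, the kernel also reduces the net external force and
// virial across the grid in a single pass, using tbcatomic as a device-wide
// block counter so that the last block to finish copies the totals to host
// memory.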
template<bool T_DOVIRIAL>
__global__ void computeConsForceKernel(
  const int nConsForceAtoms,
  const int* __restrict d_consForceSOA,
  const int* __restrict d_consForceID,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const double3* __restrict d_consForce,
  const char3* __restrict d_transform,
  double* __restrict f_normal_x,
  double* __restrict f_normal_y,
  double* __restrict f_normal_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_extVirial,
  const double scaling,
  const Lattice lat,
  unsigned int* __restrict tbcatomic)
{
  int tid = threadIdx.x + (blockIdx.x * blockDim.x);
  int totaltb = gridDim.x;
  __shared__ bool isLastBlockDone;

  if(threadIdx.x == 0){
    isLastBlockDone = false;
  }
  __syncthreads();

  double3 scale = {0, 0, 0};
  double3 pos = {0, 0, 0};
  double3 pos_i = {0, 0, 0};
  double3 r_netForce = {0, 0, 0};
  cudaTensor r_virial;
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;
  if(tid < nConsForceAtoms){
    int forceID = d_consForceID[tid];
    int soaID = d_consForceSOA[forceID];
    // Scaled constant force on this atom, added to the normal force arrays
    // and accumulated into the per-thread net external force.
    scale.x = scaling * d_consForce[forceID].x;
    scale.y = scaling * d_consForce[forceID].y;
    scale.z = scaling * d_consForce[forceID].z;
    atomicAdd(&(f_normal_x[soaID]), scale.x);
    atomicAdd(&(f_normal_y[soaID]), scale.y);
    atomicAdd(&(f_normal_z[soaID]), scale.z);
    r_netForce.x += scale.x;
    r_netForce.y += scale.y;
    r_netForce.z += scale.z;
    if(T_DOVIRIAL){
      // Unwrap the position back to the home cell so the virial is computed
      // from consistent coordinates.
      char3 t = d_transform[soaID];
      pos.x = d_pos_x[soaID];
      pos.y = d_pos_y[soaID];
      pos.z = d_pos_z[soaID];
      pos_i = lat.reverse_transform(pos, t);
      r_virial.xx = scale.x * pos_i.x;
      r_virial.xy = scale.x * pos_i.y;
      r_virial.xz = scale.x * pos_i.z;
      r_virial.yx = scale.y * pos_i.x;
      r_virial.yy = scale.y * pos_i.y;
      r_virial.yz = scale.y * pos_i.z;
      r_virial.zx = scale.z * pos_i.x;
      r_virial.zy = scale.z * pos_i.y;
      r_virial.zz = scale.z * pos_i.z;
    }
  }
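
  // cub::BlockReduce reuses one shared scratch buffer for every Sum() call;
  // a __syncthreads() barrier is required between reductions that share the
  // same temp_storage.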
  if(T_DOVIRIAL){
    typedef cub::BlockReduce<double, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    r_netForce.x = BlockReduce(temp_storage).Sum(r_netForce.x);
    __syncthreads();
    r_netForce.y = BlockReduce(temp_storage).Sum(r_netForce.y);
    __syncthreads();
    r_netForce.z = BlockReduce(temp_storage).Sum(r_netForce.z);
    __syncthreads();

    r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
    __syncthreads();
    r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
    __syncthreads();
    r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
    __syncthreads();

    r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
    __syncthreads();
    r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
    __syncthreads();
    r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
    __syncthreads();

    r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
    __syncthreads();
    r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
    __syncthreads();
    r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
    __syncthreads();
    if(threadIdx.x == 0){
      atomicAdd(&(d_virial->xx), r_virial.xx);
      atomicAdd(&(d_virial->xy), r_virial.xy);
      atomicAdd(&(d_virial->xz), r_virial.xz);
      atomicAdd(&(d_virial->yx), r_virial.yx);
      atomicAdd(&(d_virial->yy), r_virial.yy);
      atomicAdd(&(d_virial->yz), r_virial.yz);
      atomicAdd(&(d_virial->zx), r_virial.zx);
      atomicAdd(&(d_virial->zy), r_virial.zy);
      atomicAdd(&(d_virial->zz), r_virial.zz);
      atomicAdd(&(d_extForce->x), r_netForce.x);
      atomicAdd(&(d_extForce->y), r_netForce.y);
      atomicAdd(&(d_extForce->z), r_netForce.z);

      // Make the global-memory accumulations visible before this block
      // announces completion.
      __threadfence();
      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
      isLastBlockDone = (value == (totaltb - 1));
    }
    __syncthreads();
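
    // atomicInc returns the previous counter value, so exactly one block
    // (the last one to arrive) observes totaltb - 1 and performs the
    // device-to-host copy below.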
    // Last block will set the host values
    if(isLastBlockDone){
      if(threadIdx.x == 0){
        h_extVirial->xx = d_virial->xx;
        h_extVirial->xy = d_virial->xy;
        h_extVirial->xz = d_virial->xz;
        h_extVirial->yx = d_virial->yx;
        h_extVirial->yy = d_virial->yy;
        h_extVirial->yz = d_virial->yz;
        h_extVirial->zx = d_virial->zx;
        h_extVirial->zy = d_virial->zy;
        h_extVirial->zz = d_virial->zz;
        //reset the device virial value
        d_virial->xx = 0;
        d_virial->xy = 0;
        d_virial->xz = 0;
        d_virial->yx = 0;
        d_virial->yy = 0;
        d_virial->yz = 0;
        d_virial->zx = 0;
        d_virial->zy = 0;
        d_virial->zz = 0;

        h_extForce->x = d_extForce->x;
        h_extForce->y = d_extForce->y;
        h_extForce->z = d_extForce->z;
        d_extForce->x = 0;
        d_extForce->y = 0;
        d_extForce->z = 0;

        //resets atomic counter so the next launch starts clean
        tbcatomic[0] = 0;
        __threadfence();
      }
    }
  }
}

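// Host-side launcher: selects the T_DOVIRIAL instantiation of the kernel
// and enqueues it on the caller's stream.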
void computeConsForce(
  const bool doVirial,
  const int nConsForceAtoms,
  const int* __restrict d_consForceSOA,
  const int* __restrict d_consForceID,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const double3* __restrict d_consForce,
  const char3* __restrict d_transform,
  double* __restrict f_normal_x,
  double* __restrict f_normal_y,
  double* __restrict f_normal_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_virial,
  const double scaling,
  const Lattice lat,
  unsigned int* __restrict d_tbcatomic,
  cudaStream_t stream)
{
  const int blocks = 128;  // must match cub::BlockReduce<double, 128> in the kernel
  const int grid = (nConsForceAtoms + blocks - 1) / blocks;
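
  // Branch on doVirial at launch time; the compile-time template parameter
  // removes the reduction and device-to-host bookkeeping from the
  // no-virial path entirely.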
  if(doVirial){
    computeConsForceKernel<true><<<grid, blocks, 0, stream>>>(
      nConsForceAtoms, d_consForceSOA, d_consForceID,
      d_pos_x, d_pos_y, d_pos_z,
      d_consForce, d_transform,
      f_normal_x, f_normal_y, f_normal_z,
      d_extForce, h_extForce, d_virial, h_virial,
      scaling, lat, d_tbcatomic);
  }else{
    computeConsForceKernel<false><<<grid, blocks, 0, stream>>>(
      nConsForceAtoms, d_consForceSOA, d_consForceID,
      d_pos_x, d_pos_y, d_pos_z,
      d_consForce, d_transform,
      f_normal_x, f_normal_y, f_normal_z,
      d_extForce, h_extForce, d_virial, h_virial,
      scaling, lat, d_tbcatomic);
  }
}

#endif // NODEGROUP_FORCE_REGISTER
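
// Example call site (a minimal sketch; the pointer and variable names are
// assumptions for illustration, not taken from this file):
//
//   computeConsForce(doVirial, nConsForceAtoms,
//     d_consForceSOA, d_consForceID,
//     d_pos_x, d_pos_y, d_pos_z,
//     d_consForce, d_transform,
//     f_normal_x, f_normal_y, f_normal_z,
//     d_extForce, h_extForce, d_virial, h_extVirial,
//     scaling, lattice, d_tbcatomic, stream);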