#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#else // NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // end NAMD_CUDA vs. NAMD_HIP

#include "HipDefines.h"
#include "ComputeGlobalMasterVirialCUDAKernel.h"

#ifdef NODEGROUP_FORCE_REGISTER
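
// Kernel overview: each thread block stages one patch's CudaLocalRecord into
// shared memory, accumulates per-thread partial sums of the global-master
// ("external") force and its virial contribution, reduces them across the
// block, and atomically adds the block totals to device-wide accumulators.
// The last block to finish copies the totals into host-mapped buffers and
// resets the device accumulators for the next invocation.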
__global__ void computeGlobalMasterVirialKernel(
  const int numPatches,
  CudaLocalRecord* localRecords,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const char3* __restrict d_transform,
  double* __restrict f_global_x,
  double* __restrict f_global_y,
  double* __restrict f_global_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_extVirial,
  const Lattice lat,
  unsigned int* __restrict tbcatomic)
{
  __shared__ CudaLocalRecord s_record;
  using AccessType = int32_t;
  AccessType* s_record_buffer = (AccessType*) &s_record;

  int totaltb = gridDim.x;
  __shared__ bool isLastBlockDone;
  if (threadIdx.x == 0) {
    isLastBlockDone = false;
  }
  __syncthreads();

  double3 pos = {0, 0, 0};
  double3 pos_i = {0, 0, 0};
  double3 r_netForce = {0, 0, 0};
  cudaTensor r_virial;
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;
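
  // Grid-stride loop over patches: the kernel is launched with one block per
  // patch, so each block normally processes exactly one patch.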
  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
    // Read in the CudaLocalRecord using multiple threads. This should
    // result in a single coalesced load of the record.
    for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
      s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
    }
    __syncthreads();

    const int numAtoms = s_record.numAtoms;
    const int offset = s_record.bufferOffset;

    for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
      char3 t = d_transform[offset + i];
      pos.x = d_pos_x[offset + i];
      pos.y = d_pos_y[offset + i];
      pos.z = d_pos_z[offset + i];
      // Unwrap the position so force and position are in the same frame
      pos_i = lat.reverse_transform(pos, t);
      // Accumulate the net external force and its virial contribution
      r_netForce.x += f_global_x[offset + i];
      r_netForce.y += f_global_y[offset + i];
      r_netForce.z += f_global_z[offset + i];
      r_virial.xx += f_global_x[offset + i] * pos_i.x;
      r_virial.xy += f_global_x[offset + i] * pos_i.y;
      r_virial.xz += f_global_x[offset + i] * pos_i.z;
      r_virial.yx += f_global_y[offset + i] * pos_i.x;
      r_virial.yy += f_global_y[offset + i] * pos_i.y;
      r_virial.yz += f_global_y[offset + i] * pos_i.z;
      r_virial.zx += f_global_z[offset + i] * pos_i.x;
      r_virial.zy += f_global_z[offset + i] * pos_i.y;
      r_virial.zz += f_global_z[offset + i] * pos_i.z;
    }
    __syncthreads();
  }
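
  // Reduce the per-thread partial sums across the thread block. The same
  // shared temp_storage is reused for each reduction, so a __syncthreads()
  // is needed between successive BlockReduce calls.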
  // NOTE: the BlockReduce block size must match the 512-thread launch
  // configuration used in computeGlobalMasterVirial() below.
  typedef cub::BlockReduce<double, 512> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  r_netForce.x = BlockReduce(temp_storage).Sum(r_netForce.x);
  __syncthreads();
  r_netForce.y = BlockReduce(temp_storage).Sum(r_netForce.y);
  __syncthreads();
  r_netForce.z = BlockReduce(temp_storage).Sum(r_netForce.z);
  __syncthreads();
  r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
  __syncthreads();
  r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
  __syncthreads();
  r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
  __syncthreads();
  r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
  __syncthreads();
  r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
  __syncthreads();
  r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
  __syncthreads();
  r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
  __syncthreads();
  r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
  __syncthreads();
  r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);

  if (threadIdx.x == 0) {
    // Accumulate this block's totals into the device-wide buffers
    atomicAdd(&(d_virial->xx), r_virial.xx);
    atomicAdd(&(d_virial->xy), r_virial.xy);
    atomicAdd(&(d_virial->xz), r_virial.xz);
    atomicAdd(&(d_virial->yx), r_virial.yx);
    atomicAdd(&(d_virial->yy), r_virial.yy);
    atomicAdd(&(d_virial->yz), r_virial.yz);
    atomicAdd(&(d_virial->zx), r_virial.zx);
    atomicAdd(&(d_virial->zy), r_virial.zy);
    atomicAdd(&(d_virial->zz), r_virial.zz);
    atomicAdd(&(d_extForce->x), r_netForce.x);
    atomicAdd(&(d_extForce->y), r_netForce.y);
    atomicAdd(&(d_extForce->z), r_netForce.z);
    // Make the accumulated values visible before signaling completion
    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb - 1));
  }
  __syncthreads();

  // Last block will set the host values
  if (isLastBlockDone) {
    if (threadIdx.x == 0) {
      h_extVirial->xx = d_virial->xx;
      h_extVirial->xy = d_virial->xy;
      h_extVirial->xz = d_virial->xz;
      h_extVirial->yx = d_virial->yx;
      h_extVirial->yy = d_virial->yy;
      h_extVirial->yz = d_virial->yz;
      h_extVirial->zx = d_virial->zx;
      h_extVirial->zy = d_virial->zy;
      h_extVirial->zz = d_virial->zz;
      // reset the device virial value
      d_virial->xx = 0.0; d_virial->xy = 0.0; d_virial->xz = 0.0;
      d_virial->yx = 0.0; d_virial->yy = 0.0; d_virial->yz = 0.0;
      d_virial->zx = 0.0; d_virial->zy = 0.0; d_virial->zz = 0.0;
      h_extForce->x = d_extForce->x;
      h_extForce->y = d_extForce->y;
      h_extForce->z = d_extForce->z;
      d_extForce->x = 0.0; d_extForce->y = 0.0; d_extForce->z = 0.0;
      // reset the atomic counter for the next invocation
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
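
// Host-side wrapper: launches one thread block per patch on the given stream.
// The 512-thread block size must match the cub::BlockReduce template
// parameter used inside the kernel.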
void computeGlobalMasterVirial(
  const int numPatches,
  CudaLocalRecord* localRecords,
  const double* __restrict d_pos_x,
  const double* __restrict d_pos_y,
  const double* __restrict d_pos_z,
  const char3* __restrict d_transform,
  double* __restrict f_global_x,
  double* __restrict f_global_y,
  double* __restrict f_global_z,
  double3* __restrict d_extForce,
  double3* __restrict h_extForce,
  cudaTensor* __restrict d_virial,
  cudaTensor* __restrict h_virial,
  const Lattice lat,
  unsigned int* __restrict d_tbcatomic,
  cudaStream_t stream)
{
  computeGlobalMasterVirialKernel<<<numPatches, 512, 0, stream>>>(
    numPatches, localRecords,
    d_pos_x, d_pos_y, d_pos_z,
    d_transform,
    f_global_x, f_global_y, f_global_z,
    d_extForce, h_extForce,
    d_virial, h_virial,
    lat, d_tbcatomic);
}

#endif // NODEGROUP_FORCE_REGISTER