#ifndef COM_CUDA_KERNEL_H
#define COM_CUDA_KERNEL_H

#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#else // NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif // NAMD_CUDA

#include "CudaUtils.h"
#include "Lattice.h"

#ifdef NODEGROUP_FORCE_REGISTER

/*! Compute the center of mass (COM) of a group of atoms on a single GPU.
    Each block reduces the mass-weighted, unwrapped coordinates of its atoms;
    the last block to finish scales the accumulated sum by the inverse group
    mass and resets the accumulators for the next invocation. */
template <int T_BLOCK_SIZE>
__global__ static void computeCOMKernel(
  const int                 numAtoms,
  const double              inv_group_mass,
  const Lattice             lat,
  const float*  __restrict  mass,
  const double* __restrict  pos_x,
  const double* __restrict  pos_y,
  const double* __restrict  pos_z,
  const char3*  __restrict  transform,
  const int*    __restrict  atomsSOAIndex,
  double3*      __restrict  d_curCM,
  double3*      __restrict  curCM,
  unsigned int* __restrict  tbcatomic)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  bool isLastBlockDone = false;
  double3 cm = {0, 0, 0};

  if (tid < numAtoms) {
    int SOAindex = atomsSOAIndex[tid];
    // mass-weight the unwrapped coordinate of this atom
    double m = mass[SOAindex];
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate the COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    cm.x = pos.x * m;
    cm.y = pos.y * m;
    cm.z = pos.z * m;
  }

  // block-wide reduction of the mass-weighted coordinates
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  cm.x = BlockReduce(temp_storage).Sum(cm.x);
  __syncthreads();
  cm.y = BlockReduce(temp_storage).Sum(cm.y);
  __syncthreads();
  cm.z = BlockReduce(temp_storage).Sum(cm.z);
  __syncthreads();

  if (threadIdx.x == 0) {
    // accumulate the per-block partial sums in global memory
    atomicAdd(&(d_curCM->x), cm.x);
    atomicAdd(&(d_curCM->y), cm.y);
    atomicAdd(&(d_curCM->z), cm.z);
    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb - 1));
  }
  __syncthreads();

  // the last block to finish scales the sum and resets the accumulators
  if (isLastBlockDone) {
    if (threadIdx.x == 0) {
      curCM->x = d_curCM->x * inv_group_mass;
      curCM->y = d_curCM->y * inv_group_mass;
      curCM->z = d_curCM->z * inv_group_mass;
      // reset the accumulators for the next invocation
      d_curCM->x = 0.0;
      d_curCM->y = 0.0;
      d_curCM->z = 0.0;
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
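/* Illustrative host-side launch sketch (not part of NAMD): one possible way
   to invoke computeCOMKernel, assuming the device buffers are already
   allocated and d_curCM and d_tbcatomic were zero-initialized once. The
   names blockSize, numGroupAtoms, totalGroupMass, and stream are
   hypothetical.

     constexpr int blockSize = 128;
     const int numBlocks = (numGroupAtoms + blockSize - 1) / blockSize;
     computeCOMKernel<blockSize><<<numBlocks, blockSize, 0, stream>>>(
       numGroupAtoms, 1.0 / totalGroupMass, lattice,
       d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform, d_atomsSOAIndex,
       d_curCM, d_comResult, d_tbcatomic);

   The last block resets d_curCM and the tbcatomic counter itself, so the
   same buffers can be reused on the next call without an explicit memset. */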
/*! Compute the per-device partial COM of a group of atoms when the
    simulation is distributed over multiple GPUs. Each device accumulates
    its mass-weighted coordinate sum into its own slot of the peer buffer;
    the partial sums are combined later by computeDistCOMKernelMgpu. */
template <int T_BLOCK_SIZE>
__global__ static void computeCOMKernelMgpu(
  const int                 numAtoms,
  const Lattice             lat,
  const float*  __restrict  mass,
  const double* __restrict  pos_x,
  const double* __restrict  pos_y,
  const double* __restrict  pos_z,
  const char3*  __restrict  transform,
  const int*    __restrict  atomsSOAIndex,
  double3**     __restrict  d_peer_curCM,
  const int                 numDevices,
  const int                 deviceIndex,
  unsigned int* __restrict  tbcatomic)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  double3 cm = {0, 0, 0};

  if (tid < numAtoms) {
    int SOAindex = atomsSOAIndex[tid];
    // mass-weight the unwrapped coordinate of this atom
    double m = mass[SOAindex];
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate the COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    cm.x = pos.x * m;
    cm.y = pos.y * m;
    cm.z = pos.z * m;
  }

  // block-wide reduction of the mass-weighted coordinates
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  cm.x = BlockReduce(temp_storage).Sum(cm.x);
  __syncthreads();
  cm.y = BlockReduce(temp_storage).Sum(cm.y);
  __syncthreads();
  cm.z = BlockReduce(temp_storage).Sum(cm.z);
  __syncthreads();

  // accumulate this device's partial sum into its slot of the peer buffer
  if (threadIdx.x == 0) {
    atomicAdd(&(d_peer_curCM[deviceIndex][0].x), cm.x);
    atomicAdd(&(d_peer_curCM[deviceIndex][0].y), cm.y);
    atomicAdd(&(d_peer_curCM[deviceIndex][0].z), cm.z);
  }
}
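/* Illustrative multi-GPU usage sketch (not part of NAMD): each device first
   launches computeCOMKernelMgpu over its local atoms so that its partial
   mass-weighted sum lands in d_peer_curCM[deviceIndex][0]. The names below
   (blockSize, localNumAtoms, stream) are hypothetical.

     constexpr int blockSize = 128;
     const int numBlocks = (localNumAtoms + blockSize - 1) / blockSize;
     computeCOMKernelMgpu<blockSize><<<numBlocks, blockSize, 0, stream>>>(
       localNumAtoms, lattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
       d_transform, d_atomsSOAIndex, d_peer_curCM,
       numDevices, deviceIndex, d_tbcatomic);

   Unlike the single-GPU kernel, the partial sums are not normalized here;
   they are combined across devices by computeDistCOMKernelMgpu below. */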
/*! Combine the per-device partial COM sums into a single accumulator.
    One thread handles one peer device. */
__global__ static void computeDistCOMKernelMgpu(
  double3** __restrict  d_peerCOM,
  double3*  __restrict  d_curCOM,
  const int             numDevices)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < numDevices) {
    atomicAdd(&(d_curCOM->x), d_peerCOM[tid][0].x);
    atomicAdd(&(d_curCOM->y), d_peerCOM[tid][0].y);
    atomicAdd(&(d_curCOM->z), d_peerCOM[tid][0].z);
  }
}
/*! Copy the per-device COM values from the peer buffers into a flat array,
    one entry per device. */
__global__ static void copyDistCOMKernel(
  double3** __restrict  d_peerCOM,
  double3*  __restrict  h_peerCOM,
  const int             numDevices)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < numDevices) {
    h_peerCOM[tid].x = d_peerCOM[tid][0].x;
    h_peerCOM[tid].y = d_peerCOM[tid][0].y;
    h_peerCOM[tid].z = d_peerCOM[tid][0].z;
  }
}
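/* Illustrative reduction/copy sketch (not part of NAMD): after every device
   has finished computeCOMKernelMgpu, a single device can sum the peer slots
   and expose the per-device values. One small block suffices because
   numDevices is small; d_curCOM, h_peerCOM, and stream are hypothetical
   names.

     computeDistCOMKernelMgpu<<<1, 32, 0, stream>>>(
       d_peer_curCM, d_curCOM, numDevices);
     copyDistCOMKernel<<<1, 32, 0, stream>>>(
       d_peer_curCM, h_peerCOM, numDevices);

   d_curCOM then holds the total mass-weighted sum; dividing by the total
   group mass (on the host or in a follow-up kernel) gives the COM. */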
/*! Compute the centers of mass of two atom groups in a single pass.
    The first numGroup1Atoms entries of groupAtomsSOAIndex belong to group 1;
    the remaining entries, up to totalNumRestrained, belong to group 2. */
template <int T_BLOCK_SIZE>
__global__ static void compute2COMKernel(
  const int                 numGroup1Atoms,
  const int                 totalNumRestrained,
  const double              inv_group1_mass,
  const double              inv_group2_mass,
  const Lattice             lat,
  const float*  __restrict  mass,
  const double* __restrict  pos_x,
  const double* __restrict  pos_y,
  const double* __restrict  pos_z,
  const char3*  __restrict  transform,
  const int*    __restrict  groupAtomsSOAIndex,
  double3*      __restrict  d_curCM1,
  double3*      __restrict  d_curCM2,
  double3*      __restrict  curCM1,
  double3*      __restrict  curCM2,
  unsigned int* __restrict  tbcatomic)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  bool isLastBlockDone = false;
  double3 com1 = {0, 0, 0};
  double3 com2 = {0, 0, 0};

  if (tid < totalNumRestrained) {
    int SOAindex = groupAtomsSOAIndex[tid];
    // mass-weight the unwrapped coordinate of this atom
    double m = mass[SOAindex];
    double3 pos;
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the coordinate to calculate the COM
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);

    // the first numGroup1Atoms indices belong to group 1, the rest to group 2
    if (tid < numGroup1Atoms) {
      com1.x = pos.x * m;
      com1.y = pos.y * m;
      com1.z = pos.z * m;
    } else {
      com2.x = pos.x * m;
      com2.y = pos.y * m;
      com2.z = pos.z * m;
    }
  }

  // block-wide reduction of both groups' mass-weighted coordinates
  typedef cub::BlockReduce<double, T_BLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  com1.x = BlockReduce(temp_storage).Sum(com1.x);
  __syncthreads();
  com1.y = BlockReduce(temp_storage).Sum(com1.y);
  __syncthreads();
  com1.z = BlockReduce(temp_storage).Sum(com1.z);
  __syncthreads();
  com2.x = BlockReduce(temp_storage).Sum(com2.x);
  __syncthreads();
  com2.y = BlockReduce(temp_storage).Sum(com2.y);
  __syncthreads();
  com2.z = BlockReduce(temp_storage).Sum(com2.z);
  __syncthreads();

  if (threadIdx.x == 0) {
    // accumulate the per-block partial sums in global memory
    atomicAdd(&(d_curCM1->x), com1.x);
    atomicAdd(&(d_curCM1->y), com1.y);
    atomicAdd(&(d_curCM1->z), com1.z);
    atomicAdd(&(d_curCM2->x), com2.x);
    atomicAdd(&(d_curCM2->y), com2.y);
    atomicAdd(&(d_curCM2->z), com2.z);
    __threadfence();
    unsigned int value = atomicInc(&tbcatomic[0], totaltb);
    isLastBlockDone = (value == (totaltb - 1));
  }
  __syncthreads();

  // the last block to finish scales the sums and resets the accumulators
  if (isLastBlockDone) {
    if (threadIdx.x == 0) {
      curCM1->x = d_curCM1->x * inv_group1_mass;
      curCM1->y = d_curCM1->y * inv_group1_mass;
      curCM1->z = d_curCM1->z * inv_group1_mass;
      curCM2->x = d_curCM2->x * inv_group2_mass;
      curCM2->y = d_curCM2->y * inv_group2_mass;
      curCM2->z = d_curCM2->z * inv_group2_mass;
      // reset the accumulators for the next invocation
      d_curCM1->x = 0.0;
      d_curCM1->y = 0.0;
      d_curCM1->z = 0.0;
      d_curCM2->x = 0.0;
      d_curCM2->y = 0.0;
      d_curCM2->z = 0.0;
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}
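/* Illustrative launch sketch (not part of NAMD): computing the COMs of two
   restrained groups with one kernel call. groupAtomsSOAIndex is assumed to
   list group-1 atoms first, then group-2 atoms; blockSize, group1Mass,
   group2Mass, and stream are hypothetical names.

     constexpr int blockSize = 128;
     const int numBlocks = (totalNumRestrained + blockSize - 1) / blockSize;
     compute2COMKernel<blockSize><<<numBlocks, blockSize, 0, stream>>>(
       numGroup1Atoms, totalNumRestrained,
       1.0 / group1Mass, 1.0 / group2Mass, lattice,
       d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform, d_groupAtomsSOAIndex,
       d_curCM1, d_curCM2, d_comResult1, d_comResult2, d_tbcatomic);

   As with computeCOMKernel, the accumulators and the thread-block counter
   are reset by the last block, so no host-side memset is needed between
   calls. */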
/*! Register this device's COM buffer in the shared peer pool and zero one
    slot per device so that later atomic accumulation starts from zero. */
__global__ static void initPeerCOMKernel(
  const int  numDevices,
  const int  deviceIndex,
  double3**  d_peerPool,
  double3*   d_peerCOM)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // publish this device's buffer so peers can reach it through d_peerPool
  if (tid == 0) {
    d_peerPool[deviceIndex] = d_peerCOM;
  }
  if (tid < numDevices) {
    d_peerCOM[tid].x = 0.0;
    d_peerCOM[tid].y = 0.0;
    d_peerCOM[tid].z = 0.0;
  }
}
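/* Illustrative setup sketch (not part of NAMD): initPeerCOMKernel is meant
   to run once per device before the multi-GPU COM kernels, so that every
   device's buffer is registered in d_peerPool and zeroed. The loop and the
   names (devices, streams, d_peerCOM) are hypothetical; the pool itself
   must be visible to all devices, e.g. with peer access enabled.

     for (int dev = 0; dev < numDevices; ++dev) {
       cudaSetDevice(devices[dev]);
       initPeerCOMKernel<<<1, 32, 0, streams[dev]>>>(
         numDevices, dev, d_peerPool, d_peerCOM[dev]);
     }
*/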
#endif // NODEGROUP_FORCE_REGISTER
#endif // COM_CUDA_KERNEL_H