NAMD
CudaGlobalMasterServerKernel.cu
#include "CudaGlobalMasterServerKernel.h"
#include "CudaUtils.h"

#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)

#define ATOM_BLOCKS 128

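// Single-GPU gather kernel: one thread per copy-list entry. Each entry maps an
// atom in the local SOA arrays (m_soa_index) to a slot in one client's buffer
// (m_client_index, m_client_atom_pos). The template flags select which
// quantities are copied; positions are passed through lat.reverse_transform()
// with the stored periodic-image transform before being written. Each client
// buffer stores x/y/z components in three planes separated by the per-client
// stride sz.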
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernel(
    const double *__restrict x, const double *__restrict y,
    const double *__restrict z, const double *__restrict v_x,
    const double *__restrict v_y, const double *__restrict v_z,
    const char3 *__restrict d_transform, const float *__restrict d_mass,
    const float *__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcIndex], y[srcIndex], z[srcIndex]};
      const char3 t = d_transform[srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] = d_charge[srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] = v_y[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] = v_z[srcIndex];
    }
  }
}

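// Host-side launcher for the single-GPU atom copy. The five boolean options
// are packed into a 5-bit index so that a fully specialized kernel is launched
// for the requested combination; option 0 (nothing requested) is a no-op.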
void copyAtomsToClientsCUDA(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double *d_pos_x, const double *d_pos_y,
    const double *d_pos_z, const double *d_vel_x, const double *d_vel_y,
    const double *d_vel_z, const char3 *d_transform, const float *d_mass,
    const float *d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                     \
  copyAtomsToClientsKernel<bool(POSITION), bool(MASS), bool(CHARGE),          \
                           bool(TRANSFORM), bool(VELOCITY)>                   \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                     \
          d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z, d_transform,  \
          d_mass, d_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0: break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}

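// Multi-GPU variant of the gather kernel. The SOA arrays are passed as arrays
// of per-device pointers, and each copy-list entry additionally carries
// m_src_dev_index to select the peer device that owns the source atom.
// Otherwise the copy logic matches copyAtomsToClientsKernel.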
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernelMGPU(
    const double **__restrict x, const double **__restrict y,
    const double **__restrict z, const double **__restrict v_x,
    const double **__restrict v_y, const double **__restrict v_z,
    const char3 **__restrict d_transform, const float **__restrict d_mass,
    const float **__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcDevIdx][srcIndex], y[srcDevIdx][srcIndex],
                           z[srcDevIdx][srcIndex]};
      const char3 t = d_transform[srcDevIdx][srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcDevIdx][srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] =
          d_charge[srcDevIdx][srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcDevIdx][srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcDevIdx][srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcDevIdx][srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] =
          v_y[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] =
          v_z[srcDevIdx][srcIndex];
    }
  }
}

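// Host-side launcher for the multi-GPU atom copy; same dispatch scheme as
// copyAtomsToClientsCUDA, but forwarding the peer pointer tables.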
void copyAtomsToClientsCUDAMGPU(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double **d_peer_pos_x,
    const double **d_peer_pos_y, const double **d_peer_pos_z,
    const double **d_peer_vel_x, const double **d_peer_vel_y,
    const double **d_peer_vel_z, const char3 **d_peer_transform,
    const float **d_peer_mass, const float **d_peer_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                     \
  copyAtomsToClientsKernelMGPU<bool(POSITION), bool(MASS), bool(CHARGE),      \
                               bool(TRANSFORM), bool(VELOCITY)>               \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                     \
          d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_vel_x,             \
          d_peer_vel_y, d_peer_vel_z, d_peer_transform, d_peer_mass,          \
          d_peer_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0: break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}

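// Single-GPU kernel that accumulates the total force on each requested atom
// (normal + nonbonded + slow components) and writes it into the client
// buffer's d_data planes. When FIXEDATOM is enabled, fixed atoms report zero
// force.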
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernel(
    const double *__restrict d_f_normal_x,
    const double *__restrict d_f_normal_y,
    const double *__restrict d_f_normal_z, const double *__restrict d_f_nbond_x,
    const double *__restrict d_f_nbond_y, const double *__restrict d_f_nbond_z,
    const double *__restrict d_f_slow_x, const double *__restrict d_f_slow_y,
    const double *__restrict d_f_slow_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] +
             d_f_slow_x[srcIndex];
        fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] +
             d_f_slow_y[srcIndex];
        fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] +
             d_f_slow_z[srcIndex];
      }
    } else {
      fx =
          d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] + d_f_slow_x[srcIndex];
      fy =
          d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] + d_f_slow_y[srcIndex];
      fz =
          d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] + d_f_slow_z[srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}

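// Host-side launcher for the single-GPU total-force copy; picks the FIXEDATOM
// specialization based on whether fixed atoms are in use.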
void copyTotalForcesToClientsCUDA(
    bool fixedOn, const double *d_f_normal_x, const double *d_f_normal_y,
    const double *d_f_normal_z, const double *d_f_nbond_x,
    const double *d_f_nbond_y, const double *d_f_nbond_z,
    const double *d_f_slow_x, const double *d_f_slow_y,
    const double *d_f_slow_z, const int *d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  }
}

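// Multi-GPU variant of the total-force kernel, reading the force components
// through per-device pointer tables selected by m_src_dev_index.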
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernelMGPU(
    const double **__restrict d_f_normal_x,
    const double **__restrict d_f_normal_y,
    const double **__restrict d_f_normal_z,
    const double **__restrict d_f_nbond_x,
    const double **__restrict d_f_nbond_y,
    const double **__restrict d_f_nbond_z, const double **__restrict d_f_slow_x,
    const double **__restrict d_f_slow_y, const double **__restrict d_f_slow_z,
    const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcDevIdx][srcIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_f_normal_x[srcDevIdx][srcIndex] +
             d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
        fy = d_f_normal_y[srcDevIdx][srcIndex] +
             d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
        fz = d_f_normal_z[srcDevIdx][srcIndex] +
             d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcDevIdx][srcIndex] +
           d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
      fy = d_f_normal_y[srcDevIdx][srcIndex] +
           d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
      fz = d_f_normal_z[srcDevIdx][srcIndex] +
           d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}

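// Host-side launcher for the multi-GPU total-force copy.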
void copyTotalForcesToClientsCUDAMGPU(
    bool fixedOn, const double **d_peer_f_normal_x,
    const double **d_peer_f_normal_y, const double **d_peer_f_normal_z,
    const double **d_peer_f_nbond_x, const double **d_peer_f_nbond_y,
    const double **d_peer_f_nbond_z, const double **d_peer_f_slow_x,
    const double **d_peer_f_slow_y, const double **d_peer_f_slow_z,
    const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernelMGPU<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernelMGPU<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  }
}

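// Single-GPU scatter kernel for the reverse direction: forces computed by a
// client are read from its buffer and accumulated into the global force SOA
// arrays. Forces on fixed atoms are dropped when FIXEDATOM is enabled. When
// UNIQUE_ATOMS is set, the copy list is expected to touch each atom at most
// once, so plain additions are used; otherwise atomicAdd avoids write
// conflicts between clients that request the same atom.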
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernel(
    double *__restrict d_f_global_x, double *__restrict d_f_global_y,
    double *__restrict d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstIndex] += fx;
      d_f_global_y[dstIndex] += fy;
      d_f_global_z[dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstIndex]), fz);
    }
  }
}

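// Host-side launcher for the single-GPU force scatter. The fixed-atom and
// unique-atom flags are packed into a 2-bit index to pick the matching kernel
// specialization; any other value indicates a programming error.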
void addGlobalForcesFromClients(
    const int fixedOn, const int uniqueAtoms,
    double *d_f_global_x, double *d_f_global_y,
    double *d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS)                                         \
  addGlobalForcesFromClientsKernel<FIXEDATOM, UNIQUE_ATOMS>                   \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                     \
          d_f_global_x, d_f_global_y, d_f_global_z, d_atomFixed, d_copyList,  \
          numCopyTuples, d_clientBuffers)
  switch (options) {
  case ((0<<1) + 0): CALL(0, 0); break;
  case ((0<<1) + 1): CALL(0, 1); break;
  case ((1<<1) + 0): CALL(1, 0); break;
  case ((1<<1) + 1): CALL(1, 1); break;
  default: {
    const std::string error =
        "Error in addGlobalForcesFromClients. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}

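// Multi-GPU variant of the force scatter: the global force arrays and the
// fixed-atom flags are per-device pointer tables, indexed by the destination
// device stored in m_src_dev_index.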
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernelMGPU(
    double **__restrict d_f_global_x, double **__restrict d_f_global_y,
    double **__restrict d_f_global_z, const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstDev = d_copyList[i].m_src_dev_index;
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      // d_atomFixed holds one array per device, so the fixed-atom flag must be
      // read through the destination device's array.
      if (d_atomFixed[dstDev][dstIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstDev][dstIndex] += fx;
      d_f_global_y[dstDev][dstIndex] += fy;
      d_f_global_z[dstDev][dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstDev][dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstDev][dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstDev][dstIndex]), fz);
    }
  }
}

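// Host-side launcher for the multi-GPU force scatter; same dispatch as
// addGlobalForcesFromClients.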
void addGlobalForcesFromClientsMGPU(
    const int fixedOn, const int uniqueAtoms,
    double **d_peer_f_global_x, double **d_peer_f_global_y,
    double **d_peer_f_global_z, const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS)                                         \
  addGlobalForcesFromClientsKernelMGPU<FIXEDATOM, UNIQUE_ATOMS>               \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                     \
          d_peer_f_global_x, d_peer_f_global_y, d_peer_f_global_z,            \
          d_peer_atomFixed, d_copyList, numCopyTuples, d_clientBuffers);
  switch (options) {
  case ((0<<1) + 0): CALL(0, 0); break;
  case ((0<<1) + 1): CALL(0, 1); break;
  case ((1<<1) + 0): CALL(1, 0); break;
  case ((1<<1) + 1): CALL(1, 1); break;
  default: {
    const std::string error =
        "Error in addGlobalForcesFromClientsMGPU. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}

#endif // (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)