CudaGlobalMasterServerKernel.cu

#include "CudaGlobalMasterServerKernel.h"
#include "CudaUtils.h"

#if defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)

#define ATOM_BLOCKS 128

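/**
 * Gather selected per-atom quantities from the single-GPU SoA device arrays
 * into the per-client buffers. One thread handles one entry of d_copyList,
 * which maps a source SoA index (m_soa_index) to a destination client and
 * atom slot (m_client_index, m_client_atom_pos). Each ClientBuffer stores its
 * components in planes of length sz: x at [i], y at [i + sz], z at [i + 2 * sz].
 * Positions are unwrapped with Lattice::reverse_transform before being copied.
 */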
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernel(
    const double *__restrict x, const double *__restrict y,
    const double *__restrict z, const double *__restrict v_x,
    const double *__restrict v_y, const double *__restrict v_z,
    const char3 *__restrict d_transform, const float *__restrict d_mass,
    const float *__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  // double3 pos = {0, 0, 0};
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcIndex], y[srcIndex], z[srcIndex]};
      const char3 t = d_transform[srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] = d_charge[srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] = v_y[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] = v_z[srcIndex];
    }
  }
}

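/**
 * Host-side dispatcher for copyAtomsToClientsKernel. The five boolean flags
 * are packed into a 5-bit word (bit 4: positions, bit 3: masses, bit 2:
 * charges, bit 1: transforms, bit 0: velocities), and the switch below
 * instantiates the matching template specialization so that branches for
 * quantities that were not requested are compiled out of the kernel.
 *
 * Illustrative call (hypothetical caller, for exposition only): requesting
 * positions and velocities,
 *
 *   copyAtomsToClientsCUDA(true, false, false, false, true,
 *                          d_pos_x, d_pos_y, d_pos_z,
 *                          d_vel_x, d_vel_y, d_vel_z,
 *                          d_transform, d_mass, d_charge, lat,
 *                          d_copyList, numCopyTuples, d_clientBuffers,
 *                          numClients, stream);
 *
 * yields options == 17 and launches
 * copyAtomsToClientsKernel<true, false, false, false, true>.
 */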
void copyAtomsToClientsCUDA(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double *d_pos_x, const double *d_pos_y,
    const double *d_pos_z, const double *d_vel_x, const double *d_vel_y,
    const double *d_vel_z, const char3 *d_transform, const float *d_mass,
    const float *d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY) \
  copyAtomsToClientsKernel<bool(POSITION), bool(MASS), bool(CHARGE), \
                           bool(TRANSFORM), bool(VELOCITY)> \
      <<<grid, ATOM_BLOCKS, 0, stream>>>( \
          d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z, d_transform, \
          d_mass, d_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0: break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}

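/**
 * Multi-GPU variant of copyAtomsToClientsKernel. The source arrays are
 * arrays of per-device pointers; each copy-list entry additionally carries
 * m_src_dev_index to select the device that owns the source atom.
 */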
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernelMGPU(
    const double **__restrict x, const double **__restrict y,
    const double **__restrict z, const double **__restrict v_x,
    const double **__restrict v_y, const double **__restrict v_z,
    const char3 **__restrict d_transform, const float **__restrict d_mass,
    const float **__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  // double3 pos = {0, 0, 0};
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcDevIdx][srcIndex], y[srcDevIdx][srcIndex],
                           z[srcDevIdx][srcIndex]};
      const char3 t = d_transform[srcDevIdx][srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcDevIdx][srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] =
          d_charge[srcDevIdx][srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcDevIdx][srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcDevIdx][srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcDevIdx][srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] =
          v_y[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] =
          v_z[srcDevIdx][srcIndex];
    }
  }
}

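/**
 * Host-side dispatcher for copyAtomsToClientsKernelMGPU. Uses the same 5-bit
 * encoding of the copy flags as copyAtomsToClientsCUDA, but takes arrays of
 * peer device pointers instead of single-device pointers.
 */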
void copyAtomsToClientsCUDAMGPU(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double **d_peer_pos_x,
    const double **d_peer_pos_y, const double **d_peer_pos_z,
    const double **d_peer_vel_x, const double **d_peer_vel_y,
    const double **d_peer_vel_z, const char3 **d_peer_transform,
    const float **d_peer_mass, const float **d_peer_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY) \
  copyAtomsToClientsKernelMGPU<bool(POSITION), bool(MASS), bool(CHARGE), \
                               bool(TRANSFORM), bool(VELOCITY)> \
      <<<grid, ATOM_BLOCKS, 0, stream>>>( \
          d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_vel_x, \
          d_peer_vel_y, d_peer_vel_z, d_peer_transform, d_peer_mass, \
          d_peer_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0: break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}

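/**
 * Sum the total force on each selected atom (normal + nonbonded + slow) and
 * scatter it into the per-client buffers using the same (x, y, z) plane
 * layout as the position copy. When FIXEDATOM is true, atoms flagged in
 * d_atomFixed receive a zero force.
 */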
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernel(
    const double *__restrict d_f_normal_x,
    const double *__restrict d_f_normal_y,
    const double *__restrict d_f_normal_z, const double *__restrict d_f_nbond_x,
    const double *__restrict d_f_nbond_y, const double *__restrict d_f_nbond_z,
    const double *__restrict d_f_slow_x, const double *__restrict d_f_slow_y,
    const double *__restrict d_f_slow_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] +
             d_f_slow_x[srcIndex];
        fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] +
             d_f_slow_y[srcIndex];
        fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] +
             d_f_slow_z[srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] + d_f_slow_x[srcIndex];
      fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] + d_f_slow_y[srcIndex];
      fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] + d_f_slow_z[srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}

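/**
 * Host-side wrapper that selects the FIXEDATOM specialization of
 * copyForcesToClientsKernel at compile time based on whether fixed atoms
 * are enabled for this simulation.
 */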
void copyTotalForcesToClientsCUDA(
    bool fixedOn, const double *d_f_normal_x, const double *d_f_normal_y,
    const double *d_f_normal_z, const double *d_f_nbond_x,
    const double *d_f_nbond_y, const double *d_f_nbond_z,
    const double *d_f_slow_x, const double *d_f_slow_y,
    const double *d_f_slow_z, const int *d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  }
}

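/**
 * Multi-GPU variant of copyForcesToClientsKernel. Force components and the
 * fixed-atom flags are read from per-device pointer arrays selected with
 * m_src_dev_index.
 */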
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernelMGPU(
    const double **__restrict d_f_normal_x,
    const double **__restrict d_f_normal_y,
    const double **__restrict d_f_normal_z,
    const double **__restrict d_f_nbond_x,
    const double **__restrict d_f_nbond_y,
    const double **__restrict d_f_nbond_z, const double **__restrict d_f_slow_x,
    const double **__restrict d_f_slow_y, const double **__restrict d_f_slow_z,
    const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcDevIdx][srcIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_f_normal_x[srcDevIdx][srcIndex] +
             d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
        fy = d_f_normal_y[srcDevIdx][srcIndex] +
             d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
        fz = d_f_normal_z[srcDevIdx][srcIndex] +
             d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcDevIdx][srcIndex] +
           d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
      fy = d_f_normal_y[srcDevIdx][srcIndex] +
           d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
      fz = d_f_normal_z[srcDevIdx][srcIndex] +
           d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}

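/**
 * Host-side wrapper for the multi-GPU total-force copy; selects the
 * FIXEDATOM specialization of copyForcesToClientsKernelMGPU.
 */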
void copyTotalForcesToClientsCUDAMGPU(
    bool fixedOn, const double **d_peer_f_normal_x,
    const double **d_peer_f_normal_y, const double **d_peer_f_normal_z,
    const double **d_peer_f_nbond_x, const double **d_peer_f_nbond_y,
    const double **d_peer_f_nbond_z, const double **d_peer_f_slow_x,
    const double **d_peer_f_slow_y, const double **d_peer_f_slow_z,
    const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernelMGPU<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernelMGPU<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  }
}

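/**
 * Scatter the forces supplied by the clients back into the global force
 * arrays. The copy list maps a (client, atom slot) source to a destination
 * SoA index; forces on fixed atoms are discarded when FIXEDATOM is true.
 * When UNIQUE_ATOMS is true, each destination atom appears at most once in
 * the copy list, so plain additions are safe; otherwise atomicAdd is used
 * to avoid write conflicts between threads.
 */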
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernel(
    double *__restrict d_f_global_x, double *__restrict d_f_global_y,
    double *__restrict d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstIndex] += fx;
      d_f_global_y[dstIndex] += fy;
      d_f_global_z[dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstIndex]), fz);
    }
  }
}

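/**
 * Host-side dispatcher for addGlobalForcesFromClientsKernel. The fixedOn and
 * uniqueAtoms flags are packed into a 2-bit word to select one of the four
 * template specializations; an unexpected value aborts via NAMD_bug.
 */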
void addGlobalForcesFromClients(
    const int fixedOn, const int uniqueAtoms, double *d_f_global_x,
    double *d_f_global_y, double *d_f_global_z,
    const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS) \
  addGlobalForcesFromClientsKernel<FIXEDATOM, UNIQUE_ATOMS> \
      <<<grid, ATOM_BLOCKS, 0, stream>>>( \
          d_f_global_x, d_f_global_y, d_f_global_z, d_atomFixed, d_copyList, \
          numCopyTuples, d_clientBuffers)
  switch (options) {
  case ((0 << 1) + 0): CALL(0, 0); break;
  case ((0 << 1) + 1): CALL(0, 1); break;
  case ((1 << 1) + 0): CALL(1, 0); break;
  case ((1 << 1) + 1): CALL(1, 1); break;
  default: {
    const std::string error =
        "Error in addGlobalForcesFromClients. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}

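/**
 * Multi-GPU variant of addGlobalForcesFromClientsKernel. The global force
 * arrays and fixed-atom flags are arrays of per-device pointers; the owning
 * device of each destination atom is taken from m_src_dev_index.
 */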
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernelMGPU(
    double **__restrict d_f_global_x, double **__restrict d_f_global_y,
    double **__restrict d_f_global_z, const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstDev = d_copyList[i].m_src_dev_index;
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      // Check the fixed-atom flag on the device that owns the atom.
      if (d_atomFixed[dstDev][dstIndex]) {
        fx = fy = fz = 0.0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstDev][dstIndex] += fx;
      d_f_global_y[dstDev][dstIndex] += fy;
      d_f_global_z[dstDev][dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstDev][dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstDev][dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstDev][dstIndex]), fz);
    }
  }
}

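/**
 * Host-side dispatcher for addGlobalForcesFromClientsKernelMGPU; mirrors
 * addGlobalForcesFromClients but takes peer device pointer arrays.
 */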
void addGlobalForcesFromClientsMGPU(
    const int fixedOn, const int uniqueAtoms, double **d_peer_f_global_x,
    double **d_peer_f_global_y, double **d_peer_f_global_z,
    const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS) \
  addGlobalForcesFromClientsKernelMGPU<FIXEDATOM, UNIQUE_ATOMS> \
      <<<grid, ATOM_BLOCKS, 0, stream>>>( \
          d_peer_f_global_x, d_peer_f_global_y, d_peer_f_global_z, \
          d_peer_atomFixed, d_copyList, numCopyTuples, d_clientBuffers);
  switch (options) {
  case ((0 << 1) + 0): CALL(0, 0); break;
  case ((0 << 1) + 1): CALL(0, 1); break;
  case ((1 << 1) + 0): CALL(1, 0); break;
  case ((1 << 1) + 1): CALL(1, 1); break;
  default: {
    const std::string error =
        "Error in addGlobalForcesFromClientsMGPU. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}

#endif // defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)