NAMD
ComputeLonepairsCUDAKernel.cu
1 #include "ComputeLonepairsCUDA.h"
2 #include "ComputeLonepairsCUDAKernel.h"
3 #include "Vector.h"
4 #include <string>
5 
6 #ifdef NAMD_HIP
7 #include <hipcub/hipcub.hpp>
8 #define cub hipcub
9 #endif // NAMD_HIP
10 
11 #ifdef NAMD_CUDA
12 #include <cuda.h>
13 #if __CUDACC_VER_MAJOR__ >= 11
14 #include <cub/cub.cuh>
15 #else
16 #include <namd_cub/cub.cuh>
17 #endif
18 #endif // NAMD_CUDA
19 
20 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
21 
22 // See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
23 __global__ void repositionRelativeKernel(
24  double* __restrict__ d_pos_x,
25  double* __restrict__ d_pos_y,
26  double* __restrict__ d_pos_z,
27  const ComputeLonepairsCUDA::LonepairRelative* __restrict__ d_lprelative_list,
28  size_t lprelative_list_size) {
29  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
30  if (tid < lprelative_list_size) {
31  const int i = d_lprelative_list[tid].i_soaid;
32  const int j = d_lprelative_list[tid].j_soaid;
33  const int k = d_lprelative_list[tid].k_soaid;
34  const int l = d_lprelative_list[tid].l_soaid;
35  // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
36  const double dcosa = d_lprelative_list[tid].dcosa;
37  const double dsinacost = d_lprelative_list[tid].dsinacost;
38  const double dsinasint = d_lprelative_list[tid].dsinasint;
39  const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
40  const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
41  const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
42  // I use the unit vectors directly instead of rjk and rkl
43  const Vector ujk = (rk - rj).unit();
44  const Vector ukl = (rl - rk).unit();
45  const Vector u3 = cross(ujk, ukl);
46  const Vector u2 = cross(u3, ujk);
47  const Vector ri = rj + dcosa * ujk + dsinacost * u2 - dsinasint * u3;
48  d_pos_x[i] = ri.x;
49  d_pos_y[i] = ri.y;
50  d_pos_z[i] = ri.z;
51  }
52 }
53 
54 void repositionRelative(
55  double* d_pos_x,
56  double* d_pos_y,
57  double* d_pos_z,
58  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
59  size_t lprelative_list_size,
60  cudaStream_t stream) {
61  const int block_size = 128;
62  const int grid = (lprelative_list_size + block_size - 1) / block_size;
63  repositionRelativeKernel<<<grid, block_size, 0, stream>>>(
64  d_pos_x, d_pos_y, d_pos_z, d_lprelative_list, lprelative_list_size);
65 }
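
// A minimal host-side sketch of the same construction, useful as a unit-test
// reference for repositionRelativeKernel. It uses only the Vector operations
// already used above; the name cpuRepositionRelative is illustrative, not
// part of the NAMD API.
inline Vector cpuRepositionRelative(
    const Vector& rj, const Vector& rk, const Vector& rl,
    double dcosa, double dsinacost, double dsinasint) {
  const Vector ujk = (rk - rj).unit();  // local axis along j -> k
  const Vector ukl = (rl - rk).unit();
  const Vector u3 = cross(ujk, ukl);    // normal of the j-k-l frame
  const Vector u2 = cross(u3, ujk);     // in-plane axis orthogonal to ujk
  return rj + dcosa * ujk + dsinacost * u2 - dsinasint * u3;
}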
66 
67 __global__ void repositionBisectorKernel(
68  double* __restrict__ d_pos_x,
69  double* __restrict__ d_pos_y,
70  double* __restrict__ d_pos_z,
71  const ComputeLonepairsCUDA::LonepairBisector* __restrict__ d_lpbisector_list,
72  size_t lpbisector_list_size) {
73  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
74  if (tid < lpbisector_list_size) {
75  const int i = d_lpbisector_list[tid].i_soaid;
76  const int j = d_lpbisector_list[tid].j_soaid;
77  const int k = d_lpbisector_list[tid].k_soaid;
78  const int l = d_lpbisector_list[tid].l_soaid;
79  // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
80  const double dcosa = d_lpbisector_list[tid].dcosa;
81  const double dsinacost = d_lpbisector_list[tid].dsinacost;
82  const double dsinasint = d_lpbisector_list[tid].dsinasint;
83  const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
84  const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
85  const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
86  // The midpoint of k and l
87  const Vector rb = 0.5 * (rl + rk);
88  // The difference between repositionRelativeKernel and this kernel is
89  // only a substitution of vertices: rk -> rb (the midpoint) and rl -> rk,
90  // so the same frame construction applies.
91  const Vector ujk = (rb - rj).unit();
92  const Vector ukl = (rk - rb).unit();
93  const Vector u3 = cross(ujk, ukl);
94  const Vector u2 = cross(u3, ujk);
95  const Vector ri = rj + dcosa * ujk + dsinacost * u2 - dsinasint * u3;
96  d_pos_x[i] = ri.x;
97  d_pos_y[i] = ri.y;
98  d_pos_z[i] = ri.z;
99  }
100 }
101 
102 void repositionBisector(
103  double* d_pos_x,
104  double* d_pos_y,
105  double* d_pos_z,
106  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
107  size_t lpbisector_list_size,
108  cudaStream_t stream) {
109  const int block_size = 128;
110  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
111  repositionBisectorKernel<<<grid, block_size, 0, stream>>>(
112  d_pos_x, d_pos_y, d_pos_z, d_lpbisector_list, lpbisector_list_size);
113 }
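
// A host-side sketch showing how the bisector placement reduces to the
// relative one: the midpoint rb of k and l takes the role of rk, and rk takes
// the role of rl. cpuRepositionRelative is the illustrative helper above.
inline Vector cpuRepositionBisector(
    const Vector& rj, const Vector& rk, const Vector& rl,
    double dcosa, double dsinacost, double dsinasint) {
  const Vector rb = 0.5 * (rl + rk);  // bisector point: midpoint of k and l
  return cpuRepositionRelative(rj, rb, rk, dcosa, dsinacost, dsinasint);
}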
114 
115 __global__ void repositionColinearKernel(
116  double* __restrict__ d_pos_x,
117  double* __restrict__ d_pos_y,
118  double* __restrict__ d_pos_z,
119  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
120  size_t lpcolinear_list_size) {
121  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
122  if (tid < lpcolinear_list_size) {
123  const int i = d_lpcolinear_list[tid].i_soaid;
124  const int j = d_lpcolinear_list[tid].j_soaid;
125  const int k = d_lpcolinear_list[tid].k_soaid;
126  const double distance = d_lpcolinear_list[tid].distance;
127  const double scale_factor = d_lpcolinear_list[tid].scale_factor;
128  const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
129  const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
130  const Vector rkj = rj - rk;
131  const double inv_r = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
132  const Vector ri = rj + (scale_factor + distance * inv_r) * rkj;
133  d_pos_x[i] = ri.x;
134  d_pos_y[i] = ri.y;
135  d_pos_z[i] = ri.z;
136  }
137 }
138 
139 void repositionColinear(
140  double* __restrict__ d_pos_x,
141  double* __restrict__ d_pos_y,
142  double* __restrict__ d_pos_z,
143  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
144  size_t lpcolinear_list_size,
145  cudaStream_t stream) {
146  const int block_size = 128;
147  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
148  repositionColinearKernel<<<grid, block_size, 0, stream>>>(
149  d_pos_x, d_pos_y, d_pos_z, d_lpcolinear_list, lpcolinear_list_size);
150 }
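
// A host-side sketch of the colinear placement used above: the lone pair is
// put on the k -> j axis at ri = rj + (scale_factor + distance / |rkj|) * rkj.
// The name cpuRepositionColinear is illustrative; host code uses 1/length()
// since rnorm3d is a device-side math function.
inline Vector cpuRepositionColinear(
    const Vector& rj, const Vector& rk,
    double distance, double scale_factor) {
  const Vector rkj = rj - rk;
  const double inv_r = 1.0 / rkj.length();
  return rj + (scale_factor + distance * inv_r) * rkj;
}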
151 
152 // See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
153 // Multiple force arrays (one per MTS force level) are projected at once.
154 template <int numForces>
155 __inline__ __device__ void projectRelativeForces(
156  const Vector& ri,
157  const Vector& rj,
158  const Vector& rk,
159  const Vector& rl,
160  const Vector (&fi)[numForces],
161  Vector (&fj)[numForces],
162  Vector (&fk)[numForces],
163  Vector (&fl)[numForces]) {
164  const Vector rji = ri - rj;
165  const Vector rjk = rk - rj;
166  const Vector rkl = rl - rk;
167  const Vector rki = ri - rk;
168  const double inv_rjk_norm2 = 1.0 / rjk.length2();
169  const Vector plane_jkl = cross(rjk, rkl);
170  // Project the force on rij to keep the distance unchanged
171  const double rji_norm2 = rji.length2();
172  const double inv_rij_norm2 = 1.0 / rji_norm2;
173  if (rji_norm2 > 0) {
174  #pragma unroll
175  for (int m = 0; m < numForces; ++m) {
176  fj[m] += inv_rij_norm2 * fi[m].dot(rji) * rji;
177  }
178  }
179  // Project the force on the dihedral angle to keep it unchanged
180  Vector fp[numForces];
181  #pragma unroll
182  for (int m = 0; m < numForces; ++m) {
183  fp[m] = 0.0;
184  }
185  // Get the normal vector of plane ikj
186  Vector normal_ikj = cross(rji, rjk);
187  const double normal_ikj_norm = normal_ikj.length();
188  // The arm of the force on plane ikj (v2 + v3)
189  const Vector hijk_ri = rji - (rji.dot(rjk) * inv_rjk_norm2 * rjk);
190  // Only project the force onto plane ijk if i, j, and k are not colinear;
191  // if rji and rjk are colinear then rji × rjk is zero
192  if (normal_ikj_norm > 1e-6) {
193  normal_ikj /= normal_ikj_norm;
194  // Force on plane ikj
195  #pragma unroll
196  for (int m = 0; m < numForces; ++m) {
197  fp[m] = fi[m].dot(normal_ikj) * normal_ikj;
198  }
199  }
200  // Arm of l perpendicular to the j-k axis, used to turn the torsional torque into a force on l
201  const Vector dir_l_to_jk = cross(plane_jkl, rjk).unit();
202  const Vector h_l = rkl.dot(dir_l_to_jk) * dir_l_to_jk;
203  const double inv_h_l_norm2 = 1.0 / h_l.length2();
204  #pragma unroll
205  for (int m = 0; m < numForces; ++m) {
206  // Torque of fp
207  const Vector torque_p = cross(hijk_ri, fp[m]);
208  // The force on l
209  const Vector fpl = inv_h_l_norm2 * cross(torque_p, h_l);
210  fl[m] += fpl;
211  // (a) The remaining force on j and k after subtracting the force on l from the torsional force
212  const Vector fj_prime = inv_rjk_norm2 * cross(cross(rki, fp[m]) - cross(rkl, fpl), -rjk);
213  const Vector fk_prime = fp[m] - fpl - fj_prime;
214  // (b) The remaining angular force
215  const Vector fai = fi[m] - fp[m] - fj[m];
216  // Sum the torsional force (a) on j and k
217  fj[m] += fj_prime;
218  fk[m] += fk_prime;
219  // Torque of the angular force (b) on k
220  const Vector torque_k = cross(rji, fai);
221  const Vector fak = cross(torque_k, rjk) * inv_rjk_norm2;
222  fk[m] += fak;
223  fj[m] += fai - fak;
224  }
225 }
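
// A host-side sanity-check sketch: the projection above transfers the entire
// lone-pair force to the three hosts, so fj + fk + fl must reproduce fi for
// every force slot. checkForceConservation is an illustrative helper name,
// not part of the NAMD API.
template <int numForces>
bool checkForceConservation(
    const Vector (&fi)[numForces],
    const Vector (&fj)[numForces],
    const Vector (&fk)[numForces],
    const Vector (&fl)[numForces],
    const double tol = 1e-10) {
  for (int m = 0; m < numForces; ++m) {
    // The residual should vanish up to round-off
    const Vector residual = fi[m] - (fj[m] + fk[m] + fl[m]);
    if (residual.length() > tol) return false;
  }
  return true;
}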
226 
227 enum class LonepairThreeHostsType {Relative, Bisector};
228 
229 template <typename LplistT, int maxForceNumer, bool doVirial, int blockSize, LonepairThreeHostsType lptype>
230 __global__ void redistributeForceThreeHostsKernel(
231  double* __restrict__ d_f_normal_x,
232  double* __restrict__ d_f_normal_y,
233  double* __restrict__ d_f_normal_z,
234  double* __restrict__ d_f_nbond_x,
235  double* __restrict__ d_f_nbond_y,
236  double* __restrict__ d_f_nbond_z,
237  double* __restrict__ d_f_slow_x,
238  double* __restrict__ d_f_slow_y,
239  double* __restrict__ d_f_slow_z,
240  cudaTensor* __restrict__ d_virial_normal,
241  cudaTensor* __restrict__ d_virial_nbond,
242  cudaTensor* __restrict__ d_virial_slow,
243  const double* __restrict__ d_pos_x,
244  const double* __restrict__ d_pos_y,
245  const double* __restrict__ d_pos_z,
246  const LplistT* __restrict__ d_lp_list,
247  size_t lp_list_size) {
248  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
249  cudaTensor vir_slow, vir_nbond, vir_normal;
250  if (doVirial) {
251  switch (maxForceNumer) {
252  // Intentionally fall-through
253  case 2: {
254  vir_slow.xx = 0;
255  vir_slow.xy = 0;
256  vir_slow.xz = 0;
257  vir_slow.yx = 0;
258  vir_slow.yy = 0;
259  vir_slow.yz = 0;
260  vir_slow.zx = 0;
261  vir_slow.zy = 0;
262  vir_slow.zz = 0;
263  }
264  case 1: {
265  vir_nbond.xx = 0;
266  vir_nbond.xy = 0;
267  vir_nbond.xz = 0;
268  vir_nbond.yx = 0;
269  vir_nbond.yy = 0;
270  vir_nbond.yz = 0;
271  vir_nbond.zx = 0;
272  vir_nbond.zy = 0;
273  vir_nbond.zz = 0;
274  }
275  case 0: {
276  vir_normal.xx = 0;
277  vir_normal.xy = 0;
278  vir_normal.xz = 0;
279  vir_normal.yx = 0;
280  vir_normal.yy = 0;
281  vir_normal.yz = 0;
282  vir_normal.zx = 0;
283  vir_normal.zy = 0;
284  vir_normal.zz = 0;
285  }
286  }
287  }
288  if (tid < lp_list_size) {
289  const int i = d_lp_list[tid].i_soaid;
290  const int j = d_lp_list[tid].j_soaid;
291  const int k = d_lp_list[tid].k_soaid;
292  const int l = d_lp_list[tid].l_soaid;
293  // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
294  const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
295  const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
296  const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
297  const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
298  Vector fi[maxForceNumer+1];
299  Vector fj[maxForceNumer+1];
300  Vector fk[maxForceNumer+1];
301  Vector fl[maxForceNumer+1];
302  #pragma unroll
303  for (int m = 0; m < maxForceNumer + 1; ++m) {
304  fj[m] = 0.0;
305  fk[m] = 0.0;
306  fl[m] = 0.0;
307  }
308  switch (maxForceNumer) {
309  // Intentionally fall-through
310  case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
311  case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
312  case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
313  }
314  switch (lptype) {
315  case LonepairThreeHostsType::Relative: {
316  projectRelativeForces<maxForceNumer+1>(ri, rj, rk, rl, fi, fj, fk, fl);
317  break;
318  }
319  case LonepairThreeHostsType::Bisector: {
320  // The bisector case reduces to the relative case with the vertex order
321  // changed from (i, j, k, l) to (i, j, m, k), where m is the midpoint of
322  // k and l, so the same projection function can be reused.
323  const Vector rm = 0.5 * (rk + rl);
324  // Pass fl in the third force slot so that the force on m accumulates in fl
325  projectRelativeForces<maxForceNumer+1>(ri, rj, rm, rk, fi, fj, fl, fk);
326  // After the call, fl holds the force on m and fk holds the force on k.
327  // The real force on k is 0.5 * fl + fk
328  // The real force on l is 0.5 * fl
329  #pragma unroll
330  for (int m = 0; m < maxForceNumer + 1; ++m) {
331  fl[m] *= 0.5;
332  fk[m] += fl[m];
333  }
334  break;
335  }
336  }
337  // Add the forces back to jkl
338  switch (maxForceNumer) {
339  // Intentionally fall-through
340  case 2: {
341  atomicAdd(&(d_f_slow_x[j]), fj[2].x);
342  atomicAdd(&(d_f_slow_y[j]), fj[2].y);
343  atomicAdd(&(d_f_slow_z[j]), fj[2].z);
344  atomicAdd(&(d_f_slow_x[k]), fk[2].x);
345  atomicAdd(&(d_f_slow_y[k]), fk[2].y);
346  atomicAdd(&(d_f_slow_z[k]), fk[2].z);
347  atomicAdd(&(d_f_slow_x[l]), fl[2].x);
348  atomicAdd(&(d_f_slow_y[l]), fl[2].y);
349  atomicAdd(&(d_f_slow_z[l]), fl[2].z);
350  }
351  case 1: {
352  atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
353  atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
354  atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
355  atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
356  atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
357  atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
358  atomicAdd(&(d_f_nbond_x[l]), fl[1].x);
359  atomicAdd(&(d_f_nbond_y[l]), fl[1].y);
360  atomicAdd(&(d_f_nbond_z[l]), fl[1].z);
361  }
362  case 0: {
363  atomicAdd(&(d_f_normal_x[j]), fj[0].x);
364  atomicAdd(&(d_f_normal_y[j]), fj[0].y);
365  atomicAdd(&(d_f_normal_z[j]), fj[0].z);
366  atomicAdd(&(d_f_normal_x[k]), fk[0].x);
367  atomicAdd(&(d_f_normal_y[k]), fk[0].y);
368  atomicAdd(&(d_f_normal_z[k]), fk[0].z);
369  atomicAdd(&(d_f_normal_x[l]), fl[0].x);
370  atomicAdd(&(d_f_normal_y[l]), fl[0].y);
371  atomicAdd(&(d_f_normal_z[l]), fl[0].z);
372  }
373  }
374  // Compute the virial contribution after redistributing the forces
375  if (doVirial) {
376  switch (maxForceNumer) {
377  // Intentionally fall-through
378  case 2: {
379  vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x + fl[2].x * rl.x - fi[2].x * ri.x;
380  vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y + fl[2].x * rl.y - fi[2].x * ri.y;
381  vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z + fl[2].x * rl.z - fi[2].x * ri.z;
382  vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x + fl[2].y * rl.x - fi[2].y * ri.x;
383  vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y + fl[2].y * rl.y - fi[2].y * ri.y;
384  vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z + fl[2].y * rl.z - fi[2].y * ri.z;
385  vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x + fl[2].z * rl.x - fi[2].z * ri.x;
386  vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y + fl[2].z * rl.y - fi[2].z * ri.y;
387  vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z + fl[2].z * rl.z - fi[2].z * ri.z;
388  }
389  case 1: {
390  vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x + fl[1].x * rl.x - fi[1].x * ri.x;
391  vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y + fl[1].x * rl.y - fi[1].x * ri.y;
392  vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z + fl[1].x * rl.z - fi[1].x * ri.z;
393  vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x + fl[1].y * rl.x - fi[1].y * ri.x;
394  vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y + fl[1].y * rl.y - fi[1].y * ri.y;
395  vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z + fl[1].y * rl.z - fi[1].y * ri.z;
396  vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x + fl[1].z * rl.x - fi[1].z * ri.x;
397  vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y + fl[1].z * rl.y - fi[1].z * ri.y;
398  vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z + fl[1].z * rl.z - fi[1].z * ri.z;
399  }
400  case 0: {
401  vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x + fl[0].x * rl.x - fi[0].x * ri.x;
402  vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y + fl[0].x * rl.y - fi[0].x * ri.y;
403  vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z + fl[0].x * rl.z - fi[0].x * ri.z;
404  vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x + fl[0].y * rl.x - fi[0].y * ri.x;
405  vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y + fl[0].y * rl.y - fi[0].y * ri.y;
406  vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z + fl[0].y * rl.z - fi[0].y * ri.z;
407  vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x + fl[0].z * rl.x - fi[0].z * ri.x;
408  vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y + fl[0].z * rl.y - fi[0].z * ri.y;
409  vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z + fl[0].z * rl.z - fi[0].z * ri.z;
410  }
411  }
412  }
413  // Zero out the forces on the lone pair
414  switch (maxForceNumer) {
415  // Intentionally fall-through
416  case 2: {
417  d_f_slow_x[i] = 0;
418  d_f_slow_y[i] = 0;
419  d_f_slow_z[i] = 0;
420  }
421  case 1: {
422  d_f_nbond_x[i] = 0;
423  d_f_nbond_y[i] = 0;
424  d_f_nbond_z[i] = 0;
425  }
426  case 0: {
427  d_f_normal_x[i] = 0;
428  d_f_normal_y[i] = 0;
429  d_f_normal_z[i] = 0;
430  }
431  }
432  }
433  typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
434  __shared__ typename BlockReduce::TempStorage temp_storage;
435  if (doVirial) {
436  switch (maxForceNumer) {
437  // Intentionally fall-through
438  case 2: {
439  vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
440  __syncthreads();
441  }
442  case 1: {
443  vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
444  __syncthreads();
445  }
446  case 0: {
447  vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
448  __syncthreads();
449  }
450  }
451  if (threadIdx.x == 0) {
452  switch (maxForceNumer) {
453  // Intentionally fall-through
454  case 2: {
455  atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
456  atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
457  atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
458  atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
459  atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
460  atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
461  atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
462  atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
463  atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
464  }
465  case 1: {
466  atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
467  atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
468  atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
469  atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
470  atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
471  atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
472  atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
473  atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
474  atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
475  }
476  case 0: {
477  atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
478  atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
479  atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
480  atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
481  atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
482  atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
483  atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
484  atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
485  atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
486  }
487  }
488  }
489  }
490 }
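
// The virial accumulation above relies on cub::BlockReduce with a custom
// binary operator and a single shared TempStorage that is reused across the
// fall-through cases (hence the __syncthreads() between reductions). A
// minimal standalone sketch of the same pattern, with the hypothetical name
// blockSumKernel, reducing plain doubles instead of cudaTensor:
template <int blockSize>
__global__ void blockSumKernel(const double* __restrict__ in, double* out, int n) {
  typedef cub::BlockReduce<double, blockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // Out-of-range threads contribute zero, mirroring the tid < lp_list_size guard
  const double v = (tid < n) ? in[tid] : 0.0;
  const double block_sum = BlockReduce(temp_storage).Sum(v);
  if (threadIdx.x == 0) {
    atomicAdd(out, block_sum);  // one atomic per block, as with the virial tensors
  }
}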
491 
492 void redistributeForceRelative(
493  double* d_f_normal_x,
494  double* d_f_normal_y,
495  double* d_f_normal_z,
496  double* d_f_nbond_x,
497  double* d_f_nbond_y,
498  double* d_f_nbond_z,
499  double* d_f_slow_x,
500  double* d_f_slow_y,
501  double* d_f_slow_z,
502  cudaTensor* d_virial_normal,
503  cudaTensor* d_virial_nbond,
504  cudaTensor* d_virial_slow,
505  const double* d_pos_x,
506  const double* d_pos_y,
507  const double* d_pos_z,
508  const int maxForceNumber,
509  const int doVirial,
510  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
511  size_t lprelative_list_size,
512  cudaStream_t stream) {
513  const int block_size = 128;
514  const int grid = (lprelative_list_size + block_size - 1) / block_size;
515 #define CALL(MAXFORCENUMBER, DOVIRIAL) \
516  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairRelative, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Relative> \
517  <<<grid, block_size, 0, stream>>>( \
518  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
519  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
520  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
521  d_virial_normal, d_virial_nbond, d_virial_slow, \
522  d_pos_x, d_pos_y, d_pos_z, \
523  d_lprelative_list, lprelative_list_size \
524  );
525  const int option = (maxForceNumber << 0) + (doVirial << 2);
526  switch (option) {
527  case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
528  case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
529  case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
530  case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
531  case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
532  case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
533  default: {
534  const std::string error =
535  "redistributeForceRelative: no kernel called (maxForceNumber = " +
536  std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
537  NAMD_bug(error.c_str());
538  }
539  }
540 #undef CALL
541 }
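
// The dispatch above packs the runtime flags into one switch key: bits 0-1
// hold maxForceNumber (0, 1 or 2) and bit 2 holds doVirial, so
// option = maxForceNumber + (doVirial << 2). A sketch of the same encoding as
// a helper (the name packLonepairOption is illustrative):
constexpr int packLonepairOption(const int maxForceNumber, const int doVirial) {
  return (maxForceNumber << 0) + (doVirial << 2);
}
// For example, packLonepairOption(2, 1) == ((2 << 0) + (1 << 2)), which
// selects CALL(2, 1) in the switch above.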
542 
543 void redistributeForceBisector(
544  double* d_f_normal_x,
545  double* d_f_normal_y,
546  double* d_f_normal_z,
547  double* d_f_nbond_x,
548  double* d_f_nbond_y,
549  double* d_f_nbond_z,
550  double* d_f_slow_x,
551  double* d_f_slow_y,
552  double* d_f_slow_z,
553  cudaTensor* d_virial_normal,
554  cudaTensor* d_virial_nbond,
555  cudaTensor* d_virial_slow,
556  const double* d_pos_x,
557  const double* d_pos_y,
558  const double* d_pos_z,
559  const int maxForceNumber,
560  const int doVirial,
561  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
562  size_t lpbisector_list_size,
563  cudaStream_t stream) {
564  const int block_size = 128;
565  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
566 #define CALL(MAXFORCENUMBER, DOVIRIAL) \
567  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairBisector, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Bisector> \
568  <<<grid, block_size, 0, stream>>>( \
569  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
570  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
571  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
572  d_virial_normal, d_virial_nbond, d_virial_slow, \
573  d_pos_x, d_pos_y, d_pos_z, \
574  d_lpbisector_list, lpbisector_list_size \
575  );
576  const int option = (maxForceNumber << 0) + (doVirial << 2);
577  switch (option) {
578  case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
579  case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
580  case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
581  case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
582  case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
583  case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
584  default: {
585  const std::string error =
586  "redistributeForceBisector: no kernel called (maxForceNumber = " +
587  std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
588  NAMD_bug(error.c_str());
589  }
590  }
591 #undef CALL
592 }
593 
594 template <int maxForceNumer, bool doVirial, int blockSize>
595 __global__ void redistributeForceColinearKernel(
596  double* __restrict__ d_f_normal_x,
597  double* __restrict__ d_f_normal_y,
598  double* __restrict__ d_f_normal_z,
599  double* __restrict__ d_f_nbond_x,
600  double* __restrict__ d_f_nbond_y,
601  double* __restrict__ d_f_nbond_z,
602  double* __restrict__ d_f_slow_x,
603  double* __restrict__ d_f_slow_y,
604  double* __restrict__ d_f_slow_z,
605  cudaTensor* __restrict__ d_virial_normal,
606  cudaTensor* __restrict__ d_virial_nbond,
607  cudaTensor* __restrict__ d_virial_slow,
608  const double* __restrict__ d_pos_x,
609  const double* __restrict__ d_pos_y,
610  const double* __restrict__ d_pos_z,
611  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lp_list,
612  size_t lp_list_size) {
613  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
614  cudaTensor vir_slow, vir_nbond, vir_normal;
615  if (doVirial) {
616  switch (maxForceNumer) {
617  // Intentionally fall-through
618  case 2: {
619  vir_slow.xx = 0;
620  vir_slow.xy = 0;
621  vir_slow.xz = 0;
622  vir_slow.yx = 0;
623  vir_slow.yy = 0;
624  vir_slow.yz = 0;
625  vir_slow.zx = 0;
626  vir_slow.zy = 0;
627  vir_slow.zz = 0;
628  }
629  case 1: {
630  vir_nbond.xx = 0;
631  vir_nbond.xy = 0;
632  vir_nbond.xz = 0;
633  vir_nbond.yx = 0;
634  vir_nbond.yy = 0;
635  vir_nbond.yz = 0;
636  vir_nbond.zx = 0;
637  vir_nbond.zy = 0;
638  vir_nbond.zz = 0;
639  }
640  case 0: {
641  vir_normal.xx = 0;
642  vir_normal.xy = 0;
643  vir_normal.xz = 0;
644  vir_normal.yx = 0;
645  vir_normal.yy = 0;
646  vir_normal.yz = 0;
647  vir_normal.zx = 0;
648  vir_normal.zy = 0;
649  vir_normal.zz = 0;
650  }
651  }
652  }
653  if (tid < lp_list_size) {
654  const int i = d_lp_list[tid].i_soaid;
655  const int j = d_lp_list[tid].j_soaid;
656  const int k = d_lp_list[tid].k_soaid;
657  double distance = d_lp_list[tid].distance;
658  const double scale_factor = d_lp_list[tid].scale_factor;
659  const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
660  const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
661  const Vector rkj = rj - rk;
662  const double inv_rkj_norm = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
663  distance *= inv_rkj_norm;
664  // Prepare the force buffers
665  Vector fi[maxForceNumer+1];
666  Vector fj[maxForceNumer+1];
667  Vector fk[maxForceNumer+1];
668  #pragma unroll
669  for (int m = 0; m < maxForceNumer + 1; ++m) {
670  fj[m] = 0.0;
671  fk[m] = 0.0;
672  }
673  switch (maxForceNumer) {
674  // Intentionally fall-through
675  case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
676  case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
677  case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
678  }
679  // Project the forces
680  #pragma unroll
681  for (int m = 0; m < maxForceNumer + 1; ++m) {
682  const double fdot = distance * (fi[m].dot(rkj)) * inv_rkj_norm * inv_rkj_norm;
683  fj[m] = (1.0 + scale_factor + distance) * fi[m] - fdot * rkj;
684  fk[m] = fi[m] - fj[m];
685  }
686  // Add the forces back
687  switch (maxForceNumer) {
688  // Intentionally fall-through
689  case 2: {
690  atomicAdd(&(d_f_slow_x[j]), fj[2].x);
691  atomicAdd(&(d_f_slow_y[j]), fj[2].y);
692  atomicAdd(&(d_f_slow_z[j]), fj[2].z);
693  atomicAdd(&(d_f_slow_x[k]), fk[2].x);
694  atomicAdd(&(d_f_slow_y[k]), fk[2].y);
695  atomicAdd(&(d_f_slow_z[k]), fk[2].z);
696  }
697  case 1: {
698  atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
699  atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
700  atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
701  atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
702  atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
703  atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
704  }
705  case 0: {
706  atomicAdd(&(d_f_normal_x[j]), fj[0].x);
707  atomicAdd(&(d_f_normal_y[j]), fj[0].y);
708  atomicAdd(&(d_f_normal_z[j]), fj[0].z);
709  atomicAdd(&(d_f_normal_x[k]), fk[0].x);
710  atomicAdd(&(d_f_normal_y[k]), fk[0].y);
711  atomicAdd(&(d_f_normal_z[k]), fk[0].z);
712  }
713  }
714  if (doVirial) {
715  const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
716  switch (maxForceNumer) {
717  // Intentionally fall-through
718  case 2: {
719  vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x - fi[2].x * ri.x;
720  vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y - fi[2].x * ri.y;
721  vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z - fi[2].x * ri.z;
722  vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x - fi[2].y * ri.x;
723  vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y - fi[2].y * ri.y;
724  vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z - fi[2].y * ri.z;
725  vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x - fi[2].z * ri.x;
726  vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y - fi[2].z * ri.y;
727  vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z - fi[2].z * ri.z;
728  }
729  case 1: {
730  vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x - fi[1].x * ri.x;
731  vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y - fi[1].x * ri.y;
732  vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z - fi[1].x * ri.z;
733  vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x - fi[1].y * ri.x;
734  vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y - fi[1].y * ri.y;
735  vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z - fi[1].y * ri.z;
736  vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x - fi[1].z * ri.x;
737  vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y - fi[1].z * ri.y;
738  vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z - fi[1].z * ri.z;
739  }
740  case 0: {
741  vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x - fi[0].x * ri.x;
742  vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y - fi[0].x * ri.y;
743  vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z - fi[0].x * ri.z;
744  vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x - fi[0].y * ri.x;
745  vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y - fi[0].y * ri.y;
746  vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z - fi[0].y * ri.z;
747  vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x - fi[0].z * ri.x;
748  vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y - fi[0].z * ri.y;
749  vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z - fi[0].z * ri.z;
750  }
751  }
752  }
753  // Zero out the forces on the lone pair
754  switch (maxForceNumer) {
755  // Intentionally fall-through
756  case 2: {
757  d_f_slow_x[i] = 0;
758  d_f_slow_y[i] = 0;
759  d_f_slow_z[i] = 0;
760  }
761  case 1: {
762  d_f_nbond_x[i] = 0;
763  d_f_nbond_y[i] = 0;
764  d_f_nbond_z[i] = 0;
765  }
766  case 0: {
767  d_f_normal_x[i] = 0;
768  d_f_normal_y[i] = 0;
769  d_f_normal_z[i] = 0;
770  }
771  }
772  }
773  typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
774  __shared__ typename BlockReduce::TempStorage temp_storage;
775  if (doVirial) {
776  switch (maxForceNumer) {
777  // Intentionally fall-through
778  case 2: {
779  vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
780  __syncthreads();
781  }
782  case 1: {
783  vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
784  __syncthreads();
785  }
786  case 0: {
787  vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
788  __syncthreads();
789  }
790  }
791  if (threadIdx.x == 0) {
792  switch (maxForceNumer) {
793  // Intentionally fall-through
794  case 2: {
795  atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
796  atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
797  atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
798  atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
799  atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
800  atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
801  atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
802  atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
803  atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
804  }
805  case 1: {
806  atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
807  atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
808  atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
809  atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
810  atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
811  atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
812  atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
813  atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
814  atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
815  }
816  case 0: {
817  atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
818  atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
819  atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
820  atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
821  atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
822  atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
823  atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
824  atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
825  atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
826  }
827  }
828  }
829  }
830 }
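
// A host-side sketch of the colinear force split used above: with
// w = distance / |rkj|, the lone-pair force fi is divided as
//   fj = (1 + scale_factor + w) * fi - w * (fi . rkj) * rkj / |rkj|^2
//   fk = fi - fj
// so the hosts always receive the full force. cpuProjectColinearForce is an
// illustrative helper name.
inline void cpuProjectColinearForce(
    const Vector& fi, const Vector& rj, const Vector& rk,
    double distance, double scale_factor, Vector& fj, Vector& fk) {
  const Vector rkj = rj - rk;
  const double inv_r = 1.0 / rkj.length();
  const double w = distance * inv_r;                    // distance in units of |rkj|
  const double fdot = w * fi.dot(rkj) * inv_r * inv_r;  // component along rkj
  fj = (1.0 + scale_factor + w) * fi - fdot * rkj;
  fk = fi - fj;
}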
831 
832 void redistributeForceColinear(
833  double* d_f_normal_x,
834  double* d_f_normal_y,
835  double* d_f_normal_z,
836  double* d_f_nbond_x,
837  double* d_f_nbond_y,
838  double* d_f_nbond_z,
839  double* d_f_slow_x,
840  double* d_f_slow_y,
841  double* d_f_slow_z,
842  cudaTensor* d_virial_normal,
843  cudaTensor* d_virial_nbond,
844  cudaTensor* d_virial_slow,
845  const double* d_pos_x,
846  const double* d_pos_y,
847  const double* d_pos_z,
848  const int maxForceNumber,
849  const int doVirial,
850  const ComputeLonepairsCUDA::LonepairColinear* d_lpcolinear_list,
851  size_t lpcolinear_list_size,
852  cudaStream_t stream) {
853  const int block_size = 128;
854  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
855 #define CALL(MAXFORCENUMBER, DOVIRIAL) \
856  redistributeForceColinearKernel<MAXFORCENUMBER, DOVIRIAL, block_size> \
857  <<<grid, block_size, 0, stream>>>( \
858  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
859  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
860  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
861  d_virial_normal, d_virial_nbond, d_virial_slow, \
862  d_pos_x, d_pos_y, d_pos_z, \
863  d_lpcolinear_list, lpcolinear_list_size);
864  const int option = (maxForceNumber << 0) + (doVirial << 2);
865  switch (option) {
866  case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
867  case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
868  case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
869  case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
870  case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
871  case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
872  default: {
873  const std::string error =
874  "redistributeForceColinear: no kernel called (maxForceNumber = " +
875  std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
876  NAMD_bug(error.c_str());
877  }
878  }
879 #undef CALL
880 }
881 
882 #endif // defined(NAMD_CUDA) || defined(NAMD_HIP)