#include "ComputeLonepairsCUDA.h"
#include "ComputeLonepairsCUDAKernel.h"

#include <string>

#ifdef NAMD_HIP
#include <hipcub/hipcub.hpp>
#define cub hipcub
#else
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
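// A sketch of that construction, assuming the precomputed scalars are
//   dcosa     = d*cos(theta)
//   dsinacost = d*sin(theta)*cos(phi)
//   dsinasint = d*sin(theta)*sin(phi)
// with d the lone pair-host distance, theta the angle and phi the dihedral of the
// (i, j, k, l) quadruple.  The kernel builds the local frame
//   u_jk = (rk - rj)/|rk - rj|,  u_kl = (rl - rk)/|rl - rk|,
//   u3 = u_jk x u_kl,            u2 = u3 x u_jk
// and places the lone pair at
//   ri = rj + dcosa*u_jk + dsinacost*u2 - dsinasint*u3,
// i.e. the three scalars are the coordinates of the lone pair in that frame anchored at rj.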
__global__ void repositionRelativeKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairRelative* __restrict__ d_lprelative_list,
  size_t lprelative_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lprelative_list_size) {
    const int i = d_lprelative_list[tid].i_soaid;
    const int j = d_lprelative_list[tid].j_soaid;
    const int k = d_lprelative_list[tid].k_soaid;
    const int l = d_lprelative_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
    const double dcosa = d_lprelative_list[tid].dcosa;
    const double dsinacost = d_lprelative_list[tid].dsinacost;
    const double dsinasint = d_lprelative_list[tid].dsinasint;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    // I use the unit vectors directly instead of rjk and rkl
    const Vector ujk = (rk - rj).unit();
    const Vector ukl = (rl - rk).unit();
    const Vector u3 = cross(ujk, ukl);
    const Vector u2 = cross(u3, ujk);
    const Vector ri = rj + dcosa * ujk + dsinacost * u2 - dsinasint * u3;
    // Write the repositioned lone pair back
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionRelative(
  double* d_pos_x,
  double* d_pos_y,
  double* d_pos_z,
  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
  size_t lprelative_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lprelative_list_size + block_size - 1) / block_size;
  repositionRelativeKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lprelative_list, lprelative_list_size);
}

__global__ void repositionBisectorKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairBisector* __restrict__ d_lpbisector_list,
  size_t lpbisector_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lpbisector_list_size) {
    const int i = d_lpbisector_list[tid].i_soaid;
    const int j = d_lpbisector_list[tid].j_soaid;
    const int k = d_lpbisector_list[tid].k_soaid;
    const int l = d_lpbisector_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
    const double dcosa = d_lpbisector_list[tid].dcosa;
    const double dsinacost = d_lpbisector_list[tid].dsinacost;
    const double dsinasint = d_lpbisector_list[tid].dsinasint;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    // The midpoint of l and k
    const Vector rb = 0.5 * (rl + rk);
    // The only difference from repositionRelativeKernel is a rotation of the
    // subscripts: rk -> rb and rl -> rk.
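    // In other words, the bisector placement is the relative placement evaluated on
    // the quadruple (i, j, b, k) with rb = (rk + rl)/2, so the same local-frame
    // construction is reused below unchanged.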
    const Vector ujk = (rb - rj).unit();
    const Vector ukl = (rk - rb).unit();
    const Vector u3 = cross(ujk, ukl);
    const Vector u2 = cross(u3, ujk);
    const Vector ri = rj + dcosa * ujk + dsinacost * u2 - dsinasint * u3;
    // Write the repositioned lone pair back
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionBisector(
  double* d_pos_x,
  double* d_pos_y,
  double* d_pos_z,
  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
  size_t lpbisector_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
  repositionBisectorKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lpbisector_list, lpbisector_list_size);
}

__global__ void repositionColinearKernel(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
  size_t lpcolinear_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < lpcolinear_list_size) {
    const int i = d_lpcolinear_list[tid].i_soaid;
    const int j = d_lpcolinear_list[tid].j_soaid;
    const int k = d_lpcolinear_list[tid].k_soaid;
    const double distance = d_lpcolinear_list[tid].distance;
    const double scale_factor = d_lpcolinear_list[tid].scale_factor;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rkj = rj - rk;
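    // A sketch of the colinear placement, assuming `distance` is the fixed offset of
    // the lone pair along the k->j axis and `scale_factor` scales the j-k separation:
    //   ri = rj + scale_factor*(rj - rk) + distance*(rj - rk)/|rj - rk|
    // which is what the fused expression below evaluates with inv_r = 1/|rj - rk|.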
    const double inv_r = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
    const Vector ri = rj + (scale_factor + distance * inv_r) * rkj;
    // Write the repositioned lone pair back
    d_pos_x[i] = ri.x;
    d_pos_y[i] = ri.y;
    d_pos_z[i] = ri.z;
  }
}

void repositionColinear(
  double* __restrict__ d_pos_x,
  double* __restrict__ d_pos_y,
  double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lpcolinear_list,
  size_t lpcolinear_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
  repositionColinearKernel<<<grid, block_size, 0, stream>>>(
    d_pos_x, d_pos_y, d_pos_z, d_lpcolinear_list, lpcolinear_list_size);
}

// See https://github.com/HanatoK/lone_pair_force/blob/main/lone_pair_relative.ipynb for the maths
// Multiple forces (one per MTS force level) are projected at once.
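// The projection splits each fi into three parts that are moved onto the hosts j, k, l
// so that the total force and the total torque are unchanged (a sketch of the reasoning
// in the notebook linked above):
//   (1) the component along rji keeps the lone pair-host distance fixed and goes to j;
//   (2) the component normal to the ijk plane (the "dihedral" part fp) is split between
//       j, k and l by torque balance about the jk axis;
//   (3) the remaining in-plane "angle" part is split between j and k by torque balance
//       about j.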
template <int numForces>
__inline__ __device__ void projectRelativeForces(
  const Vector& ri,
  const Vector& rj,
  const Vector& rk,
  const Vector& rl,
  const Vector (&fi)[numForces],
  Vector (&fj)[numForces],
  Vector (&fk)[numForces],
  Vector (&fl)[numForces]) {
  const Vector rji = ri - rj;
  const Vector rjk = rk - rj;
  const Vector rkl = rl - rk;
  const Vector rki = ri - rk;
  const double inv_rjk_norm2 = 1.0 / rjk.length2();
  const Vector plane_jkl = cross(rjk, rkl);
  // Project the force on rji to keep the distance unchanged
  const double rji_norm2 = rji.length2();
  const double inv_rij_norm2 = 1.0 / rji_norm2;
  for (int m = 0; m < numForces; ++m) {
    fj[m] += inv_rij_norm2 * fi[m].dot(rji) * rji;
  }
  // Project the force on the dihedral angle to keep it unchanged
  Vector fp[numForces];
  for (int m = 0; m < numForces; ++m) {
    fp[m] = 0.0;
  }
  // Get the normal vector of plane ikj
  Vector normal_ikj = cross(rji, rjk);
  const double normal_ikj_norm = normal_ikj.length();
  // The arm of the force on plane ikj (v2 + v3)
  const Vector hijk_ri = rji - (rji.dot(rjk) * inv_rjk_norm2 * rjk);
  // Only project the force onto plane ijk if i, j and k are not collinear
  // (if rji and rjk are collinear then rji×rjk is zero)
  if (normal_ikj_norm > 1e-6) {
    normal_ikj /= normal_ikj_norm;
    // Force on plane ikj
    for (int m = 0; m < numForces; ++m) {
      fp[m] = fi[m].dot(normal_ikj) * normal_ikj;
    }
    // Torque of the force on l after projecting the force on plane ijk to l
    const Vector dir_l_to_jk = cross(plane_jkl, rjk).unit();
    const Vector h_l = rkl.dot(dir_l_to_jk) * dir_l_to_jk;
    const double inv_h_l_norm2 = 1.0 / h_l.length2();
    for (int m = 0; m < numForces; ++m) {
      const Vector torque_p = cross(hijk_ri, fp[m]);
      const Vector fpl = inv_h_l_norm2 * cross(torque_p, h_l);
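      // |h_l| is the lever arm of l about the jk axis, so a force
      // fpl = (torque_p × h_l)/|h_l|^2, applied at l and perpendicular to h_l,
      // reproduces the torque of fp about that axis.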
      // (a) The remaining force on j and k after subtracting the force on l from the torsional force
      const Vector fj_prime = inv_rjk_norm2 * cross(cross(rki, fp[m]) - cross(rkl, fpl), -rjk);
      const Vector fk_prime = fp[m] - fpl - fj_prime;
      // (b) The remaining angular force
      const Vector fai = fi[m] - fp[m] - fj[m];
      // Sum the torsional force (a) on j, k and l
      fj[m] += fj_prime;
      fk[m] += fk_prime;
      fl[m] += fpl;
      // Torque of the angular force (b) on k
      const Vector torque_k = cross(rji, fai);
      const Vector fak = cross(torque_k, rjk) * inv_rjk_norm2;
      // The rest of the angular force (b) goes to j so that the total force is conserved
      fj[m] += fai - fak;
      fk[m] += fak;
    }
  }
}

enum class LonepairThreeHostsType {Relative, Bisector};

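// maxForceNumer selects which MTS force arrays are touched, mirroring the fall-through
// switches below: 0 -> normal only, 1 -> normal + nonbonded, 2 -> normal + nonbonded + slow.
// When doVirial is set, the per-block virial corrections are also accumulated into
// d_virial_normal, d_virial_nbond and d_virial_slow.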
template <typename LplistT, int maxForceNumer, bool doVirial, int blockSize, LonepairThreeHostsType lptype>
__global__ void redistributeForceThreeHostsKernel(
  double* __restrict__ d_f_normal_x,
  double* __restrict__ d_f_normal_y,
  double* __restrict__ d_f_normal_z,
  double* __restrict__ d_f_nbond_x,
  double* __restrict__ d_f_nbond_y,
  double* __restrict__ d_f_nbond_z,
  double* __restrict__ d_f_slow_x,
  double* __restrict__ d_f_slow_y,
  double* __restrict__ d_f_slow_z,
  cudaTensor* __restrict__ d_virial_normal,
  cudaTensor* __restrict__ d_virial_nbond,
  cudaTensor* __restrict__ d_virial_slow,
  const double* __restrict__ d_pos_x,
  const double* __restrict__ d_pos_y,
  const double* __restrict__ d_pos_z,
  const LplistT* __restrict__ d_lp_list,
  size_t lp_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_slow, vir_nbond, vir_normal;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        vir_slow.xx = 0; vir_slow.xy = 0; vir_slow.xz = 0;
        vir_slow.yx = 0; vir_slow.yy = 0; vir_slow.yz = 0;
        vir_slow.zx = 0; vir_slow.zy = 0; vir_slow.zz = 0;
      case 1:
        vir_nbond.xx = 0; vir_nbond.xy = 0; vir_nbond.xz = 0;
        vir_nbond.yx = 0; vir_nbond.yy = 0; vir_nbond.yz = 0;
        vir_nbond.zx = 0; vir_nbond.zy = 0; vir_nbond.zz = 0;
      case 0:
        vir_normal.xx = 0; vir_normal.xy = 0; vir_normal.xz = 0;
        vir_normal.yx = 0; vir_normal.yy = 0; vir_normal.yz = 0;
        vir_normal.zx = 0; vir_normal.zy = 0; vir_normal.zz = 0;
    }
  }
  if (tid < lp_list_size) {
    const int i = d_lp_list[tid].i_soaid;
    const int j = d_lp_list[tid].j_soaid;
    const int k = d_lp_list[tid].k_soaid;
    const int l = d_lp_list[tid].l_soaid;
    // Bernard Brooks: Do all LP math in double precision. Cost is not excessive.
    const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rl{d_pos_x[l], d_pos_y[l], d_pos_z[l]};
    Vector fi[maxForceNumer+1];
    Vector fj[maxForceNumer+1];
    Vector fk[maxForceNumer+1];
    Vector fl[maxForceNumer+1];
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      fj[m] = 0.0;
      fk[m] = 0.0;
      fl[m] = 0.0;
    }
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
      case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
      case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
    }
    switch (lptype) {
      case LonepairThreeHostsType::Relative: {
        projectRelativeForces<maxForceNumer+1>(ri, rj, rk, rl, fi, fj, fk, fl);
        break;
      }
      case LonepairThreeHostsType::Bisector: {
        // The difference between the bisector and the relative cases is that the
        // order of the vertices of the former is changed to ijmk from ijkl, so we
        // can just reuse the function.
        const Vector rm = 0.5 * (rk + rl);
        // Pass fl as the third force buffer so that it receives the force on m
        projectRelativeForces<maxForceNumer+1>(ri, rj, rm, rk, fi, fj, fl, fk);
        // fl and fk now store the forces on m and k, respectively.
        // The real force on k is 0.5 * fl + fk
        // The real force on l is 0.5 * fl
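        // The halves come from the chain rule: with rm = (rk + rl)/2 we have
        // d(rm)/d(rk) = d(rm)/d(rl) = I/2, so a force acting on the virtual point m
        // maps to half of it on k and half of it on l.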
        for (int m = 0; m < maxForceNumer + 1; ++m) {
          fk[m] += 0.5 * fl[m];
          fl[m] *= 0.5;
        }
        break;
      }
    }
    // Add the forces back to j, k and l
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        atomicAdd(&(d_f_slow_x[j]), fj[2].x);
        atomicAdd(&(d_f_slow_y[j]), fj[2].y);
        atomicAdd(&(d_f_slow_z[j]), fj[2].z);
        atomicAdd(&(d_f_slow_x[k]), fk[2].x);
        atomicAdd(&(d_f_slow_y[k]), fk[2].y);
        atomicAdd(&(d_f_slow_z[k]), fk[2].z);
        atomicAdd(&(d_f_slow_x[l]), fl[2].x);
        atomicAdd(&(d_f_slow_y[l]), fl[2].y);
        atomicAdd(&(d_f_slow_z[l]), fl[2].z);
      case 1:
        atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
        atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
        atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
        atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
        atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
        atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
        atomicAdd(&(d_f_nbond_x[l]), fl[1].x);
        atomicAdd(&(d_f_nbond_y[l]), fl[1].y);
        atomicAdd(&(d_f_nbond_z[l]), fl[1].z);
      case 0:
        atomicAdd(&(d_f_normal_x[j]), fj[0].x);
        atomicAdd(&(d_f_normal_y[j]), fj[0].y);
        atomicAdd(&(d_f_normal_z[j]), fj[0].z);
        atomicAdd(&(d_f_normal_x[k]), fk[0].x);
        atomicAdd(&(d_f_normal_y[k]), fk[0].y);
        atomicAdd(&(d_f_normal_z[k]), fk[0].z);
        atomicAdd(&(d_f_normal_x[l]), fl[0].x);
        atomicAdd(&(d_f_normal_y[l]), fl[0].y);
        atomicAdd(&(d_f_normal_z[l]), fl[0].z);
    }
    // Compute the virial contribution after redistributing the forces
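    // Because a force fi applied at ri is replaced by fj, fk, fl applied at rj, rk, rl,
    // the virial picks up the correction
    //   dW = fj⊗rj + fk⊗rk + fl⊗rl - fi⊗ri
    // (⊗ = outer product), which is what the component-wise sums below accumulate.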
    if (doVirial) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x + fl[2].x * rl.x - fi[2].x * ri.x;
          vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y + fl[2].x * rl.y - fi[2].x * ri.y;
          vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z + fl[2].x * rl.z - fi[2].x * ri.z;
          vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x + fl[2].y * rl.x - fi[2].y * ri.x;
          vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y + fl[2].y * rl.y - fi[2].y * ri.y;
          vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z + fl[2].y * rl.z - fi[2].y * ri.z;
          vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x + fl[2].z * rl.x - fi[2].z * ri.x;
          vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y + fl[2].z * rl.y - fi[2].z * ri.y;
          vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z + fl[2].z * rl.z - fi[2].z * ri.z;
        case 1:
          vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x + fl[1].x * rl.x - fi[1].x * ri.x;
          vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y + fl[1].x * rl.y - fi[1].x * ri.y;
          vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z + fl[1].x * rl.z - fi[1].x * ri.z;
          vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x + fl[1].y * rl.x - fi[1].y * ri.x;
          vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y + fl[1].y * rl.y - fi[1].y * ri.y;
          vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z + fl[1].y * rl.z - fi[1].y * ri.z;
          vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x + fl[1].z * rl.x - fi[1].z * ri.x;
          vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y + fl[1].z * rl.y - fi[1].z * ri.y;
          vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z + fl[1].z * rl.z - fi[1].z * ri.z;
        case 0:
          vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x + fl[0].x * rl.x - fi[0].x * ri.x;
          vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y + fl[0].x * rl.y - fi[0].x * ri.y;
          vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z + fl[0].x * rl.z - fi[0].x * ri.z;
          vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x + fl[0].y * rl.x - fi[0].y * ri.x;
          vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y + fl[0].y * rl.y - fi[0].y * ri.y;
          vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z + fl[0].y * rl.z - fi[0].y * ri.z;
          vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x + fl[0].z * rl.x - fi[0].z * ri.x;
          vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y + fl[0].z * rl.y - fi[0].z * ri.y;
          vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z + fl[0].z * rl.z - fi[0].z * ri.z;
      }
    }
    // Clean the forces on the lone pair
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        d_f_slow_x[i] = 0;
        d_f_slow_y[i] = 0;
        d_f_slow_z[i] = 0;
      case 1:
        d_f_nbond_x[i] = 0;
        d_f_nbond_y[i] = 0;
        d_f_nbond_z[i] = 0;
      case 0:
        d_f_normal_x[i] = 0;
        d_f_normal_y[i] = 0;
        d_f_normal_z[i] = 0;
    }
  }
  if (doVirial) {
    typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      case 1:
        vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      case 0:
        vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    }
    if (threadIdx.x == 0) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
          atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
          atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
          atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
          atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
          atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
          atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
          atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
          atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
        case 1:
          atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
          atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
          atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
          atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
          atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
          atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
          atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
          atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
          atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
        case 0:
          atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
          atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
          atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
          atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
          atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
          atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
          atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
          atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
          atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
      }
    }
  }
}

void redistributeForceRelative(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const int maxForceNumber,
  const bool doVirial,
  const ComputeLonepairsCUDA::LonepairRelative* d_lprelative_list,
  size_t lprelative_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lprelative_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairRelative, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Relative> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lprelative_list, lprelative_list_size \
  )
  const int option = (maxForceNumber << 0) + (doVirial << 2);
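  // option packs the runtime flags into one integer (bits 0-1: maxForceNumber = 0..2,
  // bit 2: doVirial) so that a single switch below can select the matching template
  // instantiation of the kernel.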
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceRelative: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
      break;
    }
  }
#undef CALL
}

void redistributeForceBisector(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const int maxForceNumber,
  const bool doVirial,
  const ComputeLonepairsCUDA::LonepairBisector* d_lpbisector_list,
  size_t lpbisector_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpbisector_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceThreeHostsKernel<ComputeLonepairsCUDA::LonepairBisector, MAXFORCENUMBER, DOVIRIAL, block_size, LonepairThreeHostsType::Bisector> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lpbisector_list, lpbisector_list_size \
  )
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceBisector: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());
      break;
    }
  }
#undef CALL
}

template <int maxForceNumer, bool doVirial, int blockSize>
__global__ void redistributeForceColinearKernel(
  double* __restrict__ d_f_normal_x,
  double* __restrict__ d_f_normal_y,
  double* __restrict__ d_f_normal_z,
  double* __restrict__ d_f_nbond_x,
  double* __restrict__ d_f_nbond_y,
  double* __restrict__ d_f_nbond_z,
  double* __restrict__ d_f_slow_x,
  double* __restrict__ d_f_slow_y,
  double* __restrict__ d_f_slow_z,
  cudaTensor* __restrict__ d_virial_normal,
  cudaTensor* __restrict__ d_virial_nbond,
  cudaTensor* __restrict__ d_virial_slow,
  const double* __restrict__ d_pos_x,
  const double* __restrict__ d_pos_y,
  const double* __restrict__ d_pos_z,
  const ComputeLonepairsCUDA::LonepairColinear* __restrict__ d_lp_list,
  size_t lp_list_size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_slow, vir_nbond, vir_normal;
  if (doVirial) {
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        vir_slow.xx = 0; vir_slow.xy = 0; vir_slow.xz = 0;
        vir_slow.yx = 0; vir_slow.yy = 0; vir_slow.yz = 0;
        vir_slow.zx = 0; vir_slow.zy = 0; vir_slow.zz = 0;
      case 1:
        vir_nbond.xx = 0; vir_nbond.xy = 0; vir_nbond.xz = 0;
        vir_nbond.yx = 0; vir_nbond.yy = 0; vir_nbond.yz = 0;
        vir_nbond.zx = 0; vir_nbond.zy = 0; vir_nbond.zz = 0;
      case 0:
        vir_normal.xx = 0; vir_normal.xy = 0; vir_normal.xz = 0;
        vir_normal.yx = 0; vir_normal.yy = 0; vir_normal.yz = 0;
        vir_normal.zx = 0; vir_normal.zy = 0; vir_normal.zz = 0;
    }
  }
  if (tid < lp_list_size) {
    const int i = d_lp_list[tid].i_soaid;
    const int j = d_lp_list[tid].j_soaid;
    const int k = d_lp_list[tid].k_soaid;
    double distance = d_lp_list[tid].distance;
    const double scale_factor = d_lp_list[tid].scale_factor;
    const Vector rj{d_pos_x[j], d_pos_y[j], d_pos_z[j]};
    const Vector rk{d_pos_x[k], d_pos_y[k], d_pos_z[k]};
    const Vector rkj = rj - rk;
    const double inv_rkj_norm = rnorm3d(rkj.x, rkj.y, rkj.z); // rkj.rlength()
    distance *= inv_rkj_norm;
    // Prepare the force buffers
    Vector fi[maxForceNumer+1];
    Vector fj[maxForceNumer+1];
    Vector fk[maxForceNumer+1];
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      fj[m] = 0.0;
      fk[m] = 0.0;
    }
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2: fi[2] = Vector{d_f_slow_x[i], d_f_slow_y[i], d_f_slow_z[i]};
      case 1: fi[1] = Vector{d_f_nbond_x[i], d_f_nbond_y[i], d_f_nbond_z[i]};
      case 0: fi[0] = Vector{d_f_normal_x[i], d_f_normal_y[i], d_f_normal_z[i]};
    }
    // Project the forces
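    // A sketch of where the coefficients below come from: the lone pair sits at
    //   ri = rj + (s + d/|rkj|) * rkj,  rkj = rj - rk,  s = scale_factor,  d = distance,
    // so differentiating with respect to rj and rk gives
    //   fj = (1 + s + d/|rkj|) * fi - d*(fi·rkj)/|rkj|^3 * rkj,   fk = fi - fj.
    // At this point `distance` has already been rescaled to d/|rkj|, hence the
    // expressions for fdot and fj below.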
    for (int m = 0; m < maxForceNumer + 1; ++m) {
      const double fdot = distance * (fi[m].dot(rkj)) * inv_rkj_norm * inv_rkj_norm;
      fj[m] = (1.0 + scale_factor + distance) * fi[m] - fdot * rkj;
      fk[m] = fi[m] - fj[m];
    }
    // Add the forces back to j and k
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        atomicAdd(&(d_f_slow_x[j]), fj[2].x);
        atomicAdd(&(d_f_slow_y[j]), fj[2].y);
        atomicAdd(&(d_f_slow_z[j]), fj[2].z);
        atomicAdd(&(d_f_slow_x[k]), fk[2].x);
        atomicAdd(&(d_f_slow_y[k]), fk[2].y);
        atomicAdd(&(d_f_slow_z[k]), fk[2].z);
      case 1:
        atomicAdd(&(d_f_nbond_x[j]), fj[1].x);
        atomicAdd(&(d_f_nbond_y[j]), fj[1].y);
        atomicAdd(&(d_f_nbond_z[j]), fj[1].z);
        atomicAdd(&(d_f_nbond_x[k]), fk[1].x);
        atomicAdd(&(d_f_nbond_y[k]), fk[1].y);
        atomicAdd(&(d_f_nbond_z[k]), fk[1].z);
      case 0:
        atomicAdd(&(d_f_normal_x[j]), fj[0].x);
        atomicAdd(&(d_f_normal_y[j]), fj[0].y);
        atomicAdd(&(d_f_normal_z[j]), fj[0].z);
        atomicAdd(&(d_f_normal_x[k]), fk[0].x);
        atomicAdd(&(d_f_normal_y[k]), fk[0].y);
        atomicAdd(&(d_f_normal_z[k]), fk[0].z);
    }
    // Compute the virial contribution after redistributing the forces
    if (doVirial) {
      const Vector ri{d_pos_x[i], d_pos_y[i], d_pos_z[i]};
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          vir_slow.xx = fj[2].x * rj.x + fk[2].x * rk.x - fi[2].x * ri.x;
          vir_slow.xy = fj[2].x * rj.y + fk[2].x * rk.y - fi[2].x * ri.y;
          vir_slow.xz = fj[2].x * rj.z + fk[2].x * rk.z - fi[2].x * ri.z;
          vir_slow.yx = fj[2].y * rj.x + fk[2].y * rk.x - fi[2].y * ri.x;
          vir_slow.yy = fj[2].y * rj.y + fk[2].y * rk.y - fi[2].y * ri.y;
          vir_slow.yz = fj[2].y * rj.z + fk[2].y * rk.z - fi[2].y * ri.z;
          vir_slow.zx = fj[2].z * rj.x + fk[2].z * rk.x - fi[2].z * ri.x;
          vir_slow.zy = fj[2].z * rj.y + fk[2].z * rk.y - fi[2].z * ri.y;
          vir_slow.zz = fj[2].z * rj.z + fk[2].z * rk.z - fi[2].z * ri.z;
        case 1:
          vir_nbond.xx = fj[1].x * rj.x + fk[1].x * rk.x - fi[1].x * ri.x;
          vir_nbond.xy = fj[1].x * rj.y + fk[1].x * rk.y - fi[1].x * ri.y;
          vir_nbond.xz = fj[1].x * rj.z + fk[1].x * rk.z - fi[1].x * ri.z;
          vir_nbond.yx = fj[1].y * rj.x + fk[1].y * rk.x - fi[1].y * ri.x;
          vir_nbond.yy = fj[1].y * rj.y + fk[1].y * rk.y - fi[1].y * ri.y;
          vir_nbond.yz = fj[1].y * rj.z + fk[1].y * rk.z - fi[1].y * ri.z;
          vir_nbond.zx = fj[1].z * rj.x + fk[1].z * rk.x - fi[1].z * ri.x;
          vir_nbond.zy = fj[1].z * rj.y + fk[1].z * rk.y - fi[1].z * ri.y;
          vir_nbond.zz = fj[1].z * rj.z + fk[1].z * rk.z - fi[1].z * ri.z;
        case 0:
          vir_normal.xx = fj[0].x * rj.x + fk[0].x * rk.x - fi[0].x * ri.x;
          vir_normal.xy = fj[0].x * rj.y + fk[0].x * rk.y - fi[0].x * ri.y;
          vir_normal.xz = fj[0].x * rj.z + fk[0].x * rk.z - fi[0].x * ri.z;
          vir_normal.yx = fj[0].y * rj.x + fk[0].y * rk.x - fi[0].y * ri.x;
          vir_normal.yy = fj[0].y * rj.y + fk[0].y * rk.y - fi[0].y * ri.y;
          vir_normal.yz = fj[0].y * rj.z + fk[0].y * rk.z - fi[0].y * ri.z;
          vir_normal.zx = fj[0].z * rj.x + fk[0].z * rk.x - fi[0].z * ri.x;
          vir_normal.zy = fj[0].z * rj.y + fk[0].z * rk.y - fi[0].z * ri.y;
          vir_normal.zz = fj[0].z * rj.z + fk[0].z * rk.z - fi[0].z * ri.z;
      }
    }
    // Clean the forces on the lone pair
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        d_f_slow_x[i] = 0;
        d_f_slow_y[i] = 0;
        d_f_slow_z[i] = 0;
      case 1:
        d_f_nbond_x[i] = 0;
        d_f_nbond_y[i] = 0;
        d_f_nbond_z[i] = 0;
      case 0:
        d_f_normal_x[i] = 0;
        d_f_normal_y[i] = 0;
        d_f_normal_z[i] = 0;
    }
  }
  if (doVirial) {
    typedef cub::BlockReduce<cudaTensor, blockSize> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    switch (maxForceNumer) {
      // Intentionally fall-through
      case 2:
        vir_slow = BlockReduce(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      case 1:
        vir_nbond = BlockReduce(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
        __syncthreads();
      case 0:
        vir_normal = BlockReduce(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    }
    if (threadIdx.x == 0) {
      switch (maxForceNumer) {
        // Intentionally fall-through
        case 2:
          atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
          atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
          atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
          atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
          atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
          atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
          atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
          atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
          atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
        case 1:
          atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
          atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
          atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
          atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
          atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
          atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
          atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
          atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
          atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
        case 0:
          atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
          atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
          atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
          atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
          atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
          atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
          atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
          atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
          atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
      }
    }
  }
}

void redistributeForceColinear(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const int maxForceNumber,
  const bool doVirial,
  const ComputeLonepairsCUDA::LonepairColinear* d_lpcolinear_list,
  size_t lpcolinear_list_size,
  cudaStream_t stream) {
  const int block_size = 128;
  const int grid = (lpcolinear_list_size + block_size - 1) / block_size;
#define CALL(MAXFORCENUMBER, DOVIRIAL) \
  redistributeForceColinearKernel<MAXFORCENUMBER, DOVIRIAL, block_size> \
  <<<grid, block_size, 0, stream>>>( \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_pos_x, d_pos_y, d_pos_z, \
    d_lpcolinear_list, lpcolinear_list_size);
  const int option = (maxForceNumber << 0) + (doVirial << 2);
  switch (option) {
    case ((0 << 0) + (0 << 2)): CALL(0, 0); break;
    case ((1 << 0) + (0 << 2)): CALL(1, 0); break;
    case ((2 << 0) + (0 << 2)): CALL(2, 0); break;
    case ((0 << 0) + (1 << 2)): CALL(0, 1); break;
    case ((1 << 0) + (1 << 2)): CALL(1, 1); break;
    case ((2 << 0) + (1 << 2)): CALL(2, 1); break;
    default: {
      const std::string error =
        "redistributeForceColinear: no kernel called (maxForceNumber = " +
        std::to_string(maxForceNumber) + ", doVirial = " + std::to_string(doVirial) + ")";
      NAMD_bug(error.c_str());