MShakeKernel.cu
/**
*** Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
*** The Board of Trustees of the University of Illinois.
*** All rights reserved.
**/

#include "common.h"
#ifdef NAMD_CUDA
#include <cuda.h>
#endif // NAMD_CUDA

#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif // NAMD_HIP

#include <stdio.h>
#include <stdlib.h>

#include "MShakeKernel.h"
#include "HipDefines.h"

typedef float Real;
#ifdef SHORTREALS
typedef float BigReal;
#else
typedef double BigReal;
#endif

//__constant__ SettleParameters constSP;

// Haochuan: this function is not used, so I disable it
#if 0
__global__ void rattlePair(
  int nRattlePairs,
  const double * __restrict vel_x,
  const double * __restrict vel_y,
  const double * __restrict vel_z,
  const double * __restrict pos_x,
  const double * __restrict pos_y,
  const double * __restrict pos_z,
  double * __restrict velNew_x,
  double * __restrict velNew_y,
  double * __restrict velNew_z,
  double * __restrict posNew_x,
  double * __restrict posNew_y,
  double * __restrict posNew_z,
  const int * __restrict hydrogenGroupSize,
  const float * __restrict rigidBondLength,
  const int * __restrict atomFixed,
  CudaRattleElem * __restrict rattlePairList,
  int* consFailure){

  int tid = threadIdx.x + blockIdx.x*blockDim.x;
  int stride = blockDim.x * gridDim.x;
  for(int i = tid; i < nRattlePairs; i += stride){
    consFailure[i] = 0;
    int ig = rattlePairList[i].ig;
    CudaRattleParam param = rattlePairList[i].params[0];
    int a = ig + param.ia;
    int b = ig + param.ib;
    BigReal pabx = posNew_x[a] - posNew_x[b];
    BigReal paby = posNew_y[a] - posNew_y[b];
    BigReal pabz = posNew_z[a] - posNew_z[b];
    BigReal pabsq = pabx*pabx + paby*paby + pabz*pabz;
    BigReal rabsq = param.dsq;
    BigReal diffsq = rabsq - pabsq;
    BigReal rabx = pos_x[a] - pos_x[b];
    BigReal raby = pos_y[a] - pos_y[b];
    BigReal rabz = pos_z[a] - pos_z[b];

    BigReal refsq = rabx*rabx + raby*raby + rabz*rabz;
    BigReal rpab = rabx*pabx + raby*paby + rabz*pabz;

    BigReal rma = param.rma;
    BigReal rmb = param.rmb;

    BigReal gab;
    BigReal sqrtarg = rpab*rpab + refsq*diffsq;
    if ( sqrtarg < 0. ) {
      consFailure[i] = 1;
      gab = 0.;
    } else {
      consFailure[i] = 0;  // was consFailure[0]; index i is intended here
      gab = (-rpab + sqrt(sqrtarg))/(refsq*(rma + rmb));
    }
    BigReal dpx = rabx * gab;
    BigReal dpy = raby * gab;
    BigReal dpz = rabz * gab;
    posNew_x[a] += rma * dpx;
    posNew_y[a] += rma * dpy;
    posNew_z[a] += rma * dpz;
    posNew_x[b] -= rmb * dpx;
    posNew_y[b] -= rmb * dpy;
    posNew_z[b] -= rmb * dpz;
  }
}
#endif

// Haochuan: this function does not support SWM4 and TIP4,
// and is not used anywhere, so I disable it.
#if 0
__global__ void settleInit(int waterIndex, const float *mass,
    const int *hydrogenGroupSize, const float *rigidBondLength,
    SettleParameters *sp){

  // This initializes the water parameters

  // Water index points to oxygen, oatm points to a hydrogen in the group
  float pmO, pmH, hhdist, ohdist;
  int oatm = waterIndex+1;
  oatm += 1*(mass[waterIndex] < 0.5 || mass[waterIndex+1] < 0.5);

  // Assigns values to everyone
  pmO = mass[waterIndex];
  pmH = mass[oatm];
  hhdist = rigidBondLength[waterIndex];
  ohdist = rigidBondLength[oatm];
  float rmT = 1.f / (pmO+pmH+pmH);
  sp->mOrmT = pmO * rmT;
  sp->mHrmT = pmH * rmT;
  BigReal t1 = 0.5f*pmO/pmH;
  sp->rc = 0.5f*hhdist;
  sp->ra = sqrt(ohdist*ohdist-sp->rc*sp->rc)/(1.0+t1);
  sp->rb = t1* sp->ra;
  sp->rra = 1.f / sp->ra;
}
#endif

__forceinline__ __device__ void settle1(
    const double ref[3][3],
    double pos[3][3],
    const SettleParameters *sp) {

  // swiped from Settle.C
  BigReal ref0x;
  BigReal ref0y;
  BigReal ref0z;
  BigReal ref1x;
  BigReal ref1y;
  BigReal ref1z;
  BigReal ref2x;
  BigReal ref2y;
  BigReal ref2z;

  BigReal pos0x;
  BigReal pos0y;
  BigReal pos0z;
  BigReal pos1x;
  BigReal pos1y;
  BigReal pos1z;
  BigReal pos2x;
  BigReal pos2y;
  BigReal pos2z;

  double mOrmT = sp->mOrmT;
  double mHrmT = sp->mHrmT;
  double rra = sp->rra;
  double ra = sp->ra;
  double rb = sp->rb;
  double rc = sp->rc;

  ref0x = ref[0][0];
  ref0y = ref[0][1];
  ref0z = ref[0][2];
  ref1x = ref[1][0];
  ref1y = ref[1][1];
  ref1z = ref[1][2];
  ref2x = ref[2][0];
  ref2y = ref[2][1];
  ref2z = ref[2][2];

  pos0x = pos[0][0];
  pos0y = pos[0][1];
  pos0z = pos[0][2];
  pos1x = pos[1][0];
  pos1y = pos[1][1];
  pos1z = pos[1][2];
  pos2x = pos[2][0];
  pos2y = pos[2][1];
  pos2z = pos[2][2];

  // vectors in the plane of the original positions
  BigReal b0x = ref1x - ref0x;
  BigReal b0y = ref1y - ref0y;
  BigReal b0z = ref1z - ref0z;

  BigReal c0x = ref2x - ref0x;
  BigReal c0y = ref2y - ref0y;
  BigReal c0z = ref2z - ref0z;

  // new center of mass
  BigReal d0x = pos0x*mOrmT + ((pos1x + pos2x)*mHrmT);
  BigReal d0y = pos0y*mOrmT + ((pos1y + pos2y)*mHrmT);
  BigReal d0z = pos0z*mOrmT + ((pos1z + pos2z)*mHrmT);

  BigReal a1x = pos0x - d0x;
  BigReal a1y = pos0y - d0y;
  BigReal a1z = pos0z - d0z;

  BigReal b1x = pos1x - d0x;
  BigReal b1y = pos1y - d0y;
  BigReal b1z = pos1z - d0z;

  BigReal c1x = pos2x - d0x;
  BigReal c1y = pos2y - d0y;
  BigReal c1z = pos2z - d0z;

  // Vectors describing transformation from original coordinate system to
  // the 'primed' coordinate system as in the diagram.
  // n0 = b0 x c0
  BigReal n0x = b0y*c0z - c0y*b0z;
  BigReal n0y = c0x*b0z - b0x*c0z;
  BigReal n0z = b0x*c0y - c0x*b0y;

  // n1 = a1 x n0
  BigReal n1x = a1y*n0z - n0y*a1z;
  BigReal n1y = n0x*a1z - a1x*n0z;
  BigReal n1z = a1x*n0y - n0x*a1y;
  // n2 = n0 x n1
  BigReal n2x = n0y*n1z - n1y*n0z;
  BigReal n2y = n1x*n0z - n0x*n1z;
  BigReal n2z = n0x*n1y - n1x*n0y;
  // Normalize n0
  // Change to rsqrtf later
  BigReal n0inv = 1.0/sqrt(n0x*n0x + n0y*n0y + n0z*n0z);
  n0x *= n0inv;
  n0y *= n0inv;
  n0z *= n0inv;

  BigReal n1inv = 1.0/sqrt(n1x*n1x + n1y*n1y + n1z*n1z);
  n1x *= n1inv;
  n1y *= n1inv;
  n1z *= n1inv;

  BigReal n2inv = 1.0/sqrt(n2x*n2x + n2y*n2y + n2z*n2z);
  n2x *= n2inv;
  n2y *= n2inv;
  n2z *= n2inv;

  //b0 = Vector(n1*b0, n2*b0, n0*b0); // note: b0.z is never referenced again
  BigReal n1b0 = n1x*b0x + n1y*b0y + n1z*b0z;
  BigReal n2b0 = n2x*b0x + n2y*b0y + n2z*b0z;

  //c0 = Vector(n1*c0, n2*c0, n0*c0); // note: c0.z is never referenced again
  BigReal n1c0 = n1x*c0x + n1y*c0y + n1z*c0z;
  BigReal n2c0 = n2x*c0x + n2y*c0y + n2z*c0z;

  BigReal A1Z = n0x*a1x + n0y*a1y + n0z*a1z;

  //b1 = Vector(n1*b1, n2*b1, n0*b1);
  BigReal n1b1 = n1x*b1x + n1y*b1y + n1z*b1z;
  BigReal n2b1 = n2x*b1x + n2y*b1y + n2z*b1z;
  BigReal n0b1 = n0x*b1x + n0y*b1y + n0z*b1z;

  //c1 = Vector(n1*c1, n2*c1, n0*c1);
  BigReal n1c1 = n1x*c1x + n1y*c1y + n1z*c1z;
  BigReal n2c1 = n2x*c1x + n2y*c1y + n2z*c1z;
  BigReal n0c1 = n0x*c1x + n0y*c1y + n0z*c1z;

  // now we can compute positions of canonical water
  BigReal sinphi = A1Z * rra;
  BigReal tmp = 1.0 - sinphi*sinphi;
  BigReal cosphi = sqrt(tmp);
  BigReal sinpsi = (n0b1 - n0c1)/(2.0*rc*cosphi);
  tmp = 1.0 - sinpsi*sinpsi;
  BigReal cospsi = sqrt(tmp);

  BigReal rbphi = -rb*cosphi;
  BigReal tmp1 = rc*sinpsi*sinphi;
  //BigReal tmp2 = rc*sinpsi*cosphi;

  //Vector a2(0, ra*cosphi, ra*sinphi);
  BigReal a2y = ra*cosphi;

  //Vector b2(-rc*cospsi, rbphi - tmp1, -rb*sinphi + tmp2);
  BigReal b2x = -rc*cospsi;
  BigReal b2y = rbphi - tmp1;

  //Vector c2( rc*cospsi, rbphi + tmp1, -rb*sinphi - tmp2);
  BigReal c2y = rbphi + tmp1;

  // there are no a0 terms because we've already subtracted the term off
  // when we first defined b0 and c0.
  BigReal alpha = b2x*(n1b0 - n1c0) + n2b0*b2y + n2c0*c2y;
  BigReal beta  = b2x*(n2c0 - n2b0) + n1b0*b2y + n1c0*c2y;
  BigReal gama  = n1b0*n2b1 - n1b1*n2b0 + n1c0*n2c1 - n1c1*n2c0;

  BigReal a2b2 = alpha*alpha + beta*beta;
  BigReal sintheta = (alpha*gama - beta*sqrt(a2b2 - gama*gama))/a2b2;
  BigReal costheta = sqrt(1.0 - sintheta*sintheta);

  //Vector a3( -a2y*sintheta,
  //            a2y*costheta,
  //            A1Z);
  BigReal a3x = -a2y*sintheta;
  BigReal a3y = a2y*costheta;
  BigReal a3z = A1Z;

  // Vector b3(b2x*costheta - b2y*sintheta,
  //           b2x*sintheta + b2y*costheta,
  //           n0b1);
  BigReal b3x = b2x*costheta - b2y*sintheta;
  BigReal b3y = b2x*sintheta + b2y*costheta;
  BigReal b3z = n0b1;

  // Vector c3(-b2x*costheta - c2y*sintheta,
  //           -b2x*sintheta + c2y*costheta,
  //           n0c1);
  BigReal c3x = -b2x*costheta - c2y*sintheta;
  BigReal c3y = -b2x*sintheta + c2y*costheta;
  BigReal c3z = n0c1;

  // undo the transformation; generate new normal vectors from the transpose.
  // Vector m1(n1.x, n2.x, n0.x);
  BigReal m1x = n1x;
  BigReal m1y = n2x;
  BigReal m1z = n0x;

  // Vector m2(n1.y, n2.y, n0.y);
  BigReal m2x = n1y;
  BigReal m2y = n2y;
  BigReal m2z = n0y;

  // Vector m0(n1.z, n2.z, n0.z);
  BigReal m0x = n1z;
  BigReal m0y = n2z;
  BigReal m0z = n0z;

  //pos[i*3+0] = Vector(a3*m1, a3*m2, a3*m0) + d0;
  pos0x = a3x*m1x + a3y*m1y + a3z*m1z + d0x;
  pos0y = a3x*m2x + a3y*m2y + a3z*m2z + d0y;
  pos0z = a3x*m0x + a3y*m0y + a3z*m0z + d0z;

  // pos[i*3+1] = Vector(b3*m1, b3*m2, b3*m0) + d0;
  pos1x = b3x*m1x + b3y*m1y + b3z*m1z + d0x;
  pos1y = b3x*m2x + b3y*m2y + b3z*m2z + d0y;
  pos1z = b3x*m0x + b3y*m0y + b3z*m0z + d0z;

  // pos[i*3+2] = Vector(c3*m1, c3*m2, c3*m0) + d0;
  pos2x = c3x*m1x + c3y*m1y + c3z*m1z + d0x;
  pos2y = c3x*m2x + c3y*m2y + c3z*m2z + d0y;
  pos2z = c3x*m0x + c3y*m0y + c3z*m0z + d0z;

  pos[0][0] = pos0x;
  pos[0][1] = pos0y;
  pos[0][2] = pos0z;
  pos[1][0] = pos1x;
  pos[1][1] = pos1y;
  pos[1][2] = pos1z;
  pos[2][0] = pos2x;
  pos[2][1] = pos2y;
  pos[2][2] = pos2z;
}
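
// Illustrative only, not part of NAMD: settle1 is a __device__ function, so a
// standalone check has to go through a small kernel. The sketch below assumes
// a SettleParameters instance already filled in on the host (e.g. as the
// disabled settleInit above would do); after the call, the O-H1, O-H2 and
// H1-H2 distances of pos should match the rigid geometry encoded in sp,
// regardless of how pos was perturbed relative to ref.
#if 0
__global__ void settle1_smoke_test(const SettleParameters* sp,
                                   const double (*ref)[3],  // [3][3] reference water
                                   double (*pos)[3]) {      // [3][3] perturbed water
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    settle1(ref, pos, sp);
  }
}
#endif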

// swiped from HomePatch::tip4_omrepos
/* Reposition the Om particle of a TIP4P water.
 * A little geometry shows that the appropriate position is given by
 * R_O + (r_om / r_ohc) * ( 0.5 * (R_H1 + R_H2) - R_O )
 * Here r_om is the distance from the oxygen to the Om site, and r_ohc
 * is the altitude from the oxygen to the hydrogen center of mass.
 * Those quantities are precalculated upon initialization of HomePatch.
 *
 * Ordering of TIP4P atoms: O, H1, H2, LP.
 */
__device__ void tip4_Om_reposition(
    double pos[4][3],
    const double r_om, const double r_ohc) {
  const double factor = r_om / r_ohc;
  pos[3][0] = pos[0][0] + (0.5 * (pos[1][0] + pos[2][0]) - pos[0][0]) * factor;
  pos[3][1] = pos[0][1] + (0.5 * (pos[1][1] + pos[2][1]) - pos[0][1]) * factor;
  pos[3][2] = pos[0][2] + (0.5 * (pos[1][2] + pos[2][2]) - pos[0][2]) * factor;
}
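
// Example with hypothetical TIP4P-like numbers: r_om = 0.15 A and
// r_ohc ~= 0.586 A give factor ~= 0.256, i.e. the virtual site lands about a
// quarter of the way from the oxygen toward the hydrogens' midpoint.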

// swiped from HomePatch::swm4_omrepos
/* Reposition the lonepair (Om) particle of Drude SWM4 water.
 * Same comments apply as to tip4_omrepos(), but the ordering of atoms
 * is different: O, D, LP, H1, H2.
 * Note: callers pass a [5][3] array here; the [4] extent below is not
 * enforced because the parameter decays to double (*)[3].
 */
__device__ void swm4_Om_reposition(
    double pos[4][3],
    const double r_om, const double r_ohc) {
  const double factor = r_om / r_ohc;
  pos[2][0] = pos[0][0] + (0.5 * (pos[3][0] + pos[4][0]) - pos[0][0]) * factor;
  pos[2][1] = pos[0][1] + (0.5 * (pos[3][1] + pos[4][1]) - pos[0][1]) * factor;
  pos[2][2] = pos[0][2] + (0.5 * (pos[3][2] + pos[4][2]) - pos[0][2]) * factor;
}


void MSHAKEIterate(const int icnt, const CudaRattleElem* rattleList,
    const BigReal *refx, const BigReal *refy, const BigReal *refz,
    BigReal *posx, BigReal *posy, BigReal *posz,
    const BigReal tol2, const int maxiter,
    bool& done, bool& consFailure);

__device__ inline BigReal det_3by3(BigReal A[4][4])
{
  return
    A[0][0]*(A[1][1]*A[2][2]-A[1][2]*A[2][1])-
    A[0][1]*(A[1][0]*A[2][2]-A[1][2]*A[2][0])+
    A[0][2]*(A[1][0]*A[2][1]-A[1][1]*A[2][0]);
}

__device__ void swap_row(BigReal A[4][4], BigReal b[4], int r1, int r2)
{
  #pragma unroll
  for (int i = 0; i < 4; i++)
  {
    BigReal* p1 = &A[r1][i];
    BigReal* p2 = &A[r2][i];
    BigReal tmp = *p1;
    *p1 = *p2;
    *p2 = tmp;
  }
  BigReal tmp;
  tmp = b[r1];
  b[r1] = b[r2];
  b[r2] = tmp;
}



__device__ void solve_4by4(BigReal lambda[4], BigReal A[4][4], BigReal sigma[4])
{
  BigReal tmp;
  #pragma unroll
  for (int k = 0; k < 4; ++k)
  {
    //#ifdef PIVOT
    int piv_row = k;
    BigReal Max = fabs(A[k][k]);  // fabs added: comparing against the signed
                                  // value would displace a negative diagonal
                                  // even when it is the best pivot

    for (int row = k + 1; row < 4; ++row)
    {
      if ((tmp = fabs(A[row][k])) > Max)
      {
        piv_row = row;
        Max = tmp;
      }
    }
    if(k != piv_row)
      swap_row(A, sigma, k, piv_row);
    //#endif
    for (int row = k + 1; row < 4; ++row)
    {
      tmp = A[row][k] / A[k][k];
      for (int col = k+1; col < 4; col++)
        A[row][col] -= tmp * A[k][col];
      A[row][k] = 0.;
      sigma[row] -= tmp * sigma[k];
    }
  }
  for (int row = 3; row >= 0; --row)
  {
    tmp = sigma[row];
    for (int j = 3; j > row; --j)
      tmp -= lambda[j] * A[row][j];
    lambda[row] = tmp / A[row][row];
  }
}

__device__ void solveMatrix(BigReal lambda[4], BigReal A[4][4], BigReal sigma[4], int icnt)
{
  switch(icnt)
  {
    case 1:
    {
      lambda[0] = sigma[0]/A[0][0];
      break;
    }
    case 2:
    {
      BigReal det = 1./(A[0][0]*A[1][1]-A[0][1]*A[1][0]);
      lambda[0] = ( A[1][1]*sigma[0]-A[0][1]*sigma[1])*det;
      lambda[1] = (-A[1][0]*sigma[0]+A[0][0]*sigma[1])*det;
      break;
    }
    case 3:
    {
      BigReal det = 1./det_3by3(A);
      lambda[0] = det*((A[1][1]*A[2][2]-A[1][2]*A[2][1])*sigma[0]-
                       (A[0][1]*A[2][2]-A[0][2]*A[2][1])*sigma[1]+
                       (A[0][1]*A[1][2]-A[0][2]*A[1][1])*sigma[2]);

      lambda[1] = det*((A[1][2]*A[2][0]-A[1][0]*A[2][2])*sigma[0]+
                       (A[0][0]*A[2][2]-A[0][2]*A[2][0])*sigma[1]-
                       (A[0][0]*A[1][2]-A[0][2]*A[1][0])*sigma[2]);

      lambda[2] = det*((A[1][0]*A[2][1]-A[1][1]*A[2][0])*sigma[0]-
                       (A[0][0]*A[2][1]-A[0][1]*A[2][0])*sigma[1]+
                       (A[0][0]*A[1][1]-A[0][1]*A[1][0])*sigma[2]);
      break;
    }
    case 4:
    {
      solve_4by4(lambda, A, sigma);
      break;
    }
  }
}
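
// Illustrative only, not part of NAMD: a one-thread kernel exercising
// solveMatrix on a known 2x2 system (icnt == 2). The right-hand side is
// built as A*{1,2}, so the expected solution is lambda = {1, 2}.
#if 0
__global__ void solveMatrix_smoke_test() {
  BigReal A[4][4] = {{4., 1., 0., 0.},
                     {1., 3., 0., 0.},
                     {0., 0., 1., 0.},
                     {0., 0., 0., 1.}};
  BigReal sigma[4]  = {4.*1. + 1.*2., 1.*1. + 3.*2., 0., 0.};  // A * {1,2}
  BigReal lambda[4] = {0};
  solveMatrix(lambda, A, sigma, 2);
  printf("lambda = %g %g (expected 1 2)\n", lambda[0], lambda[1]);
}
#endif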

//
// The choice of launch bounds is determined based on nvcc output of how many
// registers the kernel is using. We see optimal performance using a register
// limit of 128, which corresponds to __launch_bounds__(512,1): on GPUs with
// 64K registers per multiprocessor, 65536/512 = 128 registers per thread.
//
template <bool DOENERGY, bool DOFIXED>
__global__
#ifdef NAMD_CUDA
__launch_bounds__(512,1)
#else
//TODO:HIP - tune for HIP
// __launch_bounds__(512,1)
#endif
void MSHAKE_CUDA_Kernel(
    const CudaRattleElem *rattleList,
    const int size,
    const int *hgs,
    const BigReal* __restrict__ refx_d,
    const BigReal* __restrict__ refy_d,
    const BigReal* __restrict__ refz_d,
    BigReal* __restrict__ posx_d,
    BigReal* __restrict__ posy_d,
    BigReal* __restrict__ posz_d,
    const BigReal* __restrict__ ref_velx_d, // for fixed atoms icnt == 0
    const BigReal* __restrict__ ref_vely_d, // for fixed atoms icnt == 0
    const BigReal* __restrict__ ref_velz_d, // for fixed atoms icnt == 0
    BigReal* __restrict__ velx_d,
    BigReal* __restrict__ vely_d,
    BigReal* __restrict__ velz_d,
    BigReal* __restrict__ f_normal_x,
    BigReal* __restrict__ f_normal_y,
    BigReal* __restrict__ f_normal_z,
    cudaTensor* __restrict virial,
    const float* __restrict mass,
    const BigReal invdt,
    const BigReal tol2_d,
    const int maxiter_d,
    int* consFailure,
    const int* __restrict__ atomFixed)
{
// TODO: why is the CUDA (non-shared) path broken?
#if 1
  __shared__ BigReal sh_posx[128][4+1];
  __shared__ BigReal sh_posy[128][4+1];
  __shared__ BigReal sh_posz[128][4+1];
  __shared__ BigReal sh_refx[128][4+1];
  __shared__ BigReal sh_refy[128][4+1];
  __shared__ BigReal sh_refz[128][4+1];
#endif
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  // cudaTensor lVirial;
  // lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
  // lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
  // lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
  if(idx < size)
  {
    consFailure[idx] = 0;
    int ig = rattleList[idx].ig;
    int icnt = rattleList[idx].icnt;
    BigReal tol2 = tol2_d;
    BigReal sigma[4] = {0};
    BigReal lambda[4] = {0};
    BigReal A[4][4] = {0};
    // BigReal df[3] = {0};
    BigReal pabx[4] = {0};
    BigReal rabx[4] = {0};
    BigReal paby[4] = {0};
    BigReal raby[4] = {0};
    BigReal pabz[4] = {0};
    BigReal rabz[4] = {0};
#if 0
    BigReal posx[4+1] = {0};
    BigReal posy[4+1] = {0};
    BigReal posz[4+1] = {0};
    BigReal refx[4+1] = {0};
    BigReal refy[4+1] = {0};
    BigReal refz[4+1] = {0};
#else
    BigReal* posx = &sh_posx[threadIdx.x][0];
    BigReal* posy = &sh_posy[threadIdx.x][0];
    BigReal* posz = &sh_posz[threadIdx.x][0];
    BigReal* refx = &sh_refx[threadIdx.x][0];
    BigReal* refy = &sh_refy[threadIdx.x][0];
    BigReal* refz = &sh_refz[threadIdx.x][0];
    for(int i = 0; i < 4+1; ++i)
    {
      posx[i] = 0;
      posy[i] = 0;
      posz[i] = 0;
      refx[i] = 0;
      refy[i] = 0;
      refz[i] = 0;
    }
#endif
    // load data from global memory to local memory
    CudaRattleParam rattleParam[4]; // Does this work?
    for(int i = 0; i < 4+1; ++i)
    //for(int i = 0; i < hgs[i]; ++i)
    {
      //rattleParam[i] = rattleParam_d[4*idx+i];
      if (i < 4) rattleParam[i] = rattleList[idx].params[i];
      /*posx[i] = posx_d[i+4*idx];
      posy[i] = posy_d[i+4*idx];
      posz[i] = posz_d[i+4*idx];
      refx[i] = refx_d[i+4*idx];
      refy[i] = refy_d[i+4*idx];
      refz[i] = refz_d[i+4*idx];*/
      // I can pass the HGS and check if i is >= HGS[i]
      if(i < hgs[ig]){
        posx[i] = posx_d[i+ig];
        posy[i] = posy_d[i+ig];
        posz[i] = posz_d[i+ig];
        refx[i] = refx_d[i+ig];
        refy[i] = refy_d[i+ig];
        refz[i] = refz_d[i+ig];
      }
    }
    int loop = 0;
    int done = 1;
    // Haochuan: there are atoms fixed in the group if icnt == 0,
    // which should be in the noconstList in the CPU code
    // and simulated without constraints, but the CUDA
    // code here counts all of them into the rattleList,
    // so I check icnt == 0 and skip the whole MSHAKE.
    if (!DOFIXED || (DOFIXED && icnt > 0)) {
      //#pragma unroll
      for(int i = 0; i < 4; ++i)
      {
        int a = rattleParam[i].ia;
        int b = rattleParam[i].ib;
        pabx[i] = posx[a] - posx[b];
        paby[i] = posy[a] - posy[b];
        pabz[i] = posz[a] - posz[b];
        rabx[i] = refx[a] - refx[b];
        raby[i] = refy[a] - refy[b];
        rabz[i] = refz[a] - refz[b];
      }
      //#pragma unroll
      for(int i = 0; i < 4; ++i)
      {
        BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
        BigReal rabsq = rattleParam[i].dsq;
        BigReal diffsq = pabsq - rabsq;
        sigma[i] = diffsq;
        if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
          done = 0;
      }
      int maxiter = maxiter_d;
      for(loop = 0; loop < maxiter; ++loop)
      {
        if(!done)
        {
          //construct A
          //#pragma unroll
          for(int j = 0; j < 4; ++j)
          {
            BigReal rma = rattleParam[j].rma;
            #pragma unroll
            for(int i = 0; i < 4; ++i)
            {
              A[j][i] = 2.*(pabx[j]*rabx[i]+paby[j]*raby[i]+pabz[j]*rabz[i])*rma;
            }
            BigReal rmb = rattleParam[j].rmb;
            A[j][j] += 2.*(pabx[j]*rabx[j]+paby[j]*raby[j]+pabz[j]*rabz[j])*rmb;
          }

          solveMatrix(lambda, A, sigma, icnt);
          //#pragma unroll
          for(int i = 0; i < 4; ++i)
          {
            int a = rattleParam[i].ia;
            int b = rattleParam[i].ib;
            BigReal rma = rattleParam[i].rma * lambda[i];
            BigReal rmb = rattleParam[i].rmb * lambda[i];

            posx[a] -= rma * rabx[i];
            posy[a] -= rma * raby[i];
            posz[a] -= rma * rabz[i];
            posx[b] += rmb * rabx[i];
            posy[b] += rmb * raby[i];
            posz[b] += rmb * rabz[i];
          }
        }
        else
          break;
        done = 1;
        //#pragma unroll
        for(int i = 0; i < 4; ++i)
        {
          int a = rattleParam[i].ia;
          int b = rattleParam[i].ib;
          pabx[i] = posx[a] - posx[b];
          paby[i] = posy[a] - posy[b];
          pabz[i] = posz[a] - posz[b];
          BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
          BigReal rabsq = rattleParam[i].dsq;
          BigReal diffsq = pabsq - rabsq;
          sigma[i] = diffsq;
          if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
            done = 0;
        }
      }
      if(loop == maxiter)
        consFailure[idx] = 1;
      else
      {
        #pragma unroll
        for(int i = 0; i < hgs[ig]; ++i)
        {
          posx_d[i+ig] = posx[i];
          posy_d[i+ig] = posy[i];
          posz_d[i+ig] = posz[i];
#if 1
          // Velocity update
          // double vx = velx_d[i + ig];
          // double vy = vely_d[i + ig];
          // double vz = velz_d[i + ig];
          double vnx = (posx[i] - refx[i]) * invdt;
          double vny = (posy[i] - refy[i]) * invdt;
          double vnz = (posz[i] - refz[i]) * invdt;
          // velx_d[i+ig] = (posx[i] - refx[i]) * invdt;
          // vely_d[i+ig] = (posy[i] - refy[i]) * invdt;
          // velz_d[i+ig] = (posz[i] - refz[i]) * invdt;
          // velnew update
          velx_d[i + ig] = vnx;
          vely_d[i + ig] = vny;
          velz_d[i + ig] = vnz;
#endif
        }
        consFailure[idx] = 0;
        //printf("Loop is %d %d\n", loop, consFailure[idx]);
      }
    } else {
      // Fixed atoms without constraints
      for(int i = 0; i < hgs[ig]; ++i) {
        // Copy from the old/reference positions.
        posx_d[i+ig] = refx[i];
        posy_d[i+ig] = refy[i];
        posz_d[i+ig] = refz[i];
        // velx_d[i+ig] = ref_velx_d[i+ig];
        // vely_d[i+ig] = ref_vely_d[i+ig];
        // velz_d[i+ig] = ref_velz_d[i+ig];
      }
    } // !DOFIXED || (DOFIXED && icnt > 0)
  }
}

// XXX TODO: Fix this, it's horrible
__global__
void CheckConstraints(int* consFailure, int* consFailure_h, int size)
{
  __shared__ int result[128];
  int idx = threadIdx.x;
  result[idx] = 0;
  for(int i = idx; i < size; i += 128)
    result[idx] += consFailure[i];
  __syncthreads();
  if(idx < 64)
    result[idx] += result[idx+64];
  __syncthreads();
  if(idx < 32)
    result[idx] += result[idx+32];
  __syncthreads();
  if(idx < 16)
    result[idx] += result[idx+16];
  __syncthreads();
  if(idx < 8)
    result[idx] += result[idx+8];
  __syncthreads();
  if(idx < 4)
    result[idx] += result[idx+4];
  __syncthreads();
  if(idx < 2)
    result[idx] += result[idx+2];
  __syncthreads();
  if(idx < 1)
    result[idx] += result[idx+1];
  __syncthreads();
  if(idx == 0)
  {
    consFailure_h[0] += result[0];
    //printf("Constraints %d\n", consFailure[0]);
  }
}
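
// The hand-rolled tree reduction above could also be expressed with
// cub::BlockReduce, which the disabled DOENERGY path in SettleKernel below
// already uses. Sketch only; assumes #include <cub/cub.cuh> at the top of
// the file:
#if 0
__global__ void CheckConstraintsCub(int* consFailure, int* consFailure_h, int size)
{
  typedef cub::BlockReduce<int, 128> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int local = 0;
  for (int i = threadIdx.x; i < size; i += 128)
    local += consFailure[i];
  int total = BlockReduce(temp_storage).Sum(local);
  if (threadIdx.x == 0)
    consFailure_h[0] += total;
}
#endif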

void MSHAKE_CUDA(
  const bool doEnergy,
  const bool doFixed,
  const CudaRattleElem* rattleList,
  const int size,
  const int *hydrogenGroupSize,
  const int *atomFixed,
  const double *refx,
  const double *refy,
  const double *refz,
  double *posx,
  double *posy,
  double *posz,
  const double *ref_velx,
  const double *ref_vely,
  const double *ref_velz,
  double *velx,
  double *vely,
  double *velz,
  double *f_normal_x,
  double *f_normal_y,
  double *f_normal_z,
  cudaTensor* rigidVirial,
  const float *mass,
  const double invdt,
  const BigReal tol2,
  const int maxiter,
  int* consFailure_d,
  int* consFailure,
  cudaStream_t stream)
{
  if(size == 0){
    //fprintf(stderr, "No rattles, returning\n");
    return;
  }

  int gridDim = int(size/128) + (size%128 == 0 ? 0 : 1);
  int blockDim = 128;
#define CALL(DOENERGY, DOFIXED) \
  MSHAKE_CUDA_Kernel<DOENERGY, DOFIXED><<<gridDim, blockDim, 0, stream>>>( \
    rattleList, size, hydrogenGroupSize, \
    refx, refy, refz, posx, posy, posz, \
    ref_velx, ref_vely, ref_velz, \
    velx, vely, velz, \
    f_normal_x, f_normal_y, f_normal_z, rigidVirial, \
    mass, invdt, tol2, maxiter, consFailure_d, atomFixed)
  if ( doEnergy &&  doFixed) CALL(true, true);
  else if ( doEnergy && !doFixed) CALL(true, false);
  else if (!doEnergy &&  doFixed) CALL(false, true);
  else if (!doEnergy && !doFixed) CALL(false, false);
  else NAMD_bug("No kernel function is called in MSHAKE_CUDA.\n");
#undef CALL
  // cudaCheck(cudaGetLastError());
  // JM NOTE: We can check the consFailure flag for every migration step
  // CheckConstraints<<<1,128, 0, stream>>>(consFailure_d, consFailure, size);
  // I can't do this here
  // why do I need this variable?????
  //done = !consFailure;
}

//
// Perform a warp-wide read of 3-element clusters from global memory
// into shared memory.
// a - global memory buffer
// buf - shared memory buffer, dimension BLOCK*3
// list - index offset to beginning of each 3-element cluster
// n - total number of clusters
//
// We apply this to improve reading of water (positions and velocities).
// Data in global memory is stored in SOA form: pos_x, pos_y, pos_z, etc.,
// so we call this routine six times to read positions and velocities.
// Reading is coalesced whenever water molecules are all consecutive.
// Although waters are sorted to the end of each patch, there will
// generally be gaps between patches due to solute.
//
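// Worked example of the index arithmetic below, assuming warpSize == 32 and
// VECTOR_SIZE == 3: on pass j the warp covers offsets j*32 .. j*32+31, so on
// j == 0 lanes 0,1,2 read elements 0,1,2 of cluster list[warpid*32], lane 3
// reads element 0 of cluster list[warpid*32 + 1], and so on; after j == 2 the
// warp has read all 3*32 = 96 elements of its 32 clusters.
//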
template <int VECTOR_SIZE>
__device__ __forceinline__ void ld_vecN(const double *a, double buf[][VECTOR_SIZE], int *list, int n)
{
  int laneid = threadIdx.x % warpSize;
  int warpid = threadIdx.x / warpSize;
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif

  for (int j = 0; j < VECTOR_SIZE; j++) {
    int offset = laneid + j*warpSize;
    int ii = offset % VECTOR_SIZE;
    int jj = warpid*warpSize + offset/VECTOR_SIZE;
    if (blockIdx.x*blockDim.x + jj < n) {
      int idx = list[jj] + ii;
      buf[jj][ii] = a[idx];
    }
  }
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif

}

// Overload for float
template <int VECTOR_SIZE>
__device__ __forceinline__ void ld_vecN(const float *a, float buf[][VECTOR_SIZE], int *list, int n)
{
  int laneid = threadIdx.x % warpSize;
  int warpid = threadIdx.x / warpSize;
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif

  for (int j = 0; j < VECTOR_SIZE; j++) {
    int offset = laneid + j*warpSize;
    int ii = offset % VECTOR_SIZE;
    int jj = warpid*warpSize + offset/VECTOR_SIZE;
    if (blockIdx.x*blockDim.x + jj < n) {
      int idx = list[jj] + ii;
      buf[jj][ii] = a[idx];
    }
  }
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif

}
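
// The double and float overloads of ld_vecN above are identical except for
// the element type; a single templated version could replace both. Sketch
// only, not used by the code below:
#if 0
template <typename T, int VECTOR_SIZE>
__device__ __forceinline__ void ld_vecN_t(const T *a, T buf[][VECTOR_SIZE], int *list, int n)
{
  int laneid = threadIdx.x % warpSize;
  int warpid = threadIdx.x / warpSize;
#ifdef NAMD_CUDA
  __syncwarp();
#endif
  for (int j = 0; j < VECTOR_SIZE; j++) {
    int offset = laneid + j*warpSize;
    int ii = offset % VECTOR_SIZE;
    int jj = warpid*warpSize + offset/VECTOR_SIZE;
    if (blockIdx.x*blockDim.x + jj < n)
      buf[jj][ii] = a[list[jj] + ii];
  }
#ifdef NAMD_CUDA
  __syncwarp();
#endif
}
#endif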

//
// Same as previous, only this time we write to global memory from
// shared memory.
//
template <int VECTOR_SIZE>
__device__ __forceinline__ void st_vecN(double *a, double buf[][VECTOR_SIZE], int *list, int n)
{
  int laneid = threadIdx.x % warpSize;
  int warpid = threadIdx.x / warpSize;
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif
  for (int j = 0; j < VECTOR_SIZE; j++) {
    int offset = laneid + j*warpSize;
    int ii = offset % VECTOR_SIZE;
    int jj = warpid*warpSize + offset/VECTOR_SIZE;
    if (blockIdx.x*blockDim.x + jj < n) {
      int idx = list[jj] + ii;
      a[idx] = buf[jj][ii];
    }
  }
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif
}

//
// Same as previous, but accumulates the value instead of copying
//
template <int VECTOR_SIZE>
__device__ __forceinline__ void acc_vecN(double* __restrict__ a, double buf[][VECTOR_SIZE], int *list, int n)
{
  int laneid = threadIdx.x % warpSize;
  int warpid = threadIdx.x / warpSize;
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif
  for (int j = 0; j < VECTOR_SIZE; j++) {
    int offset = laneid + j*warpSize;
    int ii = offset % VECTOR_SIZE;
    int jj = warpid*warpSize + offset/VECTOR_SIZE;
    if (blockIdx.x*blockDim.x + jj < n) {
      int idx = list[jj] + ii;
      a[idx] += buf[jj][ii];
    }
  }
#ifdef NAMD_CUDA
  __syncwarp();
#else
  //TODO:HIP verify for HIP
  //__all(1);
#endif
}

// Haochuan: fixed atoms in water are already excluded from the
// settleList, so we don't need to do anything related to
// fixed atoms here.
//
// The choice of launch bounds is determined based on nvcc output of how many
// registers the kernel is using. We see optimal performance using a register
// limit of 128, which corresponds to __launch_bounds__(512,1).
//
template <bool DOENERGY, int BLOCK,
          int WATER_GROUP_SIZE,
          WaterModel WATER_MODEL>
__global__
#ifdef NAMD_CUDA
__launch_bounds__(512,1)
#else
//TODO:HIP tune
// __launch_bounds__(512,1)
#endif
void SettleKernel(
  int numAtoms,
  const double dt,
  const double invdt,
  int nSettles,
  const double * __restrict vel_x,
  const double * __restrict vel_y,
  const double * __restrict vel_z,
  const double * __restrict pos_x,
  const double * __restrict pos_y,
  const double * __restrict pos_z,
  double * __restrict velNew_x,
  double * __restrict velNew_y,
  double * __restrict velNew_z,
  double * __restrict posNew_x,
  double * __restrict posNew_y,
  double * __restrict posNew_z,
  double * __restrict f_normal_x,
  double * __restrict f_normal_y,
  double * __restrict f_normal_z,
  cudaTensor* __restrict virial,
  const float* __restrict mass,
  const int * __restrict hydrogenGroupSize,
  const float * __restrict rigidBondLength,
  const int * __restrict atomFixed,
  const int * __restrict settleList,
  const SettleParameters * __restrict sp)
{
  int tid = threadIdx.x + blockIdx.x*blockDim.x;
  double ref[WATER_GROUP_SIZE][3] = {0};
  double pos[WATER_GROUP_SIZE][3] = {0};
  double vel[WATER_GROUP_SIZE][3] = {0};

  __shared__ int list[BLOCK];
  __shared__ double buf[BLOCK][WATER_GROUP_SIZE];
  if (tid < nSettles) {
    list[threadIdx.x] = settleList[tid];
  }
  __syncthreads();
  // Do a warp-wide read on pos* and vel* from global memory to shmem
  // BEGIN OF FIRST PART OF RATTLE1: Settle_SIMD invocation
  ld_vecN<WATER_GROUP_SIZE>(pos_x, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][0] = buf[threadIdx.x][i];
  ld_vecN<WATER_GROUP_SIZE>(pos_y, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][1] = buf[threadIdx.x][i];
  ld_vecN<WATER_GROUP_SIZE>(pos_z, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][2] = buf[threadIdx.x][i];
  ld_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][0] = buf[threadIdx.x][i];
  ld_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][1] = buf[threadIdx.x][i];
  ld_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][2] = buf[threadIdx.x][i];

  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) {
    pos[i][0] = ref[i][0] + vel[i][0]*dt;
    pos[i][1] = ref[i][1] + vel[i][1]*dt;
    pos[i][2] = ref[i][2] + vel[i][2]*dt;
  }

  switch (WATER_MODEL) {
    case WaterModel::SWM4: {
      // SWM4 ordering: O D LP H1 H2
      // do swap(O,LP) and call settle with subarray O H1 H2
      double lp_ref[3];
      double lp_pos[3];
      #pragma unroll
      for (int i = 0; i < 3; ++i) {
        lp_ref[i] = ref[2][i];
        lp_pos[i] = pos[2][i];
      }
      #pragma unroll
      for (int i = 0; i < 3; ++i) {
        ref[2][i] = ref[0][i];
        pos[2][i] = pos[0][i];
      }
      settle1(ref+2, pos+2, sp);
      // swap back after we return
      #pragma unroll
      for (int i = 0; i < 3; ++i) {
        ref[0][i] = ref[2][i];
        pos[0][i] = pos[2][i];
      }
      #pragma unroll
      for (int i = 0; i < 3; ++i) {
        ref[2][i] = lp_ref[i];
        pos[2][i] = lp_pos[i];
      }
      swm4_Om_reposition(pos, sp->r_om, sp->r_ohc);
      break;
    }
    case WaterModel::TIP4: {
      settle1(ref, pos, sp);
      tip4_Om_reposition(pos, sp->r_om, sp->r_ohc);
      break;
    }
    default: {
      settle1(ref, pos, sp);
    }
  }

  // Update positions and velocities with what settle1 calculated
#if 1
  // Do a warp-wide write on pos* and vel* to global memory from shmem
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][0] - ref[i][0])*invdt;
  st_vecN<WATER_GROUP_SIZE>(velNew_x, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][1] - ref[i][1])*invdt;
  st_vecN<WATER_GROUP_SIZE>(velNew_y, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][2] - ref[i][2])*invdt;
  st_vecN<WATER_GROUP_SIZE>(velNew_z, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][0];
  st_vecN<WATER_GROUP_SIZE>(posNew_x, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][1];
  st_vecN<WATER_GROUP_SIZE>(posNew_y, buf, list, nSettles);
  #pragma unroll
  for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][2];
  st_vecN<WATER_GROUP_SIZE>(posNew_z, buf, list, nSettles);
#else
  // force+velocity update (dead code: df, ig and lVirial are not declared
  // in this scope; kept for reference)
  if (tid < nSettles) {
    double vnx, vny, vnz;
#pragma unroll
    for(int i = 0; i < 3; i++){
      vnx = (pos[i][0] - ref[i][0]) * invdt;
      vny = (pos[i][1] - ref[i][1]) * invdt;
      vnz = (pos[i][2] - ref[i][2]) * invdt;

      df[0] = (vnx - vel[i][0]) * mass[i] * invdt;
      df[1] = (vny - vel[i][1]) * mass[i] * invdt;
      df[2] = (vnz - vel[i][2]) * mass[i] * invdt;

      velNew_x[ig+i] = vnx;
      velNew_y[ig+i] = vny;
      velNew_z[ig+i] = vnz;  // was vnx: z-velocity must use vnz
      posNew_x[ig+i] = pos[i][0];
      posNew_y[ig+i] = pos[i][1];
      posNew_z[ig+i] = pos[i][2];
      f_normal_x[ig+i] += df[0];
      f_normal_y[ig+i] += df[1];
      f_normal_z[ig+i] += df[2];

      if (DOENERGY){
        lVirial.xx += df[0] * pos[i][0];
        // lVirial.xy += df[0] * pos[i][1];
        // lVirial.xz += df[0] * pos[i][2];
        lVirial.yx += df[1] * pos[i][0];
        lVirial.yy += df[1] * pos[i][1];
        // lVirial.yz += df[1] * pos[i][2];
        lVirial.zx += df[2] * pos[i][0];
        lVirial.zy += df[2] * pos[i][1];
        lVirial.zz += df[2] * pos[i][2];
      }
    }
  }
  if(DOENERGY){
    typedef cub::BlockReduce<BigReal, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
    // lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
    // lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
    lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
    lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy); __syncthreads();
    // lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
    lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
    lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
    lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();

    // Every block has a locally reduced blockVirial;
    // now thread 0 does an atomicAdd to accumulate the global virial
    if(threadIdx.x == 0){
      atomicAdd(&(virial->xx), lVirial.xx);
      atomicAdd(&(virial->xy), lVirial.yx);
      atomicAdd(&(virial->xz), lVirial.zx);
      atomicAdd(&(virial->yx), lVirial.yx);
      atomicAdd(&(virial->yy), lVirial.yy);
      atomicAdd(&(virial->yz), lVirial.zy);
      atomicAdd(&(virial->zx), lVirial.zx);
      atomicAdd(&(virial->zy), lVirial.zy);
      atomicAdd(&(virial->zz), lVirial.zz);
    }
  }
#endif
  // }
}

void Settle(
  const bool doEnergy,
  int numAtoms,
  const double dt,
  const double invdt,
  const int nSettles,
  const double * vel_x,
  const double * vel_y,
  const double * vel_z,
  const double * pos_x,
  const double * pos_y,
  const double * pos_z,
  double * velNew_x,
  double * velNew_y,
  double * velNew_z,
  double * posNew_x,
  double * posNew_y,
  double * posNew_z,
  double * f_normal_x,
  double * f_normal_y,
  double * f_normal_z,
  cudaTensor* virial,
  const float* mass,
  const int * hydrogenGroupSize,
  const float * rigidBondLength,
  const int * atomFixed,
  int * settleList,
  const SettleParameters * sp,
  const WaterModel water_model,
  cudaStream_t stream)
{

#ifdef NAMD_CUDA
  const int blocks = 128;
#else
  const int blocks = 128;
#endif
  int grid = (nSettles + blocks - 1) / blocks;
#define CALL(DOENERGY, WATER_MODEL) \
  SettleKernel<DOENERGY, blocks, getWaterModelGroupSize(WATER_MODEL), WATER_MODEL> \
  <<< grid, blocks, 0, stream >>> \
  (numAtoms, dt, invdt, nSettles, \
   vel_x, vel_y, vel_z, \
   pos_x, pos_y, pos_z, \
   velNew_x, velNew_y, velNew_z, \
   posNew_x, posNew_y, posNew_z, \
   f_normal_x, f_normal_y, f_normal_z, virial, mass, \
   hydrogenGroupSize, rigidBondLength, atomFixed, \
   settleList, sp \
  );
  switch (water_model) {
    case WaterModel::SWM4: {
      if ( doEnergy) CALL(true, WaterModel::SWM4);
      if (!doEnergy) CALL(false, WaterModel::SWM4);
      break;
    }
    case WaterModel::TIP4: {
      if ( doEnergy) CALL(true, WaterModel::TIP4);
      if (!doEnergy) CALL(false, WaterModel::TIP4);
      break;
    }
    default: {
      if ( doEnergy) CALL(true, WaterModel::TIP3);
      if (!doEnergy) CALL(false, WaterModel::TIP3);
    }
  }
#undef CALL
  cudaCheck(cudaStreamSynchronize(stream));
}

// Fused kernel for rigid bond constraints.
// Each block does a single operation: either SETTLE or MSHAKE.
template <int WATER_GROUP_SIZE,
          WaterModel WATER_MODEL,
          bool DOFIXED>
#ifdef NAMD_CUDA
__launch_bounds__(512,1)
#else
//TODO:HIP tune
// __launch_bounds__(512,1)
#endif
__global__ void Rattle1Kernel(
  int numAtoms,
  // Settle Parameters
  const double dt,
  const double invdt,
  const int nSettles,
  double * __restrict vel_x,
  double * __restrict vel_y,
  double * __restrict vel_z,
  const double * __restrict pos_x,
  const double * __restrict pos_y,
  const double * __restrict pos_z,
  double * __restrict f_normal_x,
  double * __restrict f_normal_y,
  double * __restrict f_normal_z,
  const float * __restrict mass,
  const int * __restrict hydrogenGroupSize,
  const float * __restrict rigidBondLength,
  const int * __restrict atomFixed,
  int * __restrict settleList,
  const SettleParameters * sp,
  const CudaRattleElem * __restrict rattleList,
  const int nShakes,
  const BigReal tol2_d,
  const int maxiter_d,
  int* consFailure,
  const int nSettleBlocks
){
  // Great, first step is fetching values
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  bool isSettle = blockIdx.x < nSettleBlocks;
  __shared__ int list[128];
  __shared__ double buf[128][WATER_GROUP_SIZE];
#ifndef NAMD_CUDA
  __shared__ BigReal sh_posx[128][4+1];
  __shared__ BigReal sh_posy[128][4+1];
  __shared__ BigReal sh_posz[128][4+1];
  __shared__ BigReal sh_refx[128][4+1];
  __shared__ BigReal sh_refy[128][4+1];
  __shared__ BigReal sh_refz[128][4+1];
#endif

  if( isSettle ){
    // SETTLE CODEPATH
    list[threadIdx.x] = settleList[tid];
    __syncthreads();
    double ref[WATER_GROUP_SIZE][3] = {0};
    double pos[WATER_GROUP_SIZE][3] = {0};
    double vel[WATER_GROUP_SIZE][3] = {0};
    float tmp_mass[WATER_GROUP_SIZE];

    ld_vecN<WATER_GROUP_SIZE>(pos_x, buf, list, nSettles);
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][0] = buf[threadIdx.x][i];
    ld_vecN<WATER_GROUP_SIZE>(pos_y, buf, list, nSettles);
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][1] = buf[threadIdx.x][i];
    ld_vecN<WATER_GROUP_SIZE>(pos_z, buf, list, nSettles);
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][2] = buf[threadIdx.x][i];
    ld_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][0] = buf[threadIdx.x][i];
    ld_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][1] = buf[threadIdx.x][i];
    ld_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][2] = buf[threadIdx.x][i];
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) {
      pos[i][0] = ref[i][0] + vel[i][0]*dt;
      pos[i][1] = ref[i][1] + vel[i][1]*dt;
      pos[i][2] = ref[i][2] + vel[i][2]*dt;
    }
    switch (WATER_MODEL) {
      case WaterModel::SWM4: {
        // SWM4 ordering: O D LP H1 H2
        // do swap(O,LP) and call settle with subarray O H1 H2
        double lp_ref[3];
        double lp_pos[3];
        #pragma unroll
        for (int i = 0; i < 3; ++i) {
          lp_ref[i] = ref[2][i];
          lp_pos[i] = pos[2][i];
        }
        #pragma unroll
        for (int i = 0; i < 3; ++i) {
          ref[2][i] = ref[0][i];
          pos[2][i] = pos[0][i];
        }
        settle1(ref+2, pos+2, sp);
        // swap back after we return
        #pragma unroll
        for (int i = 0; i < 3; ++i) {
          ref[0][i] = ref[2][i];
          pos[0][i] = pos[2][i];
        }
        #pragma unroll
        for (int i = 0; i < 3; ++i) {
          ref[2][i] = lp_ref[i];
          pos[2][i] = lp_pos[i];
        }
        swm4_Om_reposition(pos, sp->r_om, sp->r_ohc);
        break;
      }
      case WaterModel::TIP4: {
        settle1(ref, pos, sp);
        tip4_Om_reposition(pos, sp->r_om, sp->r_ohc);
        break;
      }
      default: {
        settle1(ref, pos, sp);
      }
    }
    // load the masses to calculate the forces
    if (WATER_MODEL != WaterModel::TIP3) {
      __shared__ float mass_buf[128][WATER_GROUP_SIZE];
      ld_vecN<WATER_GROUP_SIZE>(mass, mass_buf, list, nSettles);
      #pragma unroll
      for (int i = 0; i < WATER_GROUP_SIZE; ++i) tmp_mass[i] = mass_buf[threadIdx.x][i];
    }
    // ------ Update the atomic velocities along X
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][0] - ref[i][0])*invdt;

    st_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);

    // buf now holds the new velocities; next compute the force contributions
    // (for TIP3 the per-atom masses come from SettleParameters instead)
    if (WATER_MODEL != WaterModel::TIP3) {
      #pragma unroll
      for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][0]) * (tmp_mass[i]*invdt);
    } else {
      buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][0]) * (sp->mO*invdt);
      buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][0]) * (sp->mH*invdt);
      buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][0]) * (sp->mH*invdt);
    }
    acc_vecN<WATER_GROUP_SIZE>(f_normal_x, buf, list, nSettles);

    // ------ Update the atomic velocities along Y
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][1] - ref[i][1])*invdt;
    st_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
    if (WATER_MODEL != WaterModel::TIP3) {
      #pragma unroll
      for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][1]) * (tmp_mass[i]*invdt);
    } else {
      buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][1]) * (sp->mO*invdt);
      buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][1]) * (sp->mH*invdt);
      buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][1]) * (sp->mH*invdt);
    }
    acc_vecN<WATER_GROUP_SIZE>(f_normal_y, buf, list, nSettles);


    // ------ Update the atomic velocities along Z
    #pragma unroll
    for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][2] - ref[i][2])*invdt;
    st_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
    if (WATER_MODEL != WaterModel::TIP3) {
      #pragma unroll
      for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][2]) * (tmp_mass[i]*invdt);
    } else {
      buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][2]) * (sp->mO*invdt);
      buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][2]) * (sp->mH*invdt);
      buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][2]) * (sp->mH*invdt);
    }
    acc_vecN<WATER_GROUP_SIZE>(f_normal_z, buf, list, nSettles);

  }else{
    // Remaining thread blocks do MSHAKE.
    // idx must be zero-based from the first thread block that does MSHAKE,
    // so subtract the threads covered by the SETTLE blocks rather than
    // using tid - nSettles.
    int idx = tid - blockDim.x * nSettleBlocks; // threads that did SETTLE
    // Ok, now we follow the logic of MSHAKE_CUDA_Kernel
    if(idx < nShakes){
      // This needs to increase monotonically
      // if (threadIdx.x == 0) printf("tid %d idx %d\n", tid, idx);
      // consFailure[idx] = 0;
      int ig = rattleList[idx].ig;
      int icnt = rattleList[idx].icnt;
      BigReal tol2 = tol2_d;
      BigReal sigma[4] = {0};
      BigReal lambda[4] = {0};
      BigReal A[4][4] = {0};
      BigReal velx[4+1] = {0};
      BigReal vely[4+1] = {0};
      BigReal velz[4+1] = {0};
      BigReal pabx[4] = {0};
      BigReal rabx[4] = {0};
      BigReal paby[4] = {0};
      BigReal raby[4] = {0};
      BigReal pabz[4] = {0};
      BigReal rabz[4] = {0};
      BigReal df[3] = {0};
#ifdef NAMD_CUDA
      BigReal posx[4+1] = {0};
      BigReal posy[4+1] = {0};
      BigReal posz[4+1] = {0};
      BigReal refx[4+1] = {0};
      BigReal refy[4+1] = {0};
      BigReal refz[4+1] = {0};
#else
      BigReal* posx = &sh_posx[threadIdx.x][0];
      BigReal* posy = &sh_posy[threadIdx.x][0];
      BigReal* posz = &sh_posz[threadIdx.x][0];
      BigReal* refx = &sh_refx[threadIdx.x][0];
      BigReal* refy = &sh_refy[threadIdx.x][0];
      BigReal* refz = &sh_refz[threadIdx.x][0];
      for(int i = 0; i < 4+1; ++i)
      {
        posx[i] = 0;
        posy[i] = 0;
        posz[i] = 0;
        refx[i] = 0;
        refy[i] = 0;
        refz[i] = 0;
      }
#endif
      // load data from global memory to local memory
      CudaRattleParam rattleParam[4];
#pragma unroll
      for(int i = 0; i < 4+1; ++i)
      //for(int i = 0; i < hgs[i]; ++i)
      {
        //rattleParam[i] = rattleParam_d[4*idx+i];
        if (i < 4) rattleParam[i] = rattleList[idx].params[i];
        /*posx[i] = posx_d[i+4*idx];
        posy[i] = posy_d[i+4*idx];
        posz[i] = posz_d[i+4*idx];
        refx[i] = refx_d[i+4*idx];
        refy[i] = refy_d[i+4*idx];
        refz[i] = refz_d[i+4*idx];*/
        // I can pass the HGS and check if i is >= HGS[i]
        if(i < hydrogenGroupSize[ig]){
          refx[i] = pos_x[i+ig];
          refy[i] = pos_y[i+ig];
          refz[i] = pos_z[i+ig];
          velx[i] = vel_x[i + ig];
          vely[i] = vel_y[i + ig];
          velz[i] = vel_z[i + ig];
          if (!DOFIXED) {
            posx[i] = refx[i] + (velx[i] * dt);
            posy[i] = refy[i] + (vely[i] * dt);
            posz[i] = refz[i] + (velz[i] * dt);
          } else {
            // Haochuan: don't update the positions of fixed atoms
            if (!atomFixed[ig+i]) {
              posx[i] = refx[i] + (velx[i] * dt);
              posy[i] = refy[i] + (vely[i] * dt);
              posz[i] = refz[i] + (velz[i] * dt);
            } else {
              posx[i] = refx[i];
              posy[i] = refy[i];
              posz[i] = refz[i];
            }
          }
        }
      }
      int loop = 0;
      int done = 1;
#pragma unroll
      for(int i = 0; i < 4; ++i)
      {
        int a = rattleParam[i].ia;
        int b = rattleParam[i].ib;
        pabx[i] = posx[a] - posx[b];
        paby[i] = posy[a] - posy[b];
        pabz[i] = posz[a] - posz[b];
        rabx[i] = refx[a] - refx[b];
        raby[i] = refy[a] - refy[b];
        rabz[i] = refz[a] - refz[b];
      }

#pragma unroll
      for(int i = 0; i < 4; ++i)
      {
        BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
        BigReal rabsq = rattleParam[i].dsq;
        BigReal diffsq = pabsq - rabsq;
        sigma[i] = diffsq;
        if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
          done = 0;
      }
      int maxiter = maxiter_d;
      for(loop = 0; loop < maxiter; ++loop)
      {
        if(!done)
        {
          //construct A
#pragma unroll
          for(int j = 0; j < 4; ++j)
          {
            BigReal rma = rattleParam[j].rma;
            #pragma unroll
            for(int i = 0; i < 4; ++i)
            {
              A[j][i] = 2.*(pabx[j]*rabx[i]+paby[j]*raby[i]+pabz[j]*rabz[i])*rma;
            }
            BigReal rmb = rattleParam[j].rmb;
            A[j][j] += 2.*(pabx[j]*rabx[j]+paby[j]*raby[j]+pabz[j]*rabz[j])*rmb;
          }

          solveMatrix(lambda, A, sigma, icnt);
#pragma unroll
          for(int i = 0; i < 4; ++i)
          {
            int a = rattleParam[i].ia;
            int b = rattleParam[i].ib;
            BigReal rma = rattleParam[i].rma * lambda[i];
            BigReal rmb = rattleParam[i].rmb * lambda[i];

            posx[a] -= rma * rabx[i];
            posy[a] -= rma * raby[i];
            posz[a] -= rma * rabz[i];
            posx[b] += rmb * rabx[i];
            posy[b] += rmb * raby[i];
            posz[b] += rmb * rabz[i];
          }
        }
        else
          break;
        done = 1;
#pragma unroll
        for(int i = 0; i < 4; ++i)
        {
          int a = rattleParam[i].ia;
          int b = rattleParam[i].ib;
          pabx[i] = posx[a] - posx[b];
          paby[i] = posy[a] - posy[b];
          pabz[i] = posz[a] - posz[b];
          BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
          BigReal rabsq = rattleParam[i].dsq;
          BigReal diffsq = pabsq - rabsq;
          sigma[i] = diffsq;
          if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
            done = 0;
        }
      }
      if(loop == maxiter)
        consFailure[idx] = 1;
      else
      {

// double m = mass[ig];
        for(int i = 0; i < hydrogenGroupSize[ig]; ++i)
        {
          // posNew_x[i+ig] = posx[i];
          // posNew_y[i+ig] = posy[i];
          // posNew_z[i+ig] = posz[i];
#if 1
          // Velocity update
          // double vx = velx_d[i + ig];
          // double vy = vely_d[i + ig];
          // double vz = velz_d[i + ig];
          const double m = mass[i + ig];
          double vnx = (posx[i] - refx[i]) * invdt;
          double vny = (posy[i] - refy[i]) * invdt;
          double vnz = (posz[i] - refz[i]) * invdt;
          df[0] = (vnx - velx[i]) * (m * invdt);
          df[1] = (vny - vely[i]) * (m * invdt);
          df[2] = (vnz - velz[i]) * (m * invdt);


          // Updates velocity and force
          vel_x[i + ig] = vnx;
          vel_y[i + ig] = vny;
          vel_z[i + ig] = vnz;

          f_normal_x[i + ig] += df[0];
          f_normal_y[i + ig] += df[1];
          f_normal_z[i + ig] += df[2];
// m = h_m;
          // velx_d[i+ig] = (posx[i] - refx[i]) * invdt;
          // vely_d[i+ig] = (posy[i] - refy[i]) * invdt;
          // velz_d[i+ig] = (posz[i] - refz[i]) * invdt;
          // velnew update
          // velNew_x[i + ig] = vnx;
          // velNew_y[i + ig] = vny;
          // velNew_z[i + ig] = vnz;
#endif
        }
        // consFailure[idx] = 0;
        //printf("Loop is %d %d\n", loop, consFailure[idx]);
      }
    }
  }
}
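
// Illustrative only: the caller is expected to size the fused launch so that
// the first nSettleBlocks blocks cover settleList and the remaining blocks
// cover rattleList. With the 128-thread blocks used by CallRattle1Kernel
// below, the block counts might be derived as:
#if 0
const int block = 128;
const int nSettleBlocks = (nSettles + block - 1) / block;
const int nShakeBlocks  = (nShakes  + block - 1) / block;
// nTotalBlocks = nSettleBlocks + nShakeBlocks
#endif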

void CallRattle1Kernel(
  const bool doFixed,
  int numAtoms,
  // Settle Parameters
  const double dt,
  const double invdt,
  const int nSettles,
  double * __restrict vel_x,
  double * __restrict vel_y,
  double * __restrict vel_z,
  const double * __restrict pos_x,
  const double * __restrict pos_y,
  const double * __restrict pos_z,
  double * __restrict f_normal_x,
  double * __restrict f_normal_y,
  double * __restrict f_normal_z,
  const float * __restrict mass,
  const int * __restrict hydrogenGroupSize,
  const float * __restrict rigidBondLength,
  const int * __restrict atomFixed,
  int * __restrict settleList,
  const SettleParameters * sp,
  const CudaRattleElem * __restrict rattleList,
  const int nShakes,
  const BigReal tol2_d,
  const int maxiter_d,
  int* consFailure,
  const int nSettleBlocks,
  const int nShakeBlocks,
  const WaterModel water_model,
  cudaStream_t stream
) {
  const int nTotalBlocks = (nSettleBlocks + nShakeBlocks);
#define CALL(WATER_MODEL, DOFIXED) \
  Rattle1Kernel<getWaterModelGroupSize(WATER_MODEL), WATER_MODEL, DOFIXED> \
  <<<nTotalBlocks, 128, 0, stream>>> \
  (numAtoms, dt, invdt, nSettles, \
   vel_x, vel_y, vel_z, \
   pos_x, pos_y, pos_z, \
   f_normal_x, f_normal_y, f_normal_z, \
   mass, hydrogenGroupSize, rigidBondLength, atomFixed, \
   settleList, sp, \
   rattleList, nShakes, tol2_d, maxiter_d, consFailure, nSettleBlocks);
  if (doFixed) {
    switch (water_model) {
      case WaterModel::SWM4: CALL(WaterModel::SWM4, true); break;
      case WaterModel::TIP4: CALL(WaterModel::TIP4, true); break;
      case WaterModel::TIP3: CALL(WaterModel::TIP3, true); break;
    }
  } else {
    switch (water_model) {
      case WaterModel::SWM4: CALL(WaterModel::SWM4, false); break;
      case WaterModel::TIP4: CALL(WaterModel::TIP4, false); break;
      case WaterModel::TIP3: CALL(WaterModel::TIP3, false); break;
    }
  }
#undef CALL
  cudaCheck(cudaStreamSynchronize(stream));  // error-checked, as in Settle()
}