2 *** Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
3 *** The Board of Trustees of the University of Illinois.
4 *** All rights reserved.
13 #include <hip/hip_runtime.h>
19 #include "MShakeKernel.h"
20 #include "HipDefines.h"
24 typedef float BigReal;
26 typedef double BigReal;
29 //__constant__ SettleParameters constSP;
31 // Haochuan: this function is not used so I disable it
33 __global__ void rattlePair(
35 const double * __restrict vel_x,
36 const double * __restrict vel_y,
37 const double * __restrict vel_z,
38 const double * __restrict pos_x,
39 const double * __restrict pos_y,
40 const double * __restrict pos_z,
41 double * __restrict velNew_x,
42 double * __restrict velNew_y,
43 double * __restrict velNew_z,
44 double * __restrict posNew_x,
45 double * __restrict posNew_y,
46 double * __restrict posNew_z,
47 const int * __restrict hydrogenGroupSize,
48 const float * __restrict rigidBondLength,
49 const int * __restrict atomFixed,
50 CudaRattleElem * __restrict rattlePairList,
53 int tid = threadIdx.x + blockIdx.x*blockDim.x;
54 int stride = blockDim.x * gridDim.x;
55 for(int i = tid; i < nRattlePairs; i+= stride){
57 int ig = rattlePairList[i].ig;
58 CudaRattleParam param = rattlePairList[i].params[0];
59 int a = ig + param.ia;
60 int b = ig + param.ib;
61 BigReal pabx = posNew_x[a] - posNew_x[b];
62 BigReal paby = posNew_y[a] - posNew_y[b];
63 BigReal pabz = posNew_z[a] - posNew_z[b];
64 BigReal pabsq = pabx*pabx + paby*paby + pabz*pabz;
65 BigReal rabsq = param.dsq;
66 BigReal diffsq = rabsq - pabsq;
67 BigReal rabx = pos_x[a] - pos_x[b];
68 BigReal raby = pos_y[a] - pos_y[b];
69 BigReal rabz = pos_z[a] - pos_z[b];
71 BigReal refsq = rabx*rabx + raby*raby + rabz*rabz;
72 BigReal rpab = rabx*pabx + raby*paby + rabz*pabz;
74 BigReal rma = param.rma;
75 BigReal rmb = param.rmb;
78 BigReal sqrtarg = rpab*rpab + refsq*diffsq;
84 gab = (-rpab + sqrt(sqrtarg))/(refsq*(rma + rmb));
86 BigReal dpx = rabx * gab;
87 BigReal dpy = raby * gab;
88 BigReal dpz = rabz * gab;
89 posNew_x[a] += rma * dpx;
90 posNew_y[a] += rma * dpy;
91 posNew_z[a] += rma * dpz;
92 posNew_x[b] -= rmb * dpx;
93 posNew_y[b] -= rmb * dpy;
94 posNew_z[b] -= rmb * dpz;
99 // Haochuan: this function does not support SWM4 and TIP4,
100 // and is not used anywhere, so I disable it.
102 __global__ void settleInit(int waterIndex, const float *mass,
103 const int *hydrogenGroupSize, const float *rigidBondLength,
104 SettleParameters *sp){
106 //This initializes the water parameters
108 //Water index points to oxygen, oatm poins to a hydrogen in the group
109 float pmO, pmH, hhdist, ohdist;
110 int oatm = waterIndex+1;
111 oatm += 1*(mass[waterIndex] < 0.5 || mass[waterIndex+1] < 0.5);
113 //Assigns values to everyone
114 pmO = mass[waterIndex];
116 hhdist = rigidBondLength[waterIndex];
117 ohdist = rigidBondLength[oatm];
118 float rmT = 1.f / (pmO+pmH+pmH);
119 sp->mOrmT = pmO * rmT;
120 sp->mHrmT = pmH * rmT;
121 BigReal t1 = 0.5f*pmO/pmH;
122 sp->rc = 0.5f*hhdist;
123 sp->ra = sqrt(ohdist*ohdist-sp->rc*sp->rc)/(1.0+t1);
125 sp->rra = 1.f / sp->ra;
129 __forceinline__ __device__ void settle1(
130 const double ref[3][3],
132 const SettleParameters *sp) {
134 // swiped from Settle.C
155 double mOrmT = sp->mOrmT;
156 double mHrmT = sp->mHrmT;
157 double rra = sp->rra;
182 // vectors in the plane of the original positions
183 BigReal b0x = ref1x - ref0x;
184 BigReal b0y = ref1y - ref0y;
185 BigReal b0z = ref1z - ref0z;
187 BigReal c0x = ref2x - ref0x;
188 BigReal c0y = ref2y - ref0y;
189 BigReal c0z = ref2z - ref0z;
191 // new center of mass
192 BigReal d0x = pos0x*mOrmT + ((pos1x + pos2x)*mHrmT);
193 BigReal d0y = pos0y*mOrmT + ((pos1y + pos2y)*mHrmT);
194 BigReal d0z = pos0z*mOrmT + ((pos1z + pos2z)*mHrmT);
196 BigReal a1x = pos0x - d0x;
197 BigReal a1y = pos0y - d0y;
198 BigReal a1z = pos0z - d0z;
200 BigReal b1x = pos1x - d0x;
201 BigReal b1y = pos1y - d0y;
202 BigReal b1z = pos1z - d0z;
204 BigReal c1x = pos2x - d0x;
205 BigReal c1y = pos2y - d0y;
206 BigReal c1z = pos2z - d0z;
208 // Vectors describing transformation from original coordinate system to
209 // the 'primed' coordinate system as in the diagram.
211 BigReal n0x = b0y*c0z-c0y*b0z;
212 BigReal n0y = c0x*b0z-b0x*c0z;
213 BigReal n0z = b0x*c0y-c0x*b0y;
216 BigReal n1x = a1y*n0z-n0y*a1z;
217 BigReal n1y = n0x*a1z-a1x*n0z;
218 BigReal n1z = a1x*n0y-n0x*a1y;
220 BigReal n2x = n0y*n1z-n1y*n0z;
221 BigReal n2y = n1x*n0z-n0x*n1z;
222 BigReal n2z = n0x*n1y-n1x*n0y;
224 //Change by rsqrtf later
225 BigReal n0inv = 1.0/sqrt(n0x*n0x + n0y*n0y + n0z*n0z);
230 BigReal n1inv = 1.0/sqrt(n1x*n1x + n1y*n1y + n1z*n1z);
235 BigReal n2inv = 1.0/sqrt(n2x*n2x + n2y*n2y + n2z*n2z);
240 //b0 = Vector(n1*b0, n2*b0, n0*b0); // note: b0.z is never referenced again
241 BigReal n1b0 = n1x*b0x + n1y*b0y + n1z*b0z;
242 BigReal n2b0 = n2x*b0x + n2y*b0y + n2z*b0z;
244 //c0 = Vector(n1*c0, n2*c0, n0*c0); // note: c0.z is never referenced again
245 BigReal n1c0 = n1x*c0x + n1y*c0y + n1z*c0z;
246 BigReal n2c0 = n2x*c0x + n2y*c0y + n2z*c0z;
248 BigReal A1Z = n0x*a1x + n0y*a1y + n0z*a1z;
250 //b1 = Vector(n1*b1, n2*b1, n0*b1);
251 BigReal n1b1 = n1x*b1x + n1y*b1y + n1z*b1z;
252 BigReal n2b1 = n2x*b1x + n2y*b1y + n2z*b1z;
253 BigReal n0b1 = n0x*b1x + n0y*b1y + n0z*b1z;
255 //c1 = Vector(n1*c1, n2*c1, n0*c1);
256 BigReal n1c1 = n1x*c1x + n1y*c1y + n1z*c1z;
257 BigReal n2c1 = n2x*c1x + n2y*c1y + n2z*c1z;
258 BigReal n0c1 = n0x*c1x + n0y*c1y + n0z*c1z;
260 // now we can compute positions of canonical water
261 BigReal sinphi = A1Z * rra;
262 BigReal tmp = 1.0-sinphi*sinphi;
263 BigReal cosphi = sqrt(tmp);
264 BigReal sinpsi = (n0b1 - n0c1)/(2.0*rc*cosphi);
265 tmp = 1.0-sinpsi*sinpsi;
266 BigReal cospsi = sqrt(tmp);
268 BigReal rbphi = -rb*cosphi;
269 BigReal tmp1 = rc*sinpsi*sinphi;
270 //BigReal tmp2 = rc*sinpsi*cosphi;
272 //Vector a2(0, ra*cosphi, ra*sinphi);
273 BigReal a2y = ra*cosphi;
275 //Vector b2(-rc*cospsi, rbphi - tmp1, -rb*sinphi + tmp2);
276 BigReal b2x = -rc*cospsi;
277 BigReal b2y = rbphi - tmp1;
279 //Vector c2( rc*cosphi, rbphi + tmp1, -rb*sinphi - tmp2);
280 BigReal c2y = rbphi + tmp1;
282 // there are no a0 terms because we've already subtracted the term off
283 // when we first defined b0 and c0.
284 BigReal alpha = b2x*(n1b0 - n1c0) + n2b0*b2y + n2c0*c2y;
285 BigReal beta = b2x*(n2c0 - n2b0) + n1b0*b2y + n1c0*c2y;
286 BigReal gama = n1b0*n2b1 - n1b1*n2b0 + n1c0*n2c1 - n1c1*n2c0;
288 BigReal a2b2 = alpha*alpha + beta*beta;
289 BigReal sintheta = (alpha*gama - beta*sqrt(a2b2 - gama*gama))/a2b2;
290 BigReal costheta = sqrt(1.0 - sintheta*sintheta);
292 //Vector a3( -a2y*sintheta,
295 BigReal a3x = -a2y*sintheta;
296 BigReal a3y = a2y*costheta;
299 // Vector b3(b2x*costheta - b2y*sintheta,
300 // b2x*sintheta + b2y*costheta,
302 BigReal b3x = b2x*costheta - b2y*sintheta;
303 BigReal b3y = b2x*sintheta + b2y*costheta;
306 // Vector c3(-b2x*costheta - c2y*sintheta,
307 // -b2x*sintheta + c2y*costheta,
309 BigReal c3x = -b2x*costheta - c2y*sintheta;
310 BigReal c3y = -b2x*sintheta + c2y*costheta;
313 // undo the transformation; generate new normal vectors from the transpose.
314 // Vector m1(n1.x, n2.x, n0.x);
319 // Vector m2(n1.y, n2.y, n0.y);
324 // Vector m0(n1.z, n2.z, n0.z);
329 //pos[i*3+0] = Vector(a3*m1, a3*m2, a3*m0) + d0;
330 pos0x = a3x*m1x + a3y*m1y + a3z*m1z + d0x;
331 pos0y = a3x*m2x + a3y*m2y + a3z*m2z + d0y;
332 pos0z = a3x*m0x + a3y*m0y + a3z*m0z + d0z;
334 // pos[i*3+1] = Vector(b3*m1, b3*m2, b3*m0) + d0;
335 pos1x = b3x*m1x + b3y*m1y + b3z*m1z + d0x;
336 pos1y = b3x*m2x + b3y*m2y + b3z*m2z + d0y;
337 pos1z = b3x*m0x + b3y*m0y + b3z*m0z + d0z;
339 // pos[i*3+2] = Vector(c3*m1, c3*m2, c3*m0) + d0;
340 pos2x = c3x*m1x + c3y*m1y + c3z*m1z + d0x;
341 pos2y = c3x*m2x + c3y*m2y + c3z*m2z + d0y;
342 pos2z = c3x*m0x + c3y*m0y + c3z*m0z + d0z;
355 // swipe from HomePatch::tip4_omrepos
356 /* Reposition the om particle of a tip4p water
357 * A little geometry shows that the appropriate position is given by
358 * R_O + (1 / 2 r_ohc) * ( 0.5 (R_H1 + R_H2) - R_O )
359 * Here r_om is the distance from the oxygen to Om site, and r_ohc
360 * is the altitude from the oxygen to the hydrogen center of mass
361 * Those quantities are precalculated upon initialization of HomePatch
363 * Ordering of TIP4P atoms: O, H1, H2, LP.
365 __device__ void tip4_Om_reposition(
367 const double r_om, const double r_ohc) {
368 const double factor = r_om / r_ohc;
369 pos[3][0] = pos[0][0] + (0.5 * (pos[1][0] + pos[2][0]) - pos[0][0]) * factor;
370 pos[3][1] = pos[0][1] + (0.5 * (pos[1][1] + pos[2][1]) - pos[0][1]) * factor;
371 pos[3][2] = pos[0][2] + (0.5 * (pos[1][2] + pos[2][2]) - pos[0][2]) * factor;
374 // swipe from HomePatch::swm4_omrepos
375 /* Reposition lonepair (Om) particle of Drude SWM4 water.
376 * Same comments apply as to tip4_omrepos(), but the ordering of atoms
377 * is different: O, D, LP, H1, H2.
379 __device__ void swm4_Om_reposition(
381 const double r_om, const double r_ohc) {
382 const double factor = r_om / r_ohc;
383 pos[2][0] = pos[0][0] + (0.5 * (pos[3][0] + pos[4][0]) - pos[0][0]) * factor;
384 pos[2][1] = pos[0][1] + (0.5 * (pos[3][1] + pos[4][1]) - pos[0][1]) * factor;
385 pos[2][2] = pos[0][2] + (0.5 * (pos[3][2] + pos[4][2]) - pos[0][2]) * factor;
389 void MSHAKEIterate(const int icnt, const CudaRattleElem* rattleList,
390 const BigReal *refx, const BigReal *refy, const BigReal *refz,
391 BigReal *posx, BigReal *posy, BigReal *posz,
392 const BigReal tol2, const int maxiter,
393 bool& done, bool& consFailure);
395 __device__ inline BigReal det_3by3(BigReal A[4][4])
398 A[0][0]*(A[1][1]*A[2][2]-A[1][2]*A[2][1])-
399 A[0][1]*(A[1][0]*A[2][2]-A[1][2]*A[2][0])+
400 A[0][2]*(A[1][0]*A[2][1]-A[1][1]*A[2][0]);
403 __device__ void swap_row(BigReal A[4][4], BigReal b[4], int r1, int r2)
406 for (int i = 0; i < 4; i++)
408 BigReal* p1 = &A[r1][i];
409 BigReal* p2 = &A[r2][i];
422 __device__ void solve_4by4(BigReal lambda [4], BigReal A[4][4], BigReal sigma[4])
426 for (int k = 0; k < 4; ++k)
430 BigReal Max = A[k][k];
432 for (int row = k + 1; row < 4; ++row)
434 if ((tmp = fabs(A[row][k])) > Max)
441 swap_row(A, sigma, k, piv_row);
443 for (int row = k + 1; row < 4; ++row)
445 tmp = A[row][k]/ A[k][k];
446 for (int col = k+1; col < 4; col++)
447 A[row][col] -= tmp * A[k][col];
449 sigma[row]-= tmp * sigma[k];
452 for (int row = 3; row >= 0; --row)
455 for (int j = 3; j > row; --j)
456 tmp -= lambda[j] * A[row][j];
457 lambda[row] = tmp / A[row][row];
462 __device__ void solveMatrix(BigReal lambda [4], BigReal A[4][4], BigReal sigma[4], int icnt)
468 lambda[0] = sigma[0]/A[0][0];
474 BigReal det=1./(A[0][0]*A[1][1]-A[0][1]*A[1][0]);
475 lambda[0] = ( A[1][1]*sigma[0]-A[0][1]*sigma[1])*det;
476 lambda[1] = (-A[1][0]*sigma[0]+A[0][0]*sigma[1])*det;
481 BigReal det = 1./det_3by3(A);
482 lambda[0] = det*((A[1][1]*A[2][2]-A[1][2]*A[2][1])*sigma[0]-
483 (A[0][1]*A[2][2]-A[0][2]*A[2][1])*sigma[1]+
484 (A[0][1]*A[1][2]-A[0][2]*A[1][1])*sigma[2]);
486 lambda[1] = det*((A[1][2]*A[2][0]-A[1][0]*A[2][2])*sigma[0]+
487 (A[0][0]*A[2][2]-A[0][2]*A[2][0])*sigma[1]-
488 (A[0][0]*A[1][2]-A[0][2]*A[1][0])*sigma[2]);
490 lambda[2] = det*((A[1][0]*A[2][1]-A[1][1]*A[2][0])*sigma[0]-
491 (A[0][0]*A[2][1]-A[0][1]*A[2][0])*sigma[1]+
492 (A[0][0]*A[1][1]-A[0][1]*A[1][0])*sigma[2]);
497 solve_4by4(lambda, A, sigma);
504 // The choice of launch bounds is determined based on nvcc output of how many
505 // registers the kernel is using. We see optimal performance using a register
506 // limit of 128, which corresponds to __launch_bounds__(512,1).
508 template <bool DOENERGY, bool DOFIXED>
511 __launch_bounds__(512,1)
513 //TODO:HIP - tune for HIP
514 // __launch_bounds__(512,1)
516 void MSHAKE_CUDA_Kernel(
517 const CudaRattleElem *rattleList,
520 const BigReal* __restrict__ refx_d,
521 const BigReal* __restrict__ refy_d,
522 const BigReal* __restrict__ refz_d,
523 BigReal* __restrict__ posx_d,
524 BigReal* __restrict__ posy_d,
525 BigReal* __restrict__ posz_d,
526 const BigReal* __restrict__ ref_velx_d, // for fixed atoms icnt == 0
527 const BigReal* __restrict__ ref_vely_d, // for fixed atoms icnt == 0
528 const BigReal* __restrict__ ref_velz_d, // for fixed atoms icnt == 0
529 BigReal* __restrict__ velx_d,
530 BigReal* __restrict__ vely_d,
531 BigReal* __restrict__ velz_d,
532 BigReal* __restrict__ f_normal_x,
533 BigReal* __restrict__ f_normal_y,
534 BigReal* __restrict__ f_normal_z,
535 cudaTensor* __restrict virial,
536 const float* __restrict mass,
538 const BigReal tol2_d,
541 const int* __restrict__ atomFixed)
543 // TODO: why is the CUDA (non-shared) path broken?
545 __shared__ BigReal sh_posx[128][4+1];
546 __shared__ BigReal sh_posy[128][4+1];
547 __shared__ BigReal sh_posz[128][4+1];
548 __shared__ BigReal sh_refx[128][4+1];
549 __shared__ BigReal sh_refy[128][4+1];
550 __shared__ BigReal sh_refz[128][4+1];
552 int idx = threadIdx.x + blockIdx.x * blockDim.x;
553 // cudaTensor lVirial;
554 // lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
555 // lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
556 // lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
559 consFailure[idx] = 0;
560 int ig = rattleList[idx].ig;
561 int icnt = rattleList[idx].icnt;
562 BigReal tol2 = tol2_d;
563 BigReal sigma[4] = {0};
564 BigReal lambda[4]= {0};
565 BigReal A[4][4] = {0};
566 // BigReal df[3] = {0};
567 BigReal pabx[4] = {0};
568 BigReal rabx[4] = {0};
569 BigReal paby[4] = {0};
570 BigReal raby[4] = {0};
571 BigReal pabz[4] = {0};
572 BigReal rabz[4] = {0};
574 BigReal posx[4+1] = {0};
575 BigReal posy[4+1] = {0};
576 BigReal posz[4+1] = {0};
577 BigReal refx[4+1] = {0};
578 BigReal refy[4+1] = {0};
579 BigReal refz[4+1] = {0};
581 BigReal* posx = &sh_posx[threadIdx.x][0];
582 BigReal* posy = &sh_posy[threadIdx.x][0];
583 BigReal* posz = &sh_posz[threadIdx.x][0];
584 BigReal* refx = &sh_refx[threadIdx.x][0];
585 BigReal* refy = &sh_refy[threadIdx.x][0];
586 BigReal* refz = &sh_refz[threadIdx.x][0];
587 for(int i = 0; i < 4+1; ++i)
597 //load datas from global memory to local memory
598 CudaRattleParam rattleParam[4]; //Does this work?
599 for(int i = 0; i < 4+1; ++i)
600 //for(int i = 0; i < hgs[i]; ++i)
602 //rattleParam[i] = rattleParam_d[4*idx+i];
603 if (i < 4) rattleParam[i] = rattleList[idx].params[i];
604 /*posx[i] = posx_d[i+4*idx];
605 posy[i] = posy_d[i+4*idx];
606 posz[i] = posz_d[i+4*idx];
607 refx[i] = refx_d[i+4*idx];
608 refx[i] = refx_d[i+4*idx];
609 refx[i] = refx_d[i+4*idx];
610 refy[i] = refy_d[i+4*idx];
611 refz[i] = refz_d[i+4*idx];*/
612 //I can pass the HGS and check if I is >= HGS[i]
614 posx[i] = posx_d[i+ig];
615 posy[i] = posy_d[i+ig];
616 posz[i] = posz_d[i+ig];
617 refx[i] = refx_d[i+ig];
618 refy[i] = refy_d[i+ig];
619 refz[i] = refz_d[i+ig];
624 // Haochuan: there are atoms fixed in the group if icnt == 0,
625 // which should be in the noconstList in the CPU code
626 // and simulated without constraints, but the CUDA
627 // code here count all of them into the rattleList,
628 // so I check icnt == 0 and skip the whole MSHAKE.
629 if (!DOFIXED || (DOFIXED && icnt > 0)) {
631 for(int i = 0; i < 4; ++i)
633 int a = rattleParam[i].ia;
634 int b = rattleParam[i].ib;
635 pabx[i] = posx[a] - posx[b];
636 paby[i] = posy[a] - posy[b];
637 pabz[i] = posz[a] - posz[b];
638 rabx[i] = refx[a] - refx[b];
639 raby[i] = refy[a] - refy[b];
640 rabz[i] = refz[a] - refz[b];
643 for(int i = 0; i < 4; ++i)
645 BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
646 BigReal rabsq = rattleParam[i].dsq;
647 BigReal diffsq = pabsq - rabsq;
649 if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
652 int maxiter = maxiter_d;
653 for(loop = 0; loop < maxiter; ++loop)
659 for(int j = 0; j < 4; ++j)
661 BigReal rma = rattleParam[j].rma;
663 for(int i = 0; i < 4; ++i)
665 A[j][i] = 2.*(pabx[j]*rabx[i]+paby[j]*raby[i]+pabz[j]*rabz[i])*rma;
667 BigReal rmb = rattleParam[j].rmb;
668 A[j][j] += 2.*(pabx[j]*rabx[j]+paby[j]*raby[j]+pabz[j]*rabz[j])*rmb;
671 solveMatrix(lambda, A, sigma, icnt);
673 for(int i = 0; i < 4; ++i)
675 int a = rattleParam[i].ia;
676 int b = rattleParam[i].ib;
677 BigReal rma = rattleParam[i].rma * lambda[i];
678 BigReal rmb = rattleParam[i].rmb * lambda[i];
680 posx[a] -= rma * rabx[i];
681 posy[a] -= rma * raby[i];
682 posz[a] -= rma * rabz[i];
683 posx[b] += rmb * rabx[i];
684 posy[b] += rmb * raby[i];
685 posz[b] += rmb * rabz[i];
692 for(int i = 0; i < 4; ++i)
694 int a = rattleParam[i].ia;
695 int b = rattleParam[i].ib;
696 pabx[i] = posx[a] - posx[b];
697 paby[i] = posy[a] - posy[b];
698 pabz[i] = posz[a] - posz[b];
699 BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
700 BigReal rabsq = rattleParam[i].dsq;
701 BigReal diffsq = pabsq - rabsq;
703 if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
708 consFailure[idx] = 1;
712 for(int i = 0; i < hgs[ig]; ++i)
714 posx_d[i+ig] = posx[i];
715 posy_d[i+ig] = posy[i];
716 posz_d[i+ig] = posz[i];
719 // double vx = velx_d[i + ig];
720 // double vy = vely_d[i + ig];
721 // double vz = velz_d[i + ig];
722 double vnx = (posx[i] - refx[i]) * invdt;
723 double vny = (posy[i] - refy[i]) * invdt;
724 double vnz = (posz[i] - refz[i]) * invdt;
725 // velx_d[i+ig] = (posx[i] - refx[i]) * invdt;
726 // vely_d[i+ig] = (posy[i] - refy[i]) * invdt;
727 // velz_d[i+ig] = (posz[i] - refz[i]) * invdt;
729 velx_d[i + ig] = vnx;
730 vely_d[i + ig] = vny;
731 velz_d[i + ig] = vnz;
734 consFailure[idx] = 0;
735 //printf("Loop is %d %d\n", loop, consFailure[idx]);
738 // Fixed atoms without constraints
739 for(int i = 0; i < hgs[ig]; ++i) {
740 // Copy from the old/reference positions.
741 posx_d[i+ig] = refx[i];
742 posy_d[i+ig] = refy[i];
743 posz_d[i+ig] = refz[i];
744 // velx_d[i+ig] = ref_velx_d[i+ig];
745 // vely_d[i+ig] = ref_vely_d[i+ig];
746 // velz_d[i+ig] = ref_velz_d[i+ig];
748 } // !DOFIXED || (DOFIXED && icnt > 0)
752 // XXX TODO: Fix this, it's horrible
754 void CheckConstraints(int* consFailure, int* consFailure_h, int size)
756 __shared__ int result[128];
757 int idx = threadIdx.x;
759 for(int i = idx; i < size; i += 128)
760 result[idx] += consFailure[i];
763 result[idx] += result[idx+64];
766 result[idx] += result[idx+32];
769 result[idx] += result[idx+16];
772 result[idx] += result[idx+8];
775 result[idx] += result[idx+4];
778 result[idx] += result[idx+2];
781 result[idx] += result[idx+1];
785 consFailure_h[0] += result[0];
786 //printf("Constraints %d\n", onsFailure[0]);
793 const CudaRattleElem* rattleList,
795 const int *hydrogenGroupSize,
796 const int *atomFixed,
803 const double *ref_velx,
804 const double *ref_vely,
805 const double *ref_velz,
812 cudaTensor* rigidVirial,
822 //fprintf(stderr, "No rattles, returning\n");
826 int gridDim = int(size/128)+(size%128==0 ? 0 : 1);
828 #define CALL(DOENERGY, DOFIXED) \
829 MSHAKE_CUDA_Kernel<DOENERGY, DOFIXED><<<gridDim, blockDim, 0, stream>>>( \
830 rattleList, size, hydrogenGroupSize, \
831 refx, refy, refz, posx, posy, posz, \
832 ref_velx, ref_vely, ref_velz, \
834 f_normal_x, f_normal_y, f_normal_z, rigidVirial, \
835 mass, invdt, tol2, maxiter, consFailure_d, atomFixed)
836 if ( doEnergy && doFixed) CALL(true, true);
837 else if ( doEnergy && !doFixed) CALL(true, false);
838 else if (!doEnergy && doFixed) CALL(false, true);
839 else if (!doEnergy && !doFixed) CALL(false, false);
840 else NAMD_bug("No kernel function is called in MSHAKE_CUDA.\n");
842 // cudaCheck(cudaGetLastError());
843 // JM NOTE: We can check the consFailure flag for every migration step
844 // CheckConstraints<<<1,128, 0, stream>>>(consFailure_d, consFailure, size);
845 // I can't do this here
846 // why do I need this variable?????
847 //done = !consFailure;
851 // Perform a warp-wide read on 3-element clusters from global memory
852 // into shared memory.
853 // a - global memory buffer
854 // buf - shared memory buffer, dimension BLOCK*3
855 // list - index offset to beginning of each 3-element cluster
856 // n - total number of clusters
858 // We apply this to improve reading of water (positions and velocities).
859 // Data in global memory is stored in SOA form: pos_x, pos_y, pos_z, etc,
860 // so we call this routine six times to read positions and velocities.
861 // Reading is coalesced whenever water molecules are all consecutive.
862 // Although waters are sorted to the end of each patch, there will
863 // generally be gaps between patches due to solute.
865 template <int VECTOR_SIZE>
866 __device__ __forceinline__ void ld_vecN(const double *a, double buf[][VECTOR_SIZE], int *list, int n)
868 int laneid = threadIdx.x % warpSize;
869 int warpid = threadIdx.x / warpSize;
873 //TODO:HIP verify for HIP
877 for (int j = 0; j < VECTOR_SIZE; j++) {
878 int offset = laneid + j*warpSize;
879 int ii = offset % VECTOR_SIZE;
880 int jj = warpid*warpSize + offset/VECTOR_SIZE;
881 if (blockIdx.x*blockDim.x + jj < n) {
882 int idx = list[jj] + ii;
883 buf[jj][ii] = a[idx];
889 //TODO:HIP verify for HIP
895 // Overload for float
896 template <int VECTOR_SIZE>
897 __device__ __forceinline__ void ld_vecN(const float *a, float buf[][VECTOR_SIZE], int *list, int n)
899 int laneid = threadIdx.x % warpSize;
900 int warpid = threadIdx.x / warpSize;
904 //TODO:HIP verify for HIP
908 for (int j = 0; j < VECTOR_SIZE; j++) {
909 int offset = laneid + j*warpSize;
910 int ii = offset % VECTOR_SIZE;
911 int jj = warpid*warpSize + offset/VECTOR_SIZE;
912 if (blockIdx.x*blockDim.x + jj < n) {
913 int idx = list[jj] + ii;
914 buf[jj][ii] = a[idx];
920 //TODO:HIP verify for HIP
927 // Same as previous, only this time we write to global memory from
930 template <int VECTOR_SIZE>
931 __device__ __forceinline__ void st_vecN(double *a, double buf[][VECTOR_SIZE], int *list, int n)
933 int laneid = threadIdx.x % warpSize;
934 int warpid = threadIdx.x / warpSize;
938 //TODO:HIP verify for HIP
941 for (int j = 0; j < VECTOR_SIZE; j++) {
942 int offset = laneid + j*warpSize;
943 int ii = offset % VECTOR_SIZE;
944 int jj = warpid*warpSize + offset/VECTOR_SIZE;
945 if (blockIdx.x*blockDim.x + jj < n) {
946 int idx = list[jj] + ii;
947 a[idx] = buf[jj][ii];
953 //TODO:HIP verify for HIP
959 // Same as previous, but accumulates the value instead of copying
961 template <int VECTOR_SIZE>
962 __device__ __forceinline__ void acc_vecN(double* __restrict__ a, double buf[][VECTOR_SIZE], int *list, int n)
964 int laneid = threadIdx.x % warpSize;
965 int warpid = threadIdx.x / warpSize;
969 //TODO:HIP verify for HIP
972 for (int j = 0; j < VECTOR_SIZE; j++) {
973 int offset = laneid + j*warpSize;
974 int ii = offset % VECTOR_SIZE;
975 int jj = warpid*warpSize + offset/VECTOR_SIZE;
976 if (blockIdx.x*blockDim.x + jj < n) {
977 int idx = list[jj] + ii;
978 a[idx] += buf[jj][ii];
984 //TODO:HIP verify for HIP
989 // Haochuan: fixed atoms in water are already excluded from the
990 // settleList, so we don't need to do anything related to
993 // The choice of launch bounds is determined based on nvcc output of how many
994 // registers the kernel is using. We see optimal performance using a register
995 // limit of 128, which corresponds to __launch_bounds__(512,1).
997 template <bool DOENERGY, int BLOCK,
998 int WATER_GROUP_SIZE,
999 WaterModel WATER_MODEL>
1002 __launch_bounds__(512,1)
1005 // __launch_bounds__(512,1)
1012 const double * __restrict vel_x,
1013 const double * __restrict vel_y,
1014 const double * __restrict vel_z,
1015 const double * __restrict pos_x,
1016 const double * __restrict pos_y,
1017 const double * __restrict pos_z,
1018 double * __restrict velNew_x,
1019 double * __restrict velNew_y,
1020 double * __restrict velNew_z,
1021 double * __restrict posNew_x,
1022 double * __restrict posNew_y,
1023 double * __restrict posNew_z,
1024 double * __restrict f_normal_x,
1025 double * __restrict f_normal_y,
1026 double * __restrict f_normal_z,
1027 cudaTensor* __restrict virial,
1028 const float* __restrict mass,
1029 const int * __restrict hydrogenGroupSize,
1030 const float * __restrict rigidBondLength,
1031 const int * __restrict atomFixed,
1032 const int * __restrict settleList,
1033 const SettleParameters * __restrict sp)
1035 int tid = threadIdx.x + blockIdx.x*blockDim.x;
1036 double ref[WATER_GROUP_SIZE][3] = {0};
1037 double pos[WATER_GROUP_SIZE][3] = {0};
1038 double vel[WATER_GROUP_SIZE][3] = {0};
1040 __shared__ int list[BLOCK];
1041 __shared__ double buf[BLOCK][WATER_GROUP_SIZE];
1042 if (tid < nSettles) {
1043 list[threadIdx.x] = settleList[tid];
1046 // Do a warp-wide read on pos* and vel* from global memory to shmem
1047 //BEGIN OF FIRST PART OF RATTLE1: Settle_SIMD invocation
1048 ld_vecN<WATER_GROUP_SIZE>(pos_x, buf, list, nSettles);
1050 for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][0] = buf[threadIdx.x][i];
1051 ld_vecN<WATER_GROUP_SIZE>(pos_y, buf, list, nSettles);
1053 for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][1] = buf[threadIdx.x][i];
1054 ld_vecN<WATER_GROUP_SIZE>(pos_z, buf, list, nSettles);
1056 for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][2] = buf[threadIdx.x][i];
1057 ld_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
1059 for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][0] = buf[threadIdx.x][i];
1060 ld_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
1062 for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][1] = buf[threadIdx.x][i];
1063 ld_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
1065 for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][2] = buf[threadIdx.x][i];
1068 for (int i = 0; i < WATER_GROUP_SIZE; ++i) {
1069 pos[i][0] = ref[i][0] + vel[i][0]*dt;
1070 pos[i][1] = ref[i][1] + vel[i][1]*dt;
1071 pos[i][2] = ref[i][2] + vel[i][2]*dt;
1074 switch (WATER_MODEL) {
1075 case WaterModel::SWM4: {
1076 // SWM4 ordering: O D LP H1 H2
1077 // do swap(O,LP) and call settle with subarray O H1 H2
1081 for (int i = 0; i < 3; ++i) {
1082 lp_ref[i] = ref[2][i];
1083 lp_pos[i] = pos[2][i];
1086 for (int i = 0; i < 3; ++i) {
1087 ref[2][i] = ref[0][i];
1088 pos[2][i] = pos[0][i];
1090 settle1(ref+2, pos+2, sp);
1091 // swap back after we return
1093 for (int i = 0; i < 3; ++i) {
1094 ref[0][i] = ref[2][i];
1095 pos[0][i] = pos[2][i];
1098 for (int i = 0; i < 3; ++i) {
1099 ref[2][i] = lp_ref[i];
1100 pos[2][i] = lp_pos[i];
1102 swm4_Om_reposition(pos, sp->r_om, sp->r_ohc);
1105 case WaterModel::TIP4: {
1106 settle1(ref, pos, sp);
1107 tip4_Om_reposition(pos, sp->r_om, sp->r_ohc);
1111 settle1(ref, pos, sp);
1115 // Update position and velocities with stuff calculated by settle1
1117 // Do a warp-wide write on pos* and vel* to global memory from shmem
1119 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][0] - ref[i][0])*invdt;
1120 st_vecN<WATER_GROUP_SIZE>(velNew_x, buf, list, nSettles);
1122 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][1] - ref[i][1])*invdt;
1123 st_vecN<WATER_GROUP_SIZE>(velNew_y, buf, list, nSettles);
1125 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][2] - ref[i][2])*invdt;
1126 st_vecN<WATER_GROUP_SIZE>(velNew_z, buf, list, nSettles);
1128 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][0];
1129 st_vecN<WATER_GROUP_SIZE>(posNew_x, buf, list, nSettles);
1131 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][1];
1132 st_vecN<WATER_GROUP_SIZE>(posNew_y, buf, list, nSettles);
1134 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = pos[i][2];
1135 st_vecN<WATER_GROUP_SIZE>(posNew_z, buf, list, nSettles);
1137 // force+velocity update
1138 if (tid < nSettles) {
1139 double vnx, vny, vnz;
1141 for(int i = 0 ; i < 3; i++){
1142 vnx = (pos[i][0] - ref[i][0]) * invdt;
1143 vny = (pos[i][1] - ref[i][1]) * invdt;
1144 vnz = (pos[i][2] - ref[i][2]) * invdt;
1146 df[0] = (vnx - vel[i][0]) * mass[i] * invdt;
1147 df[1] = (vny - vel[i][1]) * mass[i] * invdt;
1148 df[2] = (vnz - vel[i][2]) * mass[i] * invdt;
1150 velNew_x[ig+i] = vnx;
1151 velNew_y[ig+i] = vny;
1152 velNew_z[ig+i] = vnx;
1153 posNew_x[ig+i] = pos[i][0];
1154 posNew_y[ig+i] = pos[i][1];
1155 posNew_z[ig+i] = pos[i][2];
1156 f_normal_x[ig+i] += df[0];
1157 f_normal_y[ig+i] += df[1];
1158 f_normal_z[ig+i] += df[2];
1161 lVirial.xx += df[0] * pos[i][0];
1162 // lVirial.xy += df[0] * pos[i][1];
1163 // lVirial.xz += df[0] * pos[i][2];
1164 lVirial.yx += df[1] * pos[i][0];
1165 lVirial.yy += df[1] * pos[i][1];
1166 // lVirial.yz += df[1] * pos[i][2];
1167 lVirial.zx += df[2] * pos[i][0];
1168 lVirial.zy += df[2] * pos[i][1];
1169 lVirial.zz += df[2] * pos[i][2];
1174 typedef cub::BlockReduce<BigReal, 128> BlockReduce;
1175 __shared__ typename BlockReduce::TempStorage temp_storage;
1176 lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
1177 // lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
1178 // lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
1179 lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
1180 lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy); __syncthreads();
1181 // lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
1182 lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
1183 lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
1184 lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
1186 // Every block has a locally reduced blockVirial
1187 // Now every thread does an atomicAdd to get a global virial
1188 if(threadIdx.x == 0){
1189 atomicAdd(&(virial->xx), lVirial.xx);
1190 atomicAdd(&(virial->xy), lVirial.yx);
1191 atomicAdd(&(virial->xz), lVirial.zx);
1192 atomicAdd(&(virial->yx), lVirial.yx);
1193 atomicAdd(&(virial->yy), lVirial.yy);
1194 atomicAdd(&(virial->yz), lVirial.zy);
1195 atomicAdd(&(virial->zx), lVirial.zx);
1196 atomicAdd(&(virial->zy), lVirial.zy);
1197 atomicAdd(&(virial->zz), lVirial.zz);
1205 const bool doEnergy,
1210 const double * vel_x,
1211 const double * vel_y,
1212 const double * vel_z,
1213 const double * pos_x,
1214 const double * pos_y,
1215 const double * pos_z,
1222 double * f_normal_x,
1223 double * f_normal_y,
1224 double * f_normal_z,
1227 const int * hydrogenGroupSize,
1228 const float * rigidBondLength,
1229 const int * atomFixed,
1231 const SettleParameters * sp,
1232 const WaterModel water_model,
1233 cudaStream_t stream)
1237 const int blocks = 128;
1239 const int blocks = 128;
1241 int grid = (nSettles + blocks - 1) / blocks;
1242 #define CALL(DOENERGY, WATER_MODEL) \
1243 SettleKernel<DOENERGY, blocks, getWaterModelGroupSize(WATER_MODEL), WATER_MODEL> \
1244 <<< grid, blocks, 0, stream >>> \
1245 (numAtoms, dt, invdt, nSettles, \
1246 vel_x, vel_y, vel_z, \
1247 pos_x, pos_y, pos_z, \
1248 velNew_x, velNew_y, velNew_z, \
1249 posNew_x, posNew_y, posNew_z, \
1250 f_normal_x, f_normal_y, f_normal_z, virial, mass, \
1251 hydrogenGroupSize, rigidBondLength, atomFixed, \
1254 switch (water_model) {
1255 case WaterModel::SWM4: {
1256 if ( doEnergy) CALL(true, WaterModel::SWM4);
1257 if (!doEnergy) CALL(false, WaterModel::SWM4);
1260 case WaterModel::TIP4: {
1261 if ( doEnergy) CALL(true, WaterModel::TIP4);
1262 if (!doEnergy) CALL(false, WaterModel::TIP4);
1266 if ( doEnergy) CALL(true, WaterModel::TIP3);
1267 if (!doEnergy) CALL(false, WaterModel::TIP3);
1271 cudaCheck(cudaStreamSynchronize(stream));
1274 // Fused kernel for rigid bonds constraints
1275 // Each block will do a single operation -> either SETTLE or MSHAKE.
1276 template <int WATER_GROUP_SIZE,
1277 WaterModel WATER_MODEL,
1280 __launch_bounds__(512,1)
1283 // __launch_bounds__(512,1)
1285 __global__ void Rattle1Kernel(
1287 // Settle Parameters
1291 double * __restrict vel_x,
1292 double * __restrict vel_y,
1293 double * __restrict vel_z,
1294 const double * __restrict pos_x,
1295 const double * __restrict pos_y,
1296 const double * __restrict pos_z,
1297 double * __restrict f_normal_x,
1298 double * __restrict f_normal_y,
1299 double * __restrict f_normal_z,
1300 const float * __restrict mass,
1301 const int * __restrict hydrogenGroupSize,
1302 const float * __restrict rigidBondLength,
1303 const int * __restrict atomFixed,
1304 int * __restrict settleList,
1305 const SettleParameters * sp,
1306 const CudaRattleElem * __restrict rattleList,
1308 const BigReal tol2_d,
1309 const int maxiter_d,
1311 const int nSettleBlocks
1313 // Great, first step is fetching value
1314 int tid = threadIdx.x + blockIdx.x * blockDim.x;
1315 bool isSettle = blockIdx.x < nSettleBlocks;
1316 __shared__ int list[128];
1317 __shared__ double buf[128][WATER_GROUP_SIZE];
1319 __shared__ BigReal sh_posx[128][4+1];
1320 __shared__ BigReal sh_posy[128][4+1];
1321 __shared__ BigReal sh_posz[128][4+1];
1322 __shared__ BigReal sh_refx[128][4+1];
1323 __shared__ BigReal sh_refy[128][4+1];
1324 __shared__ BigReal sh_refz[128][4+1];
1329 list[threadIdx.x] = settleList[tid];
1331 double ref[WATER_GROUP_SIZE][3] = {0};
1332 double pos[WATER_GROUP_SIZE][3] = {0};
1333 double vel[WATER_GROUP_SIZE][3] = {0};
1334 float tmp_mass[WATER_GROUP_SIZE];
1336 ld_vecN<WATER_GROUP_SIZE>(pos_x, buf, list, nSettles);
1338 for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][0] = buf[threadIdx.x][i];
1339 ld_vecN<WATER_GROUP_SIZE>(pos_y, buf, list, nSettles);
1341 for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][1] = buf[threadIdx.x][i];
1342 ld_vecN<WATER_GROUP_SIZE>(pos_z, buf, list, nSettles);
1344 for (int i = 0; i < WATER_GROUP_SIZE; ++i) ref[i][2] = buf[threadIdx.x][i];
1345 ld_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
1347 for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][0] = buf[threadIdx.x][i];
1348 ld_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
1350 for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][1] = buf[threadIdx.x][i];
1351 ld_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
1353 for (int i = 0; i < WATER_GROUP_SIZE; ++i) vel[i][2] = buf[threadIdx.x][i];
1355 for (int i = 0; i < WATER_GROUP_SIZE; ++i) {
1356 pos[i][0] = ref[i][0] + vel[i][0]*dt;
1357 pos[i][1] = ref[i][1] + vel[i][1]*dt;
1358 pos[i][2] = ref[i][2] + vel[i][2]*dt;
1360 switch (WATER_MODEL) {
1361 case WaterModel::SWM4: {
1362 // SWM4 ordering: O D LP H1 H2
1363 // do swap(O,LP) and call settle with subarray O H1 H2
1367 for (int i = 0; i < 3; ++i) {
1368 lp_ref[i] = ref[2][i];
1369 lp_pos[i] = pos[2][i];
1372 for (int i = 0; i < 3; ++i) {
1373 ref[2][i] = ref[0][i];
1374 pos[2][i] = pos[0][i];
1376 settle1(ref+2, pos+2, sp);
1377 // swap back after we return
1379 for (int i = 0; i < 3; ++i) {
1380 ref[0][i] = ref[2][i];
1381 pos[0][i] = pos[2][i];
1384 for (int i = 0; i < 3; ++i) {
1385 ref[2][i] = lp_ref[i];
1386 pos[2][i] = lp_pos[i];
1388 swm4_Om_reposition(pos, sp->r_om, sp->r_ohc);
1391 case WaterModel::TIP4: {
1392 settle1(ref, pos, sp);
1393 tip4_Om_reposition(pos, sp->r_om, sp->r_ohc);
1397 settle1(ref, pos, sp);
1400 // load the mass to calculate the force
1401 if (WATER_MODEL != WaterModel::TIP3) {
1402 __shared__ float mass_buf[128][WATER_GROUP_SIZE];
1403 ld_vecN<WATER_GROUP_SIZE>(mass, mass_buf, list, nSettles);
1405 for (int i = 0; i < WATER_GROUP_SIZE; ++i) tmp_mass[i] = mass_buf[threadIdx.x][i];
1407 // ------ Update the atomic velocities along X
1409 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][0] - ref[i][0])*invdt;
1411 st_vecN<WATER_GROUP_SIZE>(vel_x, buf, list, nSettles);
1413 // buf here holds the new velocities. now we calculate the force contributions
1414 // maybe it's this mass?
1415 if (WATER_MODEL != WaterModel::TIP3) {
1417 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][0]) * (tmp_mass[i]*invdt);
1419 buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][0]) * (sp->mO*invdt);
1420 buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][0]) * (sp->mH*invdt);
1421 buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][0]) * (sp->mH*invdt);
1423 acc_vecN<WATER_GROUP_SIZE>(f_normal_x, buf, list, nSettles);
1425 // ------ Update the atomic velocities along Y
1427 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][1] - ref[i][1])*invdt;
1428 st_vecN<WATER_GROUP_SIZE>(vel_y, buf, list, nSettles);
1429 if (WATER_MODEL != WaterModel::TIP3) {
1431 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][1]) * (tmp_mass[i]*invdt);
1433 buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][1]) * (sp->mO*invdt);
1434 buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][1]) * (sp->mH*invdt);
1435 buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][1]) * (sp->mH*invdt);
1437 acc_vecN<WATER_GROUP_SIZE>(f_normal_y, buf, list, nSettles);
1440 // ------ Update the atomic velocities along Z
1442 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (pos[i][2] - ref[i][2])*invdt;
1443 st_vecN<WATER_GROUP_SIZE>(vel_z, buf, list, nSettles);
1444 if (WATER_MODEL != WaterModel::TIP3) {
1446 for (int i = 0; i < WATER_GROUP_SIZE; ++i) buf[threadIdx.x][i] = (buf[threadIdx.x][i] - vel[i][2]) * (tmp_mass[i]*invdt);
1448 buf[threadIdx.x][0] = (buf[threadIdx.x][0] - vel[0][2]) * (sp->mO*invdt);
1449 buf[threadIdx.x][1] = (buf[threadIdx.x][1] - vel[1][2]) * (sp->mH*invdt);
1450 buf[threadIdx.x][2] = (buf[threadIdx.x][2] - vel[2][2]) * (sp->mH*invdt);
1452 acc_vecN<WATER_GROUP_SIZE>(f_normal_z, buf, list, nSettles);
1455 // Remaining threadblocks do MSHAKE.
1456 // instead of tid - nSettle, IDX needs to be the first threadBlock that does MSHAKE instead of settle
1457 int idx = tid - blockDim.x * nSettleBlocks; // Numbers of blocks that did SETTLE
1458 // Ok, now we follow the logic of MSHAKE_CUDA_KERNEL
1460 // This needs to increase monotonically
1461 // if (threadIdx.x == 0) printf("tid %d idx %d\n", tid, idx);
1462 // consFailure[idx] = 0;
1463 int ig = rattleList[idx].ig;
1464 int icnt = rattleList[idx].icnt;
1465 BigReal tol2 = tol2_d;
1466 BigReal sigma[4] = {0};
1467 BigReal lambda[4]= {0};
1468 BigReal A[4][4] = {0};
1469 BigReal velx[4+1] = {0};
1470 BigReal vely[4+1] = {0};
1471 BigReal velz[4+1] = {0};
1472 BigReal pabx[4] = {0};
1473 BigReal rabx[4] = {0};
1474 BigReal paby[4] = {0};
1475 BigReal raby[4] = {0};
1476 BigReal pabz[4] = {0};
1477 BigReal rabz[4] = {0};
1478 BigReal df[3] = {0};
1480 BigReal posx[4+1] = {0};
1481 BigReal posy[4+1] = {0};
1482 BigReal posz[4+1] = {0};
1483 BigReal refx[4+1] = {0};
1484 BigReal refy[4+1] = {0};
1485 BigReal refz[4+1] = {0};
1487 BigReal* posx = &sh_posx[threadIdx.x][0];
1488 BigReal* posy = &sh_posy[threadIdx.x][0];
1489 BigReal* posz = &sh_posz[threadIdx.x][0];
1490 BigReal* refx = &sh_refx[threadIdx.x][0];
1491 BigReal* refy = &sh_refy[threadIdx.x][0];
1492 BigReal* refz = &sh_refz[threadIdx.x][0];
1493 for(int i = 0; i < 4+1; ++i)
1503 //load datas from global memory to local memory
1504 CudaRattleParam rattleParam[4];
1506 for(int i = 0; i < 4+1; ++i)
1507 //for(int i = 0; i < hgs[i]; ++i)
1509 //rattleParam[i] = rattleParam_d[4*idx+i];
1510 if (i < 4) rattleParam[i] = rattleList[idx].params[i];
1511 /*posx[i] = posx_d[i+4*idx];
1512 posy[i] = posy_d[i+4*idx];
1513 posz[i] = posz_d[i+4*idx];
1514 refx[i] = refx_d[i+4*idx];
1515 refx[i] = refx_d[i+4*idx];
1516 refx[i] = refx_d[i+4*idx];
1517 refy[i] = refy_d[i+4*idx];
1518 refz[i] = refz_d[i+4*idx];*/
1519 //I can pass the HGS and check if I is >= HGS[i]
1520 if(i < hydrogenGroupSize[ig]){
1521 refx[i] = pos_x[i+ig];
1522 refy[i] = pos_y[i+ig];
1523 refz[i] = pos_z[i+ig];
1524 velx[i] = vel_x[i + ig];
1525 vely[i] = vel_y[i + ig];
1526 velz[i] = vel_z[i + ig];
1528 posx[i] = refx[i] + (velx[i] * dt);
1529 posy[i] = refy[i] + (vely[i] * dt);
1530 posz[i] = refz[i] + (velz[i] * dt);
1532 // Haochuan: don't update the positions for fixed atoms
1533 if (!atomFixed[ig+i]) {
1534 posx[i] = refx[i] + (velx[i] * dt);
1535 posy[i] = refy[i] + (vely[i] * dt);
1536 posz[i] = refz[i] + (velz[i] * dt);
1548 for(int i = 0; i < 4; ++i)
1550 int a = rattleParam[i].ia;
1551 int b = rattleParam[i].ib;
1552 pabx[i] = posx[a] - posx[b];
1553 paby[i] = posy[a] - posy[b];
1554 pabz[i] = posz[a] - posz[b];
1555 rabx[i] = refx[a] - refx[b];
1556 raby[i] = refy[a] - refy[b];
1557 rabz[i] = refz[a] - refz[b];
1561 for(int i = 0; i < 4; ++i)
1563 BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
1564 BigReal rabsq = rattleParam[i].dsq;
1565 BigReal diffsq = pabsq - rabsq;
1567 if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
1570 int maxiter = maxiter_d;
1571 for(loop = 0; loop < maxiter; ++loop)
1577 for(int j = 0; j < 4; ++j)
1579 BigReal rma = rattleParam[j].rma;
1581 for(int i = 0; i < 4; ++i)
1583 A[j][i] = 2.*(pabx[j]*rabx[i]+paby[j]*raby[i]+pabz[j]*rabz[i])*rma;
1585 BigReal rmb = rattleParam[j].rmb;
1586 A[j][j] += 2.*(pabx[j]*rabx[j]+paby[j]*raby[j]+pabz[j]*rabz[j])*rmb;
1589 solveMatrix(lambda, A, sigma, icnt);
1591 for(int i = 0; i < 4; ++i)
1593 int a = rattleParam[i].ia;
1594 int b = rattleParam[i].ib;
1595 BigReal rma = rattleParam[i].rma * lambda[i];
1596 BigReal rmb = rattleParam[i].rmb * lambda[i];
1598 posx[a] -= rma * rabx[i];
1599 posy[a] -= rma * raby[i];
1600 posz[a] -= rma * rabz[i];
1601 posx[b] += rmb * rabx[i];
1602 posy[b] += rmb * raby[i];
1603 posz[b] += rmb * rabz[i];
1610 for(int i = 0; i < 4; ++i)
1612 int a = rattleParam[i].ia;
1613 int b = rattleParam[i].ib;
1614 pabx[i] = posx[a] - posx[b];
1615 paby[i] = posy[a] - posy[b];
1616 pabz[i] = posz[a] - posz[b];
1617 BigReal pabsq = pabx[i]*pabx[i] + paby[i]*paby[i] + pabz[i]*pabz[i];
1618 BigReal rabsq = rattleParam[i].dsq;
1619 BigReal diffsq = pabsq - rabsq;
1621 if ( fabs(diffsq) > (rabsq * tol2) && i < icnt)
1626 consFailure[idx] = 1;
1630 // double m = mass[ig];
1631 for(int i = 0; i < hydrogenGroupSize[ig]; ++i)
1633 // posNew_x[i+ig] = posx[i];
1634 // posNew_y[i+ig] = posy[i];
1635 // posNew_z[i+ig] = posz[i];
1638 // double vx = velx_d[i + ig];
1639 // double vy = vely_d[i + ig];
1640 // double vz = velz_d[i + ig];
1641 const double m = mass[i + ig];
1642 double vnx = (posx[i] - refx[i]) * invdt;
1643 double vny = (posy[i] - refy[i]) * invdt;
1644 double vnz = (posz[i] - refz[i]) * invdt;
1645 df[0] = (vnx - velx[i]) * (m * invdt);
1646 df[1] = (vny - vely[i]) * (m * invdt);
1647 df[2] = (vnz - velz[i]) * (m * invdt);
1650 // Updates velocity and force
1651 vel_x[i + ig] = vnx;
1652 vel_y[i + ig] = vny;
1653 vel_z[i + ig] = vnz;
1655 f_normal_x[i + ig] += df[0];
1656 f_normal_y[i + ig] += df[1];
1657 f_normal_z[i + ig] += df[2];
1659 // velx_d[i+ig] = (posx[i] - refx[i]) * invdt;
1660 // vely_d[i+ig] = (posy[i] - refy[i]) * invdt;
1661 // velz_d[i+ig] = (posz[i] - refz[i]) * invdt;
1663 // velNew_x[i + ig] = vnx;
1664 // velNew_y[i + ig] = vny;
1665 // velNew_z[i + ig] = vnz;
1668 // consFailure[idx] = 0;
1669 //printf("Loop is %d %d\n", loop, consFailure[idx]);
1675 void CallRattle1Kernel(
1678 // Settle Parameters
1682 double * __restrict vel_x,
1683 double * __restrict vel_y,
1684 double * __restrict vel_z,
1685 const double * __restrict pos_x,
1686 const double * __restrict pos_y,
1687 const double * __restrict pos_z,
1688 double * __restrict f_normal_x,
1689 double * __restrict f_normal_y,
1690 double * __restrict f_normal_z,
1691 const float * __restrict mass,
1692 const int * __restrict hydrogenGroupSize,
1693 const float * __restrict rigidBondLength,
1694 const int * __restrict atomFixed,
1695 int * __restrict settleList,
1696 const SettleParameters * sp,
1697 const CudaRattleElem * __restrict rattleList,
1699 const BigReal tol2_d,
1700 const int maxiter_d,
1702 const int nSettleBlocks,
1703 const int nShakeBlocks,
1704 const WaterModel water_model,
1707 const int nTotalBlocks = (nSettleBlocks + nShakeBlocks);
1708 #define CALL(WATER_MODEL, DOFIXED) \
1709 Rattle1Kernel<getWaterModelGroupSize(WATER_MODEL), WATER_MODEL, DOFIXED> \
1710 <<<nTotalBlocks, 128, 0, stream>>> \
1711 (numAtoms, dt, invdt, nSettles, \
1712 vel_x, vel_y, vel_z, \
1713 pos_x, pos_y, pos_z, \
1714 f_normal_x, f_normal_y, f_normal_z, \
1715 mass, hydrogenGroupSize, rigidBondLength, atomFixed, \
1717 rattleList, nShakes, tol2_d, maxiter_d, consFailure, nSettleBlocks);
1719 switch (water_model) {
1720 case WaterModel::SWM4: CALL(WaterModel::SWM4, true); break;
1721 case WaterModel::TIP4: CALL(WaterModel::TIP4, true); break;
1722 case WaterModel::TIP3: CALL(WaterModel::TIP3, true); break;
1725 switch (water_model) {
1726 case WaterModel::SWM4: CALL(WaterModel::SWM4, false); break;
1727 case WaterModel::TIP4: CALL(WaterModel::TIP4, false); break;
1728 case WaterModel::TIP3: CALL(WaterModel::TIP3, false); break;
1732 cudaStreamSynchronize(stream);