13 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)
14 #include <emmintrin.h>
15 #if defined(__INTEL_COMPILER)
16 #define __align(X) __declspec(align(X) )
18 #define __align(X) __attribute__((aligned(X) ))
19 #define MISSING_mm_cvtsd_f64
20 #elif defined(__GNUC__)
21 #define __align(X) __attribute__((aligned(X) ))
23 #define MISSING_mm_cvtsd_f64
26 #define __align(X) __declspec(align(X) )
44 BigReal rmT = 1.0 / (pmO+pmH+pmH);
49 ra = sqrt(ohdist*ohdist-rc*rc)/(1.0+t1);
58 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)
72 __m128d REF0xy = _mm_loadu_pd((
double *) &ref[0].
x);
73 __m128d REF1xy = _mm_loadu_pd((
double *) &ref[1].x);
75 __m128d B0xy = _mm_sub_pd(REF1xy, REF0xy);
76 _mm_storeu_pd((
double *) &b0.
x, B0xy);
77 b0.
z = ref[1].
z - ref[0].
z;
79 __m128d REF2xy = _mm_loadu_pd((
double *) &ref[2].x);
81 __m128d C0xy = _mm_sub_pd(REF2xy, REF0xy);
82 _mm_storeu_pd((
double *) &c0.
x, C0xy);
83 c0.
z = ref[2].
z - ref[0].
z;
92 __m128d POS1xy = _mm_loadu_pd((
double *) &pos[1].x);
93 __m128d POS2xy = _mm_loadu_pd((
double *) &pos[2].x);
94 __m128d PMHrmTxy = _mm_mul_pd(_mm_add_pd(POS1xy, POS2xy), _mm_set1_pd(mHrmT));
96 __m128d POS0xy = _mm_loadu_pd((
double *) &pos[0].x);
97 __m128d PMOrmTxy = _mm_mul_pd(POS0xy, _mm_set1_pd(mOrmT));
98 __m128d D0xy = _mm_add_pd(PMOrmTxy, PMHrmTxy);
100 d0.
z = pos[0].
z * mOrmT + ((pos[1].
z + pos[2].
z) * mHrmT);
101 a1.
z = pos[0].
z - d0.
z;
102 b1.
z = pos[1].
z - d0.
z;
103 c1.
z = pos[2].
z - d0.
z;
105 __m128d A1xy = _mm_sub_pd(POS0xy, D0xy);
106 _mm_store_pd((
double *) &a1.
x, A1xy);
108 __m128d B1xy = _mm_sub_pd(POS1xy, D0xy);
109 _mm_store_pd((
double *) &b1.
x, B1xy);
111 __m128d C1xy = _mm_sub_pd(POS2xy, D0xy);
112 _mm_store_pd((
double *) &c1.
x, C1xy);
114 _mm_store_pd((
double *) &d0.
x, D0xy);
123 Vector b0 = ref[1]-ref[0];
124 Vector c0 = ref[2]-ref[0];
127 Vector d0 = pos[0]*mOrmT + ((pos[1] + pos[2])*mHrmT);
140 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE) && ! defined(MISSING_mm_cvtsd_f64)
141 __m128d l1 = _mm_set_pd(n0.
x, n0.
y);
142 l1 = _mm_mul_pd(l1, l1);
144 double l1xy0 = _mm_cvtsd_f64(_mm_add_sd(l1, _mm_shuffle_pd(l1, l1, 1)));
146 __m128d l3 = _mm_set_pd(n1.
y, n1.
z);
147 l3 = _mm_mul_pd(l3, l3);
149 double l3yz1 = _mm_cvtsd_f64(_mm_add_sd(l3, _mm_shuffle_pd(l3, l3, 1)));
151 __m128d l2 = _mm_set_pd(n1.
x, n0.
z);
153 __m128d ts01 = _mm_add_pd(_mm_set_pd(l3yz1, l1xy0), _mm_mul_pd(l2, l2));
155 __m128d l4 = _mm_set_pd(n2.
x, n2.
y);
156 l4 = _mm_mul_pd(l4, l4);
158 double l4xy2 = _mm_cvtsd_f64(_mm_add_sd(l4, _mm_shuffle_pd(l4, l4, 1)));
159 double ts2 = l4xy2 + (n2.
z * n2.
z);
164 static const __m128d fvecd1p0 = _mm_set1_pd(1.0);
167 __m128d invlen12 = _mm_div_pd(fvecd1p0, _mm_sqrt_pd(ts01));
170 _mm_storeu_pd(invlens, invlen12);
172 n0 = n0 * invlens[0];
179 BigReal tmp = 1.0-sinphi*sinphi;
181 __m128d n2cosphi = _mm_sqrt_pd(_mm_set_pd(tmp, ts2));
183 _mm_storeu_pd(invlens+2, n2cosphi);
185 n1 = n1 * invlens[1];
186 n2 = n2 * (1.0 / invlens[2]);
189 b0 =
Vector(n1*b0, n2*b0, n0*b0);
190 c0 =
Vector(n1*c0, n2*c0, n0*c0);
192 b1 =
Vector(n1*b1, n2*b1, n0*b1);
193 c1 =
Vector(n1*c1, n2*c1, n0*c1);
196 BigReal sinpsi = (b1.z - c1.z)/(2.0*rc*cosphi);
197 tmp = 1.0-sinpsi*sinpsi;
204 b0 =
Vector(n1*b0, n2*b0, n0*b0);
205 c0 =
Vector(n1*c0, n2*c0, n0*c0);
208 b1 =
Vector(n1*b1, n2*b1, n0*b1);
209 c1 =
Vector(n1*c1, n2*c1, n0*c1);
213 BigReal tmp = 1.0-sinphi*sinphi;
215 BigReal sinpsi = (b1.z - c1.z)/(2.0*rc*cosphi);
216 tmp = 1.0-sinpsi*sinpsi;
221 BigReal tmp1 = rc*sinpsi*sinphi;
222 BigReal tmp2 = rc*sinpsi*cosphi;
224 Vector a2(0, ra*cosphi, ra*sinphi);
225 Vector b2(-rc*cospsi, rbphi - tmp1, -rb*sinphi + tmp2);
226 Vector c2( rc*cosphi, rbphi + tmp1, -rb*sinphi - tmp2);
230 BigReal alpha = b2.
x*(b0.x - c0.x) + b0.y*b2.
y + c0.y*c2.
y;
231 BigReal beta = b2.
x*(c0.y - b0.y) + b0.x*b2.
y + c0.x*c2.
y;
232 BigReal gama = b0.x*b1.y - b1.x*b0.y + c0.x*c1.y - c1.x*c0.y;
234 BigReal a2b2 = alpha*alpha + beta*beta;
235 BigReal sintheta = (alpha*gama - beta*sqrt(a2b2 - gama*gama))/a2b2;
236 BigReal costheta = sqrt(1.0 - sintheta*sintheta);
242 Vector b3(b2.
x*costheta - b2.
y*sintheta,
243 b2.
x*sintheta + b2.
y*costheta,
245 Vector c3(c2.
x*costheta - c2.
y*sintheta,
246 c2.
x*sintheta + c2.
y*costheta,
253 Vector b3(b2.
x*costheta - b2.
y*sintheta,
254 b2.
x*sintheta + b2.
y*costheta,
256 Vector c3(-b2.
x*costheta - c2.
y*sintheta,
257 -b2.
x*sintheta + c2.
y*costheta,
267 pos[0] =
Vector(a3*m1, a3*m2, a3*m0) + d0;
268 pos[1] =
Vector(b3*m1, b3*m2, b3*m0) + d0;
269 pos[2] =
Vector(c3*m1, c3*m2, c3*m0) + d0;
273 vel[0] = (pos[0]-ref[0])*invdt;
274 vel[1] = (pos[1]-ref[1])*invdt;
275 vel[2] = (pos[2]-ref[2])*invdt;
284 template <
int veclen>
309 for (
int i=0;i < veclen;i++) {
310 ref0xt[i] = ref[i*3+0].
x;
311 ref0yt[i] = ref[i*3+0].
y;
312 ref0zt[i] = ref[i*3+0].
z;
313 ref1xt[i] = ref[i*3+1].
x;
314 ref1yt[i] = ref[i*3+1].
y;
315 ref1zt[i] = ref[i*3+1].
z;
316 ref2xt[i] = ref[i*3+2].
x;
317 ref2yt[i] = ref[i*3+2].
y;
318 ref2zt[i] = ref[i*3+2].
z;
320 pos0xt[i] = pos[i*3+0].
x;
321 pos0yt[i] = pos[i*3+0].
y;
322 pos0zt[i] = pos[i*3+0].
z;
323 pos1xt[i] = pos[i*3+1].
x;
324 pos1yt[i] = pos[i*3+1].
y;
325 pos1zt[i] = pos[i*3+1].
z;
326 pos2xt[i] = pos[i*3+2].
x;
327 pos2yt[i] = pos[i*3+2].
y;
328 pos2zt[i] = pos[i*3+2].
z;
332 for (
int i=0;i < veclen;i++) {
364 BigReal d0x = pos0x*mOrmT + ((pos1x + pos2x)*mHrmT);
365 BigReal d0y = pos0y*mOrmT + ((pos1y + pos2y)*mHrmT);
366 BigReal d0z = pos0z*mOrmT + ((pos1z + pos2z)*mHrmT);
398 BigReal n0inv = 1.0/sqrt(n0x*n0x + n0y*n0y + n0z*n0z);
403 BigReal n1inv = 1.0/sqrt(n1x*n1x + n1y*n1y + n1z*n1z);
408 BigReal n2inv = 1.0/sqrt(n2x*n2x + n2y*n2y + n2z*n2z);
414 BigReal n1b0 = n1x*b0x + n1y*b0y + n1z*b0z;
415 BigReal n2b0 = n2x*b0x + n2y*b0y + n2z*b0z;
418 BigReal n1c0 = n1x*c0x + n1y*c0y + n1z*c0z;
419 BigReal n2c0 = n2x*c0x + n2y*c0y + n2z*c0z;
421 BigReal A1Z = n0x*a1x + n0y*a1y + n0z*a1z;
424 BigReal n1b1 = n1x*b1x + n1y*b1y + n1z*b1z;
425 BigReal n2b1 = n2x*b1x + n2y*b1y + n2z*b1z;
426 BigReal n0b1 = n0x*b1x + n0y*b1y + n0z*b1z;
429 BigReal n1c1 = n1x*c1x + n1y*c1y + n1z*c1z;
430 BigReal n2c1 = n2x*c1x + n2y*c1y + n2z*c1z;
431 BigReal n0c1 = n0x*c1x + n0y*c1y + n0z*c1z;
435 BigReal tmp = 1.0-sinphi*sinphi;
437 BigReal sinpsi = (n0b1 - n0c1)/(2.0*rc*cosphi);
438 tmp = 1.0-sinpsi*sinpsi;
442 BigReal tmp1 = rc*sinpsi*sinphi;
443 BigReal tmp2 = rc*sinpsi*cosphi;
457 BigReal alpha = b2x*(n1b0 - n1c0) + n2b0*b2y + n2c0*c2y;
458 BigReal beta = b2x*(n2c0 - n2b0) + n1b0*b2y + n1c0*c2y;
459 BigReal gama = n1b0*n2b1 - n1b1*n2b0 + n1c0*n2c1 - n1c1*n2c0;
461 BigReal a2b2 = alpha*alpha + beta*beta;
462 BigReal sintheta = (alpha*gama - beta*sqrt(a2b2 - gama*gama))/a2b2;
463 BigReal costheta = sqrt(1.0 - sintheta*sintheta);
475 BigReal b3x = b2x*costheta - b2y*sintheta;
476 BigReal b3y = b2x*sintheta + b2y*costheta;
482 BigReal c3x = -b2x*costheta - c2y*sintheta;
483 BigReal c3y = -b2x*sintheta + c2y*costheta;
503 pos0x = a3x*m1x + a3y*m1y + a3z*m1z + d0x;
504 pos0y = a3x*m2x + a3y*m2y + a3z*m2z + d0y;
505 pos0z = a3x*m0x + a3y*m0y + a3z*m0z + d0z;
508 pos1x = b3x*m1x + b3y*m1y + b3z*m1z + d0x;
509 pos1y = b3x*m2x + b3y*m2y + b3z*m2z + d0y;
510 pos1z = b3x*m0x + b3y*m0y + b3z*m0z + d0z;
513 pos2x = c3x*m1x + c3y*m1y + c3z*m1z + d0x;
514 pos2y = c3x*m2x + c3y*m2y + c3z*m2z + d0y;
515 pos2z = c3x*m0x + c3y*m0y + c3z*m0z + d0z;
528 for (
int i=0;i < veclen;i++) {
529 pos[i*3+0].
x = pos0xt[i];
530 pos[i*3+0].
y = pos0yt[i];
531 pos[i*3+0].
z = pos0zt[i];
532 pos[i*3+1].
x = pos1xt[i];
533 pos[i*3+1].
y = pos1yt[i];
534 pos[i*3+1].
z = pos1zt[i];
535 pos[i*3+2].
x = pos2xt[i];
536 pos[i*3+2].
y = pos2yt[i];
537 pos[i*3+2].
z = pos2zt[i];
545 template <
int veclen>
550 int a = rattleParam[0].
ia;
551 int b = rattleParam[0].
ib;
552 BigReal pabx = posx[a] - posx[b];
553 BigReal paby = posy[a] - posy[b];
554 BigReal pabz = posz[a] - posz[b];
555 BigReal pabsq = pabx*pabx + paby*paby + pabz*pabz;
557 BigReal diffsq = rabsq - pabsq;
558 BigReal rabx = refx[a] - refx[b];
559 BigReal raby = refy[a] - refy[b];
560 BigReal rabz = refz[a] - refz[b];
562 BigReal refsq = rabx*rabx + raby*raby + rabz*rabz;
564 BigReal rpab = rabx*pabx + raby*paby + rabz*pabz;
570 BigReal sqrtarg = rpab*rpab + refsq*diffsq;
571 if ( sqrtarg < 0. ) {
576 gab = (-rpab + sqrt(sqrtarg))/(refsq*(rma + rmb));
582 posx[a] += rma * dpx;
583 posy[a] += rma * dpy;
584 posz[a] += rma * dpz;
585 posx[b] -= rmb * dpx;
586 posy[b] -= rmb * dpy;
587 posz[b] -= rmb * dpz;
594 const BigReal tol2,
const int maxiter,
595 bool& done,
bool& consFailure) {
597 for (
int iter = 0; iter < maxiter; ++iter ) {
600 for (
int i = 0; i < icnt; ++i ) {
601 int a = rattleParam[i].
ia;
602 int b = rattleParam[i].
ib;
603 BigReal pabx = posx[a] - posx[b];
604 BigReal paby = posy[a] - posy[b];
605 BigReal pabz = posz[a] - posz[b];
606 BigReal pabsq = pabx*pabx + paby*paby + pabz*pabz;
608 BigReal diffsq = rabsq - pabsq;
609 if ( fabs(diffsq) > (rabsq * tol2) ) {
610 BigReal rabx = refx[a] - refx[b];
611 BigReal raby = refy[a] - refy[b];
612 BigReal rabz = refz[a] - refz[b];
613 BigReal rpab = rabx*pabx + raby*paby + rabz*pabz;
614 if ( rpab < ( rabsq * 1.0e-6 ) ) {
621 BigReal gab = diffsq / ( 2.0 * ( rma + rmb ) * rpab );
625 posx[a] += rma * dpx;
626 posy[a] += rma * dpy;
627 posz[a] += rma * dpz;
628 posx[b] -= rmb * dpx;
629 posy[b] -= rmb * dpy;
630 posz[b] -= rmb * dpz;
655 Vector rAB = pos[1]-pos[0];
656 Vector rBC = pos[2]-pos[1];
657 Vector rCA = pos[0]-pos[2];
667 BigReal vab = (vel[1]-vel[0])*AB;
668 BigReal vbc = (vel[2]-vel[1])*BC;
669 BigReal vca = (vel[0]-vel[2])*CA;
673 BigReal d = (2*mab*mab + 2*ma*mb*cosA*cosB*cosC - 2*mb*mb*cosA*cosA
674 - ma*mab*(cosB*cosB + cosC*cosC))*0.5/mb;
676 BigReal tab = (vab*(2*mab - ma*cosC*cosC) +
677 vbc*(mb*cosC*cosA - mab*cosB) +
678 vca*(ma*cosB*cosC - 2*mb*cosA))*ma/d;
680 BigReal tbc = (vbc*(mab*mab - mb*mb*cosA*cosA) +
681 vca*ma*(mb*cosA*cosB - mab*cosC) +
682 vab*ma*(mb*cosC*cosA - mab*cosB))/d;
684 BigReal tca = (vca*(2*mab - ma*cosB*cosB) +
685 vab*(ma*cosB*cosC - 2*mb*cosA) +
686 vbc*(mb*cosA*cosB - mab*cosC))*ma/d;
688 Vector ga = tab*AB - tca*CA;
689 Vector gb = tbc*BC - tab*AB;
690 Vector gc = tca*CA - tbc*BC;
693 *virial += 0.5*
outer(tab, rAB)/dt;
694 *virial += 0.5*
outer(tbc, rBC)/dt;
695 *virial += 0.5*
outer(tca, rCA)/dt;
698 vel[0] += (0.5/ma)*ga;
699 vel[1] += (0.5/mb)*gb;
700 vel[2] += (0.5/mb)*gc;
708 settlev(pos, mO, mH, vel, dt, virial);
template void settle1_SIMD< 1 >(const Vector *ref, Vector *pos, BigReal mOrmT, BigReal mHrmT, BigReal ra, BigReal rb, BigReal rc, BigReal rra)
static int settlev(const Vector *pos, BigReal ma, BigReal mb, Vector *vel, BigReal dt, Tensor *virial)
void rattlePair(const RattleParam *rattleParam, const BigReal *refx, const BigReal *refy, const BigReal *refz, BigReal *posx, BigReal *posy, BigReal *posz, bool &consFailure)
__device__ __forceinline__ float3 cross(const float3 v1, const float3 v2)
template void settle1_SIMD< 2 >(const Vector *ref, Vector *pos, BigReal mOrmT, BigReal mHrmT, BigReal ra, BigReal rb, BigReal rc, BigReal rra)
int settle2(BigReal mO, BigReal mH, const Vector *pos, Vector *vel, BigReal dt, Tensor *virial)
template void rattlePair< 1 >(const RattleParam *rattleParam, const BigReal *refx, const BigReal *refy, const BigReal *refz, BigReal *posx, BigReal *posy, BigReal *posz, bool &consFailure)
Tensor outer(const Vector &v1, const Vector &v2)
int settle1(const Vector *ref, Vector *pos, Vector *vel, BigReal invdt, BigReal mOrmT, BigReal mHrmT, BigReal ra, BigReal rb, BigReal rc, BigReal rra)
optimized settle1 algorithm, reuses water properties as much as possible
void settle1init(BigReal pmO, BigReal pmH, BigReal hhdist, BigReal ohdist, BigReal &mOrmT, BigReal &mHrmT, BigReal &ra, BigReal &rb, BigReal &rc, BigReal &rra)
initialize cached water properties
void settle1_SIMD(const Vector *ref, Vector *pos, BigReal mOrmT, BigReal mHrmT, BigReal ra, BigReal rb, BigReal rc, BigReal rra)
void rattleN(const int icnt, const RattleParam *rattleParam, const BigReal *refx, const BigReal *refy, const BigReal *refz, BigReal *posx, BigReal *posy, BigReal *posz, const BigReal tol2, const int maxiter, bool &done, bool &consFailure)