1 #ifndef GRIDFORCEGRIDCUDAKERNEL_H 2 #define GRIDFORCEGRIDCUDAKERNEL_H 9 #include <hip/hip_runtime.h> 18 #if defined(NAMD_CUDA) || defined(NAMD_HIP) 19 #ifdef NODEGROUP_FORCE_REGISTER 40 __host__ __device__ ForceEnergy(
Force in_f,
double in_e): force(in_f), energy(in_e)
46 class GridforceGridCUDA
50 __host__ __device__ GridforceGridCUDA(){}
70 __device__
inline ForceEnergy interpolateForceD(
const Vector& pos)
const 77 const Vector l = inv*(pos - origin);
79 const int homeX = int(floor(l.
x));
80 const int homeY = int(floor(l.
y));
81 const int homeZ = int(floor(l.
z));
82 const float wx = l.
x - homeX;
83 const float wy = l.
y - homeY;
84 const float wz = l.
z - homeZ;
85 const float wx2 = wx*wx;
90 for (
int iz = 0; iz < 4; iz++) {
92 int jz = (iz + homeZ - 1);
94 jz += cont[2] ? ( jz <0 ? k[2] : ( jz >= k[2] ? -k[2] : 0) ) : 0;
95 for (
int iy = 0; iy < 4; iy++) {
97 int jy = (iy + homeY - 1);
99 jy += cont[1] ? ( jy <0 ? k[1] : ( jy >= k[1] ? -k[1] : 0) ) : 0;
100 for (
int ix = 0; ix < 4; ix++) {
101 int jx = (ix + homeX - 1);
103 jx += cont[0] ? ( jx <0 ? k[0] : ( jx >= k[0] ? -k[0] : 0) ) : 0;
104 const int ind = jz + jy*k[2] + jx*k[2]*k[1];
105 v[ix] = jz < 0 || jz >= k[2] || jy < 0 || jy >= k[1] || jx < 0 || jx >= k[0] ?
108 const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2;
109 const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx;
110 const float a1 = 0.5f*(-v[0] + v[2]);
111 g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1;
112 g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1];
117 g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy +
118 0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3]) *wy*wy +
119 0.5f*(-g2[0][0] + g2[0][2]) *wy +
124 const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy;
125 const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy;
126 const float a1 = 0.5f*(-g2[1][0] + g2[1][2]);
127 g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1;
128 g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1];
133 f.
x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
134 -0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
135 -0.5f*(-g3[0][0] + g3[0][2]) *wz -
137 f.
y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
138 -0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
139 -0.5f*(-g3[1][0] + g3[1][2]) *wz -
141 f.
z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
142 (2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]) *wz -
143 0.5f*(-g3[2][0] + g3[2][2]);
144 float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
145 0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]) *wz*wz +
146 0.5f*(-g3[2][0] + g3[2][2]) *wz +
154 Tensor inv_t = transpose(inv);
156 return ForceEnergy(f,e);
160 __device__
int get_inds(
171 for (
int i = 0; i < 3; i++) {
172 inds[i] = (int)floor(g[i]);
173 dg[i] = g[i] - inds[i];
176 for (
int i = 0; i < 3; i++) {
177 if (inds[i] < 0 || inds[i] >= k[i]-1) {
178 if (cont[i]) inds[i] = k[i]-1;
181 if (cont[i] && inds[i] == k[i]-1) {
183 gapscale[i] *= gapinv[i];
184 if (g[i] < 0.0) dg[i] = 1.0 + g[i]*gapinv[i];
185 else dg[i] = (g[i] - inds[i]) * gapinv[i];
192 __device__
float compute_V(
196 const float *z)
const 200 for (
int l = 0; l < 4; l++) {
201 for (
int k = 0; k < 4; k++) {
203 for (
int j = 0; j < 4; j++) {
204 V += a[ind] * x[j] * y[k] * z[l];
213 __device__
Vector compute_dV(
217 const float *z)
const 222 for (
int l = 0; l < 4; l++) {
223 for (
int k = 0; k < 4; k++) {
225 for (
int j = 0; j < 4; j++) {
226 if (j > 0) dV.
x += a[ind] * j * x[j-1] * y[k] * z[l];
227 if (k > 0) dV.
y += a[ind] * k * x[j] * y[k-1] * z[l];
228 if (l > 0) dV.
z += a[ind] * l * x[j] * y[k] * z[l-1];
237 __device__
Vector compute_d2V(
241 const float *z)
const 245 for (
int l = 0; l < 4; l++) {
246 for (
int k = 0; k < 4; k++) {
247 for (
int j = 0; j < 4; j++) {
248 if (j > 0 && k > 0) d2V.
x += a[ind] * j * k * x[j-1] * y[k-1] * z[l];
249 if (j > 0 && l > 0) d2V.
y += a[ind] * j * l * x[j-1] * y[k] * z[l-1];
250 if (k > 0 && l > 0) d2V.
z += a[ind] * k * l * x[j] * y[k-1] * z[l-1];
258 __device__
float compute_d3V(
262 const float *z)
const 266 for (
int l = 0; l < 4; l++) {
267 for (
int k = 0; k < 4; k++) {
268 for (
int j = 0; j < 4; j++) {
269 if (j > 0 && k > 0 && l > 0) d3V += a[ind] * j * k * l * x[j-1] * y[k-1] * z[l-1];
278 __device__
void compute_a(
280 const float *b)
const 285 a[2] = -3*b[0] + 3*b[1] - 2*b[8] - b[9];
286 a[3] = 2*b[0] - 2*b[1] + b[8] + b[9];
289 a[6] = -3*b[16] + 3*b[17] - 2*b[32] - b[33];
290 a[7] = 2*b[16] - 2*b[17] + b[32] + b[33];
291 a[8] = -3*b[0] + 3*b[2] - 2*b[16] - b[18];
292 a[9] = -3*b[8] + 3*b[10] - 2*b[32] - b[34];
293 a[10] = 9*b[0] - 9*b[1] - 9*b[2] + 9*b[3] + 6*b[8] + 3*b[9] - 6*b[10] - 3*b[11]
294 + 6*b[16] - 6*b[17] + 3*b[18] - 3*b[19] + 4*b[32] + 2*b[33] + 2*b[34] + b[35];
295 a[11] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 3*b[8] - 3*b[9] + 3*b[10] + 3*b[11]
296 - 4*b[16] + 4*b[17] - 2*b[18] + 2*b[19] - 2*b[32] - 2*b[33] - b[34] - b[35];
297 a[12] = 2*b[0] - 2*b[2] + b[16] + b[18];
298 a[13] = 2*b[8] - 2*b[10] + b[32] + b[34];
299 a[14] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 4*b[8] - 2*b[9] + 4*b[10] + 2*b[11]
300 - 3*b[16] + 3*b[17] - 3*b[18] + 3*b[19] - 2*b[32] - b[33] - 2*b[34] - b[35];
301 a[15] = 4*b[0] - 4*b[1] - 4*b[2] + 4*b[3] + 2*b[8] + 2*b[9] - 2*b[10] - 2*b[11]
302 + 2*b[16] - 2*b[17] + 2*b[18] - 2*b[19] + b[32] + b[33] + b[34] + b[35];
305 a[18] = -3*b[24] + 3*b[25] - 2*b[40] - b[41];
306 a[19] = 2*b[24] - 2*b[25] + b[40] + b[41];
309 a[22] = -3*b[48] + 3*b[49] - 2*b[56] - b[57];
310 a[23] = 2*b[48] - 2*b[49] + b[56] + b[57];
311 a[24] = -3*b[24] + 3*b[26] - 2*b[48] - b[50];
312 a[25] = -3*b[40] + 3*b[42] - 2*b[56] - b[58];
313 a[26] = 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43]
314 + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 4*b[56] + 2*b[57] + 2*b[58] + b[59];
315 a[27] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43]
316 - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 2*b[56] - 2*b[57] - b[58] - b[59];
317 a[28] = 2*b[24] - 2*b[26] + b[48] + b[50];
318 a[29] = 2*b[40] - 2*b[42] + b[56] + b[58];
319 a[30] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43]
320 - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 2*b[56] - b[57] - 2*b[58] - b[59];
321 a[31] = 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43]
322 + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + b[56] + b[57] + b[58] + b[59];
323 a[32] = -3*b[0] + 3*b[4] - 2*b[24] - b[28];
324 a[33] = -3*b[8] + 3*b[12] - 2*b[40] - b[44];
325 a[34] = 9*b[0] - 9*b[1] - 9*b[4] + 9*b[5] + 6*b[8] + 3*b[9] - 6*b[12] - 3*b[13]
326 + 6*b[24] - 6*b[25] + 3*b[28] - 3*b[29] + 4*b[40] + 2*b[41] + 2*b[44] + b[45];
327 a[35] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 3*b[8] - 3*b[9] + 3*b[12] + 3*b[13]
328 - 4*b[24] + 4*b[25] - 2*b[28] + 2*b[29] - 2*b[40] - 2*b[41] - b[44] - b[45];
329 a[36] = -3*b[16] + 3*b[20] - 2*b[48] - b[52];
330 a[37] = -3*b[32] + 3*b[36] - 2*b[56] - b[60];
331 a[38] = 9*b[16] - 9*b[17] - 9*b[20] + 9*b[21] + 6*b[32] + 3*b[33] - 6*b[36] - 3*b[37]
332 + 6*b[48] - 6*b[49] + 3*b[52] - 3*b[53] + 4*b[56] + 2*b[57] + 2*b[60] + b[61];
333 a[39] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 3*b[32] - 3*b[33] + 3*b[36] + 3*b[37]
334 - 4*b[48] + 4*b[49] - 2*b[52] + 2*b[53] - 2*b[56] - 2*b[57] - b[60] - b[61];
335 a[40] = 9*b[0] - 9*b[2] - 9*b[4] + 9*b[6] + 6*b[16] + 3*b[18] - 6*b[20] - 3*b[22]
336 + 6*b[24] - 6*b[26] + 3*b[28] - 3*b[30] + 4*b[48] + 2*b[50] + 2*b[52] + b[54];
337 a[41] = 9*b[8] - 9*b[10] - 9*b[12] + 9*b[14] + 6*b[32] + 3*b[34] - 6*b[36] - 3*b[38]
338 + 6*b[40] - 6*b[42] + 3*b[44] - 3*b[46] + 4*b[56] + 2*b[58] + 2*b[60] + b[62];
339 a[42] = -27*b[0] + 27*b[1] + 27*b[2] - 27*b[3] + 27*b[4] - 27*b[5] - 27*b[6] + 27*b[7]
340 - 18*b[8] - 9*b[9] + 18*b[10] + 9*b[11] + 18*b[12] + 9*b[13] - 18*b[14] - 9*b[15]
341 - 18*b[16] + 18*b[17] - 9*b[18] + 9*b[19] + 18*b[20] - 18*b[21] + 9*b[22] - 9*b[23]
342 - 18*b[24] + 18*b[25] + 18*b[26] - 18*b[27] - 9*b[28] + 9*b[29] + 9*b[30] - 9*b[31]
343 - 12*b[32] - 6*b[33] - 6*b[34] - 3*b[35] + 12*b[36] + 6*b[37] + 6*b[38] + 3*b[39]
344 - 12*b[40] - 6*b[41] + 12*b[42] + 6*b[43] - 6*b[44] - 3*b[45] + 6*b[46] + 3*b[47]
345 - 12*b[48] + 12*b[49] - 6*b[50] + 6*b[51] - 6*b[52] + 6*b[53] - 3*b[54] + 3*b[55]
346 - 8*b[56] - 4*b[57] - 4*b[58] - 2*b[59] - 4*b[60] - 2*b[61] - 2*b[62] - b[63];
347 a[43] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
348 + 9*b[8] + 9*b[9] - 9*b[10] - 9*b[11] - 9*b[12] - 9*b[13] + 9*b[14] + 9*b[15]
349 + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
350 + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
351 + 6*b[32] + 6*b[33] + 3*b[34] + 3*b[35] - 6*b[36] - 6*b[37] - 3*b[38] - 3*b[39]
352 + 6*b[40] + 6*b[41] - 6*b[42] - 6*b[43] + 3*b[44] + 3*b[45] - 3*b[46] - 3*b[47]
353 + 8*b[48] - 8*b[49] + 4*b[50] - 4*b[51] + 4*b[52] - 4*b[53] + 2*b[54] - 2*b[55]
354 + 4*b[56] + 4*b[57] + 2*b[58] + 2*b[59] + 2*b[60] + 2*b[61] + b[62] + b[63];
355 a[44] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 3*b[16] - 3*b[18] + 3*b[20] + 3*b[22]
356 - 4*b[24] + 4*b[26] - 2*b[28] + 2*b[30] - 2*b[48] - 2*b[50] - b[52] - b[54];
357 a[45] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 3*b[32] - 3*b[34] + 3*b[36] + 3*b[38]
358 - 4*b[40] + 4*b[42] - 2*b[44] + 2*b[46] - 2*b[56] - 2*b[58] - b[60] - b[62];
359 a[46] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
360 + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
361 + 9*b[16] - 9*b[17] + 9*b[18] - 9*b[19] - 9*b[20] + 9*b[21] - 9*b[22] + 9*b[23]
362 + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
363 + 6*b[32] + 3*b[33] + 6*b[34] + 3*b[35] - 6*b[36] - 3*b[37] - 6*b[38] - 3*b[39]
364 + 8*b[40] + 4*b[41] - 8*b[42] - 4*b[43] + 4*b[44] + 2*b[45] - 4*b[46] - 2*b[47]
365 + 6*b[48] - 6*b[49] + 6*b[50] - 6*b[51] + 3*b[52] - 3*b[53] + 3*b[54] - 3*b[55]
366 + 4*b[56] + 2*b[57] + 4*b[58] + 2*b[59] + 2*b[60] + b[61] + 2*b[62] + b[63];
367 a[47] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
368 - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
369 - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
370 - 8*b[24] + 8*b[25] + 8*b[26] - 8*b[27] - 4*b[28] + 4*b[29] + 4*b[30] - 4*b[31]
371 - 3*b[32] - 3*b[33] - 3*b[34] - 3*b[35] + 3*b[36] + 3*b[37] + 3*b[38] + 3*b[39]
372 - 4*b[40] - 4*b[41] + 4*b[42] + 4*b[43] - 2*b[44] - 2*b[45] + 2*b[46] + 2*b[47]
373 - 4*b[48] + 4*b[49] - 4*b[50] + 4*b[51] - 2*b[52] + 2*b[53] - 2*b[54] + 2*b[55]
374 - 2*b[56] - 2*b[57] - 2*b[58] - 2*b[59] - b[60] - b[61] - b[62] - b[63];
375 a[48] = 2*b[0] - 2*b[4] + b[24] + b[28];
376 a[49] = 2*b[8] - 2*b[12] + b[40] + b[44];
377 a[50] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 4*b[8] - 2*b[9] + 4*b[12] + 2*b[13]
378 - 3*b[24] + 3*b[25] - 3*b[28] + 3*b[29] - 2*b[40] - b[41] - 2*b[44] - b[45];
379 a[51] = 4*b[0] - 4*b[1] - 4*b[4] + 4*b[5] + 2*b[8] + 2*b[9] - 2*b[12] - 2*b[13]
380 + 2*b[24] - 2*b[25] + 2*b[28] - 2*b[29] + b[40] + b[41] + b[44] + b[45];
381 a[52] = 2*b[16] - 2*b[20] + b[48] + b[52];
382 a[53] = 2*b[32] - 2*b[36] + b[56] + b[60];
383 a[54] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 4*b[32] - 2*b[33] + 4*b[36] + 2*b[37]
384 - 3*b[48] + 3*b[49] - 3*b[52] + 3*b[53] - 2*b[56] - b[57] - 2*b[60] - b[61];
385 a[55] = 4*b[16] - 4*b[17] - 4*b[20] + 4*b[21] + 2*b[32] + 2*b[33] - 2*b[36] - 2*b[37]
386 + 2*b[48] - 2*b[49] + 2*b[52] - 2*b[53] + b[56] + b[57] + b[60] + b[61];
387 a[56] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 4*b[16] - 2*b[18] + 4*b[20] + 2*b[22]
388 - 3*b[24] + 3*b[26] - 3*b[28] + 3*b[30] - 2*b[48] - b[50] - 2*b[52] - b[54];
389 a[57] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 4*b[32] - 2*b[34] + 4*b[36] + 2*b[38]
390 - 3*b[40] + 3*b[42] - 3*b[44] + 3*b[46] - 2*b[56] - b[58] - 2*b[60] - b[62];
391 a[58] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
392 + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
393 + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
394 + 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 9*b[28] - 9*b[29] - 9*b[30] + 9*b[31]
395 + 8*b[32] + 4*b[33] + 4*b[34] + 2*b[35] - 8*b[36] - 4*b[37] - 4*b[38] - 2*b[39]
396 + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43] + 6*b[44] + 3*b[45] - 6*b[46] - 3*b[47]
397 + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 6*b[52] - 6*b[53] + 3*b[54] - 3*b[55]
398 + 4*b[56] + 2*b[57] + 2*b[58] + b[59] + 4*b[60] + 2*b[61] + 2*b[62] + b[63];
399 a[59] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
400 - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
401 - 8*b[16] + 8*b[17] - 4*b[18] + 4*b[19] + 8*b[20] - 8*b[21] + 4*b[22] - 4*b[23]
402 - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
403 - 4*b[32] - 4*b[33] - 2*b[34] - 2*b[35] + 4*b[36] + 4*b[37] + 2*b[38] + 2*b[39]
404 - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43] - 3*b[44] - 3*b[45] + 3*b[46] + 3*b[47]
405 - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 4*b[52] + 4*b[53] - 2*b[54] + 2*b[55]
406 - 2*b[56] - 2*b[57] - b[58] - b[59] - 2*b[60] - 2*b[61] - b[62] - b[63];
407 a[60] = 4*b[0] - 4*b[2] - 4*b[4] + 4*b[6] + 2*b[16] + 2*b[18] - 2*b[20] - 2*b[22]
408 + 2*b[24] - 2*b[26] + 2*b[28] - 2*b[30] + b[48] + b[50] + b[52] + b[54];
409 a[61] = 4*b[8] - 4*b[10] - 4*b[12] + 4*b[14] + 2*b[32] + 2*b[34] - 2*b[36] - 2*b[38]
410 + 2*b[40] - 2*b[42] + 2*b[44] - 2*b[46] + b[56] + b[58] + b[60] + b[62];
411 a[62] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
412 - 8*b[8] - 4*b[9] + 8*b[10] + 4*b[11] + 8*b[12] + 4*b[13] - 8*b[14] - 4*b[15]
413 - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
414 - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
415 - 4*b[32] - 2*b[33] - 4*b[34] - 2*b[35] + 4*b[36] + 2*b[37] + 4*b[38] + 2*b[39]
416 - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43] - 4*b[44] - 2*b[45] + 4*b[46] + 2*b[47]
417 - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 3*b[52] + 3*b[53] - 3*b[54] + 3*b[55]
418 - 2*b[56] - b[57] - 2*b[58] - b[59] - 2*b[60] - b[61] - 2*b[62] - b[63];
419 a[63] = 8*b[0] - 8*b[1] - 8*b[2] + 8*b[3] - 8*b[4] + 8*b[5] + 8*b[6] - 8*b[7]
420 + 4*b[8] + 4*b[9] - 4*b[10] - 4*b[11] - 4*b[12] - 4*b[13] + 4*b[14] + 4*b[15]
421 + 4*b[16] - 4*b[17] + 4*b[18] - 4*b[19] - 4*b[20] + 4*b[21] - 4*b[22] + 4*b[23]
422 + 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 4*b[28] - 4*b[29] - 4*b[30] + 4*b[31]
423 + 2*b[32] + 2*b[33] + 2*b[34] + 2*b[35] - 2*b[36] - 2*b[37] - 2*b[38] - 2*b[39]
424 + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43] + 2*b[44] + 2*b[45] - 2*b[46] - 2*b[47]
425 + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + 2*b[52] - 2*b[53] + 2*b[54] - 2*b[55]
426 + b[56] + b[57] + b[58] + b[59] + b[60] + b[61] + b[62] + b[63];
429 __device__
void compute_b(
434 for (
int i0 = 0; i0 < 8; i0++) {
436 int zero_derivs =
FALSE;
441 for (
int i1 = 0; i1 < 3; i1++) {
442 inds2[i1] = (inds[i1] + ((i0 & bit) ? 1 : 0)) % k[i1];
445 if (cont[i1] && inds[i1] == (k[i1]-1) && inds2[i1] == 0) {
464 int d_hi[3] = {1, 1, 1};
465 int d_lo[3] = {1, 1, 1};
467 float dscales[3] = {0.5, 0.5, 0.5};
469 for (
int i1 = 0; i1 < 3; i1++) {
470 if (inds2[i1] == 0) {
472 d_lo[i1] = -(k[i1]-1);
473 voffs[i1] = offset[i1];
474 dscales[i1] = 1.0/(1.0 + gap[i1]) * 1.0/gapscale[i1];
476 else zero_derivs =
TRUE;
478 else if (inds2[i1] == k[i1]-1) {
480 d_hi[i1] = -(k[i1]-1);
481 voffs[i1] = offset[i1];
482 dscales[i1] = 1.0/(1.0 + gap[i1]) * 1.0/gapscale[i1];
484 else zero_derivs =
TRUE;
493 b[i0] = get_grid(inds2[0],inds2[1],inds2[2]) + voff;
504 b[8+i0] = dscales[0] * (get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]) + voffs[0]);
505 b[16+i0] = dscales[1] * (get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]) - get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]) + voffs[1]);
506 b[24+i0] = dscales[2] * (get_grid_d(inds2[0],inds2[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0],inds2[1],inds2[2]-d_lo[2]) + voffs[2]);
507 b[32+i0] = dscales[0] * dscales[1]
508 * (get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]) -
509 get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]));
510 b[40+i0] = dscales[0] * dscales[2]
511 * (get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]+d_hi[2]) -
512 get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]-d_lo[2]));
513 b[48+i0] = dscales[1] * dscales[2]
514 * (get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) -
515 get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));
517 b[56+i0] = dscales[0] * dscales[1] * dscales[2]
518 * (get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) -
519 get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) +
520 get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) +
521 get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));
535 int err = get_inds(pos, inds, dg, gapscale);
543 compute_b(b, inds, gapscale);
551 float x[4], y[4], z[4];
552 x[0] = 1; y[0] = 1; z[0] = 1;
554 for (
int j = 1; j < 4; j++) {
555 x[j] = x[j-1] * dg.
x;
556 y[j] = y[j-1] * dg.
y;
557 z[j] = z[j-1] * dg.
z;
560 V = compute_V(a, x, y, z);
567 __device__
inline float get_grid(
const int i0,
const int i1,
const int i2)
const {
568 return grid[grid_index(i0, i1, i2)];
570 __device__
inline double get_grid_d(
const int i0,
const int i1,
const int i2)
const {
571 return double(get_grid(i0, i1, i2));
581 __device__
inline long int grid_index(
int i0,
int i1,
int i2)
const {
582 int inds[3] = {i0, i1, i2};
583 return inds[0]*dk[0] + inds[1]*dk[1] + inds[2]*dk[2];
586 __host__ __device__ GridforceGridCUDA(
622 k[0]=h_k1; k[1]=h_k2; k[2]=h_k3;
623 dk[0]=dh_k1; dk[1]=dh_k2; dk[2]=dh_k3;
624 cont[0]=h_cont1; cont[1]=h_cont2; cont[2]=h_cont3;
625 gapinv[0]=h_gapinv1; gapinv[1]=h_gapinv2; gapinv[2]=h_gapinv3;
626 gap[0]=h_gap1; gap[1]=h_gap2; gap[2]=h_gap3;
627 offset[0]=h_offset1; offset[1]=h_offset2; offset[2]=h_offset3;
657 #endif //NODEGROUP_FORCE_REGISTER
virtual Position get_center(void) const =0
static NAMD_HOST_DEVICE Tensor diagonal(const Vector &v1)
virtual Vector get_scale(void) const =0
Position get_corner(int idx)
Position wrap_position(const Position &pos, const Lattice &lattice)
int compute_VdV(Position pos, float &V, Vector &dV) const
NAMD_HOST_DEVICE Vector wrap_delta(const Position &pos1) const
NAMD_HOST_DEVICE Vector origin() const