NAMD
GridforceGridCUDAKernel.h
Go to the documentation of this file.
1 #ifndef GRIDFORCEGRIDCUDAKERNEL_H
2 #define GRIDFORCEGRIDCUDAKERNEL_H
3 #include "GridforceGridCUDA.h"
4 
5 #ifdef NAMD_CUDA
6 #include <cuda.h>
7 #endif
8 #ifdef NAMD_HIP
9 #include <hip/hip_runtime.h>
10 #endif
11 
12 #include <vector>
13 #include "Lattice.h"
14 #include "CudaUtils.h"
15 #include "CudaRecord.h"
16 #include "HipDefines.h"
17 
18 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
19 #ifdef NODEGROUP_FORCE_REGISTER
20 
21 // A slice of GridForceGrid.h and GridForceGrid.inl, but we only have
22 // the subset necessary to support the compute_[abVdV] on the device
23 // side.
24 
25 // Minus any of the loading/packing/setup
26 
27 // We copy only the members we need in our compute methods from the
28 // host grid in our constructor.
29 
30 // Any pointers in here are to fixed size arrays, or to buffers
31 // allocated by the caller of the constructor.
32 
33 class ForceEnergy
34 {
35 
36 public:
37  Force force;
38  double energy;
39 
40  __host__ __device__ ForceEnergy(Force in_f, double in_e): force(in_f), energy(in_e)
41  {
42 
43  }
44 
45 };
46 class GridforceGridCUDA
47 {
48  friend class GridforceGrid;
49  public:
50  __host__ __device__ GridforceGridCUDA(){}
51  __device__ Position wrap_position(
52  const Position &pos,
53  const Lattice &lattice) const
54  {
55  // Wrap 'pos' about grid center, using periodic cell information in 'lattice'
56  // Position pos_wrapped = pos;
57  // Position center = get_center();
58  // pos_wrapped += lattice.wrap_delta(pos);
59  // pos_wrapped += lattice.delta(pos_wrapped, center) - (pos_wrapped - center);
60  Position pos_wrapped = pos + lattice.wrap_delta(pos - get_center() + lattice.origin());
61  return pos_wrapped;
62  }
63 
64  __device__ Position get_corner(int idx);
65  __device__ inline Position get_center(void) const { return center; }
66 
67  // code from Chris Maffeo
68  // adapted by Eric Bohm
69  // disabled by default in computeGridForceKernel due to results not agreeing with the prior method
70  __device__ inline ForceEnergy interpolateForceD(const Vector& pos) const
71  {
72  Vector f;
73 
74  // *original* const Vector l = basisInv.transform(pos - origin);
75  // we have inv, but it has no transform method.
76  // conversion below
77  const Vector l = inv*(pos - origin);
78 
79  const int homeX = int(floor(l.x));
80  const int homeY = int(floor(l.y));
81  const int homeZ = int(floor(l.z));
82  const float wx = l.x - homeX;
83  const float wy = l.y - homeY;
84  const float wz = l.z - homeZ;
85  const float wx2 = wx*wx;
86 
87  /* f.x */
88  float g3[3][4];
89 #pragma unroll
90  for (int iz = 0; iz < 4; iz++) {
91  float g2[2][4];
92  int jz = (iz + homeZ - 1);
93  // manage boundary case
94  jz += cont[2] ? ( jz <0 ? k[2] : ( jz >= k[2] ? -k[2] : 0) ) : 0;
95  for (int iy = 0; iy < 4; iy++) {
96  float v[4];
97  int jy = (iy + homeY - 1);
98  // manage boundary case
99  jy += cont[1] ? ( jy <0 ? k[1] : ( jy >= k[1] ? -k[1] : 0) ) : 0;
100  for (int ix = 0; ix < 4; ix++) {
101  int jx = (ix + homeX - 1);
102  // manage boundary case
103  jx += cont[0] ? ( jx <0 ? k[0] : ( jx >= k[0] ? -k[0] : 0) ) : 0;
104  const int ind = jz + jy*k[2] + jx*k[2]*k[1];
105  v[ix] = jz < 0 || jz >= k[2] || jy < 0 || jy >= k[1] || jx < 0 || jx >= k[0] ?
106  0 : grid[ind];
107  }
108  const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2;
109  const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx;
110  const float a1 = 0.5f*(-v[0] + v[2]);
111  g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1; /* f.x (derivative) */
112  g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1]; /* f.y & f.z */
113  }
114 
115  // Mix along y.
116  {
117  g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy +
118  0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3]) *wy*wy +
119  0.5f*(-g2[0][0] + g2[0][2]) *wy +
120  g2[0][1];
121  }
122 
123  {
124  const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy;
125  const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy;
126  const float a1 = 0.5f*(-g2[1][0] + g2[1][2]);
127  g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1; /* f.y */
128  g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1]; /* f.z */
129  }
130  }
131 
132  // Mix along z.
133  f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
134  -0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
135  -0.5f*(-g3[0][0] + g3[0][2]) *wz -
136  g3[0][1];
137  f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
138  -0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
139  -0.5f*(-g3[1][0] + g3[1][2]) *wz -
140  g3[1][1];
141  f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
142  (2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]) *wz -
143  0.5f*(-g3[2][0] + g3[2][2]);
144  float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
145  0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]) *wz*wz +
146  0.5f*(-g3[2][0] + g3[2][2]) *wz +
147  g3[2][1];
148 
149  // *original* f = basisInv.transpose().transform(f);
150  // basisInv is a "set of unit vectors for the grid"
151  // inv is the inverse Tensor of unit vector Tensor e
152  // so inv seems to be the right thing.
153  // missing ingredient : the transform method is not defined for Tensor.
154  Tensor inv_t = transpose(inv);
155  f=inv_t*f;
156  return ForceEnergy(f,e);
157  }
158 
159 
160  __device__ int get_inds(
161  const Position& pos,
162  int *inds,
163  Vector &dg,
164  Vector &gapscale) const
165  {
166  Vector p = pos - origin;
167  Vector g;
168 
169  g = inv * p;
170 #pragma unroll
171  for (int i = 0; i < 3; i++) {
172  inds[i] = (int)floor(g[i]);
173  dg[i] = g[i] - inds[i];
174  }
175 #pragma unroll
176  for (int i = 0; i < 3; i++) {
177  if (inds[i] < 0 || inds[i] >= k[i]-1) {
178  if (cont[i]) inds[i] = k[i]-1;
179  else return -1; // Outside potential and grid is not continuous
180  }
181  if (cont[i] && inds[i] == k[i]-1) {
182  // Correct for non-unit spacing between continuous grid images
183  gapscale[i] *= gapinv[i];
184  if (g[i] < 0.0) dg[i] = 1.0 + g[i]*gapinv[i]; // = (gap[i] + g[i]) * gapinv[i]
185  else dg[i] = (g[i] - inds[i]) * gapinv[i];
186  }
187  }
188  return 0;
189  }
190 
191 
192  __device__ float compute_V(
193  const float *a,
194  const float *x,
195  const float *y,
196  const float *z) const
197  {
198  float V = 0.0;
199  long int ind = 0;
200  for (int l = 0; l < 4; l++) {
201  for (int k = 0; k < 4; k++) {
202 #pragma unroll
203  for (int j = 0; j < 4; j++) {
204  V += a[ind] * x[j] * y[k] * z[l];
205  ind++;
206  }
207  }
208  }
209  return V;
210  }
211 
212 
213  __device__ Vector compute_dV(
214  const float *a,
215  const float *x,
216  const float *y,
217  const float *z) const
218  {
219  Vector dV = 0;
220  long int ind = 0;
221 
222  for (int l = 0; l < 4; l++) {
223  for (int k = 0; k < 4; k++) {
224 #pragma unroll
225  for (int j = 0; j < 4; j++) {
226  if (j > 0) dV.x += a[ind] * j * x[j-1] * y[k] * z[l]; // dV/dx
227  if (k > 0) dV.y += a[ind] * k * x[j] * y[k-1] * z[l]; // dV/dy
228  if (l > 0) dV.z += a[ind] * l * x[j] * y[k] * z[l-1]; // dV/dz
229  ind++;
230  }
231  }
232  }
233  return dV;
234  }
235 
236 
237  __device__ Vector compute_d2V(
238  const float *a,
239  const float *x,
240  const float *y,
241  const float *z) const
242  {
243  Vector d2V = 0;
244  int ind = 0;
245  for (int l = 0; l < 4; l++) {
246  for (int k = 0; k < 4; k++) {
247  for (int j = 0; j < 4; j++) {
248  if (j > 0 && k > 0) d2V.x += a[ind] * j * k * x[j-1] * y[k-1] * z[l]; // d2V/dxdy
249  if (j > 0 && l > 0) d2V.y += a[ind] * j * l * x[j-1] * y[k] * z[l-1]; // d2V/dxdz
250  if (k > 0 && l > 0) d2V.z += a[ind] * k * l * x[j] * y[k-1] * z[l-1]; // d2V/dydz
251  ind++;
252  }
253  }
254  }
255  return d2V;
256  }
257 
258  __device__ float compute_d3V(
259  const float *a,
260  const float *x,
261  const float *y,
262  const float *z) const
263  {
264  float d3V = 0.0;
265  long int ind = 0;
266  for (int l = 0; l < 4; l++) {
267  for (int k = 0; k < 4; k++) {
268  for (int j = 0; j < 4; j++) {
269  if (j > 0 && k > 0 && l > 0) d3V += a[ind] * j * k * l * x[j-1] * y[k-1] * z[l-1]; // d3V/dxdydz
270  ind++;
271  }
272  }
273  }
274  return d3V;
275  }
276 
277 
278  __device__ void compute_a(
279  float *a,
280  const float *b) const
281  {
282  // Static sparse 64x64 matrix times vector ... nicer looking way than this?
283  a[0] = b[0];
284  a[1] = b[8];
285  a[2] = -3*b[0] + 3*b[1] - 2*b[8] - b[9];
286  a[3] = 2*b[0] - 2*b[1] + b[8] + b[9];
287  a[4] = b[16];
288  a[5] = b[32];
289  a[6] = -3*b[16] + 3*b[17] - 2*b[32] - b[33];
290  a[7] = 2*b[16] - 2*b[17] + b[32] + b[33];
291  a[8] = -3*b[0] + 3*b[2] - 2*b[16] - b[18];
292  a[9] = -3*b[8] + 3*b[10] - 2*b[32] - b[34];
293  a[10] = 9*b[0] - 9*b[1] - 9*b[2] + 9*b[3] + 6*b[8] + 3*b[9] - 6*b[10] - 3*b[11]
294  + 6*b[16] - 6*b[17] + 3*b[18] - 3*b[19] + 4*b[32] + 2*b[33] + 2*b[34] + b[35];
295  a[11] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 3*b[8] - 3*b[9] + 3*b[10] + 3*b[11]
296  - 4*b[16] + 4*b[17] - 2*b[18] + 2*b[19] - 2*b[32] - 2*b[33] - b[34] - b[35];
297  a[12] = 2*b[0] - 2*b[2] + b[16] + b[18];
298  a[13] = 2*b[8] - 2*b[10] + b[32] + b[34];
299  a[14] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 4*b[8] - 2*b[9] + 4*b[10] + 2*b[11]
300  - 3*b[16] + 3*b[17] - 3*b[18] + 3*b[19] - 2*b[32] - b[33] - 2*b[34] - b[35];
301  a[15] = 4*b[0] - 4*b[1] - 4*b[2] + 4*b[3] + 2*b[8] + 2*b[9] - 2*b[10] - 2*b[11]
302  + 2*b[16] - 2*b[17] + 2*b[18] - 2*b[19] + b[32] + b[33] + b[34] + b[35];
303  a[16] = b[24];
304  a[17] = b[40];
305  a[18] = -3*b[24] + 3*b[25] - 2*b[40] - b[41];
306  a[19] = 2*b[24] - 2*b[25] + b[40] + b[41];
307  a[20] = b[48];
308  a[21] = b[56];
309  a[22] = -3*b[48] + 3*b[49] - 2*b[56] - b[57];
310  a[23] = 2*b[48] - 2*b[49] + b[56] + b[57];
311  a[24] = -3*b[24] + 3*b[26] - 2*b[48] - b[50];
312  a[25] = -3*b[40] + 3*b[42] - 2*b[56] - b[58];
313  a[26] = 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43]
314  + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 4*b[56] + 2*b[57] + 2*b[58] + b[59];
315  a[27] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43]
316  - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 2*b[56] - 2*b[57] - b[58] - b[59];
317  a[28] = 2*b[24] - 2*b[26] + b[48] + b[50];
318  a[29] = 2*b[40] - 2*b[42] + b[56] + b[58];
319  a[30] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43]
320  - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 2*b[56] - b[57] - 2*b[58] - b[59];
321  a[31] = 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43]
322  + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + b[56] + b[57] + b[58] + b[59];
323  a[32] = -3*b[0] + 3*b[4] - 2*b[24] - b[28];
324  a[33] = -3*b[8] + 3*b[12] - 2*b[40] - b[44];
325  a[34] = 9*b[0] - 9*b[1] - 9*b[4] + 9*b[5] + 6*b[8] + 3*b[9] - 6*b[12] - 3*b[13]
326  + 6*b[24] - 6*b[25] + 3*b[28] - 3*b[29] + 4*b[40] + 2*b[41] + 2*b[44] + b[45];
327  a[35] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 3*b[8] - 3*b[9] + 3*b[12] + 3*b[13]
328  - 4*b[24] + 4*b[25] - 2*b[28] + 2*b[29] - 2*b[40] - 2*b[41] - b[44] - b[45];
329  a[36] = -3*b[16] + 3*b[20] - 2*b[48] - b[52];
330  a[37] = -3*b[32] + 3*b[36] - 2*b[56] - b[60];
331  a[38] = 9*b[16] - 9*b[17] - 9*b[20] + 9*b[21] + 6*b[32] + 3*b[33] - 6*b[36] - 3*b[37]
332  + 6*b[48] - 6*b[49] + 3*b[52] - 3*b[53] + 4*b[56] + 2*b[57] + 2*b[60] + b[61];
333  a[39] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 3*b[32] - 3*b[33] + 3*b[36] + 3*b[37]
334  - 4*b[48] + 4*b[49] - 2*b[52] + 2*b[53] - 2*b[56] - 2*b[57] - b[60] - b[61];
335  a[40] = 9*b[0] - 9*b[2] - 9*b[4] + 9*b[6] + 6*b[16] + 3*b[18] - 6*b[20] - 3*b[22]
336  + 6*b[24] - 6*b[26] + 3*b[28] - 3*b[30] + 4*b[48] + 2*b[50] + 2*b[52] + b[54];
337  a[41] = 9*b[8] - 9*b[10] - 9*b[12] + 9*b[14] + 6*b[32] + 3*b[34] - 6*b[36] - 3*b[38]
338  + 6*b[40] - 6*b[42] + 3*b[44] - 3*b[46] + 4*b[56] + 2*b[58] + 2*b[60] + b[62];
339  a[42] = -27*b[0] + 27*b[1] + 27*b[2] - 27*b[3] + 27*b[4] - 27*b[5] - 27*b[6] + 27*b[7]
340  - 18*b[8] - 9*b[9] + 18*b[10] + 9*b[11] + 18*b[12] + 9*b[13] - 18*b[14] - 9*b[15]
341  - 18*b[16] + 18*b[17] - 9*b[18] + 9*b[19] + 18*b[20] - 18*b[21] + 9*b[22] - 9*b[23]
342  - 18*b[24] + 18*b[25] + 18*b[26] - 18*b[27] - 9*b[28] + 9*b[29] + 9*b[30] - 9*b[31]
343  - 12*b[32] - 6*b[33] - 6*b[34] - 3*b[35] + 12*b[36] + 6*b[37] + 6*b[38] + 3*b[39]
344  - 12*b[40] - 6*b[41] + 12*b[42] + 6*b[43] - 6*b[44] - 3*b[45] + 6*b[46] + 3*b[47]
345  - 12*b[48] + 12*b[49] - 6*b[50] + 6*b[51] - 6*b[52] + 6*b[53] - 3*b[54] + 3*b[55]
346  - 8*b[56] - 4*b[57] - 4*b[58] - 2*b[59] - 4*b[60] - 2*b[61] - 2*b[62] - b[63];
347  a[43] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
348  + 9*b[8] + 9*b[9] - 9*b[10] - 9*b[11] - 9*b[12] - 9*b[13] + 9*b[14] + 9*b[15]
349  + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
350  + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
351  + 6*b[32] + 6*b[33] + 3*b[34] + 3*b[35] - 6*b[36] - 6*b[37] - 3*b[38] - 3*b[39]
352  + 6*b[40] + 6*b[41] - 6*b[42] - 6*b[43] + 3*b[44] + 3*b[45] - 3*b[46] - 3*b[47]
353  + 8*b[48] - 8*b[49] + 4*b[50] - 4*b[51] + 4*b[52] - 4*b[53] + 2*b[54] - 2*b[55]
354  + 4*b[56] + 4*b[57] + 2*b[58] + 2*b[59] + 2*b[60] + 2*b[61] + b[62] + b[63];
355  a[44] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 3*b[16] - 3*b[18] + 3*b[20] + 3*b[22]
356  - 4*b[24] + 4*b[26] - 2*b[28] + 2*b[30] - 2*b[48] - 2*b[50] - b[52] - b[54];
357  a[45] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 3*b[32] - 3*b[34] + 3*b[36] + 3*b[38]
358  - 4*b[40] + 4*b[42] - 2*b[44] + 2*b[46] - 2*b[56] - 2*b[58] - b[60] - b[62];
359  a[46] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
360  + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
361  + 9*b[16] - 9*b[17] + 9*b[18] - 9*b[19] - 9*b[20] + 9*b[21] - 9*b[22] + 9*b[23]
362  + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
363  + 6*b[32] + 3*b[33] + 6*b[34] + 3*b[35] - 6*b[36] - 3*b[37] - 6*b[38] - 3*b[39]
364  + 8*b[40] + 4*b[41] - 8*b[42] - 4*b[43] + 4*b[44] + 2*b[45] - 4*b[46] - 2*b[47]
365  + 6*b[48] - 6*b[49] + 6*b[50] - 6*b[51] + 3*b[52] - 3*b[53] + 3*b[54] - 3*b[55]
366  + 4*b[56] + 2*b[57] + 4*b[58] + 2*b[59] + 2*b[60] + b[61] + 2*b[62] + b[63];
367  a[47] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
368  - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
369  - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
370  - 8*b[24] + 8*b[25] + 8*b[26] - 8*b[27] - 4*b[28] + 4*b[29] + 4*b[30] - 4*b[31]
371  - 3*b[32] - 3*b[33] - 3*b[34] - 3*b[35] + 3*b[36] + 3*b[37] + 3*b[38] + 3*b[39]
372  - 4*b[40] - 4*b[41] + 4*b[42] + 4*b[43] - 2*b[44] - 2*b[45] + 2*b[46] + 2*b[47]
373  - 4*b[48] + 4*b[49] - 4*b[50] + 4*b[51] - 2*b[52] + 2*b[53] - 2*b[54] + 2*b[55]
374  - 2*b[56] - 2*b[57] - 2*b[58] - 2*b[59] - b[60] - b[61] - b[62] - b[63];
375  a[48] = 2*b[0] - 2*b[4] + b[24] + b[28];
376  a[49] = 2*b[8] - 2*b[12] + b[40] + b[44];
377  a[50] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 4*b[8] - 2*b[9] + 4*b[12] + 2*b[13]
378  - 3*b[24] + 3*b[25] - 3*b[28] + 3*b[29] - 2*b[40] - b[41] - 2*b[44] - b[45];
379  a[51] = 4*b[0] - 4*b[1] - 4*b[4] + 4*b[5] + 2*b[8] + 2*b[9] - 2*b[12] - 2*b[13]
380  + 2*b[24] - 2*b[25] + 2*b[28] - 2*b[29] + b[40] + b[41] + b[44] + b[45];
381  a[52] = 2*b[16] - 2*b[20] + b[48] + b[52];
382  a[53] = 2*b[32] - 2*b[36] + b[56] + b[60];
383  a[54] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 4*b[32] - 2*b[33] + 4*b[36] + 2*b[37]
384  - 3*b[48] + 3*b[49] - 3*b[52] + 3*b[53] - 2*b[56] - b[57] - 2*b[60] - b[61];
385  a[55] = 4*b[16] - 4*b[17] - 4*b[20] + 4*b[21] + 2*b[32] + 2*b[33] - 2*b[36] - 2*b[37]
386  + 2*b[48] - 2*b[49] + 2*b[52] - 2*b[53] + b[56] + b[57] + b[60] + b[61];
387  a[56] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 4*b[16] - 2*b[18] + 4*b[20] + 2*b[22]
388  - 3*b[24] + 3*b[26] - 3*b[28] + 3*b[30] - 2*b[48] - b[50] - 2*b[52] - b[54];
389  a[57] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 4*b[32] - 2*b[34] + 4*b[36] + 2*b[38]
390  - 3*b[40] + 3*b[42] - 3*b[44] + 3*b[46] - 2*b[56] - b[58] - 2*b[60] - b[62];
391  a[58] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
392  + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
393  + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
394  + 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 9*b[28] - 9*b[29] - 9*b[30] + 9*b[31]
395  + 8*b[32] + 4*b[33] + 4*b[34] + 2*b[35] - 8*b[36] - 4*b[37] - 4*b[38] - 2*b[39]
396  + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43] + 6*b[44] + 3*b[45] - 6*b[46] - 3*b[47]
397  + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 6*b[52] - 6*b[53] + 3*b[54] - 3*b[55]
398  + 4*b[56] + 2*b[57] + 2*b[58] + b[59] + 4*b[60] + 2*b[61] + 2*b[62] + b[63];
399  a[59] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
400  - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
401  - 8*b[16] + 8*b[17] - 4*b[18] + 4*b[19] + 8*b[20] - 8*b[21] + 4*b[22] - 4*b[23]
402  - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
403  - 4*b[32] - 4*b[33] - 2*b[34] - 2*b[35] + 4*b[36] + 4*b[37] + 2*b[38] + 2*b[39]
404  - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43] - 3*b[44] - 3*b[45] + 3*b[46] + 3*b[47]
405  - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 4*b[52] + 4*b[53] - 2*b[54] + 2*b[55]
406  - 2*b[56] - 2*b[57] - b[58] - b[59] - 2*b[60] - 2*b[61] - b[62] - b[63];
407  a[60] = 4*b[0] - 4*b[2] - 4*b[4] + 4*b[6] + 2*b[16] + 2*b[18] - 2*b[20] - 2*b[22]
408  + 2*b[24] - 2*b[26] + 2*b[28] - 2*b[30] + b[48] + b[50] + b[52] + b[54];
409  a[61] = 4*b[8] - 4*b[10] - 4*b[12] + 4*b[14] + 2*b[32] + 2*b[34] - 2*b[36] - 2*b[38]
410  + 2*b[40] - 2*b[42] + 2*b[44] - 2*b[46] + b[56] + b[58] + b[60] + b[62];
411  a[62] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
412  - 8*b[8] - 4*b[9] + 8*b[10] + 4*b[11] + 8*b[12] + 4*b[13] - 8*b[14] - 4*b[15]
413  - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
414  - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
415  - 4*b[32] - 2*b[33] - 4*b[34] - 2*b[35] + 4*b[36] + 2*b[37] + 4*b[38] + 2*b[39]
416  - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43] - 4*b[44] - 2*b[45] + 4*b[46] + 2*b[47]
417  - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 3*b[52] + 3*b[53] - 3*b[54] + 3*b[55]
418  - 2*b[56] - b[57] - 2*b[58] - b[59] - 2*b[60] - b[61] - 2*b[62] - b[63];
419  a[63] = 8*b[0] - 8*b[1] - 8*b[2] + 8*b[3] - 8*b[4] + 8*b[5] + 8*b[6] - 8*b[7]
420  + 4*b[8] + 4*b[9] - 4*b[10] - 4*b[11] - 4*b[12] - 4*b[13] + 4*b[14] + 4*b[15]
421  + 4*b[16] - 4*b[17] + 4*b[18] - 4*b[19] - 4*b[20] + 4*b[21] - 4*b[22] + 4*b[23]
422  + 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 4*b[28] - 4*b[29] - 4*b[30] + 4*b[31]
423  + 2*b[32] + 2*b[33] + 2*b[34] + 2*b[35] - 2*b[36] - 2*b[37] - 2*b[38] - 2*b[39]
424  + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43] + 2*b[44] + 2*b[45] - 2*b[46] - 2*b[47]
425  + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + 2*b[52] - 2*b[53] + 2*b[54] - 2*b[55]
426  + b[56] + b[57] + b[58] + b[59] + b[60] + b[61] + b[62] + b[63];
427  }
428 
429  __device__ void compute_b(
430  float *b,
431  const int *inds,
432  Vector gapscale) const
433  {
434  for (int i0 = 0; i0 < 8; i0++) {
435  int inds2[3];
436  int zero_derivs = FALSE;
437 
438  float voff = 0.0;
439  int bit = 1; // bit = 2^i1 in the below loop
440 #pragma unroll
441  for (int i1 = 0; i1 < 3; i1++) {
442  inds2[i1] = (inds[i1] + ((i0 & bit) ? 1 : 0)) % k[i1];
443 
444  // Deal with voltage offsets
445  if (cont[i1] && inds[i1] == (k[i1]-1) && inds2[i1] == 0) {
446  voff += offset[i1];
447  }
448 
449  bit <<= 1; // i.e. multiply by 2
450  }
451 
452  // NOTE: leaving everything in terms of unit cell coordinates for now,
453  // eventually will multiply by inv tensor when applying the force
454 
455  // First set variables 'dk_{hi,lo}' (glob notation). The 'hi'
456  // ('lo') variable in a given dimension is the number added (subtracted)
457  // to go up (down) one grid point in that dimension; both are normally
458  // just the corresponding 'dk[i]'. However, if we are sitting on a
459  // boundary and we are using a continuous grid, then we want to map the
460  // next point off the grid back around to the other side. e.g. point
461  // (k[0], i1, k) maps to point (0, i1, k), which would be
462  // accomplished by changing 'dk1_hi' to -(k[0]-1)*dk1.
463 
464  int d_hi[3] = {1, 1, 1};
465  int d_lo[3] = {1, 1, 1};
466  float voffs[3];
467  float dscales[3] = {0.5, 0.5, 0.5};
468 #pragma unroll
469  for (int i1 = 0; i1 < 3; i1++) {
470  if (inds2[i1] == 0) {
471  if (cont[i1]) {
472  d_lo[i1] = -(k[i1]-1);
473  voffs[i1] = offset[i1];
474  dscales[i1] = 1.0/(1.0 + gap[i1]) * 1.0/gapscale[i1];
475  }
476  else zero_derivs = TRUE;
477  }
478  else if (inds2[i1] == k[i1]-1) {
479  if (cont[i1]) {
480  d_hi[i1] = -(k[i1]-1);
481  voffs[i1] = offset[i1];
482  dscales[i1] = 1.0/(1.0 + gap[i1]) * 1.0/gapscale[i1];
483  }
484  else zero_derivs = TRUE;
485  }
486  else {
487  voffs[i1] = 0.0;
488  }
489  }
490 
491 
492  // V
493  b[i0] = get_grid(inds2[0],inds2[1],inds2[2]) + voff;
494 
495  if (zero_derivs) {
496  b[8+i0] = 0.0;
497  b[16+i0] = 0.0;
498  b[24+i0] = 0.0;
499  b[32+i0] = 0.0;
500  b[40+i0] = 0.0;
501  b[48+i0] = 0.0;
502  b[56+i0] = 0.0;
503  } else {
504  b[8+i0] = dscales[0] * (get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]) + voffs[0]); // dV/dx
505  b[16+i0] = dscales[1] * (get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]) - get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]) + voffs[1]); // dV/dy
506  b[24+i0] = dscales[2] * (get_grid_d(inds2[0],inds2[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0],inds2[1],inds2[2]-d_lo[2]) + voffs[2]); // dV/dz
507  b[32+i0] = dscales[0] * dscales[1]
508  * (get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]) -
509  get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2])); // d2V/dxdy
510  b[40+i0] = dscales[0] * dscales[2]
511  * (get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]+d_hi[2]) -
512  get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]-d_lo[2])); // d2V/dxdz
513  b[48+i0] = dscales[1] * dscales[2]
514  * (get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) -
515  get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2])); // d2V/dydz
516 
517  b[56+i0] = dscales[0] * dscales[1] * dscales[2] // d3V/dxdydz
518  * (get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) -
519  get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) +
520  get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) +
521  get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));
522  }
523  }
524  }
525 
526  __device__ int compute_VdV(
527  const Position pos,
528  float &V,
529  Vector &dV) const
530  {
531  int inds[3];
532  Vector g, dg;
533  Vector gapscale = Vector(1, 1, 1);
534 
535  int err = get_inds(pos, inds, dg, gapscale);
536  if (err) {
537  return -1;
538  }
539 
540 
541  // Compute b
542  float b[64]; // Matrix of values at 8 box corners
543  compute_b(b, inds, gapscale);
544 
545  // Compute a
546  float a[64];
547  compute_a(a, b);
548 
549  // Calculate powers of x, y, z for later use
550  // e.g. x[2] = x^2
551  float x[4], y[4], z[4];
552  x[0] = 1; y[0] = 1; z[0] = 1;
553 #pragma unroll
554  for (int j = 1; j < 4; j++) {
555  x[j] = x[j-1] * dg.x;
556  y[j] = y[j-1] * dg.y;
557  z[j] = z[j-1] * dg.z;
558  }
559 
560  V = compute_V(a, x, y, z);
561  dV = Tensor::diagonal(gapscale) * (compute_dV(a, x, y, z) * inv);
562 
563  return 0;
564  }
565 
566 
567  __device__ inline float get_grid(const int i0, const int i1, const int i2) const {
568  return grid[grid_index(i0, i1, i2)];
569  }
570  __device__ inline double get_grid_d(const int i0, const int i1, const int i2) const {
571  return double(get_grid(i0, i1, i2));
572  }
573  __device__ inline Vector get_scale(void) const { return scale; }
574 
575  struct GridIndices {
576  int inds2;
577  int dk_hi;
578  int dk_lo;
579  Bool zero_derivs;
580  };
581  __device__ inline long int grid_index(int i0, int i1, int i2) const {
582  int inds[3] = {i0, i1, i2};
583  return inds[0]*dk[0] + inds[1]*dk[1] + inds[2]*dk[2];
584  }
585 
586  __host__ __device__ GridforceGridCUDA(
587  int h_k1,
588  int h_k2,
589  int h_k3,
590  long int h_size,
591  long int dh_k1,
592  long int dh_k2,
593  long int dh_k3,
594  float h_factor,
595  Position h_origin,
596  Position h_center,
597  Bool h_cont1,
598  Bool h_cont2,
599  Bool h_cont3,
600  float h_gapinv1,
601  float h_gapinv2,
602  float h_gapinv3,
603  float h_gap1,
604  float h_gap2,
605  float h_gap3,
606  Tensor h_inv,
607  float h_offset1,
608  float h_offset2,
609  float h_offset3,
610  Vector h_scale,
611  float *h_grid
612  ):
613  size(h_size),
614  factor(h_factor),
615  origin(h_origin),
616  center(h_center),
617  inv(h_inv),
618  scale(h_scale),
619  grid(h_grid)
620 
621  {
622  k[0]=h_k1; k[1]=h_k2; k[2]=h_k3;
623  dk[0]=dh_k1; dk[1]=dh_k2; dk[2]=dh_k3;
624  cont[0]=h_cont1; cont[1]=h_cont2; cont[2]=h_cont3;
625  gapinv[0]=h_gapinv1; gapinv[1]=h_gapinv2; gapinv[2]=h_gapinv3;
626  gap[0]=h_gap1; gap[1]=h_gap2; gap[2]=h_gap3;
627  offset[0]=h_offset1; offset[1]=h_offset2; offset[2]=h_offset3;
628  }
629 
630 
631  long int dk[3];
632  long int size;
633  float *grid; // Actual grid
634 private:
635 
636  Vector corners[8];
637 
638 
639  int k[3]; // Grid dimensions
640 
641 
642  float factor;
643 
644  Position origin; // Grid origin
645  Position center; // Center of grid (for wrapping)
646 
647  Bool cont[3]; // Whether grid is continuous in each dimension
648 
649  float gapinv[3]; // 1.0/gap
650  Vector scale;
651 
652  Tensor inv; // Inverse of unit vectors
653  float offset[3]; // Potential offset in each dimension
654  float gap[3]; // Gap between images of grid in grid units for each dimension
655 };
656 
657 #endif //NODEGROUP_FORCE_REGISTER
658 #endif
659 #endif
Definition: Vector.h:72
BigReal z
Definition: Vector.h:74
#define FALSE
Definition: common.h:127
virtual Position get_center(void) const =0
static NAMD_HOST_DEVICE Tensor diagonal(const Vector &v1)
Definition: Tensor.h:37
int Bool
Definition: common.h:142
BigReal x
Definition: Vector.h:74
virtual Vector get_scale(void) const =0
Definition: Tensor.h:15
BigReal y
Definition: Vector.h:74
Position get_corner(int idx)
Position wrap_position(const Position &pos, const Lattice &lattice)
int compute_VdV(Position pos, float &V, Vector &dV) const
Definition: GridForceGrid.h:53
NAMD_HOST_DEVICE Vector wrap_delta(const Position &pos1) const
Definition: Lattice.h:222
#define TRUE
Definition: common.h:128
NAMD_HOST_DEVICE Vector origin() const
Definition: Lattice.h:278