namd/doxygen/GridforceGridCUDAKernel_8h_source.html

 #ifndef GRIDFORCEGRIDCUDAKERNEL_H
 #define GRIDFORCEGRIDCUDAKERNEL_H
 #include "GridforceGridCUDA.h"

 #ifdef NAMD_CUDA
 #include <cuda.h>
 #endif
 #ifdef NAMD_HIP
 #include <hip/hip_runtime.h>
 #endif

 #include <vector>
 #include "Lattice.h"
 #include "CudaUtils.h"
 #include "CudaRecord.h"
 #include "HipDefines.h"

 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
 #ifdef NODEGROUP_FORCE_REGISTER

 // A device-side slice of GridForceGrid.h and GridForceGrid.inl: only the
 // subset needed to support the compute_[abVdV] methods on the device,
 // without any of the loading, packing, or setup code.

 // The constructor copies from the host grid only the members that the
 // compute methods need.

 // Any pointers in here refer either to fixed-size arrays or to buffers
 // allocated by the caller of the constructor.

 // Simple value pair bundling a force with an energy.
 // interpolateForceD() returns its result in this form.
 class ForceEnergy
 {
 public:
   Force force;     // force component of the result
   double energy;   // energy component of the result

   // Store the given force and energy; usable from host and device code.
   __host__ __device__ ForceEnergy(Force in_f, double in_e)
     : force(in_f),
       energy(in_e)
   {}
 };
 class GridforceGridCUDA
 {
  friend class GridforceGrid;
  public:
   // Default constructor: assigns nothing explicitly, so every member keeps
   // whatever its own default initialization yields.
   __host__ __device__ GridforceGridCUDA()
   {
   }
  __device__ Position wrap_position(
     const Position &pos,
     const Lattice &lattice) const
   {
     // Wrap 'pos' about the grid center using the periodic cell information
     // in 'lattice'.  The wrap offset is evaluated for the position expressed
     // relative to the grid center and shifted by the lattice origin; that
     // offset is then applied to the original position.
     const Position relative = pos - get_center() + lattice.origin();
     return pos + lattice.wrap_delta(relative);
   }

   // Declaration only; no definition is visible in this header.
   // NOTE(review): presumably returns corners[idx] -- confirm against the
   // definition, and confirm that 'corners' is populated before use, since
   // neither constructor in this header assigns it.
   __device__ Position get_corner(int idx);
   // Accessor for the grid center (the point wrap_position() wraps about).
   __device__ inline Position get_center(void) const
   {
     return center;
   }

   // code from Chris Maffeo
   // adapted by Eric Bohm
   // disabled by default in computeGridForceKernel due to results not agreeing with the prior method
   // Interpolate the grid at 'pos' and return the force and energy there.
   //
   // 'pos' is converted to grid coordinates with inv*(pos - origin).  The
   // 4x4x4 block of samples surrounding the containing cell is then reduced
   // one axis at a time (x, then y, then z) with a four-point cubic
   // (coefficients 0.5*(-v0+3v1-3v2+v3), 0.5*(2v0-5v1+4v2-v3), 0.5*(-v0+v2),
   // v1).  Along each axis both the interpolated value and its derivative
   // are carried so that all three gradient components and the energy come
   // out of a single pass.
   //
   // Returns ForceEnergy(f, e) where e is the interpolated value and f is the
   // negated gradient multiplied by transpose(inv).
   //
   // Samples whose indices fall outside the grid (after the optional
   // one-period wrap for continuous dimensions) are treated as 0.
   //
   // NOTE(review): the linear index below is built from k[] with z varying
   // fastest, whereas get_grid()/grid_index() use the dk[] strides.  Whether
   // the two layouts agree depends on the dk values passed to the
   // constructor -- worth confirming given the "results not agreeing" remark
   // above.
   __device__ inline ForceEnergy interpolateForceD(const Vector& pos) const
   {
     Vector f;

     // *original*   const Vector l = basisInv.transform(pos - origin);
     // we have inv, but it has no transform method.
     // conversion below
     const Vector l = inv*(pos - origin);

     // Integer cell index and fractional offset within the cell, per axis.
     const int homeX = int(floor(l.x));
     const int homeY = int(floor(l.y));
     const int homeZ = int(floor(l.z));
     const float wx = l.x - homeX;
     const float wy = l.y - homeY;
     const float wz = l.z - homeZ;
     const float wx2 = wx*wx;

     /* f.x */
     // g3[0][iz]: d/dx term, g3[1][iz]: d/dy term, g3[2][iz]: value,
     // each already interpolated along x and y for z-slice iz.
     float g3[3][4];
 #pragma unroll
     for (int iz = 0; iz < 4; iz++) {
       // g2[0][iy]: x-derivative, g2[1][iy]: x-interpolated value for row iy.
       float g2[2][4];
       int jz = (iz + homeZ - 1);
       // manage boundary case
       // (continuous dimension: shift by one period back into [0, k))
       jz += cont[2] ? ( jz <0 ? k[2] : ( jz >= k[2]  ? -k[2] : 0) ) : 0;
       for (int iy = 0; iy < 4; iy++) {
         float v[4];
         int jy = (iy + homeY - 1);
         // manage boundary case
         jy += cont[1] ? ( jy <0 ? k[1] : ( jy >= k[1]  ? -k[1] : 0) ) : 0;
         for (int ix = 0; ix < 4; ix++) {
           int jx = (ix + homeX - 1);
           // manage boundary case
           jx += cont[0] ? ( jx <0 ? k[0] : ( jx >= k[0]  ? -k[0] : 0) ) : 0;
           const int ind = jz + jy*k[2] + jx*k[2]*k[1];
           // Out-of-range samples contribute zero instead of being read.
           v[ix] = jz < 0 || jz >= k[2] || jy < 0 || jy >= k[1] || jx < 0 || jx >= k[0] ?
             0 : grid[ind];
         }
         // a3 and a2 already include wx^2 and wx respectively, so the
         // derivative needs no further powers and the value needs one more wx.
         const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2;
         const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx;
         const float a1 = 0.5f*(-v[0] + v[2]);
         g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1;       /* f.x (derivative) */
         g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1]; /* f.y & f.z */
       }

       // Mix along y.
       // The x-derivative is only interpolated (not differentiated) along y.
       {
         g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy +
           0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3])      *wy*wy +
           0.5f*(-g2[0][0] + g2[0][2])                                          *wy +
           g2[0][1];
       }

       // The x-interpolated value yields both the y-derivative and the
       // y-interpolated value.
       {
         const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy;
         const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy;
         const float a1 = 0.5f*(-g2[1][0] + g2[1][2]);
         g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1;                                             /* f.y */
         g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1]; /* f.z */
       }
     }

     // Mix along z.
     // f.x and f.y: interpolate the derivative terms along z and negate.
     f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
       -0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
       -0.5f*(-g3[0][0] + g3[0][2])                                    *wz -
       g3[0][1];
     f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
       -0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
       -0.5f*(-g3[1][0] + g3[1][2])                                    *wz -
       g3[1][1];
     // f.z: negated z-derivative of the cubic through the value terms.
     f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
       (2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])      *wz -
       0.5f*(-g3[2][0] + g3[2][2]);
     // Energy: the value terms interpolated along z.
     float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
       0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])    *wz*wz +
       0.5f*(-g3[2][0] + g3[2][2])                                        *wz +
       g3[2][1];

     // *original*    f = basisInv.transpose().transform(f);
     // basisInv is a "set of unit vectors for the grid"
     // inv is the inverse Tensor of unit vector Tensor e
     // so inv seems to be the right thing.
     // missing ingredient : the transform method is not defined for Tensor.
     Tensor inv_t = transpose(inv);
     f=inv_t*f;
     return ForceEnergy(f,e);
   }


   // Locate the grid cell containing 'pos'.
   //
   //   pos      : position to look up
   //   inds     : out, integer cell index per dimension (3 entries)
   //   dg       : out, fractional offset inside the cell per dimension
   //   gapscale : in/out, multiplied by gapinv[i] for every dimension in which
   //              the position lies in the gap cell of a continuous grid
   //
   // Returns 0 on success, or -1 when the position lies outside the grid in a
   // dimension that is not continuous.  All entries of 'inds' and 'dg' are
   // written before any boundary check, so they are set even on failure.
   __device__ int get_inds(
                const Position& pos,
                int *inds,
                Vector &dg,
                Vector &gapscale) const
   {
     // Position in grid coordinates, relative to the grid origin.
     Vector g = inv * (pos - origin);

     // Pass 1: split every coordinate into integer cell and fraction.
 #pragma unroll
     for (int axis = 0; axis < 3; axis++) {
         inds[axis] = (int)floor(g[axis]);
         dg[axis] = g[axis] - inds[axis];
     }

     // Pass 2: boundary handling.
 #pragma unroll
     for (int axis = 0; axis < 3; axis++) {
         const int last = k[axis]-1;
         const bool outside = (inds[axis] < 0) || (inds[axis] >= last);
         if (outside) {
             // Outside potential and grid is not continuous
             if (!cont[axis]) return -1;
             inds[axis] = last;
         }
         if (!cont[axis] || inds[axis] != last) continue;

         // Correct for non-unit spacing between continuous grid images
         gapscale[axis] *= gapinv[axis];
         if (g[axis] < 0.0) {
             // = (gap[i] + g[i]) * gapinv[i]
             dg[axis] = 1.0 + g[axis]*gapinv[axis];
         } else {
             dg[axis] = (g[axis] - inds[axis]) * gapinv[axis];
         }
     }
     return 0;
   }


   // Evaluate the polynomial  sum a[px + 4*py + 16*pz] * x[px] * y[py] * z[pz]
   // over px, py, pz in 0..3.
   //
   //   a       : 64 coefficients
   //   x, y, z : 4 entries each; compute_VdV() passes the powers 0..3 of the
   //             fractional cell coordinates
   __device__ float compute_V(
                   const float *a,
                   const float *x,
                   const float *y,
                   const float *z) const
   {
     float total = 0.0;
     for (int pz = 0; pz < 4; pz++) {
         for (int py = 0; py < 4; py++) {
 #pragma unroll
             for (int px = 0; px < 4; px++) {
                 const float coef = a[16*pz + 4*py + px];
                 total += coef * x[px] * y[py] * z[pz];
             }
         }
     }
     return total;
   }


   // First partial derivatives of the polynomial evaluated by compute_V().
   //
   //   a       : 64 coefficients, a[px + 4*py + 16*pz]
   //   x, y, z : 4 entries each (powers 0..3 of each variable)
   //
   // Returns a Vector holding (dV/dx, dV/dy, dV/dz).
   __device__ Vector compute_dV(
                     const float *a,
                     const float *x,
                     const float *y,
                     const float *z) const
   {
     Vector grad = 0;

     for (int pz = 0; pz < 4; pz++) {
       for (int py = 0; py < 4; py++) {
 #pragma unroll
         for (int px = 0; px < 4; px++) {
           const float coef = a[16*pz + 4*py + px];
           // A zero exponent contributes nothing to that derivative.
           if (px > 0) grad.x += coef * px * x[px-1] * y[py]   * z[pz];     // dV/dx
           if (py > 0) grad.y += coef * py * x[px]   * y[py-1] * z[pz];     // dV/dy
           if (pz > 0) grad.z += coef * pz * x[px]   * y[py]   * z[pz-1];   // dV/dz
         }
       }
     }
     return grad;
   }


   // Mixed second partial derivatives of the polynomial evaluated by
   // compute_V().
   //
   //   a       : 64 coefficients, a[px + 4*py + 16*pz]
   //   x, y, z : 4 entries each (powers 0..3 of each variable)
   //
   // Returns a Vector holding (d2V/dxdy, d2V/dxdz, d2V/dydz).
   __device__ Vector compute_d2V(
                      const float *a,
                      const float *x,
                      const float *y,
                      const float *z) const
   {
     Vector mixed = 0;
     for (int pz = 0; pz < 4; pz++) {
       for (int py = 0; py < 4; py++) {
         for (int px = 0; px < 4; px++) {
           const float coef = a[16*pz + 4*py + px];
           const bool hasX = px > 0;
           const bool hasY = py > 0;
           const bool hasZ = pz > 0;
           if (hasX && hasY) mixed.x += coef * px * py * x[px-1] * y[py-1] * z[pz];     // d2V/dxdy
           if (hasX && hasZ) mixed.y += coef * px * pz * x[px-1] * y[py]   * z[pz-1];   // d2V/dxdz
           if (hasY && hasZ) mixed.z += coef * py * pz * x[px]   * y[py-1] * z[pz-1];   // d2V/dydz
         }
       }
     }
     return mixed;
   }

   // Third mixed partial derivative d3V/dxdydz of the polynomial evaluated by
   // compute_V().
   //
   //   a       : 64 coefficients, a[px + 4*py + 16*pz]
   //   x, y, z : 4 entries each (powers 0..3 of each variable)
   __device__ float compute_d3V(
                     const float *a,
                     const float *x,
                     const float *y,
                     const float *z) const
   {
     float mixed = 0.0;
     // Only terms with all three exponents >= 1 survive the triple
     // derivative, so each loop starts at 1.
     for (int pz = 1; pz < 4; pz++) {
       for (int py = 1; py < 4; py++) {
         for (int px = 1; px < 4; px++) {
           const float coef = a[16*pz + 4*py + px];
           mixed += coef * px * py * pz * x[px-1] * y[py-1] * z[pz-1];    // d3V/dxdydz
         }
       }
     }
     return mixed;
   }


   // Convert the 64 corner quantities in 'b' into the 64 polynomial
   // coefficients 'a' consumed by compute_V()/compute_dV()/compute_d2V()/
   // compute_d3V(), where a[j + 4*k + 16*l] multiplies x^j * y^k * z^l.
   //
   //   a : out, 64 coefficients
   //   b : in, 64 values as filled by compute_b(): for corner i0 in 0..7,
   //       b[i0] = V, b[8+i0] = dV/dx, b[16+i0] = dV/dy, b[24+i0] = dV/dz,
   //       b[32+i0] = d2V/dxdy, b[40+i0] = d2V/dxdz, b[48+i0] = d2V/dydz,
   //       b[56+i0] = d3V/dxdydz
   //
   // The transform is a fixed integer-coefficient linear map, written out
   // term by term.  'a' and 'b' must be distinct arrays: entries of 'b' are
   // still read after entries of 'a' with the same index have been written.
   __device__ void compute_a(
                  float *a,
                  const float *b) const
   {
     // Static sparse 64x64 matrix times vector ... nicer looking way than this?
     a[0] = b[0];
     a[1] = b[8];
     a[2] = -3*b[0] + 3*b[1] - 2*b[8] - b[9];
     a[3] = 2*b[0] - 2*b[1] + b[8] + b[9];
     a[4] = b[16];
     a[5] = b[32];
     a[6] = -3*b[16] + 3*b[17] - 2*b[32] - b[33];
     a[7] = 2*b[16] - 2*b[17] + b[32] + b[33];
     a[8] = -3*b[0] + 3*b[2] - 2*b[16] - b[18];
     a[9] = -3*b[8] + 3*b[10] - 2*b[32] - b[34];
     a[10] = 9*b[0] - 9*b[1] - 9*b[2] + 9*b[3] + 6*b[8] + 3*b[9] - 6*b[10] - 3*b[11]
       + 6*b[16] - 6*b[17] + 3*b[18] - 3*b[19] + 4*b[32] + 2*b[33] + 2*b[34] + b[35];
     a[11] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 3*b[8] - 3*b[9] + 3*b[10] + 3*b[11]
       - 4*b[16] + 4*b[17] - 2*b[18] + 2*b[19] - 2*b[32] - 2*b[33] - b[34] - b[35];
     a[12] = 2*b[0] - 2*b[2] + b[16] + b[18];
     a[13] = 2*b[8] - 2*b[10] + b[32] + b[34];
     a[14] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 4*b[8] - 2*b[9] + 4*b[10] + 2*b[11]
       - 3*b[16] + 3*b[17] - 3*b[18] + 3*b[19] - 2*b[32] - b[33] - 2*b[34] - b[35];
     a[15] = 4*b[0] - 4*b[1] - 4*b[2] + 4*b[3] + 2*b[8] + 2*b[9] - 2*b[10] - 2*b[11]
       + 2*b[16] - 2*b[17] + 2*b[18] - 2*b[19] + b[32] + b[33] + b[34] + b[35];
     a[16] = b[24];
     a[17] = b[40];
     a[18] = -3*b[24] + 3*b[25] - 2*b[40] - b[41];
     a[19] = 2*b[24] - 2*b[25] + b[40] + b[41];
     a[20] = b[48];
     a[21] = b[56];
     a[22] = -3*b[48] + 3*b[49] - 2*b[56] - b[57];
     a[23] = 2*b[48] - 2*b[49] + b[56] + b[57];
     a[24] = -3*b[24] + 3*b[26] - 2*b[48] - b[50];
     a[25] = -3*b[40] + 3*b[42] - 2*b[56] - b[58];
     a[26] = 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43]
       + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 4*b[56] + 2*b[57] + 2*b[58] + b[59];
     a[27] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43]
       - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 2*b[56] - 2*b[57] - b[58] - b[59];
     a[28] = 2*b[24] - 2*b[26] + b[48] + b[50];
     a[29] = 2*b[40] - 2*b[42] + b[56] + b[58];
     a[30] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43]
       - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 2*b[56] - b[57] - 2*b[58] - b[59];
     a[31] = 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43]
       + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + b[56] + b[57] + b[58] + b[59];
     a[32] = -3*b[0] + 3*b[4] - 2*b[24] - b[28];
     a[33] = -3*b[8] + 3*b[12] - 2*b[40] - b[44];
     a[34] = 9*b[0] - 9*b[1] - 9*b[4] + 9*b[5] + 6*b[8] + 3*b[9] - 6*b[12] - 3*b[13]
       + 6*b[24] - 6*b[25] + 3*b[28] - 3*b[29] + 4*b[40] + 2*b[41] + 2*b[44] + b[45];
     a[35] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 3*b[8] - 3*b[9] + 3*b[12] + 3*b[13]
       - 4*b[24] + 4*b[25] - 2*b[28] + 2*b[29] - 2*b[40] - 2*b[41] - b[44] - b[45];
     a[36] = -3*b[16] + 3*b[20] - 2*b[48] - b[52];
     a[37] = -3*b[32] + 3*b[36] - 2*b[56] - b[60];
     a[38] = 9*b[16] - 9*b[17] - 9*b[20] + 9*b[21] + 6*b[32] + 3*b[33] - 6*b[36] - 3*b[37]
       + 6*b[48] - 6*b[49] + 3*b[52] - 3*b[53] + 4*b[56] + 2*b[57] + 2*b[60] + b[61];
     a[39] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 3*b[32] - 3*b[33] + 3*b[36] + 3*b[37]
       - 4*b[48] + 4*b[49] - 2*b[52] + 2*b[53] - 2*b[56] - 2*b[57] - b[60] - b[61];
     a[40] = 9*b[0] - 9*b[2] - 9*b[4] + 9*b[6] + 6*b[16] + 3*b[18] - 6*b[20] - 3*b[22]
       + 6*b[24] - 6*b[26] + 3*b[28] - 3*b[30] + 4*b[48] + 2*b[50] + 2*b[52] + b[54];
     a[41] = 9*b[8] - 9*b[10] - 9*b[12] + 9*b[14] + 6*b[32] + 3*b[34] - 6*b[36] - 3*b[38]
       + 6*b[40] - 6*b[42] + 3*b[44] - 3*b[46] + 4*b[56] + 2*b[58] + 2*b[60] + b[62];
     a[42] = -27*b[0] + 27*b[1] + 27*b[2] - 27*b[3] + 27*b[4] - 27*b[5] - 27*b[6] + 27*b[7]
       - 18*b[8] - 9*b[9] + 18*b[10] + 9*b[11] + 18*b[12] + 9*b[13] - 18*b[14] - 9*b[15]
       - 18*b[16] + 18*b[17] - 9*b[18] + 9*b[19] + 18*b[20] - 18*b[21] + 9*b[22] - 9*b[23]
       - 18*b[24] + 18*b[25] + 18*b[26] - 18*b[27] - 9*b[28] + 9*b[29] + 9*b[30] - 9*b[31]
       - 12*b[32] - 6*b[33] - 6*b[34] - 3*b[35] + 12*b[36] + 6*b[37] + 6*b[38] + 3*b[39]
       - 12*b[40] - 6*b[41] + 12*b[42] + 6*b[43] - 6*b[44] - 3*b[45] + 6*b[46] + 3*b[47]
       - 12*b[48] + 12*b[49] - 6*b[50] + 6*b[51] - 6*b[52] + 6*b[53] - 3*b[54] + 3*b[55]
       - 8*b[56] - 4*b[57] - 4*b[58] - 2*b[59] - 4*b[60] - 2*b[61] - 2*b[62] - b[63];
     a[43] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
       + 9*b[8] + 9*b[9] - 9*b[10] - 9*b[11] - 9*b[12] - 9*b[13] + 9*b[14] + 9*b[15]
       + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
       + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
       + 6*b[32] + 6*b[33] + 3*b[34] + 3*b[35] - 6*b[36] - 6*b[37] - 3*b[38] - 3*b[39]
       + 6*b[40] + 6*b[41] - 6*b[42] - 6*b[43] + 3*b[44] + 3*b[45] - 3*b[46] - 3*b[47]
       + 8*b[48] - 8*b[49] + 4*b[50] - 4*b[51] + 4*b[52] - 4*b[53] + 2*b[54] - 2*b[55]
       + 4*b[56] + 4*b[57] + 2*b[58] + 2*b[59] + 2*b[60] + 2*b[61] + b[62] + b[63];
     a[44] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 3*b[16] - 3*b[18] + 3*b[20] + 3*b[22]
       - 4*b[24] + 4*b[26] - 2*b[28] + 2*b[30] - 2*b[48] - 2*b[50] - b[52] - b[54];
     a[45] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 3*b[32] - 3*b[34] + 3*b[36] + 3*b[38]
       - 4*b[40] + 4*b[42] - 2*b[44] + 2*b[46] - 2*b[56] - 2*b[58] - b[60] - b[62];
     a[46] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
       + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
       + 9*b[16] - 9*b[17] + 9*b[18] - 9*b[19] - 9*b[20] + 9*b[21] - 9*b[22] + 9*b[23]
       + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
       + 6*b[32] + 3*b[33] + 6*b[34] + 3*b[35] - 6*b[36] - 3*b[37] - 6*b[38] - 3*b[39]
       + 8*b[40] + 4*b[41] - 8*b[42] - 4*b[43] + 4*b[44] + 2*b[45] - 4*b[46] - 2*b[47]
       + 6*b[48] - 6*b[49] + 6*b[50] - 6*b[51] + 3*b[52] - 3*b[53] + 3*b[54] - 3*b[55]
       + 4*b[56] + 2*b[57] + 4*b[58] + 2*b[59] + 2*b[60] + b[61] + 2*b[62] + b[63];
     a[47] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
       - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
       - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
       - 8*b[24] + 8*b[25] + 8*b[26] - 8*b[27] - 4*b[28] + 4*b[29] + 4*b[30] - 4*b[31]
       - 3*b[32] - 3*b[33] - 3*b[34] - 3*b[35] + 3*b[36] + 3*b[37] + 3*b[38] + 3*b[39]
       - 4*b[40] - 4*b[41] + 4*b[42] + 4*b[43] - 2*b[44] - 2*b[45] + 2*b[46] + 2*b[47]
       - 4*b[48] + 4*b[49] - 4*b[50] + 4*b[51] - 2*b[52] + 2*b[53] - 2*b[54] + 2*b[55]
       - 2*b[56] - 2*b[57] - 2*b[58] - 2*b[59] - b[60] - b[61] - b[62] - b[63];
     a[48] = 2*b[0] - 2*b[4] + b[24] + b[28];
     a[49] = 2*b[8] - 2*b[12] + b[40] + b[44];
     a[50] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 4*b[8] - 2*b[9] + 4*b[12] + 2*b[13]
       - 3*b[24] + 3*b[25] - 3*b[28] + 3*b[29] - 2*b[40] - b[41] - 2*b[44] - b[45];
     a[51] = 4*b[0] - 4*b[1] - 4*b[4] + 4*b[5] + 2*b[8] + 2*b[9] - 2*b[12] - 2*b[13]
       + 2*b[24] - 2*b[25] + 2*b[28] - 2*b[29] + b[40] + b[41] + b[44] + b[45];
     a[52] = 2*b[16] - 2*b[20] + b[48] + b[52];
     a[53] = 2*b[32] - 2*b[36] + b[56] + b[60];
     a[54] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 4*b[32] - 2*b[33] + 4*b[36] + 2*b[37]
       - 3*b[48] + 3*b[49] - 3*b[52] + 3*b[53] - 2*b[56] - b[57] - 2*b[60] - b[61];
     a[55] = 4*b[16] - 4*b[17] - 4*b[20] + 4*b[21] + 2*b[32] + 2*b[33] - 2*b[36] - 2*b[37]
       + 2*b[48] - 2*b[49] + 2*b[52] - 2*b[53] + b[56] + b[57] + b[60] + b[61];
     a[56] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 4*b[16] - 2*b[18] + 4*b[20] + 2*b[22]
       - 3*b[24] + 3*b[26] - 3*b[28] + 3*b[30] - 2*b[48] - b[50] - 2*b[52] - b[54];
     a[57] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 4*b[32] - 2*b[34] + 4*b[36] + 2*b[38]
       - 3*b[40] + 3*b[42] - 3*b[44] + 3*b[46] - 2*b[56] - b[58] - 2*b[60] - b[62];
     a[58] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
       + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
       + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
       + 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 9*b[28] - 9*b[29] - 9*b[30] + 9*b[31]
       + 8*b[32] + 4*b[33] + 4*b[34] + 2*b[35] - 8*b[36] - 4*b[37] - 4*b[38] - 2*b[39]
       + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43] + 6*b[44] + 3*b[45] - 6*b[46] - 3*b[47]
       + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 6*b[52] - 6*b[53] + 3*b[54] - 3*b[55]
       + 4*b[56] + 2*b[57] + 2*b[58] + b[59] + 4*b[60] + 2*b[61] + 2*b[62] + b[63];
     a[59] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
       - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
       - 8*b[16] + 8*b[17] - 4*b[18] + 4*b[19] + 8*b[20] - 8*b[21] + 4*b[22] - 4*b[23]
       - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
       - 4*b[32] - 4*b[33] - 2*b[34] - 2*b[35] + 4*b[36] + 4*b[37] + 2*b[38] + 2*b[39]
       - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43] - 3*b[44] - 3*b[45] + 3*b[46] + 3*b[47]
       - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 4*b[52] + 4*b[53] - 2*b[54] + 2*b[55]
       - 2*b[56] - 2*b[57] - b[58] - b[59] - 2*b[60] - 2*b[61] - b[62] - b[63];
     a[60] = 4*b[0] - 4*b[2] - 4*b[4] + 4*b[6] + 2*b[16] + 2*b[18] - 2*b[20] - 2*b[22]
       + 2*b[24] - 2*b[26] + 2*b[28] - 2*b[30] + b[48] + b[50] + b[52] + b[54];
     a[61] = 4*b[8] - 4*b[10] - 4*b[12] + 4*b[14] + 2*b[32] + 2*b[34] - 2*b[36] - 2*b[38]
       + 2*b[40] - 2*b[42] + 2*b[44] - 2*b[46] + b[56] + b[58] + b[60] + b[62];
     a[62] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
       - 8*b[8] - 4*b[9] + 8*b[10] + 4*b[11] + 8*b[12] + 4*b[13] - 8*b[14] - 4*b[15]
       - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
       - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
       - 4*b[32] - 2*b[33] - 4*b[34] - 2*b[35] + 4*b[36] + 2*b[37] + 4*b[38] + 2*b[39]
       - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43] - 4*b[44] - 2*b[45] + 4*b[46] + 2*b[47]
       - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 3*b[52] + 3*b[53] - 3*b[54] + 3*b[55]
       - 2*b[56] - b[57] - 2*b[58] - b[59] - 2*b[60] - b[61] - 2*b[62] - b[63];
     a[63] = 8*b[0] - 8*b[1] - 8*b[2] + 8*b[3] - 8*b[4] + 8*b[5] + 8*b[6] - 8*b[7]
       + 4*b[8] + 4*b[9] - 4*b[10] - 4*b[11] - 4*b[12] - 4*b[13] + 4*b[14] + 4*b[15]
       + 4*b[16] - 4*b[17] + 4*b[18] - 4*b[19] - 4*b[20] + 4*b[21] - 4*b[22] + 4*b[23]
       + 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 4*b[28] - 4*b[29] - 4*b[30] + 4*b[31]
       + 2*b[32] + 2*b[33] + 2*b[34] + 2*b[35] - 2*b[36] - 2*b[37] - 2*b[38] - 2*b[39]
       + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43] + 2*b[44] + 2*b[45] - 2*b[46] - 2*b[47]
       + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + 2*b[52] - 2*b[53] + 2*b[54] - 2*b[55]
       + b[56] + b[57] + b[58] + b[59] + b[60] + b[61] + b[62] + b[63];
   }

   // Fill 'b' with the potential and its finite-difference derivatives at
   // the eight corners of the grid cell whose lowest corner is 'inds'.
   //
   //   b        : out, 64 values.  For corner i0 in 0..7 (bit i1 of i0 selects
   //              the +1 neighbour in dimension i1):
   //                b[i0]    = V            b[32+i0] = d2V/dxdy
   //                b[8+i0]  = dV/dx        b[40+i0] = d2V/dxdz
   //                b[16+i0] = dV/dy        b[48+i0] = d2V/dydz
   //                b[24+i0] = dV/dz        b[56+i0] = d3V/dxdydz
   //   inds     : in, cell index per dimension (as produced by get_inds())
   //   gapscale : in, per-dimension scale; used only in the derivative scale
   //              of corners that sit on a continuous-grid boundary
   //
   // Derivatives are central differences of grid samples in grid-index
   // units.  A corner lying on the boundary of a non-continuous dimension
   // gets all of its derivatives set to zero.
   __device__ void compute_b(
                  float *b,
                  const int *inds,
                  Vector gapscale) const
   {
     for (int i0 = 0; i0 < 8; i0++) {
       int inds2[3];                // grid index of this corner
       int zero_derivs = FALSE;     // set when the corner is on a non-continuous edge

       float voff = 0.0;            // potential offset added to this corner's value
       int bit = 1;      // bit = 2^i1 in the below loop
 #pragma unroll
       for (int i1 = 0; i1 < 3; i1++) {
         // Corner index, wrapped modulo the grid dimension.
         inds2[i1] = (inds[i1] + ((i0 & bit) ? 1 : 0)) % k[i1];

         // Deal with voltage offsets
         // (the corner wrapped from the last grid point round to index 0)
         if (cont[i1] && inds[i1] == (k[i1]-1) && inds2[i1] == 0) {
           voff += offset[i1];
         }

         bit <<= 1;      // i.e. multiply by 2
       }

       // NOTE: leaving everything in terms of unit cell coordinates for now,
       // eventually will multiply by inv tensor when applying the force

       // First set variables 'dk_{hi,lo}' (glob notation). The 'hi'
       // ('lo') variable in a given dimension is the number added (subtracted)
       // to go up (down) one grid point in that dimension; both are normally
       // just the corresponding 'dk[i]'. However, if we are sitting on a
       // boundary and we are using a continuous grid, then we want to map the
       // next point off the grid back around to the other side. e.g. point
       // (k[0], i1, k) maps to point (0, i1, k), which would be
       // accomplished by changing 'dk1_hi' to -(k[0]-1)*dk1.
       //
       // In this version the steps are kept as index steps (d_hi/d_lo of +-1
       // or -(k-1)) and get_grid_d() applies the dk strides.

       int d_hi[3] = {1, 1, 1};
       int d_lo[3] = {1, 1, 1};
       // voffs[i1] is left unassigned when the corner is on a non-continuous
       // boundary; zero_derivs is set in that case, so it is never read.
       float voffs[3];
       float dscales[3] = {0.5, 0.5, 0.5};   // central difference: 1/(2 grid steps)
 #pragma unroll
       for (int i1 = 0; i1 < 3; i1++) {
         if (inds2[i1] == 0) {
           if (cont[i1]) {
             // Step "down" from index 0 lands on index k-1.
             d_lo[i1] = -(k[i1]-1);
             voffs[i1] = offset[i1];
             dscales[i1] = 1.0/(1.0 + gap[i1]) * 1.0/gapscale[i1];
           }
           else zero_derivs = TRUE;
         }
         else if (inds2[i1] == k[i1]-1) {
           if (cont[i1]) {
             // Step "up" from index k-1 lands on index 0.
             d_hi[i1] = -(k[i1]-1);
             voffs[i1] = offset[i1];
             dscales[i1] = 1.0/(1.0 + gap[i1]) * 1.0/gapscale[i1];
           }
           else zero_derivs = TRUE;
         }
         else {
           voffs[i1] = 0.0;
         }
       }


       // V
       b[i0] = get_grid(inds2[0],inds2[1],inds2[2]) + voff;

       if (zero_derivs) {
         b[8+i0] = 0.0;
         b[16+i0] = 0.0;
         b[24+i0] = 0.0;
         b[32+i0] = 0.0;
         b[40+i0] = 0.0;
         b[48+i0] = 0.0;
         b[56+i0] = 0.0;
       } else {
         // First derivatives: (upper neighbour - lower neighbour + offset) * scale.
         b[8+i0]  = dscales[0] * (get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]) + voffs[0]);   //  dV/dx
         b[16+i0] = dscales[1] * (get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]) - get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]) + voffs[1]);   //  dV/dy
         b[24+i0] = dscales[2] * (get_grid_d(inds2[0],inds2[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0],inds2[1],inds2[2]-d_lo[2]) + voffs[2]);   //  dV/dz
         // Mixed derivatives: differences of differences; no voffs term is
         // applied to these.
         b[32+i0] = dscales[0] * dscales[1]
           * (get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]) -
              get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]));  //  d2V/dxdy
         b[40+i0] = dscales[0] * dscales[2]
           * (get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]+d_hi[2]) -
              get_grid_d(inds2[0]+d_hi[0],inds2[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1],inds2[2]-d_lo[2]));  //  d2V/dxdz
         b[48+i0] = dscales[1] * dscales[2]
           * (get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) -
              get_grid_d(inds2[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));  //  d2V/dydz

         b[56+i0] = dscales[0] * dscales[1] * dscales[2]                                 // d3V/dxdydz
           * (get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]+d_hi[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) -
              get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]+d_hi[2]) +
              get_grid_d(inds2[0]+d_hi[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]) + get_grid_d(inds2[0]-d_lo[0],inds2[1]+d_hi[1],inds2[2]-d_lo[2]) +
              get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]+d_hi[2]) - get_grid_d(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));
       }
     }
   }

   // Evaluate the grid potential and its gradient at 'pos'.
   //
   //   pos : position to evaluate
   //   V   : out, interpolated potential (written only on success)
   //   dV  : out, gradient of the potential (written only on success)
   //
   // Returns 0 on success, or -1 when get_inds() reports that 'pos' lies
   // outside a non-continuous grid; V and dV are left untouched in that case.
   __device__ int compute_VdV(
                   const Position pos,
                   float &V,
                   Vector &dV) const
   {
     int inds[3];
     Vector dg;                           // fractional offset inside the cell
     Vector gapscale = Vector(1, 1, 1);   // get_inds() rescales this in gap cells

     int err = get_inds(pos, inds, dg, gapscale);
     if (err) {
       return -1;
     }


     // Compute b
     float b[64];        // Matrix of values at 8 box corners
     compute_b(b, inds, gapscale);

     // Compute a
     float a[64];        // Polynomial coefficients derived from b
     compute_a(a, b);

     // Calculate powers of x, y, z for later use
     // e.g. x[2] = x^2
     float x[4], y[4], z[4];
     x[0] = 1; y[0] = 1; z[0] = 1;
 #pragma unroll
     for (int j = 1; j < 4; j++) {
       x[j] = x[j-1] * dg.x;
       y[j] = y[j-1] * dg.y;
       z[j] = z[j-1] * dg.z;
     }

     V = compute_V(a, x, y, z);
     // Gradient in cell coordinates, multiplied by inv and then scaled per
     // dimension by gapscale.
     dV = Tensor::diagonal(gapscale) * (compute_dV(a, x, y, z) * inv);

     return 0;
   }


   // Read the grid sample at index (i0, i1, i2); no bounds checking is done.
   __device__ inline float get_grid(const int i0, const int i1, const int i2) const
   {
     const long int offset_in_grid = grid_index(i0, i1, i2);
     return grid[offset_in_grid];
   }
   // Same as get_grid(), with the sample widened to double.
   __device__ inline double get_grid_d(const int i0, const int i1, const int i2) const
   {
     const float sample = get_grid(i0, i1, i2);
     return double(sample);
   }
   // Accessor for the 'scale' member.
   __device__ inline Vector get_scale(void) const
   {
     return scale;
   }

   // Plain record of index bookkeeping for one dimension.
   // NOTE(review): not referenced by any method visible in this header;
   // field meanings presumably mirror inds2 / d_hi / d_lo / zero_derivs in
   // compute_b() -- confirm against the host-side GridForceGrid code.
   struct GridIndices {
     int inds2, dk_hi, dk_lo;
     Bool zero_derivs;
   };
   // Linear offset into 'grid' for index (i0, i1, i2): each index is
   // multiplied by its per-dimension stride dk[] and the results are summed.
   __device__ inline long int grid_index(int i0, int i1, int i2) const {
     const long int part0 = i0*dk[0];
     const long int part1 = i1*dk[1];
     const long int part2 = i2*dk[2];
     return part0 + part1 + part2;
   }

   // Build a device-side grid description from values taken from the host
   // grid.
   //
   //   h_k1..h_k3           : grid dimensions (stored in k[])
   //   h_size               : stored in 'size'
   //   dh_k1..dh_k3         : per-dimension strides (stored in dk[])
   //   h_factor             : stored in 'factor'
   //   h_origin, h_center   : grid origin and wrapping center
   //   h_cont1..h_cont3     : whether each dimension is continuous
   //   h_gapinv1..h_gapinv3 : stored in gapinv[]
   //   h_gap1..h_gap3       : stored in gap[]
   //   h_inv                : tensor used to map positions to grid coordinates
   //   h_offset1..h_offset3 : potential offset per dimension
   //   h_scale              : stored in 'scale'
   //   h_grid               : grid sample buffer; the pointer is stored as-is,
   //                          the buffer is neither copied nor owned here
   //
   // The member initializers are listed in declaration order, which is the
   // order in which C++ actually runs them.
   // NOTE: 'corners' is not assigned by this constructor.
   __host__ __device__ GridforceGridCUDA(
                               int h_k1,
                               int h_k2,
                               int h_k3,
                               long int h_size,
                               long int dh_k1,
                               long int dh_k2,
                               long int dh_k3,
                               float h_factor,
                               Position h_origin,
                               Position h_center,
                               Bool h_cont1,
                               Bool h_cont2,
                               Bool h_cont3,
                               float h_gapinv1,
                               float h_gapinv2,
                               float h_gapinv3,
                               float h_gap1,
                               float h_gap2,
                               float h_gap3,
                               Tensor h_inv,
                               float h_offset1,
                               float h_offset2,
                               float h_offset3,
                               Vector h_scale,
                               float *h_grid
                                ):
   size(h_size),
   grid(h_grid),
   factor(h_factor),
   origin(h_origin),
   center(h_center),
   scale(h_scale),
   inv(h_inv)

   {
     // Array members cannot go in the initializer list; fill them here.
     k[0]=h_k1; k[1]=h_k2; k[2]=h_k3;
     dk[0]=dh_k1; dk[1]=dh_k2; dk[2]=dh_k3;
     cont[0]=h_cont1; cont[1]=h_cont2; cont[2]=h_cont3;
     gapinv[0]=h_gapinv1; gapinv[1]=h_gapinv2; gapinv[2]=h_gapinv3;
     gap[0]=h_gap1; gap[1]=h_gap2; gap[2]=h_gap3;
     offset[0]=h_offset1; offset[1]=h_offset2; offset[2]=h_offset3;
   }


   // Per-dimension strides applied by grid_index() to turn a 3-D index into
   // a linear offset.
   long int dk[3];
   // Copied from the constructor's h_size; not read by any method in this
   // header.  NOTE(review): presumably the number of grid samples -- confirm.
   long int size;
   // Pointer to the grid samples; the buffer is supplied by the constructor's
   // caller and is not owned by this object.
   float *grid;  // Actual grid
 private:

   // Not assigned by either constructor in this header.
   Vector corners[8];


     int k[3];           // Grid dimensions


     // Copied from the constructor's h_factor; not read by any method in
     // this header.
     float factor;

     Position origin;    // Grid origin
     Position center;    // Center of grid (for wrapping)

     Bool cont[3];       // Whether grid is continuous in each dimension

     float gapinv[3];    // 1.0/gap
     // Returned by get_scale(); not otherwise used in this header.
     Vector scale;

     Tensor inv;         // Inverse of unit vectors
     float offset[3];    // Potential offset in each dimension
     float gap[3];       // Gap between images of grid in grid units for each dimension
 };

 #endif //NODEGROUP_FORCE_REGISTER
 #endif
 #endif
CudaUtils.h

GridforceGridCUDA.h

Vector
Definition: Vector.h:72

HipDefines.h

Vector::z
BigReal z
Definition: Vector.h:74

FALSE
#define FALSE
Definition: common.h:127

GridforceGrid::get_center
virtual Position get_center(void) const =0

Tensor::diagonal
static NAMD_HOST_DEVICE Tensor diagonal(const Vector &v1)
Definition: Tensor.h:37

Bool
int Bool
Definition: common.h:142

Vector::x
BigReal x
Definition: Vector.h:74

GridforceGrid::get_scale
virtual Vector get_scale(void) const =0

Tensor
Definition: Tensor.h:15

Vector::y
BigReal y
Definition: Vector.h:74

GridforceGrid::get_corner
Position get_corner(int idx)
Definition: GridForceGrid.C:101

GridforceGrid::wrap_position
Position wrap_position(const Position &pos, const Lattice &lattice)
Definition: GridForceGrid.inl:463

GridforceGrid::compute_VdV
int compute_VdV(Position pos, float &V, Vector &dV) const
Definition: GridForceGrid.h:53

Lattice.h

Lattice
Definition: Lattice.h:17

GridforceGrid
Definition: GridForceGrid.h:27

Lattice::wrap_delta
NAMD_HOST_DEVICE Vector wrap_delta(const Position &pos1) const
Definition: Lattice.h:222

TRUE
#define TRUE
Definition: common.h:128

Lattice::origin
NAMD_HOST_DEVICE Vector origin() const
Definition: Lattice.h:278

CudaRecord.h