/*
 * Copyright (C) 2004-2005 by David J. Hardy.  All rights reserved.
 *
 * phermite.c
 */

#include <stdio.h>
#include "mgrid/split.h"
#include "mgrid/hermite.h"
/* #define DEBUG_WATCH */
#include "debug/debug.h"

#define NELEMS(a)  (sizeof(a)/sizeof((a)[0]))

/* prototypes of internal routines */
#ifdef DEBUG_WATCH
static void print_gridpoint(const HermGridpt *p);
#endif
static void add_herm_direct(HermDirect *g, double c, double h, double a,
    int32 i, int32 j, int32 k, int32 split);
static void done(Mgrid *mg);
static int force(Mgrid *mg, MgridSystem *sys);
static int atoms_to_grid(Mgrid *mg, const MgridSystem *sys);
static int grid_to_atoms(Mgrid *mg, MgridSystem *sys);
static int restrict_grid(Mgrid *mg, int32 level);
static int prolong_grid(Mgrid *mg, int32 level);
static int direct_sum(Mgrid *mg, int32 level);


#ifdef DEBUG_WATCH
void print_gridpoint(const HermGridpt *gp)
{
  const double *p = (const double *) gp;
  int k;

  for (k = 0;  k < sizeof(HermGridpt) / sizeof(double);  k++) {
    printf("%g ", p[k]);
  }
  printf("\n");
}
#endif


int mgrid_phermite_setup(Mgrid *mg, const MgridSystem *sys)
{
  const double *q = sys->charge;

  /* omit top level when not debugging */
#ifdef DEBUG_SUPPORT
  const int32 nlevels = mg->param.nlevels;
#else
  const int32 nlevels = mg->param.nlevels - 1;
#endif

  const int32 natoms = mg->param.natoms;
  const int32 split = mg->param.split;
  HermDirect *gd;
  double r2, s, gs;
  int32 i, j, k, n, m;
  int32 gdim, index;
  const double a = mg->param.cutoff;
  const double h = mg->param.spacing;
  const double h_1 = 1./h;
  double hm, am;

  /* assign virtual methods */
  mg->long_force = force;
  mg->long_done = done;

  /* set inverse of grid spacing */
  mg->inv_spacing = h_1;

  /* set location of multi-grids origin with lattice label (0,0,0) */
  mg->origin.x = mg->param.center.x - 0.5 * mg->param.length;
  mg->origin.y = mg->param.center.y - 0.5 * mg->param.length;
  mg->origin.z = mg->param.center.z - 0.5 * mg->param.length;

  VEC(mg->origin);

  /* allocate arrays of charge and potential lattices */
  mg->qgrid = (MgridLattice *) malloc(nlevels * sizeof(MgridLattice));
  mg->egrid = (MgridLattice *) malloc(nlevels * sizeof(MgridLattice));
  if (mg->qgrid == NULL || mg->egrid == NULL) return MGRID_FAIL;

  /* setup charge and potential lattices for each level */
  n = mg->param.nspacings;
  for (k = 0;  k < nlevels;  k++, n >>= 1) {
    if (n == 0 || ((n & 1) && n != 1)) return MGRID_FAIL;
    if (mgrid_lattice_init(&(mg->qgrid[k]))
        || mgrid_lattice_init(&(mg->egrid[k]))) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->qgrid[k]), sizeof(HermGridpt),
          0, n-1, 0, n-1, 0, n-1)
        || mgrid_lattice_setup(&(mg->egrid[k]), sizeof(HermGridpt),
          0, n-1, 0, n-1, 0, n-1)) {
      return MGRID_FAIL;
    }
  }
#ifdef DEBUG_SUPPORT
  ASSERT(n == 0);  /* i.e. top level has 1 grid point */
#else
  if (n != 1) return MGRID_FAIL;  /* i.e. top level omitted */
#endif

  /* allocate arrays for direct sum weights */
  ASSERT(nlevels > 0);
  mg->gdsum_list = (MgridLattice *) calloc(nlevels, sizeof(MgridLattice));
  if (mg->gdsum_list == NULL) return MGRID_FAIL;
  mg->gdsum_radius_list = (int32 *) calloc(nlevels, sizeof(int32));
  if (mg->gdsum_radius_list == NULL) return MGRID_FAIL;

  /* determine radius for direct sum "sphere" of weights */
  n = (int32) (2.0 * a * h_1);
  hm = h;  /* grid spacing for level 0 */
  am = a;  /* cutoff for level 0 */
  for (m = 0;  m < nlevels;  m++) {
    /* allocate direct sum weight lattice for this level */
    mg->gdsum_radius_list[m] = n;
    if (mgrid_lattice_init(&(mg->gdsum_list[m]))) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->gdsum_list[m]), sizeof(HermDirect),
          -n, n, -n, n, -n, n)) {
      return MGRID_FAIL;
    }
    /* compute direct sum weights */
    if (mgrid_lattice_zero(&(mg->gdsum_list[m]))) {
      return MGRID_FAIL;
    }
    gdim = mg->gdsum_list[m].ni;
    ASSERT(gdim == 2 * n + 1);
    gd = (HermDirect *)(mg->gdsum_list[m].data);
    for (k = -n;  k <= n;  k++) {
      for (j = -n;  j <= n;  j++) {
        for (i = -n;  i <= n;  i++) {
          r2 = (i*i + j*j + k*k) * hm*hm;
          if (r2 >= 4*am*am) continue;  /* zero beyond radius r == 2*am */
          index = (k * gdim + j) * gdim + i;
          ASSERT(&gd[index]
              == mgrid_lattice_elem(&(mg->gdsum_list[m]), i, j, k));

          /* accumulate D( g_{a}(r,r') ) term for this level */
          add_herm_direct(&gd[index], 1.0, hm, am, i, j, k, split);

          /* accumulate D( -g_{2a}(r,r') ) term for this level */
          add_herm_direct(&gd[index], -1.0, hm, 2*am, i, j, k, split);
        }
      }
    }
    hm *= 2.0;  /* grid spacing twice as big on next level */
    am *= 2.0;  /* cutoff twice as big on next level */
  }

#if 0
  /* determine radius for last level direct sum "sphere" of weights */
  ASSERT(m == nlevels-1);
  n = mg->qgrid[m].ni - 1;
  /* allocate lattice */
  mg->gdsum_radius_list[m] = n;
  if (mgrid_lattice_setup(&(mg->gdsum_list[m]), sizeof(HermDirect),
        -n, n, -n, n, -n, n)) {
    return MGRID_FAIL;
  }
  /* compute last level direct sum weights */
  if (mgrid_lattice_zero(&(mg->gdsum_list[m]))) {
    return MGRID_FAIL;
  }
  gdim = mg->gdsum_list[m].ni;
  ASSERT(gdim == 2 * n + 1);
  gd = (HermDirect *)(mg->gdsum_list[m].data);
  for (k = -n;  k <= n;  k++) {
    for (j = -n;  j <= n;  j++) {
      for (i = -n;  i <= n;  i++) {
        /* unlike previous, this sum is not truncated */
        index = (k * gdim + j) * gdim + i;
        ASSERT(&gd[index]
            == mgrid_lattice_elem(&(mg->gdsum_list[m]), i, j, k));

        /* accumulate D( g_{a}(r,r') ) term for this level */
        add_herm_direct(&gd[index], 1.0, hm, am, i, j, k, split);
      }
    }
  }
#endif

#if 0
  /* allocate and compute scalings for each direct sum on each level */
  mg->scaling = (double *) malloc(nlevels * sizeof(double));
  if (mg->scaling == NULL) return MGRID_FAIL;
  mg->scaling[0] = 1.0;
  for (k = 1;  k < nlevels;  k++) {
    mg->scaling[k] = 0.5 * mg->scaling[k-1];
  }

  /* allocate space for direct sum weights */
  n = (int32) (2.0 * a * h_1);
  mg->gdsum_radius = n;
  if (mgrid_lattice_setup(&(mg->gdsum), sizeof(HermDirect),
        -n, n, -n, n, -n, n)) {
    return MGRID_FAIL;
  }

  /* compute direct sum weights */
  if (mgrid_lattice_zero(&(mg->gdsum))) {
    return MGRID_FAIL;
  }
  gdim = mg->gdsum.ni;
  ASSERT(gdim == 2 * n + 1);
  gd = (HermDirect *)(mg->gdsum.data);
  for (k = -n;  k <= n;  k++) {
    for (j = -n;  j <= n;  j++) {
      for (i = -n;  i <= n;  i++) {
        r2 = (i*i +  j*j + k*k) * h*h;
        if (r2 >= 4*a*a) continue;  /* zero beyond r == 2*a */
        index = (k * gdim + j) * gdim + i;
        ASSERT(&gd[index] == mgrid_lattice_elem(&(mg->gdsum), i, j, k));

        /* accumulate D( g_a(r,r') ) term */
        add_herm_direct(&gd[index], 1.0, h, a, i, j, k, split);

        /* accumulate D( -g_{2a}(r,r') ) term */
        add_herm_direct(&gd[index], -1.0, h, 2*a, i, j, k, split);

#if 0
        s = r2 / (a*a);
        t = r2 / (4*a*a);
        ASSERT(t < s);
        mgrid_ndgamma(gs, nderiv, s, split);
        mgrid_ndgamma(gt, nderiv, t, split);



        if (s <= 1) gamma(&gs, s, split);
        else gs = 1/sqrt(s);
        if (t < 1) {
          gamma(&gt, t, split);
          gd[index] = gs/a - gt/(2*a);
        }
        else {
          /* then gt=1/sqrt(t), which implies that gs/a - gt(2*a) == 0 */
          gd[index] = 0;
        }
#endif
      }
    }
  }

  /* allocate space for direct sum weights on last level */
  n = mg->qgrid[nlevels-1].ni - 1;
  mg->glast_radius = n;
  if (mgrid_lattice_setup((&mg->glast), sizeof(HermDirect),
        -n, n, -n, n, -n, n)) {
    return MGRID_FAIL;
  }

  /* compute direct sum weights on last level */
  if (mgrid_lattice_zero(&(mg->glast))) {
    return MGRID_FAIL;
  }
  gdim = mg->glast.ni;
  ASSERT(gdim == 2 * n + 1);
  gd = (HermDirect *)(mg->glast.data);
  for (k = -n;  k <= n;  k++) {
    for (j = -n;  j <= n;  j++) {
      for (i = -n;  i <= n;  i++) {
        /* unlike previous, this sum is not truncated */
        index = (k * gdim + j) * gdim + i;
        ASSERT(&gd[index] == mgrid_lattice_elem(&(mg->glast), i, j, k));

        /* accumulate D( g_a(r,r') ) term */
        add_herm_direct(&gd[index], 1.0, h, a, i, j, k, split);

#if 0
        r2 = (i*i + j*j + k*k) * h*h;
        s = r2 / (a*a);
        index = (k * gdim + j) * gdim + i;
        ASSERT(&gd[index] == mgrid_lattice_elem(&(mg->glast), i, j, k));
        /* unlike previous, this sum is not truncated */
        if (s <= 1) gamma(&gs, s, split);
        else gs = 1/sqrt(s);
        gd[index] = gs/a;
#endif
      }
    }
  }
#endif

  /* compute g_a(0) */
  s = 0;
  gamma(&gs, s, split);
  mg->g_zero = gs/a;

  /* compute self potential */
  s = 0;
  for (n = 0;  n < natoms;  n++) {
    s += q[n] * q[n];
  }
  mg->u_self = 0.5 * mg->g_zero * s;

  return 0;
}


/*
 * computation of HermDirect matrix components, taken straight from notes
 */
void add_herm_direct(HermDirect *g, double c, double h, double a,
    int32 i, int32 j, int32 k, int32 split)
{
  const double a_1 = 1./a;
  const double a_2 = a_1 * a_1;
  const double s = (i*i + j*j + k*k) * h*h * a_2;
  const double dx = -2 * i * h * a_2;  /* ds/dx */
  const double dy = -2 * j * h * a_2;  /* ds/dy */
  const double dz = -2 * k * h * a_2;  /* ds/dz */
  const double dd = 2 * a_2;  /* d^2s/dx^2 = d^2s/dy^2 = d^2s/dz^2 */
  double tmp;
  enum { nderiv = 7 };
  double p[nderiv];

  /* multiply entire matrix by this coefficient */
  c = c * a_1;

  /* compute derivatives of splitting p(s) */
  mgrid_ndgamma(p, nderiv, s, split);

  /* weight 0 */
  tmp = c * p[0];
  g->d000_000 += tmp;

  /* weight 1 */
  tmp = c * p[1] * dx;
  g->d100_000 += tmp;
  g->d000_100 -= tmp;

  tmp = c * p[1] * dy;
  g->d010_000 += tmp;
  g->d000_010 -= tmp;

  tmp = c * p[1] * dz;
  g->d001_000 += tmp;
  g->d000_001 -= tmp;

  /* weight 2 */
  tmp = c * p[2] * dx * dy;
  g->d110_000 += tmp;
  g->d000_110 += tmp;
  g->d100_010 -= tmp;
  g->d010_100 -= tmp;

  tmp = c * p[2] * dx * dz;
  g->d101_000 += tmp;
  g->d000_101 += tmp;
  g->d100_001 -= tmp;
  g->d001_100 -= tmp;

  tmp = c * p[2] * dy * dz;
  g->d011_000 += tmp;
  g->d000_011 += tmp;
  g->d010_001 -= tmp;
  g->d001_010 -= tmp;

  tmp = c * (p[2] * dx*dx + p[1] * dd);
  g->d100_100 -= tmp;
  tmp = c * (p[2] * dy*dy + p[1] * dd);
  g->d010_010 -= tmp;
  tmp = c * (p[2] * dz*dz + p[1] * dd);
  g->d001_001 -= tmp;

  /* weight 3 */
  tmp = c * p[3] * dx * dy * dz;
  g->d111_000 += tmp;
  g->d110_001 -= tmp;
  g->d101_010 -= tmp;
  g->d011_100 -= tmp;
  g->d100_011 += tmp;
  g->d010_101 += tmp;
  g->d001_110 += tmp;
  g->d000_111 -= tmp;

  tmp = c * (p[3] * dx*dx * dy + p[2] * dd * dy);
  g->d110_100 -= tmp;
  g->d100_110 += tmp;

  tmp = c * (p[3] * dx*dx * dz + p[2] * dd * dz);
  g->d101_100 -= tmp;
  g->d100_101 += tmp;

  tmp = c * (p[3] * dy*dy * dx + p[2] * dd * dx);
  g->d110_010 -= tmp;
  g->d010_110 += tmp;

  tmp = c * (p[3] * dy*dy * dz + p[2] * dd * dz);
  g->d011_010 -= tmp;
  g->d010_011 += tmp;

  tmp = c * (p[3] * dz*dz * dx + p[2] * dd * dx);
  g->d101_001 -= tmp;
  g->d001_101 += tmp;

  tmp = c * (p[3] * dz*dz * dy + p[2] * dd * dy);
  g->d011_001 -= tmp;
  g->d001_011 += tmp;

  /* weight 4 */
  tmp = c * (p[4] * dx*dx * dy * dz + p[3] * dd * dy * dz);
  g->d111_100 -= tmp;
  g->d100_111 -= tmp;
  g->d110_101 += tmp;
  g->d101_110 += tmp;

  tmp = c * (p[4] * dy*dy * dx * dz + p[3] * dd * dx * dz);
  g->d111_010 -= tmp;
  g->d010_111 -= tmp;
  g->d110_011 += tmp;
  g->d011_110 += tmp;

  tmp = c * (p[4] * dz*dz * dx * dy + p[3] * dd * dx * dy);
  g->d111_001 -= tmp;
  g->d001_111 -= tmp;
  g->d101_011 += tmp;
  g->d011_101 += tmp;

  tmp = c * (p[4] * dx*dx * dy*dy + p[3] * dx*dx * dd
      + p[3] * dd * dy*dy + p[2] * dd * dd);
  g->d110_110 += tmp;
  tmp = c * (p[4] * dx*dx * dz*dz + p[3] * dx*dx * dd
      + p[3] * dd * dz*dz + p[2] * dd * dd);
  g->d101_101 += tmp;
  tmp = c * (p[4] * dy*dy * dz*dz + p[3] * dy*dy * dd
      + p[3] * dd * dz*dz + p[2] * dd * dd);
  g->d011_011 += tmp;

  /* weight 5 */
  tmp = c * (p[5] * dx*dx * dy*dy * dz + p[4] * dx*dx * dd * dz
      + p[4] * dd * dy*dy * dz + p[3] * dd * dd * dz);
  g->d111_110 += tmp;
  g->d110_111 -= tmp;

  tmp = c * (p[5] * dx*dx * dz*dz * dy + p[4] * dx*dx * dd * dy
      + p[4] * dd * dz*dz * dy + p[3] * dd * dd * dy);
  g->d111_101 += tmp;
  g->d101_111 -= tmp;

  tmp = c * (p[5] * dy*dy * dz*dz * dx + p[4] * dy*dy * dd * dx
      + p[4] * dd * dz*dz * dx + p[3] * dd * dd * dx);
  g->d111_011 += tmp;
  g->d011_111 -= tmp;

  /* weight 6 */
  tmp = c * (p[6] * dx*dx * dy*dy * dz*dz + p[5] * dx*dx * dy*dy * dd
      + p[5] * dx*dx * dd * dz*dz + p[5] * dd * dy*dy * dz*dz
      + p[4] * dx*dx * dd * dd + p[4] * dd * dy*dy * dd
      + p[4] * dd * dd * dz*dz + p[3] * dd * dd * dd);
  g->d111_111 -= tmp;
}


void done(Mgrid *mg)
{
  /* omit top level when not debugging */
#ifdef DEBUG_SUPPORT
  const int32 nlevels = mg->param.nlevels;
#else
  const int32 nlevels = mg->param.nlevels - 1;
#endif
  int32 n;

#if 0
  mgrid_lattice_done(&(mg->gdsum));
  mgrid_lattice_done(&(mg->glast));
#endif
  for (n = 0;  n < nlevels;  n++) {
    mgrid_lattice_done(&(mg->qgrid[n]));
    mgrid_lattice_done(&(mg->egrid[n]));
    mgrid_lattice_done(&(mg->gdsum_list[n]));
  }
  free(mg->qgrid);
  free(mg->egrid);
#if 0
  free(mg->scaling);
#endif
  free(mg->gdsum_list);
  free(mg->gdsum_radius_list);
}


int force(Mgrid *mg, MgridSystem *sys)
{
  /* omit top level when not debugging */
#ifdef DEBUG_SUPPORT
  const int32 nlevels = mg->param.nlevels;
#else
  const int32 nlevels = mg->param.nlevels - 1;
#endif
  int32 n;

  /* first charge grid must be zeroed */
  if (mgrid_lattice_zero(&(mg->qgrid[0]))) return MGRID_FAIL;

  /* computation is "inverse V-cycle" */
  if (atoms_to_grid(mg, sys)) return MGRID_FAIL;
  for (n = 0;  n < nlevels - 1;  n++) {
    if (direct_sum(mg, n)) return MGRID_FAIL;
    if (restrict_grid(mg, n)) return MGRID_FAIL;
  }
  if (direct_sum(mg, n)) return MGRID_FAIL;
  for (--n;  n >= 0;  --n) {
    if (prolong_grid(mg, n)) return MGRID_FAIL;
  }
  if (grid_to_atoms(mg, sys)) return MGRID_FAIL;

  return 0;
}


int atoms_to_grid(Mgrid *mg, const MgridSystem *sys)
{
  double xphi[2], yphi[2], zphi[2];     /* phi grid func along x, y, z */
  double xpsi[2], ypsi[2], zpsi[2];     /* psi grid func along x, y, z */
  double dx_h, dy_h, dz_h;              /* distance between atom and origin */
  double t;                             /* param to phi function */
  double c_zphi, c_zpsi;
  double c_yphi_zphi, c_yphi_zpsi;
  double c_ypsi_zphi, c_ypsi_zpsi;
  const double h = mg->param.spacing;   /* grid spacing */
  const double h_1 = mg->inv_spacing;   /* inverse grid spacing */
  const MD_Dvec origin = mg->origin;    /* lowest corner of domain */
  const double *q = sys->charge;        /* atom charges */
  const MD_Dvec *p = sys->pos;          /* atom positions */

  HermGridpt *qh = (HermGridpt *)(mg->qgrid[0].data);
      /* treat charge grid as 3D-lattice, index (i,j,k) */

  const int32 ndim = mg->qgrid[0].ni;  /* number points in each dimension */
  const int32 natoms = mg->param.natoms;  /* number of atoms */
  const int32 mask = ndim - 1;         /* indexing mask for periodic grids */
  int32 n, i, j, k, ilo, jlo, klo;
  int32 index, koff, jkoff;

  FLT(h);
  VEC(origin);

  for (n = 0;  n < natoms;  n++) {

    /* distance between atom and corner measured in grid points */
    dx_h = (p[n].x - origin.x) * h_1;
    dy_h = (p[n].y - origin.y) * h_1;
    dz_h = (p[n].z - origin.z) * h_1;

    /* find smallest numbered grid point in stencil */
    ilo = ((int32) dx_h);
    jlo = ((int32) dy_h);
    klo = ((int32) dz_h);

    /* find t for x dimension and compute xphi */
    t = dx_h - (double) ilo;
    xphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    xpsi[0] = h * t * (1 - t) * (1 - t);
    t--;
    xphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    xpsi[1] = h * t * (1 + t) * (1 + t);

    /* find t for y dimension and compute yphi */
    t = dy_h - (double) jlo;
    yphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    ypsi[0] = h * t * (1 - t) * (1 - t);
    t--;
    yphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    ypsi[1] = h * t * (1 + t) * (1 + t);

    /* find t for z dimension and compute zphi */
    t = dz_h - (double) klo;
    zphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    zpsi[0] = h * t * (1 - t) * (1 - t);
    t--;
    zphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    zpsi[1] = h * t * (1 + t) * (1 + t);

    /* determine charge on 8=2*2*2 qh grid point stencil */
    for (k = 0;  k < 2;  k++) {
      koff = ((k + klo) & mask) * ndim;
      c_zphi = zphi[k] * q[n];
      c_zpsi = zpsi[k] * q[n];
      for (j = 0;  j < 2;  j++) {
        jkoff = (koff + ((j + jlo) & mask)) * ndim;
        c_yphi_zphi = yphi[j] * c_zphi;
        c_ypsi_zphi = ypsi[j] * c_zphi;
        c_yphi_zpsi = yphi[j] * c_zpsi;
        c_ypsi_zpsi = ypsi[j] * c_zpsi;
        for (i = 0;  i < 2;  i++) {
          index = jkoff + ((i + ilo) & mask);
          ASSERT(&qh[index]
              == mgrid_lattice_elem(&(mg->qgrid[0]),
                (i+ilo) & mask, (j+jlo) & mask, (k+klo) & mask));
          qh[index].d000 += xphi[i] * c_yphi_zphi;
          qh[index].d100 += xpsi[i] * c_yphi_zphi;
          qh[index].d010 += xphi[i] * c_ypsi_zphi;
          qh[index].d001 += xphi[i] * c_yphi_zpsi;
          qh[index].d110 += xpsi[i] * c_ypsi_zphi;
          qh[index].d101 += xpsi[i] * c_yphi_zpsi;
          qh[index].d011 += xphi[i] * c_ypsi_zpsi;
          qh[index].d111 += xpsi[i] * c_ypsi_zpsi;
        }
      }
    }
  } /* end loop over n */
#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("charge grid point qh(%d,%d,%d):\n", i, j, k);
  index = (k * ndim + j) * ndim + i;
  print_gridpoint(&qh[index]);
#endif
#if 0
  fprintf(stderr, "qh[%d].d000 = %g\n", index, qh[index].d000);
  fprintf(stderr, "qh[%d].d100 = %g\n", index, qh[index].d100);
  fprintf(stderr, "qh[%d].d010 = %g\n", index, qh[index].d010);
  fprintf(stderr, "qh[%d].d001 = %g\n", index, qh[index].d001);
  fprintf(stderr, "qh[%d].d110 = %g\n", index, qh[index].d110);
  fprintf(stderr, "qh[%d].d101 = %g\n", index, qh[index].d101);
  fprintf(stderr, "qh[%d].d011 = %g\n", index, qh[index].d011);
  fprintf(stderr, "qh[%d].d111 = %g\n", index, qh[index].d111);
#endif
  return 0;
}


int grid_to_atoms(Mgrid *mg, MgridSystem *sys)
{
  double xphi[2], yphi[2], zphi[2];     /* phi grid func along x, y, z */
  double xpsi[2], ypsi[2], zpsi[2];     /* psi grid func along x, y, z */
  double dxphi[2], dyphi[2], dzphi[2];  /* dphi grid func along x, y, z */
  double dxpsi[2], dypsi[2], dzpsi[2];  /* dpsi grid func along x, y, z */
  double dx_h, dy_h, dz_h;              /* distance between atom and origin */
  double t;                             /* param to phi and dphi functions */
  double c_yphi_zphi, c_ypsi_zphi, c_yphi_zpsi, c_ypsi_zpsi;
  double c_yphi_dzphi, c_ypsi_dzphi, c_yphi_dzpsi, c_ypsi_dzpsi;
  double c_dyphi_zphi, c_dypsi_zphi, c_dyphi_zpsi, c_dypsi_zpsi;
  MD_Dvec fv;
  double u;
  const double u_self = mg->u_self;     /* self potential */
  const double h = mg->param.spacing;   /* grid spacing */
  const double h_1 = mg->inv_spacing;   /* inverse grid spacing */
  const MD_Dvec origin = mg->origin;    /* lowest corner of domain */
  const double *q = sys->charge;        /* atom charges */
  const MD_Dvec *p = sys->pos;          /* atom positions */
  MD_Dvec *f = (sys->f_long ? sys->f_long : sys->f_elec);
      /* accumulate contributions to atom force in f_long if allocated */

  HermGridpt *eh = (HermGridpt *)(mg->egrid[0].data);
      /* treat potential grid as 3D-lattice, index (i,j,k) */

  const HermGridpt *evh = (HermGridpt *)(mg->egrid[0].databuffer);
  const HermGridpt *qvh = (HermGridpt *)(mg->qgrid[0].databuffer);
      /* treat potential and charge grids as 1D arrays, index n */

  const int32 nelems = mg->egrid[0].nelems;  /* total number of grid points */
  const int32 ndim = mg->egrid[0].ni;    /* number points in each dimension */
  const int32 natoms = mg->param.natoms;     /* number of atoms */
  const int32 mask = ndim - 1;          /* indexing mask for periodic grids */

  int32 n, i, j, k, ilo, jlo, klo;
  int32 index, koff, jkoff;

#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("potential grid point eh(%d,%d,%d):\n", i, j, k);
  index = (k * ndim + j) * ndim + i;
  print_gridpoint(&eh[index]);
#endif

  /* compute force */
  for (n = 0;  n < natoms;  n++) {

    /* distance between atom and corner measured in grid points */
    dx_h = (p[n].x - origin.x) * h_1;
    dy_h = (p[n].y - origin.y) * h_1;
    dz_h = (p[n].z - origin.z) * h_1;

    /* find largest numbered grid point in stencil */
    ilo = ((int32) dx_h);
    jlo = ((int32) dy_h);
    klo = ((int32) dz_h);

    /* find t for x dimension, compute xphi and dxphi */ 
    t = dx_h - (double) ilo;
    xphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    xpsi[0] = h * t * (1 - t) * (1 - t);
    dxphi[0] = -6 * t * (1 - t) * h_1;
    dxpsi[0] = (1 - t) * (1 - 3*t);
    t--;
    xphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    xpsi[1] = h * t * (1 + t) * (1 + t);
    dxphi[1] = -6 * t * (1 + t) * h_1;
    dxpsi[1] = (1 + t) * (1 + 3*t);

    /* find t for y dimension, compute yphi and dyphi */
    t = dy_h - (double) jlo;
    yphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    ypsi[0] = h * t * (1 - t) * (1 - t);
    dyphi[0] = -6 * t * (1 - t) * h_1;
    dypsi[0] = (1 - t) * (1 - 3*t);
    t--;
    yphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    ypsi[1] = h * t * (1 + t) * (1 + t);
    dyphi[1] = -6 * t * (1 + t) * h_1;
    dypsi[1] = (1 + t) * (1 + 3*t);

    /* find t for z dimension, compute zphi and dzphi */
    t = dz_h - (double) klo;
    zphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    zpsi[0] = h * t * (1 - t) * (1 - t);
    dzphi[0] = -6 * t * (1 - t) * h_1;
    dzpsi[0] = (1 - t) * (1 - 3*t);
    t--;
    zphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    zpsi[1] = h * t * (1 + t) * (1 + t);
    dzphi[1] = -6 * t * (1 + t) * h_1;
    dzpsi[1] = (1 + t) * (1 + 3*t);

    /* determine 8=2*2*2 eh grid stencil contribution to force */
    fv.x = fv.y = fv.z = 0;
    for (k = 0;  k < 2;  k++) {
      koff = ((k + klo) & mask) * ndim;
      for (j = 0;  j < 2;  j++) {
        jkoff = (koff + ((j + jlo) & mask)) * ndim;
        c_yphi_zphi = yphi[j] * zphi[k];
        c_ypsi_zphi = ypsi[j] * zphi[k];
        c_yphi_zpsi = yphi[j] * zpsi[k];
        c_ypsi_zpsi = ypsi[j] * zpsi[k];
        c_yphi_dzphi = yphi[j] * dzphi[k];
        c_ypsi_dzphi = ypsi[j] * dzphi[k];
        c_yphi_dzpsi = yphi[j] * dzpsi[k];
        c_ypsi_dzpsi = ypsi[j] * dzpsi[k];
        c_dyphi_zphi = dyphi[j] * zphi[k];
        c_dypsi_zphi = dypsi[j] * zphi[k];
        c_dyphi_zpsi = dyphi[j] * zpsi[k];
        c_dypsi_zpsi = dypsi[j] * zpsi[k];
        for (i = 0;  i < 2;  i++) {
          index = jkoff + ((i + ilo) & mask);
          ASSERT(&eh[index]
              == mgrid_lattice_elem(&(mg->egrid[0]),
                (i+ilo) & mask, (j+jlo) & mask, (k+klo) & mask));
          fv.x += dxphi[i] * (c_yphi_zphi * eh[index].d000
              + c_ypsi_zphi * eh[index].d010
              + c_yphi_zpsi * eh[index].d001
              + c_ypsi_zpsi * eh[index].d011)
            + dxpsi[i] * (c_yphi_zphi * eh[index].d100
              + c_ypsi_zphi * eh[index].d110
              + c_yphi_zpsi * eh[index].d101
              + c_ypsi_zpsi * eh[index].d111);
          fv.y += xphi[i] * (c_dyphi_zphi * eh[index].d000
              + c_dypsi_zphi * eh[index].d010
              + c_dyphi_zpsi * eh[index].d001
              + c_dypsi_zpsi * eh[index].d011)
            + xpsi[i] * (c_dyphi_zphi * eh[index].d100
              + c_dypsi_zphi * eh[index].d110
              + c_dyphi_zpsi * eh[index].d101
              + c_dypsi_zpsi * eh[index].d111);
          fv.z += xphi[i] * (c_yphi_dzphi * eh[index].d000
              + c_ypsi_dzphi * eh[index].d010
              + c_yphi_dzpsi * eh[index].d001
              + c_ypsi_dzpsi * eh[index].d011)
            + xpsi[i] * (c_yphi_dzphi * eh[index].d100
              + c_ypsi_dzphi * eh[index].d110
              + c_yphi_dzpsi * eh[index].d101
              + c_ypsi_dzpsi * eh[index].d111);
        }
      }
    }

    /* update force */
    f[n].x -= q[n] * fv.x;
    f[n].y -= q[n] * fv.y;
    f[n].z -= q[n] * fv.z;

  } /* end loop over n */

  /* compute potential */
  u = 0.0;
  for (n = 0;  n < nelems;  n++) {
    u += qvh[n].d000 * evh[n].d000 + qvh[n].d100 * evh[n].d100
      + qvh[n].d010 * evh[n].d010 + qvh[n].d001 * evh[n].d001
      + qvh[n].d110 * evh[n].d110 + qvh[n].d101 * evh[n].d101
      + qvh[n].d011 * evh[n].d011 + qvh[n].d111 * evh[n].d111;
  }

  /* must subtract self potential, save long range potential, add to total */
  sys->u_long = (0.5 * u) - u_self;
  sys->u_elec += sys->u_long;

  /* if separate long range force array, add to total force */
  if (sys->f_long != NULL) {
    for (n = 0;  n < natoms;  n++) {
      sys->f_elec[n].x += sys->f_long[n].x;
      sys->f_elec[n].y += sys->f_long[n].y;
      sys->f_elec[n].z += sys->f_long[n].z;
    }
  }

  return 0;
}



/*
 * constants for grid transfer operations
 * cubic numerical Hermite interpolant
 */

/* length of stencil */
enum { nstencil = 3 };

/*
 * interpolating functions along one dimension of grid stencil
 *
 * note that psi needs to be multiplied by 2h
 *
 * note that even though theta values are identical to phi values,
 * they are different functions (Hermite vs. linear interpolation)
 * so are each represented here separately
 */
static const double phi[nstencil] = { 0.5, 1, 0.5 };
static const double psi[nstencil] = { -0.125, 0, 0.125 };
static const double theta[nstencil] = { 0.5, 1, 0.5 };

/*
 * stencil offsets from a central grid point on a finer grid level;
 * these offsets are where function weights above have been evaluated
 * on h-grid wrt to 2h-grid
 */
static const int32 offset[nstencil] = { -1, 0, 1 };


int restrict_grid(Mgrid *mg, int32 level)
{
  HermGridpt q2h_sum;
  double yphi_zphi, yhpsi_zphi, ytheta_zphi;
  double yphi_zhpsi, yhpsi_zhpsi, ytheta_zhpsi;
  double yphi_ztheta, yhpsi_ztheta, ytheta_ztheta;

  /* lattices of charge */
  const HermGridpt *qh = (HermGridpt *)(mg->qgrid[level].data); /* finer grid */
  HermGridpt *q2h = (HermGridpt *)(mg->qgrid[level+1].data);  /* coarser grid */
  const double hh = mg->param.spacing * (1 << (level+1));   /* coarse spacing */
  double hpsi[nstencil];               /* psi weighted by coarse grid spacing */

  /* finer grid - grid size and labeling identical in each dimension */
  const int32 ndim1 = mg->qgrid[level].ni;    /* number points along dim */
  const int32 mask = ndim1 - 1;        /* indexing mask for periodic grids */

  /* coarser grid - grid size and labeling identical in each dimension */
  const int32 ndim2 = mg->qgrid[level+1].ni;  /* number points along dim */

  /* other variables */
  int32 i1, j1, k1, index1, jk1off, k1off;
  int32 i2, j2, k2, index2, jk2off, k2off;
  int32 i, j, k;

  FLT(hh);

  /* compute hpsi */
  for (k = 0;  k < nstencil;  k++) {
    hpsi[k] = hh * psi[k];
  }

  /* loop over coarser grid points */
  for (k2 = 0;  k2 < ndim2;  k2++) {
    k2off = k2 * ndim2;  /* coarser grid index offset for k-coord */
    k1 = k2 * 2;         /* k-coord of same-space point on finer grid */
    for (j2 = 0;  j2 < ndim2;  j2++) {
      jk2off = (k2off + j2) * ndim2;  /* add offset for j-coord coarser */
      j1 = j2 * 2;                    /* j-coord same-space finer grid */
      for (i2 = 0;  i2 < ndim2;  i2++) {
        index2 = jk2off + i2;         /* index in coarser grid */
        i1 = i2 * 2;                  /* i-coord same-space finer grid */

        /* sum weighted charge contribution from finer grid stencil */
        q2h_sum.d000 = 0.0;
        q2h_sum.d100 = 0.0;
        q2h_sum.d010 = 0.0;
        q2h_sum.d001 = 0.0;
        q2h_sum.d110 = 0.0;
        q2h_sum.d101 = 0.0;
        q2h_sum.d011 = 0.0;
        q2h_sum.d111 = 0.0;
        for (k = 0;  k < nstencil;  k++) {
          /* offset k-coord finer grid */
          k1off = ((k1 + offset[k]) & mask) * ndim1;
          for (j = 0;  j < nstencil;  j++) {
            /* add offset j */
            jk1off = (k1off + ((j1 + offset[j]) & mask)) * ndim1;

            /* mult weights in each dim */
            yphi_zphi = phi[j] * phi[k];
            yhpsi_zphi = hpsi[j] * phi[k];
            ytheta_zphi = theta[j] * phi[k];
            yphi_zhpsi = phi[j] * hpsi[k];
            yhpsi_zhpsi = hpsi[j] * hpsi[k];
            ytheta_zhpsi = theta[j] * hpsi[k];
            yphi_ztheta = phi[j] * theta[k];
            yhpsi_ztheta = hpsi[j] * theta[k];
            ytheta_ztheta = theta[j] * theta[k];

            for (i = 0;  i < nstencil;  i++) {
              /* index in finer grid */
              index1 = jk1off + ((i1 + offset[i]) & mask);
              ASSERT(&qh[index1]
                  == mgrid_lattice_elem(&(mg->qgrid[level]),
                    (i1 + offset[i]) & mask, (j1 + offset[j]) & mask,
                    (k1 + offset[k]) & mask));

              /* sum weighted charge */
              q2h_sum.d000 += phi[i] * yphi_zphi * qh[index1].d000;
              q2h_sum.d100 += yphi_zphi * (hpsi[i] * qh[index1].d000
                  + theta[i] * qh[index1].d100);
              q2h_sum.d010 += phi[i] * (yhpsi_zphi * qh[index1].d000
                  + ytheta_zphi * qh[index1].d010);
              q2h_sum.d001 += phi[i] * (yphi_zhpsi * qh[index1].d000
                  + yphi_ztheta * qh[index1].d001);
              q2h_sum.d110 += hpsi[i] * (yhpsi_zphi * qh[index1].d000
                  + ytheta_zphi * qh[index1].d010)
                + theta[i] * (yhpsi_zphi * qh[index1].d100
                    + ytheta_zphi * qh[index1].d110);
              q2h_sum.d101 += hpsi[i] * (yphi_zhpsi * qh[index1].d000
                  + yphi_ztheta * qh[index1].d001)
                + theta[i] * (yphi_zhpsi * qh[index1].d100
                    + yphi_ztheta * qh[index1].d101);
              q2h_sum.d011 += phi[i] * (yhpsi_zhpsi * qh[index1].d000
                  + ytheta_zhpsi * qh[index1].d010
                  + yhpsi_ztheta * qh[index1].d001
                  + ytheta_ztheta * qh[index1].d011);
              q2h_sum.d111 += hpsi[i] * (yhpsi_zhpsi * qh[index1].d000
                  + ytheta_zhpsi * qh[index1].d010
                  + yhpsi_ztheta * qh[index1].d001
                  + ytheta_ztheta * qh[index1].d011)
                + theta[i] * (yhpsi_zhpsi * qh[index1].d100
                    + ytheta_zhpsi * qh[index1].d110
                    + yhpsi_ztheta * qh[index1].d101
                    + ytheta_ztheta * qh[index1].d111);
            }
          }
        } /* end loop over finer grid stencil */

        ASSERT(&q2h[index2]
            == mgrid_lattice_elem(&(mg->qgrid[level+1]), i2, j2, k2));
        q2h[index2] = q2h_sum;  /* store charge to coarser grid */

      }
    }
  } /* end loop over each coarser grid point */
#if 0
#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("charge grid point q%dh(%d,%d,%d):\n", 1 << (level+1), i, j, k);
  index2 = (k * ndim2 + j) * ndim2 + i;
  ASSERT(&q2h[index2]
      == mgrid_lattice_elem(&(mg->qgrid[level+1]), i, j, k));
  print_gridpoint(&q2h[index2]);
#endif
#endif

  return 0;
}


int prolong_grid(Mgrid *mg, int32 level)
{
  double yphi_zphi, yhpsi_zphi, ytheta_zphi;
  double yphi_zhpsi, yhpsi_zhpsi, ytheta_zhpsi;
  double yphi_ztheta, yhpsi_ztheta, ytheta_ztheta;

  /* lattices of potential */
  HermGridpt *eh = (HermGridpt *)(mg->egrid[level].data);       /* finer grid */
  const HermGridpt *e2h = (HermGridpt *)(mg->egrid[level+1].data); /* coarser */
  const double hh = mg->param.spacing * (1 << (level+1));   /* coarse spacing */
  double hpsi[nstencil];               /* psi weighted by coarse grid spacing */

  /* finer grid - grid size and labeling identical in each dimension */
  const int32 ndim1 = mg->egrid[level].ni;    /* number points along dim */
  const int32 mask = ndim1 - 1;        /* indexing mask for periodic grids */

  /* coarser grid - grid size and labeling identical in each dimension */
  const int32 ndim2 = mg->egrid[level+1].ni;  /* number points along dim */

  /* other variables */
  int32 i1, j1, k1, index1, jk1off, k1off;
  int32 i2, j2, k2, index2, jk2off, k2off;
  int32 i, j, k;

#if 0
#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("potential grid point e%dh(%d,%d,%d):\n", 1 << (level+1), i, j, k);
  index2 = (k * ndim2 + j) * ndim2 + i;
  ASSERT(&e2h[index2]
      == mgrid_lattice_elem(&(mg->egrid[level+1]), i, j, k));
  print_gridpoint(&e2h[index2]);
#endif
#endif

  /* compute hpsi */
  for (k = 0;  k < nstencil;  k++) {
    hpsi[k] = hh * psi[k];
  }

  /* loop over coarser grid points */
  for (k2 = 0;  k2 < ndim2;  k2++) {
    k2off = k2 * ndim2;  /* coarser grid index offset for k-coord */
    k1 = k2 * 2;         /* k-coord of same-space point on finer grid */
    for (j2 = 0;  j2 < ndim2;  j2++) {
      jk2off = (k2off + j2) * ndim2;  /* add offset for j-coord coarser */
      j1 = j2 * 2;                    /* j-coord same-space finer grid */
      for (i2 = 0;  i2 < ndim2;  i2++) {
        index2 = jk2off + i2;         /* index in coarser grid */
        i1 = i2 * 2;                  /* i-coord same-space finer grid */

        /* sum weighted potential contribution to finer grid stencil */
        for (k = 0;  k < nstencil;  k++) {
          /* offset k-coord finer grid */
          k1off = ((k1 + offset[k]) & mask) * ndim1;
          for (j = 0;  j < nstencil;  j++) {
            /* add offset j */
            jk1off = (k1off + ((j1 + offset[j]) & mask)) * ndim1;
            
            /* mult weights in each dim */
            yphi_zphi = phi[j] * phi[k];
            yhpsi_zphi = hpsi[j] * phi[k];
            ytheta_zphi = theta[j] * phi[k];
            yphi_zhpsi = phi[j] * hpsi[k];
            yhpsi_zhpsi = hpsi[j] * hpsi[k];
            ytheta_zhpsi = theta[j] * hpsi[k];
            yphi_ztheta = phi[j] * theta[k];
            yhpsi_ztheta = hpsi[j] * theta[k];
            ytheta_ztheta = theta[j] * theta[k];

            for (i = 0;  i < nstencil;  i++) {
              /* index in finer grid */
              index1 = jk1off + ((i1 + offset[i]) & mask);
              ASSERT(&eh[index1]
                  == mgrid_lattice_elem(&(mg->egrid[level]),
                    (i1 + offset[i]) & mask, (j1 + offset[j]) & mask,
                    (k1 + offset[k]) & mask));
              ASSERT(&e2h[index2]
                  == mgrid_lattice_elem(&(mg->egrid[level+1]), i2, j2, k2));

              /* sum weighted potential from coarser grid */
              eh[index1].d000 += phi[i] * (yphi_zphi * e2h[index2].d000
                  + yhpsi_zphi * e2h[index2].d010
                  + yphi_zhpsi * e2h[index2].d001
                  + yhpsi_zhpsi * e2h[index2].d011)
                + hpsi[i] * (yphi_zphi * e2h[index2].d100
                  + yhpsi_zphi * e2h[index2].d110
                  + yphi_zhpsi * e2h[index2].d101
                  + yhpsi_zhpsi * e2h[index2].d111);
              eh[index1].d100 += theta[i] * (yphi_zphi * e2h[index2].d100
                  + yhpsi_zphi * e2h[index2].d110
                  + yphi_zhpsi * e2h[index2].d101
                  + yhpsi_zhpsi * e2h[index2].d111);
              eh[index1].d010 += phi[i] * (ytheta_zphi * e2h[index2].d010
                  + ytheta_zhpsi * e2h[index2].d011)
                + hpsi[i] * (ytheta_zphi * e2h[index2].d110
                    + ytheta_zhpsi * e2h[index2].d111);
              eh[index1].d001 += phi[i] * (yphi_ztheta * e2h[index2].d001
                  + yhpsi_ztheta * e2h[index2].d011)
                + hpsi[i] * (yphi_ztheta * e2h[index2].d101
                    + yhpsi_ztheta * e2h[index2].d111);
              eh[index1].d110 += theta[i] * (ytheta_zphi * e2h[index2].d110
                  + ytheta_zhpsi * e2h[index2].d111);
              eh[index1].d101 += theta[i] * (yphi_ztheta * e2h[index2].d101
                  + yhpsi_ztheta * e2h[index2].d111);
              eh[index1].d011 += ytheta_ztheta * (phi[i] * e2h[index2].d011
                  + hpsi[i] * e2h[index2].d111);
              eh[index1].d111 += theta[i] * ytheta_ztheta * e2h[index2].d111;
            }
          }
        } /* end loop over finer grid stencil */

      }
    }
  } /* end loop over each coarser grid point */
#if 0
#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("potential grid point e%dh(%d,%d,%d):\n", 1 << level, i, j, k);
  index1 = (k * ndim1 + j) * ndim1 + i;
  ASSERT(&eh[index1]
      == mgrid_lattice_elem(&(mg->egrid[level]), i, j, k));
  print_gridpoint(&eh[index1]);
#endif
#endif

  return 0;
}


int direct_sum(Mgrid *mg, int32 level)
{
  HermGridpt eh_sum;
#if 0
  double scaling = mg->scaling[level];     /* scaling factor for gd */
#endif

  /* lattices of charge and potential */
  const HermGridpt *qh = (HermGridpt *)(mg->qgrid[level].data);
  HermGridpt *eh = (HermGridpt *)(mg->egrid[level].data);
  const int32 ndim = mg->qgrid[level].ni;  /* number points in each dim */
  const int32 mask = ndim - 1;        /* indexing mask for periodic grids */

  /*
   * for last level use different block of weights that
   * produces pairwise interactions between all grid points
   */

  /* lattice of weights for short range charge influence on potential */
#if 0
  const HermDirect *gd = (level == mg->param.nlevels - 1 ?
      (HermDirect *)(mg->glast.data) : (HermDirect *)(mg->gdsum.data));
#endif
  const HermDirect *gd = (HermDirect *)(mg->gdsum_list[level].data);

  /* for computing matrix vector product */
  const double *gm;                  /* points to matrix */
  const double *qv;                  /* points to vector */
  double *ev = (double *)(&eh_sum);  /* points to resulting vector */

  /* dimension of gd lattice */
#if 0
  const int32 ngdim = (level == mg->param.nlevels - 1 ?
      mg->glast.ni : mg->gdsum.ni);
#endif
  const int32 ngdim = mg->gdsum_list[level].ni;

  /* "radius" of gd "sphere" */
#if 0
  const int32 ngrad = (level == mg->param.nlevels - 1 ?
      mg->glast_radius : mg->gdsum_radius);
#endif
  const int32 ngrad = mg->gdsum_radius_list[level];

  int32 i, j, k, ig, jg, kg, ii, jj, kk;
  int32 index, jkoff, koff;
  int32 nindex, jknoff, knoff;
  int32 ngindex, jkgoff, kgoff;

  enum { veclen = 8 };

  ASSERT(veclen == sizeof(HermGridpt) / sizeof(double));
  ASSERT(veclen * veclen == sizeof(HermDirect) / sizeof(double));

/* #ifdef DEBUG_SUPPORT */
#if 0
  if (ndim == 1) {
    ASSERT(level == mg->param.nlevels - 1);
    printf("#++ top level charge:  %g\n",
        qh[0].d000, qh[0].d100, qh[0].d010, qh[0].d001,
        qh[0].d110, qh[0].d101, qh[0].d011, qh[0].d111);
    eh[0].d000 = 0.0;
    eh[0].d100 = 0.0;
    eh[0].d010 = 0.0;
    eh[0].d001 = 0.0;
    eh[0].d110 = 0.0;
    eh[0].d101 = 0.0;
    eh[0].d011 = 0.0;
    eh[0].d111 = 0.0;
    return 0;
  }
#endif

#if 0
  PTR(mg->glast.data);
  PTR(gd);

  INT(mg->glast.ni);
  INT(ngdim);

  INT(mg->glast_radius);
  INT(ngrad);
#endif

#if 0
#ifdef DEBUG_WATCH
  mgrid_lattice_print(&(mg->glast));
#endif
#endif

  /* loop over all grid points */
  for (k = 0;  k < ndim;  k++) {
    /* grid index offset for k-coord */
    koff = k * ndim;

    for (j = 0;  j < ndim;  j++) {
      /* add offset for j-coord */
      jkoff = (koff + j) * ndim;

      for (i = 0;  i < ndim;  i++) {
        /* index for potential grid point */
        index = jkoff + i;

        /* sum over "sphere" of weighted charge */
        eh_sum.d000 = 0.0;
        eh_sum.d100 = 0.0;
        eh_sum.d010 = 0.0;
        eh_sum.d001 = 0.0;
        eh_sum.d110 = 0.0;
        eh_sum.d101 = 0.0;
        eh_sum.d011 = 0.0;
        eh_sum.d111 = 0.0;
        for (kg = -ngrad;  kg <= ngrad;  kg++) {
          /* grid index offset for k-coord */
          knoff = ((k + kg) & mask) * ndim;
          kgoff = kg * ngdim;       /* gd index offset for k-coord */

          for (jg = -ngrad;  jg <= ngrad;  jg++) {
            /* add offset for j-coord */
            jknoff = (knoff + ((j + jg) & mask)) * ndim;
            jkgoff = (kgoff + jg) * ngdim;       /* gd add offset j-coord */

            for (ig = -ngrad;  ig <= ngrad;  ig++) {
              /* index of charge contribution */
              nindex = jknoff + ((i + ig) & mask);
              ngindex = jkgoff + ig;       /* index of gd weighting factor */
              ASSERT(&qh[nindex]
                  == mgrid_lattice_elem(&(mg->qgrid[level]),
                    (i + ig) & mask, (j + jg) & mask, (k + kg) & mask));
              ASSERT(&gd[ngindex]
                  == mgrid_lattice_elem(&(mg->gdsum_list[level]), ig, jg, kg));

              /* sum weighted charge */
              qv = (const double *)(&qh[nindex]);  /* point to charge vector */
              gm = (const double *)(&gd[ngindex]); /* point to direct matrix */
              /* compute matrix vector product */
              for (kk = 0, jj = 0;  jj < veclen;  jj++) {
                for (ii = 0;  ii < veclen;  ii++, kk++) {
                  ev[jj] += gm[kk] * qv[ii];
                }
              }

            }
          }
        } /* end loop over "sphere" of charge */

        ASSERT(&eh[index]
            == mgrid_lattice_elem(&(mg->egrid[level]), i, j, k));
#if 0
        /* scale and store potential */
        eh[index].d000 = scaling * eh_sum.d000;
        eh[index].d100 = scaling * eh_sum.d100;
        eh[index].d010 = scaling * eh_sum.d010;
        eh[index].d001 = scaling * eh_sum.d001;
        eh[index].d110 = scaling * eh_sum.d110;
        eh[index].d101 = scaling * eh_sum.d101;
        eh[index].d011 = scaling * eh_sum.d011;
        eh[index].d111 = scaling * eh_sum.d111;
#endif
        /* store potential */
        eh[index] = eh_sum;
      }
    }
  } /* end loop over all grid points */
#if 0
#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("(direct sum) potential grid point e%dh(%d,%d,%d):\n",
      1 << level, i, j, k);
  index = (k * ndim + j) * ndim + i;
  ASSERT(&eh[index]
      == mgrid_lattice_elem(&(mg->egrid[level]), i, j, k));
  print_gridpoint(&eh[index]);
#endif
#endif

#if 0
  INT(level);
#ifdef DEBUG_WATCH
  mgrid_lattice_print(&(mg->egrid[level]));
#endif
#endif

  return 0;
}
