/*
 * Copyright (C) 2004-2005 by David J. Hardy.  All rights reserved.
 *
 * nhermite.c
 */

#include <stdio.h>
#include "mgrid/split.h"
#include "mgrid/hermite.h"
#undef DEBUG_SUPPORT
#undef DEBUG_WATCH
#include "debug/debug.h"

#define NELEMS(a)  (sizeof(a)/sizeof((a)[0]))

/* prototypes of internal routines */
#ifdef DEBUG_WATCH
static void print_gridpoint(const HermGridpt *p);
#endif
static void add_herm_direct(HermDirect *g, double c, double h, double a,
    int32 i, int32 j, int32 k, int32 split);
static void done(Mgrid *mg);
static int force(Mgrid *mg, MgridSystem *sys);
static int atoms_to_grid(Mgrid *mg, const MgridSystem *sys);
static int grid_to_atoms(Mgrid *mg, MgridSystem *sys);
static int restrict_grid(Mgrid *mg, int32 level);
static int prolong_grid(Mgrid *mg, int32 level);
static int direct_sum(Mgrid *mg, int32 level);


#ifdef DEBUG_WATCH
void print_gridpoint(const HermGridpt *gp)
{
  const double *p = (const double *) gp;
  int k;

  for (k = 0;  k < sizeof(HermGridpt) / sizeof(double);  k++) {
    printf("%g ", p[k]);
  }
  printf("\n");
}
#endif


/*
 * constants for grid transfer operations: Hermite interpolation
 */

/* length of stencil */
enum { NSTENCIL = 3 };

static const double  PHI0[NSTENCIL] = { 0.5, 1, 0.5 };
static const double DPHI0[NSTENCIL] = { 1.5, 0, -1.5 };
static const double  PHI1[NSTENCIL] = { -0.125, 0, 0.125 };
static const double DPHI1[NSTENCIL] = { -0.25, 1, -0.25 };


int mgrid_nhermite_setup(Mgrid *mg, const MgridSystem *sys)
{
  const double *q = sys->charge;
  const int32 nlevels = mg->param.nlevels;
  const int32 natoms = mg->param.natoms;
  const int32 split = mg->param.split;
  HermDirect *gd;
  double r2, s, gs;
  int32 i, j, k, n, na, nb, m;
  int32 gdim, index;
  const double a = mg->param.cutoff;
  const double h = mg->param.spacing;
  const double h_1 = 1./h;
  double hm, am;
  double phi0_base_array[NSTENCIL];
  double dphi0_base_array[NSTENCIL];
  double phi1_base_array[NSTENCIL];
  double dphi1_base_array[NSTENCIL];
  double *phi0, *dphi0, *phi1, *dphi1;
  HermDirect *op, *res;
  int32 *is_zero;

  /*
   * refuse any exotic splittings
   * (makes assumption on ordering of splitting IDs)
   */
  if (split > MGRID_ERRMIN3) return MGRID_FAIL;

  /* assign virtual methods */
  mg->long_force = force;
  mg->long_done = done;

  /* set inverse of grid spacing */
  mg->inv_spacing = h_1;

  /* set location of multi-grids origin with lattice label (0,0,0) */
  mg->origin.x = mg->param.center.x - 0.5 * mg->param.length;
  mg->origin.y = mg->param.center.y - 0.5 * mg->param.length;
  mg->origin.z = mg->param.center.z - 0.5 * mg->param.length;

  VEC(mg->origin);

  /* allocate arrays of charge and potential lattices */
  mg->qgrid = (MgridLattice *) malloc(nlevels * sizeof(MgridLattice));
  mg->egrid = (MgridLattice *) malloc(nlevels * sizeof(MgridLattice));
  if (mg->qgrid == NULL || mg->egrid == NULL) return MGRID_FAIL;

  /* setup charge and potential lattices for each level */
  for (k = 0;  k < nlevels;  k++) {
    if (mgrid_lattice_init(&(mg->qgrid[k]))
        || mgrid_lattice_init(&(mg->egrid[k]))) {
      return MGRID_FAIL;
    }
  }
  na = 0;
  nb = mg->param.nspacings;
  for (k = 0;  k < nlevels;  k++) {
    if (nb - na + 1 <= 2 && k < nlevels - 1) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->qgrid[k]), sizeof(HermGridpt),
          na, nb, na, nb, na, nb)
        || mgrid_lattice_setup(&(mg->egrid[k]), sizeof(HermGridpt),
          na, nb, na, nb, na, nb)) {
      return MGRID_FAIL;
    }
#ifdef DEBUG_WATCH
    printf("level:  k = %d   na = %d   nb = %d\n", k, na, nb);
#endif
    na = -((-na+1)/2);
    nb = (nb+1)/2;
  }

  /* allocate arrays for direct sum weights */
  ASSERT(nlevels > 0);
  mg->gdsum_list = (MgridLattice *) calloc(nlevels, sizeof(MgridLattice));
  if (mg->gdsum_list == NULL) return MGRID_FAIL;
  mg->gdsum_radius_list = (int32 *) calloc(nlevels, sizeof(int32));
  if (mg->gdsum_radius_list == NULL) return MGRID_FAIL;
  mg->is_zero = (MgridLattice *) calloc(nlevels, sizeof(MgridLattice));
  if (mg->is_zero == NULL) return MGRID_FAIL;

  /* determine radius for direct sum "sphere" of weights */
  n = (int32) (2.0 * a * h_1);
  hm = h;  /* grid spacing for level 0 */
  am = a;  /* cutoff for level 0 */
  for (m = 0;  m < nlevels-1;  m++) {
    /* allocate direct sum weight lattice for this level */
    mg->gdsum_radius_list[m] = n;
    if (mgrid_lattice_init(&(mg->gdsum_list[m]))) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->gdsum_list[m]), sizeof(HermDirect),
          -n, n, -n, n, -n, n)) {
      return MGRID_FAIL;
    }
    /* accumulate direct sum weights, start with zero matrix */
    if (mgrid_lattice_zero(&(mg->gdsum_list[m]))) {
      return MGRID_FAIL;
    }
    gdim = mg->gdsum_list[m].ni;
    ASSERT(gdim == 2 * n + 1);
    gd = (HermDirect *)(mg->gdsum_list[m].data);
    /* allocate "zero flag" to distinguish matrices that are all zero */
    if (mgrid_lattice_init(&(mg->is_zero[m]))) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->is_zero[m]), sizeof(int32),
          -n, n, -n, n, -n, n)) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_zero(&(mg->is_zero[m]))) {
      return MGRID_FAIL;
    }
    is_zero = (int32 *)(mg->is_zero[m].data);
    for (k = -n;  k <= n;  k++) {
      for (j = -n;  j <= n;  j++) {
        for (i = -n;  i <= n;  i++) {
          index = (k * gdim + j) * gdim + i;
          r2 = (i*i + j*j + k*k) * hm*hm;
          if (r2 >= 4*am*am) {
            /* is zero beyond radius r == 2*am, set flag */
            ASSERT(is_zero[index]
               == mgrid_lattice_elem(&(mg->is_zero[m]), i, j, k));
            is_zero[index] = 1;
            continue;
          }
          ASSERT(&gd[index]
              == mgrid_lattice_elem(&(mg->gdsum_list[m]), i, j, k));

          /* accumulate D( g_{a}(r,r') ) term for this level */
          add_herm_direct(&gd[index], 1.0, hm, am, i, j, k, split);

          /* accumulate D( -g_{2a}(r,r') ) term for this level */
          add_herm_direct(&gd[index], -1.0, hm, 2*am, i, j, k, split);
        }
      }
    }
    hm *= 2.0;  /* grid spacing twice as big on next level */
    am *= 2.0;  /* cutoff twice as big on next level */
  }

  /* determine radius for last level direct sum "sphere" of weights */
  ASSERT(m == nlevels-1);
  n = mg->qgrid[m].ni - 1;
  /* allocate lattice */
  mg->gdsum_radius_list[m] = n;
  if (mgrid_lattice_init(&(mg->gdsum_list[m]))) {
    return MGRID_FAIL;
  }
  if (mgrid_lattice_setup(&(mg->gdsum_list[m]), sizeof(HermDirect),
        -n, n, -n, n, -n, n)) {
    return MGRID_FAIL;
  }
  /* accumulate direct sum weights for last level */
  if (mgrid_lattice_zero(&(mg->gdsum_list[m]))) {
    return MGRID_FAIL;
  }
  gdim = mg->gdsum_list[m].ni;
  ASSERT(gdim == 2 * n + 1);
  gd = (HermDirect *)(mg->gdsum_list[m].data);
  /* allocate "zero flag" to distinguish matrices that are all zero */
  if (mgrid_lattice_init(&(mg->is_zero[m]))) {
    return MGRID_FAIL;
  }
  if (mgrid_lattice_setup(&(mg->is_zero[m]), sizeof(int32),
        -n, n, -n, n, -n, n)) {
    return MGRID_FAIL;
  }
  if (mgrid_lattice_zero(&(mg->is_zero[m]))) {
    return MGRID_FAIL;
  }
  /* (all entries of is_zero remain false for last level) */
  for (k = -n;  k <= n;  k++) {
    for (j = -n;  j <= n;  j++) {
      for (i = -n;  i <= n;  i++) {
        /* unlike previous, this sum is not truncated */
        index = (k * gdim + j) * gdim + i;
        ASSERT(&gd[index]
            == mgrid_lattice_elem(&(mg->gdsum_list[m]), i, j, k));

        /* accumulate D( g_{a}(r,r') ) term for this level */
        add_herm_direct(&gd[index], 1.0, hm, am, i, j, k, split);
      }
    }
  }

  /*
   * compute grid transfer matrices: restriction operator
   */
  mg->opres_list = (MgridLattice *) calloc(nlevels-1, sizeof(MgridLattice));
  if (mg->opres_list == NULL) return MGRID_FAIL;

  for (n = 0;  n < NSTENCIL;  n++) {
    phi0_base_array[n] = PHI0[n];
    dphi0_base_array[n] = h_1 * DPHI0[n];  /* scale by grid spacing */
    phi1_base_array[n] = h * PHI1[n];      /* scale by grid spacing */
    dphi1_base_array[n] = DPHI1[n];
  }
  phi0 = phi0_base_array + NSTENCIL/2;
  dphi0 = dphi0_base_array + NSTENCIL/2;
  phi1 = phi1_base_array + NSTENCIL/2;
  dphi1 = dphi1_base_array + NSTENCIL/2;

  for (m = 0;  m < nlevels-1;  m++) {
    if (mgrid_lattice_init(&(mg->opres_list[m]))) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->opres_list[m]), sizeof(HermDirect),
          -NSTENCIL/2, NSTENCIL/2,
          -NSTENCIL/2, NSTENCIL/2,
          -NSTENCIL/2, NSTENCIL/2)) {
      return MGRID_FAIL;
    }
    op = (HermDirect *)(mg->opres_list[m].data);

    /* adjust for next loop */
    for (n = 0;  n < NSTENCIL;  n++) {
      dphi0_base_array[n] *= 0.5;  /* scale by next grid spacing */
      phi1_base_array[n] *= 2;     /* scale by next grid spacing */
    }

    for (k = -NSTENCIL/2;  k <= NSTENCIL/2;  k++) {
      for (j = -NSTENCIL/2;  j <= NSTENCIL/2;  j++) {
        for (i = -NSTENCIL/2;  i <= NSTENCIL/2;  i++) {
          index = (k * NSTENCIL + j) * NSTENCIL + i;
          ASSERT(&op[index]
              == mgrid_lattice_elem(&(mg->opres_list[m]), i, j, k));

          op[index].d000_000 =  phi0[i] *  phi0[j] *  phi0[k];
          op[index].d000_100 = dphi0[i] *  phi0[j] *  phi0[k];
          op[index].d000_010 =  phi0[i] * dphi0[j] *  phi0[k];
          op[index].d000_001 =  phi0[i] *  phi0[j] * dphi0[k];
          op[index].d000_110 = dphi0[i] * dphi0[j] *  phi0[k];
          op[index].d000_101 = dphi0[i] *  phi0[j] * dphi0[k];
          op[index].d000_011 =  phi0[i] * dphi0[j] * dphi0[k];
          op[index].d000_111 = dphi0[i] * dphi0[j] * dphi0[k];

          op[index].d100_000 =  phi1[i] *  phi0[j] *  phi0[k];
          op[index].d100_100 = dphi1[i] *  phi0[j] *  phi0[k];
          op[index].d100_010 =  phi1[i] * dphi0[j] *  phi0[k];
          op[index].d100_001 =  phi1[i] *  phi0[j] * dphi0[k];
          op[index].d100_110 = dphi1[i] * dphi0[j] *  phi0[k];
          op[index].d100_101 = dphi1[i] *  phi0[j] * dphi0[k];
          op[index].d100_011 =  phi1[i] * dphi0[j] * dphi0[k];
          op[index].d100_111 = dphi1[i] * dphi0[j] * dphi0[k];

          op[index].d010_000 =  phi0[i] *  phi1[j] *  phi0[k];
          op[index].d010_100 = dphi0[i] *  phi1[j] *  phi0[k];
          op[index].d010_010 =  phi0[i] * dphi1[j] *  phi0[k];
          op[index].d010_001 =  phi0[i] *  phi1[j] * dphi0[k];
          op[index].d010_110 = dphi0[i] * dphi1[j] *  phi0[k];
          op[index].d010_101 = dphi0[i] *  phi1[j] * dphi0[k];
          op[index].d010_011 =  phi0[i] * dphi1[j] * dphi0[k];
          op[index].d010_111 = dphi0[i] * dphi1[j] * dphi0[k];

          op[index].d001_000 =  phi0[i] *  phi0[j] *  phi1[k];
          op[index].d001_100 = dphi0[i] *  phi0[j] *  phi1[k];
          op[index].d001_010 =  phi0[i] * dphi0[j] *  phi1[k];
          op[index].d001_001 =  phi0[i] *  phi0[j] * dphi1[k];
          op[index].d001_110 = dphi0[i] * dphi0[j] *  phi1[k];
          op[index].d001_101 = dphi0[i] *  phi0[j] * dphi1[k];
          op[index].d001_011 =  phi0[i] * dphi0[j] * dphi1[k];
          op[index].d001_111 = dphi0[i] * dphi0[j] * dphi1[k];

          op[index].d110_000 =  phi1[i] *  phi1[j] *  phi0[k];
          op[index].d110_100 = dphi1[i] *  phi1[j] *  phi0[k];
          op[index].d110_010 =  phi1[i] * dphi1[j] *  phi0[k];
          op[index].d110_001 =  phi1[i] *  phi1[j] * dphi0[k];
          op[index].d110_110 = dphi1[i] * dphi1[j] *  phi0[k];
          op[index].d110_101 = dphi1[i] *  phi1[j] * dphi0[k];
          op[index].d110_011 =  phi1[i] * dphi1[j] * dphi0[k];
          op[index].d110_111 = dphi1[i] * dphi1[j] * dphi0[k];

          op[index].d101_000 =  phi1[i] *  phi0[j] *  phi1[k];
          op[index].d101_100 = dphi1[i] *  phi0[j] *  phi1[k];
          op[index].d101_010 =  phi1[i] * dphi0[j] *  phi1[k];
          op[index].d101_001 =  phi1[i] *  phi0[j] * dphi1[k];
          op[index].d101_110 = dphi1[i] * dphi0[j] *  phi1[k];
          op[index].d101_101 = dphi1[i] *  phi0[j] * dphi1[k];
          op[index].d101_011 =  phi1[i] * dphi0[j] * dphi1[k];
          op[index].d101_111 = dphi1[i] * dphi0[j] * dphi1[k];

          op[index].d011_000 =  phi0[i] *  phi1[j] *  phi1[k];
          op[index].d011_100 = dphi0[i] *  phi1[j] *  phi1[k];
          op[index].d011_010 =  phi0[i] * dphi1[j] *  phi1[k];
          op[index].d011_001 =  phi0[i] *  phi1[j] * dphi1[k];
          op[index].d011_110 = dphi0[i] * dphi1[j] *  phi1[k];
          op[index].d011_101 = dphi0[i] *  phi1[j] * dphi1[k];
          op[index].d011_011 =  phi0[i] * dphi1[j] * dphi1[k];
          op[index].d011_111 = dphi0[i] * dphi1[j] * dphi1[k];

          op[index].d111_000 =  phi1[i] *  phi1[j] *  phi1[k];
          op[index].d111_100 = dphi1[i] *  phi1[j] *  phi1[k];
          op[index].d111_010 =  phi1[i] * dphi1[j] *  phi1[k];
          op[index].d111_001 =  phi1[i] *  phi1[j] * dphi1[k];
          op[index].d111_110 = dphi1[i] * dphi1[j] *  phi1[k];
          op[index].d111_101 = dphi1[i] *  phi1[j] * dphi1[k];
          op[index].d111_011 =  phi1[i] * dphi1[j] * dphi1[k];
          op[index].d111_111 = dphi1[i] * dphi1[j] * dphi1[k];
        }
      }
    }

  } /* end loop over grid levels */

  /*
   * compute grid transfer matrices: prolongation operator
   */
  mg->oppro_list = (MgridLattice *) calloc(nlevels-1, sizeof(MgridLattice));
  if (mg->oppro_list == NULL) return MGRID_FAIL;

  for (m = 0;  m < nlevels-1;  m++) {
    if (mgrid_lattice_init(&(mg->oppro_list[m]))) {
      return MGRID_FAIL;
    }
    if (mgrid_lattice_setup(&(mg->oppro_list[m]), sizeof(HermDirect),
          -NSTENCIL/2, NSTENCIL/2,
          -NSTENCIL/2, NSTENCIL/2,
          -NSTENCIL/2, NSTENCIL/2)) {
      return MGRID_FAIL;
    }
    op = (HermDirect *)(mg->oppro_list[m].data);
    res = (HermDirect *)(mg->opres_list[m].data);

    for (k = -NSTENCIL/2;  k <= NSTENCIL/2;  k++) {
      for (j = -NSTENCIL/2;  j <= NSTENCIL/2;  j++) {
        for (i = -NSTENCIL/2;  i <= NSTENCIL/2;  i++) {
          index = (k * NSTENCIL + j) * NSTENCIL + i;
          ASSERT(&op[index]
              == mgrid_lattice_elem(&(mg->oppro_list[m]), i, j, k));
          ASSERT(&res[index]
              == mgrid_lattice_elem(&(mg->opres_list[m]), i, j, k));

          /* compute prolongation as transpose of restriction */
          op[index].d000_000 = res[index].d000_000;
          op[index].d000_100 = res[index].d100_000;
          op[index].d000_010 = res[index].d010_000;
          op[index].d000_001 = res[index].d001_000;
          op[index].d000_110 = res[index].d110_000;
          op[index].d000_101 = res[index].d101_000;
          op[index].d000_011 = res[index].d011_000;
          op[index].d000_111 = res[index].d111_000;

          op[index].d100_000 = res[index].d000_100;
          op[index].d100_100 = res[index].d100_100;
          op[index].d100_010 = res[index].d010_100;
          op[index].d100_001 = res[index].d001_100;
          op[index].d100_110 = res[index].d110_100;
          op[index].d100_101 = res[index].d101_100;
          op[index].d100_011 = res[index].d011_100;
          op[index].d100_111 = res[index].d111_100;

          op[index].d010_000 = res[index].d000_010;
          op[index].d010_100 = res[index].d100_010;
          op[index].d010_010 = res[index].d010_010;
          op[index].d010_001 = res[index].d001_010;
          op[index].d010_110 = res[index].d110_010;
          op[index].d010_101 = res[index].d101_010;
          op[index].d010_011 = res[index].d011_010;
          op[index].d010_111 = res[index].d111_010;

          op[index].d001_000 = res[index].d000_001;
          op[index].d001_100 = res[index].d100_001;
          op[index].d001_010 = res[index].d010_001;
          op[index].d001_001 = res[index].d001_001;
          op[index].d001_110 = res[index].d110_001;
          op[index].d001_101 = res[index].d101_001;
          op[index].d001_011 = res[index].d011_001;
          op[index].d001_111 = res[index].d111_001;

          op[index].d110_000 = res[index].d000_110;
          op[index].d110_100 = res[index].d100_110;
          op[index].d110_010 = res[index].d010_110;
          op[index].d110_001 = res[index].d001_110;
          op[index].d110_110 = res[index].d110_110;
          op[index].d110_101 = res[index].d101_110;
          op[index].d110_011 = res[index].d011_110;
          op[index].d110_111 = res[index].d111_110;

          op[index].d101_000 = res[index].d000_101;
          op[index].d101_100 = res[index].d100_101;
          op[index].d101_010 = res[index].d010_101;
          op[index].d101_001 = res[index].d001_101;
          op[index].d101_110 = res[index].d110_101;
          op[index].d101_101 = res[index].d101_101;
          op[index].d101_011 = res[index].d011_101;
          op[index].d101_111 = res[index].d111_101;

          op[index].d011_000 = res[index].d000_011;
          op[index].d011_100 = res[index].d100_011;
          op[index].d011_010 = res[index].d010_011;
          op[index].d011_001 = res[index].d001_011;
          op[index].d011_110 = res[index].d110_011;
          op[index].d011_101 = res[index].d101_011;
          op[index].d011_011 = res[index].d011_011;
          op[index].d011_111 = res[index].d111_011;

          op[index].d111_000 = res[index].d000_111;
          op[index].d111_100 = res[index].d100_111;
          op[index].d111_010 = res[index].d010_111;
          op[index].d111_001 = res[index].d001_111;
          op[index].d111_110 = res[index].d110_111;
          op[index].d111_101 = res[index].d101_111;
          op[index].d111_011 = res[index].d011_111;
          op[index].d111_111 = res[index].d111_111;
        }
      }
    }

  } /* end loop over grid levels */

  /* compute g_a(0) */
  s = 0;
  gamma(&gs, s, split);
  mg->g_zero = gs/a;

  /* compute self potential */
  s = 0;
  for (n = 0;  n < natoms;  n++) {
    s += q[n] * q[n];
  }
  mg->u_self = 0.5 * mg->g_zero * s;

  return 0;
}


/*
 * computation of HermDirect matrix components, taken straight from notes
 */
void add_herm_direct(HermDirect *g, double c, double h, double a,
    int32 i, int32 j, int32 k, int32 split)
{
  const double a_1 = 1./a;
  const double a_2 = a_1 * a_1;
  const double s = (i*i + j*j + k*k) * h*h * a_2;
  const double dx = -2 * i * h * a_2;  /* ds/dx */
  const double dy = -2 * j * h * a_2;  /* ds/dy */
  const double dz = -2 * k * h * a_2;  /* ds/dz */
  const double dd = 2 * a_2;  /* d^2s/dx^2 = d^2s/dy^2 = d^2s/dz^2 */
  double tmp;
  enum { nderiv = 7 };
  double p[nderiv];

  /* multiply entire matrix by this coefficient */
  c = c * a_1;

  /* compute derivatives of splitting p(s) */
  mgrid_ndgamma(p, nderiv, s, split);

  /* weight 0 */
  tmp = c * p[0];
  g->d000_000 += tmp;

  /* weight 1 */
  tmp = c * p[1] * dx;
  g->d100_000 += tmp;
  g->d000_100 -= tmp;

  tmp = c * p[1] * dy;
  g->d010_000 += tmp;
  g->d000_010 -= tmp;

  tmp = c * p[1] * dz;
  g->d001_000 += tmp;
  g->d000_001 -= tmp;

  if (split == MGRID_TAYLOR1) return;

  /* weight 2 */
  tmp = c * p[2] * dx * dy;
  g->d110_000 += tmp;
  g->d000_110 += tmp;
  g->d100_010 -= tmp;
  g->d010_100 -= tmp;

  tmp = c * p[2] * dx * dz;
  g->d101_000 += tmp;
  g->d000_101 += tmp;
  g->d100_001 -= tmp;
  g->d001_100 -= tmp;

  tmp = c * p[2] * dy * dz;
  g->d011_000 += tmp;
  g->d000_011 += tmp;
  g->d010_001 -= tmp;
  g->d001_010 -= tmp;

  tmp = c * (p[2] * dx*dx + p[1] * dd);
  g->d100_100 -= tmp;
  tmp = c * (p[2] * dy*dy + p[1] * dd);
  g->d010_010 -= tmp;
  tmp = c * (p[2] * dz*dz + p[1] * dd);
  g->d001_001 -= tmp;

  if (split == MGRID_TAYLOR2 || split == MGRID_ERRMIN3) return;

  /* weight 3 */
  tmp = c * p[3] * dx * dy * dz;
  g->d111_000 += tmp;
  g->d110_001 -= tmp;
  g->d101_010 -= tmp;
  g->d011_100 -= tmp;
  g->d100_011 += tmp;
  g->d010_101 += tmp;
  g->d001_110 += tmp;
  g->d000_111 -= tmp;

  tmp = c * (p[3] * dx*dx * dy + p[2] * dd * dy);
  g->d110_100 -= tmp;
  g->d100_110 += tmp;

  tmp = c * (p[3] * dx*dx * dz + p[2] * dd * dz);
  g->d101_100 -= tmp;
  g->d100_101 += tmp;

  tmp = c * (p[3] * dy*dy * dx + p[2] * dd * dx);
  g->d110_010 -= tmp;
  g->d010_110 += tmp;

  tmp = c * (p[3] * dy*dy * dz + p[2] * dd * dz);
  g->d011_010 -= tmp;
  g->d010_011 += tmp;

  tmp = c * (p[3] * dz*dz * dx + p[2] * dd * dx);
  g->d101_001 -= tmp;
  g->d001_101 += tmp;

  tmp = c * (p[3] * dz*dz * dy + p[2] * dd * dy);
  g->d011_001 -= tmp;
  g->d001_011 += tmp;

  if (split == MGRID_TAYLOR3) return;

  /* weight 4 */
  tmp = c * (p[4] * dx*dx * dy * dz + p[3] * dd * dy * dz);
  g->d111_100 -= tmp;
  g->d100_111 -= tmp;
  g->d110_101 += tmp;
  g->d101_110 += tmp;

  tmp = c * (p[4] * dy*dy * dx * dz + p[3] * dd * dx * dz);
  g->d111_010 -= tmp;
  g->d010_111 -= tmp;
  g->d110_011 += tmp;
  g->d011_110 += tmp;

  tmp = c * (p[4] * dz*dz * dx * dy + p[3] * dd * dx * dy);
  g->d111_001 -= tmp;
  g->d001_111 -= tmp;
  g->d101_011 += tmp;
  g->d011_101 += tmp;

  tmp = c * (p[4] * dx*dx * dy*dy + p[3] * dx*dx * dd
      + p[3] * dd * dy*dy + p[2] * dd * dd);
  g->d110_110 += tmp;
  tmp = c * (p[4] * dx*dx * dz*dz + p[3] * dx*dx * dd
      + p[3] * dd * dz*dz + p[2] * dd * dd);
  g->d101_101 += tmp;
  tmp = c * (p[4] * dy*dy * dz*dz + p[3] * dy*dy * dd
      + p[3] * dd * dz*dz + p[2] * dd * dd);
  g->d011_011 += tmp;

  if (split == MGRID_TAYLOR4) return;

  /* weight 5 */
  tmp = c * (p[5] * dx*dx * dy*dy * dz + p[4] * dx*dx * dd * dz
      + p[4] * dd * dy*dy * dz + p[3] * dd * dd * dz);
  g->d111_110 += tmp;
  g->d110_111 -= tmp;

  tmp = c * (p[5] * dx*dx * dz*dz * dy + p[4] * dx*dx * dd * dy
      + p[4] * dd * dz*dz * dy + p[3] * dd * dd * dy);
  g->d111_101 += tmp;
  g->d101_111 -= tmp;

  tmp = c * (p[5] * dy*dy * dz*dz * dx + p[4] * dy*dy * dd * dx
      + p[4] * dd * dz*dz * dx + p[3] * dd * dd * dx);
  g->d111_011 += tmp;
  g->d011_111 -= tmp;

  if (split == MGRID_TAYLOR5) return;

  /* weight 6 */
  tmp = c * (p[6] * dx*dx * dy*dy * dz*dz + p[5] * dx*dx * dy*dy * dd
      + p[5] * dx*dx * dd * dz*dz + p[5] * dd * dy*dy * dz*dz
      + p[4] * dx*dx * dd * dd + p[4] * dd * dy*dy * dd
      + p[4] * dd * dd * dz*dz + p[3] * dd * dd * dd);
  g->d111_111 -= tmp;
}


void done(Mgrid *mg)
{
  int32 n;

  for (n = 0;  n < mg->param.nlevels;  n++) {
    mgrid_lattice_done(&(mg->qgrid[n]));
    mgrid_lattice_done(&(mg->egrid[n]));
    mgrid_lattice_done(&(mg->gdsum_list[n]));
    mgrid_lattice_done(&(mg->is_zero[n]));
  }
  for (n = 0;  n < mg->param.nlevels-1;  n++) {
    mgrid_lattice_done(&(mg->opres_list[n]));
    mgrid_lattice_done(&(mg->oppro_list[n]));
  }
  free(mg->qgrid);
  free(mg->egrid);
  free(mg->gdsum_list);
  free(mg->gdsum_radius_list);
  free(mg->is_zero);
}


int force(Mgrid *mg, MgridSystem *sys)
{
  int32 n;

  /* first charge grid must be zeroed */
  if (mgrid_lattice_zero(&(mg->qgrid[0]))) return MGRID_FAIL;

  /* computation is "inverse V-cycle" */
  if (atoms_to_grid(mg, sys)) return MGRID_FAIL;
  for (n = 0;  n < mg->param.nlevels - 1;  n++) {
    if (direct_sum(mg, n)) return MGRID_FAIL;
    if (restrict_grid(mg, n)) return MGRID_FAIL;
  }
  if (direct_sum(mg, n)) return MGRID_FAIL;
  for (--n;  n >= 0;  --n) {
    if (prolong_grid(mg, n)) return MGRID_FAIL;
  }
  if (grid_to_atoms(mg, sys)) return MGRID_FAIL;

  return 0;
}


int atoms_to_grid(Mgrid *mg, const MgridSystem *sys)
{
  double xphi[2], yphi[2], zphi[2];     /* phi grid func along x, y, z */
  double xpsi[2], ypsi[2], zpsi[2];     /* psi grid func along x, y, z */
  double dx_h, dy_h, dz_h;              /* distance between atom and origin */
  double t;                             /* param to phi function */
  double c_zphi, c_zpsi;
  double c_yphi_zphi, c_yphi_zpsi;
  double c_ypsi_zphi, c_ypsi_zpsi;
  const double h = mg->param.spacing;   /* grid spacing */
  const double h_1 = mg->inv_spacing;   /* inverse grid spacing */
  const MD_Dvec origin = mg->origin;    /* lowest corner of domain */
  const double *q = sys->charge;        /* atom charges */
  const MD_Dvec *p = sys->pos;          /* atom positions */

  HermGridpt *qh = (HermGridpt *)(mg->qgrid[0].data);
      /* treat charge grid as 3D-lattice, index (i,j,k) */

  const int32 ndim = mg->qgrid[0].ni;  /* number points in each dimension */
  const int32 natoms = mg->param.natoms;  /* number of atoms */
  int32 n, i, j, k, ilo, jlo, klo;
  int32 index, koff, jkoff;

  FLT(h);
  VEC(origin);

  for (n = 0;  n < natoms;  n++) {

    /* distance between atom and corner measured in grid points */
    dx_h = (p[n].x - origin.x) * h_1;
    dy_h = (p[n].y - origin.y) * h_1;
    dz_h = (p[n].z - origin.z) * h_1;

    /* find smallest numbered grid point in stencil */
    ilo = ((int32) dx_h);
    jlo = ((int32) dy_h);
    klo = ((int32) dz_h);

    /* find t for x dimension and compute xphi */
    t = dx_h - (double) ilo;
    xphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    xpsi[0] = h * t * (1 - t) * (1 - t);
    t--;
    xphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    xpsi[1] = h * t * (1 + t) * (1 + t);

    /* find t for y dimension and compute yphi */
    t = dy_h - (double) jlo;
    yphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    ypsi[0] = h * t * (1 - t) * (1 - t);
    t--;
    yphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    ypsi[1] = h * t * (1 + t) * (1 + t);

    /* find t for z dimension and compute zphi */
    t = dz_h - (double) klo;
    zphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    zpsi[0] = h * t * (1 - t) * (1 - t);
    t--;
    zphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    zpsi[1] = h * t * (1 + t) * (1 + t);

    /* determine charge on 8=2*2*2 qh grid point stencil */
    for (k = 0;  k < 2;  k++) {
      koff = (k + klo) * ndim;
      c_zphi = zphi[k] * q[n];
      c_zpsi = zpsi[k] * q[n];
      for (j = 0;  j < 2;  j++) {
        jkoff = (koff + (j + jlo)) * ndim;
        c_yphi_zphi = yphi[j] * c_zphi;
        c_ypsi_zphi = ypsi[j] * c_zphi;
        c_yphi_zpsi = yphi[j] * c_zpsi;
        c_ypsi_zpsi = ypsi[j] * c_zpsi;
        for (i = 0;  i < 2;  i++) {
          index = jkoff + (i + ilo);
          ASSERT(&qh[index]
              == mgrid_lattice_elem(&(mg->qgrid[0]), i+ilo, j+jlo, k+klo));
          qh[index].d000 += xphi[i] * c_yphi_zphi;
          qh[index].d100 += xpsi[i] * c_yphi_zphi;
          qh[index].d010 += xphi[i] * c_ypsi_zphi;
          qh[index].d001 += xphi[i] * c_yphi_zpsi;
          qh[index].d110 += xpsi[i] * c_ypsi_zphi;
          qh[index].d101 += xpsi[i] * c_yphi_zpsi;
          qh[index].d011 += xphi[i] * c_ypsi_zpsi;
          qh[index].d111 += xpsi[i] * c_ypsi_zpsi;
        }
      }
    }
  } /* end loop over n */
#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("charge grid point qh(%d,%d,%d):\n", i, j, k);
  index = (k * ndim + j) * ndim + i;
  print_gridpoint(&qh[index]);
#endif
  return 0;
}


int grid_to_atoms(Mgrid *mg, MgridSystem *sys)
{
  double xphi[2], yphi[2], zphi[2];     /* phi grid func along x, y, z */
  double xpsi[2], ypsi[2], zpsi[2];     /* psi grid func along x, y, z */
  double dxphi[2], dyphi[2], dzphi[2];  /* dphi grid func along x, y, z */
  double dxpsi[2], dypsi[2], dzpsi[2];  /* dpsi grid func along x, y, z */
  double dx_h, dy_h, dz_h;              /* distance between atom and origin */
  double t;                             /* param to phi and dphi functions */
  double c_yphi_zphi, c_ypsi_zphi, c_yphi_zpsi, c_ypsi_zpsi;
  double c_yphi_dzphi, c_ypsi_dzphi, c_yphi_dzpsi, c_ypsi_dzpsi;
  double c_dyphi_zphi, c_dypsi_zphi, c_dyphi_zpsi, c_dypsi_zpsi;
  MD_Dvec fv;
  double u;
  const double u_self = mg->u_self;     /* self potential */
  const double h = mg->param.spacing;   /* grid spacing */
  const double h_1 = mg->inv_spacing;   /* inverse grid spacing */
  const MD_Dvec origin = mg->origin;    /* lowest corner of domain */
  const double *q = sys->charge;        /* atom charges */
  const MD_Dvec *p = sys->pos;          /* atom positions */
  MD_Dvec *f = (sys->f_long ? sys->f_long : sys->f_elec);
      /* accumulate contributions to atom force in f_long if allocated */

  HermGridpt *eh = (HermGridpt *)(mg->egrid[0].data);
      /* treat potential grid as 3D-lattice, index (i,j,k) */

  const HermGridpt *evh = (HermGridpt *)(mg->egrid[0].databuffer);
  const HermGridpt *qvh = (HermGridpt *)(mg->qgrid[0].databuffer);
      /* treat potential and charge grids as 1D arrays, index n */

  const int32 nelems = mg->egrid[0].nelems;  /* total number of grid points */
  const int32 ndim = mg->egrid[0].ni;    /* number points in each dimension */
  const int32 natoms = mg->param.natoms;     /* number of atoms */

  int32 n, i, j, k, ilo, jlo, klo;
  int32 index, koff, jkoff;

#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("potential grid point eh(%d,%d,%d):\n", i, j, k);
  index = (k * ndim + j) * ndim + i;
  print_gridpoint(&eh[index]);
#endif

  /* compute force */
  for (n = 0;  n < natoms;  n++) {

    /* distance between atom and corner measured in grid points */
    dx_h = (p[n].x - origin.x) * h_1;
    dy_h = (p[n].y - origin.y) * h_1;
    dz_h = (p[n].z - origin.z) * h_1;

    /* find largest numbered grid point in stencil */
    ilo = ((int32) dx_h);
    jlo = ((int32) dy_h);
    klo = ((int32) dz_h);

    /* find t for x dimension, compute xphi and dxphi */ 
    t = dx_h - (double) ilo;
    xphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    xpsi[0] = h * t * (1 - t) * (1 - t);
    dxphi[0] = -6 * t * (1 - t) * h_1;
    dxpsi[0] = (1 - t) * (1 - 3*t);
    t--;
    xphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    xpsi[1] = h * t * (1 + t) * (1 + t);
    dxphi[1] = -6 * t * (1 + t) * h_1;
    dxpsi[1] = (1 + t) * (1 + 3*t);

    /* find t for y dimension, compute yphi and dyphi */
    t = dy_h - (double) jlo;
    yphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    ypsi[0] = h * t * (1 - t) * (1 - t);
    dyphi[0] = -6 * t * (1 - t) * h_1;
    dypsi[0] = (1 - t) * (1 - 3*t);
    t--;
    yphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    ypsi[1] = h * t * (1 + t) * (1 + t);
    dyphi[1] = -6 * t * (1 + t) * h_1;
    dypsi[1] = (1 + t) * (1 + 3*t);

    /* find t for z dimension, compute zphi and dzphi */
    t = dz_h - (double) klo;
    zphi[0] = (1 - t) * (1 - t) * (1 + 2*t);
    zpsi[0] = h * t * (1 - t) * (1 - t);
    dzphi[0] = -6 * t * (1 - t) * h_1;
    dzpsi[0] = (1 - t) * (1 - 3*t);
    t--;
    zphi[1] = (1 + t) * (1 + t) * (1 - 2*t);
    zpsi[1] = h * t * (1 + t) * (1 + t);
    dzphi[1] = -6 * t * (1 + t) * h_1;
    dzpsi[1] = (1 + t) * (1 + 3*t);

    /* determine 8=2*2*2 eh grid stencil contribution to force */
    fv.x = fv.y = fv.z = 0;
    for (k = 0;  k < 2;  k++) {
      koff = (k + klo) * ndim;
      for (j = 0;  j < 2;  j++) {
        jkoff = (koff + (j + jlo)) * ndim;
        c_yphi_zphi = yphi[j] * zphi[k];
        c_ypsi_zphi = ypsi[j] * zphi[k];
        c_yphi_zpsi = yphi[j] * zpsi[k];
        c_ypsi_zpsi = ypsi[j] * zpsi[k];
        c_yphi_dzphi = yphi[j] * dzphi[k];
        c_ypsi_dzphi = ypsi[j] * dzphi[k];
        c_yphi_dzpsi = yphi[j] * dzpsi[k];
        c_ypsi_dzpsi = ypsi[j] * dzpsi[k];
        c_dyphi_zphi = dyphi[j] * zphi[k];
        c_dypsi_zphi = dypsi[j] * zphi[k];
        c_dyphi_zpsi = dyphi[j] * zpsi[k];
        c_dypsi_zpsi = dypsi[j] * zpsi[k];
        for (i = 0;  i < 2;  i++) {
          index = jkoff + (i + ilo);
          ASSERT(&eh[index]
              == mgrid_lattice_elem(&(mg->egrid[0]), i+ilo, j+jlo, k+klo));
          fv.x += dxphi[i] * (c_yphi_zphi * eh[index].d000
              + c_ypsi_zphi * eh[index].d010
              + c_yphi_zpsi * eh[index].d001
              + c_ypsi_zpsi * eh[index].d011)
            + dxpsi[i] * (c_yphi_zphi * eh[index].d100
              + c_ypsi_zphi * eh[index].d110
              + c_yphi_zpsi * eh[index].d101
              + c_ypsi_zpsi * eh[index].d111);
          fv.y += xphi[i] * (c_dyphi_zphi * eh[index].d000
              + c_dypsi_zphi * eh[index].d010
              + c_dyphi_zpsi * eh[index].d001
              + c_dypsi_zpsi * eh[index].d011)
            + xpsi[i] * (c_dyphi_zphi * eh[index].d100
              + c_dypsi_zphi * eh[index].d110
              + c_dyphi_zpsi * eh[index].d101
              + c_dypsi_zpsi * eh[index].d111);
          fv.z += xphi[i] * (c_yphi_dzphi * eh[index].d000
              + c_ypsi_dzphi * eh[index].d010
              + c_yphi_dzpsi * eh[index].d001
              + c_ypsi_dzpsi * eh[index].d011)
            + xpsi[i] * (c_yphi_dzphi * eh[index].d100
              + c_ypsi_dzphi * eh[index].d110
              + c_yphi_dzpsi * eh[index].d101
              + c_ypsi_dzpsi * eh[index].d111);
        }
      }
    }

    /* update force */
    f[n].x -= q[n] * fv.x;
    f[n].y -= q[n] * fv.y;
    f[n].z -= q[n] * fv.z;

  } /* end loop over n */

  /* compute potential */
  u = 0.0;
  for (n = 0;  n < nelems;  n++) {
    u += qvh[n].d000 * evh[n].d000 + qvh[n].d100 * evh[n].d100
      + qvh[n].d010 * evh[n].d010 + qvh[n].d001 * evh[n].d001
      + qvh[n].d110 * evh[n].d110 + qvh[n].d101 * evh[n].d101
      + qvh[n].d011 * evh[n].d011 + qvh[n].d111 * evh[n].d111;
  }

  /* must subtract self potential, save long range potential, add to total */
  sys->u_long = (0.5 * u) - u_self;
  sys->u_elec += sys->u_long;

  /* if separate long range force array, add to total force */
  if (sys->f_long != NULL) {
    for (n = 0;  n < natoms;  n++) {
      sys->f_elec[n].x += sys->f_long[n].x;
      sys->f_elec[n].y += sys->f_long[n].y;
      sys->f_elec[n].z += sys->f_long[n].z;
    }
  }

  return 0;
}



/*
 * stencil offsets from a central grid point on a finer grid level;
 * these offsets are where function weights above have been evaluated
 * on h-grid wrt to 2h-grid
 */
static const int32 OFFSET[NSTENCIL] = { -1, 0, 1 };



int restrict_grid(Mgrid *mg, int32 level)
{
  HermGridpt q2h_sum;

  /* lattices of potential */
  const HermGridpt *qh = (HermGridpt *)(mg->qgrid[level].data);  /* finer */
  HermGridpt *q2h = (HermGridpt *)(mg->qgrid[level+1].data);     /* coarser */

  /* transfer matrix */
  const HermDirect *op = (HermDirect *)(mg->opres_list[level].data);

  /* for computing matrix vector product */
  const double *m;                    /* points to matrix */
  const double *v;                    /* points to vector */
  double *qv = (double *)(&q2h_sum);  /* points to resulting vector */

  /* finer grid - grid size and labeling identical in each dimension */
  const int32 na1 = mg->qgrid[level].ia;      /* lowest grid index */
  const int32 nb1 = mg->qgrid[level].ib;      /* highest grid index */
  const int32 ndim1 = mg->qgrid[level].ni;    /* number points along dim */

  /* coarser grid - grid size and labeling identical in each dimension */
  const int32 na2 = mg->qgrid[level+1].ia;    /* lowest grid index */
  const int32 nb2 = mg->qgrid[level+1].ib;    /* highest grid index */
  const int32 ndim2 = mg->qgrid[level+1].ni;  /* number points along dim */

  /* other variables */
  int32 i1, j1, k1, index1, jk1off, k1off;
  int32 i2, j2, k2, index2, jk2off, k2off;
  int32 i, j, k, indext, jktoff, ktoff;

  int32 n, ii, jj, kk;

  enum { VECLEN = 8 };

  ASSERT(VECLEN == sizeof(HermGridpt) / sizeof(double));
  ASSERT(VECLEN * VECLEN == sizeof(HermDirect) / sizeof(double));

  /* loop over coarser grid points */
  for (k2 = na2;  k2 <= nb2;  k2++) {
    k2off = k2 * ndim2;  /* coarser grid index offset for k-coord */
    k1 = k2 * 2;         /* k-coord of same-space point on finer grid */

    for (j2 = na2;  j2 <= nb2;  j2++) {
      jk2off = (k2off + j2) * ndim2;  /* add offset for j-coord coarser */
      j1 = j2 * 2;                    /* j-coord same-space finer grid */

      for (i2 = na2;  i2 <= nb2;  i2++) {
        index2 = jk2off + i2;         /* index in coarser grid */
        i1 = i2 * 2;                  /* i-coord same-space finer grid */

        /* accumulate charge contribution from finer grid stencil */
        for (n = 0;  n < VECLEN;  n++) {
          qv[n] = 0;  /* (this clears q2h_sum before accumulating) */
        }

        /* sum weighted potential contribution to finer grid stencil */
        for (k = 0;  k < NSTENCIL;  k++) {
          /* early loop termination if outside lattice */
          if (k1 + OFFSET[k] < na1) continue;
          else if (k1 + OFFSET[k] > nb1) break;
          k1off = (k1 + OFFSET[k]) * ndim1;  /* offset k-coord finer grid */
          ktoff = OFFSET[k] * NSTENCIL;

          for (j = 0;  j < NSTENCIL;  j++) {
            /* early loop termination if outside lattice */
            if (j1 + OFFSET[j] < na1) continue;
            else if (j1 + OFFSET[j] > nb1) break;
            jk1off = (k1off + (j1 + OFFSET[j])) * ndim1;  /* add offset j */
            jktoff = (ktoff + OFFSET[j]) * NSTENCIL;
            
            for (i = 0;  i < NSTENCIL;  i++) {
              /* early loop termination if outside lattice */
              if (i1 + OFFSET[i] < na1) continue;
              else if (i1 + OFFSET[i] > nb1) break;
              index1 = jk1off + (i1 + OFFSET[i]);    /* index in finer grid */
              indext = jktoff + OFFSET[i];

              ASSERT(&qh[index1]
                  == mgrid_lattice_elem(&(mg->qgrid[level]),
                    i1 + OFFSET[i], j1 + OFFSET[j], k1 + OFFSET[k]));
              ASSERT(&op[indext]
                  == mgrid_lattice_elem(&(mg->opres_list[level]),
                    OFFSET[i], OFFSET[j], OFFSET[k]));

              /* accumulate potential to finer grid */
              v = (const double *)(&qh[index1]);   /* point to vector */
              m = (const double *)(&op[indext]);   /* point to matrix */
              /* compute matrix vector product */
              for (kk = 0, jj = 0;  jj < VECLEN;  jj++) {
                for (ii = 0;  ii < VECLEN;  ii++, kk++) {
                  qv[jj] += m[kk] * v[ii];
                }
              } /* end matrix vector product */

            }
          }
        } /* end loop over finer grid points */

        /* assign accumulated values to the coarser grid */
        ASSERT(&q2h[index2]
            == mgrid_lattice_elem(&(mg->qgrid[level+1]), i2, j2, k2));
        q2h[index2] = q2h_sum;

      }
    }
  } /* end loop over coarser grid points */

  return 0;
}


int prolong_grid(Mgrid *mg, int32 level)
{
  /* lattices of potential */
  HermGridpt *eh = (HermGridpt *)(mg->egrid[level].data);       /* finer grid */
  const HermGridpt *e2h = (HermGridpt *)(mg->egrid[level+1].data); /* coarser */

  /* transfer matrix */
  const HermDirect *op = (HermDirect *)(mg->oppro_list[level].data);

  /* for computing matrix vector product */
  const double *m;  /* points to matrix */
  const double *v;  /* points to vector */
  double *ev;       /* points to resulting vector */

  /* finer grid - grid size and labeling identical in each dimension */
  const int32 na1 = mg->egrid[level].ia;      /* lowest grid index */
  const int32 nb1 = mg->egrid[level].ib;      /* highest grid index */
  const int32 ndim1 = mg->egrid[level].ni;    /* number points along dim */

  /* coarser grid - grid size and labeling identical in each dimension */
  const int32 na2 = mg->egrid[level+1].ia;    /* lowest grid index */
  const int32 nb2 = mg->egrid[level+1].ib;    /* highest grid index */
  const int32 ndim2 = mg->egrid[level+1].ni;  /* number points along dim */

  /* other variables */
  int32 i1, j1, k1, index1, jk1off, k1off;
  int32 i2, j2, k2, index2, jk2off, k2off;
  int32 i, j, k, indext, jktoff, ktoff;

  int32 ii, jj, kk;

  enum { VECLEN = 8 };

  ASSERT(VECLEN == sizeof(HermGridpt) / sizeof(double));
  ASSERT(VECLEN * VECLEN == sizeof(HermDirect) / sizeof(double));

  /* loop over coarser grid points */
  for (k2 = na2;  k2 <= nb2;  k2++) {
    k2off = k2 * ndim2;  /* coarser grid index offset for k-coord */
    k1 = k2 * 2;         /* k-coord of same-space point on finer grid */

    for (j2 = na2;  j2 <= nb2;  j2++) {
      jk2off = (k2off + j2) * ndim2;  /* add offset for j-coord coarser */
      j1 = j2 * 2;                    /* j-coord same-space finer grid */

      for (i2 = na2;  i2 <= nb2;  i2++) {
        index2 = jk2off + i2;         /* index in coarser grid */
        i1 = i2 * 2;                  /* i-coord same-space finer grid */

        /* sum weighted potential contribution to finer grid stencil */
        for (k = 0;  k < NSTENCIL;  k++) {
          /* early loop termination if outside lattice */
          if (k1 + OFFSET[k] < na1) continue;
          else if (k1 + OFFSET[k] > nb1) break;
          k1off = (k1 + OFFSET[k]) * ndim1;  /* offset k-coord finer grid */
          ktoff = OFFSET[k] * NSTENCIL;

          for (j = 0;  j < NSTENCIL;  j++) {
            /* early loop termination if outside lattice */
            if (j1 + OFFSET[j] < na1) continue;
            else if (j1 + OFFSET[j] > nb1) break;
            jk1off = (k1off + (j1 + OFFSET[j])) * ndim1;  /* add offset j */
            jktoff = (ktoff + OFFSET[j]) * NSTENCIL;
            
            for (i = 0;  i < NSTENCIL;  i++) {
              /* early loop termination if outside lattice */
              if (i1 + OFFSET[i] < na1) continue;
              else if (i1 + OFFSET[i] > nb1) break;
              index1 = jk1off + (i1 + OFFSET[i]);    /* index in finer grid */
              indext = jktoff + OFFSET[i];

              ASSERT(&eh[index1]
                  == mgrid_lattice_elem(&(mg->egrid[level]),
                    i1 + OFFSET[i], j1 + OFFSET[j], k1 + OFFSET[k]));
              ASSERT(&e2h[index2]
                  == mgrid_lattice_elem(&(mg->egrid[level+1]), i2, j2, k2));
              ASSERT(&op[indext]
                  == mgrid_lattice_elem(&(mg->oppro_list[level]),
                    OFFSET[i], OFFSET[j], OFFSET[k]));

              /* accumulate potential to finer grid */
              ev = (double *)(&eh[index1]);        /* point to vector */
              v = (const double *)(&e2h[index2]);  /* point to vector */
              m = (const double *)(&op[indext]);   /* point to matrix */
              /* compute matrix vector product */
              for (kk = 0, jj = 0;  jj < VECLEN;  jj++) {
                for (ii = 0;  ii < VECLEN;  ii++, kk++) {
                  ev[jj] += m[kk] * v[ii];
                }
              } /* end matrix vector product */

            }
          }
        } /* end loop over finer grid points */

      }
    }
  } /* end loop over coarser grid points */

  return 0;
}


int direct_sum(Mgrid *mg, int32 level)
{
  HermGridpt eh_sum;

  /* lattices of charge and potential */
  const HermGridpt *qh = (HermGridpt *)(mg->qgrid[level].data);
  HermGridpt *eh = (HermGridpt *)(mg->egrid[level].data);
  const int32 ndim = mg->qgrid[level].ni;  /* number points in each dim */
  const int32 na = mg->qgrid[level].ia;    /* lowest numbered point */
  const int32 nb = mg->qgrid[level].ib;    /* highest numbered point */

  /*
   * for last level use different block of weights that
   * produces pairwise interactions between all grid points
   */

  /* lattice of weights for short range charge influence on potential */
  const HermDirect *gd = (HermDirect *)(mg->gdsum_list[level].data);

  /* does matrix have all zeros? */
  const int32 *is_zero = (int32 *)(mg->is_zero[level].data);

  /* for computing matrix vector product */
  const double *gm;                  /* points to matrix */
  const double *qv;                  /* points to vector */
  double *ev = (double *)(&eh_sum);  /* points to resulting vector */

  /* dimension of gd lattice */
  const int32 ngdim = mg->gdsum_list[level].ni;

  /* "radius" of gd "sphere" */
  const int32 ngrad = mg->gdsum_radius_list[level];

  int32 i, j, k, ig, jg, kg, ii, jj, kk;
  int32 iga, igb, jga, jgb, kga, kgb;
  int32 index, jkoff, koff;
  int32 nindex, jknoff, knoff;
  int32 ngindex, jkgoff, kgoff;

  enum { VECLEN = 8 };

  ASSERT(VECLEN == sizeof(HermGridpt) / sizeof(double));
  ASSERT(VECLEN * VECLEN == sizeof(HermDirect) / sizeof(double));

  /* loop over all grid points */
  for (k = na;  k <= nb;  k++) {
    /* find k-coord index ranges that keep gd on grid */
    kga = (na - k > -ngrad ? na - k : -ngrad);
    kgb = (nb - k < ngrad ? nb - k : ngrad);

    koff = k * ndim;  /* grid index offset for k-coord */

    for (j = na;  j <= nb;  j++) {
      /* find j-coord index ranges that keep gd on grid */
      jga = (na - j > -ngrad ? na - j : -ngrad);
      jgb = (nb - j < ngrad ? nb - j : ngrad);

      jkoff = (koff + j) * ndim;  /* add offset for j-coord */

      for (i = na;  i <= nb;  i++) {
        /* find i-coord index ranges that keep gd on grid */
        iga = (na - i > -ngrad ? na - i : -ngrad);
        igb = (nb - i < ngrad ? nb - i : ngrad);

        index = jkoff + i;  /* index for potential grid point */

#if 0
#ifdef DEBUG_WATCH
        printf("(%d, %d, %d):  sphere index (%d..%d, %d..%d, %d..%d)\n",
            i, j, k, iga, igb, jga, jgb, kga, kgb);
#endif
#endif
        /* sum over "sphere" of weighted charge */
        eh_sum.d000 = 0.0;
        eh_sum.d100 = 0.0;
        eh_sum.d010 = 0.0;
        eh_sum.d001 = 0.0;
        eh_sum.d110 = 0.0;
        eh_sum.d101 = 0.0;
        eh_sum.d011 = 0.0;
        eh_sum.d111 = 0.0;
        for (kg = kga;  kg <= kgb;  kg++) {
          knoff = (k + kg) * ndim;  /* grid index offset for k-coord */
          kgoff = kg * ngdim;       /* gd index offset for k-coord */

          for (jg = jga;  jg <= jgb;  jg++) {
            jknoff = (knoff + (j + jg)) * ndim;  /* add offset for j-coord */
            jkgoff = (kgoff + jg) * ngdim;       /* gd add offset j-coord */

            for (ig = iga;  ig <= igb;  ig++) {
              nindex = jknoff + (i + ig);  /* index of charge contribution */
              ngindex = jkgoff + ig;       /* index of gd weighting factor */
              ASSERT(&is_zero[ngindex]
                  == mgrid_lattice_elem(&(mg->is_zero[level]), ig, jg, kg));
              /* don't have to multiply by a matrix of all zeros */
              if (is_zero[ngindex]) continue;
              ASSERT(&qh[nindex]
                  == mgrid_lattice_elem(&(mg->qgrid[level]),
                    i + ig, j + jg, k + kg));
              ASSERT(&gd[ngindex]
                  == mgrid_lattice_elem(&(mg->gdsum_list[level]), ig, jg, kg));

              /* sum weighted charge */
              qv = (const double *)(&qh[nindex]);  /* point to charge vector */
              gm = (const double *)(&gd[ngindex]); /* point to direct matrix */
              /* compute matrix vector product */
              for (kk = 0, jj = 0;  jj < VECLEN;  jj++) {
                for (ii = 0;  ii < VECLEN;  ii++, kk++) {
                  ev[jj] += gm[kk] * qv[ii];
                }
              }

            }
          }
        } /* end loop over "sphere" of charge */

        ASSERT(&eh[index]
            == mgrid_lattice_elem(&(mg->egrid[level]), i, j, k));

        /* store potential */
        eh[index] = eh_sum;
      }
    }
  } /* end loop over all grid points */

#ifdef DEBUG_WATCH
  i = 1;  j = 1;  k = 1;
  printf("(direct sum) potential grid point e%dh(%d,%d,%d):\n",
      1 << level, i, j, k);
  index = (k * ndim + j) * ndim + i;
  ASSERT(&eh[index]
      == mgrid_lattice_elem(&(mg->egrid[level]), i, j, k));
  print_gridpoint(&eh[index]);
#endif

  return 0;
}
