/*
 * Copyright (C) 2004-2006 by Wei Wang.  All rights reserved.
 */


#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include "PmeRealSpace.h"
#include "helper.h"
#include "utilities.h"
#include "pme_utilities.h"


MD_Errcode pmerealspace_init(struct PmeRealSpace_Tag *prs, 
			     const struct PmeGrid_Tag *grid, 
			     const MD_Int natoms)
{
  size_t msize;
  assert(NULL != prs);
  assert(NULL != grid);
  assert(0 < natoms);

  prs->natoms = natoms;
  prs->myGrid = *grid;
  msize = (size_t) natoms*3*prs->myGrid.order;
  prs->M   = my_calloc(msize, sizeof(MD_Double), "M");
  prs->dM  = my_calloc(msize, sizeof(MD_Double), "dM");
  prs->ddM = my_calloc(msize, sizeof(MD_Double), "ddM");

  return OK;
}


MD_Errcode pmerealspace_destroy(struct PmeRealSpace_Tag *prs)
{
  free(prs->M);
  free(prs->dM);
  free(prs->ddM);
  memset(prs, 0, sizeof *prs);

  return OK;
}


void pmerealspace_fill_bspline1(struct PmeRealSpace_Tag *prs, 
				const MD_Dvec p[]) 
{
  MD_Double fr[3]; 
  MD_Double *Mi, *dMi;
  const MD_Int natoms = prs->natoms;
  const MD_Int order = prs->myGrid.order;
  const MD_Int stride = 3 * order;
  MD_Int i;

  Mi = prs->M; dMi = prs->dM;
  for (i=0; i<natoms; i++) {
    fr[0] = p[i].x - floor(p[i].x);  /* MD_int ==> floor, x,y,z > 0 */
    fr[1] = p[i].y - floor(p[i].y);
    fr[2] = p[i].z - floor(p[i].z);
    compute_bspline1(fr, Mi, dMi, order); /* PmeBase.h */
    Mi  += stride;
    dMi += stride;
  }
}

void pmerealspace_fill_charges(const struct PmeRealSpace_Tag *prs,
			       const MD_Dvec scaled_pos[], 
			       const MD_Double charge[], 
			       MD_Double q_arr[]) 
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  MD_Int iatom, i1, i2, i3;
  MD_Double q;
  MD_Int u1, u2, u2i, u3i;
  MD_Double *qline;
  MD_Double m1, m1m2;
  MD_Int ind1, ind2, ind, u3;

  memset(q_arr, 0, K1 * dim2 * dim3 * sizeof(*q_arr));

  /* according to the definition of Q array (q_arr), a charge q
   * located at x, contributes to Q(k) by q*Mn(x-k).
   * Mn(y)!=0 only if 0<y<n. Let u = floor(x), then q has nonzero  
   * contribution to k=u-n+1, u-n+2, ..., u. or k=u-i for i=0to n-1.
   * Let f = x-u be the fractional part of x, then Mn(x-k) = Mn(u-k+f).
   *
   * weight  M[0]          M[n-1]
   *    0  Mn(n-1+f)       Mn(f)
   * ---|------|------|------|---#--|-------------->
   *   v-n   v-n+1           v   x  v+1    (n=order)
   * x: particle position (0<=x< K)
   * u = floor(x), f = x-u
  */
  for (iatom=0; iatom<natoms; iatom++) {
    q = charge[iatom];
    u1  = (MD_Int)scaled_pos[iatom].x - order;
    u2i = (MD_Int)scaled_pos[iatom].y - order;
    u3i = (MD_Int)scaled_pos[iatom].z - order;
    for (i1=0; i1<order; i1++) {
      m1 = M1[i1]*q;
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      for (i2=0; i2<order; i2++) {
        m1m2 = m1*M2[i2];
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2)) * dim3;
	qline = q_arr + ind2;
	u3 = u3i;
        for (i3=0; i3<order; i3++) {
	  u3++;
          ind = CYCLIC_INDEX(u3, K3);
          qline[ind] += m1m2 * M3[i3]; 
        }
      }
    }
    M1 += stride; M2 += stride; M3 += stride;
  }

  return;
}

#if 0  /* older, slower, but correct */
void pmerealspace_charge_calc_force(struct PmeRealSpace_Tag *prs,
				    MD_Double q_arr[],
				    const MD_Dvec scaled_pos[], 
				    const MD_Double charge[], 
				    const struct Lattice_Tag lattice,
				    MD_Dvec force[])
{  
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  MD_Double f1, f2, f3;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  MD_Double q, m1, m2, m3, d1, d2, d3, m1m2, m1d2, d1m2, term;          
  MD_Int u1, u2, u2i, u3i, u3;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;
  const MD_Double *qline;
  MD_Dvec r1, r2, r3;

  MD_vec_mul(lattice.b1, K1, r1);
  MD_vec_mul(lattice.b2, K2, r2);
  MD_vec_mul(lattice.b3, K3, r3);

  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    f1=f2=f3=0.0;
    for (i1=0; i1<order; i1++) {
      m1 =  M1[i1];
      d1 = dM1[i1];    
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      for (i2=0; i2<order; i2++) {
        m2 =  M2[i2];
	d2 = dM2[i2];    
	m1m2 = m1*m2;
	m1d2 = m1*d2;
	d1m2 = d1*m2;
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	qline = q_arr + ind2;
	u3 = u3i;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind = CYCLIC_INDEX(u3,K3);
	  term = qline[ind];
	  m3 =  M3[i3] * term;
	  d3 = dM3[i3] * term;  
	  f1 -= d1m2 * m3;
	  f2 -= m1d2 * m3;
	  f3 -= m1m2 * d3;
        }
      }
    }
    q = charge[iatom];
    f1 *= q; f2 *= q; f3 *= q;
    force[iatom].x = r1.x*f1 + r2.x*f2 + r3.x*f3;
    force[iatom].y = r1.y*f1 + r2.y*f2 + r3.y*f3;
    force[iatom].z = r1.z*f1 + r2.z*f2 + r3.z*f3;
     M1 += stride;  M2 += stride;  M3 += stride;
    dM1 += stride; dM2 += stride; dM3 += stride;
  }

  return;
}
#else  /* faster, should be correct */
void pmerealspace_charge_calc_force(struct PmeRealSpace_Tag *prs,
				    MD_Double q_arr[],
				    const MD_Dvec scaled_pos[], 
				    const MD_Double charge[], 
				    const struct Lattice_Tag lattice,
				    MD_Dvec force[])
{  
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  MD_Double f1, f2, f3;
  MD_Double q, m1, m2, dm1, dm2, term;  
  MD_Double s3, sd3, s23, sd23, s2d3;        
  MD_Int u1, u2, u2i, u3i, u3;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;
  const MD_Double *qline;
  MD_Dvec r1, r2, r3;

  MD_vec_mul(lattice.b1, K1, r1);
  MD_vec_mul(lattice.b2, K2, r2);
  MD_vec_mul(lattice.b3, K3, r3);

  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    f1=f2=f3=0.0;
    for (i1=0; i1<order; i1++) {
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      s23 = sd23 = s2d3 = 0.0;
      for (i2=0; i2<order; i2++) {   
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	qline = q_arr + ind2;
	u3 = u3i;
	s3 = sd3 = 0.0;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind = CYCLIC_INDEX(u3,K3);
	  term = qline[ind];
	  s3  += M3[i3] * term;
	  sd3 += dM3[i3] * term;
        }
        m2  =  M2[i2];
	dm2 = dM2[i2]; 
	s23 += m2 * s3;
	sd23 += dm2 * s3;
	s2d3 += m2 * sd3;
      }
      m1  =  M1[i1];
      dm1 = dM1[i1];    
      f1 -= dm1 * s23;
      f2 -= m1 * sd23;
      f3 -= m1 * s2d3;
    }
    q = charge[iatom];
    f1 *= q; f2 *= q; f3 *= q;
    force[iatom].x = r1.x*f1 + r2.x*f2 + r3.x*f3;
    force[iatom].y = r1.y*f1 + r2.y*f2 + r3.y*f3;
    force[iatom].z = r1.z*f1 + r2.z*f2 + r3.z*f3;
     M1 += stride;  M2 += stride;  M3 += stride;
    dM1 += stride; dM2 += stride; dM3 += stride;
  }

  return;
}
#endif


/*
 *
 * dipole computation functions
 *
 */
void pmerealspace_fill_bspline2(struct PmeRealSpace_Tag *prs, 
				 const MD_Dvec p[]) 
{
  MD_Double fr[3]; 
  MD_Double *Mi, *dMi, *ddMi;
  const MD_Int natoms = prs->natoms;
  const MD_Int order = prs->myGrid.order;
  const MD_Int stride = 3 * order;
  MD_Int i;

  Mi = prs->M; dMi = prs->dM; ddMi = prs->ddM;
  for (i=0; i<natoms; i++) {
    fr[0] = p[i].x - floor(p[i].x);  /* MD_int ==> floor, x,y,z > 0 */
    fr[1] = p[i].y - floor(p[i].y);
    fr[2] = p[i].z - floor(p[i].z);
    compute_bspline2(fr, Mi, dMi, ddMi, order); /* PmeBase.h */
    Mi  += stride; dMi += stride; ddMi+= stride;
  }
}


/* compute I1*d */
void pmerealspace_fill_dipoles(const struct PmeRealSpace_Tag *prs,
			       const MD_Dvec scaled_pos[], 
			       const struct Lattice_Tag lattice,
			       const MD_Double dipole[],
			       MD_Double d_arr[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  const MD_Double *dip;
  MD_Dvec t1, t2, t3;
  MD_Double d1, d2, d3, m1, m2, d1dm1, d1dm1m2, m1d2dm2, m1m2;
  MD_Double *dline;
  MD_Int iatom, i1, i2, i3;
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int ind1, ind2, ind3;

  MD_vec_mul(lattice.b1, K1, t1);
  MD_vec_mul(lattice.b2, K2, t2);
  MD_vec_mul(lattice.b3, K3, t3);

  memset(d_arr, 0, K1*dim2*dim3*sizeof(*d_arr)); 

  dip = dipole;
  for (iatom=0; iatom<natoms; iatom++) {
    T_MUL_d(t1, t2, t3, dip[0], dip[1], dip[2], d1, d2, d3);
    u1  = (MD_Int)scaled_pos[iatom].x - order;
    u2i = (MD_Int)scaled_pos[iatom].y - order;
    u3i = (MD_Int)scaled_pos[iatom].z - order;
    for (i1=0; i1<order; i1++) {
      m1 = M1[i1];
      d1dm1 = d1 * dM1[i1];
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      for (i2=0; i2<order; i2++) {
        m2 = M2[i2];
	d1dm1m2 = d1dm1*m2;
	m1d2dm2 = m1*d2*dM2[i2];
	m1m2 = m1*m2;
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	dline = d_arr + ind2;
	u3 = u3i;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind3 = CYCLIC_INDEX(u3, K3);
	  dline[ind3] += (d1dm1m2 + m1d2dm2)*M3[i3] + m1m2*d3*dM3[i3];
        }
      }
    }
     M1 += stride;   M2 += stride;   M3 += stride;
    dM1 += stride;  dM2 += stride;  dM3 += stride;
    dip += 3;
  }

#ifdef DEBUG_PME
  { MD_Double dsum = array_sum(d_arr, K1*dim2*dim3);
    if (fabs(dsum) > 1e-14) printf("dsum=%g\n", dsum);
  }
#endif
  return;
}


/* compute Ih1 * grid_vector */  
#if 0
void pmerealspace_ungrid_dipoles(const struct PmeRealSpace_Tag *prs,
				 const MD_Dvec scaled_pos[], 
				 const struct Lattice_Tag lattice,
				 const MD_Double d_arr[],
				 MD_Double dipole[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  const MD_Double *dline;
  MD_Dvec t1, t2, t3;
  MD_Double *dip = dipole;
  MD_Double m1, m2, m3, dm1, dm2, dm3, m1m2, m1dm2, dm1m2;
  MD_Double d1, d2, d3, d;
  MD_Int iatom, i1, i2, i3;
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int ind1, ind2, ind3;

  MD_vec_mul(lattice.b1, K1, t1);
  MD_vec_mul(lattice.b2, K2, t2);
  MD_vec_mul(lattice.b3, K3, t3);

  for (iatom=0; iatom<natoms; iatom++) {
    d1 = d2 = d3 = 0.0;
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    for (i1=0; i1<order; i1++) {
      m1 = M1[i1];
      dm1 = dM1[i1];
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      for (i2=0; i2<order; i2++) {
        m2 = M2[i2];
	dm2 = dM2[i2];
	m1m2 = m1*m2;
	m1dm2 = m1*dm2;
	dm1m2 = dm1*m2;
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	dline = d_arr + ind2;
	u3 = u3i;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind3 = CYCLIC_INDEX(u3, K3);
	  d = dline[ind3];
	  m3 = M3[i3] * d;
	  dm3 = dM3[i3] * d;
	  d1 += dm1m2*m3;
	  d2 += m1dm2*m3;
	  d3 += m1m2*dm3;
        }
      }
    }
    Ttrans_MUL_d(t1, t2, t3, d1, d2, d3, dip[X], dip[Y], dip[Z]);
    dip += 3;
    M1  += stride;   M2 += stride;   M3 += stride;
    dM1 += stride;  dM2 += stride;  dM3 += stride;
  }

  return;
}
#else
void pmerealspace_ungrid_dipoles(const struct PmeRealSpace_Tag *prs,
				 const MD_Dvec scaled_pos[], 
				 const struct Lattice_Tag lattice,
				 const MD_Double d_arr[],
				 MD_Double dipole[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  const MD_Double *dline;
  MD_Dvec t1, t2, t3;
  MD_Double *dip = dipole;
  MD_Double m1, m2;
  MD_Double s23, sd23, s2d3, s3, sd3;
  MD_Double d1, d2, d3, d;
  MD_Int iatom, i1, i2, i3;
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int ind1, ind2, ind3;

  MD_vec_mul(lattice.b1, K1, t1);
  MD_vec_mul(lattice.b2, K2, t2);
  MD_vec_mul(lattice.b3, K3, t3);

  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    d1 = d2 = d3 = 0.0;
    for (i1=0; i1<order; i1++) {
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      s23 = sd23 = s2d3 = 0.0;
      for (i2=0; i2<order; i2++) {
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	dline = d_arr + ind2;
	u3 = u3i;
	s3 = sd3 = 0.0;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind3 = CYCLIC_INDEX(u3, K3);
	  d = dline[ind3];
	  s3 += M3[i3]*d;
	  sd3 += dM3[i3]*d;
        }
        m2 = M2[i2];
	s23  += m2*s3;
	sd23 += dM2[i2]*s3;
	s2d3 += m2*sd3;
      }
      m1 = M1[i1];
      d1 += dM1[i1] * s23;
      d2 += m1 * sd23;
      d3 += m1 * s2d3;
    }
    Ttrans_MUL_d(t1, t2, t3, d1, d2, d3, dip[X], dip[Y], dip[Z]);
    dip += 3;
    M1  += stride;   M2 += stride;   M3 += stride;
    dM1 += stride;  dM2 += stride;  dM3 += stride;
  }

}
#endif


#if 0
void pmerealspace_dipole_calc_force(const struct PmeRealSpace_Tag *prs,
				    const MD_Double vh[], 
				    const MD_Dvec scaled_pos[], 
				    const MD_Double charge[],
				    const MD_Double dipole[],
				    const struct Lattice_Tag lattice,
				    MD_Dvec force[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double   *M1 = prs->M;
  const MD_Double   *M2 = M1 + order;
  const MD_Double   *M3 = M2 + order;
  const MD_Double  *dM1 = prs->dM;
  const MD_Double  *dM2 = dM1 + order;
  const MD_Double  *dM3 = dM2 + order;
  const MD_Double *ddM1 = prs->ddM;
  const MD_Double *ddM2 = ddM1 + order;
  const MD_Double *ddM3 = ddM2 + order;
  const MD_Double *dip, *vk;
  MD_Dvec *fi;
  MD_Dvec t1, t2, t3;
  MD_Double f1, f2, f3;
  MD_Double q, d1, d2, d3;
  MD_Double cq1, cq2, cq3, d11, d12, d13, d22, d23, d33;
  MD_Double m1, m2, m3, dm1, dm2, dm3, ddm1, ddm2, ddm3;
  MD_Double m1m2, m1dm2, dm1m2, ddm1m2, dm1dm2, m1ddm2;
  MD_Double term;          
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;

  MD_vec_mul(lattice.b1, K1, t1);
  MD_vec_mul(lattice.b2, K2, t2);
  MD_vec_mul(lattice.b3, K3, t3);

  dip = dipole; fi = force;  /* do not need to clear force array */
  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    cq1 = cq2 = cq3 = 0.0;
    d11 = d12 = d13 = d22 = d23 = d33 = 0.0;
    for (i1=0; i1<order; i1++) {
        m1 =   M1[i1];
       dm1 =  dM1[i1];
      ddm1 = ddM1[i1];      
      u1++; 
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      for (i2=0; i2<order; i2++) {
          m2 =   M2[i2];
	 dm2 =  dM2[i2];
	ddm2 = ddM2[i2];   
	  m1m2 = m1*m2;
	 m1dm2 = m1*dm2;
	 dm1m2 = dm1*m2;
	ddm1m2 = ddm1*m2;
	dm1dm2 = dm1*dm2;
	m1ddm2 = m1*ddm2;
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	vk = vh + ind2;
	u3 = u3i;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind = CYCLIC_INDEX(u3,K3);
	  term = vk[ind];
	    m3 =   M3[i3]*term;
	   dm3 =  dM3[i3]*term;
	  ddm3 = ddM3[i3]*term;
	  cq1 -= dm1m2*m3;
	  cq2 -= m1dm2*m3;
	  cq3 -= m1m2*dm3;
	  d11 -= ddm1m2*m3;
	  d12 -= dm1dm2*m3;
	  d13 -= dm1m2*dm3;
	  d22 -= m1ddm2*m3;
	  d23 -= m1dm2*dm3;
	  d33 -= m1m2*ddm3;
        }
      }
    }
    /* f_k = - \sum_m (q_k (I1)_k,m + (I2)_k,m d_k) v(m)
     *     = - \sum_m (q_k T^t (DI0) + T^t (D^2 I0) T * d_k) * v
     *     = T^t [q_k * \sum_m(-DI0 * v) + (\sum_m (-D^2 I0) * v) * (T * d_k)]
     *   [K1 b1^t]   [t1^t]
     * T=[K2 b2^t] = [t2^t]  DI0=\grad I0, D^2 I0=\grad\grad I0 (3x3 matrix)
     *   [K3 b3^t]   [t3^t]       
     */
    q = charge[iatom];
    T_MUL_d(t1, t2, t3, dip[0], dip[1], dip[2], d1, d2, d3);
    /*
    d1=d2=d3=0;
    */
    f1 = cq1*q + d11*d1 + d12*d2 + d13*d3;
    f2 = cq2*q + d12*d1 + d22*d2 + d23*d3;
    f3 = cq3*q + d13*d1 + d23*d2 + d33*d3;
    Ttrans_MUL_d(t1, t2, t3, f1, f2, f3, fi->x, fi->y, fi->z);
    dip += 3;  fi++;
      M1 += stride;   M2 += stride;   M3 += stride;
     dM1 += stride;  dM2 += stride;  dM3 += stride;
    ddM1 += stride; ddM2 += stride; ddM3 += stride;
  }    

  return;
}
#else 
void pmerealspace_dipole_calc_force(const struct PmeRealSpace_Tag *prs,
				    const MD_Double vh[], 
				    const MD_Dvec scaled_pos[], 
				    const MD_Double charge[],
				    const MD_Double dipole[],
				    const struct Lattice_Tag lattice,
				    MD_Dvec force[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double   *M1 = prs->M;
  const MD_Double   *M2 = M1 + order;
  const MD_Double   *M3 = M2 + order;
  const MD_Double  *dM1 = prs->dM;
  const MD_Double  *dM2 = dM1 + order;
  const MD_Double  *dM3 = dM2 + order;
  const MD_Double *ddM1 = prs->ddM;
  const MD_Double *ddM2 = ddM1 + order;
  const MD_Double *ddM3 = ddM2 + order;
  const MD_Double *dip, *vk;
  MD_Dvec *fi;
  MD_Dvec t1, t2, t3;
  MD_Double f1, f2, f3;
  MD_Double q, d1, d2, d3;
  MD_Double cq1, cq2, cq3, d11, d12, d13, d22, d23, d33;
  MD_Double s3, sd3, sdd3, s23, sd23, s2d3, sdd23, sd2d3, s2dd3; 
  MD_Double m1, m2, dm1, dm2, ddm1, ddm2;
  MD_Double term;          
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;

  MD_vec_mul(lattice.b1, K1, t1);
  MD_vec_mul(lattice.b2, K2, t2);
  MD_vec_mul(lattice.b3, K3, t3);

  dip = dipole; fi = force;  /* do not need to clear force array */
  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    cq1 = cq2 = cq3 = 0.0;
    d11 = d12 = d13 = d22 = d23 = d33 = 0.0;
    for (i1=0; i1<order; i1++) {    
      u1++; 
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      s23 = sd23 = s2d3 = sdd23 = sd2d3 = s2dd3 = 0.0;
      for (i2=0; i2<order; i2++) {
	u2++;
	ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
	vk = vh + ind2;
	u3 = u3i;
	s3 = sd3 = sdd3 = 0.0;
        for (i3=0; i3<order; i3++) {
	  u3++;
	  ind = CYCLIC_INDEX(u3,K3);
	  term = vk[ind];
	  s3   +=   M3[i3] * term;
	  sd3  +=  dM3[i3] * term;
	  sdd3 += ddM3[i3] * term;
        }
          m2 =   M2[i2];
	 dm2 =  dM2[i2];
	ddm2 = ddM2[i2];   
	s23  += m2 * s3;
	sd23 += dm2 * s3;
	s2d3 += m2 * sd3;
	sdd23 += ddm2 * s3;
	sd2d3 += dm2 * sd3;
	s2dd3 += m2 * sdd3;
      }
        m1 =   M1[i1];
       dm1 =  dM1[i1];
      ddm1 = ddM1[i1];  
      cq1 -= dm1 * s23;
      cq2 -= m1 * sd23;
      cq3 -= m1 * s2d3;
      d11 -= ddm1 * s23;
      d12 -= dm1 * sd23;
      d13 -= dm1 * s2d3;
      d22 -= m1 * sdd23;
      d23 -= m1 * sd2d3;
      d33 -= m1 * s2dd3;
    }
    /* f_k = - \sum_m (q_k (I1)_k,m + (I2)_k,m d_k) v(m)
     *     = - \sum_m (q_k T^t (DI0) + T^t (D^2 I0) T * d_k) * v
     *     = T^t [q_k * \sum_m(-DI0 * v) + (\sum_m (-D^2 I0) * v) * (T * d_k)]
     *   [K1 b1^t]   [t1^t]
     * T=[K2 b2^t] = [t2^t]  DI0=\grad I0, D^2 I0=\grad\grad I0 (3x3 matrix)
     *   [K3 b3^t]   [t3^t]       
     */
    q = charge[iatom];
    T_MUL_d(t1, t2, t3, dip[0], dip[1], dip[2], d1, d2, d3);
    f1 = cq1*q + d11*d1 + d12*d2 + d13*d3;
    f2 = cq2*q + d12*d1 + d22*d2 + d23*d3;
    f3 = cq3*q + d13*d1 + d23*d2 + d33*d3;
    Ttrans_MUL_d(t1, t2, t3, f1, f2, f3, fi->x, fi->y, fi->z);
    dip += 3;  fi++;
      M1 += stride;   M2 += stride;   M3 += stride;
     dM1 += stride;  dM2 += stride;  dM3 += stride;
    ddM1 += stride; ddM2 += stride; ddM3 += stride;
  }    

  return;
}
#endif
