/*
 * Copyright (C) 2004-2006 by Wei Wang.  All rights reserved.
 */


#include <assert.h>
#include <string.h>
#include "dpoly.h"
#include "pme_utilities.h"
#include "lattice.h"
#include "pme_direct.h"
#include "pme_recip.h"
#include "PmeRealSpace.h"
#include "PmeKSpace.h"
#include "constant.h"
#include "unit.h"
#include "helper.h"
#ifdef TIMING_DPOLY
# include "timer.h"
static MD_Double totaltime = 0.0;
static MD_Int tcounter = 0;
#endif

#define MAX_DEGREE 2    /* largest polynomial degree to approximate dipole */
#define DEBUG_EPSILON 1e-14  

static 
void add_dir_dforce_degree0(struct PmeDirect_Tag *dir,
			    const MD_Double dipole[],
			    const MD_Double charge[],
			    const MD_Double coeff[]);
static 
void add_dir_dforce_degree1(struct PmeDirect_Tag *dir,
			    const MD_Double dipole[],
			    const MD_Double charge[],
			    const MD_Double coeff[]);
static 
void add_dir_dforce_degree2(struct PmeDirect_Tag *dir,
			    const MD_Double dipole[],
			    const MD_Double charge[],
			    const MD_Double coeff[]);
typedef 
void (*add_dir_dforce_type)(struct PmeDirect_Tag *dir,
			    const MD_Double dipoles[],
			    const MD_Double charge[],
			    const MD_Double coeff[]);
static 
add_dir_dforce_type (add_dir_charge_dipole_force[MAX_DEGREE+1]) = {
  add_dir_dforce_degree0, 
  add_dir_dforce_degree1, 
  add_dir_dforce_degree2,
};

/* C does not allow to declare "const double **", so dipoles, and gridpotens
   has to be declared this way  */
static
void compute_rec_force_degree0(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[]);
static 
void compute_rec_force_degree1(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[]);
static 
void compute_rec_force_degree2(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[]);

typedef 
void (*compute_rec_force_type)(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[]);
static 
compute_rec_force_type (compute_rec_force[MAX_DEGREE+1]) = {
  compute_rec_force_degree0, 
  compute_rec_force_degree1, 
  compute_rec_force_degree2,
};

static
void lastd_degree1(const MD_Int nd, MD_Double *d[], const MD_Double invdiag[],
		   const MD_Double coeff[], const MD_Double negdirG2di[]);
static
void lastd_degree2(const MD_Int nd, MD_Double *d[], const MD_Double invdiag[],
		   const MD_Double coeff[], const MD_Double negdirG2di[]);
typedef 
void (*lastd_type)(const MD_Int, MD_Double *d[], const MD_Double invdiag[],
		   const MD_Double coeff[], const MD_Double negdirG2di[]);
lastd_type lastd[] = {NULL, lastd_degree1, lastd_degree2};




MD_Errcode dpoly_init(struct DPoly_Tag *dp, struct PmeParams_Tag *params,
		      MD_Int degree)
{
  MD_Int nd = params->natoms*3;

  assert(dp);
  assert(params);
  assert(degree >= 0);
  dp->degree = degree;
  assert(dp->degree <= MAX_DEGREE);
  assert(nd > 0);

  printf("dpoly module (non-iterative): degree=%d\n", dp->degree);

  dp->pme = calloc((size_t)1, sizeof *dp->pme);
  assert(dp->pme);
  if (pme_init(dp->pme, params)) {
    fprintf(stderr, "failed to initialize pme\n");
    return MD_FAIL;
  }

  dp->invdiag = my_malloc(nd * sizeof(MD_Double), "invdiag");
  pme_fill_diagonal(dp->pme, dp->invdiag);
  {MD_Int i; for(i=0;i<nd; i++) dp->invdiag[i]=1./dp->invdiag[i];};

  dp->dipoles=my_malloc((dp->degree+1)*nd*sizeof(MD_Double), "ds");

  {
    struct PmeGrid_Tag *myGrid = &(dp->pme->rec->myGrid);
    size_t gridsize = (size_t) myGrid->K1 * myGrid->dim2  * myGrid->dim3;
    dp->gridpotens=my_malloc((dp->degree+2)*gridsize*sizeof(MD_Double), "es");
  }

  dp->work = my_malloc(nd*sizeof(MD_Double), "work");

  return OK;
}


MD_Errcode dpoly_destroy(struct DPoly_Tag *dp)
{
#ifdef TIMING_DPOLY
  printf("total time cost for dpoly = %f seconds, average = %f second/evaluation\n", 
	 totaltime, totaltime / (MD_Double) tcounter);
#endif

  if (pme_destroy(dp->pme)) {
    fprintf(stderr, "failed to destroy pme in dpoly destructor\n");
    return MD_FAIL;
  }
  free(dp->dipoles);
  free(dp->gridpotens);
  free(dp->invdiag);
  free(dp->work);
  return OK;
}


const MD_Double *dpoly_get_dipole(const struct DPoly_Tag *dp)
{
  return dp->dipoles + dp->degree * dp->pme->savePmeParams.natoms * 3;
}
MD_Double dpoly_get_energy(const struct DPoly_Tag *dp)
{
  return dp->pme->potential;
}
const MD_Dvec *dpoly_get_force(const struct DPoly_Tag *dp)
{
  return dp->pme->force;
}


void lastd_degree1(const MD_Int nd, MD_Double *d[], const MD_Double invdiag[],
		   const MD_Double coeff[], const MD_Double negdirG2di[])
{
  MD_Double * const d0 = d[0];
  MD_Double * const d1 = d[1];
  const MD_Double a = coeff[1];
  const MD_Double b = coeff[0];
  MD_Int j;
  for (j=0; j<nd; j++) {
    d1[j] = a*invdiag[j]*(d1[j] - negdirG2di[j]) + b*d0[j];
  }
}
void lastd_degree2(const MD_Int nd, MD_Double *d[], const MD_Double invdiag[],
		   const MD_Double coeff[], const MD_Double negdirG2di[])
{
  const MD_Double * const d0 = d[0];
  const MD_Double * const d1 = d[1];
  MD_Double * const d2 = d[2];
  const MD_Double u =  coeff[2];
  const MD_Double v = -(coeff[2] + coeff[1]);
  const MD_Double w = coeff[0] + coeff[1] + coeff[2];
  MD_Int j;
  for (j=0; j<nd; j++) {
    d2[j] = u*invdiag[j]*(negdirG2di[j] - d2[j]) + v*d1[j] + w*d0[j];
  }
}


#ifdef HAVE_FFTW
#define  COMPUTE_RECED(rec, d, e) { \
pmerealspace_fill_dipoles(rec->myRealSpace,rec->scaled_pos,rec->lattice,d,e);\
rfftwnd_one_real_to_complex(rec->forward_plan, e, NULL);                     \
pmekspace_apply_potential(rec->myKSpace, rec->lattice, e);                   \
rfftwnd_one_complex_to_real(rec->backward_plan, (fftw_complex *)e, NULL);    \
}
#else
#define  COMPUTE_RECED(rec, d, e)  NEED_FFTW(1);
#endif


MD_Errcode dpoly_compute(struct DPoly_Tag *dp)
{
  struct Pme_Tag *pme = dp->pme;
  struct PmeParams_Tag *params = &(pme->savePmeParams);
  struct PmeDirect_Tag *dir = pme->dir;
  struct PmeRecip_Tag *rec = pme->rec;
  struct PmeGrid_Tag *myGrid = &(rec->myGrid);
  const MD_Int gridsize = myGrid->K1 * myGrid->dim2  * myGrid->dim3;
  MD_Double *d[MAX_DEGREE+1];
  MD_Double *e[MAX_DEGREE+2];
  MD_Double *d0;
  const MD_Double *invdiag = dp->invdiag;
  const MD_Double *neg_dirG1q = dir->neg_dirG1q; /* modified by dir_setup */
  const MD_Int nd = 3 * params->natoms;
  const MD_Int degree = dp->degree;
  MD_Int i;
  /* see thesis dpoly chapter */
  const MD_Double ccc[][MAX_DEGREE+1] = {  /* ad hoc */
    {1.15,     0, 0},        /* degree 0 */
    {1.05, -1.20, 0},    /* degree 1 */
#if 0
    {1,-1,1}   /* debug */
#else
    {0.99, -1.07, 1.28}  /* degree 2 */
#endif
  };
  const MD_Double *coeff;

#ifdef TIMING_DPOLY
  MD_Double tstart = time_of_day();
#endif

  assert(degree <= MAX_DEGREE);

  coeff = ccc[degree];

  for (i=0; i <=degree; i++) {
    d[i] = dp->dipoles + i*nd;
    e[i] = dp->gridpotens + i*gridsize;
  }
  e[degree+1] = e[degree] + gridsize;
  d0 = d[0];

  /* I.1 compute charge-charge energy, force, -dirG1*q, store B array */
  pmedir_dipole_setup(dir, params, pme->charge, params->ppos);
  /* I.2 bspline, qh, exp term inited */
  pmerec_dipole_setup(rec, params->ppos, pme->charge);

  /* II compute dipole, rec_pot sequence */
  /* II.1 compute e0 */
#ifdef HAVE_FFTW
  memcpy(e[0], rec->q_arr, gridsize * sizeof(MD_Double));
  rfftwnd_one_real_to_complex(rec->forward_plan, e[0], NULL);
  pmekspace_apply_potential(rec->myKSpace, rec->lattice, e[0]);
  rfftwnd_one_complex_to_real(rec->backward_plan, (fftw_complex *)e[0], NULL);
#else
  NEED_FFTW(1);
#endif
  /* II.2 compute d0 */
  pmerealspace_ungrid_dipoles(rec->myRealSpace, rec->scaled_pos, 
			      rec->lattice, e[0], d0);
  if (0 == degree) {
    for (i=0; i<nd; i++) d0[i] = coeff[0]*invdiag[i]*(neg_dirG1q[i]-d0[i]);
  } else {
    for (i=0; i<nd; i++) d0[i] = invdiag[i] * (neg_dirG1q[i]-d0[i]);
  }
#ifdef DEBUG_DPOLY
  printf("<d0,d0>=%20.15f\n", DOT(d0, d0, nd));
#endif

  /* II.3 compute e1 */
  COMPUTE_RECED(rec, d0, e[1]); 

  for(i=1; i<=degree; i++) {
    MD_Double *negdirG2di = dp->work; 
    MD_Double *di = d[i];
    /* compute d[i] */
    pmerealspace_ungrid_dipoles(rec->myRealSpace, rec->scaled_pos, 
				rec->lattice, e[i], di);
    pmedir_compute_pseudores(dir, d[i-1], 0, negdirG2di);
    if (i < degree) {  
      MD_Int j;
      for (j=0; j<nd; j++) di[j] = d0[j] + (negdirG2di[j]-di[j])*invdiag[j];
    } else {
      lastd[i](nd, d, invdiag, coeff, negdirG2di); 
    }
#ifdef DEBUG_DPOLY
    printf("<d%1d,d%1d>=%20.15f\n", i, i, DOT(di, di, nd));
#endif
    COMPUTE_RECED(rec, di, e[i+1]);
  }

  dir->potential -= 0.5 * DOT(d[degree], neg_dirG1q, nd);

  /* III: compute force, degree specific */
  add_dir_charge_dipole_force[degree](dir, dp->dipoles, pme->charge, coeff);
  compute_rec_force[degree](rec->force, rec->myRealSpace, 
			    &(rec->lattice),rec->scaled_pos,  
			    pme->charge, dp->dipoles, dp->gridpotens, coeff);
#ifdef DEBUG_DPOLY
  { MD_Double f2 = 0.0;
    MD_Dvec ftot = {0.0, 0.0, 0.0};
    const MD_Int natoms = pme->savePmeParams.natoms;
    const MD_Dvec* force = dir->force;
    printf("direct potential (internal unit): %20.15f\n", dir->potential);
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(force[i], force[i]);
    printf("direct <f,f>=%20.15f\n", f2);
    for (i=0; i<natoms; i++) MD_vec_add(ftot, force[i], ftot);
    if (fabs(ftot.x) > DEBUG_EPSILON || fabs(ftot.y) > DEBUG_EPSILON ||
	fabs(ftot.z) > DEBUG_EPSILON) {
      printf("direct sumf=(%g, %g, %g), should be 0\n", 
	     ftot.x, ftot.y, ftot.z);
      ftot.x=ftot.y=ftot.z = 0.0;
    }
    force = rec->force;
    f2 = 0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(force[i], force[i]);
    printf("rec <f,f>=%20.15f\n", f2);
  }
#endif


  /* IV: reciprocal potential, rec force zero out */
  {
    const MD_Double *qh = rec->q_arr; /* should be Ih0 * q */
    const MD_Double *e0 = e[0];
    const MD_Double *en = e[degree+1];
    MD_Double pot = 0.0;
    for (i=0; i<gridsize; i++) pot += (e0[i] + en[i]) * qh[i];
    rec->potential = 0.5 * pot;
#ifdef DEBUG_DPOLY
    printf("reciprocal potential (internal unit): %20.15f\n", rec->potential);
#endif
#ifndef PME_CONSERVE_ENERGY
    {
    MD_Dvec ftot = {0.0, 0.0, 0.0};
    MD_Dvec *f = pme->rec->force;
    MD_Int natoms = pme->savePmeParams.natoms;
    for (i=0;  i<natoms; i++) MD_vec_add(ftot, f[i], ftot);
    MD_vec_mul(ftot, 1.0/natoms, ftot);
    for (i=0;  i<natoms; i++) MD_vec_substract(f[i], ftot, f[i]);
    }
#endif
  }


  /* V: sum up */
  {
    const MD_Dvec *df = dir->force;		
    const MD_Dvec *rf = rec->force;	
    MD_Dvec *tf = pme->force;		
    const MD_Double c = COULOMB_SQR;	
    const MD_Int natoms = pme->savePmeParams.natoms;
    for (i=0;  i<natoms;  i++) MD_vec_add_mul(df[i], rf[i], c, tf[i]);
    pme->potential = (pme->qq_self_potential + dir->potential +	
                      rec->potential) * c;		
  }

#ifdef TIMING_DPOLY
  totaltime += time_of_day() - tstart;
  tcounter ++;
#endif


  return OK;
}



#if 0
MD_Errcode dpoly_compute(struct DPoly_Tag *dp)
{
  struct Pme_Tag *pme = dp->pme;
  struct PmeParams_Tag *params = &(pme->savePmeParams);
  struct PmeDirect_Tag *dir = pme->dir;
  struct PmeRecip_Tag *rec = pme->rec;
  struct PmeGrid_Tag *myGrid = &(rec->myGrid);
  const MD_Int gridsize = myGrid->K1 * myGrid->dim2  * myGrid->dim3;
  MD_Double *d[MAX_DEGREE+1];
  MD_Double *e[MAX_DEGREE+2];
  const MD_Double *invdiag = dp->invdiag;
  const MD_Double *neg_dirG1q = dir->neg_dirG1q; /* modified by dir_setup */
  const MD_Int nd = 3 * params->natoms;
  MD_Int i;

  for (i=0; i <=dp->degree; i++) {
    d[i] = dp->dipoles + i*nd;
    e[i] = dp->gridpotens + i*gridsize;
  }
  e[dp->degree+1] = e[dp->degree] + gridsize;

  /* I.1 compute charge-charge energy, force, -dirG1*q, store B array */
  pmedir_dipole_setup(dir, params, pme->charge, params->ppos);
  /* I.2 bspline, qh, exp term inited */
  pmerec_dipole_setup(rec, params->ppos, pme->charge);

  /* II compute dipole */
  /* II.1 compute e0 */
  memcpy(e[0], rec->q_arr, gridsize * sizeof(MD_Double));
  rfftwnd_one_real_to_complex(rec->forward_plan, e[0], NULL);
  pmekspace_apply_potential(rec->myKSpace, rec->lattice, e[0]);
  rfftwnd_one_complex_to_real(rec->backward_plan, (fftw_complex *)e[0], NULL);
  /* II.2 compute d0 */
  pmerealspace_ungrid_dipoles(rec->myRealSpace, rec->scaled_pos, 
			      rec->lattice, e[0], d[0]);
  for (i=0; i<nd; i++) d[0][i] = invdiag[i] * (neg_dirG1q[i]-d[0][i]);
#ifdef DEBUG_DPOLY
  printf("<d0,d0>=%20.15f\n", DOT(d[0], d[0], nd));
#endif
  /* II.3 compute e1 */
  pmerealspace_fill_dipoles(rec->myRealSpace, rec->scaled_pos, rec->lattice,
                            d[0], e[1]);
  rfftwnd_one_real_to_complex(rec->forward_plan, e[1], NULL);
  pmekspace_apply_potential(rec->myKSpace, rec->lattice, e[1]);
  rfftwnd_one_complex_to_real(rec->backward_plan, (fftw_complex *)e[1], NULL);

  if (dp->degree > 0) { 
    MD_Double *negdirG2di = dp->work; 
    const MD_Double *d0 = d[0];
    for(i=1; i<=dp->degree; i++) {
      MD_Double *di = d[i];
      MD_Int j;
      /* compute d[i] */
      pmerealspace_ungrid_dipoles(rec->myRealSpace, rec->scaled_pos, 
				  rec->lattice, e[i], di);
      pmedir_compute_pseudores(dir, d[i-1], 0, negdirG2di);
      for (j=0; j<nd; j++) di[j] = d0[j] + (negdirG2di[j]-di[j]) * invdiag[j];
#ifdef DEBUG_DPOLY
      printf("<d%1d,d%1d>=%20.15f\n", i, i, DOT(di, di, nd));
#endif
      /* compute e[i+1] */
      pmerealspace_fill_dipoles(rec->myRealSpace, rec->scaled_pos, 
				rec->lattice, di, e[i+1]);
      rfftwnd_one_real_to_complex(rec->forward_plan, e[i+1], NULL);
      pmekspace_apply_potential(rec->myKSpace, rec->lattice, e[i+1]);
      rfftwnd_one_complex_to_real(rec->backward_plan, (fftw_complex *)e[i+1], 
				  NULL);
    }
  }

  dir->potential -= 0.5 * DOT(d[dp->degree], neg_dirG1q, nd);
#ifdef DEBUG_DPOLY
  printf("direct potential (internal unit): %20.15f\n", dir->potential);
#endif

  /* III: compute force, degree specific */
  add_dir_charge_dipole_force[dp->degree](dir, dp->dipoles, pme->charge, 1.0);
  compute_rec_force[dp->degree](rec->force, rec->myRealSpace, 
				&(rec->lattice),rec->scaled_pos,  
				pme->charge, dp->dipoles, dp->gridpotens, 1.0);

  {
    const MD_Double *qh = rec->q_arr; /* should be Ih0 * q */
    const MD_Double *e0 = e[0];
    const MD_Double *en = e[dp->degree+1];
    MD_Double pot = 0.0;
    MD_Dvec ftot = {0.0, 0.0, 0.0};
    MD_Dvec *f = pme->rec->force;
    MD_Int natoms = pme->savePmeParams.natoms;
    for (i=0; i<gridsize; i++) pot += (e0[i] + en[i]) * qh[i];
    rec->potential = 0.5 * pot;
#ifdef DEBUG_DPOLY
    if (1 == dp->degree) {
      printf("reciprocal potential (internal unit): %20.15f\n", rec->potential);
      pot = 0.0;
      pmerealspace_fill_dipoles(rec->myRealSpace, rec->scaled_pos, 
				rec->lattice, d[1], e[2]);
      for(i=0; i<gridsize;i++) pot+=(qh[i] + e[2][i]) * e[0][i];
      printf("reciprocal potential (2): %20.15f\n", pot*0.5);
    }
#endif
    for (i=0;  i<natoms; i++) MD_vec_add(ftot, f[i], ftot);
    MD_vec_mul(ftot, 1.0/natoms, ftot);
    for (i=0;  i<natoms; i++) MD_vec_substract(f[i], ftot, f[i]);
  }


  {
    const MD_Dvec *df = dir->force;		
    const MD_Dvec *rf = rec->force;	
    MD_Dvec *tf = pme->force;		
    const MD_Double c = COULOMB_SQR;	
    const MD_Int natoms = pme->savePmeParams.natoms;
    for (i=0;  i<natoms;  i++) MD_vec_add_mul(df[i], rf[i], c, tf[i]);
    pme->potential = (pme->qq_self_potential + dir->potential +	
                      rec->potential) * c;		
  }

  return OK;
}
#endif

/* charge-dipole force = G2_ij * (di*qj - dj*qi)
 * G2 = B1 * I - B2 * r * r^T, I: identity matrix, r: column vector 
 * G2*v = B1*v - B2*dot(r,v)*r,  B1, B2: scaler 
 */
#define CALC_QD_FORCE(qi, qj, di, dj, rij, b1, b2, df) {	\
    MD_Double rdq;						\
    MD_Dvec dq;							\
    dq.x = di[X]*qj - dj[X]*qi;					\
    dq.y = di[Y]*qj - dj[Y]*qi;					\
    dq.z = di[Z]*qj - dj[Z]*qi;					\
    rdq = b2 * MD_vec_dot(rij, dq);				\
    df.x = b1*dq.x - rdq*rij.x;					\
    df.y = b1*dq.y - rdq*rij.y;					\
    df.z = b1*dq.z - rdq*rij.z;					\
  }

/* dipole-dipole force = c * di^T*G3*dj 
 * G3 = B3*r1*r2*r3 - B2(r1 + r2 + r3), 3D matrix
 * di^T*G3*dj = -B2*(dot(di,dj)*r + dot(r,di)*dj + dot(r,dj)*di)
 *	        +B3*dot(r,di)*dot(r,dj)*r 
 * all input must be "unit" quantities, cannot be expression 
 */
#define ADD_DD_FORCE(di, dj, r, b2, b3, f, c) {		    	        \
    MD_Double rdi = r.x*di[X] + r.y*di[Y] + r.z*di[Z];			\
    MD_Double rdj = r.x*dj[X] + r.y*dj[Y] + r.z*dj[Z];			\
    MD_Double b3rdrd = b3 * rdi * rdj;					\
    MD_Double dd = di[X]*dj[X] + di[Y]*dj[Y] + di[Z]*dj[Z];		\
    f.x += (c) * (b3rdrd*r.x - b2 * (dd*r.x + rdi*dj[X] + rdj*di[X]));  \
    f.y += (c) * (b3rdrd*r.y - b2 * (dd*r.y + rdi*dj[Y] + rdj*di[Y]));  \
    f.z += (c) * (b3rdrd*r.z - b2 * (dd*r.z + rdi*dj[Z] + rdj*di[Z]));  \
  }

/* a simlified version of pmedir_dipole_force */
void add_dir_dforce_degree0(struct PmeDirect_Tag *dir,
			    const MD_Double dipole[],
			    const MD_Double charge[],
			    const MD_Double coeff[] /* not used */)
{
  MD_Dvec *force = dir->force;
  const MD_Dvec *pos = dir->ppos;
  const MD_Dvec systemsize = dir->systemsize;
  const MD_Double *di, *dj;
  const MD_Int natoms = dir->natoms;
  MD_Dvec posi, rij, df;
  const MD_Double *b;
  MD_Double qi, qj;
  const MD_Int *neibrlist;
  MD_Int numneibrs;
  MD_Int i, j, jj;

  /* diagonal elements of G2 do not contribute */
  for (i = 0; i < natoms; i++) {
    qi = charge[i];
    di = dipole + 3*i;
    numneibrs = dir->numneibrs[i];
    neibrlist = dir->neibrlist[i];
    posi = pos[i];
    b = dir->b_arr[i] - 1;
    for (jj = 0; jj < numneibrs; jj++) {
      j = neibrlist[jj];
      ASSERT(0 <= j && j < natoms);
      qj = charge[j];
      dj = dipole + 3*j;
      MD_vec_substract(posi, pos[j], rij);
      SIMPLE_BOUND_VEC(rij, systemsize);
      ASSERT(MD_vec_dot(rij,rij) <= dir->cutoff * dir->cutoff);
      CALC_QD_FORCE(qi, qj, di, dj, rij, b[1], b[2], df);
      MD_vec_add(force[i], df, force[i]);
      MD_vec_substract(force[j], df, force[j]);
      b += 3;
    }
  }  
}


void add_dir_dforce_degree1(struct PmeDirect_Tag *dir,
			    const MD_Double dipole[],
			    const MD_Double charge[],
			    const MD_Double coeff[])
{
  MD_Dvec *force = dir->force;
  const MD_Dvec *pos = dir->ppos;
  const MD_Dvec systemsize = dir->systemsize;
  const MD_Int natoms = dir->natoms;
  const MD_Double *d0 = dipole;
  const MD_Double *d1 = d0 + natoms * 3;
  const MD_Double *d0i, *d0j, *d1i, *d1j;
  const MD_Double a = coeff[1];
  MD_Dvec posi, rij, df;
  const MD_Double *b;
  MD_Double qi, qj;
  const MD_Int *neibrlist;
  MD_Int numneibrs;
  MD_Int i, j, jj;

  /* diagonal elements of G2 do not contribute */
  for (i = 0; i < natoms; i++) {
    qi = charge[i];
    d0i = d0 + 3*i;
    d1i = d1 + 3*i;
    numneibrs = dir->numneibrs[i];
    neibrlist = dir->neibrlist[i];
    posi = pos[i];
    b = dir->b_arr[i] - 1;
    for (jj = 0; jj < numneibrs; jj++) {
      j = neibrlist[jj];
      ASSERT(0 != i-j);
      ASSERT(0 <= j && j < natoms);
      qj = charge[j];
      d0j = d0 + 3*j;
      d1j = d1 + 3*j;
      /* charge-dipole force */
      MD_vec_substract(posi, pos[j], rij);
      SIMPLE_BOUND_VEC(rij, systemsize);
      ASSERT(MD_vec_dot(rij,rij) <= dir->cutoff * dir->cutoff);
      CALC_QD_FORCE(qi, qj, d1i, d1j, rij, b[1], b[2], df);
      ADD_DD_FORCE(d0i, d0j, rij, b[2], b[3], df, a);
      MD_vec_add(force[i], df, force[i]);
      MD_vec_substract(force[j], df, force[j]);
      b += 3;
    }
  }
  
}


void add_dir_dforce_degree2(struct PmeDirect_Tag *dir,
			    const MD_Double dipole[],
			    const MD_Double charge[],
			    const MD_Double coeff[])
{
  MD_Dvec *force = dir->force;
  const MD_Dvec *pos = dir->ppos;
  const MD_Dvec systemsize = dir->systemsize;
  const MD_Int natoms = dir->natoms;
  const MD_Double *d0 = dipole;
  const MD_Double *d1 = d0 + natoms * 3;
  const MD_Double *d2 = d1 + natoms * 3;
  const MD_Double *d0i, *d0j, *d1i, *d1j, *d2i, *d2j;
  const MD_Double a = coeff[2];
  const MD_Double c = 2*a + coeff[1];
  MD_Dvec posi, rij, df;
  const MD_Double *b;
  MD_Double qi, qj;
  const MD_Int *neibrlist;
  MD_Int numneibrs;
  MD_Int i, j, jj;

  /* diagonal elements of G2 do not contribute */
  for (i = 0; i < natoms; i++) {
    qi = charge[i];
    d0i = d0 + 3*i;
    d1i = d1 + 3*i;
    d2i = d2 + 3*i;
    numneibrs = dir->numneibrs[i];
    neibrlist = dir->neibrlist[i];
    posi = pos[i];
    b = dir->b_arr[i] - 1;
    for (jj = 0; jj < numneibrs; jj++) {
      j = neibrlist[jj];
      ASSERT(0 != i-j);
      ASSERT(0 <= j && j < natoms);
      qj = charge[j];
      d0j = d0 + 3*j;
      d1j = d1 + 3*j;
      d2j = d2 + 3*j;
      MD_vec_substract(posi, pos[j], rij);
      SIMPLE_BOUND_VEC(rij, systemsize);
      ASSERT(MD_vec_dot(rij,rij) <= dir->cutoff * dir->cutoff);
      CALC_QD_FORCE(qi, qj, d2i, d2j, rij, b[1], b[2], df);
      ADD_DD_FORCE(d0i, d0j, rij, b[2], b[3], df, c);
      ADD_DD_FORCE(d0i, d1j, rij, b[2], b[3], df, -a);
      ADD_DD_FORCE(d0j, d1i, rij, b[2], b[3], df, -a);
      MD_vec_add(force[i], df, force[i]);
      MD_vec_substract(force[j], df, force[j]);
      b += 3;
    }
  }
  
}


/* modified from pmerealspace_dipole_calc_force */
void compute_rec_force_degree0(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[] /* not used */)
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int gridlen = K1 * dim2 * dim3;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  const MD_Double *ddM1 = prs->ddM;
  const MD_Double *ddM2 = ddM1 + order;
  const MD_Double *ddM3 = ddM2 + order;
  const MD_Double *dip, *v0, *v1;
  const MD_Double *dipole = dipoles;
  const MD_Double *e0 = gridpotens;
  const MD_Double *e1 = gridpotens + gridlen;
  MD_Dvec *fi;
  MD_Dvec t1, t2, t3;
  MD_Double f1, f2, f3;
  MD_Double q, d1, d2, d3;
  MD_Double cq1, cq2, cq3, d11, d12, d13, d22, d23, d33;
  MD_Double s3, sd3, sdd3, s23, sd23, s2d3, sdd23, sd2d3, s2dd3;
  MD_Double r3, rd3, r23, rd23, r2d3;
  MD_Double m1, m2, dm1, dm2, ddm1, ddm2;
  MD_Double term, term2;
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;

  MD_vec_mul(lattice->b1, K1, t1);
  MD_vec_mul(lattice->b2, K2, t2);
  MD_vec_mul(lattice->b3, K3, t3);
 
  dip = dipole; fi = force;  /* do not need to clear force array */
  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    cq1 = cq2 = cq3 = 0.0;
    d11 = d12 = d13 = d22 = d23 = d33 = 0.0;
    for (i1=0; i1<order; i1++) {
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      s23 = sd23 = s2d3 = sdd23 = sd2d3 = s2dd3 = 0.0;
      r23 = rd23 = r2d3 = 0.0;
      for (i2=0; i2<order; i2++) {
        u2++;
        ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
        v0 = e0 + ind2; 
	v1 = e1 + ind2;
        u3 = u3i;
        s3 = sd3 = sdd3 = 0.0;  r3 = rd3 = 0.0;
        for (i3=0; i3<order; i3++) {
          u3++;
          ind   = CYCLIC_INDEX(u3,K3);
          term  = v0[ind];          term2 = term + v1[ind];
          s3   +=   M3[i3] * term;  r3   +=   M3[i3] * term2;
          sd3  +=  dM3[i3] * term;  rd3  +=  dM3[i3] * term2;
          sdd3 += ddM3[i3] * term;  
        }
          m2 =   M2[i2];
         dm2 =  dM2[i2];
        ddm2 = ddM2[i2];
        s23  += m2 * s3;
        sd23 += dm2 * s3;
        s2d3 += m2 * sd3;
        sdd23 += ddm2 * s3;
        sd2d3 += dm2 * sd3;
        s2dd3 += m2 * sdd3;
	r23  += m2 * r3;
	rd23 += dm2 * r3;
	r2d3 += m2 * rd3;
      }
        m1 =   M1[i1];
       dm1 =  dM1[i1];
      ddm1 = ddM1[i1];
      cq1 -= dm1 * r23;
      cq2 -= m1 * rd23;
      cq3 -= m1 * r2d3;
      d11 -= ddm1 * s23;
      d12 -= dm1 * sd23;
      d13 -= dm1 * s2d3;
      d22 -= m1 * sdd23;
      d23 -= m1 * sd2d3;
      d33 -= m1 * s2dd3;
    }
    /* f_k = - \sum_m (q_k (I1)_k,m + (I2)_k,m d_k) v(m)
     *     = - \sum_m (q_k T^t (DI0) + T^t (D^2 I0) T * d_k) * v
     *     = T^t [q_k * \sum_m(-DI0 * v) + (\sum_m (-D^2 I0) * v) * (T * d_k)]
     *   [K1 b1^t]   [t1^t]
     * T=[K2 b2^t] = [t2^t]  DI0=\grad I0, D^2 I0=\grad\grad I0 (3x3 matrix)
     *   [K3 b3^t]   [t3^t]
     */
    q = charge[iatom];
    T_MUL_d(t1, t2, t3, dip[0], dip[1], dip[2], d1, d2, d3);
    f1 = cq1*q + d11*d1 + d12*d2 + d13*d3;
    f2 = cq2*q + d12*d1 + d22*d2 + d23*d3;
    f3 = cq3*q + d13*d1 + d23*d2 + d33*d3;
    Ttrans_MUL_d(t1, t2, t3, f1, f2, f3, fi->x, fi->y, fi->z);
    dip += 3;  fi++;
      M1 += stride;   M2 += stride;   M3 += stride;
     dM1 += stride;  dM2 += stride;  dM3 += stride;
    ddM1 += stride; ddM2 += stride; ddM3 += stride;
  }

  return;
}


void compute_rec_force_degree1(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int gridlen = K1 * dim2 * dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  const MD_Double *ddM1 = prs->ddM;
  const MD_Double *ddM2 = ddM1 + order;
  const MD_Double *ddM3 = ddM2 + order;
  const MD_Double *d0 = dipoles;
  const MD_Double *d1 = d0 + 3*natoms;
  const MD_Double *e0 = gridpotens;
  const MD_Double *e1 = e0 + gridlen;
  const MD_Double *e2 = e1 + gridlen;
  const MD_Double *v0, *v1, *v2;
  const MD_Double a = coeff[1];
  MD_Dvec *fi;
  MD_Dvec t1, t2, t3;
  MD_Double f1, f2, f3;
  MD_Double q, td1, td2, td3;
  MD_Double cq1, cq2, cq3;
  MD_Double d11, d12, d13, d22, d23, d33;
  MD_Double z11, z12, z13, z22, z23, z33;
  MD_Double p3, pd3, pdd3, p23, p2d3, p2dd3, pd23, pd2d3, pdd23;
  MD_Double q3, qd3, qdd3, q23, q2d3, q2dd3, qd23, qd2d3, qdd23;
  MD_Double r3, rd3, r23, rd23, r2d3;
  MD_Double m1, m2, dm1, dm2, ddm1, ddm2;
  MD_Double term0, term1, term2;
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;

  MD_vec_mul(lattice->b1, K1, t1);
  MD_vec_mul(lattice->b2, K2, t2);
  MD_vec_mul(lattice->b3, K3, t3);
 
  fi = force;  /* do not need to clear force array */
  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    cq1 = cq2 = cq3 = 0.0;
    d11 = d12 = d13 = d22 = d23 = d33 = 0.0;
    z11 = z12 = z13 = z22 = z23 = z33 = 0.0;
    for (i1=0; i1<order; i1++) {
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      p23 = p2d3 = p2dd3 = pd23 = pd2d3 = pdd23 = 0.0;
      q23 = q2d3 = q2dd3 = qd23 = qd2d3 = qdd23 = 0.0;
      r23 = r2d3 = rd23  = 0.0;
      for (i2=0; i2<order; i2++) {
        u2++;
        ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
        v0 = e0 + ind2; 
	v1 = e1 + ind2;
	v2 = e2 + ind2;
        u3 = u3i;
	p3 = pd3 = pdd3 = q3 = qd3 = qdd3 = r3 = rd3 = 0.0; 
        for (i3=0; i3<order; i3++) {
          u3++;
          ind   = CYCLIC_INDEX(u3,K3);
          term0 = v0[ind];         
	  term1 = v1[ind];
	  term2 = term0 + v2[ind];
          p3   +=   M3[i3] * term0;  
          pd3  +=  dM3[i3] * term0; 
          pdd3 += ddM3[i3] * term0; 
	  q3   +=   M3[i3] * term1;
	  qd3  +=  dM3[i3] * term1;
	  qdd3 += ddM3[i3] * term1;
	  r3   +=   M3[i3] * term2;
	  rd3  +=  dM3[i3] * term2;
        }
        m2=M2[i2];  dm2=dM2[i2];  ddm2=ddM2[i2];
	p23  += m2*p3;   p2d3  += m2*pd3;   p2dd3 += m2*pdd3;
	pd23 += dm2*p3;  pd2d3 += dm2*pd3;  pdd23 += ddm2*p3;
	q23  += m2*q3;   q2d3  += m2*qd3;   q2dd3 += m2*qdd3;
	qd23 += dm2*q3;  qd2d3 += dm2*qd3;  qdd23 += ddm2*q3;
	r23  += m2*r3;   r2d3  += m2*rd3;   rd23  += dm2*r3;
      }
      m1 = M1[i1]; dm1 = dM1[i1];  ddm1 = ddM1[i1];
      cq1 += dm1 * r23;  cq2 += m1 * rd23;  cq3 += m1 * r2d3;
      d11 += ddm1 * p23; d12 += dm1 * pd23; d13 += dm1 * p2d3;
      d22 += m1 * pdd23; d23 += m1 * pd2d3; d33 += m1 * p2dd3;
      z11 += ddm1 * q23; z12 += dm1 * qd23; z13 += dm1 * q2d3;
      z22 += m1 * qdd23; z23 += m1 * qd2d3; z33 += m1 * q2dd3;
    }
    /*   [K1 b1^t]   [t1^t]
     * T=[K2 b2^t] = [t2^t]  DI0=\grad I0, D^2 I0=\grad\grad I0 (3x3 matrix)
     *   [K3 b3^t]   [t3^t]
     * \grad is with respect to u, not r.
     * f_k = - \sum_m (I1)_{k,m} * (e0+e2)_m * q_k
     *       +a\sum_m (I2)_{k,m} * e1_m * d0_k
     *       - \sum_m (I2)_{k,m} * e0_m * d1_k
     *     = T^t [- (sum_m (e0+e2)_m * (DI0)_{k,m}) * q_k          (1)
     *            +a(\sum_m (e1)_m * (D^2 I0)_{k,m}) (T * d0_k)    (2)
     *            - (\sum_m (e0)_m * (D^2 I0)_{k,m}) (T * d1_k)]   (3)
     */
    q = charge[iatom];
    f1 = -cq1 * q;
    f2 = -cq2 * q;
    f3 = -cq3 * q;                                              /* (1) */
    T_MUL_d(t1, t2, t3, d0[0], d0[1], d0[2], td1, td2, td3);    
    f1 += a * (z11*td1 + z12*td2 + z13*td3);
    f2 += a * (z12*td1 + z22*td2 + z23*td3);
    f3 += a * (z13*td1 + z23*td2 + z33*td3);                    /* (2) */
    T_MUL_d(t1, t2, t3, d1[0], d1[1], d1[2], td1, td2, td3);    
    f1 -= d11*td1 + d12*td2 + d13*td3;
    f2 -= d12*td1 + d22*td2 + d23*td3;
    f3 -= d13*td1 + d23*td2 + d33*td3;                          /* (3) */
    Ttrans_MUL_d(t1, t2, t3, f1, f2, f3, fi->x, fi->y, fi->z);
    d0 += 3;  d1 += 3; fi++;
      M1 += stride;   M2 += stride;   M3 += stride;
     dM1 += stride;  dM2 += stride;  dM3 += stride;
    ddM1 += stride; ddM2 += stride; ddM3 += stride;
  }

  return;
}


void compute_rec_force_degree2(MD_Dvec force[], 
			       const struct PmeRealSpace_Tag *prs,
			       const struct Lattice_Tag *lattice,
			       const MD_Dvec scaled_pos[],
			       const MD_Double charge[],
			       const MD_Double dipoles[],
			       const MD_Double gridpotens[],
			       const MD_Double coeff[])
{
  const MD_Int K1   = prs->myGrid.K1;
  const MD_Int K2   = prs->myGrid.K2;
  const MD_Int K3   = prs->myGrid.K3;
  const MD_Int dim2 = prs->myGrid.dim2;
  const MD_Int dim3 = prs->myGrid.dim3;
  const MD_Int gridlen = K1 * dim2 * dim3;
  const MD_Int order  = prs->myGrid.order;
  const MD_Int stride = 3*order;
  const MD_Int natoms = prs->natoms;
  const MD_Double *M1 = prs->M;
  const MD_Double *M2 = M1 + order;
  const MD_Double *M3 = M2 + order;
  const MD_Double *dM1 = prs->dM;
  const MD_Double *dM2 = dM1 + order;
  const MD_Double *dM3 = dM2 + order;
  const MD_Double *ddM1 = prs->ddM;
  const MD_Double *ddM2 = ddM1 + order;
  const MD_Double *ddM3 = ddM2 + order;
  const MD_Double *d0 = dipoles;
  const MD_Double *d1 = d0 + 3*natoms;
  const MD_Double *d2 = d1 + 3*natoms;
  const MD_Double *e0 = gridpotens;
  const MD_Double *e1 = e0 + gridlen;
  const MD_Double *e2 = e1 + gridlen;
  const MD_Double *e3 = e2 + gridlen;
  const MD_Double *v0, *v1, *v2, *v3;
  const MD_Double a = coeff[2];
  const MD_Double c = 2.0*a + coeff[1];
  MD_Dvec *fi;
  MD_Dvec t1, t2, t3;
  MD_Double f1, f2, f3;
  MD_Double q, td1, td2, td3, z0, z1, z2;
  MD_Double cq1, cq2, cq3;
  MD_Double ie0_11, ie0_12, ie0_13, ie0_22, ie0_23, ie0_33; /* Ih2*e0 */
  MD_Double ie1_11, ie1_12, ie1_13, ie1_22, ie1_23, ie1_33; /* Ih2*e1 */
  MD_Double ie2_11, ie2_12, ie2_13, ie2_22, ie2_23, ie2_33; /* Ih2*e2 */
  MD_Double p3, pd3, pdd3, p23, p2d3, p2dd3, pd23, pd2d3, pdd23;
  MD_Double q3, qd3, qdd3, q23, q2d3, q2dd3, qd23, qd2d3, qdd23;
  MD_Double r3, rd3, rdd3, r23, r2d3, r2dd3, rd23, rd2d3, rdd23;
  MD_Double s3, sd3, s23, sd23, s2d3;
  MD_Double m1, m2, m3, dm1, dm2, dm3, ddm1, ddm2, ddm3;
  MD_Double term0, term1, term2, term3;
  MD_Int u1, u2, u3, u2i, u3i;
  MD_Int iatom, i1, i2, i3;
  MD_Int ind1, ind2, ind;

  MD_vec_mul(lattice->b1, K1, t1);
  MD_vec_mul(lattice->b2, K2, t2);
  MD_vec_mul(lattice->b3, K3, t3);
 
  fi = force;  /* do not need to clear force array */
  for (iatom=0; iatom<natoms; iatom++) {
    u1  = ((MD_Int)scaled_pos[iatom].x) - order;
    u2i = ((MD_Int)scaled_pos[iatom].y) - order;
    u3i = ((MD_Int)scaled_pos[iatom].z) - order;
    cq1 = cq2 = cq3 = 0.0;
    ie0_11 = ie0_12 = ie0_13 = ie0_22 = ie0_23 = ie0_33 = 0.0;
    ie1_11 = ie1_12 = ie1_13 = ie1_22 = ie1_23 = ie1_33 = 0.0;
    ie2_11 = ie2_12 = ie2_13 = ie2_22 = ie2_23 = ie2_33 = 0.0;
    for (i1=0; i1<order; i1++) {
      u1++;
      ind1 = CYCLIC_INDEX(u1, K1) * dim2;
      u2 = u2i;
      p23 = p2d3 = p2dd3 = pd23 = pd2d3 = pdd23 = 0.0;
      q23 = q2d3 = q2dd3 = qd23 = qd2d3 = qdd23 = 0.0;
      s23 = s2d3 = sd23  = 0.0;
      r23 = r2d3 = r2dd3 = rd23 = rd2d3 = rdd23 = 0.0;
      for (i2=0; i2<order; i2++) {
        u2++;
        ind2 = (ind1 + CYCLIC_INDEX(u2, K2))*dim3;
        v0 = e0 + ind2; 
	v1 = e1 + ind2;
	v2 = e2 + ind2;
	v3 = e3 + ind2;
        u3 = u3i;
	p3=pd3=pdd3=q3=qd3=qdd3=s3=sd3=r3=rd3=rdd3=0.0; 
        for (i3=0; i3<order; i3++) {
          u3++;
          ind   = CYCLIC_INDEX(u3,K3);
          term0 = v0[ind];         
	  term1 = v1[ind];
	  term2 = v2[ind];
	  term3 = term0 + v3[ind];
	  m3 = M3[i3]; dm3 = dM3[i3]; ddm3 = ddM3[i3];
          p3   +=   m3 * term0;  
          pd3  +=  dm3 * term0; 
          pdd3 += ddm3 * term0; 
	  q3   +=   m3 * term1;
	  qd3  +=  dm3 * term1;
	  qdd3 += ddm3 * term1;
	  r3   +=   m3 * term2;
	  rd3  +=  dm3 * term2;
	  rdd3 += ddm3 * term2;
	  s3   +=   m3 * term3;
	  sd3  +=  dm3 * term3;
        }
        m2=M2[i2];  dm2=dM2[i2];  ddm2=ddM2[i2];
	p23  += m2*p3;   p2d3  += m2*pd3;   p2dd3 += m2*pdd3;
	pd23 += dm2*p3;  pd2d3 += dm2*pd3;  pdd23 += ddm2*p3;
	q23  += m2*q3;   q2d3  += m2*qd3;   q2dd3 += m2*qdd3;
	qd23 += dm2*q3;  qd2d3 += dm2*qd3;  qdd23 += ddm2*q3;
	r23  += m2*r3;   r2d3  += m2*rd3;   r2dd3 += m2*rdd3;
	rd23 += dm2*r3;  rd2d3 += dm2*rd3;  rdd23 += ddm2*r3;	
	s23  += m2*s3;   s2d3  += m2*sd3;   sd23  += dm2*s3;
      }
      m1 = M1[i1]; dm1 = dM1[i1];  ddm1 = ddM1[i1];
      cq1 += dm1 * s23;  cq2 += m1 * sd23;  cq3 += m1 * s2d3;
      ie0_11 += ddm1 * p23; ie0_12 += dm1 * pd23; ie0_13 += dm1 * p2d3;
      ie0_22 += m1 * pdd23; ie0_23 += m1 * pd2d3; ie0_33 += m1 * p2dd3;
      ie1_11 += ddm1 * q23; ie1_12 += dm1 * qd23; ie1_13 += dm1 * q2d3;
      ie1_22 += m1 * qdd23; ie1_23 += m1 * qd2d3; ie1_33 += m1 * q2dd3;
      ie2_11 += ddm1 * r23; ie2_12 += dm1 * rd23; ie2_13 += dm1 * r2d3;
      ie2_22 += m1 * rdd23; ie2_23 += m1 * rd2d3; ie2_33 += m1 * r2dd3;
    }
    /*   [K1 b1^t]   [t1^t]      I0 = number
     * T=[K2 b2^t] = [t2^t]    D I0 = \grad I0, 3x1 vector
     *   [K3 b3^t]   [t3^t]  D^2 I0 = \grad\grad I0 (3x3 matrix)
     * \grad is with respect to u, not r.
     * f_k = - q_k * \sum_m (I1)_{k,m} * (e0+e3)_m
     *       - \sum_m (I2)_{k,m} * e0_m * d2_k
     *       - \sum_m (I2)_{k,m} * e1_m * (a*d1 - (2a+b)*d0)_k
     *       - \sum_m (I2)_{k,m} * e2_m * a * d0_k 
     * = T^t[- (sum_m (e0+e3)_m * (DI0)_{k,m}) * q_k                    (1)
     *       - (\sum_m (e0)_m * (D^2 I0)_{k,m}) (T * d2_k)              (2)
     *       - (\sum_m (e1)_m * (D^2 I0)_{k,m}) (T * (a*d1-(2a+b)d0)_k) (3)
     *       - (\sum_m (e2)_m * (D^2 I0)_{k,m}) (T * d0_k) * a]         (4)
     */
    q = charge[iatom];
    f1 = -cq1 * q;
    f2 = -cq2 * q;
    f3 = -cq3 * q;                                            /* (1) */
    T_MUL_d(t1, t2, t3, d0[0], d0[1], d0[2], td1, td2, td3);
    f1 -= a * (ie2_11*td1 + ie2_12*td2 + ie2_13*td3);
    f2 -= a * (ie2_12*td1 + ie2_22*td2 + ie2_23*td3);
    f3 -= a * (ie2_13*td1 + ie2_23*td2 + ie2_33*td3);         /* (4) */
    z0 = a*d1[0] - c*d0[0];	
    z1 = a*d1[1] - c*d0[1];
    z2 = a*d1[2] - c*d0[2];
    T_MUL_d(t1, t2, t3, z0, z1, z2, td1, td2, td3);
    f1 -= ie1_11*td1 + ie1_12*td2 + ie1_13*td3;
    f2 -= ie1_12*td1 + ie1_22*td2 + ie1_23*td3;
    f3 -= ie1_13*td1 + ie1_23*td2 + ie1_33*td3;               /* (3) */
    T_MUL_d(t1, t2, t3, d2[0], d2[1], d2[2], td1, td2, td3);
    f1 -= ie0_11*td1 + ie0_12*td2 + ie0_13*td3;
    f2 -= ie0_12*td1 + ie0_22*td2 + ie0_23*td3;
    f3 -= ie0_13*td1 + ie0_23*td2 + ie0_33*td3;               /* (2) */
    Ttrans_MUL_d(t1, t2, t3, f1, f2, f3, fi->x, fi->y, fi->z);
    d0 += 3;  d1 += 3;  d2 += 3;  fi++;
      M1 += stride;   M2 += stride;   M3 += stride;
     dM1 += stride;  dM2 += stride;  dM3 += stride;
    ddM1 += stride; ddM2 += stride; ddM3 += stride;
  }

  return;
}
