/*
 * Copyright (C) 2004-2006 by Wei Wang.  All rights reserved.
 */


/* implementation of the standard Ewald sum, direct sum part */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <assert.h>
#include "timer.h"
#include "unit.h"
#include "constant.h"
#include "utilities.h"
#include "helper.h"
#include "standEwald_dir.h"
#include "linkcell.h"

#ifndef EXACT_EWALD
#  include "myerfc.h"
#endif

/* calling sequence: **************************************************
  stdEwinit:
    + a1, a2, a3, volume
    + rcut, beta, kcut
    + allocate general-purpose array: force, charge, neg_G1q
    + init_dir 
      ++ intialize linkcell: linkcell_init()
      ++ neighborlist memory allocation
      ++ dirG2, dirG3 memory allocation
      ++ setup direct space lattice
      ++ compute diagonal elements of dirG0, dirG2
    + init_rec
    + setup_dsolver
  stdEwcompute  <-- used in every step of molecular dynamics integration
    + dir_setup() 
        compute direct q-q energy, -G1q array, dirG2, dirG3
    + rec_setup() <-- must be called after dir_setup
    + dsolver_solve: solve the dipole equation.
      ++ 
    + (energy computed)
    + compute_cc_force()
    + compute_dir_dipole_force()
      ++
    + compute_rec_dipole_force()
 ***********************************************************************/

/* erfc is not a function of the standard C library, but provided by many 
 * compilers. If I have -ansi flag for gcc, then gcc may refuse to 
 * recognize "erfc" function. Neglecting the warning message results in 
 * "erfc" returning garbage and breaking down the code.
 * The declaration here is a guard against -ansi flag: it forces
 * the compiler recognize "erfc" function.  On the other hand, if 
 * the compiler's c library does not contain erfc function, error will occur 
 * at link stage.
 */
double erfc(double x);

#ifndef EXACT_EWALD
  static struct Erfc_Tag g_myerfc;
#endif



/***********************************************************************/
/*********************** implementations *******************************/
/***********************************************************************/

#define NCELL 2

/*=======================================================================*/


MD_Errcode init_dir(struct standEwald_Tag *se)
{
  const MD_Dvec a1 = se->a1;
  const MD_Dvec a2 = se->a2;
  const MD_Dvec a3 = se->a3;
  const MD_Double rcut = se->rcut;
  const MD_Double rc2 = rcut * rcut;
  const MD_Double beta = se->beta;
  /* maximum number of lattice sites along each direction */
  const MD_Double epsilon = 1e-12;
  const MD_Int n1max = (MD_Int) (0.5 + rcut/se->systemsize.x - epsilon);
  const MD_Int n2max = (MD_Int) (0.5 + rcut/se->systemsize.y - epsilon);
  const MD_Int n3max = (MD_Int) (0.5 + rcut/se->systemsize.z - epsilon);
  const MD_Int natoms = se->natoms;
  MD_Double latveclen_max;
  MD_Int i,j,k;
  MD_Int ndirlatt, idirlatt;
  MD_Double diagdirG0;
  MD_Double diagdirG2xx, diagdirG2yy, diagdirG2zz;
  struct LinkCell_init_Tag lc_init_data;

  se->dirforce = my_calloc((size_t)se->natoms, 
				sizeof(*se->dirforce), "direct force");

  lc_init_data.nxcell = NCELL;
  lc_init_data.nycell = NCELL;
  lc_init_data.nzcell = NCELL;
  lc_init_data.systemsize = se->systemsize;
  MD_vec_mul(lc_init_data.systemsize, -0.5, lc_init_data.min);
  lc_init_data.cutoff = rcut;
  lc_init_data.natoms = se->natoms;
  se->linkcell = my_calloc(sizeof(struct LinkCell_Tag), (size_t)1, 
				"linkcell");
  if (linkcell_init(se->linkcell, &lc_init_data)) {
    fprintf(stderr, "failed to init linkcell\n");
    return MD_FAIL;
  }

  se->max_neibrs = (MD_Int) ((MD_Double) natoms/se->volume
       * (4.0/3.0*Pi*rcut*rcut*rcut) * 2.0); /* 2.0 is a safety factor */
  if (se->max_neibrs < 10) se->max_neibrs = 10;
  se->neibrlist = my_calloc((size_t)se->natoms, 
				 sizeof(MD_Int *), "neibrlist");
  for (i = 0; i < natoms; i++) {
    se->neibrlist[i] = my_calloc((size_t)se->max_neibrs, 
				      sizeof(MD_Int), "neibr[i]");
  }

  se->numneibrs = my_calloc((size_t)se->natoms, sizeof(MD_Int),
				 "numneibrs");
  se->neg_dirG1q = my_calloc((size_t)(3*se->natoms), 
				  sizeof *se->neg_dirG1q,
				  "-dirG1q");

  if (se->has_induced_dipole) {
    se->dirG2 = my_calloc((size_t)natoms, sizeof(MD_Double *), "dirG2");
    se->dirG3 = my_calloc((size_t)natoms, sizeof(MD_Double *), "dirG3");
    for (i = 0; i < natoms; i++) {
      se->dirG2[i] = my_calloc((size_t)se->max_neibrs, 
				    sizeof(MD_Double) * NIND2, "dirG2[i]");
      se->dirG3[i] = my_calloc((size_t)se->max_neibrs, 
				    sizeof(MD_Double) * NIND3, "dirG3[i]");
    }
  }

  se->nmax[0] = n1max;
  se->nmax[1] = n2max;
  se->nmax[2] = n3max;
  se->dirlatt = 
    my_calloc((size_t) (2*n1max+1) * (2*n2max+1) * (2*n3max+1), 
	      sizeof(MD_Dvec), "direct sum lattice");
  latveclen_max = (se->systemsize.x > se->systemsize.y) ? 
    se->systemsize.x : se->systemsize.y;
  latveclen_max = (latveclen_max > se->systemsize.z) ? latveclen_max : 
    se->systemsize.z; 
  ndirlatt = 0;
  for (i = -n1max; i <= n1max; i++) {
    for (j = -n2max; j <= n2max; j++) {
      for (k = -n3max; k <= n3max; k++) {
	MD_Dvec lattvec;
	MD_Double veclensqr;
	lattvec.x = i*a1.x + j*a2.x + k*a3.x;
	lattvec.y = i*a1.y + j*a2.y + k*a3.y;
	lattvec.z = i*a1.z + j*a2.z + k*a3.z;
	veclensqr = MD_vec_dot(lattvec, lattvec);
	if (veclensqr > (rcut+latveclen_max)*(rcut+latveclen_max)) continue;
	/*printf("i,j,k=%d,%d,%d\n",i,j,k);*/
	se->dirlatt[ndirlatt] = lattvec;
	ndirlatt ++;
      }
    }
  }
  se->ndirlatt = ndirlatt;
  se->dirlatt = realloc((void *)se->dirlatt, 
			     (size_t)se->ndirlatt * sizeof(MD_Dvec));
  assert(NULL != se->dirlatt);
  printf("  ndirlatt = %d,  ", se->ndirlatt);
  printf("  Nmax = (%d, %d, %d)\n", n1max, n2max, n3max);

#ifndef EXACT_EWALD
  erfc_init(&g_myerfc);
#endif

  diagdirG0  = 0.0;
  diagdirG2xx = diagdirG2yy = diagdirG2zz = 0.0;
  for (idirlatt = 0; idirlatt < se->ndirlatt; idirlatt++) {
    MD_Double nx = se->dirlatt[idirlatt].x;
    MD_Double ny = se->dirlatt[idirlatt].y;
    MD_Double nz = se->dirlatt[idirlatt].z;
    MD_Double rn2 = nx*nx + ny*ny + nz*nz;
    /*printf("nx=%f, ny=%f, nz=%f\n", nx, ny, nz);*/
    MD_Double rn = sqrt(rn2);
    if (0.0 < rn && rn2 < rc2 ) {
      MD_Double bn = beta * rn;
#ifdef EXACT_EWALD
      MD_Double erfcbn = erfc(bn); 
#else
      MD_Double erfcbn = erfc_eval(&g_myerfc, bn);
#endif
      /*printf("OKed: nx=%f, ny=%f, nz=%f, bn=%f, erfcbn=%f, %15.10f\n", nx, ny, nz, bn, erfcbn, erfcbn/rn);*/
      diagdirG0 += erfcbn / rn;
      if (se->has_induced_dipole) {
	diagdirG2xx += erfcbn / (rn2*rn2*rn) * (rn2 - 3.0*nx*nx)
	  + two_over_sqrtPi * bn * exp(-bn*bn) / (rn2*rn2*rn)
	  * (rn2 - 3.0*nx*nx - 2.0*bn*bn*nx*nx);
	diagdirG2yy += erfcbn / (rn2*rn2*rn) * (rn2 - 3.0*ny*ny)
	  + two_over_sqrtPi * bn * exp(-bn*bn) / (rn2*rn2*rn)
	  * (rn2 - 3.0*ny*ny - 2.0*bn*bn*ny*ny);
	diagdirG2zz += erfcbn / (rn2*rn2*rn) * (rn2 - 3.0*nz*nz)
	  + two_over_sqrtPi * bn * exp(-bn*bn) / (rn2*rn2*rn)
	  * (rn2 - 3.0*nz*nz - 2.0*bn*bn*nz*nz);
      }
    } 
  }
  se->diagdirG0 = diagdirG0;
  printf("  diagdirG0=%20.15f\n", se->diagdirG0);
  if (se->has_induced_dipole) {
    se->diagdirG2xx = diagdirG2xx;
    se->diagdirG2yy = diagdirG2yy;
    se->diagdirG2zz = diagdirG2zz;
    printf("  diagdirG2=%20.15f %20.15f %20.15f\n", se->diagdirG2xx,
	   se->diagdirG2yy, se->diagdirG2zz);
  }

  return OK;
}


MD_Errcode destroy_dir(struct standEwald_Tag *se)
{
  MD_Int i;

  free(se->dirforce);

  if (linkcell_destroy(se->linkcell)) {
    fprintf(stderr, "failed to destroy linkcell\n");
    return MD_FAIL;
  }

  free(se->dirlatt);
  for(i = 0; i < se->natoms; i++) {
    free(se->neibrlist[i]);
  }
  free(se->neibrlist);
  free(se->numneibrs);

  free(se->neg_dirG1q);
  if (se->has_induced_dipole) {
    for(i = 0; i < se->natoms; i++) {
      free(se->dirG2[i]);
      free(se->dirG3[i]);
    }
    free(se->dirG2);
    free(se->dirG3);
  }

#ifndef EXACT_EWALD
  erfc_destroy(&g_myerfc);
#endif

  return OK;
}


void compute_dirG2v(const struct standEwald_Tag *se, 
		    const MD_Double *v, MD_Double *g2v)
{
  const MD_Int natoms = se->natoms;
  const MD_Double g2xx = se->diagdirG2xx;
  const MD_Double g2yy = se->diagdirG2yy;
  const MD_Double g2zz = se->diagdirG2zz;
  MD_Double** dirG2 = se->dirG2;
  const MD_Double *g2ij, *vj, *vi;
  MD_Double *g2vi, *g2vj;
  const MD_Int* neibrlist;
  MD_Int numneibrs;
  MD_Int i, j, jj; 

  /* contribution of G2's diagonal elements, also clear g2v */
  for (i = 0; i < natoms; i++) {
    g2v[3*i+X] = g2xx * v[3*i+X];
    g2v[3*i+Y] = g2yy * v[3*i+Y];
    g2v[3*i+Z] = g2zz * v[3*i+Z];
  }
  /* nondiagonal contribution */
  for (i = 0; i < natoms; i++) {
    vi = v + 3*i;
    g2ij = dirG2[i];
    g2vi = g2v + 3*i;
    numneibrs = se->numneibrs[i];
    neibrlist = se->neibrlist[i];
    for (jj = 0; jj < numneibrs; jj++) {
      j = neibrlist[jj]; 
      ASSERT(0 <= j && j < natoms);
      vj = v + 3*j;
      g2vi[X] += g2ij[XX]*vj[X] + g2ij[XY]*vj[Y] + g2ij[XZ]*vj[Z];
      g2vi[Y] += g2ij[YX]*vj[X] + g2ij[YY]*vj[Y] + g2ij[YZ]*vj[Z];
      g2vi[Z] += g2ij[ZX]*vj[X] + g2ij[ZY]*vj[Y] + g2ij[ZZ]*vj[Z];
      g2vj = g2v + 3*j;
      g2vj[X] += g2ij[XX]*vi[X] + g2ij[XY]*vi[Y] + g2ij[XZ]*vi[Z];
      g2vj[Y] += g2ij[YX]*vi[X] + g2ij[YY]*vi[Y] + g2ij[YZ]*vi[Z];
      g2vj[Z] += g2ij[ZX]*vi[X] + g2ij[ZY]*vi[Y] + g2ij[ZZ]*vi[Z];
      g2ij+=NIND2; 
    }
  } 

  return;
}


void compute_dir_pseudores(const struct standEwald_Tag *se, 
			   const MD_Double *v, const MD_Int flag, 
			   MD_Double *pseudores)
{
  const MD_Double *neg_G1q = se->neg_dirG1q;
  const MD_Int n = 3 * se->natoms;
  MD_Int i;

  compute_dirG2v(se, v, pseudores);
  if (flag) {
    for (i=0; i<n; i++) pseudores[i] = neg_G1q[i] - pseudores[i];
  } else {
    for (i=0; i<n; i++) pseudores[i] = - pseudores[i];
  }

}


/* see note 2003-force */
void compute_dir_dipole_force(struct standEwald_Tag *se)
{
  MD_Dvec *f = se->dirforce;
  const MD_Double *q = se->charge;
  const MD_Double *di, *dj;
  const MD_Double *dipole = dsolver_get_dipole(se->dsolver);
  const MD_Double *g2ij, *g3ij;
  const MD_Int natoms = se->natoms;
  MD_Double **dirG2 = se->dirG2;
  MD_Double **dirG3 = se->dirG3;
  MD_Double dq[3];
  MD_Double s[NIND2];
  MD_Double qi, qj;
  MD_Dvec df;  
  MD_Int numneibrs;
  const MD_Int *neibrlist;
  MD_Int i, j, jj; 

  /* diagonal elements of G2 do not contribute */
  for (i = 0; i < natoms; i++) {
    qi = q[i];
    di = dipole + 3*i;
    g2ij = dirG2[i];
    g3ij = dirG3[i];
    numneibrs = se->numneibrs[i];
    neibrlist = se->neibrlist[i];
    for (jj = 0; jj < numneibrs; jj++) { 
      j = neibrlist[jj];
      ASSERT(0 <= j && j < natoms);
      qj = q[j];
      dj = dipole + 3*j;
      /* charge-dipole force */
      dq[X] = dj[X] * qi - di[X] * qj;
      dq[Y] = dj[Y] * qi - di[Y] * qj;
      dq[Z] = dj[Z] * qi - di[Z] * qj;
      df.x = g2ij[XX]*dq[X] + g2ij[XY]*dq[Y] + g2ij[XZ]*dq[Z];
      df.y = g2ij[YX]*dq[X] + g2ij[YY]*dq[Y] + g2ij[YZ]*dq[Z];
      df.z = g2ij[ZX]*dq[X] + g2ij[ZY]*dq[Y] + g2ij[ZZ]*dq[Z];
      /* dipole-dipole force */
      s[XX] = g3ij[XXX]*dj[X] + g3ij[XXY]*dj[Y] + g3ij[XXZ]*dj[Z];
      s[XY] = g3ij[XYX]*dj[X] + g3ij[XYY]*dj[Y] + g3ij[XYZ]*dj[Z];
      s[XZ] = g3ij[XZX]*dj[X] + g3ij[XZY]*dj[Y] + g3ij[XZZ]*dj[Z];
      s[YY] = g3ij[YYX]*dj[X] + g3ij[YYY]*dj[Y] + g3ij[YYZ]*dj[Z];
      s[YZ] = g3ij[YZX]*dj[X] + g3ij[YZY]*dj[Y] + g3ij[YZZ]*dj[Z];
      s[ZZ] = g3ij[ZZX]*dj[X] + g3ij[ZZY]*dj[Y] + g3ij[ZZZ]*dj[Z];
      df.x += di[X]*s[XX] + di[Y]*s[XY] + di[Z]*s[XZ];
      df.y += di[X]*s[YX] + di[Y]*s[YY] + di[Z]*s[YZ];
      df.z += di[X]*s[ZX] + di[Y]*s[ZY] + di[Z]*s[ZZ];
      MD_vec_substract(f[i], df, f[i]);
      MD_vec_add(f[j], df, f[j]);
      g2ij+=NIND2; g3ij+=NIND3;
    }
  }  
  
  se->dirEnergy -= 0.5 * DOT(dsolver_get_dipole(se->dsolver),
				  se->neg_dirG1q, 3 * natoms);
  
#ifdef DEBUG_STANDEWALD
  {MD_Double f2=0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(f[i], f[i]);
    printf("sedir <f,f>=%f\n", f2);
    printf("direct energy: %f\n", se->dirEnergy);
  }
#endif
  
}


/* compute: 
     1. direct charge--charge energy, 
     2. direct part of -G1q,
     3. direct part of the elements of G2, G3
*/
void dipole_dir_setup(struct standEwald_Tag *se)
{
  MD_Dvec *f = se->dirforce;
  const MD_Dvec *pos = se->ppos;
  const MD_Dvec systemsize = se->systemsize;
  const MD_Double *q = se->charge;
  const MD_Double rcut2 = se->rcut2;
  const MD_Int natoms = se->natoms;
  const MD_Int ng1q = 3 * natoms;
  const MD_Int numcell = se->linkcell->numcell;
  struct Cell_Tag *cell = se->linkcell->cell;
  MD_Double **dirG2 = se->dirG2;
  MD_Double **dirG3 = se->dirG3;
  MD_Int* head = se->linkcell->head;
  MD_Int* list = se->linkcell->list;
  MD_Double * const neg_dirG1q = se->neg_dirG1q;
  MD_Dvec posi;
  MD_Double qi, qj, g0, qg0q, g0qj;
  MD_Dvec g1;
  MD_Dvec r_ij;   /* r_ij = r_i - r_j */
  MD_Int* neibrlisti;
  MD_Int i, j, k, nn;
  MD_Int ineibr;
  MD_Double *g2ij, *g3ij;

  linkcell_hash_atoms(se->linkcell, pos); 

  qg0q = 0.0;
  memset(neg_dirG1q, 0, ng1q*sizeof(*neg_dirG1q));
  for (k = 0;  k < numcell;  k++) {
    i = head[k];  /* first atom in cell k */
    while(i >= 0) {  
      qi = q[i];
      posi = pos[i];
      g2ij = dirG2[i];
      g3ij = dirG3[i];
      ineibr = 0;
      g0qj = 0.0;
      neibrlisti = se->neibrlist[i];
      j = list[i];  
      while(j >= 0) {   /* check other atoms in this cell */
        /*printf("evaluate G matrices between %d, %d in same cell.\n", i,j);*/
        MD_vec_substract(posi, pos[j], r_ij);
        SIMPLE_BOUND_VEC(r_ij, systemsize); 
        if (MD_vec_dot(r_ij, r_ij) < rcut2) {
	  dipole_compute_gbar_dir(se, r_ij, i, j, &g0, &g1, g2ij, g3ij);
	  qj = q[j];
	  g0qj += g0 * qj;
	  neg_dirG1q[3*i+X] -= g1.x * qj;
	  neg_dirG1q[3*i+Y] -= g1.y * qj;
	  neg_dirG1q[3*i+Z] -= g1.z * qj;
	  neg_dirG1q[3*j+X] += g1.x * qi;
	  neg_dirG1q[3*j+Y] += g1.y * qi;
	  neg_dirG1q[3*j+Z] += g1.z * qi;
	  neibrlisti[ineibr] = j;
	  ineibr++;
	  ASSERT(ineibr < se->max_neibrs); /* hard boundary */
	  g2ij += NIND2;
	  g3ij += NIND3;
	}
	j = list[j]; 
      } 
      for (nn=0; nn<cell[k].numnbrs;  nn++) { /* check atoms in neighbors */
        j = head[cell[k].nbr[nn]];
        while (j >= 0) {
          MD_vec_substract(posi, pos[j], r_ij);
          SIMPLE_BOUND_VEC(r_ij, systemsize); 
          if (MD_vec_dot(r_ij, r_ij) < rcut2) { 
            dipole_compute_gbar_dir(se, r_ij, i, j, &g0, &g1, g2ij, g3ij);
	    qj = q[j];
	    g0qj += g0 * qj;
	    neg_dirG1q[3*i+X] -= g1.x * qj;
	    neg_dirG1q[3*i+Y] -= g1.y * qj;
	    neg_dirG1q[3*i+Z] -= g1.z * qj;
	    neg_dirG1q[3*j+X] += g1.x * qi;
	    neg_dirG1q[3*j+Y] += g1.y * qi;
	    neg_dirG1q[3*j+Z] += g1.z * qi;
            neibrlisti[ineibr] = j;
            ineibr++;
            ASSERT(ineibr <= se->max_neibrs); /* hard boundary */
            g2ij += NIND2;
            g3ij += NIND3;
          }
          j = list[j]; /* next atom */
        }
      } /* nn loop is over */
      qg0q += qi * g0qj;  /* accumulate direct charge--charge energy */
      se->numneibrs[i] = ineibr;
      i = list[i];
    } /* i loop over */
  } /* k (cell) loop over */

  for (i=0; i<natoms; i++) {  /* charge-charge force */
    f[i].x = neg_dirG1q[3*i+X] * q[i];
    f[i].y = neg_dirG1q[3*i+Y] * q[i];
    f[i].z = neg_dirG1q[3*i+Z] * q[i];
    /*
    printf("dfq[%d]: %f, %f, %f\n", i, f[i].x, f[i].y, f[i].z);
    */
  }

  /* no 1/2, since j=0-->i-1 */
  se->dirEnergy = qg0q + 0.5 * se->diagdirG0 * se->qq; 
#ifdef DEBUG_STANDEWALD
  printf("direct q-q energy is %20.15f\n", se->dirEnergy);
  
  {MD_Double f2=0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(f[i], f[i]);
    printf("standEwald_dir <f,f>=%f\n", f2);
  }
#endif

  return;
}



/*
 *  r = r_i - r_j.  
 *  "excluded" means atom j is in the exclusion list of i.
 *  g_bar_dir(r) = g_dir(r)    if not excluded
 *               = g_dir(r) - 1/r  if excluded.
 *
 *                         erfc(beta|r + n|)
 *  g_dir(r) =    sigma   -------------------
 *                 (n)          |r + n|
 *
 * use recursive formula presented from Toukmaji, Sagui, Board, Darden
 * Jour Chem. Phys. vol 113, 10913, 2000
 * see my note Nov-19-2003
 */
void 
dipole_compute_gbar_dir(const struct standEwald_Tag *se, 
			const MD_Dvec r,  
			const MD_Int i, 
			const MD_Int j, 
			MD_Double *gzero, 
			MD_Dvec *gone,
			MD_Double gtwo[NIND2], 
			MD_Double gthree[NIND3]
			)
{
  const MD_Double beta = se->beta;
  const MD_Double rcut2 = se->rcut2;
  const MD_Dvec *dirlatt = se->dirlatt;
  const MD_Int ndirlatt = se->ndirlatt;
  MD_Int** excllist = se->pexcllist;
  MD_Int *excl;
  MD_Int excld = 0; /* important since excllist can be NULL */
  MD_Double g0 = 0.0;
  MD_Dvec g1 = {0.0, 0.0, 0.0};
  MD_Double g2[NIND2] = {0.0};
  MD_Double g3[NIND3] = {0.0}; 
  MD_Double rnlen, rnlensqr, inv_rnlensqr;
  MD_Double x, y, z;
  MD_Double xx, yy, zz, xy, xz, yz;
  MD_Double betar, betar_sqr, erfc_betar;
  register MD_Double  tmp, B0, B1, B2, B3;
  MD_Double B3xx, B3yy, B3zz, threeB2;
  const MD_Double two_beta2 = 2.0 * beta * beta;
  MD_Int k;

  for (k = 0; k < ndirlatt; k++) {
    x = dirlatt[k].x + r.x;
    y = dirlatt[k].y + r.y;
    z = dirlatt[k].z + r.z;
    rnlensqr = x * x + y * y + z * z;
    if (rnlensqr > rcut2) continue;
    rnlen = sqrt(rnlensqr);
    inv_rnlensqr = 1.0 / rnlensqr;
    betar = beta * rnlen;
#ifdef EXACT_EWALD
    erfc_betar = erfc(betar); 
#else
    erfc_betar = erfc_eval(&g_myerfc, betar);
#endif
    /*
    printf("betar = %g, erfc=%g\n", betar, erfc_betar);
    */
    betar_sqr = betar * betar;
    B0 = erfc_betar / rnlen;                  
    tmp = exp(-betar_sqr) * 2.0 * beta * one_over_sqrtPi; 
    B1 = (B0 + tmp) * inv_rnlensqr;
    tmp *= two_beta2;
    B2 = (3.0*B1 + tmp) * inv_rnlensqr;
    tmp *= two_beta2;
    B3 = (5.0*B2 + tmp) * inv_rnlensqr;
    g0 += B0;  /* g0 */
    g1.x -= B1 * x;
    g1.y -= B1 * y;
    g1.z -= B1 * z;
    xx = x * x; xy = x * y; xz = x * z;
    yy = y * y; yz = y * z; zz = z * z;
    g2[XX] -= B2 * xx - B1;
    g2[YY] -= B2 * yy - B1;
    g2[ZZ] -= B2 * zz - B1;
    g2[XY] -= B2 * xy;
    g2[XZ] -= B2 * xz;
    g2[YZ] -= B2 * yz;
    B3xx = B3 * xx;
    B3yy = B3 * yy;
    B3zz = B3 * zz;
    threeB2 = 3.0 * B2;
    g3[XXX] += (B3xx - threeB2) * x;
    g3[YYY] += (B3yy - threeB2) * y;
    g3[ZZZ] += (B3zz - threeB2) * z;
    g3[XXY] += (B3xx - B2     ) * y;
    g3[XXZ] += (B3xx - B2     ) * z;
    g3[XYY] += (B3yy - B2     ) * x;
    g3[XZZ] += (B3zz - B2     ) * x;
    g3[YYZ] += (B3yy - B2     ) * z;
    g3[YZZ] += (B3zz - B2     ) * y;
    g3[XYZ] +=  B3 * xy * z;
  }

  /*
    printf("i=%d,j=%d, g0=%f, g1=%f, %f, %f\n", i, j,
	   gzero, g1->x, g1->y, g1->z);
  */

  if (excllist != NULL) {  /* the exclusion list is short */
    excl = excllist[i];
    while (*excl < j) excl++;
    excld = (0 == (*excl) - j);
  }

  if (excld) {
    /* n = 0 now */
    x = r.x; y = r.y; z = r.z;
    inv_rnlensqr = 1.0 / (x*x + y*y + z*z);
    B0 = sqrt(inv_rnlensqr);
    B1 = B0 * inv_rnlensqr;
    B2 = B1 * inv_rnlensqr * 3.0;
    B3 = B2 * inv_rnlensqr * 5.0;
    g0 -= B0;
    g1.x += B1 * x;
    g1.y += B1 * y;
    g1.z += B1 * z;
    xx = x * x; xy = x * y; xz = x * z;
    yy = y * y; yz = y * z; zz = z * z;
    g2[XX] += B2 * xx - B1;
    g2[YY] += B2 * yy - B1;
    g2[ZZ] += B2 * zz - B1;
    g2[XY] += B2 * xy;
    g2[XZ] += B2 * xz;
    g2[YZ] += B2 * yz;
    B3xx = B3 * xx;
    B3yy = B3 * yy;
    B3zz = B3 * zz;
    threeB2 = 3.0 * B2;
    g3[XXX] -= (B3xx - threeB2) * x;
    g3[YYY] -= (B3yy - threeB2) * y;
    g3[ZZZ] -= (B3zz - threeB2) * z;
    g3[XXY] -= (B3xx - B2     ) * y;
    g3[XXZ] -= (B3xx - B2     ) * z;
    g3[XYY] -= (B3yy - B2     ) * x;
    g3[XZZ] -= (B3zz - B2     ) * x;
    g3[YYZ] -= (B3yy - B2     ) * z;
    g3[YZZ] -= (B3zz - B2     ) * y;
    g3[XYZ] -=  B3 * xy * z;
  }

  *gzero = g0;
  *gone = g1;
  memcpy(gtwo, g2, NIND2 * sizeof(MD_Double));
  memcpy(gthree, g3, NIND3 * sizeof(MD_Double));

  return;	
}


/* compute: 
     1. direct charge--charge energy, force
     2. direct part of -G1q,
*/
void charge_dir_setup(struct standEwald_Tag *se)
{
  MD_Dvec *f = se->dirforce;
  const MD_Dvec *pos = se->ppos;
  const MD_Dvec systemsize = se->systemsize;
  const MD_Double *q = se->charge;
  const MD_Double rcut2 = se->rcut2;
  const MD_Int natoms = se->natoms;
  const MD_Int ng1q = 3 * natoms;
  const MD_Int numcell = se->linkcell->numcell;
  struct Cell_Tag *cell = se->linkcell->cell;
  MD_Int *head = se->linkcell->head;
  MD_Int *list = se->linkcell->list;
  MD_Double * const neg_dirG1q = se->neg_dirG1q;
  MD_Double qi, qj, g0, qg0q, g0qj;
  MD_Dvec g1;
  MD_Dvec r_ij;   /* r_ij = r_i - r_j */
  MD_Int i, j, k, nn;
  MD_Dvec posi;

  linkcell_hash_atoms(se->linkcell, pos); 

  qg0q = 0.0;
  memset(neg_dirG1q, 0, ng1q*sizeof(*neg_dirG1q));
  for (k = 0;  k < numcell;  k++) {
    i = head[k];  /* first atom in cell k */
    while(i >= 0) {     /* pairs in same cell */
      qi = q[i];
      posi = pos[i];
      g0qj = 0.0;
      j = list[i];     
      while(j >= 0) {   /* other atoms in this cell */
        /*printf("evaluate se between %d, %d in same cell.\n", i,j);*/
        MD_vec_substract(posi, pos[j], r_ij);
        SIMPLE_BOUND_VEC(r_ij, systemsize); 
	if (MD_vec_dot(r_ij, r_ij) < rcut2) {
	  charge_compute_gbar_dir(se, r_ij, i, j, &g0, &g1);
	  qj = q[j];
	  g0qj += g0 * qj;
	  neg_dirG1q[3*i+X] -= g1.x * qj;   /* anti-symmetry of G1 */
	  neg_dirG1q[3*i+Y] -= g1.y * qj;
	  neg_dirG1q[3*i+Z] -= g1.z * qj;
	  neg_dirG1q[3*j+X] += g1.x * qi;
	  neg_dirG1q[3*j+Y] += g1.y * qi;
	  neg_dirG1q[3*j+Z] += g1.z * qi;
	}
        j = list[j]; /* next atom */
      } /* j loop over */
      /* atoms in neighbor cell */
      for (nn = 0;  nn < cell[k].numnbrs;  nn++) {
        j = head[cell[k].nbr[nn]];
        while (j >= 0) {
	  MD_vec_substract(posi, pos[j], r_ij);
	  SIMPLE_BOUND_VEC(r_ij, systemsize);
	  if (MD_vec_dot(r_ij, r_ij) < rcut2) {
	    charge_compute_gbar_dir(se, r_ij, i, j, &g0, &g1);
	    qj = q[j];
	    g0qj += g0 * qj;
	    neg_dirG1q[3*i+X] -= g1.x * qj;   /* anti-symmetry of G1 */
	    neg_dirG1q[3*i+Y] -= g1.y * qj;
	    neg_dirG1q[3*i+Z] -= g1.z * qj;
	    neg_dirG1q[3*j+X] += g1.x * qi;
	    neg_dirG1q[3*j+Y] += g1.y * qi;
	    neg_dirG1q[3*j+Z] += g1.z * qi;
	  }
	  j = list[j]; /* next atom */
        }
      } /* nn loop is over */
      i = list[i];
      qg0q += qi * g0qj;  /* accumulate direct charge--charge energy */
    } /* i loop over */
  } /* k (cell) loop over */

  /* no 1/2, since j=0-->i-1 */
  se->dirEnergy = qg0q + 0.5 * se->diagdirG0 * se->qq;
#ifdef DEBUG_STANDEWALD
  printf("direct sum = %20.15f\n", se->dirEnergy);
#endif

  for (i=0; i<natoms; i++) {
    f[i].x = neg_dirG1q[3*i+X] * q[i];
    f[i].y = neg_dirG1q[3*i+Y] * q[i];
    f[i].z = neg_dirG1q[3*i+Z] * q[i];
  }

  return;
}


void 
charge_compute_gbar_dir(const struct standEwald_Tag *se, 
			const MD_Dvec r, const MD_Int i, const MD_Int j, 
                        MD_Double *gzero, MD_Dvec *gone)
{
  const MD_Double beta = se->beta;
  const MD_Double rcut2 = se->rcut2;
  const MD_Dvec *dirlatt = se->dirlatt;
  const MD_Int ndirlatt = se->ndirlatt;
  MD_Int** excllist = se->pexcllist;
  MD_Int *excl;
  MD_Int excld = 0; /* important since excllist can be NULL */
  MD_Double g0 = 0.0;
  MD_Dvec g1 = {0.0, 0.0, 0.0};
  MD_Double rnlen, rnlensqr, inv_rnlensqr;
  MD_Double x, y, z;
  MD_Double betar, erfc_betar;
  register MD_Double tmp, B0, B1;
  MD_Int k;

  for (k = 0; k < ndirlatt; k++) {
    x = dirlatt[k].x + r.x;
    y = dirlatt[k].y + r.y;
    z = dirlatt[k].z + r.z;
    rnlensqr = x * x + y * y + z * z;
    if (rnlensqr > rcut2) continue;
    rnlen = sqrt(rnlensqr);
    inv_rnlensqr = 1.0 / rnlensqr;
    betar = beta * rnlen;
#ifdef EXACT_EWALD
    erfc_betar = erfc(betar); 
#else
    erfc_betar = erfc_eval(&g_myerfc, betar);
#endif
    /*
    printf("betar = %g, erfc=%g\n", betar, erfc_betar);
    */
    B0 = erfc_betar / rnlen;                  
    tmp = exp(-betar * betar) * 2.0 * beta * one_over_sqrtPi; 
    B1 = (B0 + tmp) * inv_rnlensqr;
    g0 += B0;  /* g0 */
    g1.x -= B1 * x;
    g1.y -= B1 * y;
    g1.z -= B1 * z;
  }

  /*
    printf("i=%d,j=%d, g0=%f, g1=%f, %f, %f\n", i, j,
	   gzero, g1->x, g1->y, g1->z);
  */

  if (excllist != NULL) {  /* the exclusion list is short */
    excl = excllist[i];
    while (*excl < j) excl++;
    excld = (0 == (*excl) - j);
  }

  if (excld) {
    inv_rnlensqr = 1.0 /  MD_vec_dot(r,r);
    B0 = sqrt(inv_rnlensqr);
    B1 = B0 * inv_rnlensqr;
    g0 -= B0;
    g1.x += B1 * r.x;
    g1.y += B1 * r.y;
    g1.z += B1 * r.z;
  }

  *gzero = g0;
  *gone = g1;

  return;	
}


MD_Double calc_beta(MD_Double rc, MD_Double errTol)
{
  MD_Double beta = 8.0, low = 0.0, high;
  MD_Int i;

  while (erfc(beta*rc) > errTol)  beta += beta;
  high = beta;
  for (i = 0; i < 100; i++) {
    beta = 0.5 * (high+low);
    if (erfc(beta*rc) > errTol) low=beta; 
    else high=beta;
  }

  return beta;
}


