/*
 * Copyright (C) 2004-2006 by Wei Wang.  All rights reserved.
 */


/* implementation of the standard Ewald sum, direct sum part */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include "timer.h"
#include "unit.h"
#include "constant.h"
#include "utilities.h"
#include "helper.h"
#include "pme.h"
#include "pme_direct.h"

#ifndef EXACT_EWALD
#  include "myerfc.h"
#endif


/* erfc is not a function of the standard C library, but provided by many 
 * compilers. If I have -ansi flag for gcc, then gcc may refuse to 
 * recognize "erfc" function. Neglecting the warning message results in 
 * "erfc" returning garbage and breaking down the code.
 * The declaration here is a guard against -ansi flag: it forces
 * the compiler recognize "erfc" function.  On the other hand, if 
 * the compiler's c library does not contain erfc function, error will occur 
 * at link stage.
 */
double erfc(double x);


/*=======================================================================*/

MD_Errcode pmedir_init(struct PmeDirect_Tag* dir,
		       const struct PmeParams_Tag *params)
{
  struct LinkCell_init_Tag lc_init;
  const MD_Double rcut = params->cutoff;
  const MD_Int natoms = params->natoms;
  MD_Int i;
  MD_Double volume;

  dir->ppos = params->ppos;
  dir->cutoff = params->cutoff;
  /* only orthogonal lattice is considered here */
  dir->systemsize.x = params->cellBasisVector1.x; 
  dir->systemsize.y = params->cellBasisVector2.y;
  dir->systemsize.z = params->cellBasisVector3.z;
  lc_init.systemsize = dir->systemsize;
  MD_vec_mul(lc_init.systemsize, -0.5, lc_init.min);
  lc_init.nxcell = lc_init.systemsize.x / rcut;
  lc_init.nycell = lc_init.systemsize.y / rcut;
  lc_init.nzcell = lc_init.systemsize.z / rcut;
  lc_init.cutoff = rcut;
  lc_init.natoms = natoms;
  dir->linkcell = my_calloc(1, sizeof(struct LinkCell_Tag), "linkcell");
  if (linkcell_init(dir->linkcell, &lc_init)) {
    fprintf(stderr, "failed to init linkcell\n");
    return MD_FAIL;
  }

  dir->natoms = natoms;
  dir->force  = my_calloc((size_t)natoms, sizeof(MD_Dvec), "dirforce");
  dir->neg_dirG1q = my_calloc((size_t)(3*natoms),sizeof(MD_Double),"-dirG1q");
  volume = params->cellBasisVector1.x * params->cellBasisVector2.y
    * params->cellBasisVector3.z;

  if (params->has_induced_dipole) {
    /* 2 is a safety factor */ 
    MD_Int maxnb = (MD_Int) ((natoms*2) * 4.0/3.0*Pi*rcut*rcut*rcut/volume);
    if (maxnb < 10) maxnb = 10;
    dir->max_neibrs = maxnb;
    printf("max neibr is: %d\n", dir->max_neibrs);
    dir->neibrlist = my_calloc((size_t)natoms, sizeof(MD_Int*), "neibrlist");
    for (i = 0; i < natoms; i++) {
      dir->neibrlist[i] = my_calloc((size_t)maxnb, sizeof(MD_Int), "neibr[i]");
    }
    dir->numneibrs = my_calloc((size_t)natoms, sizeof(MD_Int), "numneibrs");

    dir->b_arr = my_calloc((size_t)natoms, sizeof(MD_Double*), "b_arr");
    for (i=0; i<natoms; i++) {
      dir->b_arr[i] = my_calloc((size_t)maxnb*3, sizeof(MD_Double), "b[i]");
    }
  }

#ifndef EXACT_EWALD
  dir->myerfc = my_calloc((size_t)1, sizeof(struct Erfc_Tag),"my erfc");
  erfc_init(dir->myerfc);
#endif

  return OK;
}


MD_Errcode pmedir_destroy(struct PmeDirect_Tag* dir)
{
  MD_Int i;

  if (dir->myerfc) {
#ifndef EXACT_EWALD
    if (erfc_destroy(dir->myerfc)) {
      fprintf(stderr, "failed to destroy myerfc\n");
      return MD_FAIL;
    }
#endif
    free(dir->myerfc);
  }

  if (dir->b_arr) for(i=0; i<dir->natoms; i++) free(dir->b_arr[i]);
  free(dir->b_arr); 

  if (dir->neibrlist) for(i=0; i<dir->natoms; i++) free(dir->neibrlist[i]);
  free(dir->neibrlist);
  free(dir->numneibrs);

  free(dir->neg_dirG1q);

  free(dir->force);

  if (linkcell_destroy(dir->linkcell)) {
    fprintf(stderr, "failed to destroy linkcell\n");
    return MD_FAIL;
  }
  free(dir->linkcell);

  return OK;
}


/* add direct sum contribution to -G2*d or -G1*q - G2*d */
void pmedir_compute_pseudores(struct PmeDirect_Tag* dir, const MD_Double *v,
			      const MD_Int flag, MD_Double *pr)
{
  const MD_Dvec *pos = dir->ppos;
  const MD_Dvec systemsize = dir->systemsize;
  MD_Dvec rij;
  const MD_Double *b;
  const MD_Double *vj, *vi;
  MD_Double *pri, *prj;
  MD_Double b2rv;
  MD_Double prix, priy, priz; /* to reduce write */
  const MD_Int* neibrlist;
  const MD_Int natoms = dir->natoms;
  MD_Int numneibrs;
  MD_Int i, j, jj; 

  if (0 == flag) {
    memset(pr, 0, 3*natoms*sizeof(*pr));
  } else if (1 == flag) {
    memcpy(pr, dir->neg_dirG1q, 3*natoms*sizeof(*pr));
  } else {
    fprintf(stderr, "wrong flag value, %d\n", flag);
    return;
  }

  for (i = 0; i < natoms; i++) { /* G2's diagonal elements are zero in PME */
    vi = v + 3*i;
    pri = pr + 3*i;
    numneibrs = dir->numneibrs[i];
    neibrlist = dir->neibrlist[i];
    b = dir->b_arr[i] - 1;
/*
    printf("i=%d, # of neibrs=%d\n", i, numneibrs);
*/
    prix = priy = priz = 0.0;
    for (jj = 0; jj < numneibrs; jj++) {
      j = neibrlist[jj]; 
      ASSERT(0 <= j && j < natoms);
      vj = v + 3*j;
      MD_vec_substract(pos[i], pos[j], rij);
      SIMPLE_BOUND_VEC(rij, systemsize); 
      ASSERT(MD_vec_dot(rij,rij) <= dir->cutoff * dir->cutoff);
      /* G2 = B1 * I - B2 * r * r^T, I: identity, r: column vector */
      b2rv = b[2] * (rij.x*vj[X] + rij.y*vj[Y] + rij.z*vj[Z]);
      prix -= b[1]*vj[X] - b2rv*rij.x;
      priy -= b[1]*vj[Y] - b2rv*rij.y;
      priz -= b[1]*vj[Z] - b2rv*rij.z;
      prj = pr + 3*j;
      b2rv = b[2] * (rij.x*vi[X] + rij.y*vi[Y] + rij.z*vi[Z]);
      prj[X] -= b[1]*vi[X] - b2rv*rij.x;
      prj[Y] -= b[1]*vi[Y] - b2rv*rij.y;
      prj[Z] -= b[1]*vi[Z] - b2rv*rij.z;
      b += 3;
    }
    pri[X] += prix; pri[Y] += priy; pri[Z] += priz;
    ASSERT(b+1 <= dir->b_arr[i] + 3*dir->max_neibrs);
  } 

}


/* see note 2003-force, compute direct potential, force, and virial */
void pmedir_dipole_force(struct PmeDirect_Tag* dir, const MD_Dvec *pos, 
			 const MD_Double *charge, const MD_Double *dipole)
{
  MD_Dvec *force = dir->force;
  const MD_Dvec systemsize = dir->systemsize;
  const MD_Double *di, *dj;
  const MD_Int natoms = dir->natoms;
  MD_Dvec dq, df; /* dipole force */
  MD_Dvec posi, rij;
  MD_Dvec fi;
  const MD_Double *b;
  MD_Double qi, qj;
  MD_Double rdq, rdi, rdj, dd, b3rdrd;
  const MD_Int *neibrlist;
  MD_Int numneibrs; 
  MD_Int i, j, jj; 

  /* diagonal elements of G2 do not contribute */
  for (i = 0; i < natoms; i++) {
    qi = charge[i];
    di = dipole + 3*i;
    numneibrs = dir->numneibrs[i];
    neibrlist = dir->neibrlist[i];
    posi = pos[i];
    b = dir->b_arr[i] - 1;
    fi.x = fi.y = fi.z = 0.0;
    for (jj = 0; jj < numneibrs; jj++) { 
      j = neibrlist[jj];
      ASSERT(0 <= j && j < natoms);
      qj = charge[j];
      dj = dipole + 3*j;
      /* charge-dipole force */
      dq.x = di[X]*qj - dj[X]*qi;
      dq.y = di[Y]*qj - dj[Y]*qi;
      dq.z = di[Z]*qj - dj[Z]*qi;
      MD_vec_substract(posi, pos[j], rij);
      SIMPLE_BOUND_VEC(rij, systemsize); 
      ASSERT(MD_vec_dot(rij,rij) <= dir->cutoff * dir->cutoff);
      /* G2 = B1 * I - B2 * r * r^T, I: identity matrix, r: column vector */
      /* G2*v = B1*v - B2*dot(r,v)*r,  B1,B2: scaler */
      rdq = b[2] * MD_vec_dot(rij, dq);
      df.x = b[1]*dq.x - rdq*rij.x;
      df.y = b[1]*dq.y - rdq*rij.y;
      df.z = b[1]*dq.z - rdq*rij.z;
      /* dipole-dipole force */
      /* G3 = B3*r1*r2*r3 - B2(r1 + r2 + r3), 3D matrix */
      /* -di^T*G3*dj = -B3*dot(r,di)*dot(r,dj)*r + B2*(dot(d,d)*r + 
       *                       dot(r,di)*dj + dot(r,dj)*di) */
      rdi = rij.x * di[X] + rij.y * di[Y] + rij.z * di[Z];
      rdj = rij.x * dj[X] + rij.y * dj[Y] + rij.z * dj[Z];
      dd  = di[X] * dj[X] + di[Y] * dj[Y] + di[Z] * dj[Z];
      b3rdrd = b[3]*rdi*rdj;
      df.x += b[2] * (dd*rij.x + rdi*dj[X] + rdj*di[X]) - b3rdrd*rij.x;
      df.y += b[2] * (dd*rij.y + rdi*dj[Y] + rdj*di[Y]) - b3rdrd*rij.y;
      df.z += b[2] * (dd*rij.z + rdi*dj[Z] + rdj*di[Z]) - b3rdrd*rij.z;
      MD_vec_add(fi, df, fi);
      MD_vec_substract(force[j], df, force[j]);
      b += 3;
    }
    MD_vec_add(force[i], fi, force[i]);
  }  

#ifdef DEBUG_PME
  {MD_Double f2=0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(force[i], force[i]);
    printf("direct <f,f>=%f\n", f2);
  }
#endif

  dir->potential -= 0.5 * DOT(dir->neg_dirG1q, dipole, 3*natoms);

  return;
}


/* compute the B0, B1, B2, B3 or D0, D1, D2, D3 */
#ifdef COMPUTE_B
#undef COMPUTE_B
#endif
#ifdef EXACT_EWALD    /* c = 2.0*beta/sqrtPi */
#define COMPUTE_B(b, b0, rsqr, c, beta, two_beta2, excld) {	\
    MD_Double inv_rsqr = 1.0 / rsqr;				\
    MD_Double r = sqrt(rsqr);					\
    MD_Double br = beta * r;					\
    MD_Double tmp = exp(-br*br)*c;				\
    b0 = (erfc(br) - excld) / r;			       	\
    b[1] = (    b0   + tmp) * inv_rsqr;				\
    tmp *= two_beta2;		 				\
    b[2] = (3.0*b[1] + tmp) * inv_rsqr;				\
    tmp *= two_beta2;						\
    b[3] = (5.0*b[2] + tmp) * inv_rsqr;				\
  }
#else  /* the only difference is how to evaluate erfc */
#define COMPUTE_B(b, b0, rsqr, c, beta, two_beta2, excld, myerfc) {	\
    MD_Double inv_rsqr = 1.0 / rsqr;				\
    MD_Double r = sqrt(rsqr);					\
    MD_Double br = beta * r;					\
    MD_Double tmp = exp(-br*br)*c;				\
    b0 = (erfc_eval(myerfc, br) - excld) / r;			\
    b[1] = (    b0   + tmp) * inv_rsqr;				\
    tmp *= two_beta2;						\
    b[2] = (3.0*b[1] + tmp) * inv_rsqr;				\
    tmp *= two_beta2;						\
    b[3] = (5.0*b[2] + tmp) * inv_rsqr;				\
  }
#endif


/* compute charge-charge energy, force, -dirG1*q, store B array */
void pmedir_dipole_setup(struct PmeDirect_Tag* dir,
			 struct PmeParams_Tag *params,
			 const MD_Double *charge, const MD_Dvec *pos)
{
  MD_Dvec * const force = dir->force;
  const MD_Dvec systemsize = dir->linkcell->systemsize;
  const MD_Double rcut2 = params->cutoff * params->cutoff;
  const MD_Double ewaldcof = params->ewaldcof;
  const MD_Double two_ewaldcof2 = 2.0 * ewaldcof * ewaldcof;
  const MD_Double c = 2.0 * ewaldcof * one_over_sqrtPi;
  MD_Double *neg_dirG1q = dir->neg_dirG1q;
  MD_Double *b;
  MD_Dvec posi, rij, fij;   
  MD_Dvec bqri, fi;  /* to reduce the number of writes in the inner loop */
  MD_Double qi, qj, qg0q, g0qj;
  MD_Double rsqr, b0, b1q;
  struct Cell_Tag *cell = dir->linkcell->cell;
#ifndef EXACT_EWALD
  struct Erfc_Tag *myerfc = dir->myerfc;
#endif
  const MD_Int natoms = dir->natoms;
  const MD_Int numcell = dir->linkcell->numcell;
  MD_Int* neibrs;
  MD_Int ineibr;
  MD_Int **excllist = params->pexcllist;
  const MD_Int *excl;
  MD_Int excld = 0;  /* in case excllist is NULL, excld must be inited */
  MD_Int* head = dir->linkcell->head;
  MD_Int* list = dir->linkcell->list;
  MD_Int i, j, kcell, nn;

  linkcell_hash_atoms(dir->linkcell, pos); 

  qg0q = 0.0;
  memset(neg_dirG1q, 0, 3*natoms*sizeof(*neg_dirG1q));
  memset(force, 0, natoms*sizeof(*force));

  for (kcell = 0;  kcell < numcell;  kcell++) {
    i = head[kcell];  
    while(i >= 0) { 
      qi = charge[i];
      posi = pos[i];
      bqri.x = bqri.y = bqri.z = 0.0;
      fi.x = fi.y = fi.z = 0.0;
      ineibr = 0;
      neibrs = dir->neibrlist[i];
      b = dir->b_arr[i] - 1;
      g0qj = 0.0;
      j = list[i];  
      while(j >= 0) {   /* check other atoms in this cell */
        /*printf("evaluate pme between %d, %d in same cell.\n", i,j);*/
        MD_vec_substract(posi, pos[j], rij);
        SIMPLE_BOUND_VEC(rij, systemsize); 
	rsqr = MD_vec_dot(rij, rij);
        if (rsqr < rcut2) {
          if (excllist != NULL) {  /* the exclusion list is short */
            excl = excllist[i];
            while (*excl < j) excl++;
            excld = (0 == (*excl) - j);
          }
#ifdef EXACT_EWALD
	  COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld);
#else 
	  COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld, myerfc);
#endif
          qj = charge[j];
	  g0qj += b0 * qj;
	  b1q = b[1] * qj;
	  bqri.x += b1q * rij.x;
	  bqri.y += b1q * rij.y;
	  bqri.z += b1q * rij.z;
	  b1q = b[1] * qi;
	  neg_dirG1q[3*j+X] -= b1q * rij.x;
	  neg_dirG1q[3*j+Y] -= b1q * rij.y;
	  neg_dirG1q[3*j+Z] -= b1q * rij.z;
	  b1q *= qj;
	  MD_vec_mul(rij, b1q, fij);
	  MD_vec_add(fi, fij, fi);
	  MD_vec_substract(force[j], fij, force[j]);
          neibrs[ineibr] = j;            
	  ineibr++;
	  ASSERT(ineibr <= dir->max_neibrs);
	  /* computation of virial is neglected */
	  b += 3;
	}
	j = list[j]; 
      } 
      for (nn=0; nn<cell[kcell].numnbrs;  nn++) { /* check neighbor cells */
        j = head[cell[kcell].nbr[nn]];
        while (j >= 0) {
	  MD_vec_substract(posi, pos[j], rij);
	  SIMPLE_BOUND_VEC(rij, systemsize); 
	  rsqr = MD_vec_dot(rij, rij);
	  if (rsqr < rcut2) {
	    if (excllist != NULL) {  /* the exclusion list is short */
	      excl = excllist[i];
	      while (*excl < j) excl++;
	      excld = (0 == (*excl) - j);
	    }
#ifdef EXACT_EWALD
	    COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld);
#else 
	    COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld, myerfc);
#endif
	    qj = charge[j];
	    g0qj += b0 * qj;
	    b1q = b[1] * qj;
	    bqri.x += b1q * rij.x;
	    bqri.y += b1q * rij.y;
	    bqri.z += b1q * rij.z;
	    b1q = b[1] * qi;
	    neg_dirG1q[3*j+X] -= b1q * rij.x;
	    neg_dirG1q[3*j+Y] -= b1q * rij.y;
	    neg_dirG1q[3*j+Z] -= b1q * rij.z;
	    b1q *= qj;
	    MD_vec_mul(rij, b1q, fij);
	    MD_vec_add(fi, fij, fi);
	    MD_vec_substract(force[j], fij, force[j]); 
	    /* computation of virial is neglected */
	    neibrs[ineibr] = j;            
	    ineibr++;
	    ASSERT(ineibr <= dir->max_neibrs);
	    b += 3;
	  }
          j = list[j]; /* next atom */
        }
      } /* nn loop is over */
      neg_dirG1q[3*i+X] += bqri.x;
      neg_dirG1q[3*i+Y] += bqri.y;
      neg_dirG1q[3*i+Z] += bqri.z;
      MD_vec_add(force[i], fi, force[i]);
      qg0q += qi * g0qj;  /* accumulate direct charge--charge potential */
      dir->numneibrs[i] = ineibr;
      ASSERT(b+1 <= dir->b_arr[i] + 3*dir->max_neibrs);
      i = list[i];
    } /* i loop over */
  } /* kcell loop over */

  /* no 1/2, since j=0-->i-1 */
  dir->potential = qg0q;  /* dirG0 = 0 */

#ifdef DEBUG_PME
  /* this is a faster way to compute force, can it compute virial ? */
  /* cancellation error doesnot seem cause a problem */
  printf("direct qq potential = %20.15f\n", dir->potential);
  {MD_Double f2=0.0;
   MD_Double dx, dy, dz, dd = 0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(force[i], force[i]);
    printf("directqq: <f,f>=%f\n", f2);
    for (i=0; i<natoms; i++) {
      dx = force[i].x - charge[i] * neg_dirG1q[3*i+X]; 
      dy = force[i].y - charge[i] * neg_dirG1q[3*i+Y]; 
      dz = force[i].z - charge[i] * neg_dirG1q[3*i+Z]; 
      dd += dx*dx + dy*dy + dz*dz; 
    }
    printf("2-norm relative error: |f - q*negG1q|/|f|=%g\n", sqrt(dd/f2));
 }
#endif

  return;
}

#if 0
/* Wei says that this is an unfinished addition to his code - DJH */
void pmedir_dipole_setup_with_precond(struct PmeDirect_Tag* dir,
				      struct PmeParams_Tag *params,
				      const MD_Double *charge, 
				      const MD_Dvec *pos,
				      struct Preconditioner_Tag *precond)
{
  MD_Dvec * const force = dir->force;
  const MD_Dvec systemsize = dir->linkcell->systemsize;
  const MD_Double rcut2 = params->cutoff * params->cutoff;

  const MD_Double prec_rcut2 = precond->rcut * precond->rcut;
  MD_Int *prec_nneibrs    = precond->nneibrs;
  MD_Int **prec_neibrlist = precond->neibrlist;
  MD_Int in;

  const MD_Double ewaldcof = params->ewaldcof;
  const MD_Double two_ewaldcof2 = 2.0 * ewaldcof * ewaldcof;
  const MD_Double c = 2.0 * ewaldcof * one_over_sqrtPi;
  MD_Double *neg_dirG1q = dir->neg_dirG1q;
  MD_Double *b;
  MD_Dvec posi, rij, fij;   
  MD_Dvec bqri, fi;  /* to reduce the number of writes in the inner loop */
  MD_Double qi, qj, qg0q, g0qj;
  MD_Double rsqr, b0, b1q;
  struct Cell_Tag *cell = dir->linkcell->cell;
#ifndef EXACT_EWALD
  struct Erfc_Tag *myerfc = dir->myerfc;
#endif
  const MD_Int natoms = dir->natoms;
  const MD_Int numcell = dir->linkcell->numcell;
  MD_Int* neibrs;
  MD_Int ineibr;
  MD_Int **excllist = params->pexcllist;
  const MD_Int *excl;
  MD_Int excld = 0;  /* in case excllist is NULL, excld must be inited */
  MD_Int* head = dir->linkcell->head;
  MD_Int* list = dir->linkcell->list;
  MD_Int i, j, kcell, nn;

  linkcell_hash_atoms(dir->linkcell, pos); 

  qg0q = 0.0;
  memset(neg_dirG1q, 0, 3*natoms*sizeof(*neg_dirG1q));
  memset(force, 0, natoms*sizeof(*force));

  for (kcell = 0;  kcell < numcell;  kcell++) {
    i = head[kcell];  
    in = 0;
    while(i >= 0) { 
      qi = charge[i];
      posi = pos[i];
      bqri.x = bqri.y = bqri.z = 0.0;
      fi.x = fi.y = fi.z = 0.0;
      ineibr = 0;
      neibrs = dir->neibrlist[i];
      b = dir->b_arr[i] - 1;
      g0qj = 0.0;
      j = list[i];  
      while(j >= 0) {   /* check other atoms in this cell */
        /*printf("evaluate pme between %d, %d in same cell.\n", i,j);*/
        MD_vec_substract(posi, pos[j], rij);
        SIMPLE_BOUND_VEC(rij, systemsize); 
	rsqr = MD_vec_dot(rij, rij);
        if (rsqr < rcut2) {
          if (excllist != NULL) {  /* the exclusion list is short */
            excl = excllist[i];
            while (*excl < j) excl++;
            excld = (0 == (*excl) - j);
          }

	  if (!excld && rsqr < prec_rcut2) {
	    prec_neibrlist[in++] = j;
	  }

#ifdef EXACT_EWALD
	  COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld);
#else 
	  COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld, myerfc);
#endif
          qj = charge[j];
	  g0qj += b0 * qj;
	  b1q = b[1] * qj;
	  bqri.x += b1q * rij.x;
	  bqri.y += b1q * rij.y;
	  bqri.z += b1q * rij.z;
	  b1q = b[1] * qi;
	  neg_dirG1q[3*j+X] -= b1q * rij.x;
	  neg_dirG1q[3*j+Y] -= b1q * rij.y;
	  neg_dirG1q[3*j+Z] -= b1q * rij.z;
	  b1q *= qj;
	  MD_vec_mul(rij, b1q, fij);
	  MD_vec_add(fi, fij, fi);
	  MD_vec_substract(force[j], fij, force[j]);
          neibrs[ineibr] = j;            
	  ineibr++;
	  ASSERT(ineibr <= dir->max_neibrs);
	  /* computation of virial is neglected */
	  b += 3;
	}
	j = list[j]; 
      } 
      for (nn=0; nn<cell[kcell].numnbrs;  nn++) { /* check neighbor cells */
        j = head[cell[kcell].nbr[nn]];
        while (j >= 0) {
	  MD_vec_substract(posi, pos[j], rij);
	  SIMPLE_BOUND_VEC(rij, systemsize); 
	  rsqr = MD_vec_dot(rij, rij);
	  if (rsqr < rcut2) {
	    if (excllist != NULL) {  /* the exclusion list is short */
	      excl = excllist[i];
	      while (*excl < j) excl++;
	      excld = (0 == (*excl) - j);
	    }

	  if (!excld && rsqr < prec_rcut2) {
	    prec_neibrlist[in++] = j;
	  }

#ifdef EXACT_EWALD
	    COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld);
#else 
	    COMPUTE_B(b, b0, rsqr, c, ewaldcof, two_ewaldcof2, excld, myerfc);
#endif
	    qj = charge[j];
	    g0qj += b0 * qj;
	    b1q = b[1] * qj;
	    bqri.x += b1q * rij.x;
	    bqri.y += b1q * rij.y;
	    bqri.z += b1q * rij.z;
	    b1q = b[1] * qi;
	    neg_dirG1q[3*j+X] -= b1q * rij.x;
	    neg_dirG1q[3*j+Y] -= b1q * rij.y;
	    neg_dirG1q[3*j+Z] -= b1q * rij.z;
	    b1q *= qj;
	    MD_vec_mul(rij, b1q, fij);
	    MD_vec_add(fi, fij, fi);
	    MD_vec_substract(force[j], fij, force[j]); 
	    /* computation of virial is neglected */
	    neibrs[ineibr] = j;            
	    ineibr++;
	    ASSERT(ineibr <= dir->max_neibrs);
	    b += 3;
	  }
          j = list[j]; /* next atom */
        }
      } /* nn loop is over */

      prec_nneibrs[i] = in;

      neg_dirG1q[3*i+X] += bqri.x;
      neg_dirG1q[3*i+Y] += bqri.y;
      neg_dirG1q[3*i+Z] += bqri.z;
      MD_vec_add(force[i], fi, force[i]);
      qg0q += qi * g0qj;  /* accumulate direct charge--charge potential */
      dir->numneibrs[i] = ineibr;
      ASSERT(b+1 <= dir->b_arr[i] + 3*dir->max_neibrs);
      i = list[i];
    } /* i loop over */
  } /* kcell loop over */

  /* no 1/2, since j=0-->i-1 */
  dir->potential = qg0q;  /* dirG0 = 0 */

#ifdef DEBUG_PME
  /* this is a faster way to compute force, can it compute virial ? */
  /* cancellation error doesnot seem cause a problem */
  printf("direct qq potential = %20.15f\n", dir->potential);
  {MD_Double f2=0.0;
   MD_Double dx, dy, dz, dd = 0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(force[i], force[i]);
    printf("directqq: <f,f>=%f\n", f2);
    for (i=0; i<natoms; i++) {
      dx = force[i].x - charge[i] * neg_dirG1q[3*i+X]; 
      dy = force[i].y - charge[i] * neg_dirG1q[3*i+Y]; 
      dz = force[i].z - charge[i] * neg_dirG1q[3*i+Z]; 
      dd += dx*dx + dy*dy + dz*dz; 
    }
    printf("2-norm relative error: |f - q*negG1q|/|f|=%g\n", sqrt(dd/f2));
 }
#endif

  return;
}
#endif







#ifdef COMPUTE_B
#undef COMPUTE_B
#endif
#ifdef EXACT_EWALD  /* c = 2*beta/sqrtPi */
#define COMPUTE_B(b0, b1, rsqr, beta, c, excld) {	\
    MD_Double r = sqrt(rsqr);				\
    MD_Double br = beta * r;				\
    MD_Double tmp = exp(-br*br) * c;			\
    b0 = (erfc(br) - excld) / r;			\
    b1 = (b0 + tmp) / rsqr;				\
  }
#else
#define COMPUTE_B(b0, b1, rsqr, beta, c, excld, myerfc) {	\
    MD_Double r = sqrt(rsqr);					\
    MD_Double br = beta * r;					\
    MD_Double tmp = exp(-br*br) * c;				\
    b0 = (erfc_eval(myerfc, br) - excld) / r;			\
    b1 = (b0 + tmp) / rsqr;					\
  }
#endif


/* charge-charge energy and force, copied+extracted from pmedir_dipole_setup */
void pmedir_charge_compute(struct PmeDirect_Tag* dir, 
			   struct PmeParams_Tag *params,
			   const MD_Double *charge, const MD_Dvec *pos)
{
  MD_Dvec * const force = dir->force;
  const MD_Dvec systemsize = dir->linkcell->systemsize;
  const MD_Double rcut2 = params->cutoff * params->cutoff;
  const MD_Double ewaldcof = params->ewaldcof;
  const MD_Double c = 2.0 * ewaldcof * one_over_sqrtPi;
  MD_Dvec posi, rij, fij;   
  MD_Double qi, qj, qg0q, g0qj;
  MD_Double rsqr, b0, b1, b1q;
  struct Cell_Tag *cell = dir->linkcell->cell;
#ifndef EXACT_EWALD
  struct Erfc_Tag *myerfc = dir->myerfc;
#endif
  const MD_Int natoms = dir->natoms;
  const MD_Int numcell = dir->linkcell->numcell;
  MD_Int **excllist = params->pexcllist;
  const MD_Int *excl;
  MD_Int excld = 0; /* in case exclist==NULL, excld must be inited properly */
  MD_Int* head = dir->linkcell->head;
  MD_Int* list = dir->linkcell->list;
  MD_Int i, j, kcell, nn;

  linkcell_hash_atoms(dir->linkcell, pos); 

  qg0q = 0.0;
  for (i=0; i<natoms; i++) force[i].x=force[i].y=force[i].z=0.0;
  for (kcell = 0;  kcell < numcell;  kcell++) {
    i = head[kcell];  /* first atom in cell */
    while(i >= 0) {  
      qi = charge[i];
      posi = pos[i];
      g0qj = 0.0;
      j = list[i];  
      while(j >= 0) {   /* check other atoms in this cell */
        /*printf("evaluate pme between %d, %d in same cell.\n", i,j);*/
	ASSERT(j < natoms);
        MD_vec_substract(posi, pos[j], rij);
        SIMPLE_BOUND_VEC(rij, systemsize); 
	rsqr = MD_vec_dot(rij, rij);
        if (rsqr < rcut2) {
	  if (NULL != excllist) {  /* the exclusion list is short */
	    excl = excllist[i];
	    while (*excl < j) excl++;
	    excld = (0 == (*excl) - j);
	  }
#ifdef EXACT_EWALD
	  COMPUTE_B(b0, b1, rsqr, ewaldcof, c, excld);
#else 
	  COMPUTE_B(b0, b1, rsqr, ewaldcof, c, excld, myerfc);
#endif
          qj = charge[j];
	  g0qj += b0 * qj;
	  b1q = b1 * qi * qj;
	  MD_vec_mul(rij, b1q, fij);
	  MD_vec_add(force[i], fij, force[i]);
	  MD_vec_substract(force[j], fij, force[j]);
	}
	j = list[j]; 
      } 
      for (nn=0; nn<cell[kcell].numnbrs;  nn++) { /* check neighbors */
        j = head[cell[kcell].nbr[nn]];
	ASSERT(j < natoms);
        while (j >= 0) {
	  MD_vec_substract(posi, pos[j], rij);
	  SIMPLE_BOUND_VEC(rij, systemsize); 
	  rsqr = MD_vec_dot(rij, rij);
	  if (rsqr < rcut2) {
	    if (NULL != excllist) {  /* the exclusion list is short */
	      excl = excllist[i];
	      while (*excl < j) excl++;
	      excld = (0 == (*excl) - j);
	    }
#ifdef EXACT_EWALD
	    COMPUTE_B(b0, b1, rsqr, ewaldcof, c, excld);
#else 
	    COMPUTE_B(b0, b1, rsqr, ewaldcof, c, excld, myerfc);
#endif
	    qj = charge[j];
	    g0qj += b0 * qj;
	    b1q = b1 * qi * qj;
	    MD_vec_mul(rij, b1q, fij);
	    MD_vec_add(force[i], fij, force[i]);
	    MD_vec_substract(force[j], fij, force[j]);
	  }
          j = list[j]; /* next atom */
        }
      } /* nn loop is over */
      qg0q += qi * g0qj;  
      i = list[i];
    } /* i loop over */
  } /* kcell loop over */

  /* no 1/2, since j=0-->i-1 */
  dir->potential = qg0q;  /* dirG0 = 0 */

#ifdef DEBUG_PME
  printf("direct qq potential = %20.15f\n", dir->potential);
  {MD_Double f2=0.0;
    for (i=0; i<natoms; i++) f2+= MD_vec_dot(force[i], force[i]);
    printf("directqq: <f,f>=%f\n", f2);
  }
#endif 

  return;
}


