/*
 * Copyright (C) 2004-2006 by Wei Wang.  All rights reserved.
 */

/*
 * dipole equation solvers implementation.
 *
 */
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include "dsolvers.h"
#include "utilities.h"
#include "explicitG.h"
#include "helper.h"
#include "unit.h"
#include "standEwald.h"
#include "pme.h"
/*
#include "pme_direct.h"
*/
#include "constant.h"
#ifdef TIMING
# include "timer.h"
  static MD_Double Tpred = 0.0;
#endif

/*
 * Forward declarations of the solver back-ends.  All of them share the
 * signature of solver_function_type below so that they can be stored in the
 * Private_Solver dispatch table and called through a function pointer.
 * In the implementations visible in this file, x holds the initial guess on
 * entry and the computed dipole vector on return.
 */
static MD_Errcode FPicard(struct Dsolver_Tag *dsolver, MD_Double *x);
static MD_Errcode FdamPicard(struct Dsolver_Tag *dsolver, MD_Double *x);
static MD_Errcode FCG(struct Dsolver_Tag *dsolver, MD_Double *x);
static MD_Errcode FJCG_X(struct Dsolver_Tag *dsolver, MD_Double *x);/*D prec*/
static MD_Errcode FJCG_R(struct Dsolver_Tag *dsolver, MD_Double *x);
static MD_Errcode FPCG_X(struct Dsolver_Tag *dsolver, MD_Double *x);/*M prec*/
static MD_Errcode FPCG_R(struct Dsolver_Tag *dsolver, MD_Double *x);
static MD_Errcode FCheby(struct Dsolver_Tag *dsolver, MD_Double *x);
static MD_Errcode FQCG_X(struct Dsolver_Tag *dsolver, MD_Double *x);/*quickstart*/
static MD_Errcode FMCG_X(struct Dsolver_Tag *dsolver, MD_Double *x);/*modified CG for quick convergence recognition */

/* common signature of every solver back-end: (solver object, dipole vector) */
typedef MD_Errcode (*solver_function_type) (struct Dsolver_Tag*, MD_Double *); 

/*
 * private member of dsolver module: one entry of the solver dispatch table,
 * tying a back-end function to its printable name and to the amount of
 * scratch memory it needs.
 */
typedef struct private_solver_type {
  solver_function_type action;  /* back-end invoked by dsolver_solve */
  MD_String name;               /* description printed by dsolver_init */
  MD_Int nvectors; /* number of vectors stored here; dsolver_init allocates */
                   /* nvectors * matrixsize doubles of workspace */
} private_solver_type;

/*
 * Dispatch table indexed by dsolver->method.  The third field is the number
 * of matrixsize-long scratch vectors the back-end carves out of
 * dsolver->workspace (e.g. Picard: xn, g2x; CG/JCG: s, r, as;
 * PCG: s, r, as, invMr; peek CG: s, r, as, invMr, y).
 */
static private_solver_type Private_Solver[] = { 
  /* must be in same order as Dsolver_method */
  {FPicard, "Picard", 2},
  {FdamPicard, "Damped Picard", 2},
  {FCG, "Conjugate Gradient", 3},
  {FJCG_X, "Jacobi-preconditioned CG (|x(n)-x(n-1)|^2/n < tol^2)", 3},
  {FJCG_R, "Jacobi-preconditioned CG (|residue|^2/n < tol^2)", 3},
  {FQCG_X, "Quick-start CG (|x(n)-x(n-1)|^2/n <tol^2)", 5},
  {FPCG_X, "cutoff-preconditioned CG, (|x(n)-x(n-1)|^2/n < tol^2)", 4},
  {FPCG_R, "cutoff-preconditioned CG (|residue|^2/n < tol^2)", 4},
  {FMCG_X, "peek CG, (|x(n)-x(n-1)|^2/n < tol^2)", 5},
  {FCheby, "Chebyshev semi-iterative", 4},
};


/*
 * Module-wide iteration statistics, shared by all solver back-ends.
 */
static MD_Int total_iter;     /* total number of iterations so far */
static MD_Int total_calls;    /* total number of calls to dsolver_solve */
static const MD_Int skip = 100; /* initial number of steps skipped when */
                                /* calc <mat-vec-mul> */

/*
 * Record one solver call that took `iter` iterations and, every
 * `output_freq` calls, print the running average prefixed by `method_name`.
 * The first `skip` calls are counted but do not contribute to the average.
 *
 * Wrapped in do { } while (0) so that "OUTPUT_AVG_ITER(...);" is a single
 * statement (safe inside if/else); arguments are parenthesized so that
 * arbitrary expressions can be passed.  output_freq must be nonzero.
 */
#define OUTPUT_AVG_ITER(iter, output_freq, method_name) \
  do { \
    total_calls++; \
    if (total_calls > skip) { \
      total_iter += (iter); \
      if (0 == total_calls % (output_freq)) { \
        printf("%s <iteration>=%6.3f\n", (method_name), \
               (MD_Double) total_iter / (MD_Double) (total_calls - skip)); \
      } \
    } \
  } while (0)


/***********************************************************************
 *
 * Implementation
 *
 ***********************************************************************/

/*
 * Initialize a dipole-equation solver from the parameters in init_data.
 *
 * Copies the configuration into *dsolver, fills the matrix diagonal through
 * the Ewald back-end selected by init_data->ewaldmethod, sets up the cutoff
 * preconditioner for the methods that use one (Chebyshev, PCG_X, PCG_R,
 * MCG_X), initializes the predictor and allocates the dipole vector and the
 * per-method workspace (nvectors * matrixsize doubles).
 *
 * Every pointer owned by the solver is set to NULL before anything is
 * allocated, so that dsolver_destroy() never tests or frees an
 * indeterminate pointer (e.g. precond for a non-preconditioned method).
 *
 * Returns OK on success; MD_FAIL if the solver method or the Ewald method
 * is unknown, or if the predictor cannot be initialized.
 */
MD_Errcode dsolver_init(struct Dsolver_Tag *dsolver, 
			struct Dsolver_Init_Tag *init_data)
{
  const size_t nsolvers = sizeof(Private_Solver) / sizeof(Private_Solver[0]);

  assert(NULL != dsolver); 
  assert(NULL != init_data);

  dsolver->electro           = init_data->electro;
  dsolver->ewaldmethod       = init_data->ewaldmethod;
  dsolver->mat_vec_mul_mod   = init_data->mat_vec_mul_mod;
  dsolver->compute_pseudores = init_data->compute_pseudores;
  dsolver->matrixsize        = init_data->matrixsize;
  dsolver->errTol2           = init_data->specified_param.errTol2;
  dsolver->method            = init_data->specified_param.method;
  dsolver->maxiter           = init_data->specified_param.maxiter;

  dsolver->diag      = NULL;
  dsolver->precond   = NULL;
  dsolver->predictor = NULL;
  dsolver->dipole    = NULL;
  dsolver->workspace = NULL;

  /* the method is used as an index into Private_Solver: validate it first
   * (a negative value becomes a huge size_t and is rejected as well) */
  if ((size_t) dsolver->method >= nsolvers) {
    fprintf(stderr, "unknown dipole solver method: %d\n",
	    (int) dsolver->method);
    return MD_FAIL;
  }

  dsolver->diag = my_calloc((size_t)dsolver->matrixsize, 
			    sizeof(* dsolver->diag), "diagonal array");

  if (ES_StandardEwald == dsolver->ewaldmethod) {
    stdEw_fill_diagonal(dsolver->electro, dsolver->diag);
  } else if (ES_SPME == dsolver->ewaldmethod) {
    pme_fill_diagonal(dsolver->electro, dsolver->diag);
  } else {
    fprintf(stderr, "unknown Ewald method: %d\n", dsolver->ewaldmethod);
    free(dsolver->diag);      /* do not leak the diagonal on failure */
    dsolver->diag = NULL;
    return MD_FAIL;
  }

  dsolver->output_freq     = 1000; /* 10000 */

  printf("Dsolver module:\n");
  printf("  error tolerance^2: (|xnew-xold|^2/n) is %g DEBYE^2\n", 
	 dsolver->errTol2 / (DEBYE*DEBYE));
  printf("  maximum iteration: %d\n", dsolver->maxiter);
  printf("   iteration method: %s\n", Private_Solver[dsolver->method].name);
  printf("   output frequency: %d\n", dsolver->output_freq);
#ifdef DIPOLE_POLY
  printf(" -- note: use DIPOLE_POLY noniterative method\n");
#endif

  if (Chebyshev == dsolver->method || PCG_X == dsolver->method || 
      PCG_R == dsolver->method || MCG_X == dsolver->method)  {
    const MD_Double rc = 0.0;   /* preconditioner cutoff radius, hard-wired */
    /* 2 IS needed for practical purpose, preconditioner has a different 
     * cutoff radius, generally smaller than cell size, so there is no 1/2 
     * save in space, as there is for the link-cell structure */
    const MD_Int maxneibrs =
      (MD_Int) (ceil(init_data->density * rc*rc*rc*4.0*Pi/3.0) * 2.0);
    dsolver->precond = my_calloc((size_t)1, sizeof(struct Preconditioner_Tag),
                                 "preconditioner");
    if (ES_StandardEwald == dsolver->ewaldmethod) {
      struct standEwald_Tag * se = dsolver->electro;
      preconditioner_init(dsolver->precond, 
			  se->natoms, 
			  rc, 
			  se->ppos, dsolver->diag, 
			  se->systemsize, 
			  se->pexcllist, 
			  maxneibrs, 
			  se->neibrlist, 
			  se->numneibrs,
			  1);
    } else if (ES_SPME == dsolver->ewaldmethod) {
      struct Pme_Tag *pme = dsolver->electro;
      MD_Dvec systemsize;
      /* only the diagonal of the cell basis is read here */
      systemsize.x = pme->savePmeParams.cellBasisVector1.x;
      systemsize.y = pme->savePmeParams.cellBasisVector2.y;
      systemsize.z = pme->savePmeParams.cellBasisVector3.z;
      preconditioner_init(dsolver->precond, 
			  pme->savePmeParams.natoms, 
			  rc, 
			  pme->savePmeParams.ppos, 
			  dsolver->diag, 
			  systemsize, 
			  pme->savePmeParams.pexcllist, 
			  maxneibrs, 
			  init_data->neibrlist, 
			  init_data->numneibrs,
			  1);
    }
  }

  dsolver->predictor = my_calloc((size_t)1, sizeof(struct Predictor_Tag),
                                 "predictor");
  if (predictor_init(dsolver->predictor, 
                     init_data->specified_param.pred_type,
                     init_data->specified_param.pred_degree, 
                     dsolver->matrixsize, 
                     init_data->specified_param.restart)) {
    fprintf(stderr, "cannot initiate predictor\n");
    return MD_FAIL;
  }

  dsolver->dipole = my_calloc((size_t)dsolver->matrixsize, sizeof(MD_Double),
                              "dipole vector");
  dsolver->workspace = my_calloc(  
    (size_t) Private_Solver[dsolver->method].nvectors * dsolver->matrixsize, 
    sizeof(MD_Double), "workspace");

  return OK;
}


/*
 * Release everything owned by the solver and zero the structure.
 *
 * Prints the prediction time (TIMING builds) and the average iteration
 * count, then frees the diagonal, dipole and workspace arrays, the
 * preconditioner (if one was created) and the predictor.
 *
 * The structure is zeroed even when predictor_destroy() reports an error,
 * because the arrays above have already been freed at that point and must
 * not be left behind as dangling pointers.
 *
 * NOTE(review): the predictor structure itself is allocated by
 * dsolver_init() and is freed here after predictor_destroy(), mirroring the
 * preconditioner handling; this assumes predictor_destroy() releases only
 * the predictor's contents -- confirm against predictor.c.
 *
 * Returns OK, or MD_FAIL if predictor_destroy() failed.
 */
MD_Errcode dsolver_destroy(struct Dsolver_Tag *dsolver)
{
  MD_Errcode status = OK;

#ifdef TIMING
  printf("prediction time: %f\n", Tpred);
#endif

  if (total_calls - skip > 0) {
    printf("<iteration>=%6.3f\n",  ((MD_Double) total_iter)
	 / ((MD_Double) total_calls - skip));
  }

  /* free(NULL) is a no-op, so no guards are needed */
  free(dsolver->diag);
  free(dsolver->dipole);
  free(dsolver->workspace);
  if (NULL != dsolver->precond) {
    preconditioner_destroy(dsolver->precond);
    free(dsolver->precond);
  }
  if (predictor_destroy(dsolver->predictor)) {
    printf("error when destorying predictor\n");
    status = MD_FAIL;
  }
  free(dsolver->predictor);

  memset(dsolver, 0, sizeof(struct Dsolver_Tag));
  return status;
}


/*
 * Accessor for the dipole vector held by the solver.
 * A NULL solver is treated as a fatal programming error: a message is
 * printed and the process exits with status 1.
 */
MD_Double *dsolver_get_dipole(const struct Dsolver_Tag *dsolver)
{
  MD_Double *dipole_vector;

  if (!dsolver) {
    printf("wrong: called with dsolver = NULL, has to stop !! \n");
    exit(1);
  }

  dipole_vector = dsolver->dipole;
  return dipole_vector;
}


/*
 * Write the predictor's buffer of old dipole vectors to `filename` in
 * binary form.  Returns OK on success, MD_FAIL if the dump fails.
 */
MD_Errcode dsolver_dump_dipole(const struct Dsolver_Tag *dsolver,
			       const char* filename)
{
  if (!vec_buffer_bindump(dsolver->predictor->old_vectors, filename)) {
    return OK;
  }

  fprintf(stderr, "cannot dump dipoles\n");
  return MD_FAIL;
}


#ifdef ANALYZE_MATRIX
/*
 * Build the dump file name "output_Gs/<stem>_<step>.dat" in `filename`,
 * which must be an MD_String buffer.
 *
 * The formatting call is deliberately kept OUTSIDE of assert(): with NDEBUG
 * defined, assert() discards its argument, and a sprintf() placed inside it
 * would never run, leaving the file name uninitialized.
 */
static void analyze_dump_name(char *filename, const char *stem, MD_Int step)
{
  const int len = sprintf(filename, "output_Gs/%s_%d.dat", stem, step);
  assert(len + 1 < (MD_Int)sizeof(MD_String));
  (void) len;   /* silence "unused" warnings when NDEBUG is defined */
}
#endif

/*
 * Solve the dipole equation for the current configuration.
 *
 * 1. ask the predictor for an initial guess (stored in dsolver->dipole),
 * 2. run the iterative method selected by dsolver->method on that guess,
 * 3. hand the converged dipoles back to the predictor.
 *
 * With ANALYZE_MATRIX defined, the predicted dipoles, old dipoles,
 * positions, G2 matrices and right-hand side are dumped to output_Gs/
 * every `interval` steps, followed by the solved dipoles.
 * With TIMING defined, the time spent in prediction is accumulated in Tpred.
 *
 * Returns OK on success, MD_FAIL if prediction, solution, predictor update
 * or (ANALYZE_MATRIX only) dumping the old dipoles fails.
 */
MD_Errcode dsolver_solve(struct Dsolver_Tag *dsolver)
{
  MD_Double *dipole = dsolver->dipole;
#ifdef ANALYZE_MATRIX
  static MD_Int steps = 0;
  MD_Int interval = 2000;
#endif
#ifdef TIMING
  MD_Double tstart;
#endif

#ifdef TIMING
  tstart = time_of_day();
#endif
  if (predictor_predict(dsolver->predictor, dipole)) {
    printf("failed to predict\n");
    return MD_FAIL;
  }
#ifdef TIMING
  Tpred += time_of_day() - tstart;
#endif

#ifdef ANALYZE_MATRIX
  steps++;
  if ((steps-2) % interval == 0 && steps > 2) {  
    MD_String filename; 
    const MD_Int matrixsize = dsolver->matrixsize;
    MD_Double *G0, *G1, *G2dir, *G2rec;

    analyze_dump_name(filename, "d0", steps-2);
    bindump_array(dipole, matrixsize, filename);

    analyze_dump_name(filename, "olddipoles", steps-2);
    if (vec_buffer_bindump(dsolver->predictor->old_vectors, filename)) {
      return MD_FAIL;
    }

    analyze_dump_name(filename, "pos", steps-2);
    bindump_vec_array(dsolver->electro->ppos, matrixsize/3, filename);

    printf("to compute G2 matrix elements.....\n");
    explicitG2(dsolver->electro, NULL, &G0, &G1, &G2dir, &G2rec);
    fprintf(stderr, "G2 obtained, to dump G2 into disk...\n");
    analyze_dump_name(filename, "G2dir", steps-2);
    bindump_array(G2dir, matrixsize*matrixsize, filename);
    analyze_dump_name(filename, "G2rec", steps-2);
    bindump_array(G2rec, matrixsize*matrixsize, filename);
    fprintf(stderr, "to destroy explicitG\n");
    explicitG_destroy();

    analyze_dump_name(filename, "negG1q", steps-2);
    bindump_array(dsolver->pb,  matrixsize, filename);
  }
#endif

  /* dispatch to the back-end selected at initialization time */
  if(Private_Solver[dsolver->method].action(dsolver, dipole)) {
    fprintf(stderr, "cannot solve the dipole equation\n");
    return MD_FAIL;
  }

  if (predictor_update(dsolver->predictor, dipole)) {
    fprintf(stderr, "failed to update predictor\n");
    return MD_FAIL;
  }

#ifdef ANALYZE_MATRIX
  if ((steps-2) % interval == 0 && steps > 2) {
    MD_String filename;
    analyze_dump_name(filename, "d", steps-2);
    bindump_array(dsolver->dipole, dsolver->matrixsize, filename);
  }
#endif

  return OK; 
}


/*
 * Preconditioned Chebyshev semi-iteration, after
 * Numerical Analysis in Modern Scientific Computing: An Introduction, 2nd Ed
 * Peter Deuflhard, Andreas Hohmann,  page 247, 248
 *
 * x holds the initial guess on entry and the final iterate on return.
 * Three iterates are kept alive (two steps back, one step back, and the one
 * being computed); they live in x and in the first two vectors of the
 * workspace and are rotated by pointer swaps.  The third and fourth
 * workspace vectors hold the residue and the preconditioned residue.
 *
 * Returns OK on convergence (|xnew-xold|^2/n <= errTol2), MD_FAIL otherwise.
 */
MD_Errcode FCheby(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  /* Spectrum bounds, temporarily hard-coded.  Values recorded so far as
   * (emin, emax) for different preconditioner cutoffs:
   *   0 angstrom: (-0.34, 0.26)      2 angstrom: (-0.28, 0.18)
   *   3 angstrom: (-0.24, 0.14)      4 angstrom: (-0.21, 0.13)
   */
  const MD_Double emin = -0.34;  /* cutoff = 0 angstrom in preconditioner */
  const MD_Double emax =  0.26; 
  const MD_Double t = (2.0 - emax - emin) / (emax - emin);
  const MD_Double omega = 2.0 / (2.0 - emax - emin);

  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2; 
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;

  MD_Double * const solution = x;  
  MD_Double *older = x;                          /* iterate k-2 */
  MD_Double *newer = dsolver->workspace;         /* iterate k-1 */
  MD_Double *fresh = newer + matrixsize;         /* iterate k   */
  MD_Double *residue = dsolver->workspace + matrixsize * 2;
  MD_Double *invQr = residue + matrixsize;
  MD_Double *swap = NULL;
  MD_Double rho;
  MD_Double tk, tk_1, tk_2; 
  MD_Double step, err2;
  MD_Int k;
  MD_Int iter;
  static MD_Int params_reported = 0;

  if (!params_reported) {
    params_reported = 1;
    printf("Chebyshev parameter: emax= %f, emin= %f\n", emax, emin);
  }

  preconditioner_setup(dsolver->precond);

  /* first step: x1 = x0 + omega * inv(Q) * (b - A*x0) */
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, older, 1, newer);
  for (k = 0; k < matrixsize; k++) {
    residue[k] = newer[k] - older[k]*diag[k];
  }
  preconditioner_solve(dsolver->precond, residue, invQr);

  err2 = 0.0;
  for (k = 0; k < matrixsize; k++) {
    step  = omega * invQr[k];
    newer[k] = older[k] + step;
    err2 += step*step;
  } 
  err2 /= (MD_Double) matrixsize;

  iter = 1;
#ifdef DEBUG_DSOLVER
  printf("iter=%d, |xnew-xold|^2/n= %g Debye^2\n", iter, err2/(DEBYE*DEBYE));
#endif 

  tk_2 = 1.0;  /* T0(t) = 1 */
  tk_1 = t;    /* T1(t) = t */
  while (err2 > errTol2 && iter < maxiter) {
    iter++;
    /* three-term recurrence of the Chebyshev polynomials */
    tk = 2.0 * t * tk_1 - tk_2;
    rho = 2.0 * t * tk_1 / tk;

    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, newer, 1, fresh);
    for (k = 0; k < matrixsize; k++) {
      residue[k] = fresh[k] - newer[k]*diag[k];
    }
    preconditioner_solve(dsolver->precond, residue, invQr);

    err2 = 0.0;
    for (k = 0; k < matrixsize; k++) {
      fresh[k] = older[k] + rho * (newer[k] - older[k] + omega * invQr[k]);
      step = fresh[k] - newer[k];
      err2 += step * step;
    }
    err2 /= (MD_Double)matrixsize;

    tk_2 = tk_1;
    tk_1 = tk;

    /* rotate the three buffers: the oldest one is recycled */
    swap  = older;
    older = newer;
    newer = fresh;
    fresh = swap;
#ifdef DEBUG_DSOLVER
  printf("iter=%d |xnew-xold|^2/n = %g Debye^2\n", iter, err2/(DEBYE*DEBYE));
#endif   
  }

  /* after the rotation the latest iterate is in `newer` */
  if (solution != newer) {
    memcpy(solution, newer, matrixsize*sizeof(MD_Double));
  }

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "Chebyshev");

  if (err2 > errTol2) {
    fprintf(stderr, "*** Cheby does not converge after %d iterations !\n", 
            iter);
    return MD_FAIL;
  }

  return OK; /* success */
}


/*
 * "Peek" preconditioned conjugate gradient, based on 
 * Numerical Analysis in Modern Scientific Computing: An Introduction, 2nd Ed 
 * Peter Deuflhard, Andreas Hohmann, page 257, (B -> M^{-1}, q -> s)
 * the modification is we do one more Picard iteration at the end of each
 * CG iteration step, which is readily available, and if that one gives us
 * convergence, we can just stop. 
 *
 * x holds the initial guess on entry and the solution on return.
 * Workspace layout (5 vectors of length n): s, r, as, invMr, y.
 *
 * Stops when either the Picard peek (err2 = |y - x|^2/n) or the CG update
 * (diff2 = |x(n) - x(n-1)|^2/n) drops below errTol2.  If the initial guess
 * already solves the system (r^T M^{-1} r == 0) the routine returns at once;
 * running the CG step in that case would evaluate alpha = 0/0 and fill x
 * with NaN.
 *
 * Returns OK on convergence, FAILURE after maxiter iterations without it.
 */
MD_Errcode FMCG_X(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int n = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  MD_Double *s = dsolver->workspace;
  MD_Double *r = s + n;
  MD_Double *as = r + n;
  MD_Double *invMr = as + n;
  MD_Double *y = invMr + n;
  MD_Double alpha, beta, rinvMr, old_rinvMr, tmp, diff2, err2;
  MD_Int i, iter;

  preconditioner_setup(dsolver->precond);

  iter = 1;
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, r); /* b-G2*x */
#ifdef DEBUG_DSOLVER
  diff2 = 0.0;              /* accumulator must start from zero */
#endif
  for (i=0; i<n; i++) {
    r[i] -= diag[i]*x[i];   /* r = b - G2*x - D*x */
#ifdef DEBUG_DSOLVER
    tmp = r[i]/diag[i];
    diff2 += tmp*tmp;
#endif
  }
#ifdef DEBUG_DSOLVER
  diff2 /= n;
  printf("iter=%d |r|^2/n = %g, |y(n) - x(n)|^2/n=%g\n", iter, DOT(r,r,n) / n,
	 diff2/(DEBYE*DEBYE));
#endif
  /* deliberately no Picard peek at the first step */

  preconditioner_solve(dsolver->precond, r, s); /* do not need invMr yet */
  rinvMr = DOT(r, s, n);

  if (0.0 == rinvMr) {
    /* nothing to correct; avoid alpha = 0/0 below */
    OUTPUT_AVG_ITER(iter, dsolver->output_freq, "MCG_X"); 
    fflush(stdout);
    return OK;
  }

  do {
    iter++;
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as); /* -G2*s */
    for (i=0; i<n; i++) as[i] = diag[i] * s[i] - as[i];  /* as = A*s */
    alpha = rinvMr / DOT(s, as, n);  /* alpha = (r, M^{-1}r) / (s,As) */
    diff2 = err2 = 0.0;
    for (i=0; i<n; i++) {
      tmp = alpha*s[i];
      diff2+= tmp*tmp;
      x[i] += tmp;              /* x = x + alpha*s */
      r[i] -= alpha*as[i];      /* r = r - alpha*A*s */
      tmp = r[i]/diag[i];
      y[i] = x[i] + tmp;        /* one more Picard iteration, peek */
      err2+= tmp*tmp;
    }
    diff2 /= n;  err2 /= n;
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |r|^2/n = %g, |x(n) - x(n-1)|^2/n=%g Debye^2 err2 = %g Debye^2\n", 
	   iter, DOT(r,r,n)/n, diff2/(DEBYE*DEBYE), err2/(DEBYE*DEBYE));
#endif 
    if (err2 < errTol2) {       /* the peeked Picard iterate has converged */
      memcpy(x, y, (size_t)n*sizeof(*x));
      break;
    }
    if (diff2 < errTol2) break;
    old_rinvMr = rinvMr;
    preconditioner_solve(dsolver->precond, r, invMr);   /* invMr = M^{-1}r */
    rinvMr = DOT(r, invMr, n);                          
    beta = rinvMr / old_rinvMr;               /* beta = (r,M^{-1}r)/oldrMr */
    for (i=0; i<n; i++) s[i] = invMr[i] + beta*s[i]; /* s=M^{-1}r + beta*s */
  } while (iter < maxiter);

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "MCG_X"); 

  if (err2 >= errTol2 && diff2 >= errTol2) {
    fprintf(stderr, "*** Peek-CG does not converge after %d iterations !\n", iter);
    return FAILURE; 
  }
  fflush(stdout);

  return OK;
}


/*
 * Cutoff-preconditioned conjugate gradient, based on 
 * Numerical Analysis in Modern Scientific Computing: An Introduction, 2nd Ed 
 * Peter Deuflhard, Andreas Hohmann, page 257, (B -> M^{-1}, q -> s)
 *
 * x holds the initial guess on entry and the solution on return.
 * Workspace layout (4 vectors of length n): s, r, as, invMr.
 *
 * Converged when |x(n) - x(n-1)|^2/n < errTol2.  If the initial guess
 * already solves the system (r^T M^{-1} r == 0) the routine returns at once;
 * running the CG step in that case would evaluate alpha = 0/0 and fill x
 * with NaN.
 *
 * Returns OK on convergence, FAILURE after maxiter iterations without it.
 */
MD_Errcode FPCG_X(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int n = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  MD_Double *s = dsolver->workspace;
  MD_Double *r = s + n;
  MD_Double *as = r + n;
  MD_Double *invMr = as + n;
  MD_Double alpha, beta, rinvMr, old_rinvMr, tmp, diff2;
  MD_Int i, iter;

  preconditioner_setup(dsolver->precond);

  iter = 1;
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, r); /* b-G2*x */
  for (i=0; i<n; i++) r[i] -= diag[i]*x[i];   /* r = b - G2*x - D*x */
#ifdef DEBUG_DSOLVER
  printf("iter=%d |r|^2/n = %g\n", iter, DOT(r,r,n) / n);
#endif
  preconditioner_solve(dsolver->precond, r, s); /* do not need invMr yet */
  rinvMr = DOT(r, s, n);

  if (0.0 == rinvMr) {
    /* nothing to correct; avoid alpha = 0/0 below */
    OUTPUT_AVG_ITER(iter, dsolver->output_freq, "PCG_X"); 
    fflush(stdout);
    return OK;
  }
 
  do {
    iter++;
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as); /* -G2*s */
    for (i=0; i<n; i++) as[i] = diag[i] * s[i] - as[i];  /* as = A*s */
    alpha = rinvMr / DOT(s, as, n);  /* alpha = (r, M^{-1}r) / (s,As) */
    diff2 = 0.0;
    for (i=0; i<n; i++) {
      x[i] += (tmp = alpha*s[i]);  /* x = x + alpha*s */
      r[i] -= alpha*as[i];         /* r = r - alpha*A*s */
      diff2+= tmp*tmp;
    }
    diff2 /= n;
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |r|^2/n = %g, |x(n) - x(n-1)|^2/n=%g Debye^2\n", 
	   iter, DOT(r,r,n)/n, diff2/(DEBYE*DEBYE));
#endif 
    if (diff2 < errTol2) break; /* r will be used */
    old_rinvMr = rinvMr;
    preconditioner_solve(dsolver->precond, r, invMr);   /* invMr = M^{-1}r */
    rinvMr = DOT(r, invMr, n);                          
    beta = rinvMr / old_rinvMr;               /* beta = (r,M^{-1}r)/oldrMr */
    for (i=0; i<n; i++) s[i] = invMr[i] + beta*s[i]; /* s=M^{-1}r + beta*s */
  } while (iter < maxiter);

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "PCG_X"); 

  if (diff2 >= errTol2) {
    fprintf(stderr, "*** PCG does not converge after %d iterations !\n", iter);
    return FAILURE; 
  }
  fflush(stdout);

  return OK;
}


/*
 * Cutoff-preconditioned conjugate gradient with a residual-based stopping
 * test: converged when |residue|^2/n < errTol2, as advertised in the
 * Private_Solver table and as done by FJCG_R.
 *
 * (The quantity tested used to be |alpha*A*s|^2/n, i.e. the CHANGE of the
 * residual in one step, which is not the residual norm.)
 *
 * x holds the initial guess on entry and the solution on return.
 * Workspace layout (4 vectors of length n): s, r, as, invMr.
 *
 * If the initial residual is exactly zero the routine returns at once;
 * running the CG step in that case would evaluate alpha = 0/0 and fill x
 * with NaN.
 *
 * Returns OK on convergence, FAILURE after maxiter iterations without it.
 */
MD_Errcode FPCG_R(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int n = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  MD_Double *s = dsolver->workspace;
  MD_Double *r = s + n;
  MD_Double *as = r + n;
  MD_Double *invMr = as + n;
  MD_Double alpha, beta, rinvMr, old_rinvMr, res2;
  MD_Int i, iter;

  preconditioner_setup(dsolver->precond);

  iter = 1;
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, r); /* b-G2*x */
  for (i=0; i<n; i++) r[i] -= diag[i]*x[i];   /* r = b - G2*x - D*x */
  res2 = DOT(r, r, n) / n;                    /* |r|^2/n */
#ifdef DEBUG_DSOLVER
  printf("iter=%d |r|^2/n = %g\n", iter, res2);
#endif

  if (0.0 == res2) {
    /* the initial guess is exact; avoid alpha = 0/0 below */
    OUTPUT_AVG_ITER(iter, dsolver->output_freq, "PCG_R"); 
    fflush(stdout);
    return OK;
  }

  preconditioner_solve(dsolver->precond, r, s); /* do not need invMr yet */
  rinvMr = DOT(r, s, n);
 
  do {
    iter++;
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as); /* -G2*s */
    for (i=0; i<n; i++) as[i] = diag[i] * s[i] - as[i];  /* A*s */
    alpha = rinvMr / DOT(s, as, n);
    res2 = 0.0;
    for (i=0; i<n; i++) {
      x[i] += alpha*s[i];             /* x = x + alpha*s */
      r[i] -= alpha*as[i];            /* r = r - alpha*A*s */
      res2 += r[i]*r[i];              /* norm of the UPDATED residual */
    }
    res2 /= n;
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |r|^2/n = %g\n", iter, res2);
#endif 
    if (res2 < errTol2) break;
    old_rinvMr = rinvMr;
    preconditioner_solve(dsolver->precond, r, invMr);  /* invMr = M^{-1}r */
    rinvMr = DOT(r, invMr, n);
    beta = rinvMr / old_rinvMr;
    for (i=0; i<n; i++) s[i] = invMr[i] + beta * s[i]; /* s=M^{-1}r+beta*s */
  } while (iter < maxiter);

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "PCG_R"); 

  if (res2 >= errTol2) {
    fprintf(stderr, "*** PCG_R does not converge after %d iterations !\n",
	    iter);
    return FAILURE; 
  }
  fflush(stdout);

  return OK;
}



/*
 * solve (A+D)x = b by Conjugate Gradient iteration.
 *    using Jacobi precondition (M=D), i.e. inv(M) = inv(diag).
 * initial guess is provided through input.
 * A: n x n,  D: n x 1 vector. n = 3 * natoms.
 * reference: Scientific Computing, 2nd Ed. M. Heath
 *
 * Stopping test: |residue|^2/n <= errTol2.  The residual is tested before
 * the first CG step, so an already converged guess costs one
 * matrix-vector product only.
 * Workspace layout (3 vectors): s, r, as.
 *
 * Returns OK on convergence, FAILURE after maxiter iterations without it.
 * The failure test uses the same strict comparison as the loop, so that
 * err2 == errTol2 (e.g. an exact solution with zero tolerance) is not
 * reported as a failure after the loop has accepted it.
 */
MD_Errcode FJCG_R(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  const MD_Double inv_matrixsize = 1.0 / (MD_Double) matrixsize;
  MD_Double *s = dsolver->workspace;
  MD_Double *r = s + matrixsize;
  MD_Double *as = r + matrixsize;
  MD_Double alpha, beta, rnorm2, sas, rMr, err2, old_rMr;
  MD_Double difi, diff;
  MD_Int i, iter;

  iter = 0;
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, as);
  iter++;
  rnorm2 = 0.0;
  for (i = 0; i < matrixsize; i++) {
    r[i] = as[i] - diag[i]*x[i];   /* residual of the initial guess */
    s[i] = r[i] / diag[i];         /* first direction: M^{-1} r */
    rnorm2 += r[i] * r[i];
  }
  err2 = rnorm2 * inv_matrixsize;

#ifdef DEBUG_DSOLVER
  printf("iter=%d |r|^2/n = %g\n", iter, rnorm2 * inv_matrixsize);
#endif 
 
  while (err2 > errTol2 && iter < maxiter) {
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as);
    iter++;
    sas = 0.0;
    rMr = 0.0;
    for (i = 0; i < matrixsize; i++) {  /* compute A * s */
      as[i] = diag[i] * s[i] - as[i];
      sas += s[i] * as[i];
      rMr += r[i] * r[i] / diag[i];
    }
    alpha = rMr / sas;  
    old_rMr = rMr;   
    rMr = 0.0;
    rnorm2 = 0.0;
    diff = 0.0;
    for (i = 0; i < matrixsize; i++) {
      difi = alpha * s[i];
      diff += difi*difi; 
      x[i] += difi;
      r[i] -= alpha * as[i];
      rnorm2 += r[i] * r[i];
      rMr += r[i] * r[i] / diag[i];
    }
    err2 = rnorm2 * inv_matrixsize;
    diff *= inv_matrixsize;
    beta = rMr / old_rMr;
    for (i = 0; i < matrixsize; i++) {
      s[i] = r[i]/diag[i] + beta * s[i];
    }
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |r|^2/n=%g, |x(n)-x(n-1)|^2/n=%g Debye^2\n", iter, err2, 
	   diff/(DEBYE*DEBYE));
#endif 
  }

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "JCG");

  if (err2 > errTol2) {
    fprintf(stderr, "*** JCG does not converge after %d iterations !\n", iter);
    return FAILURE; 
  }
  fflush(stdout);

  return OK;
}



/*
 * solve (A+D)x = b by Conjugate Gradient iteration.
 *    using Jacobi precondition (M=D), i.e. inv(M) = inv(diag).
 * initial guess is provided through input.
 * A: n x n,  D: n x 1 vector. n = 3 * natoms.
 * reference: Scientific Computing, 2nd Ed. M. Heath
 *
 * Stopping test: |x(n) - x(n-1)|^2/n <= errTol2, or a residual that has
 * become negligible (|r|^2/n <= 1e-30).  The negligible-residual test is
 * also applied to the initial guess: with r == 0 the first CG step would
 * evaluate alpha = 0/0 and fill x with NaN.
 * Workspace layout (3 vectors): s, r, as.
 *
 * Returns OK on convergence, FAILURE after maxiter iterations without it.
 */
MD_Errcode FJCG_X(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  const MD_Double inv_matrixsize = 1.0 / (MD_Double) matrixsize;
  MD_Double *s = dsolver->workspace;
  MD_Double *r = s + matrixsize;
  MD_Double *as = r + matrixsize;
  MD_Double alpha, beta, rnorm2, sas, rMr, err2, old_rMr;
  MD_Double difi, diff;
  MD_Int i, iter;

  iter = 0;
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, as);
  iter++;
  rnorm2 = 0.0;
  for (i = 0; i < matrixsize; i++) {
    r[i] = as[i] - diag[i]*x[i];   /* residual of the initial guess */
    s[i] = r[i] / diag[i];         /* first direction: M^{-1} r */
    rnorm2 += r[i] * r[i];
  }
  err2 = rnorm2 * inv_matrixsize;

#ifdef DEBUG_DSOLVER
  printf("iter=%d |r|^2/n = %g\n", iter, err2);
#endif 

  if (err2 <= 1e-30) {
    /* the initial guess already solves the system; avoid 0/0 below */
    OUTPUT_AVG_ITER(iter, dsolver->output_freq, "JCG"); 
    fflush(stdout);
    return OK;
  }
 
  do {
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as);
    iter++;
    sas = 0.0;
    rMr = 0.0;
    for (i = 0; i < matrixsize; i++) {  /* compute A * s */
      as[i] = diag[i] * s[i] - as[i];
      sas += s[i] * as[i];
      rMr += r[i] * r[i] / diag[i];
    }
    alpha = rMr / sas;  
    old_rMr = rMr;   
    rMr = 0.0;
    rnorm2 = 0.0;
    diff = 0.0;
    for (i = 0; i < matrixsize; i++) {
      difi = alpha * s[i];
      diff += difi*difi;
      x[i] += difi;
      r[i] -= alpha * as[i];
      rnorm2 += r[i] * r[i];
      rMr += r[i] * r[i] / diag[i];
    }
    diff *= inv_matrixsize;
    err2 = rnorm2 * inv_matrixsize;
    beta = rMr / old_rMr;
    for (i = 0; i < matrixsize; i++) {
      s[i] = r[i]/diag[i] + beta * s[i];
    }
#ifdef DEBUG_DSOLVER
    printf("iter=%d, alpha= %f  beta= %f\n", iter, alpha, beta);
    printf("iter=%d |r|^2/n = %g, |x(n) - x(n-1)|^2/n=%g Debye^2\n", 
	   iter, err2, diff/(DEBYE*DEBYE));
#endif 
  } while (diff > errTol2 && err2 > 1e-30 && iter < maxiter);


  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "JCG"); 

  if (diff >= errTol2 && err2 > 1e-30 && rnorm2 > 0.1 * errTol2) {
    fprintf(stderr, "*** JCG does not converge after %d iterations !\n", iter);
    return FAILURE; 
  }
  fflush(stdout);

  return OK;
}



/*
 * solve (G2 + diag)x = b by Conjugate gradient iteration.
 * initial guess is provided through input.
 * G2: n x n,  diag: n x 1 vector. n = 3N.
 * implementation based on M. Heath <<Scientific Computing>>, p. 473
 *
 * Stopping test: |x(n) - x(n-1)|^2/n <= errTol2, or a residual that has
 * become negligible (|r|^2/n <= 1e-30).  The negligible-residual test is
 * also applied to the initial guess: with r == 0 the first CG step would
 * evaluate alpha = 0/0 and fill x with NaN.
 * Workspace layout (3 vectors): s, r, as.
 *
 * Returns OK on convergence, FAILURE after maxiter iterations without it.
 */
MD_Errcode FCG(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  const MD_Double inv_matrixsize = 1.0 / (MD_Double) matrixsize;
  MD_Double *s = dsolver->workspace;
  MD_Double *r = s + matrixsize;
  MD_Double *as = r + matrixsize;
  MD_Double alpha, beta, rnorm2, sas, err2;
  MD_Double difi, diff;
  MD_Int i, iter;

  iter = 1;
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, as);

  rnorm2 = 0.0;
  for (i = 0; i < matrixsize; i++) {
    s[i] = r[i] = as[i] - diag[i]*x[i];  /* residual = first direction */
    rnorm2 += r[i] * r[i];
  }
  err2 = rnorm2 * inv_matrixsize;

#ifdef DEBUG_DSOLVER
  printf("iter=%d |r|^2/n = %g\n", iter, err2);
#endif 

  if (err2 <= 1e-30) {
    /* the initial guess already solves the system; avoid 0/0 below */
    OUTPUT_AVG_ITER(iter, dsolver->output_freq, "CG"); 
    return OK;
  }

  do {
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as);
    iter++;
    sas = 0.0;
    for (i = 0; i < matrixsize; i++) {  
      as[i] = diag[i] * s[i] - as[i];    /* as = A*s */
      sas += s[i] * as[i];
    }
    alpha = rnorm2 / sas; 
    beta = rnorm2;   /* save previous residue norm */
    rnorm2 = 0.0;
    diff = 0.0;
    for (i = 0; i < matrixsize; i++) {
      difi = alpha * s[i];
      diff += difi*difi;
      x[i] += difi;
      r[i] -= alpha * as[i];
      rnorm2 += r[i] * r[i];
    }
    diff *= inv_matrixsize;
    beta = rnorm2 / beta;
    for (i = 0; i < matrixsize; i++) {
      s[i] = r[i] + beta * s[i];
    }
    err2 = rnorm2 * inv_matrixsize;
#ifdef DEBUG_DSOLVER
    printf("iter=%d |r|^2/n = %g\n", iter, err2);
#endif 
  /* the err2 > 1e-30 is against too small residue */
  }  while (diff > errTol2 && err2 > 1e-30 && iter < maxiter);


  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "CG"); 

  if (diff >= errTol2 && err2 > 1e-30) {
    fprintf(stderr, "*** CG does not converge after %d iterations !\n", iter);
    return FAILURE; 
  }

  return OK; /* success */
}


/*
 * Picard (fixed-point) iteration for (diag+ND)x = b.
 * diag is stored as a 1D array; ND is applied through
 * dsolver->compute_pseudores and need not have a zero diagonal.
 * x carries the initial guess in and the final iterate out.
 *
 * Two buffers (x and the first workspace vector) alternate as "current"
 * and "next" iterate; the second workspace vector receives the output of
 * compute_pseudores.  Iteration stops once |xnew-xold|^2/n <= errTol2 or
 * after maxiter sweeps.
 *
 * Returns OK on convergence, MD_FAIL otherwise.
 */
MD_Errcode FPicard(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  const MD_Double inv_matrixsize = 1.0 / (MD_Double) matrixsize;
  MD_Double * const solution = x;
  MD_Double *current = x;
  MD_Double *next = dsolver->workspace;
  MD_Double * const rhs = dsolver->workspace + matrixsize;
  MD_Double delta, delta2;
  MD_Int k;
  MD_Int iter = 0;

  do {
    MD_Double *swap;

    iter++;
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, current, 1, rhs);

    delta2 = 0.0;
    for (k = 0; k < matrixsize; k++) {
      next[k] = rhs[k] / diag[k];
      delta = next[k] - current[k];
      delta2 += delta * delta;
    }
    delta2 *= inv_matrixsize;
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |xnew-xold|^2/n = %g Debye^2\n", iter, 
	   delta2/(DEBYE*DEBYE));
#endif

    /* the freshly computed iterate becomes the current one */
    swap = current;
    current = next;
    next = swap;
  } while (delta2 > errTol2 && iter < maxiter);

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "Picard"); 

  /* after an odd number of sweeps the result sits in the workspace */
  if (solution != current) {
    memcpy(solution, current, matrixsize * sizeof(MD_Double));
  }

  if (delta2 > errTol2) {
    fprintf(stderr, "*** Picard does not converge after %d iterations\n", 
	    iter);
    return MD_FAIL; /* fail */
  }

  return OK; /* success */
}


/*
 * solve (diag+ND)x = b, using Picard iteration.
 * diag is represented as as an 1D array. ND is a 2D array, not neccesarily
 * having 0 diagonal elements.
 * initial guess is provided in x. 
 */
MD_Errcode FdamPicard(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  /*
   * Damped fixed-point sweep.  Every pass asks dsolver->compute_pseudores
   * for g2x, forms
   *     next = damp_fac * (g2x / diag) + (1 - damp_fac) * current
   * and stops once the mean squared change per component is no longer
   * above dsolver->errTol2, or dsolver->maxiter passes have been made.
   *
   * x holds the initial guess on entry and the last iterate on return.
   * The first 2*matrixsize entries of dsolver->workspace are used as
   * scratch (next iterate followed by g2x).
   *
   * Returns OK on convergence.  Without ASPC a non-converged run prints
   * a message on stderr and returns MD_FAIL; with ASPC the result is
   * always OK.
   */

  /* mixing weights: chosen on the very first call, reused afterwards */
  static MD_Double damp_fac;
  static MD_Double one_sub_damp_fac;
  static MD_Int firstime = 1;

  const MD_Int n = dsolver->matrixsize;
  const MD_Int iter_cap = dsolver->maxiter;
  const MD_Double tol2 = dsolver->errTol2;
  const MD_Double inv_n = 1.0 / (MD_Double) n;
  const MD_Double *diag = dsolver->diag;
  MD_Double * const result = x;          /* caller's buffer receives the answer */
  MD_Double *cur = x;                    /* iterate being read */
  MD_Double *next = dsolver->workspace;  /* iterate being written */
  MD_Double *g2x = next + n;
  MD_Double *swap;
  MD_Double delta, mean_sq;
  MD_Int k, iter;

  if (firstime) {
#ifndef ASPC
    /* presumably estimated spectral bounds of the iteration -- TODO confirm */
    const MD_Double lo = -0.34;
    const MD_Double hi =  0.26;
    damp_fac = 2.0 / (2.0 - lo - hi);
    one_sub_damp_fac = 1.0 - damp_fac;
#else
    /* the optimal factors are for standard Ewald, not for PME. */
                          /* 0    1    2     3     4     5     6     7 */
    MD_Double dfactor[] = {1.0, 1.0, 0.89, 0.85, 0.82, 0.81, 0.80, 0.79,
                          /* 8     9     10 */
                          /* 0.79, 0.79, 0.78};  not stable */
                             0.75, 0.75, 0.75};   /* stable */
    MD_Int kvalue = predictor_get_degree(dsolver->predictor);
    printf("**** use ASPC method ****\n");
    if (sizeof(MD_Double) * (size_t)kvalue < sizeof(dfactor)) {
      /* degree is covered by the table */
      damp_fac = dfactor[kvalue];
    } else  {
      damp_fac = (MD_Double)(kvalue + 1) /  (MD_Double)(kvalue*2 + 1);
      printf("optimal damping factor value is not decided, use conservative value\n");
    }
    /*
     * NOTE(review): this assignment always wins, so the tabulated value
     * chosen above is never used.  Kept as found; confirm which one is
     * intended.
     */
    damp_fac = (MD_Double)(kvalue + 1) /  (MD_Double)(kvalue*2 + 1);
    one_sub_damp_fac = 1.0 - damp_fac;
#endif
    printf("  -- Picard damping factor is: %f\n", damp_fac);
    firstime = 0;
  }

#if 0   /* a hack for study of ASPC */
  {static MD_Int first = 1;
  if (first) {
    first = 0;
    damp_fac = 0.0;
    one_sub_damp_fac = 1.0 - damp_fac;
    printf(" damping factor is fixed to = %f\n", damp_fac);
  }
  }
#endif

#if 0 /* a hack for high-order ASPC */
#ifdef ASPC
  { static MD_Int counter = 0;
    counter ++;
    if (counter <= predictor_get_degree(dsolver->predictor)) {
      dsolver->maxiter = 10;
      return FPicard(dsolver, x);
    } else {
      dsolver->maxiter = 1;
    }
  }
#endif
#endif

  iter = 0;
  for (;;) {
    ++iter;
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, cur, 1, g2x);

    mean_sq = 0.0;
    for (k = 0; k < n; ++k) {
      next[k] = g2x[k] / diag[k] * damp_fac + one_sub_damp_fac * cur[k];
      delta = next[k] - cur[k];
      mean_sq += delta * delta;
    }
    mean_sq *= inv_n;
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |xnew-xold|^2/n = %g Debye^2\n",iter,mean_sq/(DEBYE*DEBYE));
#endif

    /* exchange the roles of the two buffers instead of copying */
    swap = cur;
    cur = next;
    next = swap;

    /* written as a negation so a NaN measure also ends the loop */
    if (!(mean_sq > tol2) || iter >= iter_cap) {
      break;
    }
  }

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "damp Picard"); 

  /* after an odd number of passes the newest iterate sits in the workspace */
  if (cur != result) {
    memcpy(result, cur, n * sizeof(MD_Double));
  }

#ifndef ASPC
  if (mean_sq > tol2) {
    fprintf(stderr, "*** damPicard does not converge after %d iterations\n", 
	    iter);
    return MD_FAIL; /* fail */
  }
#endif

  return OK; /* success */
}


#if 0
/* JOR */
/*
 * Undamped diagonal-scaled fixed-point sweep (currently compiled out).
 *
 * Each pass obtains g2x from dsolver->compute_pseudores, sets
 * xn[i] = g2x[i] / diag[i], and measures the mean squared change per
 * component.  Iteration stops when that measure is no longer above
 * dsolver->errTol2 or after dsolver->maxiter passes.
 *
 * x holds the initial guess on entry and the last iterate on return.
 * The first 2*matrixsize entries of dsolver->workspace are used as scratch.
 *
 * Returns OK on convergence, MD_FAIL (with a message on stderr) otherwise.
 *
 * NOTE(review): no relaxation factor is applied here, so despite the name
 * the update is the same as the plain diagonal-scaled sweep.
 */
MD_Errcode FJOR(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  MD_Double * const solution = x;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter;
  const  MD_Double inv_matrixsize = 1.0 / (MD_Double) matrixsize;
  MD_Double *xn = dsolver->workspace;
  MD_Double *g2x = xn + matrixsize;
  MD_Double *tmp;
  MD_Double diff, diff2;  /* diff2 is always assigned inside the loop */
  MD_Int i, iter;

  iter = 0;
  do {
    iter++;
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 1, g2x);
    diff2 = 0.0;
    for (i = 0; i < matrixsize; i++) {
      xn[i] = g2x[i] / diag[i];
      diff = xn[i] - x[i];
      diff2 += diff * diff;
    }
    diff2 *= inv_matrixsize;
#ifdef DEBUG_DSOLVER
    printf("iter=%d, |xnew-xold|^2/n = %g Debye^2\n", iter, 
	   diff2/(DEBYE*DEBYE));
#endif
    tmp = x;
    x = xn;
    xn = tmp;  /* pointer rotation */
  } while (diff2 > errTol2 && iter < maxiter) ;

  OUTPUT_AVG_ITER(iter, dsolver->output_freq, "JOR"); 

  /* after an odd number of passes the newest iterate sits in the workspace */
  if (solution != x) {
    memcpy(solution, x, matrixsize * sizeof(MD_Double));
  }

  if (diff2 > errTol2) {
    /* name this solver in the message, matching the label used above */
    fprintf(stderr, "*** JOR does not converge after %d iterations\n", 
	    iter);
    return MD_FAIL; /* fail */
  }

  return OK; /* success */
}

#endif



/*
 * solve (A+D)x = b by Conjugate Gradient iteration with a quickstart
 *    using Jacobi preconditioning (M=D), i.e. inv(M) = inv(diag).
 * initial guess is provided through input.
 * A: n x n,  D: n x 1 vector. n = 3 * natoms.
 * reference: Scientific Computing, Feb-02-2004 note
 */
# if 1
MD_Errcode FQCG_X(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  /*
   * Placeholder while the quick-start CG solver is disabled: neither
   * argument is touched, a notice goes to stderr, and the call always
   * reports failure.
   */
  (void) dsolver;
  (void) x;

  fputs("QCG not working yet\n", stderr);
  return MD_FAIL;
}
#else 
/*
 * Quick-start preconditioned CG (disabled branch; the active build uses a
 * stub that reports "QCG not working yet").
 *
 * The first search direction is the initial guess x itself; later
 * directions are built from r/diag plus multiples of the previous
 * direction and of that first direction s0.
 *
 * x holds the initial guess on entry and is updated in place.
 * dsolver->workspace is carved into five consecutive vectors of
 * matrixsize entries each (s, r, as, s0, invMr0).
 *
 * The loop stops when the mean squared update per component is no longer
 * above dsolver->errTol2, or when the number of compute_pseudores calls
 * reaches 100 * dsolver->maxiter.
 *
 * Returns OK, or FAILURE with a message on stderr when the final test
 * below does not pass.
 * NOTE(review): other solvers in this file return MD_FAIL on failure;
 * confirm FAILURE is the intended code here.
 */
MD_Errcode FQCG_X(struct Dsolver_Tag *dsolver, MD_Double *x)
{
  const MD_Double *diag = dsolver->diag;
  const MD_Double *b = dsolver->pb;
  const MD_Double errTol2 = dsolver->errTol2;
  const MD_Int matrixsize = dsolver->matrixsize;
  const MD_Int maxiter = dsolver->maxiter * 100;
  const MD_Double inv_matrixsize = 1.0 / (MD_Double) matrixsize;
  MD_Double *s = dsolver->workspace;  /* search direction. */
  MD_Double *r = s + matrixsize;      /* residue */
  MD_Double *as = r + matrixsize;     /* A * s */
  MD_Double *s0 = as + matrixsize;    /* store s0 */
  MD_Double *invMr0 = s0 + matrixsize;  /* store inv(M)*r0 */
  MD_Double alpha, beta, beta0; 
  MD_Double old_rMr;
  MD_Double sr, rMAs, rMr0;
  register MD_Double rnorm2, sas, rMr;
  register MD_Double difi, diff;
  MD_Double s0As0;
  MD_Int matvecmul = 0;  /* counts compute_pseudores calls */
  MD_Int i;

  /* note that the code breaks down if x = 0 */
  memcpy(s,  x, matrixsize * sizeof(MD_Double)); /* search direction = x0 */
  memcpy(s0, s, matrixsize * sizeof(MD_Double)); 
  dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, x, 0, as);
  matvecmul++;
  /* rMr is zeroed here but reassigned in the main loop before any read */
  rnorm2 = rMr = s0As0 = sr = 0.0;
  for (i = 0; i < matrixsize; i++) { /* compute A*s, s*A*s, r, inv(M)*r */
    as[i]     = diag[i] * x[i] - as[i];
    s0As0    += s[i] * as[i];
    r[i]      = b[i] - as[i];
    rnorm2   += r[i] * r[i];
    invMr0[i] = r[i] / diag[i];
    sr       += s[i] * r[i];
  }
#ifdef DEBUG_DSOLVER
  printf("sAs = %f\n", s0As0);
  /*
   * NOTE(review): this zero check exists only in debug builds; otherwise
   * the divisions by s0As0 below are unguarded.
   */
  if (0.0 == s0As0) {
    fprintf(stderr, "zero initial guess does not work with QCG\n");
    return MD_FAIL;
  }
  printf("init guess: |r|^2/n = %g\n", rnorm2 * inv_matrixsize);
#endif 
  /* quick-start step along s = x0 */
  alpha = sr / s0As0;
  rMAs = 0.0;
  diff = 0.0;
  for (i = 0; i < matrixsize; i++) { /* update x, residue */
    difi = alpha * s[i];
    x[i]   += difi;
    diff   += difi*difi;
    r[i]   -= alpha * as[i];
    /*
     * NOTE(review): rnorm2 still holds the sum from the previous loop, so
     * the debug print below reports old and new residual sums combined.
     */
    rnorm2 += r[i] * r[i];
    rMAs   += r[i] * as[i] / diag[i];
  }
#ifdef DEBUG_DSOLVER
  printf("matvecmul=%d |r|^2/n=%g, |xnew-xold|^2/n=%g DEBYE^2\n", 
	 matvecmul, rnorm2 * inv_matrixsize, 
	 diff * inv_matrixsize / (DEBYE*DEBYE));
  /*
  printf("dot(r, d) = 0 ? %g\n", DOT(r, s, matrixsize));
  */
#endif 
  /* second direction: r/diag plus a multiple of the first direction */
  beta0 = - rMAs / s0As0;
  for (i = 0; i < matrixsize; i++) {
    s[i] = r[i] / diag[i] + beta0 * s[i];
  }
  /* NOTE(review): printed unconditionally, unlike the other diagnostics */
  printf("alpha=%f, beta=%g\n", alpha, beta0);

  do {
    dsolver->compute_pseudores(dsolver->mat_vec_mul_mod, s, 0, as);
    matvecmul++;
    sas = rMr = 0.0;
    for (i = 0; i < matrixsize; i++) {  /* compute A * s */
      as[i]  = diag[i] * s[i] - as[i];
      sas   += s[i] * as[i];
      rMr   += r[i] * r[i] / diag[i]; /* old r */
    }
    alpha = rMr / sas;  
    old_rMr = rMr;   
    rMr = rMr0 = rnorm2 = diff = 0.0;
    diff = 0.0;  /* redundant: already cleared on the previous line */
    for (i = 0; i < matrixsize; i++) { /* update x and residue */
      difi    = alpha * s[i];
      diff   += difi*difi;
      x[i]   += difi;
      r[i]   -= alpha * as[i];
      rnorm2 += r[i] * r[i];
      rMr    += r[i] * r[i] / diag[i];
      rMr0   += r[i] * invMr0[i];
    }
    diff *= inv_matrixsize;  /* mean squared update, used as the stop measure */
    beta0 = - rMr0 / s0As0;
    beta  = rMr / old_rMr;
    /* new direction mixes r/diag, the previous direction and s0 */
    for (i = 0; i < matrixsize; i++) {
      s[i] = r[i] / diag[i] + beta * s[i] + beta0 * s0[i];
    }
#ifdef DEBUG_DSOLVER
    
    printf("matvecmul=%d, alpha= %f  beta= %g  beta0= %g\n", 
	   matvecmul, alpha, beta, beta0);
    
    printf("matvecmul=%d |r|^2/n = %g, |xnew-xold|^2/n=%g Debye^2\n", 
	   matvecmul, rnorm2 * inv_matrixsize, diff/(DEBYE*DEBYE));
    /*
    printf("dot(r, d) = 0 ? %g\n", DOT(r, s, matrixsize));
    */
#endif 
  } while (diff > errTol2 && matvecmul < maxiter);


  OUTPUT_AVG_ITER(matvecmul, dsolver->output_freq, "QCG"); 

  /*
  if (err2 >= errTol2) {
  */
  /*
   * NOTE(review): diff was scaled by inv_matrixsize above, while rnorm2 is
   * the raw sum of squares; confirm the two are meant to be compared
   * against errTol2 on these different scales.
   */
  if (diff >= errTol2 && rnorm2 > 0.1 * errTol2) {
    fprintf(stderr, "*** QCG_X does not converge after %d iterations !\n", 
	    matvecmul);
    return FAILURE; 
  }
  fflush(stdout);

  return OK;
}
#endif
