/*
 * Copyright (C) 2004-2006 by Wei Wang.  All rights reserved.
 */


#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "predictor.h"
#include "helper.h"
#include "utilities.h"

static const char* Prediction_name[] = {
  "Polynomial",                   /* Polynomial */
  "Difference Polynomial",        /* dPolynomial */
  "Time-Reversible",              /* Time_Reversible */
  "Time-Reversible, High-Order",  /* TR_HO */
  "Initial Guess = 0",            /* Zero */
  "Least Square",                 /* LeastSquare */
  "Constraint Least Square",      /* ConstraintLeastSquare */
  "Least Square (on vector difference) ",   /* dLeastSquare */
  "Constraint Least Square (on vector difference) ",/*dConstraintLeastSquare*/
  "Time Reversible Least Square ", /* TimeReversibleLeastSquare */
};

static MD_Double *buffer = NULL;  /* Householder buffer */


static void calc_poly_coeff(   MD_Double *coeff, const MD_Int degree);
static void calc_timerev_coeff(MD_Double *coeff, const MD_Int degree);
static void calc_ho_timerev_coeff(MD_Double *coeff, const MD_Int kvalue);
static MD_Double calc_leastsquare_coeff(MD_Double *coeff,
                                        MD_Double **dip,
                                        const MD_Int ndip,
                                        const MD_Int veclen,
                                        const MD_Int constraint);
static MD_Double calc_ls_coeff_householder(MD_Double *coeff,
                                        MD_Double **dip,
                                        const MD_Int ndip,
                                        const MD_Int veclen);

/* time-reversible-least-square */
static MD_Double calc_trls_coeff(MD_Double *coeff,
				 MD_Double **vec,
				 const MD_Int nvec,
				 const MD_Int veclen,
				 const MD_Int constraint);


/* private members */
static MD_Double *next_to_last;    
static MD_Int g_nconstraints = 10;  /* for time-reversible least-square */


MD_Int predictor_init(struct Predictor_Tag *pred, Prediction_Type pred_type, 
 		      MD_Int pred_degree, MD_Int veclen, MD_Int restart)
{
  MD_Int nvec = 0;

  assert(pred_type < NPtypes);
  assert(pred_degree >= 0);

  pred->pred_type = pred_type;
  pred->degree = pred_degree;

  printf("Predictor module:\n");
  printf("  prediction   type: %s\n", Prediction_name[pred->pred_type]);

  switch (pred->pred_type) {
  case dPolynomial:
    next_to_last = my_calloc((size_t)veclen, sizeof(MD_Double), "ntlv");
  case Polynomial:
    printf("  prediction degree: %d\n", pred->degree);
    nvec = pred->degree + 1; 
    pred->coeff = my_calloc((size_t)nvec, sizeof(MD_Double), "coeff");
    calc_poly_coeff(pred->coeff, pred->degree);
    break;
  case Time_Reversible: 
    printf("             kvalue: %d\n", pred->degree);
    nvec = pred->degree + 1; 
    pred->coeff = my_calloc((size_t)nvec, sizeof(MD_Double), "coeff");
    calc_timerev_coeff(pred->coeff, pred->degree);
    break;
  case TR_HO:
    printf("             kvalue: %d (only 5,6,7 are allowed)\n",pred->degree); 
    assert(5 == pred->degree || 6 == pred->degree || 7 == pred->degree);
    nvec = pred->degree;
    pred->coeff = my_calloc((size_t)nvec, sizeof(MD_Double), "coeff");
    calc_ho_timerev_coeff(pred->coeff, nvec);
    break;
  case LeastSquare:
  case ConstraintLeastSquare:
    printf("  # of vectors to compute the init. guess: %d\n", pred->degree);
    nvec = pred->degree + 1;
    if (LeastSquare==pred->pred_type) 
      buffer=my_calloc((size_t)(nvec+1)*veclen, sizeof(MD_Double), "Householder buffer");
    pred->coeff = my_calloc( (size_t)
        (LeastSquare == pred->pred_type ? pred->degree : pred->degree+1), 
        sizeof(MD_Double), "coeff");
    /* dynamically compute the coefficient */
    break;
  case dLeastSquare:
  case dConstraintLeastSquare:
    next_to_last = my_calloc((size_t)veclen, sizeof(MD_Double), "ntlv");
    printf("  # of vectors to compute the init. guess: %d\n", pred->degree);
    nvec = pred->degree + 1;
    pred->coeff = my_calloc( (size_t)
        (dLeastSquare == pred->pred_type ? pred->degree : pred->degree+1), 
        sizeof(MD_Double), "coeff");
    /* dynamically compute the coefficient */
    break;
  case Zero:
    printf(" initial guess = 0 \n");
    pred->degree = 0;
    nvec = 0;
    break;
  case TimeReversibleLeastSquare:
    printf(" # of vectors to compute the init. guess: %d\n", pred->degree);
    printf(" # of time-reversible constrainits: %d\n", g_nconstraints);
    nvec = pred->degree;
    pred->coeff = my_calloc( (size_t) nvec, sizeof(MD_Double), "coeff");
    if (g_nconstraints >= nvec) {
      printf("too many constraints, reduce it from %d to maximum possible:"
             " %d\n", g_nconstraints, nvec-1);
      g_nconstraints = nvec - 1;
    }
    break;
  default:
    fprintf(stderr, "undefined predictor type\n");
    return MD_FAIL;
  }

  if (Zero != pred->pred_type) {
    pred->pvec = my_calloc((size_t)nvec, sizeof(MD_Double *), "oldvec");
  }

  /* must be initialized */
  pred->old_vectors=my_calloc((size_t)1,sizeof(struct VecBuffer_Tag),"vpool");
  if (vec_buffer_init(pred->old_vectors, nvec, veclen,
           (restart && Zero != pred->pred_type) ? "dipoles.dat":NULL)) {
    fprintf(stderr, "failed to init vec buffer \n");
    return MD_FAIL;
  }

  return OK;
}


MD_Int predictor_destroy(struct Predictor_Tag* pred)
{
  if (NULL != buffer) free(buffer); 
  if (NULL != pred->coeff) free(pred->coeff);
  if (NULL != pred->pvec)  free(pred->pvec);
  if (NULL != pred->old_vectors) {
    if (vec_buffer_destroy(pred->old_vectors)) {
      fprintf(stderr, "failed to destroy old_vectors\n");
      return MD_FAIL;
    }
    free(pred->old_vectors);
  }

  if (NULL != next_to_last) free(next_to_last);

  memset(pred, 0, sizeof(struct Predictor_Tag));
  return OK;
}


MD_Int predictor_update(struct Predictor_Tag *pred, const MD_Double *newvec)
{
  ASSERT(NULL != newvec);

  if (dPolynomial == pred->pred_type ||
      dLeastSquare == pred->pred_type || 
      dConstraintLeastSquare == pred->pred_type) {
    MD_Int veclen = vec_buffer_get_veclen(pred->old_vectors);
    static MD_Int firstime = 1;
    
    if (firstime) {  
      firstime = 0;
      memcpy(next_to_last, newvec, veclen*sizeof(MD_Double));
    } else {
      MD_Int i;
      for(i = 0; i < veclen; i++)  {
        next_to_last[i] = newvec[i] - next_to_last[i];
      }
      if (vec_buffer_update(pred->old_vectors, next_to_last)) {
        fprintf(stderr, "failed to udpate vector buffer\n");
        return MD_FAIL;
      }
      memcpy(next_to_last, newvec, veclen*sizeof(MD_Double));
    };
  } else  {  /* simply copy the vector */
    return vec_buffer_update(pred->old_vectors, newvec);
  }
 
  return OK;
}


MD_Int predictor_get_degree(struct Predictor_Tag *pred)
{
  return pred->degree;
}


Prediction_Type predictor_get_type(struct Predictor_Tag *pred)
{
  return pred->pred_type;
}


MD_Int predictor_predict(struct Predictor_Tag *pred, MD_Double *predvec)
{
  MD_Double **pvec = pred->pvec;
  const MD_Int veclen = vec_buffer_get_veclen(pred->old_vectors);
  const MD_Int nvec = vec_buffer_get_nvec(pred->old_vectors);
  MD_Int i, j;

  ASSERT(NULL != predvec);

  for (i = 0; i < veclen; i++) predvec[i] = 0.0;
  if (Zero == pred->pred_type) return OK;

  if (FAILURE == vec_buffer_get_vec_array(pred->old_vectors, pvec, nvec)) {
    printf("vector buffer error");
    return MD_FAIL;
  }

  switch(pred->pred_type) {
  case dPolynomial:
    memcpy(predvec, next_to_last, veclen * sizeof(MD_Double));
    /* yes, fall through */
  case Polynomial:
    /* and yes, fall through */
  case Time_Reversible:
    /* and yes, fall through */
  case TR_HO:  /* high order time reversible predictor */
    for (i = 0; i < nvec; i++) {
      const MD_Double *pv = pvec[i];
      const MD_Double c = pred->coeff[i];
      for (j = 0; j < veclen; j++)  predvec[j] += c * pv[j];
    }
    break;
  case dLeastSquare:
  case dConstraintLeastSquare:
    /* use last vector */
    memcpy(predvec, next_to_last, veclen * sizeof(MD_Double));
    calc_leastsquare_coeff(pred->coeff, pvec, nvec, veclen, 
			   dConstraintLeastSquare == pred->pred_type);
    /* plus the prediction to the difference */
    for (i = 0; i < nvec-1; i++) {  /* 1 less */
      MD_Double* pv = pvec[i];
      MD_Double c = pred->coeff[i];
      for (j = 0; j < veclen; j++) predvec[j] += c * pv[j];
    }
    break;
  case LeastSquare:  /* fall through */
  case ConstraintLeastSquare:
    /*
    calc_leastsquare_coeff(pred->coeff, pvec, nvec, veclen, 
			   ConstraintLeastSquare == pred->pred_type);
    */
    assert(LeastSquare == pred->pred_type);
    /*
    calc_ls_coeff_householder(pred->coeff, pvec, nvec, veclen);
    */
    calc_leastsquare_coeff(pred->coeff, pvec, nvec, veclen, 0); 
    
    for (i = 0; i < nvec-1; i++) {  /* 1 less */
      MD_Double* pv = pvec[i];
      MD_Double c = pred->coeff[i];
      for (j = 0; j < veclen; j++) predvec[j] += c * pv[j];
    }
    break;
  case TimeReversibleLeastSquare:
    {
      MD_Int ncons = g_nconstraints;
      if (ncons >= nvec) {
        printf("too many constraints, reduce it from %d to maximum possible:"
             " %d\n", g_nconstraints, nvec-1);
        ncons = nvec - 1;
      }
      calc_trls_coeff(pred->coeff, pvec, nvec, veclen, ncons);
      for (i = 0; i < nvec-1; i++) {  /* 1 less */
        MD_Double* pv = pvec[i];
        MD_Double c = pred->coeff[i];
        for (j = 0; j < veclen; j++) predvec[j] += c * pv[j];
      }
    }
    break;
  case Zero:
    /* already assigned 0 */
    break;
  default:
    fprintf(stderr, "wrong prediction type %d\n", pred->pred_type);
    return MD_FAIL;
  }

  return OK;
} 


/* for degree p prediction
 *   p   coeff
 *   0:   1
 *   1:   2, -1
 *   2:   3, -3, 1
 *   ...
 * n-1:   choose(n,1), -choose(n,2), .... (-1)^{n-1} choose(n,n)
 */
void calc_poly_coeff(MD_Double *coeff, const MD_Int degree)
{
  const MD_Int n = degree+1;
  MD_Int i;

  ASSERT(degree >= 0);

  coeff[0] = (MD_Double) n;  /* coeff[i] = C(n, i+1) */

  for (i = 1; i <= degree; i++) { /* C(n,i+1)/C(n,i) = (n-i)/(i+1) */
    /* round-off error is negligible */
    coeff[i] = -coeff[i-1] * (n-i) / (i+1);
  }

  outputv(coeff, degree+1, "coefficient");
  return;
}


/* prediction:
 * k    coeff
 * 0:   1
 * 1:   2, -1
 * 2:   5/2, -2, 1/2
 * see note ...
 */
void calc_timerev_coeff(MD_Double *coeff, const MD_Int kvalue)
{
  MD_Int i;

  ASSERT(kvalue >= 0);

  coeff[0] = (MD_Double) (4*kvalue+2) / (MD_Double) (kvalue+2);

  for (i = 1; i <= kvalue; i++) {
    /* truncation error is negligible */
    coeff[i] = -coeff[i-1] * ((i+1) * (kvalue-i+1)) / (i * (kvalue+i+2));
  }

  outputv(coeff, kvalue+1, "coefficient");
  return;
}

void calc_ho_timerev_coeff(MD_Double *coeff, const MD_Int kvalue)
{
  if (5 == kvalue) {
    /* d_p - d = c4 \delta^4 + c_6 \delta^6 + c7 \delta^7 + ... */
    /* sum |coeff| = 77/3 */
    coeff[0] = 14.0/3.0;
    coeff[1] = -26.0/3.0;
    coeff[2] = 8.0;
    coeff[3] = -11.0/3.0;
    coeff[4] = 2.0/3.0;
  } else if (6 == kvalue) {
    /* d_p - d = c4 \delta^4 + c6 \delta^6 + c8 \delta^8 + c9 \delta^9 ... */
    /* sum of abs(coeff) = 39 */
    coeff[0] =   36.0/7.0;
    coeff[1] = -153.0/14.0;
    coeff[2] =   86.0/7.0;
    coeff[3] = - 54.0/7.0;
    coeff[4] =   18.0/7.0;
    coeff[5] = -  5.0/14.0;
  } else if (7 == kvalue) {
    /* d_p - d = c_6 \delta^6 + c8 \delta^8 + c9 \delta^9 ... */
    /* sum of abs(coeff) = 111 */
    coeff[0] =   6.75;
    coeff[1] = -19.5;
    coeff[2] =  31.25; 
    coeff[3] = -30.0;
    coeff[4] =  17.25;
    coeff[5] = - 5.5;
    coeff[6] =   0.75;
  } else {
    fprintf(stderr, "cannot  deal with the kvalue yet\n");
    exit(1);
  }

  outputv(coeff, kvalue, "coefficient");

  return;
}


/*
 * compute a linear predictor to minimize the 2-norm of the prediction error.
 * the target vector is vec[0],  the latest vector
 * input: 
 *    vec       : an group of vectors.
 *    veclen    : the length of each vecole 
 *    nvec      : overall # of vectors.
 *    constraint: 1 if requires sum c_i = 1, 0 otherwise.
 * output 
 *         coeff: coefficient array, of size nvec, 
 *  return value: |error|_2/|v[0]|, (relative error)
 * see note 2004-06-29
 */
MD_Double calc_leastsquare_coeff(MD_Double *coeff,
				 MD_Double **vec, 
			         const MD_Int nvec, 
				 const MD_Int veclen,
				 const MD_Int constraint)
{
  /* not elegant */
  enum {MAX_DIPOLES=20};
  static MD_Double A[MAX_DIPOLES][MAX_DIPOLES];
  static MD_Double workspace[MAX_DIPOLES];
  MD_Int matsize;
  MD_Int i,j, m;
  MD_Double c, error;

  matsize = constraint ? nvec:(nvec-1);

  ASSERT(MAX_DIPOLES > matsize + 1);

  /* storage of A:  (k = nvec - 1)
   *   A[1..k, 1..k]  : A[i,j] = dot(vec[i], vec[j])
   *   A[:,0], A[0,:] : A[i,0] = dot(vec[0], vec[i]) = b[i]
   *   A[1..k,k+1] = A[k+1,1..k] = 1
   *   A[k+1,k+1] = 0
   */
  for (i = 0; i < nvec; i++) { /* assign A even if unconstraint */
    A[i][i] = DOT(vec[i], vec[i], veclen);
    A[i][nvec] = 1.0;
    A[nvec][i] = 1.0;
/*
    printf("A[%d, %d] = %g\n", i, i, A[i][i]);  
*/
    for (j = 0; j < i; j++) {
      A[i][j] = A[j][i] = DOT(vec[i], vec[j], veclen);
/*
      printf("A[%d, %d] = %g\n", i, j, A[i][j]);  
*/
    }
  }
  A[nvec][nvec] = 0.0;

  /* solve Ax = b by Gaussian elimination, even though the matrix is positive
   * definite, pivoting is still needed, since A is almost singular */
  for (i = 1; i <= matsize; i++) {
    /* pivoting */
    MD_Double amax = fabs(A[i][i]);
    MD_Int jmax = i;
    for (j = i+1; j <= matsize; j++) {
      if (amax < fabs(A[j][i])) {amax = fabs(A[j][i]); jmax = j;}
    }
    if (jmax > i) {  /* exchange row i with jmax */
      MD_Double tmp = A[i][0];
      A[i][0] = A[jmax][0]; 
      A[jmax][0] = tmp;
      /* in C, matrix is row major */
      memcpy(workspace,     &(A[i][i]),    (matsize-i+1) * sizeof(MD_Double));
      memcpy(&(A[i][i]),    &(A[jmax][i]), (matsize-i+1) * sizeof(MD_Double));
      memcpy(&(A[jmax][i]), workspace,     (matsize-i+1) * sizeof(MD_Double));
    }
    for (j = i+1; j <= matsize; j++) { /* elimination */
      c = A[j][i] / A[i][i];
      A[j][0] -= A[i][0] * c; 
      for (m = i+1; m <= matsize; m++) A[j][m] -= A[i][m] * c;
    }
  }

  for (i = matsize; i > 0; i--) { /* back substitution */
    c = A[i][0];
    for (j = i+1; j <= matsize; j++) c -= A[i][j] * coeff[j-1];
    coeff[i-1] = c / A[i][i];  /* result start with index 0  */
#ifdef DEBUG_LEAST_SQUARE
    printf("coeff[%d]=%f\n", i-1, coeff[i-1]);
#endif
  }

  
#ifdef DEBUG_LEAST_SQUARE
  {MD_Double det=1.0;
    for (i = 1; i <= matsize; i++) det *= A[i][i];
    printf("det(A)=%g\n", det);
  }
#endif

  /* in case of constraint: last one is lambda */
  error = constraint ? (A[0][0] - coeff[matsize-1]) : A[0][0];

#ifdef DEBUG_LEAST_SQUARE
  {
  static MD_Double smax = 0.0;
  MD_Double sumabs = 0.0;
  static MD_Double esum = 0.0;
  static MD_Double csum = 0.0;
  static MD_Int count = 0;
  /*
  printf("lambda = %g\n", coeff[matsize-1]);
  printf("coeff: "); 
  */ 
#endif

  for (i = 1; i < nvec; i++) {
#ifdef DEBUG_LEAST_SQUARE
    /*
    printf("%g, ", coeff[i-1]);     
    */
    sumabs += fabs(coeff[i-1]);
#endif
    error -= coeff[i-1] * A[0][i];
  }

#ifdef DEBUG_LEAST_SQUARE
  if (sumabs > smax) smax = sumabs;
  printf("sum |coefficient| = %f, overall max. = %f\n", sumabs, smax);
#endif

  error = sqrt(error/A[0][0]);

#ifdef DEBUG_LEAST_SQUARE
  count ++;
  esum += error;
  csum += sumabs;
  printf("error=%g, <error>=%g, <csum>=%f\n",error, esum/count, csum/count); 
  }
#endif

  return error;  /* 2-norm square of the eror */
}


/* index starts from 1 */
#ifdef INDEX
#under INDEX
#endif
#define INDEX(i,j,n) (((i)-1)*(n)+(j)-1)

MD_Double calc_trls_coeff(MD_Double *coeff,
			  MD_Double **vec,  /* array of vector */
			  const MD_Int nvec, /* # of vectors */
			  const MD_Int veclen, 
			  const MD_Int constraints /* # constraints */)
{
  /* not elegant */
  enum {MAX_UNKNOWNS=40};
  static MD_Double A[MAX_UNKNOWNS * MAX_UNKNOWNS];
  static MD_Double b[MAX_UNKNOWNS];
  const MD_Int k = nvec - 1;
  const MD_Int m = constraints - 1;
  const MD_Int n = k + constraints;  /* matrixsize */
  MD_Int i,j;

  ASSERT(A && b && MAX_UNKNOWNS > (k+m+1) && constraints >= 0);
  ASSERT(constraints < nvec);

#ifdef DEBUG_LEAST_SQUARE
  printf("nvec=%d, constraints=%d, k=%d, m=%d, n=%d\n", nvec, constraints,k,m,n);
#endif
  /* The symmetric linear system is
   *
   *  [                          1 1  1^3 ... 1^{2m-1} ] [c1  ]    [D_{01}] 
   *  [ D_{ij} = <vec_i, vec_j>  1 2  2^3 ... 2^{2m-1} ] [c2  ]    [D_{02}]
   *  [   i,j=1..k               1 ...                 ] [..  ]    [......]
   *  [                          1 k  k^3 ... k^{2m-1} ] [ck  ]    [D_{0k}]
   *  [ 1   1        ... 1                             ] [lmd0] =  [  1   ]
   *  [ 1   2        ... k                             ] [lmd1]    [  0   ]
   *  [ 1   2^3      ... k^3                           ] [lmd2]    [  0   ]
   *  [ ... ...                            0           ] [....]    [ ...  ]
   *  [ 1   2^{2m-1} ... k^{2m-1}                      ] [lmdm]    [  0   ]
   *  
   */
  for (i = 1; i <= k; i++) {
    b[i-1] = DOT(vec[0], vec[i], veclen); /* index starts from 0 */
    A[INDEX(i,i,n)] = DOT(vec[i],vec[i],veclen);
    for (j=i+1; j<= k; j++) 
      A[INDEX(j,i,n)] = A[INDEX(i,j,n)] = DOT(vec[i],vec[j],veclen);
  }
  if (constraints > 0) {
    memset((void*)(b+k), 0, constraints * sizeof(*b));
    b[k] = 1.0;  /* index starts from 0 */
    for (i=1; i<=k; i++) A[INDEX(i,k+1,n)] = A[INDEX(k+1,i,n)] = 1.0;
    if (constraints > 1) {
      for (i=1; i<=k; i++) A[INDEX(i,k+2,n)]=A[INDEX(k+2,i,n)]=(MD_Double)i;
    }
    if (constraints > 2) {
      for (i=1; i<=k; i++) {
        MD_Double tmp = (MD_Double) (i*i);
        for (j=2; j<=m; j++) {
	  A[INDEX(i,j+k+1,n)] = A[INDEX(j+k+1,i,n)] = A[INDEX(i,j+k,n)] * tmp;
        }
      }
    }
    for (i=k+1; i<=n; i++) for (j=k+1; j<=n; j++)  A[INDEX(i,j,n)] = 0.0;
  }

#ifdef DEBUG_LEAST_SQUARE
  printf("A: \n");
  for (i=1; i<=n; i++) {
    for (j=1; j<=n; j++) printf("%f, ", A[INDEX(i,j,n)]);
    printf("\n");
  }
  outputv(b, n, "b");
#endif
  if (GaussElmination(n, A, b)) {
    printf("failed to solve the equation\n");
    return MD_FAIL;
  }

  memcpy(coeff, b, (nvec-1)*sizeof(*coeff));  /* copy solution */

#ifdef DEBUG_LEAST_SQUARE
  { MD_Double sumabs = 0.0;
    {MD_Double det=1.0;
      for (i = 1; i <= n; i++) det *= A[INDEX(i,i,n)];
      printf("det(A)=%g\n", det);
    }

    printf("coefficients: ");
    for (i = 0; i < nvec-1; i++) {
      printf("%g, ", coeff[i]);     
      sumabs += fabs(coeff[i]);
    }
    printf("\n""sum |coefficient| = %f, \n", sumabs);
  }
#endif

  return 0.0;  /* not computed yet */
}


/* follow Numerical Linear Algebra Lloyd N. Trefethen and David Bau, III
 * lecture 10, algorithm 10.1 */

static MD_Double calc_ls_coeff_householder(MD_Double *coeff,
					   MD_Double **vecs,
					   const MD_Int nvec,
					   const MD_Int veclen)
{
  MD_Int K = nvec - 1;
  MD_Double *vi, *vj, *b, *h;
  MD_Double c, tmp;
  MD_Int i, j, k;
  enum {NMAX=20};
  MD_Int order[NMAX];
  MD_Double cc[NMAX];

  assert(nvec < NMAX + 1);

  printf("K=%d\n", K);

  for (i=1; i<nvec; i++) {
    memcpy(buffer + (i-1)*veclen, vecs[i], veclen*sizeof(MD_Double));
  }
  memcpy(buffer + (nvec-1)*veclen, vecs[0], veclen*sizeof(MD_Double));
  b = buffer + (nvec-1)*veclen;  /* right-hand-side */
  h = buffer + nvec*veclen;  /* reflection vector */

  {
    MD_String name;
    static MD_Int istep = 0;
    sprintf(name, "output/prev%d", istep++);
    bindump_array(buffer, (nvec-1)*veclen, name);
  }

  /* Householder rotations */
  for (i=0; i<K; i++) order[i] = i;
  for (i=0; i<K; i++) {
    vi = buffer + i*veclen;
    c = sqrt(DOT(vi+i, vi+i, veclen-i));
    if (vi[i] < 0) c = 0.0-c;  /* sign(vi[0])*||vi|| */
    for (j=i; j<veclen; j++) h[j] = vi[j]; /* prapare reflection vector */
    h[i] += c;
    tmp = 1.0 / sqrt(DOT(h+i, h+i, veclen-i));
    for (j=i; j<veclen; j++) h[j] *= tmp;
    vi[i] = -c; /* vi[j] = 0 for j > i */
    /* the last modified vector is the vector on the right-hand-side */
    for (j=i+1; j<=K; j++) { 
      vj = buffer + j*veclen;
      tmp = 2.0 * DOT(h+i, vj+i, veclen-i);
      for (k=i; k<veclen; k++) vj[k] -= tmp*h[k];
    }
    /* pivoting, choose the vector with largest length */
#if 1
    if (K-2 > i) {
      MD_Int imax = i+1;
      MD_Double maxv = DOT(buffer+imax*veclen+imax, buffer+imax*veclen+imax, 
			   veclen-imax);
      for (j=i+2; j<K; j++) {
	MD_Double vn = DOT(buffer+j*veclen+i+1, buffer+j*veclen+i+1, 
			   veclen-i-1);
	if (maxv < vn) { maxv=vn; imax=j; }
      }
      printf("i=%d, imax=%d, max(||v||)=%g\n", i, imax, sqrt(maxv));
      /*
      if (maxv < 2e-5*2e-5) {
	printf("cutoff, maxv=%g\n", maxv);
	K = i+1;
	break;
      }
      */
      if (i+1 != imax) { /* interchange rows, h here is a temporary var */
	printf("exchange %d and %d\n", i+1, imax);
	j=order[i+1]; order[i+1]=order[imax]; order[imax]=j;
	memcpy(h, buffer + imax*veclen, veclen*sizeof(MD_Double));
	memcpy(buffer + imax*veclen, buffer + (i+1)*veclen, 
	       veclen*sizeof(MD_Double));
	memcpy(buffer + (i+1)*veclen, h, veclen*sizeof(MD_Double));
      }
    }  
#endif
  }

  /*
  for (i=0; i<=K-1; i++) {
    for (j=i; j<=K-1; j++) printf("%f,", buffer[j*veclen+i]);
    printf("\n");
  }
  for (i=0; i<=K-1; i++) printf("%f\n", b[i]);
  */

  /* back-substitution */
  for (i=K-1; i>=0; i--) {
    cc[i] = b[i] / buffer[i*veclen + i];
    for (j=0; j<i; j++)  b[j] -= buffer[i*veclen+j] * cc[i];
    /* printf("Householder, cc[%d]=%f\n", i, cc[i]); */
  }

  for (i=0; i<nvec-1; i++) coeff[i] = 0;
  for (i=0; i<K; i++) coeff[order[i]] = cc[i];
  for (i=0; i<K; i++) {
    printf("Householder, order[%d]=%d, coeff[%d]=%f\n", i, order[i], 
	   order[i], coeff[order[i]]);
  }

  c = 0.0;
  for (i=K; i<veclen; i++) c += b[i]*b[i];
  c = sqrt(c);

  printf("Householder error: ||b-Ax||_2 = %g\n", c);

  return c;  /* return error in 2-norm square */

}
