/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2007 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: VolCPotential.C,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.11 $      $Date: 2007/03/13 18:09:27 $
 *
 ***************************************************************************/
/*
 * Calculate a coulombic potential map
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "config.h"         // force recompile when configuration changes
#include "utilities.h"
#include "Inform.h"
#include "VMDThreads.h"
#include "VolCPotential.h" 
#include "CUDAKernels.h"

/*
 * Work description handed to each energythread() invocation.
 * Every worker receives its own copy; the atoms and grideners pointers
 * refer to arrays shared by all workers.
 */
typedef struct {
  int threadid;       /* index of this worker; also the first plane it handles */
  int threadcount;    /* total number of workers; used as the plane stride     */
  float *atoms;       /* packed atom data, 4 floats per atom: x, y, z, q       */
  float *grideners;   /* energy grid, accumulated in place                     */
  long numplane;      /* number of grid planes (outermost, z index)            */
  long numcol;        /* number of columns per plane (y index)                 */
  long numpt;         /* number of points per column (innermost, x index)      */
  long natoms;        /* number of atoms stored in atoms                       */
  float gridspacing;  /* distance between adjacent grid points                 */
} enthrparms;

/* Forward declaration of the worker routine (defined later in this file) so
 * that vol_cpotential_cpu() can reference it before its definition. */
static void * energythread(void *);


/*
 * Compute the coulombic potential map on the CPU.
 *
 * Builds one enthrparms record per worker, all referring to the same atom
 * and grid arrays, and runs energythread() on each of them: as separate
 * threads that are joined before returning when VMDTHREADS is defined,
 * otherwise as a single direct call.
 *
 * Parameters:
 *   natoms      - number of atoms in the atoms array
 *   atoms       - packed atom data consumed by energythread()
 *   grideners   - energy grid, accumulated in place by the workers
 *   numplane    - number of grid planes
 *   numcol      - number of columns per plane
 *   numpt       - number of points per column
 *   gridspacing - distance between adjacent grid points
 *
 * Returns 0 on success, or -1 if the per-thread bookkeeping arrays could
 * not be allocated (grideners is left untouched in that case).
 */
static int vol_cpotential_cpu(long int natoms, float* atoms, float* grideners, long int numplane, long int numcol, long int numpt, float gridspacing) {
  int i;
  enthrparms *parms;
  vmd_thread_t * threads;

#if defined(VMDTHREADS)
  int numprocs = vmd_thread_numprocessors();
#else
  int numprocs = 1;
#endif

  /* A count below one would silently skip all of the work (and a negative
   * one would wrap the allocation sizes), so always use at least one. */
  if (numprocs < 1)
    numprocs = 1;

  printf("Using %d processors\n", numprocs);  

  /* allocate the zero-initialized array of thread handles; passing
   * (count, size) lets calloc detect overflow of the product */
  threads = (vmd_thread_t *) calloc(numprocs, sizeof(vmd_thread_t));

  /* allocate the array of thread parameters */
  parms = (enthrparms *) malloc(numprocs * sizeof(enthrparms));

  if (threads == NULL || parms == NULL) {
    fprintf(stderr, "vol_cpotential_cpu: unable to allocate thread data for %d processors\n", numprocs);
    free(parms);   /* free(NULL) is a no-op */
    free(threads);
    return -1;
  }

  /* every worker sees the same problem; only threadid differs */
  for (i=0; i<numprocs; i++) {
    parms[i].threadid = i;
    parms[i].threadcount = numprocs;
    parms[i].atoms = atoms;
    parms[i].grideners = grideners;
    parms[i].numplane = numplane;
    parms[i].numcol = numcol;
    parms[i].numpt = numpt;
    parms[i].natoms = natoms;
    parms[i].gridspacing = gridspacing;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  /* NOTE(review): the result of vmd_thread_create() is not checked here;
   * confirm its return convention and handle creation failures. */
  for (i=0; i<numprocs; i++) {
    vmd_thread_create(&threads[i], energythread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numprocs; i++) {
    vmd_thread_join(threads[i], NULL);
  } 
#else
  /* single thread does all of the work */
  energythread((void *) &parms[0]);
#endif

  /* release the thread parameters and handles */
  free(parms);
  free(threads);

  return 0;
}


/*
 * Worker routine: accumulate the coulombic energy contribution of every
 * atom into a subset of the grid planes.
 *
 * voidparms points to an enthrparms record.  The worker handles planes
 * threadid, threadid + threadcount, threadid + 2*threadcount, ... so
 * different workers write to different planes of grideners.
 *
 * Grid point (i, j, k) is located at gridspacing * (i, j, k), and its
 * energy is stored at grideners[numcol*numpt*k + numpt*j + i].  Atoms are
 * read as 4 consecutive floats each: x, y, z, q.  For every grid point the
 * sum of q / distance over all atoms is added to the existing grid value.
 *
 * Always returns NULL.  If the per-atom work table cannot be allocated, a
 * message is printed and the worker returns without touching the grid.
 */
static void * energythread(void *voidparms) {
  enthrparms *parms = (enthrparms *) voidparms;
  /* 
   * copy in per-thread parameters 
   */
  const float *atoms = parms->atoms;
  float* grideners = parms->grideners;
  const long int numplane = parms->numplane;
  const long int numcol = parms->numcol;
  const long int numpt = parms->numpt;
  const long int natoms = parms->natoms;
  const float gridspacing = parms->gridspacing;
  const int threadid = parms->threadid;
  const int threadcount = parms->threadcount;
  /* indices are long to match the long extents they are compared against */
  long int i, j, k, n;
  double starttime, lasttime, totaltime;
  msgtimer msgt;

  /* Calculate the coulombic energy at each grid point from each atom
   * This is by far the most time consuming part of the process
   * We iterate over z,y,x, and then atoms
   */

  printf("thread %d started...\n", threadid);
  starttime = time_of_day();
  msg_timer_init(&msgt, 5); // NOTE(review): 5 is presumably the reporting interval in seconds -- confirm

  // Holds atom x, dy**2+dz**2, and atom q as x/r/q/x/r/q...
  float * xrq = (float *) malloc(3*natoms * sizeof(float)); 
  // malloc may legitimately return NULL for a zero-byte request, so only
  // treat NULL as a failure when there are atoms to store
  if (xrq == NULL && natoms > 0) {
    printf("thread %d: unable to allocate per-atom work table, aborting\n",
           threadid);
    return NULL;
  }
  const long int maxn = natoms * 3;

  // For each point in the cube...
  for (k=threadid; k<numplane; k+= threadcount) {
    const float z = gridspacing * (float) k;
    // elapsed time at the start of this plane, used for the per-plane time
    lasttime = time_of_day() - starttime;
    for (j=0; j<numcol; j++) {
      const float y = gridspacing * (float) j;
      long int voxaddr = numcol*numpt*k + numpt*j;

      // Prebuild a table of dy and dz values on a per atom basis:
      // dy and dz are constant along a column, so only dx changes below
      for (n=0; n<natoms; n++) {
        long int addr3 = n*3;
        long int addr4 = n*4;
        float dy = y - atoms[addr4 + 1];
        float dz = z - atoms[addr4 + 2];
        xrq[addr3    ] = atoms[addr4];
        xrq[addr3 + 1] = dz*dz + dy*dy;
        xrq[addr3 + 2] = atoms[addr4 + 3];
      }

#if defined(__INTEL_COMPILER)
// help the vectorizer make reasonable decisions (used prime to keep it honest)
#pragma loop count(1009)
#endif
      for (i=0; i<numpt; i++) {
        float energy = grideners[voxaddr + i]; // Energy at current grid point
        const float x = gridspacing * (float) i;

#if defined(__INTEL_COMPILER)
// help the vectorizer make reasonable decisions
#pragma vector always 
#endif
        // Calculate the interaction with each atom
        for (n=0; n<maxn; n+=3) {
          float dx = x - xrq[n];
          energy += xrq[n + 2] / sqrtf(dx*dx + xrq[n + 1]);
        }
        grideners[voxaddr + i] = energy;
      }
    }
    totaltime = time_of_day() - starttime;

    if (msg_timer_timeout(&msgt)) {
      // XXX: we have to use printf here as msgInfo is not thread-safe yet.
      // Reports time spent on this plane, total elapsed time, and a linear
      // extrapolation of the total run time from the plane index reached.
      printf("thread[%d] plane %ld/%ld time %.2f, elapsed %.1f, est. total: %.1f\n",
             threadid, k, numplane,
             totaltime - lasttime, totaltime, 
             totaltime * numplane / (k+1));
    }
  }

  free(xrq);

  return NULL;
}

/*
 * Public entry point: compute the coulombic potential map.
 *
 * When built with VMDCUDA the GPU implementation is tried first; if it
 * reports a nonzero status, or was not compiled in at all, the CPU
 * implementation is run instead.  The wall-clock time of the whole call is
 * reported through msgInfo.
 *
 * Returns the status of the last implementation that was run.
 */
int vol_cpotential(long int natoms, float* atoms, float* grideners, long int numplane, long int numcol, long int numpt, float gridspacing) {
  const double t0 = time_of_day();

  // nonzero means "no implementation has succeeded yet"
  int status = -1;

#if defined(VMDCUDA)
  status = vmd_cuda_vol_cpotential(natoms, atoms, grideners,
                                   numplane, numcol, numpt, gridspacing);
#endif

  // fall back to the CPU when the GPU path failed or was never attempted
  if (status != 0) {
    status = vol_cpotential_cpu(natoms, atoms, grideners,
                                numplane, numcol, numpt, gridspacing);
  }

  const double elapsed = time_of_day() - t0;
  msgInfo << "Coulombic potential map calculation complete: "
          << elapsed << " seconds" << sendmsg;

  return status;
}



