/***************************************************************************
 *cr
 *cr            (C) Copyright 2006 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/* Multithreaded and vectorized implementation by John Stone */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>

#include "util.h"    /* timer code taken from Tachyon */
#include "threads.h" /* threads code taken from Tachyon */

/*
 * Work description handed to each energythread() worker.
 * The master fills in one record per worker; all records share the same
 * array pointers and grid dimensions and differ only in threadid.
 */
typedef struct {
  int threadid;               /* index of this worker, 0 .. threadcount-1     */
  int threadcount;            /* number of workers; also the plane stride     */
  float *atoms;               /* atom data, read as 4 floats (x,y,z,q) each   */
  float *grideners;           /* energy grid the workers write into           */
  long numplane;              /* grid extent walked by the outer (z) loop     */
  long numcol;                /* grid extent walked by the middle (y) loop    */
  long numpt;                 /* grid extent walked by the inner (x) loop     */
  long natoms;                /* number of atoms stored in atoms              */
  float gridspacing;          /* multiplied by a grid index to get a coord    */
  unsigned char *excludepos;  /* per-grid-point flags; nonzero means skip     */
} enthrparms;

/* Worker thread entry point; the definition appears later in this file. */
static void *energythread(void *voidparms);

/* 
 * Master thread, sets up and tears down the infrastructure to run all of
 * the slave threads, spawns them, and then joins them when they complete.
 *
 * Parameters:
 *   atoms        atom data, read by the workers as 4 floats per atom
 *   grideners    energy grid the workers write into
 *   numplane,
 *   numcol,
 *   numpt        grid extents walked by the z, y and x loops respectively
 *   natoms       number of atoms in atoms
 *   gridspacing  scale factor turning a grid index into a coordinate
 *   excludepos   per-grid-point flags; points with a nonzero flag are skipped
 *   maxnumprocs  upper bound on the number of worker threads (only used in
 *                THR builds); values below 1 are treated as 1
 *
 * Returns 0 on success, or -1 if the bookkeeping arrays could not be
 * allocated or a worker thread could not be created.  In the latter case
 * the threads that did start are still joined before returning, but the
 * grid is only partially computed.
 */
int calc_grid_energies_excl_thr(float* atoms, float* grideners, long int numplane, long int numcol, long int numpt, long int natoms, float gridspacing, unsigned char* excludepos, int maxnumprocs) {
  int i;
  int rc = 0;
  enthrparms *parms;
  rt_thread_t * threads;

#if defined(THR)
  int availprocs = rt_thread_numprocessors();
  int numprocs;
  if (maxnumprocs <= availprocs) {
    numprocs = maxnumprocs;
  } else {
    numprocs = availprocs;
  }

  /* A non-positive count would spawn no workers and silently compute nothing */
  if (numprocs < 1) {
    numprocs = 1;
  }

  printf("Multithreaded build, using %d processors\n", numprocs);  
#else
  int numprocs = 1;
  printf("Single-threaded build, using 1 processor\n");  
#endif

  /* allocate array of threads (zero-filled) and array of thread parameters */
  threads = (rt_thread_t *) calloc((size_t) numprocs, sizeof(rt_thread_t));
  parms = (enthrparms *) malloc((size_t) numprocs * sizeof(enthrparms));
  if (threads == NULL || parms == NULL) {
    fprintf(stderr, "Unable to allocate bookkeeping for %d threads\n", numprocs);
    free(parms);
    free(threads);
    return -1;
  }

  /* every worker gets the same inputs and differs only in its threadid */
  for (i=0; i<numprocs; i++) {
    parms[i].threadid = i;
    parms[i].threadcount = numprocs;
    parms[i].atoms = atoms;
    parms[i].grideners = grideners;
    parms[i].numplane = numplane;
    parms[i].numcol = numcol;
    parms[i].numpt = numpt;
    parms[i].natoms = natoms;
    parms[i].gridspacing = gridspacing;
    parms[i].excludepos = excludepos;
  }

#if defined(THR)
  /*
   * spawn child threads to do the work
   * NOTE(review): relies on rt_thread_create() returning 0 on success, as
   * the Tachyon threads layer does -- confirm against threads.h.
   */
  int nstarted = 0;
  for (i=0; i<numprocs; i++) {
    if (rt_thread_create(&threads[i], energythread, &parms[i]) != 0) {
      fprintf(stderr, "Unable to create worker thread %d\n", i);
      rc = -1;
      break;
    }
    nstarted++;
  }

  /* join only the threads that were actually started */
  for (i=0; i<nstarted; i++) {
    rt_thread_join(threads[i], NULL);
  } 
#else
  /* single thread does all of the work */
  energythread((void *) &parms[0]);
#endif

  /* free thread parms */
  free(parms);
  free(threads);

  return rc;
}


/* 
 * energythread(): worker thread entry point function
 *
 * Calculate the coulombic energy at each grid point from each atom
 * This is by far the most time consuming part of the process
 * We iterate over z,y,x, and then atoms
 * This function is the same as the original calc_grid_energies, except
 * that it utilizes the exclusion grid
 *
 * voidparms points to an enthrparms record.  The worker handles planes
 * threadid, threadid + threadcount, threadid + 2*threadcount, ... and
 * writes grideners only for points of those planes.  Points whose
 * excludepos entry is nonzero are skipped and their grideners entry is
 * left untouched.  atoms is read as 4 floats per atom: x, y, z, q.
 *
 * Progress is printed to stdout after every plane.
 *
 * Always returns NULL.  If the per-atom work table cannot be allocated, a
 * message is written to stderr and the worker returns without computing
 * anything; the caller has no other way to observe that failure.
 */
static void * energythread(void *voidparms) {
  enthrparms *parms = (enthrparms *) voidparms;
  /* 
   * copy in per-thread parameters 
   */
  const float *atoms = parms->atoms;
  float* grideners = parms->grideners;
  const long int numplane = parms->numplane;
  const long int numcol = parms->numcol;
  const long int numpt = parms->numpt;
  const long int natoms = parms->natoms;
  const float gridspacing = parms->gridspacing;
  const unsigned char* excludepos = parms->excludepos;
  const int threadid = parms->threadid;
  const int threadcount = parms->threadcount;

  int i,j,k;  /* Grid loop counters */
  long int n; /* Atom loop counter, as wide as natoms so it cannot overflow */

  rt_timerhandle timer = rt_timer_create();
  rt_timer_start(timer);
  float lasttime;
  float totaltime;

  printf("thread %d started...\n", threadid);

  /* Holds atom x, dy**2+dz**2, and atom q as x/r/q/x/r/q... */
  float *xrq = NULL;
  long int maxn = 0;
  if (natoms > 0) {
    /* refuse sizes whose byte count would wrap around in size_t */
    if ((size_t) natoms <= ((size_t) -1) / (3 * sizeof(float))) {
      xrq = (float *) malloc(3 * (size_t) natoms * sizeof(float));
    }
    if (xrq == NULL) {
      fprintf(stderr, "thread %d: unable to allocate work table for %ld atoms\n",
              threadid, natoms);
      rt_timer_destroy(timer);
      return NULL;
    }
    maxn = natoms * 3;
  }

  /* For each point in the cube... */
  for (k=threadid; k<numplane; k+= threadcount) {
    lasttime = rt_timer_timenow(timer);
    const float z = gridspacing * (float) k;
    for (j=0; j<numcol; j++) {
      const float y = gridspacing * (float) j;
      long int voxaddr = numcol*numpt*k + numpt*j;

      /* Prebuild a table of dy and dz values on a per atom basis */
      for (n=0; n<natoms; n++) {
        const long int addr3 = n*3;
        const long int addr4 = n*4;
        float dy = y - atoms[addr4 + 1];
        float dz = z - atoms[addr4 + 2];
        xrq[addr3    ] = atoms[addr4];
        xrq[addr3 + 1] = dz*dz + dy*dy;
        xrq[addr3 + 2] = atoms[addr4 + 3];
      }

#if defined(__INTEL_COMPILER)
/* help the vectorizer make good decisions (used prime to keep it honest) */
#pragma loop count(1009)
#endif
      for (i=0; i<numpt; i++) {
        /* Check if we're on an excluded point, and skip it if we are */
        if (excludepos[voxaddr + i] != 0) continue;

        float energy = 0; /* Energy of current grid point */
        const float x = gridspacing * (float) i;

#if defined(__INTEL_COMPILER)
/* help the vectorizer make reasonable decisions */
#pragma vector always 
#endif
        /* Calculate the interaction with each atom */
        for (n=0; n<maxn; n+=3) {
          float dx = x - xrq[n];
          energy += xrq[n + 2] / sqrtf(dx*dx + xrq[n + 1]);
        }
        grideners[voxaddr + i] = energy;
      }
    }
    totaltime = rt_timer_timenow(timer);
    /*
     * This thread only visits every threadcount-th plane, so after plane k
     * it has covered (k - threadid + threadcount) / numplane of its share;
     * extrapolate the total from that fraction rather than from k+1.
     */
    printf("thread[%d] plane %d/%ld time %.2f, elapsed %.1f, est. total: %.1f\n",
           threadid, k, numplane,

           totaltime - lasttime, totaltime, 
           totaltime * numplane / (k - threadid + threadcount));
    fflush(stdout);
  }

  rt_timer_destroy(timer);
  free(xrq);

  return NULL;
}

