/*
 * A set of simple Multiple Debye-Huckel (MDH) kernels 
 * inspired by APBS:
 *   http://www.poissonboltzmann.org/ 
 *
 * This code was all originally written by David Gohara on MacOS X, 
 * and has subsequently been modified by John Stone, porting to Linux,
 * adding vectorization, and several other platform-specific 
 * performance optimizations.
 * 
 */

#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "WKFThreads.h"
#include "WKFUtils.h"

#include "cpukerns.h"

/*
 * CUERR: polls cudaGetLastError() and, when an error is pending, prints the
 * CUDA error string together with the current file and line, prints
 * "Thread aborting..." and then executes `return -1` in the ENCLOSING
 * function.  It can therefore only be used inside functions whose return
 * type accepts -1, and any resources held at that point are not released.
 * Change the `#if 1` to `#if 0` to compile the check away entirely.
 */
#if 1
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  printf("Thread aborting...\n"); \
  return -1; }}
#else
#define CUERR
#endif


/* SEP: prints a line of dashes to stdout; used to separate report sections. */
#define SEP printf("-----------------------------------------------------------\n")

/*
 * Reads atom and grid data from the text files "atom.txt" and "grid.txt"
 * in the current working directory.
 *
 *   ax, ay, az, charge, size : output arrays with room for natom floats;
 *                              each atom record is five whitespace-separated
 *                              floats in the order x y z charge size.
 *   gx, gy, gz               : output arrays with room for ngrid floats;
 *                              each grid record is three floats x y z.
 *   natom, ngrid             : number of records to read from each file.
 *
 * If a file cannot be opened, or contains fewer complete records than
 * requested, a diagnostic is written to stderr and the function returns
 * early; array elements that were not read are left untouched.
 */
void readfile(float *ax,float *ay,float *az,
              float *gx,float *gy,float *gz,
              float *charge,float *size,int natom,int ngrid){

    int i;

    FILE * pFile = fopen("atom.txt","r");
    if (pFile == NULL) {
        /* passing a NULL stream to fscanf()/fclose() is undefined behavior */
        perror("atom.txt");
        return;
    }

    printf("Reading input file 1\n");
    for (i=0; i<natom; i++) {
        /* fscanf returns the number of converted fields; anything short of
           5 means EOF or malformed input */
        if (fscanf(pFile,"%g %g %g %g %g",
                   &ax[i],&ay[i],&az[i],&charge[i],&size[i]) != 5) {
            fprintf(stderr, "atom.txt: expected %d atom records, read %d\n",
                    natom, i);
            fclose(pFile);
            return;
        }
    }
    fclose(pFile);

    pFile = fopen("grid.txt","r");
    if (pFile == NULL) {
        perror("grid.txt");
        return;
    }

    printf("Reading input file 2\n");
    for (i=0; i<ngrid; i++) {
        if (fscanf(pFile,"%g %g %g",
                   &gx[i],&gy[i],&gz[i]) != 3) {
            fprintf(stderr, "grid.txt: expected %d grid records, read %d\n",
                    ngrid, i);
            fclose(pFile);
            return;
        }
    }
    fclose(pFile);

    /* terminate the line so later output does not run into this message */
    printf("Done reading inputs.\n");
}

/*
 * Fills the atom and grid arrays with pseudo-random values in [0, 1]
 * drawn from rand().
 *
 * For every atom the values are drawn in the order x, y, z, charge, size;
 * afterwards, for every grid point, in the order x, y, z.  The sequence is
 * therefore fully determined by the current rand() seed.
 */
void gendata(float *ax,float *ay,float *az,
             float *gx,float *gy,float *gz,
             float *charge,float *size,int natom,int ngrid){

    /* per-record fields, listed in the order they consume rand() values */
    float *atomfields[5] = { ax, ay, az, charge, size };
    float *gridfields[3] = { gx, gy, gz };

    printf("Generating Data 1\n");
    for (int a = 0; a < natom; a++) {
        for (int f = 0; f < 5; f++) {
            atomfields[f][a] = ((float) rand() / (float) RAND_MAX);
        }
    }

    printf("Generating Data 2\n");
    for (int g = 0; g < ngrid; g++) {
        for (int f = 0; f < 3; f++) {
            gridfields[f][g] = ((float) rand() / (float) RAND_MAX);
        }
    }

    printf("Done generating inputs.\n\n");
}

/*
 * Sums the first ngrid entries of arr in double precision, in index order,
 * and prints the result to stdout.
 */
void print_total(float * arr, int ngrid){
    double sum = 0.0;
    int idx = 0;

    while (idx < ngrid) {
        sum += arr[idx];
        idx++;
    }
    printf("Accumulated value: %1.7g\n", sum);
}


/*
 * Build-time switches for the device kernels below.  Each `#if 1` selects
 * the first alternative; change it to `#if 0` to select the other.
 */

/* FDIVIDE(a, b): the __fdividef() fast-division intrinsic, or plain `/`. */
#if 1
#define FDIVIDE(a, b) __fdividef((a), (b))
#else
#define FDIVIDE(a, b) ((a) / (b))
#endif

/* EXPF: the __expf() fast-math intrinsic, or the regular expf(). */
#if 1
#define EXPF __expf
#else
#define EXPF expf
#endif

/* RESTRICT: expands to __restrict__ on kernel pointer parameters, or to nothing. */
#if 1
#define RESTRICT __restrict__
#else
#define RESTRICT 
#endif

/*
 * Work-group size used by mdh_scalar_opt:
 *   - USETEMPLATE defined: WGSIZE is not a macro; mdh_scalar_opt becomes a
 *     template and WGSIZE is its integer template parameter.
 *   - otherwise WGSIZE is a macro, either the constant 64 or blockDim.x.
 */
#if 1
#define USETEMPLATE
#elif 0
#define WGSIZE 64
#else
#define WGSIZE blockDim.x
#endif

/*
 * Reference MDH kernel: one thread per grid point.
 *
 * Each thread reads its grid point at index blockIdx.x*blockDim.x +
 * threadIdx.x and accumulates, over all atoms,
 *
 *   pre1 * (charge / dist) * exp(-xkappa * (dist - size)) / (1 + xkappa * size)
 *
 * Atoms are staged through the dynamic shared array in tiles of at most
 * blockDim.x atoms.  A tile is stored as five consecutive arrays
 * (x, y, z, charge, size), each `tile` floats long, so the launch must
 * supply at least 5 * blockDim.x floats of dynamic shared memory.
 *
 * There is no bounds check on the grid index: gx/gy/gz are read and val is
 * written for every launched thread.
 */
__global__
void mdh_scalar(float *ax, float *ay, float *az,
                float *charge, float *size,
                float *val,
                float *gx, float *gy, float *gz,
                float pre1, float xkappa, int natoms) {
  extern __shared__ float smem[];

  const int tid  = threadIdx.x;
  const int gidx = (blockIdx.x * blockDim.x) + tid;

  // this thread's grid point
  const float px = gx[gidx];
  const float py = gy[gidx];
  const float pz = gz[gidx];

  float potential = 0.0f;
  int tile = blockDim.x;

  for (int base = 0; base < natoms; base += tile) {
    // the final tile may be shorter than a full block
    if ((base + tile) > natoms) tile = natoms - base;

    // sub-arrays of the current tile; the stride follows the tile length
    float *sx = smem;
    float *sy = smem +     tile;
    float *sz = smem + 2 * tile;
    float *sq = smem + 3 * tile;
    float *sr = smem + 4 * tile;

    // wait until every thread is done with the previous tile
    __syncthreads();
    const int src = base + tid;
    if (src < natoms) {
      sx[tid] = ax[src];
      sy[tid] = ay[src];
      sz[tid] = az[src];
      sq[tid] = charge[src];
      sr[tid] = size[src];
    }
    // make the freshly loaded tile visible to the whole block
    __syncthreads();

    for (int k = 0; k < tile; k++) {
      const float dx = px - sx[k];
      const float dy = py - sy[k];
      const float dz = pz - sz[k];
      const float radius = sr[k];
      const float dist = sqrtf(dx*dx + dy*dy + dz*dz);
      potential += pre1 * FDIVIDE(sq[k], dist) *
                   FDIVIDE(EXPF(-xkappa * (dist - radius)),
                   (1.0f + xkappa * radius));
    }
  }

  val[gidx] = potential;
}


/*
 * Tuned MDH kernel: one thread per grid point.
 *
 * Each thread handles the grid point at index blockIdx.x*WGSIZE + threadIdx.x
 * and stores into val[] the sum over all atoms of
 *
 *   charge * exp(-xkappa * (dist - size)) / ((1 + xkappa * size) * dist)
 *
 * multiplied once, after the loop, by pre1.
 *
 * WGSIZE is the template parameter when USETEMPLATE is defined and a macro
 * otherwise.  Launch requirements that follow from the indexing below:
 *   - the grid index and the shared-memory layout are computed from WGSIZE,
 *     not from blockDim.x, so the block must be launched with exactly WGSIZE
 *     threads;
 *   - the dynamic shared array holds five consecutive arrays of WGSIZE floats
 *     (x, y, z, charge, size), i.e. 5 * WGSIZE floats are required;
 *   - there is no bounds check on the grid index: gx/gy/gz are read and val
 *     is written for every launched thread.
 */
#if defined(USETEMPLATE)
template <int WGSIZE>
#endif
__global__
void mdh_scalar_opt(float * RESTRICT ax, 
                    float * RESTRICT ay, 
                    float * RESTRICT az,
                    float * RESTRICT charge, 
                    float * RESTRICT size, 
                    float * RESTRICT val,
                    float * RESTRICT gx, 
                    float * RESTRICT gy, 
                    float * RESTRICT gz,
                    float pre1, float xkappa, int natoms) {
  extern __shared__ float smem[];
  int igrid = (blockIdx.x * WGSIZE) + threadIdx.x;
  int lsize = WGSIZE;
  int lid = threadIdx.x;
  // this thread's grid point
  float lgx = gx[igrid];
  float lgy = gy[igrid];
  float lgz = gz[igrid];
  float v = 0.0f;
  for (int jatom = 0; jatom < natoms; jatom+=WGSIZE) {
    // wait until every thread is done reading the previous tile
    __syncthreads();
    // each thread stages one atom; threads past the end of the atom list skip
    if ((jatom + lid) < natoms) {
      smem[lid           ] = ax[jatom + lid];
      smem[lid +   WGSIZE] = ay[jatom + lid];
      smem[lid + 2*WGSIZE] = az[jatom + lid];
      smem[lid + 3*WGSIZE] = charge[jatom + lid];
      smem[lid + 4*WGSIZE] = size[jatom + lid];
    }
    // make the freshly loaded tile visible to the whole block
    __syncthreads();
    // the last tile may be partial: only iterate over the atoms actually loaded
    if ((jatom+lsize) > natoms) lsize = natoms - jatom;
    for (int i=0; i<lsize; i++) {
      float dx = lgx - smem[i           ];
      float dy = lgy - smem[i +   WGSIZE];
      float dz = lgz - smem[i + 2*WGSIZE];
      float dist = sqrtf(dx*dx + dy*dy + dz*dz);
      // both divisions of the reference kernel are folded into a single one
      v += smem[i + 3*WGSIZE] * 
           FDIVIDE(EXPF(-xkappa * (dist - smem[i + 4*WGSIZE])),
           ((1.0f + xkappa * smem[i + 4*WGSIZE])*dist));
    }
  }
  // pre1 is constant across atoms, so it is applied once to the final sum
  val[igrid] = pre1 * v;
}

/*
 * MDHKERN(a): expands to one launch of mdh_scalar_opt.
 *
 * The expansion refers to GSz, BSz, shared_size, ax_d, ay_d, az_d, charge_d,
 * size_d, val_d, gx_d, gy_d, gz_d, pre1, xkappa and natom by name, so
 * variables with exactly these names must be in scope where the macro is
 * used.  With USETEMPLATE defined, `a` is passed as the template argument
 * and must be a compile-time constant; otherwise `a` is ignored.
 */
#if defined(USETEMPLATE)
#define MDHKERN(a) \
    mdh_scalar_opt<(a)><<<GSz, BSz, shared_size>>>(ax_d, ay_d, az_d, \
                                                   charge_d, size_d, \
                                                   val_d,            \
                                                   gx_d, gy_d, gz_d, \
                                                   pre1, xkappa, natom)
#else
#define MDHKERN(a) \
    mdh_scalar_opt<<<GSz, BSz, shared_size>>>(ax_d, ay_d, az_d, \
                                              charge_d, size_d, \
                                              val_d,            \
                                              gx_d, gy_d, gz_d, \
                                              pre1, xkappa, natom)
#endif


/* Prints a diagnostic for a failed CUDA runtime call; returns nonzero on failure. */
static int mdh_cuda_check(cudaError_t err, const char *file, int line) {
    if (err == cudaSuccess)
        return 0;
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), file, line);
    printf("Thread aborting...\n");
    return -1;
}

/* Allocates nbytes of device memory at *dst and fills it from the host buffer src. */
static cudaError_t mdh_upload(float **dst, const float *src, size_t nbytes) {
    cudaError_t err = cudaMalloc((void**) dst, nbytes);
    if (err != cudaSuccess) {
        *dst = NULL;   // keep the pointer safe to hand to cudaFree()
        return err;
    }
    return cudaMemcpy(*dst, src, nbytes, cudaMemcpyHostToDevice);
}

// Evaluate a CUDA runtime call; on failure report it and jump to the
// cleanup label of the enclosing function.
#define MDH_TRY(call) \
    do { if (mdh_cuda_check((call), __FILE__, __LINE__)) goto cleanup; } while (0)

/*
 * Uploads the atom and grid data, runs mdh_scalar_opt once as a warm-up and
 * then itmax more times under a timer, reads the result back into val and
 * prints timing information plus the accumulated value.
 *
 *   wgsize   threads per block.  When USETEMPLATE is defined only 64, 128
 *            and 256 have a kernel instantiation; any other value falls
 *            back to 64 (with a message) so that the block size and the
 *            shared-memory size always match the launched instantiation.
 *   vecsize  grid points per thread; the grid is ngadj / (vecsize * wgsize)
 *            blocks, so ngadj should be a multiple of vecsize * wgsize.
 *   ngrid    number of grid points copied back into val and summed.
 *   natom    number of atoms in ax, ay, az, charge, size.
 *   ngadj    padded grid size: gx, gy, gz and val must hold ngadj floats.
 *   naadj    unused.
 *   itmax    number of timed kernel launches.
 *   runtotal receives the total wall-clock time (upload through read-back);
 *            written only on success.
 *
 * The device is selected through the CUDADEV environment variable when set.
 * Returns 0 on success and -1 on invalid parameters or any CUDA error; all
 * device buffers and timers are released on every path.
 */
int exec_kernel(int wgsize, int vecsize, 
                int ngrid, int natom, int ngadj, int naadj,
                float *ax, float *ay, float *az,
                float *gx, float *gy, float *gz,
                float *charge, float *size, 
                float xkappa, float pre1, 
                float *val, int itmax,
                double *runtotal){
    double cu_attach = 0.0, cu_alloc = 0.0, cu_enqueue = 0.0;
    double cu_read = 0.0, cu_total = 0.0;
    int rc = -1;

#if defined(USETEMPLATE)
    // Only these sizes are instantiated below.  Launching the 64-wide
    // instantiation with a different block size would index shared and
    // global memory out of bounds, so normalize the size up front.
    if (wgsize != 64 && wgsize != 128 && wgsize != 256) {
        printf("Unsupported workgroup size %d, falling back to 64\n", wgsize);
        wgsize = 64;
    }
#endif

    if (wgsize < 1 || vecsize < 1 || natom < 0 || ngrid < 0 || ngadj < ngrid) {
        printf("exec_kernel: invalid launch parameters\n");
        return -1;
    }

    // Optional device selection, followed by a throw-away allocation that is
    // issued before the timers start (presumably so that runtime start-up
    // cost is not attributed to the timed sections).
    const char *devstr = getenv("CUDADEV");
    if (devstr && mdh_cuda_check(cudaSetDevice(atoi(devstr)), __FILE__, __LINE__))
        return -1;

    float *foo = NULL;
    if (mdh_cuda_check(cudaMalloc((void**) &foo, sizeof(float)), __FILE__, __LINE__))
        return -1;
    cudaFree(foo);

    // Everything below is declared before the first MDH_TRY so that the
    // jump to `cleanup` never bypasses an initialization.
    size_t atom_buffer_size = sizeof(float) * natom;
    size_t grid_buffer_size = sizeof(float) * ngrid;
    size_t gadj_buffer_size = sizeof(float) * ngadj;

    // five arrays (x, y, z, charge, size) of wgsize floats each
    int shared_size = (5 * wgsize) * sizeof(float);
    dim3 BSz = dim3(wgsize, 1, 1);
    // scale number of work units by vector size
    dim3 GSz = dim3(ngadj / (vecsize * wgsize), 1, 1);

    float *ax_d = NULL, *ay_d = NULL, *az_d = NULL;
    float *charge_d = NULL, *size_d = NULL;
    float *gx_d = NULL, *gy_d = NULL, *gz_d = NULL, *val_d = NULL;

    wkf_timerhandle timer = wkf_timer_create();
    wkf_timerhandle totaltimer = wkf_timer_create();

    wkf_timer_start(totaltimer);
    wkf_timer_start(timer);

    // Allocate device buffers and copy the inputs to the device
    MDH_TRY(mdh_upload(&ax_d, ax, atom_buffer_size));
    MDH_TRY(mdh_upload(&ay_d, ay, atom_buffer_size));
    MDH_TRY(mdh_upload(&az_d, az, atom_buffer_size));
    MDH_TRY(mdh_upload(&charge_d, charge, atom_buffer_size));
    MDH_TRY(mdh_upload(&size_d, size, atom_buffer_size));

    MDH_TRY(mdh_upload(&gx_d, gx, gadj_buffer_size));
    MDH_TRY(mdh_upload(&gy_d, gy, gadj_buffer_size));
    MDH_TRY(mdh_upload(&gz_d, gz, gadj_buffer_size));

    MDH_TRY(mdh_upload(&val_d, val, gadj_buffer_size));

    // Wait for the uploads to finish
    MDH_TRY(cudaDeviceSynchronize());

    wkf_timer_stop(timer);
    cu_alloc = wkf_timer_time(timer);

    printf("Bsz: %d %d %d,  Gsz: %d %d %d  shared: %d\n",
           BSz.x, BSz.y, BSz.z, GSz.x, GSz.y, GSz.z, shared_size);

    // warm-up run
    switch(wgsize) {
      case 256: MDHKERN(256); break;
      case 128: MDHKERN(128); break;
       default:
      case  64: MDHKERN( 64); break;
    }
    // a bad launch configuration is reported by cudaGetLastError(), a fault
    // inside the kernel by the synchronizing call
    MDH_TRY(cudaGetLastError());
    MDH_TRY(cudaDeviceSynchronize());

    wkf_timer_start(timer);
    //Queue up the kernels itmax times
#if defined(USETEMPLATE)
    switch(wgsize) {
      case 256: for(int i=0;i<itmax;i++) MDHKERN(256); break;
      case 128: for(int i=0;i<itmax;i++) MDHKERN(128); break;
      default:
      case  64: for(int i=0;i<itmax;i++) MDHKERN( 64); break;
    }
#else
    for(int i=0;i<itmax;i++) {
      mdh_scalar_opt<<<GSz, BSz, shared_size>>>(ax_d, ay_d, az_d, 
                                                charge_d, size_d,
                                                val_d, 
                                                gx_d, gy_d, gz_d, 
                                                pre1, xkappa, natom);
    }
#endif

    //Finish the calculation
    MDH_TRY(cudaGetLastError());
    MDH_TRY(cudaDeviceSynchronize());
    wkf_timer_stop(timer);
    // report average kernel runtime (nothing to average when itmax < 1)
    cu_enqueue = (itmax > 0) ? wkf_timer_time(timer) / ((double) itmax) : 0.0;

    wkf_timer_start(timer);
    // read the results back to the host
    MDH_TRY(cudaMemcpy(val, val_d, grid_buffer_size, cudaMemcpyDeviceToHost));
    MDH_TRY(cudaDeviceSynchronize());
    wkf_timer_stop(timer);
    cu_read = wkf_timer_time(timer);

    wkf_timer_stop(totaltimer);
    cu_total = wkf_timer_time(totaltimer);

    printf("Attach: %.2f Alloc: %1.12g Enqueue: %1.12g Read: %1.12g\n",
           cu_attach, cu_alloc, cu_enqueue, cu_read);
    print_total(val, ngrid);

    *runtotal = cu_total; // return total GPU runtime
    rc = 0;

cleanup:
    // release device memory; cudaFree() accepts pointers that are still NULL
    cudaFree(ax_d);
    cudaFree(ay_d);
    cudaFree(az_d);
    cudaFree(charge_d);
    cudaFree(size_d);

    cudaFree(gx_d);
    cudaFree(gy_d);
    cudaFree(gz_d);

    cudaFree(val_d);

    wkf_timer_destroy(timer);
    wkf_timer_destroy(totaltimer);

    return rc;
}

#undef MDH_TRY

/* Prints the command line help text to stdout. */
void usage() {
  static const char *const lines[] = {
    "command line parameters:\n",
    "Device selection: -cpu -clcpu -clgpu -claccel\n",
    "Optional test flags:\n",
    "  -itmax N         loop test N times\n",
    "  -wgsize          set workgroup size\n",
    "  -vecscan         loop over various floatX vector widths\n",
    "  -asyncwgcopy     use async_workgroup_copy() for shared mem I/O\n",
    "  -testsmemnobcast use ATI alternative shared mem access pattern\n"
  };
  const size_t count = sizeof(lines) / sizeof(lines[0]);

  for (size_t n = 0; n < count; n++)
    fputs(lines[n], stdout);
}

/*
 * Parses the command line and stores the results through the output
 * pointers; values for options that do not appear are left unchanged.
 * A summary of the resulting configuration is printed to stdout.
 *
 *   -itmax N / -wgsize N  store atoi(N); ignored when no argument follows.
 *   -cpu -clcpu -clgpu -claccel -vecscan -testsmemnobcast -asyncwgcopy
 *                         set the matching flag to 1.
 *   -all                  sets every flag except asyncwgcopy to 1.
 *
 * After -itmax consumes its value, that value is still compared against the
 * options checked later in the same pass (likewise -wgsize and -all), so an
 * option name appearing in a value position takes effect as well.
 */
void getargs(int argc, const char **argv, int *itmax, 
             int *runcpu, int *runclcpu, int *runclgpu, int *runclaccel,
             int *vecscan, int *testsmemnobcast, int *asyncwgcopy,
             int *wgsize) {
  struct toggle { const char *name; int *dest; };
  const struct toggle toggles[] = {
    { "-cpu",             runcpu },
    { "-clcpu",           runclcpu },
    { "-clgpu",           runclgpu },
    { "-claccel",         runclaccel },
    { "-vecscan",         vecscan },
    { "-testsmemnobcast", testsmemnobcast },
    { "-asyncwgcopy",     asyncwgcopy }
  };
  const int ntoggles = (int) (sizeof(toggles) / sizeof(toggles[0]));
  const char *on = "enabled";
  const char *off = "disabled";

  for (int pos = 0; pos < argc; pos++) {
    // option with a value: step onto the value and convert it
    if ((pos + 1) < argc && strcmp(argv[pos], "-itmax") == 0) {
      pos++;
      *itmax = atoi(argv[pos]);
    }

    // simple on/off switches
    for (int t = 0; t < ntoggles; t++) {
      if (strcmp(argv[pos], toggles[t].name) == 0)
        *toggles[t].dest = 1;
    }

    if ((pos + 1) < argc && strcmp(argv[pos], "-wgsize") == 0) {
      pos++;
      *wgsize = atoi(argv[pos]);
    }

    if (strcmp(argv[pos], "-all") == 0) {
      *runcpu = 1;
      *runclcpu = 1;
      *runclgpu = 1;
      *runclaccel = 1;
      *vecscan = 1;
      *testsmemnobcast = 1;
    }
  }

  printf("Running tests on:\n");
  printf("  CPU: %s\n", (*runcpu) ? on : off);
  printf("  CUDA GPU: %s\n", (*runclgpu) ? on : off);
  printf("\n");
  printf("Run parameters:\n");
  printf("  kernel loop count: %d\n", *itmax);
  printf("     workgroup size: %d\n", *wgsize);
  printf("   vector type scan: %s\n", (*vecscan) ? on : off);
  printf("  test smem nobcast: %s\n", (*testsmemnobcast) ? on : off);
  printf("      async wg copy: %s\n", (*asyncwgcopy) ? on : off);
}


/*
 * Benchmark driver: parses the command line, generates random atom and grid
 * data, optionally runs the CPU reference (-cpu) and then runs the CUDA
 * kernel through exec_kernel(), printing timings for each.
 *
 * Returns 0 on success and -1 when no device type is selected, a host
 * allocation fails, a CUDA error is pending, or the GPU run fails.
 */
int main(int argc, const char **argv) {
    double runtotal = 0.0;   // only written by exec_kernel() on success
    int status = 0;
    int naadj = 0;
    int itmax = 1;
    int runcpu = 0;
    int runclcpu = 0;
    int runclgpu = 1;
    int runclaccel = 0;
    int vecscan = 0;
    int testsmemnobcast = 0;
    int asyncwgcopy = 0;
    int wgsize = 64;

    getargs(argc, argv, &itmax, 
            &runcpu, &runclcpu, &runclgpu, &runclaccel,
            &vecscan, &testsmemnobcast, &asyncwgcopy,
            &wgsize);

    if (!(runcpu || runclcpu || runclgpu || runclaccel)) {
      usage();

      printf("\nNo device type selected!  Exiting\n");
      return -1;
    }

    wkf_timerhandle timer = wkf_timer_create();
 
    int natom = 5877;
    int ngrid = 134918;
    // pad the grid size up to a multiple of 512 so that every supported
    // workgroup size divides it evenly
    int ngadj = ngrid + (512 - (ngrid & 511));
    
    float pre1 = 4.46184985145e19;
    float xkappa = 0.0735516324639;
    
    float *ax = (float*)calloc(natom, sizeof(float));
    float *ay = (float*)calloc(natom, sizeof(float));
    float *az = (float*)calloc(natom, sizeof(float));
    float *charge = (float*)calloc(natom, sizeof(float));
    float *size = (float*)calloc(natom, sizeof(float));

    float *gx = (float*)calloc(ngadj, sizeof(float));
    float *gy = (float*)calloc(ngadj, sizeof(float));
    float *gz = (float*)calloc(ngadj, sizeof(float));
    
    float *val1 = (float*)calloc(ngadj, sizeof(float));
    float *val2 = (float*)calloc(ngadj, sizeof(float));

    if (!ax || !ay || !az || !charge || !size ||
        !gx || !gy || !gz || !val1 || !val2) {
      printf("Failed to allocate host buffers!  Exiting\n");
      status = -1;
    } else {
      gendata(ax, ay, az, gx, gy, gz, charge, size, natom, ngrid);
   
      if (runcpu) { 
        wkf_timer_start(timer);
        cpu_scalar_opt(ngadj, natom, ax, ay, az, gx, gy, gz, charge, size, xkappa, pre1, val1);
        wkf_timer_stop(timer);

        SEP;
        print_total(val1, ngrid);
        printf("CPU Loop: %1.12g\n", wkf_timer_time(timer));
        SEP;
      }

      SEP;
      CUERR;
      if (exec_kernel(wgsize, 1, ngrid, natom, ngadj, naadj, 
                      ax, ay, az, gx, gy, gz, 
                      charge, size, xkappa, pre1, val2, itmax, 
                      &runtotal) != 0) {
        // runtotal was not produced; report the failure instead of a bogus time
        printf("GPU run failed\n");
        status = -1;
      } else {
        printf("GPU Total - Scalar Unoptimized: %1.12g\n", runtotal);
      }
      SEP;
    }
      
    // free(NULL) is a no-op, so this is safe after a partial allocation
    free(ax);
    free(ay);
    free(az);
    free(charge);
    free(size);
    
    free(gx);
    free(gy);
    free(gz);
    
    free(val1);
    free(val2);

    wkf_timer_destroy(timer);

#if defined(_MSC_VER)
    // wait for keypress before exiting
    printf("Press any key to exit test...\n");
    (void) getc(stdin);
#endif
    
    return status;
}



















