/***************************************************************************
 *cr
 *cr            (C) Copyright 2007 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/*
 * CUDA accelerated coulombic potential grid test code
 *   John E. Stone <johns@ks.uiuc.edu>
 *   http://www.ks.uiuc.edu/~johns/
 *
 * Coulombic potential grid calculation microbenchmark based on the time
 * consuming portions of the 'cionize' ion placement tool.
 *
 * This version of the code uses the 64KB constant buffer area reloaded 
 * for each group of MAXATOMS atoms, until the contributions from all 
 * atoms have been summed into the potential grid.
 * This version uses Kahan's compensated summation method to 
 * increase floating point accuracy for the large number of summed
 * potential values.
 *
 * Benchmark for this version: 138 GFLOPS, 20.5 billion atom evals/sec
 *   (Test system: GeForce 8800GTX)
 */

#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include "util.h"

// CUERR: poll the CUDA runtime for a pending error via cudaGetLastError()
// (which also resets the error state).  If an error is pending, print its
// description together with the source line of the CUERR use and make the
// *enclosing function* return -1.
// Notes:
//  - Because of the embedded 'return -1', only use this inside functions
//    that return int.
//  - The macro expands to a complete braced block; the call sites in this
//    file write it without a trailing semicolon.
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
  return -1; }}

// max constant buffer size is 64KB, minus whatever
// the CUDA runtime and compiler are using that we don't know about
// At 16 bytes per atom (one float4), 4070 atoms (65,120 of the 65,536
// bytes) is about the max we can store in the constant buffer for this
// program.
//
// Layout of each atominfo[] entry as written by copyatomstoconstbuf():
//   .x, .y = atom x and y coordinates
//   .z     = precomputed (zplane - atom z)^2, NOT the raw z coordinate
//   .w     = atom charge
#define MAXATOMS 4070
__constant__ float4 atominfo[MAXATOMS];


// This kernel calculates coulombic potential at each grid point and
// stores the results in the output array.
// Note: this implementation uses precomputed and unrolled
// loops of dy*dy + dz*dz values for increased FP arithmetic intensity
// per FP load.  The X coordinate portion of the loop is unrolled by 
// four, allowing the same dy^2 + dz^2 values to be reused four times,
// increasing the ratio of FP arithmetic relative to FP loads, and 
// eliminating some redundant calculations.
// This version implements Kahan's compensated summation method to 
// increase floating point accuracy for the large number of summed
// potential values.
//
// Parameters:
//   numatoms    - number of leading entries of atominfo[] to sum over
//   gridspacing - distance between adjacent grid points; multiplied by the
//                 integer grid indices to get point coordinates
//   energygrid  - output buffer; the kernel reads the existing values and
//                 writes back (existing + newly summed) values
//
// Launch layout (follows from the index math below):
//   - 2-D grid of 2-D blocks; thread (x, y) handles UNROLLX consecutive
//     x grid points on row y.
//   - A row of energygrid is gridDim.x * blockDim.x * UNROLLX floats wide.
//   - There are NO bounds checks: the launch must tile the buffer exactly,
//     otherwise threads read and write outside energygrid.
//
// NVCC -cubin says this implementation uses 24 regs, 28 smem
// Profiler output says this code gets 50% warp occupancy
//
// Best benchmark to date: 138 GFLOPS
#define UNROLLX 4
__global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
  // first of the UNROLLX x-indices owned by this thread, and its row index
  unsigned int xindex  = (__umul24(blockIdx.x, blockDim.x) + threadIdx.x) * UNROLLX;
  unsigned int yindex  = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
  // linear offset: (row width in floats) * row + column
  unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex + xindex;

  // query current energy value in the grid, start the read early
  // so the fetch occurs while we're summing the new energy values
  float curenergyx1 = energygrid[outaddr    ];
  float curenergyx2 = energygrid[outaddr + 1];
  float curenergyx3 = energygrid[outaddr + 2];
  float curenergyx4 = energygrid[outaddr + 3];

  // coordinates of this thread's grid points (one y, four x values)
  float coory = gridspacing * yindex;

  float coorx1 = gridspacing * (xindex    );
  float coorx2 = gridspacing * (xindex + 1);
  float coorx3 = gridspacing * (xindex + 2);
  float coorx4 = gridspacing * (xindex + 3);

  // running sums of the potential at the four grid points
  float energyvalx1=0.0f;
  float energyvalx2=0.0f;
  float energyvalx3=0.0f;
  float energyvalx4=0.0f;

  // compensated summation roundoff storage
  float energycomp1=0.0f;
  float energycomp2=0.0f;
  float energycomp3=0.0f;
  float energycomp4=0.0f;

  // Atom loop, per iteration: 1 atom applied to 4 grid points,
  //            27 floating point ops, 4 FP loads
  //            27/4 = 6.75 floating point ops per grid point evaluation
  //
  // Note: this implementation uses precomputed and unrolled
  // loops of dy*dy + dz*dz values for increased FP arithmetic intensity
  // per FP load.
  // XXX explicitly dividing 1.0f / sqrt() helps the compiler clue in
  //     that we really wanted rsqrt() to begin with, it drops 30% 
  //     performance otherwise.
  int atomid;
  for (atomid=0; atomid<numatoms; atomid++) {
    float dy = coory - atominfo[atomid].y;
    // atominfo[].z already holds dz*dz (see copyatomstoconstbuf), so this
    // is dy^2 + dz^2, shared by all four x positions
    float dysqpdzsq = (dy * dy) + atominfo[atomid].z;

    float dx1 = coorx1 - atominfo[atomid].x;
    float dx2 = coorx2 - atominfo[atomid].x;
    float dx3 = coorx3 - atominfo[atomid].x;
    float dx4 = coorx4 - atominfo[atomid].x;

    // Kahan step, repeated for each of the four grid points:
    //   s = charge / distance       (this atom's contribution)
    //   y = s minus the roundoff carried from the previous step
    //   t = new running sum
    //   energycomp = (t - old sum) - y, i.e. what was lost forming t
    // The statement order matters; do not algebraically simplify.
    float s, y, t;
    s = atominfo[atomid].w * (1.0f / sqrtf(dx1*dx1 + dysqpdzsq));
    y = s - energycomp1;
    t = energyvalx1 + y;
    energycomp1 = (t - energyvalx1)  - y;
    energyvalx1 = t;

    s = atominfo[atomid].w * (1.0f / sqrtf(dx2*dx2 + dysqpdzsq));
    y = s - energycomp2;
    t = energyvalx2 + y;
    energycomp2 = (t - energyvalx2)  - y;
    energyvalx2 = t;

    s = atominfo[atomid].w * (1.0f / sqrtf(dx3*dx3 + dysqpdzsq));
    y = s - energycomp3;
    t = energyvalx3 + y;
    energycomp3 = (t - energyvalx3)  - y;
    energyvalx3 = t;

    s = atominfo[atomid].w * (1.0f / sqrtf(dx4*dx4 + dysqpdzsq));
    y = s - energycomp4;
    t = energyvalx4 + y;
    energycomp4 = (t - energyvalx4)  - y;
    energyvalx4 = t;
  }

  // accumulate energy value with the existing value
  // (this final add is a plain float add; the compensation terms only
  // cover the per-launch sums above)
  energygrid[outaddr    ] = curenergyx1 + energyvalx1;
  energygrid[outaddr + 1] = curenergyx2 + energyvalx2;
  energygrid[outaddr + 2] = curenergyx3 + energyvalx3;
  energygrid[outaddr + 3] = curenergyx4 + energyvalx4;
}


// Stage one batch of atoms into the atominfo[] constant buffer.
//
//   atoms  - host array of 'count' atoms, 4 floats each: x, y, z, charge
//   count  - number of atoms in the batch; must not exceed MAXATOMS
//   zplane - z coordinate of the grid slice being computed
//
// x, y and charge are passed through unchanged; the z slot is replaced by
// (zplane - z)^2 so the kernel does not recompute it for every grid point.
// Returns 0 on success, -1 if the batch is too large or the CUDA runtime
// reports an error.
int copyatomstoconstbuf(float *atoms, int count, float zplane) {
  // the batch has to fit in the constant buffer
  if (count > MAXATOMS) {
    printf("Atom count exceeds constant buffer storage capacity\n");
    return -1;
  }

  // host-side staging copy, one 4-float record per atom
  float atompre[4*MAXATOMS];
  const float *src = atoms;
  float *dst = atompre;
  for (int n = 0; n < count; n++, src += 4, dst += 4) {
    const float dz = zplane - src[2];
    dst[0] = src[0];   // x
    dst[1] = src[1];   // y
    dst[2] = dz*dz;    // squared z distance to the slice
    dst[3] = src[3];   // charge
  }

  // upload the staged records to the start of atominfo[]
  cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);
  CUERR // check and clear any existing errors

  return 0;
}


// Allocate and randomly initialize 'count' atoms.
//
//   atombuf     - receives the malloc()'d array (4 floats per atom:
//                 x, y, z, charge); the caller owns it and must free() it.
//                 Set to NULL on failure.
//   count       - number of atoms to generate; must be >= 0
//   volsize     - grid dimensions in voxels
//   gridspacing - voxel edge length, used to scale volsize to coordinates
//
// Coordinates are drawn with rand() from [0, gridspacing * volsize] along
// each axis, charges from [-1, 1].
// Returns 0 on success, -1 on invalid arguments or allocation failure.
//
// NOTE: the volume extents are kept as floats.  Storing them in a dim3
// (unsigned int) truncated them, e.g. a z extent of 0.1 became 0 and every
// atom landed exactly on z = 0.  Atom positions therefore differ from
// those produced by code that still truncates.
int initatoms(float **atombuf, int count, dim3 volsize, float gridspacing) {
  int i;
  float *atoms;

  if (atombuf == NULL)
    return -1;
  *atombuf = NULL;

  if (count < 0) {
    printf("Invalid atom count %d\n", count);
    return -1;
  }

  atoms = (float *) malloc((size_t) count * 4 * sizeof(float));
  if (atoms == NULL && count > 0) {
    printf("Failed to allocate memory for %d atoms\n", count);
    return -1;
  }
  *atombuf = atoms;

  // compute grid dimensions in angstroms
  const float sizex = gridspacing * volsize.x;
  const float sizey = gridspacing * volsize.y;
  const float sizez = gridspacing * volsize.z;

  for (i=0; i<count; i++) {
    int addr = i * 4;
    atoms[addr    ] = (rand() / (float) RAND_MAX) * sizex;
    atoms[addr + 1] = (rand() / (float) RAND_MAX) * sizey;
    atoms[addr + 2] = (rand() / (float) RAND_MAX) * sizez;
    atoms[addr + 3] = ((rand() / (float) RAND_MAX) * 2.0) - 1.0;  // charge
  }

  return 0;
}


// Benchmark driver.
//
// Usage: <program> [device-index]
//   The optional argument selects the CUDA device; a missing, unparsable
//   or out-of-range value falls back to device 0.
//
// Steps:
//   1. list the CUDA devices and open the selected one
//   2. generate random atoms and allocate a zeroed potential grid on the GPU
//   3. for each batch of at most MAXATOMS atoms: upload the batch to
//      constant memory and launch cenergy to accumulate it into the grid
//   4. copy the grid back to the host and print timing/throughput numbers
//
// Returns 0 on success, -1 on any setup, allocation or CUDA error.
int main(int argc, char** argv) {
  float *doutput = NULL;
  float *energy = NULL;
  float *atoms = NULL;
  dim3 volsize, Gsz, Bsz;
  rt_timerhandle runtimer, mastertimer, copytimer, hostcopytimer;
  float copytotal, runtotal, mastertotal, hostcopytotal;
  const char *statestr = "|/-\\.";
  int state=0;

  printf("CUDA accelerated coulombic potential microbenchmark V2.0\n");
  printf("John E. Stone <johns@ks.uiuc.edu>\n");
  printf("http://www.ks.uiuc.edu/~johns/\n");
  printf("--------------------------------------------------------\n");
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  printf("Detected %d CUDA accelerators:\n", deviceCount);
  int dev;
  for (dev=0; dev < deviceCount; dev++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    // totalGlobalMem is the cudaDeviceProp field holding the device memory
    // size in bytes (size_t); cast the MB count to match the %d format
    printf("  CUDA device[%d]: '%s'  Mem: %dMB  Rev: %d.%d\n", 
           dev, deviceProp.name,
           (int) (deviceProp.totalGlobalMem / (1024*1024)), 
           deviceProp.major, deviceProp.minor);
  }

  int cudadev = 0;
  if (argc == 2) {
    // fall back to device 0 on a parse failure or an out-of-range index
    if (sscanf(argv[1], "%d", &cudadev) != 1 ||
        cudadev < 0 || cudadev >= deviceCount) {
      cudadev = 0; 
    }    
  }
  printf("  Single-threaded single-GPU test run.\n");
  printf("  Opening CUDA device %d...\n", cudadev);
  cudaSetDevice(cudadev);
  CUERR // check and clear any existing errors

  // number of atoms to simulate
  int atomcount = 100000;

  // setup energy grid size
  // XXX this is a large test case to clearly illustrate that even while
  //     the CUDA kernel is running entirely on the GPU, the CUDA runtime
  //     library is soaking up the entire host CPU for some reason.
  volsize.x = 2048;
  volsize.y = 2048;
  volsize.z = 1;

  // set voxel spacing
  float gridspacing = 0.1f;

  // setup CUDA grid and block sizes
  // XXX we have to make a trade-off between the number of threads per
  //     block and the resulting padding size we'll end up with since
  //     each thread will do 4 consecutive grid cells in this version,
  //     we're using up some of our available parallelism to reduce overhead.
  Bsz.x =  4;                            // each thread does multiple Xs
  Bsz.y = 16;
  Bsz.z = 1;
  Gsz.x = volsize.x / (Bsz.x * UNROLLX); // each thread does multiple Xs
  Gsz.y = volsize.y / Bsz.y; 
  Gsz.z = volsize.z / Bsz.z; 

  // The kernel does no bounds checking and derives its row pitch from the
  // launch dimensions, so the launch must tile the volume exactly.
  if (volsize.x % (Bsz.x * UNROLLX) != 0 || volsize.y % Bsz.y != 0) {
    printf("Grid size must be a multiple of %d x %d\n",
           (int) (Bsz.x * UNROLLX), (int) Bsz.y);
    return -1;
  }

  // initialize the wall clock timers
  runtimer = rt_timer_create();
  mastertimer = rt_timer_create();
  copytimer = rt_timer_create();
  hostcopytimer = rt_timer_create();
  copytotal = 0;
  runtotal = 0;
  hostcopytotal = 0;

  printf("Grid size: %d x %d x %d\n", volsize.x, volsize.y, volsize.z);
  printf("Running kernel(atoms:%d, gridspacing %g, z %d)\n", atomcount, gridspacing, 0);

  // allocate and initialize atom coordinates and charges
  if (initatoms(&atoms, atomcount, volsize, gridspacing))
    return -1;

  // allocate and initialize the GPU output array
  int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
  printf("Allocating %.2fMB of memory for output buffer...\n", volmemsz / (1024.0 * 1024.0));

  cudaMalloc((void**)&doutput, volmemsz);
  CUERR // check and clear any existing errors
  cudaMemset(doutput, 0, volmemsz);
  CUERR // check and clear any existing errors

  rt_timer_start(mastertimer);

  int iterations=0;
  int atomstart;
  for (atomstart=0; atomstart<atomcount; atomstart+=MAXATOMS) {   
    iterations++;
    int runatoms;
    int atomsremaining = atomcount - atomstart;
    if (atomsremaining > MAXATOMS)
      runatoms = MAXATOMS;
    else
      runatoms = atomsremaining;

    // progress spinner
    printf("%c\r", statestr[state]);
    fflush(stdout);
    state = (state+1) & 3;
//  printf("  Kernel setup: atomstart %d runcount: %d\n", atomstart, runatoms);

    // copy the atoms to the GPU
    rt_timer_start(copytimer);
    if (copyatomstoconstbuf(atoms + 4*atomstart, runatoms, 0*gridspacing)) 
      return -1;
    rt_timer_stop(copytimer);
    copytotal += rt_timer_time(copytimer);
 
    // RUN the kernel...
    rt_timer_start(runtimer);
    // pass the same grid spacing that was used to place the atoms
    cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, doutput);
    CUERR // check and clear any existing errors (launch configuration)
    // Kernel launches return before the kernel has run.  Wait for it here
    // so the kernel timer measures execution rather than launch overhead,
    // and so the last batch is included in the total time.
    // (On toolkits older than CUDA 4.0 this call is cudaThreadSynchronize.)
    cudaDeviceSynchronize();
    CUERR // check and clear any existing errors (kernel execution)
    rt_timer_stop(runtimer);
    runtotal += rt_timer_time(runtimer);
  }
  printf("Done\n");

  rt_timer_stop(mastertimer);
  mastertotal = rt_timer_time(mastertimer);

  // Copy the GPU output data back to the host and use/store it..
  energy = (float *) malloc(volmemsz);
  if (energy == NULL) {
    printf("Failed to allocate host memory for the energy grid\n");
    cudaFree(doutput);
    free(atoms);
    return -1;
  }
  rt_timer_start(hostcopytimer);
  cudaMemcpy(energy, doutput, volmemsz,  cudaMemcpyDeviceToHost);
  CUERR // check and clear any existing errors
  rt_timer_stop(hostcopytimer);
  hostcopytotal=rt_timer_time(hostcopytimer);

#if 0
  int x, y;
  for (y=0; y<16; y++) {
    for (x=0; x<16; x++) {
      int addr = y * volsize.x + x;
      printf("out[%d]: %f\n", addr, energy[addr]);
    }
  }
#endif

  printf("Final calculation required %d iterations of %d atoms\n", iterations, MAXATOMS);
  printf("Copy time: %f seconds, %f per iteration\n", copytotal, copytotal / (float) iterations);
  printf("Kernel time: %f seconds, %f per iteration\n", runtotal, runtotal / (float) iterations);
  printf("Total time: %f seconds\n", mastertotal);
  printf("Kernel invocation rate: %f iterations per second\n", iterations / mastertotal);
  printf("GPU to host copy bandwidth: %gMB/sec, %f seconds total\n",
         (volmemsz / (1024.0 * 1024.0)) / hostcopytotal, hostcopytotal);

  // every grid point evaluates every atom once
  double atomevalssec = ((double) volsize.x * volsize.y * volsize.z * atomcount) / (mastertotal * 1000000000.0);
  printf("Efficiency metric, %g billion atom evals per second\n", atomevalssec);

  /* 27/4 FLOPS per atom eval */
  printf("FP performance: %g GFLOPS\n", atomevalssec * (27.0/4.0));
  printf(" (1xADD=1 + 5xSUB=5 + 1xMUL=1 + 8xMADD=16 + 4xRSQRT=4 = 27 per iteration)\n");
  free(atoms);
  free(energy);
  cudaFree(doutput);

  return 0;
}




