/***************************************************************************
 *cr
 *cr            (C) Copyright 2007 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/*
 * CUDA accelerated coulombic potential grid test code
 *   John E. Stone <johns@ks.uiuc.edu>
 *   http://www.ks.uiuc.edu/~johns/
 *
 * Coulombic potential grid calculation microbenchmark based on the time
 * consuming portions of the 'cionize' ion placement tool.
 *
 * This version of the code uses a 2-D texture to store all of the atoms.
 *
 * Benchmark for this version:  90 GFLOPS, 9.0 billion atom evals/sec
 *   (Test system: GeForce 8800GTX)
 */

#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>

#include "util.h"

// CUERR: error-checking helper placed after CUDA calls.  It queries
// cudaGetLastError() and, if an error is pending, prints the error string
// together with the current source line and returns -1 from the enclosing
// function.  Because it contains a "return -1", it can only be used inside
// functions that return int.  The macro expands to a brace-enclosed block,
// which is why the call sites in this file are written without a trailing
// semicolon.  Switch the "#if 1" below to "#if 0" to compile the checks out.
#if 1
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
  return -1; }}
#else
#define CUERR
#endif

// Width, in texels, of one row of the 2-D atom texture.  Each texel holds
// one atom (a float4), so atom i is stored at column (i % TEXROWSIZE),
// row (i / TEXROWSIZE).
#define TEXROWSIZE 16384

// declare texture reference for the 2-D float4 texture
texture<float4, 2, cudaReadModeElementType> tex;

// Coulombic potential kernel.
//
// Every thread evaluates exactly one point of a 2-D slice of the potential
// grid: it sums charge / distance over all atoms and writes the total to
// the output array.
//
//   numatoms    - number of atoms stored in the texture bound to 'tex'
//   gridspacing - distance between neighbouring grid points
//   z           - index of the slice being computed (scaled by gridspacing)
//   energygrid  - output array, one float per launched thread, laid out
//                 row-major with a row length of gridDim.x * blockDim.x
//
// There is no bounds check on the thread coordinates, so the launch
// configuration has to cover the output slice exactly.
__global__ static void cenergy(int numatoms, float gridspacing, int z, float * energygrid) {
  // 2-D coordinates of this thread within the whole launch
  const int col = threadIdx.x + (blockDim.x * blockIdx.x);
  const int row = threadIdx.y + (blockDim.y * blockIdx.y);

  // position of this thread's grid point in space
  const float px = gridspacing * col;
  const float py = gridspacing * row;
  const float pz = gridspacing * z;

  float potential = 0.0f;

  // Walk the atom texture row by row; 'fetched' counts the atoms consumed
  // so far, which lets the final (possibly partial) row stop early.
  int fetched = 0;
  int texrow  = 0;
  while (fetched < numatoms) {
    int texcol = 0;
    while (texcol < TEXROWSIZE && fetched < numatoms) {
      // xyz hold the atom position, w holds its charge
      const float4 atom = texfetch(tex, texcol, texrow);
      const float dx = px - atom.x;
      const float dy = py - atom.y;
      const float dz = pz - atom.z;

      // Keep the reciprocal square root spelled out as an explicit
      // 1.0f / sqrtf(): per the original author this form lets the compiler
      // recognize that rsqrt() is wanted, and other forms cost roughly 30%
      // in performance.
      const float rinv = 1.0f / sqrtf(dx*dx + dy*dy + dz*dz);
      potential += atom.w * rinv;

      texcol++;
      fetched++;
    }
    texrow++;
  }

  // linear output index: one full row of threads per grid row
  const int outaddr = (gridDim.x * blockDim.x * row) + col;
  energygrid[outaddr] = potential;
}


// Uploads the atom list to the GPU and binds it to the 2-D texture 'tex'.
//
//   atoms  - host array of count atoms, 4 floats each (x, y, z, charge)
//   count  - number of atoms in 'atoms'
//   darray - receives the newly allocated CUDA array; the caller releases
//            it with cudaFreeArray()
//
// Returns 0 on success, or -1 if the arguments are invalid or any CUDA call
// reports an error (the error is printed by CUERR).
int copyatomstotexture(float *atoms, int count, cudaArray **darray) {
  // reject arguments that cannot describe a valid atom list
  if (darray == NULL || count < 0 || (count > 0 && atoms == NULL))
    return -1;

  // debugging aid: show the channel layout of the texture reference
  printf("tex: %d %d %d %d %d\n", 
         tex.channelDesc.x,
         tex.channelDesc.y,
         tex.channelDesc.z,
         tex.channelDesc.w,
         tex.channelDesc.f);

  // Allocate a 2-D array TEXROWSIZE texels wide with enough rows to hold
  // every atom (the "+ 2" covers the partial last row plus one spare row).
  cudaMallocArray(darray, &tex.channelDesc, TEXROWSIZE, (count / TEXROWSIZE) + 2);
  CUERR // check and clear any existing errors

  // Copy the atoms as one linear run starting at texel (0,0).  The kernel
  // reads atom i at column (i % TEXROWSIZE), row (i / TEXROWSIZE), so this
  // relies on the copy filling the array rows consecutively.
  cudaMemcpyToArray(*darray, 0, 0, atoms, count*4*sizeof(float), cudaMemcpyHostToDevice);
  CUERR // check and clear any existing errors

  // set texture parameters
  tex.addressMode[0] = cudaAddressModeClamp;
  tex.addressMode[1] = cudaAddressModeClamp;
  tex.filterMode = cudaFilterModePoint;
  tex.normalized = false; // do not normalize coordinates

  // Bind the array to the texture
  cudaBindTexture(tex, *darray);
  CUERR // a failed bind would leave the kernel reading an unbound texture

  return 0;
}


// Allocates and fills a host array of randomly placed, randomly charged
// atoms.
//
//   atombuf     - receives the malloc()'d array of count * 4 floats, stored
//                 as (x, y, z, charge) per atom; the caller frees it
//   count       - number of atoms to generate
//   volsize     - grid dimensions in voxels
//   gridspacing - voxel edge length
//
// Positions are uniform over [0, gridspacing * volsize] on each axis and
// charges are uniform over [-1, 1].  Returns 0 on success, or -1 if the
// arguments are invalid or the allocation fails.
int initatoms(float **atombuf, int count, dim3 volsize, float gridspacing) {
  float extentx, extenty, extentz;
  float *atoms;
  int i;

  if (atombuf == NULL || count < 0)
    return -1;

  // size the buffer in size_t so large atom counts cannot overflow int
  atoms = (float *) malloc((size_t) count * 4 * sizeof(float));
  *atombuf = atoms;
  if (atoms == NULL && count > 0)
    return -1;

  // Compute grid dimensions in angstroms.  These are kept in floating
  // point: storing them in an integer dim3 truncated the fractional part
  // (e.g. 0.1 * 1 became 0, collapsing every atom onto the z = 0 plane).
  extentx = gridspacing * volsize.x;
  extenty = gridspacing * volsize.y;
  extentz = gridspacing * volsize.z;

  for (i=0; i<count; i++) {
    int addr = i * 4;
    atoms[addr    ] = (rand() / (float) RAND_MAX) * extentx; 
    atoms[addr + 1] = (rand() / (float) RAND_MAX) * extenty; 
    atoms[addr + 2] = (rand() / (float) RAND_MAX) * extentz; 
    atoms[addr + 3] = ((rand() / (float) RAND_MAX) * 2.0) - 1.0;  // charge
  }  

  return 0;
}


// Benchmark driver: generates random atoms, uploads them to a texture, runs
// the cenergy kernel over a single z = 0 slice of the potential grid,
// reports timings and throughput, and copies the result back to the host.
// Returns 0 on success and -1 if any step fails.
int main(int argc, char** argv) {
  cudaArray *datoms = NULL;
  float *doutput = NULL;
  float *energy = NULL;
  float *atoms = NULL;
  dim3 volsize, Gsz, Bsz;
  rt_timerhandle runtimer, mastertimer, copytimer;
  float copytotal, runtotal, mastertotal;

  // number of atoms to simulate
  int atomcount = 100000;

  // setup energy grid size
  volsize.x = 512;
  volsize.y = 512;
  volsize.z = 1;

  // set voxel spacing
  float gridspacing = 0.1f;

  // Setup CUDA grid and block sizes.  The kernel has no bounds check, so
  // the volume dimensions must be exact multiples of the block dimensions.
  Bsz.x = 16;
  Bsz.y = 16;
  Bsz.z = 1;
  Gsz.x = volsize.x / Bsz.x; 
  Gsz.y = volsize.y / Bsz.y; 
  Gsz.z = volsize.z / Bsz.z; 

  // initialize the wall clock timers
  runtimer = rt_timer_create();
  mastertimer = rt_timer_create();
  copytimer = rt_timer_create();
  copytotal = 0;
  runtotal = 0;

  // allocate and initialize atom coordinates and charges
  if (initatoms(&atoms, atomcount, volsize, gridspacing))
    return -1;

  // allocate and initialize the GPU output array
  int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
  cudaMalloc((void**)&doutput, volmemsz);
  CUERR // check and clear any existing errors
  cudaMemset(doutput, 0, volmemsz);
  CUERR // check and clear any existing errors


  printf("Grid size: %d x %d x %d\n", volsize.x, volsize.y, volsize.z);
  printf("Running kernel(atoms:%d, gridspacing %f, z %d)\n", atomcount, gridspacing, 0);

  rt_timer_start(mastertimer);

  // copy the atoms to the GPU
  rt_timer_start(copytimer);
  if (copyatomstotexture(atoms, atomcount, &datoms)) {
    free(atoms);
    cudaFree(doutput);
    return -1;
  }
  rt_timer_stop(copytimer);
  copytotal += rt_timer_time(copytimer);
 
  // RUN the kernel...
  rt_timer_start(runtimer);
  cenergy<<<Gsz, Bsz, 0>>>(atomcount, gridspacing, 0, doutput);
  CUERR // check for launch errors

  // A kernel launch can return to the host before the kernel has finished,
  // so wait for completion here; otherwise the timers below would only
  // measure launch overhead.  (cudaThreadSynchronize() matches the CUDA
  // generation this file targets; cudaDeviceSynchronize() is its successor
  // in newer toolkits.)
  cudaThreadSynchronize();
  CUERR // check for errors raised while the kernel executed
  rt_timer_stop(runtimer);
  runtotal += rt_timer_time(runtimer);

  rt_timer_stop(mastertimer);
  mastertotal = rt_timer_time(mastertimer);

  printf("Copy time: %f seconds\n", copytotal);
  printf("Kernel time: %f seconds\n", runtotal);
  printf("Total time: %f seconds\n", mastertotal);

  // throughput is based on the total time, i.e. atom upload plus kernel
  double atomevalssec = ((double) volsize.x * volsize.y * volsize.z * atomcount)/ (mastertotal * 1000000000.0);
  printf("Efficiency metric, %g billion atom evals per second\n", atomevalssec);

  /* 10 FLOPS per atom eval */
  printf("FP performance: %g GFLOPS\n", atomevalssec * 10.0);

  // Copy the GPU output data back to the host and use/store it..
  energy = (float *) malloc(volmemsz);
  if (energy == NULL) {
    printf("Failed to allocate host memory for the energy grid\n");
    free(atoms);
    cudaFree(doutput);
    cudaFreeArray(datoms);
    return -1;
  }
  cudaMemcpy(energy, doutput, volmemsz,  cudaMemcpyDeviceToHost);
  CUERR // check and clear any existing errors

#if 0
  // debugging aid: dump the top-left 16x16 corner of the computed slice
  int x, y;
  for (y=0; y<16; y++) {
    for (x=0; x<16; x++) {
      int addr = y * volsize.x + x;
      printf("out[%d]: %f\n", addr, energy[addr]);
    }
  }
#endif

  // release device resources
  cudaFree(doutput);
  cudaFreeArray(datoms);
  CUERR // check and clear any existing errors

  // release host buffers
  free(energy);
  free(atoms);

  printf("Done\n");
  return 0;
}
