Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDAVolCPotential.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2007-2008 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: CUDAVolCPotential.cu,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.26 $      $Date: 2008/03/27 19:37:42 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   CUDA accelerated coulombic potential grid calculation
00019  *
00020  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025 
00026 #include "utilities.h"
00027 #include "VMDThreads.h"
00028 #include "CUDAKernels.h" 
00029 
00030 typedef struct {
00031   int threadid;
00032   int threadcount;
00033   float* atoms;
00034   float* grideners;
00035   long int numplane;
00036   long int numcol;
00037   long int numpt;
00038   long int natoms;
00039   float gridspacing;
00040 } enthrparms;
00041 
00042 /* thread prototype */
00043 static void * cudaenergythread(void *);
00044 
00045 #if 1
00046 #define CUERR { cudaError_t err; \
00047   if ((err = cudaGetLastError()) != cudaSuccess) { \
00048   printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
00049   printf("Thread aborting...\n"); \
00050   return NULL; }}
00051 #else
00052 #define CUERR
00053 #endif
00054 
00055 // max constant buffer size is 64KB, minus whatever
00056 // the CUDA runtime and compiler are using that we don't know about
00057 // At 16 bytes/atom, 4000 atoms is about the max we can store in
00058 // the constant buffer.
00059 #define MAXATOMS 4000
00060 __constant__ static float4 atominfo[MAXATOMS];
00061 
00062 
00063 // 
00064 // The CUDA kernels calculate coulombic potential at each grid point and
00065 // store the results in the output array.
00066 //
00067 // These versions of the code use the 64KB constant buffer area reloaded
00068 // for each group of MAXATOMS atoms, until the contributions from all
00069 // atoms have been summed into the potential grid.
00070 //
00071 // These implementations use precomputed and unrolled loops of 
00072 // (dy^2 + dz^2) values for increased FP arithmetic intensity.
00073 // The X coordinate portion of the loop is unrolled by four or eight,
00074 // allowing the same dy^2 + dz^2 values to be reused multiple times,
00075 // increasing the ratio of FP arithmetic relative to FP loads, and
00076 // eliminating some redundant calculations.
00077 //
00078 
00079 //
00080 // Tuned global memory coalescing version, unrolled in X
00081 //
00082 // Benchmark for this version: 291 GFLOPS, 39.5 billion atom evals/sec
00083 //  (Test system: GeForce 8800GTX)
00084 // 
00085 
00086 #if 1
00087 
00088 //
00089 // Tunings for large potential map dimensions (e.g. 384x384x...)
00090 //
00091 // NVCC -cubin says this implementation uses 20 regs
00092 //
00093 #define UNROLLX     8
00094 #define UNROLLY     1
00095 #define BLOCKSIZEX  8  // make large enough to allow coalesced global mem ops
00096 #define BLOCKSIZEY  8  // make as small as possible for finer granularity
00097 
00098 #else
00099 
00100 //
00101 // Tunings for small potential map dimensions (e.g. 128x128x...)
00102 //
00103 // NVCC -cubin says this implementation uses 16 regs
00104 //
00105 #define UNROLLX     4
00106 #define UNROLLY     1
00107 #define BLOCKSIZEX 16  // make large enough to allow coalesced global mem ops
00108 #define BLOCKSIZEY 16  // make as small as possible for finer granularity
00109 
00110 #endif
00111 
00112 #define BLOCKSIZE  (BLOCKSIZEX*BLOCKSIZEY)
00113 
00114 // FLOP counting
00115 #if UNROLLX == 8
00116 #define FLOPSPERATOMEVAL (59.0/8.0)
00117 #elif UNROLLX == 4 
00118 #define FLOPSPERATOMEVAL (31.0/4.0)
00119 #endif
00120 
00121 __global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
00122   unsigned int xindex  = __umul24(blockIdx.x, blockDim.x) * UNROLLX
00123                          + threadIdx.x;
00124   unsigned int yindex  = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00125   unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex
00126                          + xindex;
00127 
00128   float coory = gridspacing * yindex;
00129   float coorx = gridspacing * xindex;
00130 
00131   float energyvalx1=0.0f;
00132   float energyvalx2=0.0f;
00133   float energyvalx3=0.0f;
00134   float energyvalx4=0.0f;
00135 #if UNROLLX > 4
00136   float energyvalx5=0.0f;
00137   float energyvalx6=0.0f;
00138   float energyvalx7=0.0f;
00139   float energyvalx8=0.0f;
00140 #endif
00141 
00142   float gridspacing_coalesce = gridspacing * BLOCKSIZEX;
00143 
00144   // NOTE: One beneficial side effect of summing groups of 4,000 atoms
00145   // at a time in multiple passes is that these smaller summation 
00146   // groupings are likely to result in higher precision, since structures
00147   // are typically stored with atoms in chain order leading to high 
00148   // spatial locality within the groups of 4,000 atoms.  Kernels that 
00149   // sum these subgroups independently of the global sum for each voxel
00150   // should achieve relatively good floating point precision since large values
00151   // will tend to be summed with large values, and small values summed with
00152   // small values, etc. 
00153   int atomid;
00154   for (atomid=0; atomid<numatoms; atomid++) {
00155     float dy = coory - atominfo[atomid].y;
00156     float dyz2 = (dy * dy) + atominfo[atomid].z;
00157 
00158     float dx1 = coorx - atominfo[atomid].x;
00159     float dx2 = dx1 + gridspacing_coalesce;
00160     float dx3 = dx2 + gridspacing_coalesce;
00161     float dx4 = dx3 + gridspacing_coalesce;
00162 #if UNROLLX > 4
00163     float dx5 = dx4 + gridspacing_coalesce;
00164     float dx6 = dx5 + gridspacing_coalesce;
00165     float dx7 = dx6 + gridspacing_coalesce;
00166     float dx8 = dx7 + gridspacing_coalesce;
00167 #endif
00168 
00169     energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
00170     energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
00171     energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
00172     energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
00173 #if UNROLLX > 4
00174     energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
00175     energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
00176     energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
00177     energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
00178 #endif
00179   }
00180 
00181   energygrid[outaddr             ] += energyvalx1;
00182   energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
00183   energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
00184   energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
00185 #if UNROLLX > 4
00186   energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
00187   energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
00188   energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
00189   energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
00190 #endif
00191 }
00192 
00193 
00194 // required GPU array padding to match thread block size
00195 // XXX note: this code requires block size dimensions to be a power of two
00196 #define TILESIZEX BLOCKSIZEX*UNROLLX
00197 #define TILESIZEY BLOCKSIZEY*UNROLLY
00198 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00199 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00200 
00201 int copyatomstoconstbuf(const float *atoms, float *atompre, 
00202                         int count, float zplane) {
00203   if (count > MAXATOMS) {
00204     printf("Atom count exceeds constant buffer storage capacity\n");
00205     return -1;
00206   }
00207 
00208   int i;
00209   for (i=0; i<count*4; i+=4) {
00210     atompre[i    ] = atoms[i    ];
00211     atompre[i + 1] = atoms[i + 1];
00212     float dz = zplane - atoms[i + 2];
00213     atompre[i + 2]  = dz*dz;
00214     atompre[i + 3] = atoms[i + 3];
00215   }
00216 
00217   cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);
00218   CUERR // check and clear any existing errors
00219 
00220   return 0;
00221 }
00222 
00223 int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
00224                             long int numplane, long int numcol, long int numpt, 
00225                             float gridspacing) {
00226   int i;
00227   enthrparms *parms;
00228   vmd_thread_t * threads;
00229   vmd_timerhandle globaltimer;
00230   double totalruntime;
00231 
00232 #if defined(VMDTHREADS)
00233   int numprocs = vmd_thread_numprocessors();
00234 #else
00235   int numprocs = 1;
00236 #endif
00237 
00238   int deviceCount = 0;
00239   if (vmd_cuda_num_devices(&deviceCount))
00240     return -1;
00241   if (deviceCount < 1)
00242     return -1;
00243 
00244   /* take the lesser of the number of CPUs and GPUs */
00245   /* and execute that many threads                  */
00246   if (deviceCount < numprocs) {
00247     numprocs = deviceCount;
00248   }
00249 
00250   printf("Using %d CUDA GPUs\n", numprocs);
00251 
00252   /* allocate array of threads */
00253   threads = (vmd_thread_t *) calloc(numprocs * sizeof(vmd_thread_t), 1);
00254 
00255   /* allocate and initialize array of thread parameters */
00256   parms = (enthrparms *) malloc(numprocs * sizeof(enthrparms));
00257   for (i=0; i<numprocs; i++) {
00258     parms[i].threadid = i;
00259     parms[i].threadcount = numprocs;
00260     parms[i].atoms = atoms;
00261     parms[i].grideners = grideners;
00262     parms[i].numplane = numplane;
00263     parms[i].numcol = numcol;
00264     parms[i].numpt = numpt;
00265     parms[i].natoms = natoms;
00266     parms[i].gridspacing = gridspacing;
00267   }
00268 
00269   printf("GPU padded grid size: %d x %d x %d\n", 
00270     (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00271     (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00272     numplane);
00273 
00274   globaltimer = vmd_timer_create();
00275   vmd_timer_start(globaltimer);
00276 
00277 #if defined(VMDTHREADS)
00278   /* spawn child threads to do the work */
00279   /* thread 0 must also be processed this way otherwise    */
00280   /* we'll permanently bind the main thread to some device */
00281   for (i=0; i<numprocs; i++) {
00282     vmd_thread_create(&threads[i], cudaenergythread, &parms[i]);
00283   }
00284 
00285   /* join the threads after work is done */
00286   for (i=0; i<numprocs; i++) {
00287     vmd_thread_join(threads[i], NULL);
00288   }
00289 #else
00290   /* single thread does all of the work */
00291   cudaenergythread((void *) &parms[0]);
00292 #endif
00293 
00294   // Measure GFLOPS
00295   vmd_timer_stop(globaltimer);
00296   totalruntime = vmd_timer_time(globaltimer);
00297   vmd_timer_destroy(globaltimer);
00298 
00299   double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00300   printf("  %g billion atom evals/second, %g GFLOPS\n",
00301            atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00302 
00303   /* free thread parms */
00304   free(parms);
00305   free(threads);
00306 
00307   return 0;
00308 }
00309 
00310 
00311 
00312 
00313 
00314 static void * cudaenergythread(void *voidparms) {
00315   dim3 volsize, Gsz, Bsz;
00316   float *devenergy = NULL;
00317   float *hostenergy = NULL;
00318   float *atomprebuf = NULL;
00319   int atomprebufpinned = 1; // try to use pinned/page-locked atom buffer
00320 
00321   enthrparms *parms = (enthrparms *) voidparms;
00322   /* 
00323    * copy in per-thread parameters 
00324    */
00325   const float *atoms = parms->atoms;
00326   float* grideners = parms->grideners;
00327   const long int numplane = parms->numplane;
00328   const long int numcol = parms->numcol;
00329   const long int numpt = parms->numpt;
00330   const long int natoms = parms->natoms;
00331   const float gridspacing = parms->gridspacing;
00332   const int threadid = parms->threadid;
00333   const int threadcount = parms->threadcount;
00334   double lasttime, totaltime;
00335 
00336   cudaSetDevice(threadid);
00337   CUERR // check and clear any existing errors
00338 
00339   // setup energy grid size, padding out arrays for peak GPU memory performance
00340   volsize.x = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00341   volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00342   volsize.z = 1;      // we only do one plane at a time
00343 
00344   // setup CUDA grid and block sizes
00345   Bsz.x = BLOCKSIZEX;
00346   Bsz.y = BLOCKSIZEY;
00347   Bsz.z = 1;
00348   Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00349   Gsz.y = volsize.y / (Bsz.y * UNROLLY); 
00350   Gsz.z = 1;
00351   int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00352 
00353   printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
00354   vmd_timerhandle timer = vmd_timer_create();
00355   vmd_timer_start(timer);
00356   msgtimer * msgt = msg_timer_create(5);
00357 
00358   // Allocate DMA buffers with some extra padding at the end so that 
00359   // multiple GPUs aren't DMAing too close to each other, for NUMA..
00360 #define DMABUFPADSIZE (32 * 1024)
00361 
00362   // allocate atom pre-computation and copy buffer in pinned memory
00363   // for better host/GPU transfer bandwidth, when enabled
00364   if (atomprebufpinned) {
00365     // allocate GPU DMA buffer (with padding)
00366     if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
00367       printf("Pinned atom copy buffer allocation failed!\n");
00368       atomprebufpinned=0;
00369     }
00370   }
00371 
00372   // if a pinned allocation failed or we choose not to use 
00373   // pinned memory, fall back to a normal malloc here.
00374   if (!atomprebufpinned) {
00375     // allocate GPU DMA buffer (with padding)
00376     atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
00377     if (atomprebuf == NULL) {
00378       printf("Atom copy buffer allocation failed!\n");
00379       return NULL;
00380     }
00381   }
00382 
00383 
00384   // allocate and initialize the GPU output array
00385   cudaMalloc((void**)&devenergy, volmemsz);
00386   CUERR // check and clear any existing errors
00387 
00388   hostenergy = (float *) malloc(volmemsz); // allocate working buffer
00389 
00390   // For each point in the cube...
00391   int iterations=0;
00392   int k;
00393   for (k=threadid; k<numplane; k+= threadcount) {
00394     int y;
00395     int atomstart;
00396     float zplane = k * (float) gridspacing;
00397   
00398     // Copy energy grid into GPU 16-element padded input
00399     for (y=0; y<numcol; y++) {
00400       long eneraddr = k*numcol*numpt + y*numpt;
00401       memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
00402     }
00403 
00404     // Copy the Host input data to the GPU..
00405     cudaMemcpy(devenergy, hostenergy, volmemsz,  cudaMemcpyHostToDevice);
00406     CUERR // check and clear any existing errors
00407 
00408     lasttime = vmd_timer_timenow(timer);
00409     for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00410       iterations++;
00411       int runatoms;
00412       int atomsremaining = natoms - atomstart;
00413       if (atomsremaining > MAXATOMS)
00414         runatoms = MAXATOMS;
00415       else
00416         runatoms = atomsremaining;
00417 
00418       // copy the next group of atoms to the GPU
00419       if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
00420         return NULL;
00421 
00422       // RUN the kernel...
00423       cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
00424       CUERR // check and clear any existing errors
00425     }
00426 
00427     // Copy the GPU output data back to the host and use/store it..
00428     cudaMemcpy(hostenergy, devenergy, volmemsz,  cudaMemcpyDeviceToHost);
00429     CUERR // check and clear any existing errors
00430 
00431     // Copy GPU blocksize padded array back down to the original size
00432     for (y=0; y<numcol; y++) {
00433       long eneraddr = k*numcol*numpt + y*numpt;
00434       memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
00435     }
00436  
00437     totaltime = vmd_timer_timenow(timer);
00438     if (msg_timer_timeout(msgt)) {
00439       // XXX: we have to use printf here as msgInfo is not thread-safe yet.
00440       printf("thread[%d] plane %d/%ld time %.2f, elapsed %.1f, est. total: %.1f\n",
00441              threadid, k, numplane,
00442              totaltime - lasttime, totaltime,
00443              totaltime * numplane / (k+1));
00444     }
00445   }
00446 
00447   vmd_timer_destroy(timer); // free timer
00448   msg_timer_destroy(msgt); // free timer
00449   free(hostenergy);    // free working buffer
00450   cudaFree(devenergy); // free CUDA memory buffer
00451   CUERR // check and clear any existing errors
00452 
00453   // free pinned GPU copy buffer
00454   if (atomprebufpinned) {
00455     if (cudaFreeHost(atomprebuf) != cudaSuccess) {
00456       printf("Pinned atom buffer deallocation failed!\n");
00457       return NULL;
00458     }
00459   } else {
00460     free(atomprebuf);
00461   }
00462   return NULL;
00463 }
00464 
00465 
00466 
00467 

Generated on Sun Jul 6 01:27:25 2008 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002