CUDAVolCPotential.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAVolCPotential.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.46 $      $Date: 2020/02/26 20:16:56 $
00014  *
00015  ***************************************************************************/
00035 #include <stdio.h>
00036 #include <stdlib.h>
00037 #include <string.h>
00038 #include <cuda.h>
00039 
00040 #include "Inform.h"
00041 #include "utilities.h"
00042 #include "WKFThreads.h"
00043 #include "WKFUtils.h"
00044 #include "CUDAKernels.h" 
00045 
00046 typedef struct {
00047   float* atoms;
00048   float* grideners;
00049   long int numplane;
00050   long int numcol;
00051   long int numpt;
00052   long int natoms;
00053   float gridspacing;
00054 } enthrparms;
00055 
00056 /* thread prototype */
00057 static void * cudaenergythread(void *);
00058 
00059 #if 1
00060 #define CUERR { cudaError_t err; \
00061   if ((err = cudaGetLastError()) != cudaSuccess) { \
00062   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00063   printf("Thread aborting...\n"); \
00064   return NULL; }}
00065 #else
00066 #define CUERR
00067 #endif
00068 
00069 // max constant buffer size is 64KB, minus whatever
00070 // the CUDA runtime and compiler are using that we don't know about
00071 // At 16 bytes/atom, 4000 atoms is about the max we can store in
00072 // the constant buffer.
00073 #define MAXATOMS 4000
00074 __constant__ static float4 atominfo[MAXATOMS];
00075 
00076 
00077 // 
00078 // The CUDA kernels calculate coulombic potential at each grid point and
00079 // store the results in the output array.
00080 //
00081 // These versions of the code use the 64KB constant buffer area reloaded
00082 // for each group of MAXATOMS atoms, until the contributions from all
00083 // atoms have been summed into the potential grid.
00084 //
00085 // These implementations use precomputed and unrolled loops of 
00086 // (dy^2 + dz^2) values for increased FP arithmetic intensity.
00087 // The X coordinate portion of the loop is unrolled by four or eight,
00088 // allowing the same dy^2 + dz^2 values to be reused multiple times,
00089 // increasing the ratio of FP arithmetic relative to FP loads, and
00090 // eliminating some redundant calculations.
00091 //
00092 
00093 //
00094 // Tuned global memory coalescing version, unrolled in X
00095 //
00096 // Benchmark for this version: 291 GFLOPS, 39.5 billion atom evals/sec
00097 //  (Test system: GeForce 8800GTX)
00098 // 
00099 
00100 #if 1
00101 
00102 //
00103 // Tunings for large potential map dimensions (e.g. 384x384x...)
00104 //
00105 // NVCC -cubin says this implementation uses 20 regs
00106 //
00107 #define UNROLLX     8
00108 #define UNROLLY     1
00109 #define BLOCKSIZEX  8  // make large enough to allow coalesced global mem ops
00110 #define BLOCKSIZEY  8  // make as small as possible for finer granularity
00111 
00112 #else
00113 
00114 //
00115 // Tunings for small potential map dimensions (e.g. 128x128x...)
00116 //
00117 // NVCC -cubin says this implementation uses 16 regs
00118 //
00119 #define UNROLLX     4
00120 #define UNROLLY     1
00121 #define BLOCKSIZEX 16  // make large enough to allow coalesced global mem ops
00122 #define BLOCKSIZEY 16  // make as small as possible for finer granularity
00123 
00124 #endif
00125 
00126 #define BLOCKSIZE  (BLOCKSIZEX*BLOCKSIZEY)
00127 
00128 // FLOP counting
00129 #if UNROLLX == 8
00130 #define FLOPSPERATOMEVAL (59.0/8.0)
00131 #elif UNROLLX == 4 
00132 #define FLOPSPERATOMEVAL (31.0/4.0)
00133 #endif
00134 
00135 __global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
00136   unsigned int xindex  = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
00137   unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
00138   unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;
00139 
00140   float coory = gridspacing * yindex;
00141   float coorx = gridspacing * xindex;
00142 
00143   float energyvalx1=0.0f;
00144   float energyvalx2=0.0f;
00145   float energyvalx3=0.0f;
00146   float energyvalx4=0.0f;
00147 #if UNROLLX > 4
00148   float energyvalx5=0.0f;
00149   float energyvalx6=0.0f;
00150   float energyvalx7=0.0f;
00151   float energyvalx8=0.0f;
00152 #endif
00153 
00154   float gridspacing_coalesce = gridspacing * BLOCKSIZEX;
00155 
00156   // NOTE: One beneficial side effect of summing groups of 4,000 atoms
00157   // at a time in multiple passes is that these smaller summation 
00158   // groupings are likely to result in higher precision, since structures
00159   // are typically stored with atoms in chain order leading to high 
00160   // spatial locality within the groups of 4,000 atoms.  Kernels that 
00161   // sum these subgroups independently of the global sum for each voxel
00162   // should achieve relatively good floating point precision since large values
00163   // will tend to be summed with large values, and small values summed with
00164   // small values, etc. 
00165   int atomid;
00166   for (atomid=0; atomid<numatoms; atomid++) {
00167     float dy = coory - atominfo[atomid].y;
00168     float dyz2 = (dy * dy) + atominfo[atomid].z;
00169 
00170     float dx1 = coorx - atominfo[atomid].x;
00171     float dx2 = dx1 + gridspacing_coalesce;
00172     float dx3 = dx2 + gridspacing_coalesce;
00173     float dx4 = dx3 + gridspacing_coalesce;
00174 #if UNROLLX > 4
00175     float dx5 = dx4 + gridspacing_coalesce;
00176     float dx6 = dx5 + gridspacing_coalesce;
00177     float dx7 = dx6 + gridspacing_coalesce;
00178     float dx8 = dx7 + gridspacing_coalesce;
00179 #endif
00180 
00181     energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
00182     energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
00183     energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
00184     energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
00185 #if UNROLLX > 4
00186     energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
00187     energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
00188     energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
00189     energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
00190 #endif
00191   }
00192 
00193   energygrid[outaddr             ] += energyvalx1;
00194   energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
00195   energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
00196   energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
00197 #if UNROLLX > 4
00198   energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
00199   energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
00200   energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
00201   energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
00202 #endif
00203 }
00204 
00205 
00206 // required GPU array padding to match thread block size
00207 // XXX note: this code requires block size dimensions to be a power of two
00208 #define TILESIZEX BLOCKSIZEX*UNROLLX
00209 #define TILESIZEY BLOCKSIZEY*UNROLLY
00210 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00211 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00212 
00213 static int copyatomstoconstbuf(const float *atoms, float *atompre, 
00214                         int count, float zplane) {
00215   if (count > MAXATOMS) {
00216     printf("Atom count exceeds constant buffer storage capacity\n");
00217     return -1;
00218   }
00219 
00220   int i;
00221   for (i=0; i<count*4; i+=4) {
00222     atompre[i    ] = atoms[i    ];
00223     atompre[i + 1] = atoms[i + 1];
00224     float dz = zplane - atoms[i + 2];
00225     atompre[i + 2]  = dz*dz;
00226     atompre[i + 3] = atoms[i + 3];
00227   }
00228 
00229   cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);
00230 
00231   return 0;
00232 }
00233 
00234 int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
00235                             long int numplane, long int numcol, long int numpt, 
00236                             float gridspacing) {
00237   enthrparms parms;
00238   wkf_timerhandle globaltimer;
00239   double totalruntime;
00240   int rc=0;
00241 
00242   int numprocs = wkf_thread_numprocessors();
00243   int deviceCount = 0;
00244   if (vmd_cuda_num_devices(&deviceCount))
00245     return -1;
00246   if (deviceCount < 1)
00247     return -1;
00248 
00249   /* take the lesser of the number of CPUs and GPUs */
00250   /* and execute that many threads                  */
00251   if (deviceCount < numprocs) {
00252     numprocs = deviceCount;
00253   }
00254 
00255   printf("Using %d CUDA GPUs\n", numprocs);
00256   printf("GPU padded grid size: %d x %d x %d\n", 
00257     (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00258     (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00259     numplane);
00260 
00261   parms.atoms = atoms;
00262   parms.grideners = grideners;
00263   parms.numplane = numplane;
00264   parms.numcol = numcol;
00265   parms.numpt = numpt;
00266   parms.natoms = natoms;
00267   parms.gridspacing = gridspacing;
00268 
00269   globaltimer = wkf_timer_create();
00270   wkf_timer_start(globaltimer);
00271 
00272   /* spawn child threads to do the work */
00273   wkf_tasktile_t tile;
00274   tile.start=0;
00275   tile.end=numplane;
00276   rc = wkf_threadlaunch(numprocs, &parms, cudaenergythread, &tile);
00277 
00278   // Measure GFLOPS
00279   wkf_timer_stop(globaltimer);
00280   totalruntime = wkf_timer_time(globaltimer);
00281   wkf_timer_destroy(globaltimer);
00282 
00283   if (!rc) {
00284     double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00285     printf("  %g billion atom evals/second, %g GFLOPS\n",
00286            atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00287   } else {
00288     msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
00289     msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00290   }
00291   return rc;
00292 }
00293 
00294 
00295 
00296 
00297 
00298 static void * cudaenergythread(void *voidparms) {
00299   dim3 volsize, Gsz, Bsz;
00300   float *devenergy = NULL;
00301   float *hostenergy = NULL;
00302   float *atomprebuf = NULL;
00303   enthrparms *parms = NULL;
00304   int threadid=0;
00305   int atomprebufpinned = 0; // try to use pinned/page-locked atom buffer
00306 
00307   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00308   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00309 
00310   /* 
00311    * copy in per-thread parameters 
00312    */
00313   const float *atoms = parms->atoms;
00314   float* grideners = parms->grideners;
00315   const long int numplane = parms->numplane;
00316   const long int numcol = parms->numcol;
00317   const long int numpt = parms->numpt;
00318   const long int natoms = parms->natoms;
00319   const float gridspacing = parms->gridspacing;
00320   double lasttime, totaltime;
00321 
00322   cudaError_t rc;
00323   rc = cudaSetDevice(threadid);
00324   if (rc != cudaSuccess) {
00325 #if CUDART_VERSION >= 2010
00326     rc = cudaGetLastError(); // query last error and reset error state
00327     if (rc != cudaErrorSetOnActiveProcess)
00328       return NULL; // abort and return an error
00329 #else
00330     cudaGetLastError(); // just ignore and reset error state, since older CUDA
00331                         // revs don't have a cudaErrorSetOnActiveProcess enum
00332 #endif
00333   }
00334 
00335   // setup energy grid size, padding out arrays for peak GPU memory performance
00336   volsize.x = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00337   volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00338   volsize.z = 1;      // we only do one plane at a time
00339 
00340   // setup CUDA grid and block sizes
00341   Bsz.x = BLOCKSIZEX;
00342   Bsz.y = BLOCKSIZEY;
00343   Bsz.z = 1;
00344   Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00345   Gsz.y = volsize.y / (Bsz.y * UNROLLY); 
00346   Gsz.z = 1;
00347   int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00348 
00349   printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
00350   wkf_timerhandle timer = wkf_timer_create();
00351   wkf_timer_start(timer);
00352   wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00353 
00354   // Allocate DMA buffers with some extra padding at the end so that 
00355   // multiple GPUs aren't DMAing too close to each other, for NUMA..
00356 #define DMABUFPADSIZE (32 * 1024)
00357 
00358   // allocate atom pre-computation and copy buffer in pinned memory
00359   // for better host/GPU transfer bandwidth, when enabled
00360   if (atomprebufpinned) {
00361     // allocate GPU DMA buffer (with padding)
00362     if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
00363       printf("Pinned atom copy buffer allocation failed!\n");
00364       atomprebufpinned=0;
00365     }
00366   }
00367 
00368   // if a pinned allocation failed or we choose not to use 
00369   // pinned memory, fall back to a normal malloc here.
00370   if (!atomprebufpinned) {
00371     // allocate GPU DMA buffer (with padding)
00372     atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
00373     if (atomprebuf == NULL) {
00374       printf("Atom copy buffer allocation failed!\n");
00375       return NULL;
00376     }
00377   }
00378 
00379 
00380   // allocate and initialize the GPU output array
00381   cudaMalloc((void**)&devenergy, volmemsz);
00382   CUERR // check and clear any existing errors
00383 
00384   hostenergy = (float *) malloc(volmemsz); // allocate working buffer
00385 
00386   // For each point in the cube...
00387   int iterations=0;
00388   int computedplanes=0;
00389   wkf_tasktile_t tile;
00390   while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00391     int k;
00392     for (k=tile.start; k<tile.end; k++) {
00393       int y;
00394       int atomstart;
00395       float zplane = k * (float) gridspacing;
00396       computedplanes++; // track work done by this GPU for progress reporting
00397  
00398       // Copy energy grid into GPU 16-element padded input
00399       for (y=0; y<numcol; y++) {
00400         long eneraddr = k*numcol*numpt + y*numpt;
00401         memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
00402       }
00403 
00404       // Copy the Host input data to the GPU..
00405       cudaMemcpy(devenergy, hostenergy, volmemsz,  cudaMemcpyHostToDevice);
00406       CUERR // check and clear any existing errors
00407 
00408       lasttime = wkf_timer_timenow(timer);
00409       for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00410         iterations++;
00411         int runatoms;
00412         int atomsremaining = natoms - atomstart;
00413         if (atomsremaining > MAXATOMS)
00414           runatoms = MAXATOMS;
00415         else
00416           runatoms = atomsremaining;
00417 
00418         // copy the next group of atoms to the GPU
00419         if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
00420           return NULL;
00421 
00422         // RUN the kernel...
00423         cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
00424         CUERR // check and clear any existing errors
00425       }
00426 
00427       // Copy the GPU output data back to the host and use/store it..
00428       cudaMemcpy(hostenergy, devenergy, volmemsz,  cudaMemcpyDeviceToHost);
00429       CUERR // check and clear any existing errors
00430 
00431       // Copy GPU blocksize padded array back down to the original size
00432       for (y=0; y<numcol; y++) {
00433         long eneraddr = k*numcol*numpt + y*numpt;
00434         memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
00435       }
00436  
00437       totaltime = wkf_timer_timenow(timer);
00438       if (wkf_msg_timer_timeout(msgt)) {
00439         // XXX: we have to use printf here as msgInfo is not thread-safe yet.
00440         printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00441                threadid, k, numplane, computedplanes,
00442                totaltime - lasttime, totaltime,
00443                totaltime * numplane / (k+1));
00444       }
00445     }
00446   }
00447 
00448   wkf_timer_destroy(timer); // free timer
00449   wkf_msg_timer_destroy(msgt); // free timer
00450   free(hostenergy);    // free working buffer
00451   cudaFree(devenergy); // free CUDA memory buffer
00452   CUERR // check and clear any existing errors
00453 
00454   // free pinned GPU copy buffer
00455   if (atomprebufpinned) {
00456     if (cudaFreeHost(atomprebuf) != cudaSuccess) {
00457       printf("Pinned atom buffer deallocation failed!\n");
00458       return NULL;
00459     }
00460   } else {
00461     free(atomprebuf);
00462   }
00463   return NULL;
00464 }
00465 
00466 
00467 
00468