Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDAQuickSurf.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2007-2011 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: CUDAQuickSurf.cu,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.46 $      $Date: 2012/03/12 16:38:33 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   CUDA accelerated gaussian density calculation
00019  *
00020  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025 
00026 #if CUDART_VERSION < 4000
00027 #error The VMD QuickSurf feature requires CUDA 4.0 or later
00028 #endif
00029 
00030 #include "Inform.h"
00031 #include "utilities.h"
00032 #include "WKFThreads.h"
00033 #include "WKFUtils.h"
00034 #include "CUDAKernels.h" 
00035 #include "CUDAMarchingCubes.h"
00036 #include "CUDAQuickSurf.h" 
00037 
00038 #include "DispCmds.h"
00039 #include "VMDDisplayList.h"
00040 
00041 //
00042 // multi-threaded direct summation implementation
00043 //
00044 
00045 typedef struct {
00046   float isovalue;
00047   float radscale;
00048   float4 *xyzr;
00049   float4 *colors;
00050   float *volmap;
00051   float *voltexmap;
00052   int numplane;
00053   int numcol;
00054   int numpt;
00055   long int natoms;
00056   float gridspacing;
00057 } enthrparms;
00058 
00059 /* thread prototype */
00060 static void * cudadensitythread(void *);
00061 
00062 #if 1
00063 #define CUERR { cudaError_t err; \
00064   if ((err = cudaGetLastError()) != cudaSuccess) { \
00065   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00066   printf("Thread aborting...\n"); \
00067   return NULL; }}
00068 #else
00069 #define CUERR
00070 #endif
00071 
00072 // 
00073 // Parameters for direct summation gaussian density kernels
00074 //
00075 #if 1
00076 
00077 #define DUNROLLX   8 
00078 #define DBLOCKSZX  8  // make large enough to allow coalesced global mem ops
00079 #define DBLOCKSZY  8  // make as small as possible for finer granularity
00080 
00081 #else
00082 
00083 #define DUNROLLX   4
00084 #define DBLOCKSZX 16  // make large enough to allow coalesced global mem ops
00085 #define DBLOCKSZY 16  // make as small as possible for finer granularity
00086 
00087 #endif
00088 
00089 #define DBLOCKSZ  (DBLOCKSZX*DBLOCKSZY)
00090 
00091 // FLOP counting
00092 #if DUNROLLX == 8
00093 #define FLOPSPERATOMEVALTEX (109.0/8.0)
00094 #define FLOPSPERATOMEVAL    (53.0/8.0)
00095 #elif DUNROLLX == 4 
00096 #define FLOPSPERATOMEVALTEX (57.0/4.0)
00097 #define FLOPSPERATOMEVAL    (29.0/4.0)
00098 #elif DUNROLLX == 2 
00099 #define FLOPSPERATOMEVALTEX (30.0/2.0)
00100 #define FLOPSPERATOMEVAL    (17.0/2.0)
00101 #endif
00102 
00103 __global__ static void gaussdensity_direct_tex(int natoms, 
00104                                         const float4 *xyzr, 
00105                                         const float4 *colors,
00106                                         float gridspacing, unsigned int z,
00107                                         float *densitygrid, 
00108                                         float3 *voltexmap, 
00109                                         float invisovalue) {
00110   unsigned int xindex  = (blockIdx.x * blockDim.x) * DUNROLLX + threadIdx.x;
00111   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00112   unsigned int zindex  = (blockIdx.z * blockDim.z) + threadIdx.z;
00113   unsigned int outaddr = 
00114     ((gridDim.x * blockDim.x) * DUNROLLX) * (gridDim.y * blockDim.y) * zindex + 
00115     ((gridDim.x * blockDim.x) * DUNROLLX) * yindex + xindex;
00116   zindex += z;
00117 
00118   float coorx = gridspacing * xindex;
00119   float coory = gridspacing * yindex;
00120   float coorz = gridspacing * zindex;
00121 
00122   float densityvalx1=0.0f;
00123   float densityvalx2=0.0f;
00124   float3 densitycolx1;
00125   densitycolx1=make_float3(0.0f, 0.0f, 0.0f);
00126   float3 densitycolx2=densitycolx1;
00127 
00128 #if DUNROLLX >= 4
00129   float densityvalx3=0.0f;
00130   float densityvalx4=0.0f;
00131   float3 densitycolx3=densitycolx1;
00132   float3 densitycolx4=densitycolx1;
00133 #endif
00134 #if DUNROLLX >= 8
00135   float densityvalx5=0.0f;
00136   float densityvalx6=0.0f;
00137   float densityvalx7=0.0f;
00138   float densityvalx8=0.0f;
00139 
00140   float3 densitycolx5=densitycolx1;
00141   float3 densitycolx6=densitycolx1;
00142   float3 densitycolx7=densitycolx1;
00143   float3 densitycolx8=densitycolx1;
00144 #endif
00145 
00146   float gridspacing_coalesce = gridspacing * DBLOCKSZX;
00147 
00148   int atomid;
00149   for (atomid=0; atomid<natoms; atomid++) {
00150     float4 atom = xyzr[atomid];
00151     float4 color = colors[atomid];
00152 
00153     float dy = coory - atom.y;
00154     float dz = coorz - atom.z;
00155     float dyz2 = dy*dy + dz*dz;
00156 
00157     float dx1 = coorx - atom.x;
00158     float r21 = (dx1*dx1 + dyz2) * atom.w;
00159     float tmp1 = exp2f(-r21);
00160     densityvalx1 += tmp1;
00161     tmp1 *= invisovalue;
00162     densitycolx1.x += tmp1 * color.x;
00163     densitycolx1.y += tmp1 * color.y;
00164     densitycolx1.z += tmp1 * color.z;
00165 
00166     float dx2 = dx1 + gridspacing_coalesce;
00167     float r22 = (dx2*dx2 + dyz2) * atom.w;
00168     float tmp2 = exp2f(-r22);
00169     densityvalx2 += tmp2;
00170     tmp2 *= invisovalue;
00171     densitycolx2.x += tmp2 * color.x;
00172     densitycolx2.y += tmp2 * color.y;
00173     densitycolx2.z += tmp2 * color.z;
00174 
00175 #if DUNROLLX >= 4
00176     float dx3 = dx2 + gridspacing_coalesce;
00177     float r23 = (dx3*dx3 + dyz2) * atom.w;
00178     float tmp3 = exp2f(-r23);
00179     densityvalx3 += tmp3;
00180     tmp3 *= invisovalue;
00181     densitycolx3.x += tmp3 * color.x;
00182     densitycolx3.y += tmp3 * color.y;
00183     densitycolx3.z += tmp3 * color.z;
00184 
00185     float dx4 = dx3 + gridspacing_coalesce;
00186     float r24 = (dx4*dx4 + dyz2) * atom.w;
00187     float tmp4 = exp2f(-r24);
00188     densityvalx4 += tmp4;
00189     tmp4 *= invisovalue;
00190     densitycolx4.x += tmp4 * color.x;
00191     densitycolx4.y += tmp4 * color.y;
00192     densitycolx4.z += tmp4 * color.z;
00193 #endif
00194 #if DUNROLLX >= 8
00195     float dx5 = dx4 + gridspacing_coalesce;
00196     float r25 = (dx5*dx5 + dyz2) * atom.w;
00197     float tmp5 = exp2f(-r25);
00198     densityvalx5 += tmp5;
00199     tmp5 *= invisovalue;
00200     densitycolx5.x += tmp5 * color.x;
00201     densitycolx5.y += tmp5 * color.y;
00202     densitycolx5.z += tmp5 * color.z;
00203 
00204     float dx6 = dx5 + gridspacing_coalesce;
00205     float r26 = (dx6*dx6 + dyz2) * atom.w;
00206     float tmp6 = exp2f(-r26);
00207     densityvalx6 += tmp6;
00208     tmp6 *= invisovalue;
00209     densitycolx6.x += tmp6 * color.x;
00210     densitycolx6.y += tmp6 * color.y;
00211     densitycolx6.z += tmp6 * color.z;
00212 
00213     float dx7 = dx6 + gridspacing_coalesce;
00214     float r27 = (dx7*dx7 + dyz2) * atom.w;
00215     float tmp7 = exp2f(-r27);
00216     densityvalx7 += tmp7;
00217     tmp7 *= invisovalue;
00218     densitycolx7.x += tmp7 * color.x;
00219     densitycolx7.y += tmp7 * color.y;
00220     densitycolx7.z += tmp7 * color.z;
00221 
00222     float dx8 = dx7 + gridspacing_coalesce;
00223     float r28 = (dx8*dx8 + dyz2) * atom.w;
00224     float tmp8 = exp2f(-r28);
00225     densityvalx8 += tmp8;
00226     tmp8 *= invisovalue;
00227     densitycolx8.x += tmp8 * color.x;
00228     densitycolx8.y += tmp8 * color.y;
00229     densitycolx8.z += tmp8 * color.z;
00230 #endif
00231   }
00232 
00233   densitygrid[outaddr             ] += densityvalx1;
00234   voltexmap[outaddr             ].x += densitycolx1.x;
00235   voltexmap[outaddr             ].y += densitycolx1.y;
00236   voltexmap[outaddr             ].z += densitycolx1.z;
00237 
00238   densitygrid[outaddr+1*DBLOCKSZX] += densityvalx2;
00239   voltexmap[outaddr+1*DBLOCKSZX].x += densitycolx2.x;
00240   voltexmap[outaddr+1*DBLOCKSZX].y += densitycolx2.y;
00241   voltexmap[outaddr+1*DBLOCKSZX].z += densitycolx2.z;
00242 
00243 #if DUNROLLX >= 4
00244   densitygrid[outaddr+2*DBLOCKSZX] += densityvalx3;
00245   voltexmap[outaddr+2*DBLOCKSZX].x += densitycolx3.x;
00246   voltexmap[outaddr+2*DBLOCKSZX].y += densitycolx3.y;
00247   voltexmap[outaddr+2*DBLOCKSZX].z += densitycolx3.z;
00248 
00249   densitygrid[outaddr+3*DBLOCKSZX] += densityvalx4;
00250   voltexmap[outaddr+3*DBLOCKSZX].x += densitycolx4.x;
00251   voltexmap[outaddr+3*DBLOCKSZX].y += densitycolx4.y;
00252   voltexmap[outaddr+3*DBLOCKSZX].z += densitycolx4.z;
00253 #endif
00254 #if DUNROLLX >= 8
00255   densitygrid[outaddr+4*DBLOCKSZX] += densityvalx5;
00256   voltexmap[outaddr+4*DBLOCKSZX].x += densitycolx5.x;
00257   voltexmap[outaddr+4*DBLOCKSZX].y += densitycolx5.y;
00258   voltexmap[outaddr+4*DBLOCKSZX].z += densitycolx5.z;
00259 
00260   densitygrid[outaddr+5*DBLOCKSZX] += densityvalx6;
00261   voltexmap[outaddr+5*DBLOCKSZX].x += densitycolx6.x;
00262   voltexmap[outaddr+5*DBLOCKSZX].y += densitycolx6.y;
00263   voltexmap[outaddr+5*DBLOCKSZX].z += densitycolx6.z;
00264 
00265   densitygrid[outaddr+6*DBLOCKSZX] += densityvalx7;
00266   voltexmap[outaddr+6*DBLOCKSZX].x += densitycolx7.x;
00267   voltexmap[outaddr+6*DBLOCKSZX].y += densitycolx7.y;
00268   voltexmap[outaddr+6*DBLOCKSZX].z += densitycolx7.z;
00269 
00270   densitygrid[outaddr+7*DBLOCKSZX] += densityvalx8;
00271   voltexmap[outaddr+7*DBLOCKSZX].x += densitycolx8.x;
00272   voltexmap[outaddr+7*DBLOCKSZX].y += densitycolx8.y;
00273   voltexmap[outaddr+7*DBLOCKSZX].z += densitycolx8.z;
00274 #endif
00275 }
00276 
00277 
00278 __global__ static void gaussdensity_direct(int natoms,
00279                                     const float4 *xyzr, 
00280                                     float gridspacing, unsigned int z, 
00281                                     float *densitygrid) {
00282   unsigned int xindex  = (blockIdx.x * blockDim.x) * DUNROLLX + threadIdx.x;
00283   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00284   unsigned int zindex  = (blockIdx.z * blockDim.z) + threadIdx.z;
00285   unsigned int outaddr = 
00286     ((gridDim.x * blockDim.x) * DUNROLLX) * (gridDim.y * blockDim.y) * zindex + 
00287     ((gridDim.x * blockDim.x) * DUNROLLX) * yindex + xindex;
00288   zindex += z;
00289 
00290   float coorx = gridspacing * xindex;
00291   float coory = gridspacing * yindex;
00292   float coorz = gridspacing * zindex;
00293 
00294   float densityvalx1=0.0f;
00295   float densityvalx2=0.0f;
00296 #if DUNROLLX >= 4
00297   float densityvalx3=0.0f;
00298   float densityvalx4=0.0f;
00299 #endif
00300 #if DUNROLLX >= 8
00301   float densityvalx5=0.0f;
00302   float densityvalx6=0.0f;
00303   float densityvalx7=0.0f;
00304   float densityvalx8=0.0f;
00305 #endif
00306 
00307   float gridspacing_coalesce = gridspacing * DBLOCKSZX;
00308 
00309   int atomid;
00310   for (atomid=0; atomid<natoms; atomid++) {
00311     float4 atom = xyzr[atomid];
00312     float dy = coory - atom.y;
00313     float dz = coorz - atom.z;
00314     float dyz2 = dy*dy + dz*dz;
00315 
00316     float dx1 = coorx - atom.x;
00317     float r21 = (dx1*dx1 + dyz2) * atom.w;
00318     densityvalx1 += exp2f(-r21);
00319 
00320     float dx2 = dx1 + gridspacing_coalesce;
00321     float r22 = (dx2*dx2 + dyz2) * atom.w;
00322     densityvalx2 += exp2f(-r22);
00323 
00324 #if DUNROLLX >= 4
00325     float dx3 = dx2 + gridspacing_coalesce;
00326     float r23 = (dx3*dx3 + dyz2) * atom.w;
00327     densityvalx3 += exp2f(-r23);
00328 
00329     float dx4 = dx3 + gridspacing_coalesce;
00330     float r24 = (dx4*dx4 + dyz2) * atom.w;
00331     densityvalx4 += exp2f(-r24);
00332 #endif
00333 #if DUNROLLX >= 8
00334     float dx5 = dx4 + gridspacing_coalesce;
00335     float r25 = (dx5*dx5 + dyz2) * atom.w;
00336     densityvalx5 += exp2f(-r25);
00337 
00338     float dx6 = dx5 + gridspacing_coalesce;
00339     float r26 = (dx6*dx6 + dyz2) * atom.w;
00340     densityvalx6 += exp2f(-r26);
00341 
00342     float dx7 = dx6 + gridspacing_coalesce;
00343     float r27 = (dx7*dx7 + dyz2) * atom.w;
00344     densityvalx7 += exp2f(-r27);
00345 
00346     float dx8 = dx7 + gridspacing_coalesce;
00347     float r28 = (dx8*dx8 + dyz2) * atom.w;
00348     densityvalx8 += exp2f(-r28);
00349 #endif
00350   }
00351 
00352   densitygrid[outaddr             ] += densityvalx1;
00353   densitygrid[outaddr+1*DBLOCKSZX] += densityvalx2;
00354 #if DUNROLLX >= 4
00355   densitygrid[outaddr+2*DBLOCKSZX] += densityvalx3;
00356   densitygrid[outaddr+3*DBLOCKSZX] += densityvalx4;
00357 #endif
00358 #if DUNROLLX >= 8
00359   densitygrid[outaddr+4*DBLOCKSZX] += densityvalx5;
00360   densitygrid[outaddr+5*DBLOCKSZX] += densityvalx6;
00361   densitygrid[outaddr+6*DBLOCKSZX] += densityvalx7;
00362   densitygrid[outaddr+7*DBLOCKSZX] += densityvalx8;
00363 #endif
00364 }
00365 
00366 
00367 // required GPU array padding to match thread block size
00368 // XXX note: this code requires block size dimensions to be a power of two
00369 #define TILESIZEX DBLOCKSZX*DUNROLLX
00370 #define TILESIZEY DBLOCKSZY
00371 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00372 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00373 
00374 int vmd_cuda_gaussdensity_direct(long int natoms, float4 *xyzr, float4 *colors,
00375                                  float* volmap, float *voltexmap, int3 volsz,
00376                                  float radscale, float gridspacing, 
00377                                  float isovalue, float gausslim) {
00378   enthrparms parms;
00379   wkf_timerhandle globaltimer;
00380   double totalruntime;
00381   int rc=0;
00382 
00383   int numprocs = wkf_thread_numprocessors();
00384   int deviceCount = 0;
00385   if (vmd_cuda_num_devices(&deviceCount))
00386     return -1;
00387   if (deviceCount < 1)
00388     return -1;
00389 
00390   /* take the lesser of the number of CPUs and GPUs */
00391   /* and execute that many threads                  */
00392   if (deviceCount < numprocs) {
00393     numprocs = deviceCount;
00394   }
00395 
00396   printf("Using %d CUDA GPUs\n", numprocs);
00397   printf("GPU padded grid size: %d x %d x %d\n", 
00398          (volsz.x + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00399          (volsz.y + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00400           volsz.z);
00401 
00402   parms.isovalue = isovalue;
00403   parms.radscale = radscale;
00404   parms.xyzr = xyzr;
00405   parms.colors = colors;
00406   parms.volmap = volmap;
00407   parms.voltexmap = voltexmap;
00408   parms.numplane = volsz.z;
00409   parms.numcol = volsz.y;
00410   parms.numpt = volsz.x;
00411   parms.natoms = natoms;
00412   parms.gridspacing = gridspacing;
00413 
00414   globaltimer = wkf_timer_create();
00415   wkf_timer_start(globaltimer);
00416 
00417   /* spawn child threads to do the work */
00418   wkf_tasktile_t tile;
00419   tile.start=0;
00420   tile.end=volsz.z;
00421   rc = wkf_threadlaunch(numprocs, &parms, cudadensitythread, &tile);
00422 
00423   // Measure GFLOPS
00424   wkf_timer_stop(globaltimer);
00425   totalruntime = wkf_timer_time(globaltimer);
00426   wkf_timer_destroy(globaltimer);
00427 
00428   if (!rc) {
00429     double atomevalssec = ((double) volsz.x * volsz.y * volsz.z * natoms) / (totalruntime * 1000000000.0);
00430     printf("  %g billion atom evals/second, %g GFLOPS\n",
00431            atomevalssec, 
00432            atomevalssec * ((voltexmap==NULL) ? FLOPSPERATOMEVAL : FLOPSPERATOMEVALTEX));
00433   } else {
00434     msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
00435     msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00436   }
00437   return rc;
00438 }
00439 
00440 
00441 
00442 
00443 
00444 static void * cudadensitythread(void *voidparms) {
00445   dim3 volsize, Gsz, Bsz;
00446   float *devdensity = NULL;
00447   float *devtexmap = NULL;
00448   float *hostdensity = NULL;
00449   float *hosttexmap = NULL;
00450   enthrparms *parms = NULL;
00451   int threadid=0;
00452 
00453   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00454   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00455 
00456   /* 
00457    * copy in per-thread parameters 
00458    */
00459   float isovalue = parms->isovalue;
00460 //  float radscale = parms->radscale;
00461   const float4 *xyzr = parms->xyzr;
00462   const float4 *colors = parms->colors;
00463   float *volmap = parms->volmap;
00464   float *voltexmap = parms->voltexmap;
00465   const long int numplane = parms->numplane;
00466   const int numcol = parms->numcol;
00467   const int numpt = parms->numpt;
00468   const int natoms = parms->natoms;
00469   const float gridspacing = parms->gridspacing;
00470   double lasttime, totaltime;
00471 
00472   cudaError_t rc;
00473   rc = cudaSetDevice(threadid);
00474   if (rc != cudaSuccess) {
00475 #if CUDART_VERSION >= 2010
00476     rc = cudaGetLastError(); // query last error and reset error state
00477     if (rc != cudaErrorSetOnActiveProcess)
00478       return NULL; // abort and return an error
00479 #else
00480     cudaGetLastError(); // just ignore and reset error state, since older CUDA
00481                         // revs don't have a cudaErrorSetOnActiveProcess enum
00482 #endif
00483   }
00484 
00485   // setup density grid size, padding out arrays for peak GPU memory performance
00486   volsize.x = (numpt  + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00487   volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00488   volsize.z = 1;      // we only do one plane at a time
00489 
00490   // setup CUDA grid and block sizes
00491   Bsz.x = DBLOCKSZX;
00492   Bsz.y = DBLOCKSZY;
00493   Bsz.z = 1;
00494   Gsz.x = volsize.x / (Bsz.x * DUNROLLX);
00495   Gsz.y = volsize.y / Bsz.y; 
00496   Gsz.z = 1;
00497   int maxplanes = 4;
00498   int volplanesz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00499   int volmemsz = maxplanes * volplanesz;
00500 
00501   printf("Thread %d started for CUDA device %d, grid size %dx%dx%d\n", 
00502          threadid, threadid, Gsz.x, Gsz.y, Gsz.z);
00503   wkf_timerhandle timer = wkf_timer_create();
00504   wkf_timer_start(timer);
00505   wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00506 
00507   // allocate and initialize the GPU output array
00508   cudaMalloc((void**)&devdensity, volmemsz);
00509   if (voltexmap != NULL) cudaMalloc((void**)&devtexmap, volmemsz*3);
00510   CUERR // check and clear any existing errors
00511 
00512   hostdensity = (float *) malloc(volmemsz); // allocate working buffer
00513   if (voltexmap != NULL) hosttexmap = (float *) malloc(volmemsz*3); // allocate working buffer
00514 
00515   float *devatoms=NULL, *devradii=NULL, *devcolors=NULL;
00516   cudaMalloc((void**)&devatoms, natoms*4*sizeof(float));
00517   cudaMemcpy(devatoms, xyzr, natoms*4*sizeof(float), cudaMemcpyHostToDevice);
00518   if (colors) {
00519     cudaMalloc((void**)&devcolors, natoms*4*sizeof(float));  
00520     cudaMemcpy(devcolors, colors, natoms*4*sizeof(float), cudaMemcpyHostToDevice);
00521   }
00522   CUERR // check and clear any existing errors
00523 
00524   // For each point in the cube...
00525   int computedplanes=0;
00526   wkf_tasktile_t tile;
00527   while (wkf_threadlaunch_next_tile(voidparms, maxplanes, &tile) != WKF_SCHED_DONE) {
00528     int z, k;
00529     int numk=tile.end - tile.start;
00530     int runplanes = min(numk, maxplanes);
00531     // Set CUDA grid Z dimension
00532     Gsz.z = runplanes;
00533     for (k=tile.start; k<tile.end; k+=runplanes) {
00534 // printf("k[%d/%d] run: %d Gsz.z: %d\n", k, tile.end, runplanes, Gsz.z);
00535       int y;
00536       computedplanes++; // track work done by this GPU for progress reporting
00537 
00538 #if 1
00539       cudaMemset(devdensity, 0, volmemsz);
00540       if (hosttexmap != NULL)
00541         cudaMemset(devtexmap, 0, 3*volmemsz);
00542       CUERR // check and clear any existing errors
00543 #else
00544       // XXX no need for this currently... 
00545       // Copy density grid into GPU padded input
00546       for (z=k; z<k+runplanes; z++) {
00547         for (y=0; y<numcol; y++) {
00548           long densaddr = z*numcol*numpt + y*numpt;
00549           memcpy(&hostdensity[(z-k)*volsize.x*volsize.y + y*volsize.x], 
00550                  &volmap[densaddr], numpt * sizeof(float));
00551           if (hosttexmap != NULL)
00552             memcpy(&hosttexmap[((z-k)*volsize.x*volsize.y + y*volsize.x)*3], 
00553                    &voltexmap[densaddr*3], 3 * numpt * sizeof(float));
00554         }
00555       }
00556 
00557       // Copy the Host input data to the GPU..
00558       cudaMemcpy(devdensity, hostdensity, volmemsz, cudaMemcpyHostToDevice);
00559       if (devtexmap != NULL)
00560         cudaMemcpy(devtexmap, hosttexmap, volmemsz*3, cudaMemcpyHostToDevice);
00561       CUERR // check and clear any existing errors
00562 #endif
00563 
00564       lasttime = wkf_timer_timenow(timer);
00565       // RUN the kernel...
00566       if (devtexmap != NULL) {
00567         gaussdensity_direct_tex<<<Gsz, Bsz, 0>>>(natoms, 
00568                                           (float4*) devatoms,
00569                                           (float4*) devcolors,
00570                                           gridspacing, k,
00571                                           devdensity, 
00572                                           (float3*) devtexmap,
00573                                           1.0f / isovalue);
00574       } else {
00575         gaussdensity_direct<<<Gsz, Bsz, 0>>>(natoms, 
00576                                       (float4*) devatoms,
00577                                       gridspacing, k, devdensity);
00578       }
00579       cudaThreadSynchronize();
00580       CUERR // check and clear any existing errors
00581 
00582       // Copy the GPU output data back to the host and use/store it..
00583       cudaMemcpy(hostdensity, devdensity, volmemsz, cudaMemcpyDeviceToHost);
00584       if (devtexmap != NULL)
00585         cudaMemcpy(hosttexmap, devtexmap, volmemsz*3, cudaMemcpyDeviceToHost);
00586       CUERR // check and clear any existing errors
00587 
00588       // Copy GPU blocksize padded array back down to the original size
00589       for (z=k; z<k+runplanes; z++) {
00590         for (y=0; y<numcol; y++) {
00591           long densaddr = z*numcol*numpt + y*numpt;
00592           memcpy(&volmap[densaddr], 
00593                  &hostdensity[(z-k)*volsize.x*volsize.y + y*volsize.x], 
00594                  numpt * sizeof(float));
00595           if (hosttexmap != NULL) 
00596             memcpy(&voltexmap[densaddr*3], 
00597                    &hosttexmap[((z-k)*volsize.x*volsize.y + y*volsize.x)*3], 
00598                    3 * numpt * sizeof(float));
00599         }
00600       }
00601  
00602       totaltime = wkf_timer_timenow(timer);
00603       if (wkf_msg_timer_timeout(msgt)) {
00604         // XXX: we have to use printf here as msgInfo is not thread-safe yet.
00605         printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00606                threadid, k, numplane, computedplanes,
00607                totaltime - lasttime, totaltime,
00608                totaltime * numplane / (k+1));
00609       }
00610     }
00611   }
00612 
00613   wkf_timer_destroy(timer); // free timer
00614   wkf_msg_timer_destroy(msgt); // free timer
00615   free(hostdensity);    // free working buffer
00616   if (hosttexmap != NULL)
00617     free(hosttexmap);    // free working buffer
00618   cudaFree(devdensity); // free CUDA memory buffer
00619   if (devtexmap != NULL)
00620     cudaFree(devtexmap); // free CUDA memory buffer
00621   if (devatoms != NULL)
00622     cudaFree(devatoms);
00623   if (devradii != NULL)
00624     cudaFree(devradii);
00625   if (devcolors != NULL)
00626     cudaFree(devcolors);
00627 
00628   CUERR // check and clear any existing errors
00629 
00630   return NULL;
00631 }
00632 
00633 
00634 //
00635 // Linear-time density kernels that use spatial hashing of atoms 
00636 // into a uniform grid of atom bins to reduce the number of 
00637 // density computations by truncating the gaussian to a given radius
00638 // and only considering bins of atoms that fall within that radius.
00639 //
00640 #include <thrust/sort.h> // need thrust sorting primitives
00641 
00642 #define GRID_CELL_EMPTY 0xffffffff
00643 
00644 // calculate cell address as the hash value for each atom
00645 __global__ static void hashAtoms(unsigned int natoms,
00646                           const float4 *xyzr,
00647                           int3 numvoxels,
00648                           float invgridspacing,
00649                           unsigned int *atomIndex,
00650                           unsigned int *atomHash) {
00651   unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00652   if (index >= natoms)
00653     return;
00654 
00655   float4 atom = xyzr[index];
00656 
00657   int3 cell;
00658   cell.x = atom.x * invgridspacing;
00659   cell.y = atom.y * invgridspacing;
00660   cell.z = atom.z * invgridspacing;
00661 
00662   cell.x = min(cell.x, numvoxels.x-1);
00663   cell.y = min(cell.y, numvoxels.y-1);
00664   cell.z = min(cell.z, numvoxels.z-1);
00665 
00666   unsigned int hash = (cell.z * numvoxels.y * numvoxels.x) + 
00667                       (cell.y * numvoxels.x) + cell.x;
00668 
00669   atomIndex[index] = index; // original atom index
00670   atomHash[index] = hash;   // atoms hashed to voxel address
00671 }
00672 
00673 
00674 // build cell lists and reorder atoms and colors using sorted atom index list
00675 __global__ static void sortAtomsGenCellLists(unsigned int natoms,
00676                            const float4 *xyzr_d,
00677                            const float4 *color_d,
00678                            const unsigned int *atomIndex_d,
00679                            const unsigned int *atomHash_d,
00680                            float4 *sorted_xyzr_d,
00681                            float4 *sorted_color_d,
00682                            uint2 *cellStartEnd_d) {
00683   extern __shared__ unsigned int hash_s[]; // blockSize + 1 elements
00684   unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00685   unsigned int hash;
00686 
00687   if (index < natoms) {
00688     hash = atomHash_d[index];
00689     hash_s[threadIdx.x+1] = hash; // use smem to avoid redundant loads
00690     if (index > 0 && threadIdx.x == 0) {
00691       // first thread in block must load neighbor particle hash
00692       hash_s[0] = atomHash_d[index-1];
00693     }
00694   }
00695 
00696   __syncthreads();
00697 
00698   if (index < natoms) {
00699     // Since atoms are sorted, if this atom has a different cell
00700     // than its predecessor, it is the first atom in its cell, and 
00701     // it's index marks the end of the previous cell.
00702     if (index == 0 || hash != hash_s[threadIdx.x]) {
00703       cellStartEnd_d[hash].x = index; // set start
00704       if (index > 0)
00705         cellStartEnd_d[hash_s[threadIdx.x]].y = index; // set end
00706     }
00707 
00708     if (index == natoms - 1) {
00709       cellStartEnd_d[hash].y = index + 1; // set end
00710     }
00711 
00712     // Reorder atoms according to sorted indices
00713     unsigned int sortedIndex = atomIndex_d[index];
00714     float4 pos = xyzr_d[sortedIndex];
00715     sorted_xyzr_d[index] = pos;
00716  
00717     // Reorder colors according to sorted indices, if provided
00718     if (color_d != NULL) {
00719       float4 col = color_d[sortedIndex];
00720       sorted_color_d[index] = col;
00721     }
00722   }
00723 }
00724 
00725 
00726 static int vmd_cuda_build_density_atom_grid(int natoms,
00727                                      const float4 * xyzr_d,
00728                                      const float4 * color_d,
00729                                      float4 * sorted_xyzr_d,
00730                                      float4 * sorted_color_d,
00731                                      unsigned int *atomIndex_d,
00732                                      unsigned int *atomHash_d,
00733                                      uint2 * cellStartEnd_d,
00734                                      int3 volsz,
00735                                      float invgridspacing) {
00736 
00737   // Compute block and grid sizes to use for various kernels
00738   dim3 hBsz(256, 1, 1);
00739 
00740   // If we have a very large atom count, we must either use 
00741   // larger thread blocks, or use multi-dimensional grids of thread blocks. 
00742   // We can use up to 65535 blocks in a 1-D grid, so we can use
00743   // 256-thread blocks for less than 16776960 atoms, and use 512-thread
00744   // blocks for up to 33553920 atoms.  Beyond that, we have to use 2-D grids
00745   // and modified kernels.
00746   if (natoms > 16000000)
00747     hBsz.x = 512; // this will get us 
00748 
00749   dim3 hGsz(((natoms+hBsz.x-1) / hBsz.x), 1, 1);
00750 
00751   // Compute grid cell address as atom hash
00752   // XXX need to use 2-D indexing for large atom counts or we exceed the
00753   //     per-dimension 65535 block grid size limitation
00754   hashAtoms<<<hGsz, hBsz>>>(natoms, xyzr_d, volsz, invgridspacing,
00755                             atomIndex_d, atomHash_d);
00756 
00757   // Sort atom indices by their grid cell address
00758   // (wrapping the device pointers with vector iterators)
00759 #if 1
00760   // XXX It is common to encounter thrust memory allocation issues, so 
00761   //     we have to catch thrown exceptions here, otherwise we're guaranteed
00762   //     to eventually have a crash.  If we get a failure, we have to bomb
00763   //     out entirely and fall back to the CPU.
00764   try {
00765     thrust::sort_by_key(thrust::device_ptr<unsigned int>(atomHash_d),
00766                         thrust::device_ptr<unsigned int>(atomHash_d + natoms),
00767                         thrust::device_ptr<unsigned int>(atomIndex_d));
00768   } 
00769   catch (std::bad_alloc) {
00770     printf("CUDA Thrust memory allocation failed: %s line %d\n", __FILE__, __LINE__);
00771     return -1;
00772   }
00773 #else
00774   thrust::sort_by_key(thrust::device_ptr<unsigned int>(atomHash_d),
00775                       thrust::device_ptr<unsigned int>(atomHash_d + natoms),
00776                       thrust::device_ptr<unsigned int>(atomIndex_d));
00777 #endif
00778 
00779   // Initialize all cells to empty
00780   int ncells = volsz.x * volsz.y * volsz.z;
00781   cudaMemset(cellStartEnd_d, GRID_CELL_EMPTY, ncells*sizeof(uint2));
00782 
00783   // Reorder atoms into sorted order and find start and end of each cell
00784   // XXX need to use 2-D indexing for large atom counts or we exceed the
00785   //     per-dimension 65535 block grid size limitation
00786   unsigned int smemSize = sizeof(unsigned int)*(hBsz.x+1);
00787   sortAtomsGenCellLists<<<hGsz, hBsz, smemSize>>>(
00788                        natoms, xyzr_d, color_d, atomIndex_d, atomHash_d, 
00789                        sorted_xyzr_d, sorted_color_d, cellStartEnd_d);
00790 
00791 #if 1
00792   // XXX when the code is ready for production use, we can disable
00793   //     detailed error checking and use a more all-or-nothing approach
00794   //     where errors fall through all of the CUDA API calls until the
00795   //     end and we do the cleanup only at the end.
00796   cudaThreadSynchronize();
00797   cudaError_t err = cudaGetLastError();
00798   if (err != cudaSuccess) {
00799     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
00800     return -1;
00801   }
00802 #endif
00803 
00804   return 0;
00805 }
00806 
00807 
00808 // 
00809 // Parameters for linear-time range-limited gaussian density kernels
00810 //
00811 #define GGRIDSZ   8.0f
00812 #define GBLOCKSZX 8
00813 #define GBLOCKSZY 8
00814 
00815 #if 1
00816 #define GTEXBLOCKSZZ 2
00817 #define GTEXUNROLL   4
00818 #define GBLOCKSZZ    2
00819 #define GUNROLL      4
00820 #else
00821 #define GTEXBLOCKSZZ 8
00822 #define GTEXUNROLL   1
00823 #define GBLOCKSZZ    8
00824 #define GUNROLL      1
00825 #endif
00826 
00827 __global__ static void gaussdensity_fast_tex(int natoms,
00828                                          const float4 *sorted_xyzr, 
00829                                          const float4 *sorted_color, 
00830                                          int3 numvoxels,
00831                                          int3 acncells,
00832                                          float acgridspacing,
00833                                          float invacgridspacing,
00834                                          const uint2 * cellStartEnd,
00835                                          float gridspacing, unsigned int z, 
00836                                          float *densitygrid,
00837                                          float3 *voltexmap,
00838                                          float invisovalue) {
00839   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00840   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00841   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00842 
00843   // shave register use slightly
00844   unsigned int outaddr = zindex * numvoxels.x * numvoxels.y + 
00845                          yindex * numvoxels.x + xindex;
00846 
00847   // early exit if this thread is outside of the grid bounds
00848   if (xindex >= numvoxels.x || yindex >= numvoxels.y || zindex >= numvoxels.z)
00849     return;
00850 
00851   zindex += z;
00852 
00853   // compute ac grid index of lower corner minus gaussian radius
00854   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00855   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00856   int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00857 
00858   // compute ac grid index of upper corner plus gaussian radius
00859   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00860   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00861   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00862 
00863   xabmin = (xabmin < 0) ? 0 : xabmin;
00864   yabmin = (yabmin < 0) ? 0 : yabmin;
00865   zabmin = (zabmin < 0) ? 0 : zabmin;
00866   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00867   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00868   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00869 
00870   float coorx = gridspacing * xindex;
00871   float coory = gridspacing * yindex;
00872   float coorz = gridspacing * zindex;
00873 
00874   float densityval1=0.0f;
00875   float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00876 #if GTEXUNROLL >= 2
00877   float densityval2=0.0f;
00878   float3 densitycol2=densitycol1;
00879 #endif
00880 #if GTEXUNROLL >= 4
00881   float densityval3=0.0f;
00882   float3 densitycol3=densitycol1;
00883   float densityval4=0.0f;
00884   float3 densitycol4=densitycol1;
00885 #endif
00886 
00887   int acplanesz = acncells.x * acncells.y;
00888   int xab, yab, zab;
00889   for (zab=zabmin; zab<=zabmax; zab++) {
00890     for (yab=yabmin; yab<=yabmax; yab++) {
00891       for (xab=xabmin; xab<=xabmax; xab++) {
00892         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00893         uint2 atomstartend = cellStartEnd[abcellidx];
00894         if (atomstartend.x != GRID_CELL_EMPTY) {
00895           unsigned int atomid;
00896           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00897             float4 atom  = sorted_xyzr[atomid];
00898             float4 color = sorted_color[atomid];
00899             float dx = coorx - atom.x;
00900             float dy = coory - atom.y;
00901             float dxy2 = dx*dx + dy*dy;
00902 
00903             float dz = coorz - atom.z;
00904             float r21 = (dxy2 + dz*dz) * atom.w;
00905             float tmp1 = exp2f(r21);
00906             densityval1 += tmp1;
00907             tmp1 *= invisovalue;
00908             densitycol1.x += tmp1 * color.x;
00909             densitycol1.y += tmp1 * color.y;
00910             densitycol1.z += tmp1 * color.z;
00911 
00912 #if GTEXUNROLL >= 2
00913             float dz2 = dz + gridspacing;
00914             float r22 = (dxy2 + dz2*dz2) * atom.w;
00915             float tmp2 = exp2f(r22);
00916             densityval2 += tmp2;
00917             tmp2 *= invisovalue;
00918             densitycol2.x += tmp2 * color.x;
00919             densitycol2.y += tmp2 * color.y;
00920             densitycol2.z += tmp2 * color.z;
00921 #endif
00922 #if GTEXUNROLL >= 4
00923             float dz3 = dz2 + gridspacing;
00924             float r23 = (dxy2 + dz3*dz3) * atom.w;
00925             float tmp3 = exp2f(r23);
00926             densityval3 += tmp3;
00927             tmp3 *= invisovalue;
00928             densitycol3.x += tmp3 * color.x;
00929             densitycol3.y += tmp3 * color.y;
00930             densitycol3.z += tmp3 * color.z;
00931 
00932             float dz4 = dz3 + gridspacing;
00933             float r24 = (dxy2 + dz4*dz4) * atom.w;
00934             float tmp4 = exp2f(r24);
00935             densityval4 += tmp4;
00936             tmp4 *= invisovalue;
00937             densitycol4.x += tmp4 * color.x;
00938             densitycol4.y += tmp4 * color.y;
00939             densitycol4.z += tmp4 * color.z;
00940 #endif
00941           }
00942         }
00943       }
00944     }
00945   }
00946 
00947   densitygrid[outaddr          ] = densityval1;
00948   voltexmap[outaddr          ].x = densitycol1.x;
00949   voltexmap[outaddr          ].y = densitycol1.y;
00950   voltexmap[outaddr          ].z = densitycol1.z;
00951 
00952 #if GTEXUNROLL >= 2
00953   int planesz = numvoxels.x * numvoxels.y;
00954   densitygrid[outaddr + planesz] = densityval2;
00955   voltexmap[outaddr + planesz].x = densitycol2.x;
00956   voltexmap[outaddr + planesz].y = densitycol2.y;
00957   voltexmap[outaddr + planesz].z = densitycol2.z;
00958 #endif
00959 #if GTEXUNROLL >= 4
00960   densitygrid[outaddr + 2*planesz] = densityval3;
00961   voltexmap[outaddr + 2*planesz].x = densitycol3.x;
00962   voltexmap[outaddr + 2*planesz].y = densitycol3.y;
00963   voltexmap[outaddr + 2*planesz].z = densitycol3.z;
00964 
00965   densitygrid[outaddr + 3*planesz] = densityval4;
00966   voltexmap[outaddr + 3*planesz].x = densitycol4.x;
00967   voltexmap[outaddr + 3*planesz].y = densitycol4.y;
00968   voltexmap[outaddr + 3*planesz].z = densitycol4.z;
00969 #endif
00970 }
00971 
00972 
00973 __global__ static void gaussdensity_fast(int natoms,
00974                                          const float4 *sorted_xyzr, 
00975                                          int3 numvoxels,
00976                                          int3 acncells,
00977                                          float acgridspacing,
00978                                          float invacgridspacing,
00979                                          const uint2 * cellStartEnd,
00980                                          float gridspacing, unsigned int z, 
00981                                          float *densitygrid) {
00982   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00983   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00984   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00985   unsigned int outaddr = zindex * numvoxels.x * numvoxels.y + 
00986                          yindex * numvoxels.x + 
00987                          xindex;
00988 
00989   // early exit if this thread is outside of the grid bounds
00990   if (xindex >= numvoxels.x || yindex >= numvoxels.y || zindex >= numvoxels.z)
00991     return;
00992 
00993   zindex += z;
00994 
00995   // compute ac grid index of lower corner minus gaussian radius
00996   int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00997   int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00998   int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00999 
01000   // compute ac grid index of upper corner plus gaussian radius
01001   int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
01002   int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
01003   int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
01004 
01005   xabmin = (xabmin < 0) ? 0 : xabmin;
01006   yabmin = (yabmin < 0) ? 0 : yabmin;
01007   zabmin = (zabmin < 0) ? 0 : zabmin;
01008   xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
01009   yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
01010   zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
01011 
01012   float coorx = gridspacing * xindex;
01013   float coory = gridspacing * yindex;
01014   float coorz = gridspacing * zindex;
01015 
01016   float densityval1=0.0f;
01017 #if GUNROLL >= 2
01018   float densityval2=0.0f;
01019 #endif
01020 #if GUNROLL >= 4
01021   float densityval3=0.0f;
01022   float densityval4=0.0f;
01023 #endif
01024 
01025   int acplanesz = acncells.x * acncells.y;
01026   int xab, yab, zab;
01027   for (zab=zabmin; zab<=zabmax; zab++) {
01028     for (yab=yabmin; yab<=yabmax; yab++) {
01029       for (xab=xabmin; xab<=xabmax; xab++) {
01030         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
01031         uint2 atomstartend = cellStartEnd[abcellidx];
01032         if (atomstartend.x != GRID_CELL_EMPTY) {
01033           unsigned int atomid;
01034           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
01035             float4 atom = sorted_xyzr[atomid];
01036             float dx = coorx - atom.x;
01037             float dy = coory - atom.y;
01038             float dxy2 = dx*dx + dy*dy;
01039   
01040             float dz = coorz - atom.z;
01041             float r21 = (dxy2 + dz*dz) * atom.w;
01042             densityval1 += exp2f(r21);
01043 
01044 #if GUNROLL >= 2
01045             float dz2 = dz + gridspacing;
01046             float r22 = (dxy2 + dz2*dz2) * atom.w;
01047             densityval2 += exp2f(r22);
01048 #endif
01049 #if GUNROLL >= 4
01050             float dz3 = dz2 + gridspacing;
01051             float r23 = (dxy2 + dz3*dz3) * atom.w;
01052             densityval3 += exp2f(r23);
01053 
01054             float dz4 = dz3 + gridspacing;
01055             float r24 = (dxy2 + dz4*dz4) * atom.w;
01056             densityval4 += exp2f(r24);
01057 #endif
01058           }
01059         }
01060       }
01061     }
01062   }
01063 
01064   densitygrid[outaddr            ] = densityval1;
01065 #if GUNROLL >= 2
01066   int planesz = numvoxels.x * numvoxels.y;
01067   densitygrid[outaddr +   planesz] = densityval2;
01068 #endif
01069 #if GUNROLL >= 4
01070   densitygrid[outaddr + 2*planesz] = densityval3;
01071   densitygrid[outaddr + 3*planesz] = densityval4;
01072 #endif
01073 }
01074 
01075 
01076 // per-GPU handle with various memory buffer pointers, etc.
01077 typedef struct {
01079   long int natoms;
01080   int colorperatom;
01081   int gx;
01082   int gy;
01083   int gz;
01084 
01085   CUDAMarchingCubes *mc;     
01086 
01087   float *devdensity;         
01088   float *devvoltexmap;       
01089   float4 *xyzr_d;            
01090   float4 *sorted_xyzr_d;     
01091   float4 *color_d;           
01092   float4 *sorted_color_d;    
01093 
01094   unsigned int *atomIndex_d; 
01095   unsigned int *atomHash_d;  
01096   uint2 *cellStartEnd_d;     
01097 
01098   void *safety;
01099   float *v3f_d;
01100   float *n3f_d;
01101   float *c3f_d;
01102 
01103 } qsurf_gpuhandle;
01104 
01105 
01106 CUDAQuickSurf::CUDAQuickSurf() {
01107   voidgpu = calloc(1, sizeof(qsurf_gpuhandle));
01108 //  qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01109 }
01110 
01111 
01112 CUDAQuickSurf::~CUDAQuickSurf() {
01113   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01114 
01115   // free all working buffers if not done already
01116   free_bufs();
01117 
01118   // delete marching cubes object
01119   delete gpuh->mc;
01120 
01121   free(voidgpu);
01122 }
01123 
01124 
01125 int CUDAQuickSurf::free_bufs() {
01126   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01127 
01128   // zero out max buffer capacities
01129   gpuh->natoms = 0;
01130   gpuh->colorperatom = 0;
01131   gpuh->gx = 0;
01132   gpuh->gy = 0;
01133   gpuh->gz = 0;
01134 
01135   if (gpuh->safety != NULL)
01136     cudaFree(gpuh->safety);
01137   gpuh->safety=NULL;
01138 
01139   if (gpuh->devdensity != NULL)
01140     cudaFree(gpuh->devdensity);
01141   gpuh->devdensity=NULL;
01142 
01143   if (gpuh->devvoltexmap != NULL)
01144     cudaFree(gpuh->devvoltexmap);
01145   gpuh->devvoltexmap=NULL;
01146 
01147   if (gpuh->xyzr_d != NULL)
01148     cudaFree(gpuh->xyzr_d);
01149   gpuh->xyzr_d=NULL;
01150 
01151   if (gpuh->sorted_xyzr_d != NULL)
01152     cudaFree(gpuh->sorted_xyzr_d);  
01153   gpuh->sorted_xyzr_d=NULL;
01154 
01155   if (gpuh->color_d != NULL)
01156     cudaFree(gpuh->color_d);
01157   gpuh->color_d=NULL;
01158 
01159   if (gpuh->sorted_color_d != NULL)
01160     cudaFree(gpuh->sorted_color_d);
01161   gpuh->sorted_color_d=NULL;
01162 
01163   if (gpuh->atomIndex_d != NULL)
01164     cudaFree(gpuh->atomIndex_d);
01165   gpuh->atomIndex_d=NULL;
01166 
01167   if (gpuh->atomHash_d != NULL)
01168     cudaFree(gpuh->atomHash_d);
01169   gpuh->atomHash_d=NULL;
01170 
01171   if (gpuh->cellStartEnd_d != NULL)
01172     cudaFree(gpuh->cellStartEnd_d);
01173   gpuh->cellStartEnd_d=NULL;
01174 
01175   if (gpuh->v3f_d != NULL)
01176     cudaFree(gpuh->v3f_d);
01177   gpuh->v3f_d=NULL;
01178 
01179   if (gpuh->n3f_d != NULL)
01180     cudaFree(gpuh->n3f_d);
01181   gpuh->n3f_d=NULL;
01182 
01183   if (gpuh->c3f_d != NULL)
01184     cudaFree(gpuh->c3f_d);
01185   gpuh->c3f_d=NULL;
01186 
01187   return 0;
01188 }
01189 
01190 
01191 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom, 
01192                               int gx, int gy, int gz) {
01193   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01194 
01195   // If the current atom count, texturing mode, and total voxel count
01196   // use the same or less storage than the size of the existing buffers,
01197   // we can reuse the same buffers without having to go through the 
01198   // complex allocation and validation loops.  This is a big performance
01199   // benefit during trajectory animation.
01200   if (natoms <= gpuh->natoms &&
01201       colorperatom <= gpuh->colorperatom &&
01202       (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz))
01203     return 0;
01204  
01205   return -1; // no existing bufs, or too small to be used 
01206 }
01207 
01208 
01209 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom, 
01210                               int gx, int gy, int gz) {
01211   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01212 
01213   // early exit from allocation call if we've already got existing
01214   // buffers that are large enough to support the request
01215   if (check_bufs(natoms, colorperatom, gx, gy, gz) == 0)
01216     return 0;
01217 
01218   // If we have any existing allocations, trash them as they weren't
01219   // usable for this new request and we need to reallocate them from scratch
01220   free_bufs();
01221 
01222   long int ncells = gx * gy * gz;
01223   long int volmemsz = ncells * sizeof(float);
01224 
01225   // Allocate all of the memory buffers our algorithms will need up-front,
01226   // so we can retry and gracefully reduce the sizes of various buffers
01227   // to attempt to fit within available GPU memory 
01228   cudaMalloc((void**)&gpuh->devdensity, volmemsz);
01229   if (colorperatom) {
01230     cudaMalloc((void**)&gpuh->devvoltexmap, 3*volmemsz);
01231     cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4));
01232     cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4));
01233   }
01234   cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4));
01235   cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4));
01236   cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int));
01237   cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int));
01238   cudaMalloc((void**)&gpuh->cellStartEnd_d, ncells * sizeof(uint2));
01239 
01240   // allocate marching cubes output buffers
01241   int chunkmaxverts = 3 * ncells;
01242   cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3));
01243   cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3));
01244   cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3));
01245 
01246   // Allocate an extra phantom array to act as a safety net to
01247   // ensure that subsequent allocations performed internally by 
01248   // the NVIDIA thrust template library or by our 
01249   // marching cubes implementation don't fail, since we can't 
01250   // currently pre-allocate all of them.
01251   cudaMalloc(&gpuh->safety, 
01252              natoms*sizeof(float4) +                          // thrust
01253              8 * gx * gy * sizeof(float) +                    // thrust
01254              CUDAMarchingCubes::MemUsageMC(gx, gy, gz));      // mcubes
01255 
01256   cudaError_t err = cudaGetLastError();
01257   if (err != cudaSuccess)
01258     return -1;
01259 
01260   // once the allocation has succeeded, we update the GPU handle info 
01261   // so that the next test/allocation pass knows the latest state.
01262   gpuh->natoms = natoms;
01263   gpuh->colorperatom = colorperatom;
01264   gpuh->gx = gx;
01265   gpuh->gy = gy;
01266   gpuh->gz = gz;
01267 
01268   return 0;
01269 }
01270 
01271 
01272 int CUDAQuickSurf::get_chunk_bufs(int testexisting,
01273                                   long int natoms, int colorperatom, 
01274                                   int gx, int gy, int gz,
01275                                   int &cx, int &cy, int &cz,
01276                                   int &sx, int &sy, int &sz) {
01277   dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01278   if (colorperatom)
01279     Bsz.z = GTEXBLOCKSZZ;
01280 
01281   cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds
01282 
01283   // enter loop to attempt a single-pass computation, but if the
01284   // allocation fails, cut the chunk size Z dimension by half
01285   // repeatedly until we either run chunks of 8 planes at a time,
01286   // otherwise we assume it is hopeless.
01287   cz <<= 1; // premultiply by two to simplify loop body
01288   int chunkiters = 0;
01289   int chunkallocated = 0;
01290   while (!chunkallocated) {
01291     // Cut the Z chunk size in half
01292     chunkiters++;
01293     cz >>= 1;
01294 
01295     // if we've already dropped to a subvolume size, subtract off the
01296     // four extra Z planes from last time before we do the modulo padding
01297     // calculation so we don't hit an infinite loop trying to go below 
01298     // 16 planes due the padding math below.
01299     if (cz != gz)
01300       cz-=4;
01301 
01302     // Pad the chunk to a multiple of the computational tile size since
01303     // each thread computes multiple elements (unrolled in the Z direction)
01304     cz += (8 - (cz % 8));
01305 
01306     // The density map "slab" size is the chunk size but without the extra
01307     // plane used to copy the last plane of the previous chunk's density
01308     // into the start, for use by the marching cubes.
01309     sx = cx;
01310     sy = cy;
01311     sz = cz;
01312 
01313     // Add four extra Z-planes for copying the previous end planes into 
01314     // the start of the next chunk.
01315     cz+=4;
01316 
01317 #if 0
01318     printf("  Trying slab size: %d (test: %d)\n", sz, testexisting);
01319 #endif
01320 
01321 #if 1
01322     // test to see if total number of thread blocks exceeds maximum
01323     // number we can reasonably run prior to a kernel timeout error
01324     dim3 tGsz((sx+Bsz.x-1) / Bsz.x, 
01325               (sy+Bsz.y-1) / Bsz.y,
01326               (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01327     if (colorperatom) {
01328       tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01329     }
01330     if (tGsz.x * tGsz.y * tGsz.z > 65535)
01331       continue; 
01332 #endif
01333 
01334     // Bail out if we can't get enough memory to run at least
01335     // 8 slices in a single pass (making sure we've freed any allocations
01336     // beforehand, so they aren't leaked).
01337     if (sz <= 8) {
01338       return -1;
01339     }
01340  
01341     if (testexisting) {
01342       if (check_bufs(natoms, colorperatom, cx, cy, cz) != 0)
01343         continue;
01344     } else {
01345       if (alloc_bufs(natoms, colorperatom, cx, cy, cz) != 0)
01346         continue;
01347     }
01348 
01349     chunkallocated=1;
01350   }
01351 
01352   return 0;
01353 }
01354 
01355 
01356 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f, 
01357                              const float *colors_f,
01358                              int colorperatom,
01359                              float *origin, int *numvoxels, float maxrad,
01360                              float radscale, float gridspacing, 
01361                              float isovalue, float gausslim,
01362                              VMDDisplayList *cmdList) {
01363   qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01364 
01365   float4 *colors = (float4 *) colors_f;
01366   int3 volsz;
01367   volsz.x = numvoxels[0];
01368   volsz.y = numvoxels[1];
01369   volsz.z = numvoxels[2];
01370 
01371   int chunkmaxverts=0;
01372   int chunknumverts=0; 
01373   int numverts=0;
01374   int numfacets=0;
01375 
01376   wkf_timerhandle globaltimer = wkf_timer_create();
01377   wkf_timer_start(globaltimer);
01378 
01379   cudaError_t err;
01380   cudaDeviceProp deviceProp;
01381   int dev;
01382   if (cudaGetDevice(&dev) != cudaSuccess) {
01383     wkf_timer_destroy(globaltimer);
01384     return -1;
01385   }
01386  
01387   memset(&deviceProp, 0, sizeof(cudaDeviceProp));
01388   
01389   if (cudaGetDeviceProperties(&deviceProp, dev) != cudaSuccess) {
01390     wkf_timer_destroy(globaltimer);
01391     err = cudaGetLastError(); // eat error so next CUDA op succeeds
01392     return -1;
01393   }
01394 
01395   // This code currently requires compute capability 1.3 or 2.x
01396   // because we absolutely depend on hardware broadcasts for 
01397   // global memory reads by multiple threads reading the same element,
01398   // and the code more generally assumes the Fermi L1 cache and prefers
01399   // to launch 3-D grids where possible.  The current code will run on 
01400   // GT200 with reasonable performance so we allow it currently.  More
01401   // testing will be needed to determine if laptop integrated 
01402   // GT200 devices are truly fast enough to outrun quad core CPUs or
01403   // if we should trigger CPU fallback on devices with smaller SM counts.
01404   if ((deviceProp.major < 2) &&
01405       ((deviceProp.major == 1) && (deviceProp.minor < 3))) {
01406     wkf_timer_destroy(globaltimer);
01407     return -1;
01408   }
01409 
01410   // compute grid spacing for the acceleration grid
01411   float acgridspacing = gausslim * radscale * maxrad;
01412 
01413   // ensure acceleration grid spacing >= density grid spacing
01414   if (acgridspacing < gridspacing)
01415     acgridspacing = gridspacing;
01416 
01417   // Allocate output arrays for the gaussian density map and 3-D texture map
01418   // We test for errors carefully here since this is the most likely place
01419   // for a memory allocation failure due to the size of the grid.
01420   int3 chunksz = volsz;
01421   int3 slabsz = volsz;
01422 
01423   int3 accelcells;
01424   accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01425   accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01426   accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01427 
01428   dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01429   if (colorperatom)
01430     Bsz.z = GTEXBLOCKSZZ;
01431 
01432   // check to see if it's possible to use an existing allocation,
01433   // if so, just leave things as they are, and do the computation 
01434   // using the existing buffers
01435   if (gpuh->natoms == 0 ||
01436       get_chunk_bufs(1, natoms, colorperatom,
01437                      volsz.x, volsz.y, volsz.z,
01438                      chunksz.x, chunksz.y, chunksz.z,
01439                      slabsz.x, slabsz.y, slabsz.z) == -1) {
01440     // reset the chunksz and slabsz after failing to try and
01441     // fit them into the existing allocations...
01442     chunksz = volsz;
01443     slabsz = volsz;
01444 
01445     // reallocate the chunk buffers from scratch since we weren't
01446     // able to reuse them
01447     if (get_chunk_bufs(0, natoms, colorperatom,
01448                        volsz.x, volsz.y, volsz.z,
01449                        chunksz.x, chunksz.y, chunksz.z,
01450                        slabsz.x, slabsz.y, slabsz.z) == -1) {
01451       wkf_timer_destroy(globaltimer);
01452       return -1;
01453     }
01454   }
01455   chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z;
01456 
01457   // Free the "safety padding" memory we allocate to ensure we dont
01458   // have trouble with thrust calls that allocate their own memory later
01459   if (gpuh->safety != NULL)
01460     cudaFree(gpuh->safety);
01461   gpuh->safety = NULL;
01462 
01463 #if 0
01464   if (chunkiters > 1)
01465     printf("  Using GPU chunk size: %d\n", chunksz.z);
01466 
01467   printf("  Accel grid(%d, %d, %d) spacing %f\n",
01468          accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01469 #endif
01470 
01471   // pre-process the atom coordinates and radii as needed
01472   // short-term fix until a new CUDA kernel takes care of this
01473   int i, i4;
01474   float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01475   float log2e = log2(2.718281828);
01476   for (i=0,i4=0; i<natoms; i++,i4+=4) {
01477     xyzr[i].x = xyzr_f[i4    ];
01478     xyzr[i].y = xyzr_f[i4 + 1];
01479     xyzr[i].z = xyzr_f[i4 + 2];
01480 
01481     float scaledrad = xyzr_f[i4 + 3] * radscale;
01482     float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01483 
01484     xyzr[i].w = arinv;
01485   }
01486   cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01487   free(xyzr);
01488 
01489   if (colorperatom)
01490     cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01491  
01492   // build uniform grid acceleration structure
01493   if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d,
01494                                        gpuh->sorted_xyzr_d,
01495                                        gpuh->sorted_color_d,
01496                                        gpuh->atomIndex_d, gpuh->atomHash_d,
01497                                        gpuh->cellStartEnd_d, 
01498                                        accelcells, 1.0f / acgridspacing) != 0) {
01499     wkf_timer_destroy(globaltimer);
01500     free_bufs();
01501     return -1;
01502   }
01503 
01504   double sorttime = wkf_timer_timenow(globaltimer);
01505   double lastlooptime = sorttime;
01506 
01507   double densitykerneltime = 0.0f;
01508   double densitytime = 0.0f;
01509   double mckerneltime = 0.0f;
01510   double mctime = 0.0f; 
01511   double copycalltime = 0.0f;
01512   double copytime = 0.0f;
01513 
01514   float *volslab_d = NULL;
01515   float *texslab_d = NULL;
01516 
01517   int lzplane = GBLOCKSZZ * GUNROLL;
01518   if (colorperatom)
01519     lzplane = GTEXBLOCKSZZ * GTEXUNROLL;
01520 
01521   // initialize CUDA marching cubes class instance or rebuild it if needed
01522   uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z);
01523   if (gpuh->mc == NULL) {
01524     gpuh->mc = new CUDAMarchingCubes(); 
01525     if (!gpuh->mc->Initialize(mgsz)) {
01526       printf("MC Initialize() failed\n");
01527     }
01528   } else {
01529     uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize();
01530     if (slabsz.x > mcmaxgridsize.x ||
01531         slabsz.y > mcmaxgridsize.y ||
01532         slabsz.z > mcmaxgridsize.z) {
01533       printf("*** Allocating new MC object...\n");
01534       // delete marching cubes object
01535       delete gpuh->mc;
01536 
01537       // create and initialize CUDA marching cubes class instance
01538       gpuh->mc = new CUDAMarchingCubes(); 
01539 
01540       if (!gpuh->mc->Initialize(mgsz)) {
01541         printf("MC Initialize() failed while recreating MC object\n");
01542       }
01543     } 
01544   }
01545 
01546   int z;
01547   int chunkcount=0;
01548   for (z=0; z<volsz.z; z+=slabsz.z) {
01549     int3 curslab = slabsz;
01550     if (z+curslab.z > volsz.z)
01551       curslab.z = volsz.z - z; 
01552   
01553     int slabplanesz = curslab.x * curslab.y;
01554 
01555     dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x, 
01556              (curslab.y+Bsz.y-1) / Bsz.y,
01557              (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01558     if (colorperatom)
01559       Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01560 
01561     // For SM 2.x, we can run the entire slab in one pass by launching
01562     // a 3-D grid of thread blocks.
01563     // If we are running on SM 1.x, we can only launch 1-D grids so we
01564     // must loop over planar grids until we have processed the whole slab.
01565     dim3 Gszslice = Gsz;
01566     if (deviceProp.major < 2)
01567       Gszslice.z = 1;
01568 
01569 #if 0
01570     printf("CUDA device %d, grid size %dx%dx%d\n", 
01571            0, Gsz.x, Gsz.y, Gsz.z);
01572     printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n",
01573            curslab.x, curslab.y, curslab.z,
01574            accelcells.x, accelcells.y, accelcells.z);
01575     printf("Z=%d, curslab.z=%d\n", z, curslab.z);
01576 #endif
01577 
01578     // For all but the first density slab, we copy the last four 
01579     // planes of the previous run into the start of the next run so
01580     // that we can extract the isosurface with no discontinuities
01581     if (z == 0) {
01582       volslab_d = gpuh->devdensity;
01583       if (colorperatom)
01584         texslab_d = gpuh->devvoltexmap;
01585     } else {
01586       cudaMemcpy(gpuh->devdensity,
01587                  volslab_d + (slabsz.z-4) * slabplanesz, 
01588                  4 * slabplanesz * sizeof(float), cudaMemcpyDeviceToDevice);
01589       if (colorperatom)
01590         cudaMemcpy(gpuh->devvoltexmap,
01591                    texslab_d + (slabsz.z-4) * 3 * slabplanesz, 
01592                    4*3 * slabplanesz * sizeof(float), cudaMemcpyDeviceToDevice);
01593 
01594       volslab_d = gpuh->devdensity + (4 * slabplanesz);
01595       if (colorperatom)
01596         texslab_d = gpuh->devvoltexmap + (4 * 3 * slabplanesz);
01597     }
01598 
01599     for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) {
01600       int lzinc = lz * lzplane;
01601       float *volslice_d = volslab_d + lzinc * slabplanesz;
01602 
01603       if (colorperatom) {
01604         float *texslice_d = texslab_d + lzinc * slabplanesz * 3;
01605         gaussdensity_fast_tex<<<Gszslice, Bsz, 0>>>(natoms, 
01606             gpuh->sorted_xyzr_d, gpuh->sorted_color_d, 
01607             curslab, accelcells, acgridspacing,
01608             1.0f / acgridspacing, gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01609             volslice_d, (float3 *) texslice_d, 1.0f / isovalue);
01610       } else {
01611         gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms, 
01612             gpuh->sorted_xyzr_d, 
01613             curslab, accelcells, acgridspacing, 1.0f / acgridspacing, 
01614             gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d);
01615       }
01616     }
01617     cudaThreadSynchronize(); 
01618     densitykerneltime = wkf_timer_timenow(globaltimer);
01619 
01620 #if 0
01621     printf("  CUDA mcubes..."); fflush(stdout);
01622 #endif
01623 
01624     int vsz[3];
01625     vsz[0]=curslab.x;
01626     vsz[1]=curslab.y;
01627     vsz[2]=curslab.z;
01628 
01629     // For all but the first density slab, we copy the last four
01630     // planes of the previous run into the start of the next run so
01631     // that we can extract the isosurface with no discontinuities
01632     if (z != 0)
01633       vsz[2]=curslab.z + 4;
01634 
01635     float bbox[3];
01636     bbox[0] = vsz[0] * gridspacing;
01637     bbox[1] = vsz[1] * gridspacing;
01638     bbox[2] = vsz[2] * gridspacing;
01639 
01640 
01641     float gorigin[3];
01642     gorigin[0] = origin[0];
01643     gorigin[1] = origin[1];
01644     gorigin[2] = origin[2] + (z * gridspacing);
01645 
01646     if (z != 0)
01647       gorigin[2] = origin[2] + ((z-4) * gridspacing);
01648 
01649 #if 0
01650 printf("\n  ... vsz: %d %d %d\n", vsz[0], vsz[1], vsz[2]);
01651 printf("  ... org: %.2f %.2f %.2f\n", gorigin[0], gorigin[1], gorigin[2]);
01652 printf("  ... bxs: %.2f %.2f %.2f\n", bbox[0], bbox[1], bbox[2]);
01653 printf("  ... bbe: %.2f %.2f %.2f\n", 
01654   gorigin[0]+bbox[0], gorigin[1]+bbox[1], gorigin[2]+bbox[2]);
01655 #endif
01656 
01657     // If we are computing the volume using multiple passes, we have to 
01658     // overlap the marching cubes grids and compute a sub-volume to exclude
01659     // the end planes, except for the first and last sub-volume, in order to
01660     // get correct per-vertex normals at the edges of each sub-volume 
01661     int skipstartplane=0;
01662     int skipendplane=0;
01663     if (chunksz.z < volsz.z) {
01664       // on any but the first pass, we skip the first Z plane
01665       if (z != 0)
01666         skipstartplane=1;
01667 
01668       // on any but the last pass, we skip the last Z plane
01669       if (z+curslab.z < volsz.z)
01670         skipendplane=1;
01671     }
01672 
01673     //
01674     // Extract density map isosurface using marching cubes
01675     //
01676     uint3 gvsz = make_uint3(vsz[0], vsz[1], vsz[2]);
01677     float3 gorg = make_float3(gorigin[0], gorigin[1], gorigin[2]);
01678     float3 gbnds = make_float3(bbox[0], bbox[1], bbox[2]);
01679 
01680     gpuh->mc->SetIsovalue(isovalue);
01681     if (!gpuh->mc->SetVolumeData(gpuh->devdensity, gpuh->devvoltexmap, 
01682                                  gvsz, gorg, gbnds, true)) {
01683       printf("MC SetVolumeData() failed\n");
01684     }
01685     // set the sub-volume starting/ending indices if needed
01686     if (skipstartplane || skipendplane) {
01687       uint3 volstart = make_uint3(0, 0, 0);
01688       uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z);
01689 
01690       if (skipstartplane)
01691         volstart.z = 2;
01692 
01693       if (skipendplane)
01694         volend.z = gvsz.z - 2;
01695 
01696       gpuh->mc->SetSubVolume(volstart, volend);
01697     }
01698     gpuh->mc->computeIsosurface((float3 *) gpuh->v3f_d, (float3 *) gpuh->n3f_d, 
01699                                 (float3 *) gpuh->c3f_d, chunkmaxverts);
01700 
01701     chunknumverts = gpuh->mc->GetVertexCount();
01702 
01703 #if 0
01704     printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts);
01705 #endif
01706     if (chunknumverts == chunkmaxverts)
01707       printf("  *** Exceeded marching cubes vertex limit (%d verts)\n", chunknumverts);
01708 
01709     cudaThreadSynchronize(); 
01710     mckerneltime = wkf_timer_timenow(globaltimer);
01711 
01712     // Create a triangle mesh
01713     if (chunknumverts > 0) {
01714       DispCmdTriMesh cmdTriMesh;
01715       if (colorperatom) {
01716         // emit triangle mesh with per-vertex colors
01717         cmdTriMesh.cuda_putdata(gpuh->v3f_d, gpuh->n3f_d, gpuh->c3f_d,
01718                                 chunknumverts/3, cmdList);
01719       } else {
01720         // emit triangle mesh with no colors, uses current rendering state
01721         cmdTriMesh.cuda_putdata(gpuh->v3f_d, gpuh->n3f_d, NULL,
01722                                 chunknumverts/3, cmdList);
01723       }
01724     }
01725     numverts+=chunknumverts;
01726     numfacets+=chunknumverts/3;
01727 
01728 #if 0
01729    // XXX we'll hold onto this as we'll want to rescue this approach
01730    //     for electrostatics coloring where we have to have the 
01731    //     entire triangle mesh in order to do the calculation
01732     int l;
01733     int vertstart = 3 * numverts;
01734     int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float);
01735     int facebufsz = (numverts + chunknumverts) * sizeof(int);
01736     int chunkvertsz = 3 * chunknumverts * sizeof(float);
01737 
01738     v = (float*) realloc(v, vertbufsz);
01739     n = (float*) realloc(n, vertbufsz);
01740     c = (float*) realloc(c, vertbufsz);
01741     f = (int*)   realloc(f, facebufsz);
01742     cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01743     cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01744     if (colorperatom) {
01745       cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01746     } else {
01747       float *color = c+vertstart;
01748       for (l=0; l<chunknumverts*3; l+=3) {
01749         color[l + 0] = colors[0].x;
01750         color[l + 1] = colors[0].y;
01751         color[l + 2] = colors[0].z;
01752       }
01753     }
01754     for (l=numverts; l<numverts+chunknumverts; l++) {
01755       f[l]=l;
01756     }
01757     numverts+=chunknumverts;
01758     numfacets+=chunknumverts/3;
01759 #endif
01760 
01761     copycalltime = wkf_timer_timenow(globaltimer);
01762 
01763     densitytime += densitykerneltime - lastlooptime;
01764     mctime += mckerneltime - densitykerneltime;
01765     copytime += copycalltime - mckerneltime;
01766 
01767     lastlooptime = wkf_timer_timenow(globaltimer);
01768 
01769     chunkcount++; // increment number of chunks processed
01770   }
01771 
01772   // catch any errors that may have occured so that at the very least,
01773   // all of the subsequent resource deallocation calls will succeed
01774   err = cudaGetLastError();
01775 
01776   wkf_timer_stop(globaltimer);
01777   double totalruntime = wkf_timer_time(globaltimer);
01778   wkf_timer_destroy(globaltimer);
01779 
01780   // If an error occured, we print it and return an error code, once
01781   // all of the memory deallocations have completed.
01782   if (err != cudaSuccess) { 
01783     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01784     return -1;
01785   }
01786 
01787 #if 1
01788   printf("  GPU generated %d vertices, %d facets, in %d passes\n", numverts, numfacets, chunkcount);
01789 
01790   printf("  GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n", 
01791          (deviceProp.major == 1 && deviceProp.minor == 3) ? "SM 1.3" : "SM 2.x",
01792          totalruntime, sorttime, densitytime, mctime, copytime);
01793 #endif
01794 
01795   return 0;
01796 }
01797 
01798 
01799 
01800 
01801 

Generated on Wed May 16 01:49:09 2012 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002