CUDAMarchingCubes.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAMarchingCubes.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.44 $       $Date: 2022/03/18 20:39:06 $
00014  *
00015  ***************************************************************************/
00060 #include "CUDAKernels.h"
00061 #define CUDAMARCHINGCUBES_INTERNAL 1
00062 #include "CUDAMarchingCubes.h"
00063 #include <thrust/device_ptr.h>
00064 #include <thrust/scan.h>
00065 #include <thrust/functional.h>
00066 
00067 #include "CUDAParPrefixOps.h"
00068 
00069 //
00070 // Restrict macro to make it easy to do perf tuning tests
00071 //
00072 #if 1 && (CUDART_VERSION >= 4000)
00073 #define RESTRICT __restrict__
00074 #else
00075 #define RESTRICT
00076 #endif
00077 
00078 // The number of threads to use for triangle generation 
00079 // (limited by shared memory size)
00080 #define NTHREADS 48
00081 
00082 // The number of threads to use for all of the other kernels
00083 #define KERNTHREADS 256
00084 
00085 //
00086 // Various math operators for vector types not already 
00087 // provided by the regular CUDA headers
00088 //
00089 // "+" operator
00090 inline __host__ __device__ float3 operator+(float3 a, float3 b) {
00091   return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
00092 }
00093 inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) {
00094   return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
00095 }
00096 inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) {
00097   return make_uint2(a.x + b.x, a.y + b.y);
00098 }
00099 
00100 // "-" operator
00101 inline __host__ __device__ float3 operator-(float3 a, float3 b) {
00102   return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
00103 }
00104 inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) {
00105   return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
00106 }
00107 
00108 // "*" operator
00109 inline __host__ __device__ float3 operator*(float b, float3 a) {
00110   return make_float3(b * a.x, b * a.y, b * a.z);
00111 }
00112 
00113 // dot()
00114 inline __host__ __device__ float dot(float3 a, float3 b) {
00115   return a.x * b.x + a.y * b.y + a.z * b.z;
00116 }
00117 
00118 // fma() for float and float3
00119 inline __host__ __device__ float3 fmaf3(float x, float3 y, float3 z) {
00120   return make_float3(fmaf(x, y.x, z.x), 
00121                      fmaf(x, y.y, z.y), 
00122                      fmaf(x, y.z, z.z));
00123 }
00124 
00125 
00126 // lerp()
00127 inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) {
00128 #if 0
00129   return fmaf3(t, b, fmaf3(-t, a, a));
00130 #elif 1
00131   return a + t*(b-a);
00132 #else
00133   return (1-t)*a + t*b;
00134 #endif
00135 }
00136 
00137 // length()
00138 inline __host__ __device__ float length(float3 v) {
00139   return sqrtf(dot(v, v));
00140 }
00141 
00142 //
00143 // color format conversion routines
00144 //
00145 inline __device__ void convert_color(float3 & cf, float3 cf2) {
00146   cf = cf2;
00147 }
00148 
00149 inline __device__ void convert_color(uchar4 & cu, float3 cf) {
00150   // conversion to GLubyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
00151   // clamp color values to prevent integer wraparound
00152   cu = make_uchar4(fminf(cf.x * 255.0f, 255.0f),
00153                    fminf(cf.y * 255.0f, 255.0f),
00154                    fminf(cf.z * 255.0f, 255.0f),
00155                    255);
00156 }
00157 
00158 inline __device__ void convert_color(float3 & cf, uchar4 cu) {
00159   const float i2f = 1.0f / 255.0f;
00160   cf.x = cu.x * i2f;
00161   cf.y = cu.y * i2f;
00162   cf.z = cu.z * i2f;
00163 }
00164 
00165 //
00166 // normal format conversion routines
00167 //
00168 inline __device__ void convert_normal(float3 & nf, float3 nf2) {
00169   nf = nf2;
00170 }
00171 
00172 // Note: The CUDA char3 type is explicitly a signed type
00173 inline __device__ void convert_normal(char3 & cc, float3 cf) {
00174   // normalize input values before conversion to fixed point representation
00175   float invlen = rsqrtf(cf.x*cf.x + cf.y*cf.y + cf.z*cf.z);
00176 
00177   // conversion to GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
00178   cc = make_char3(cf.x * invlen * 127.5f - 0.5f,
00179                   cf.y * invlen * 127.5f - 0.5f,
00180                   cf.z * invlen * 127.5f - 0.5f);
00181 }
00182 
00183 
00184 // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS
00185 // XXX The sampleVolume() call underperforms vs. peak memory bandwidth
00186 //     because we don't strictly enforce coalescing requirements in the
00187 //     layout of the input volume presently.  If we forced X/Y dims to be
00188 //     warp-multiple it would become possible to use wider fetches and
00189 //     a few other tricks to improve global memory bandwidth 
00190 __device__ float sampleVolume(const float * RESTRICT data, 
00191                               uint3 p, uint3 gridSize) {
00192     return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
00193 }
00194 
00195 
00196 // sample volume texture at a point p, p CAN NEVER BE OUT OF BOUNDS
00197 __device__ float3 sampleColors(const float3 * RESTRICT data,
00198                                uint3 p, uint3 gridSize) {
00199     return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
00200 }
00201 
00202 // sample volume texture at a point p, p CAN NEVER BE OUT OF BOUNDS
00203 __device__ float3 sampleColors(const uchar4 * RESTRICT data,
00204                                uint3 p, uint3 gridSize) {
00205     uchar4 cu = data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
00206     float3 cf;
00207     convert_color(cf, cu);
00208     return cf;
00209 }
00210 
00211 
00212 // compute position in 3d grid from 1d index
00213 __device__ uint3 calcGridPos(unsigned int i, uint3 gridSize) {
00214     uint3 gridPos;
00215     unsigned int gridsizexy = gridSize.x * gridSize.y;
00216     gridPos.z = i / gridsizexy;
00217     unsigned int tmp1 = i - (gridsizexy * gridPos.z);
00218     gridPos.y = tmp1 / gridSize.x;
00219     gridPos.x = tmp1 - (gridSize.x * gridPos.y);
00220     return gridPos;
00221 }
00222 
00223 
00224 // compute interpolated vertex along an edge
00225 __device__ float3 vertexInterp(float isolevel, float3 p0, float3 p1, float f0, float f1) {
00226     float t = (isolevel - f0) / (f1 - f0);
00227     return lerp(p0, p1, t);
00228 }
00229 
00230 
00231 // classify voxel based on number of vertices it will generate 
00232 // one thread per two voxels
00233 template <int GRIDIS3D, int SUBGRID>
00234 __global__ void 
00235 // __launch_bounds__ ( KERNTHREADS, 1 )
00236 classifyVoxel(uint2 * RESTRICT voxelVerts, 
00237               cudaTextureObject_t numVertsTexObj,
00238               const float * RESTRICT volume,
00239               uint3 gridSize, unsigned int numVoxels, float3 voxelSize,
00240               uint3 subGridStart, uint3 subGridEnd,
00241               float isoValue) {
00242     uint3 gridPos;
00243     unsigned int i;
00244 
00245     // Compute voxel indices and address using either 2-D or 3-D 
00246     // thread indexing depending on the GRIDIS3D template parameter
00247     if (GRIDIS3D) {
00248       // Compute voxel index using 3-D thread indexing
00249       gridPos.x = blockIdx.x * blockDim.x + threadIdx.x;
00250       gridPos.y = blockIdx.y * blockDim.y + threadIdx.y;
00251       gridPos.z = blockIdx.z * blockDim.z + threadIdx.z;
00252 
00253       // safety check
00254       if (gridPos.x >= gridSize.x || 
00255           gridPos.y >= gridSize.y || 
00256           gridPos.z >= gridSize.z)
00257         return;
00258 
00259       // compute 1D grid index
00260       i = gridPos.z*gridSize.x*gridSize.y + gridPos.y*gridSize.x + gridPos.x;
00261     } else {
00262       // Compute voxel index using 2-D thread indexing
00263       unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00264       i = (blockId * blockDim.x) + threadIdx.x;
00265 
00266       // safety check
00267       if (i >= numVoxels)
00268         return;
00269 
00270       // compute current grid position
00271       gridPos = calcGridPos(i, gridSize);
00272     }
00273 
00274     // If we are told to compute the isosurface for only a sub-region
00275     // of the volume, we use a more complex boundary test, otherwise we
00276     // use just the maximum voxel dimension
00277     uint2 numVerts = make_uint2(0, 0); // initialize vertex output to zero
00278     if (SUBGRID) {
00279       if (gridPos.x < subGridStart.x || 
00280           gridPos.y < subGridStart.y || 
00281           gridPos.z < subGridStart.z ||
00282           gridPos.x >= subGridEnd.x || 
00283           gridPos.y >= subGridEnd.y || 
00284           gridPos.z >= subGridEnd.z) {
00285         voxelVerts[i] = numVerts; // no vertices returned
00286         return;
00287       }
00288     } else {
00289       if (gridPos.x > (gridSize.x - 2) || gridPos.y > (gridSize.y - 2) || gridPos.z > (gridSize.z - 2)) {
00290         voxelVerts[i] = numVerts; // no vertices returned
00291         return;
00292       }
00293     }
00294 
00295     // read field values at neighbouring grid vertices
00296     float field[8];
00297     field[0] = sampleVolume(volume, gridPos, gridSize);
00298     // TODO early exit test for
00299     //if (field[0] < 0.000001f)  {
00300     //    voxelVerts[i] = numVerts;
00301     //    return;
00302     //}
00303     field[1] = sampleVolume(volume, gridPos + make_uint3(1, 0, 0), gridSize);
00304     field[2] = sampleVolume(volume, gridPos + make_uint3(1, 1, 0), gridSize);
00305     field[3] = sampleVolume(volume, gridPos + make_uint3(0, 1, 0), gridSize);
00306     field[4] = sampleVolume(volume, gridPos + make_uint3(0, 0, 1), gridSize);
00307     field[5] = sampleVolume(volume, gridPos + make_uint3(1, 0, 1), gridSize);
00308     field[6] = sampleVolume(volume, gridPos + make_uint3(1, 1, 1), gridSize);
00309     field[7] = sampleVolume(volume, gridPos + make_uint3(0, 1, 1), gridSize);
00310 
00311     // calculate flag indicating if each vertex is inside or outside isosurface
00312     unsigned int cubeindex;
00313     cubeindex =  ((unsigned int) (field[0] < isoValue));
00314     cubeindex += ((unsigned int) (field[1] < isoValue))*2;
00315     cubeindex += ((unsigned int) (field[2] < isoValue))*4;
00316     cubeindex += ((unsigned int) (field[3] < isoValue))*8;
00317     cubeindex += ((unsigned int) (field[4] < isoValue))*16;
00318     cubeindex += ((unsigned int) (field[5] < isoValue))*32;
00319     cubeindex += ((unsigned int) (field[6] < isoValue))*64;
00320     cubeindex += ((unsigned int) (field[7] < isoValue))*128;
00321 
00322     // read number of vertices from texture
00323     numVerts.x = tex1Dfetch<unsigned int>(numVertsTexObj, int(cubeindex));
00324     numVerts.y = (numVerts.x > 0);
00325 
00326     voxelVerts[i] = numVerts;
00327 }
00328 
00329 
00330 // compact voxel array
00331 __global__ void 
00332 // __launch_bounds__ ( KERNTHREADS, 1 )
00333 compactVoxels(unsigned int * RESTRICT compactedVoxelArray, 
00334               const uint2 * RESTRICT voxelOccupied, 
00335               unsigned int lastVoxel, unsigned int numVoxels, 
00336               unsigned int numVoxelsm1) {
00337   unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00338   unsigned int i = (blockId * blockDim.x) + threadIdx.x;
00339 
00340   if ((i < numVoxels) && ((i < numVoxelsm1) ? voxelOccupied[i].y < voxelOccupied[i+1].y : lastVoxel)) {
00341     compactedVoxelArray[ voxelOccupied[i].y ] = i;
00342   }
00343 }
00344 
00345 
00346 // version that calculates no surface normal or color,  only triangle vertices
00347 __global__ void 
00348 // __launch_bounds__ ( NTHREADS, 1 )
00349 generateTriangleVerticesSMEM(float3 * RESTRICT pos, 
00350                              const unsigned int * RESTRICT compactedVoxelArray, 
00351                              const uint2 * RESTRICT numVertsScanned, 
00352                              cudaTextureObject_t triTexObj,
00353                              cudaTextureObject_t numVertsTexObj,
00354                              const float * RESTRICT volume,
00355                              uint3 gridSize, float3 voxelSize, 
00356                              float isoValue, unsigned int activeVoxels, 
00357                              unsigned int maxVertsM3) {
00358     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00359     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00360     unsigned int i = zOffset * (blockDim.x * blockDim.y) + (blockId * blockDim.x) + threadIdx.x;
00361 
00362     if (i >= activeVoxels)
00363         return;
00364 
00365     unsigned int voxel = compactedVoxelArray[i];
00366 
00367     // compute position in 3d grid
00368     uint3 gridPos = calcGridPos(voxel, gridSize);
00369 
00370     float3 p;
00371     p.x = gridPos.x * voxelSize.x;
00372     p.y = gridPos.y * voxelSize.y;
00373     p.z = gridPos.z * voxelSize.z;
00374 
00375     // calculate cell vertex positions
00376     float3 v[8];
00377     v[0] = p;
00378     v[1] = p + make_float3(voxelSize.x, 0, 0);
00379     v[2] = p + make_float3(voxelSize.x, voxelSize.y, 0);
00380     v[3] = p + make_float3(0, voxelSize.y, 0);
00381     v[4] = p + make_float3(0, 0, voxelSize.z);
00382     v[5] = p + make_float3(voxelSize.x, 0, voxelSize.z);
00383     v[6] = p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z);
00384     v[7] = p + make_float3(0, voxelSize.y, voxelSize.z);
00385 
00386     float field[8];
00387     field[0] = sampleVolume(volume, gridPos, gridSize);
00388     field[1] = sampleVolume(volume, gridPos + make_uint3(1, 0, 0), gridSize);
00389     field[2] = sampleVolume(volume, gridPos + make_uint3(1, 1, 0), gridSize);
00390     field[3] = sampleVolume(volume, gridPos + make_uint3(0, 1, 0), gridSize);
00391     field[4] = sampleVolume(volume, gridPos + make_uint3(0, 0, 1), gridSize);
00392     field[5] = sampleVolume(volume, gridPos + make_uint3(1, 0, 1), gridSize);
00393     field[6] = sampleVolume(volume, gridPos + make_uint3(1, 1, 1), gridSize);
00394     field[7] = sampleVolume(volume, gridPos + make_uint3(0, 1, 1), gridSize);
00395 
00396     // recalculate flag
00397     unsigned int cubeindex;
00398     cubeindex =  ((unsigned int)(field[0] < isoValue)); 
00399     cubeindex += ((unsigned int)(field[1] < isoValue))*2; 
00400     cubeindex += ((unsigned int)(field[2] < isoValue))*4; 
00401     cubeindex += ((unsigned int)(field[3] < isoValue))*8; 
00402     cubeindex += ((unsigned int)(field[4] < isoValue))*16; 
00403     cubeindex += ((unsigned int)(field[5] < isoValue))*32; 
00404     cubeindex += ((unsigned int)(field[6] < isoValue))*64; 
00405     cubeindex += ((unsigned int)(field[7] < isoValue))*128;
00406 
00407     // find the vertices where the surface intersects the cube 
00408     // Note: SIMD marching cubes implementations have no need
00409     //       for an edge table, because branch divergence eliminates any
00410     //       potential performance gain from only computing the per-edge
00411     //       vertices when indicated by the edgeTable.
00412 
00413     // Use shared memory to keep register pressure under control.
00414     // No need to call __syncthreads() since each thread uses its own
00415     // private shared memory buffer.
00416     __shared__ float3 vertlist[12*NTHREADS];
00417 
00418     vertlist[threadIdx.x] = vertexInterp(isoValue, v[0], v[1], field[0], field[1]);
00419     vertlist[NTHREADS+threadIdx.x] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]);
00420     vertlist[(NTHREADS*2)+threadIdx.x] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]);
00421     vertlist[(NTHREADS*3)+threadIdx.x] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]);
00422     vertlist[(NTHREADS*4)+threadIdx.x] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]);
00423     vertlist[(NTHREADS*5)+threadIdx.x] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]);
00424     vertlist[(NTHREADS*6)+threadIdx.x] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]);
00425     vertlist[(NTHREADS*7)+threadIdx.x] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]);
00426     vertlist[(NTHREADS*8)+threadIdx.x] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]);
00427     vertlist[(NTHREADS*9)+threadIdx.x] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]);
00428     vertlist[(NTHREADS*10)+threadIdx.x] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]);
00429     vertlist[(NTHREADS*11)+threadIdx.x] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]);
00430 
00431     // output triangle vertices
00432     unsigned int numVerts = tex1Dfetch<unsigned int>(numVertsTexObj, cubeindex);
00433     for(int i=0; i<numVerts; i+=3) {
00434         unsigned int index = numVertsScanned[voxel].x + i;
00435 
00436         float3 *vert[3];
00437         int edge;
00438         edge = tex1Dfetch<int>(triTexObj, (cubeindex*16) + i);
00439         vert[0] = &vertlist[(edge*NTHREADS)+threadIdx.x];
00440 
00441         edge = tex1Dfetch<int>(triTexObj, (cubeindex*16) + i + 1);
00442         vert[1] = &vertlist[(edge*NTHREADS)+threadIdx.x];
00443 
00444         edge = tex1Dfetch<int>(triTexObj, (cubeindex*16) + i + 2);
00445         vert[2] = &vertlist[(edge*NTHREADS)+threadIdx.x];
00446 
00447         if (index < maxVertsM3) {
00448             pos[index  ] = *vert[0];
00449             pos[index+1] = *vert[1];
00450             pos[index+2] = *vert[2];
00451         }
00452     }
00453 }
00454 
00455 
00456 __global__ void 
00457 // __launch_bounds__ ( KERNTHREADS, 1 )
00458 offsetTriangleVertices(float3 * RESTRICT pos,
00459                        float3 origin, unsigned int numVertsM1) {
00460   unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00461   unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00462   unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00463 
00464   if (i > numVertsM1)
00465     return;
00466 
00467   float3 p = pos[i];
00468   p.x += origin.x;
00469   p.y += origin.y;
00470   p.z += origin.z;
00471   pos[i] = p;
00472 }
00473 
00474 
00475 // version that calculates the surface normal for each triangle vertex
00476 template <class NORMAL>
00477 __global__ void 
00478 // __launch_bounds__ ( KERNTHREADS, 1 )
00479 generateTriangleNormals(const float3 * RESTRICT pos, NORMAL *norm, 
00480                         cudaTextureObject_t volTexObj,
00481                         float3 gridSizeInv, 
00482                         float3 bBoxInv, unsigned int numVerts) {
00483   unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00484   unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00485   unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00486 
00487   if (i > numVerts - 1)
00488     return;
00489 
00490   float3 n;
00491   float3 p, p1, p2;
00492   // normal calculation using central differences
00493   // TODO
00494   //p = (pos[i] + make_float3(1.0f)) * 0.5f;
00495   p = pos[i];
00496   p.x *= bBoxInv.x;
00497   p.y *= bBoxInv.y;
00498   p.z *= bBoxInv.z;
00499 
00500   p1 = p + make_float3(gridSizeInv.x, 0.0f, 0.0f);
00501   p2 = p - make_float3(gridSizeInv.x, 0.0f, 0.0f);
00502   n.x = tex3D<float>(volTexObj, p2.x, p2.y, p2.z) - tex3D<float>(volTexObj, p1.x, p1.y, p1.z);
00503   p1 = p + make_float3(0.0f, gridSizeInv.y, 0.0f);
00504   p2 = p - make_float3(0.0f, gridSizeInv.y, 0.0f);
00505   n.y = tex3D<float>(volTexObj, p2.x, p2.y, p2.z) - tex3D<float>(volTexObj, p1.x, p1.y, p1.z);
00506   p1 = p + make_float3(0.0f, 0.0f, gridSizeInv.z);
00507   p2 = p - make_float3(0.0f, 0.0f, gridSizeInv.z);
00508   n.z = tex3D<float>(volTexObj, p2.x, p2.y, p2.z) - tex3D<float>(volTexObj, p1.x, p1.y, p1.z);
00509 
00510   NORMAL no;
00511   convert_normal(no, n);
00512   norm[i] = no;
00513 }
00514 
00515 
00516 // version that calculates the surface normal and color for each triangle vertex
00517 template <class VERTEXCOL, class VOLTEX, class NORMAL>
00518 __global__ void
00519 // __launch_bounds__ ( KERNTHREADS, 1 )
00520 generateTriangleColorNormal(const float3 * RESTRICT pos, 
00521                             VERTEXCOL * RESTRICT col,
00522                             NORMAL * RESTRICT norm,
00523                             const VOLTEX * RESTRICT colors,
00524                             cudaTextureObject_t volTexObj,
00525                             uint3 gridSize, float3 gridSizeInv, 
00526                             float3 bBoxInv, unsigned int numVerts) {
00527   unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00528   unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00529   unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00530 
00531   if (i > numVerts - 1)
00532     return;
00533 
00534   float3 p = pos[i];
00535   p.x *= bBoxInv.x;
00536   p.y *= bBoxInv.y;
00537   p.z *= bBoxInv.z;
00538   // color computation
00539   float3 gridPosF = p;
00540   gridPosF.x *= float(gridSize.x);
00541   gridPosF.y *= float(gridSize.y);
00542   gridPosF.z *= float(gridSize.z);
00543   float3 gridPosFloor;
00544   // Without the offset, rounding errors can occur
00545   // TODO why do we need the offset??
00546   gridPosFloor.x = floorf(gridPosF.x + 0.0001f);
00547   gridPosFloor.y = floorf(gridPosF.y + 0.0001f);
00548   gridPosFloor.z = floorf(gridPosF.z + 0.0001f);
00549   float3 gridPosCeil;
00550   // Without the offset, rounding errors can occur
00551   // TODO why do we need the offset??
00552   gridPosCeil.x = ceilf(gridPosF.x - 0.0001f);
00553   gridPosCeil.y = ceilf(gridPosF.y - 0.0001f);
00554   gridPosCeil.z = ceilf(gridPosF.z - 0.0001f);
00555   uint3 gridPos0;
00556   gridPos0.x = gridPosFloor.x;
00557   gridPos0.y = gridPosFloor.y;
00558   gridPos0.z = gridPosFloor.z;
00559   uint3 gridPos1;
00560   gridPos1.x = gridPosCeil.x;
00561   gridPos1.y = gridPosCeil.y;
00562   gridPos1.z = gridPosCeil.z;
00563 
00564   // compute interpolated color
00565   float3 field[2];
00566   field[0] = sampleColors(colors, gridPos0, gridSize);
00567   field[1] = sampleColors(colors, gridPos1, gridSize);
00568   float3 tmp = gridPosF - gridPosFloor;
00569   float a = fmaxf(fmaxf(tmp.x, tmp.y), tmp.z);
00570   VERTEXCOL c;
00571   convert_color(c, lerp(field[0], field[1], a));
00572   col[i] = c;
00573 
00574   // normal calculation using central differences
00575   float3 p1, p2, n;
00576   p1 = p + make_float3(gridSizeInv.x, 0.0f, 0.0f);
00577   p2 = p - make_float3(gridSizeInv.x, 0.0f, 0.0f);
00578   n.x = tex3D<float>(volTexObj, p2.x, p2.y, p2.z) - tex3D<float>(volTexObj, p1.x, p1.y, p1.z);
00579   p1 = p + make_float3(0.0f, gridSizeInv.y, 0.0f);
00580   p2 = p - make_float3(0.0f, gridSizeInv.y, 0.0f);
00581   n.y = tex3D<float>(volTexObj, p2.x, p2.y, p2.z) - tex3D<float>(volTexObj, p1.x, p1.y, p1.z);
00582   p1 = p + make_float3(0.0f, 0.0f, gridSizeInv.z);
00583   p2 = p - make_float3(0.0f, 0.0f, gridSizeInv.z);
00584   n.z = tex3D<float>(volTexObj, p2.x, p2.y, p2.z) - tex3D<float>(volTexObj, p1.x, p1.y, p1.z);
00585 
00586   NORMAL no;
00587   convert_normal(no, n);
00588   norm[i] = no;
00589 }
00590 
00591 
00592 
00593 #if 0
00594 
00595 void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
00596   const uint2 zero = make_uint2(0, 0);
00597   long scanwork_sz = 0;
00598   scanwork_sz = dev_excl_scan_sum_tmpsz(input, ((long) numElements), output, zero);
00599   void *scanwork_d = NULL;
00600   cudaMalloc(&scanwork_d, scanwork_sz);
00601   dev_excl_scan_sum(input, ((long) numElements), output, scanwork_d, scanwork_sz, zero);
00602   cudaFree(scanwork_d);
00603 }
00604 
00605 #elif CUDART_VERSION >= 9000
00606 //
00607 // XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when
00608 //     used with the built-in uint2 vector integer types.  To workaround
00609 //     the problem we have to define our own type and associated conversion
00610 //     routines etc.
00611 //
00612 
00613 // XXX workaround for uint2 breakage in all CUDA revs 9.0 through 10.0
00614 struct myuint2 : uint2 {
00615   __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {}
00616   __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {}
00617   __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {}
00618 };
00619 
00620 void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
00621     const uint2 zero = make_uint2(0, 0);
00622     thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input),
00623                            thrust::device_ptr<myuint2>((myuint2*)input + numElements),
00624                            thrust::device_ptr<myuint2>((myuint2*)output),
00625                            (myuint2) zero);
00626 }
00627 
00628 #else
00629 
00630 void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
00631     const uint2 zero = make_uint2(0, 0);
00632     thrust::exclusive_scan(thrust::device_ptr<uint2>(input),
00633                            thrust::device_ptr<uint2>(input + numElements),
00634                            thrust::device_ptr<uint2>(output),
00635                            zero);
00636 }
00637 
00638 #endif
00639 
00640 void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) {
00641     thrust::inclusive_scan(thrust::device_ptr<float>(input), 
00642                            thrust::device_ptr<float>(input + numElements),
00643                            thrust::device_ptr<float>(output));
00644 }
00645 
00646 
00647 __global__ void 
00648 // __launch_bounds__ ( KERNTHREADS, 1 )
00649 computeTriangleAreas(const float3 * RESTRICT pos,
00650                      float * RESTRICT area, unsigned int maxTria) {
00651     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00652     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00653     unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00654 
00655     // prevent overrunning of array boundary
00656     if (i >= maxTria)
00657       return;
00658 
00659     // get all three triangle vertices
00660     float3 v0 = pos[3*i];
00661     float3 v1 = pos[3*i+1];
00662     float3 v2 = pos[3*i+2];
00663 
00664     // compute edge lengths
00665     float a = length(v0 - v1);
00666     float b = length(v0 - v2);
00667     float c = length(v1 - v2);
00668 
00669     // compute area (Heron's formula)
00670     float rad = (a + b + c) * (a + b - c) * (b + c - a) * (c + a - b);
00671     // make sure radicand is not negative
00672     rad = rad > 0.0f ? rad : 0.0f;
00673     area[i] = 0.25f * sqrtf(rad);
00674 }
00675 
00676 
00678 //
00679 // class CUDAMarchingCubes
00680 //
00682 
00683 CUDAMarchingCubes::CUDAMarchingCubes() {
00684     // initialize values
00685     isoValue = 0.5f;
00686     maxNumVoxels = 0;
00687     numVoxels    = 0;
00688     activeVoxels = 0;
00689     totalVerts   = 0;
00690     d_volume = 0;
00691     d_colors = 0;
00692     texformat = RGB3F;
00693     d_volumeArray = 0;
00694     d_voxelVerts = 0;
00695     d_numVertsTable = 0;
00696     d_triTable = 0;
00697     useColor = false;
00698     useSubGrid = false;
00699     initialized = false;
00700     setdata = false;
00701     cudadevice = 0;
00702     cudacomputemajor = 0;
00703 
00704     // clear all texture objects
00705     triTexObj = 0;
00706     numVertsTexObj = 0;
00707     volTexObj = 0;
00708 
00709     // Query GPU device attributes so we can launch the best kernel type
00710     cudaDeviceProp deviceProp;
00711     memset(&deviceProp, 0, sizeof(cudaDeviceProp));
00712 
00713     if (cudaGetDevice(&cudadevice) != cudaSuccess) {
00714       // XXX do something more useful here...
00715     }
00716 
00717     if (cudaGetDeviceProperties(&deviceProp, cudadevice) != cudaSuccess) {
00718       // XXX do something more useful here...
00719     }
00720 
00721     cudacomputemajor = deviceProp.major;
00722 }
00723 
00724 
00725 CUDAMarchingCubes::~CUDAMarchingCubes() {
00726     Cleanup();
00727 }
00728 
00729 
00730 void CUDAMarchingCubes::Cleanup() {
00731     if (d_triTable) {
00732       // destroy tex obj before freeing backing memory allocation
00733       cudaDestroyTextureObject(triTexObj);
00734       triTexObj = 0;
00735 
00736       cudaFree(d_triTable);
00737       d_triTable = 0;
00738     }
00739 
00740     if (d_numVertsTable) {
00741       // destroy tex obj before freeing backing memory allocation
00742       cudaDestroyTextureObject(numVertsTexObj);
00743       numVertsTexObj = 0;
00744 
00745       cudaFree(d_numVertsTable);
00746       d_numVertsTable = 0;
00747     }
00748 
00749     if (d_voxelVerts) {
00750       cudaFree(d_voxelVerts);
00751       d_voxelVerts = 0;
00752     }
00753 
00754     if (d_compVoxelArray) {
00755       cudaFree(d_compVoxelArray);
00756       d_compVoxelArray = 0;
00757     }
00758 
00759     if (ownCudaDataArrays) {
00760       if (d_volume) {
00761         cudaFree(d_volume);
00762         d_volume = 0;
00763       }
00764 
00765       if (d_colors) {
00766         cudaFree(d_colors);
00767         d_colors = 0;
00768       }
00769     }
00770 
00771     if (d_volumeArray) {
00772       // destroy tex obj before freeing backing memory allocation
00773       cudaDestroyTextureObject(volTexObj);
00774       volTexObj = 0;
00775 
00776       cudaFreeArray(d_volumeArray);
00777       d_volumeArray = 0;
00778     }
00779 
00780     maxNumVoxels = 0;
00781     numVoxels    = 0;
00782     ownCudaDataArrays = false;
00783     initialized = false;
00784     setdata = false;
00785 }
00786 
00787 
00788 
00789 //
00790 // Methods that manage textures
00791 //
00792 void CUDAMarchingCubes::allocateTextures(int **d_triTab, unsigned int **d_numVertsTab) {
00793     //
00794     // new texture object implementation
00795     //
00796 
00797     // Build triTexObj 
00798     long tritexsz = 256*16*sizeof(int);
00799     cudaMalloc((void**) d_triTab, tritexsz);
00800     cudaMemcpy((void *)*d_triTab, (void *)triTable, tritexsz, cudaMemcpyHostToDevice);
00801     cudaResourceDesc resDescSigned;
00802     memset(&resDescSigned, 0, sizeof(resDescSigned));
00803     resDescSigned.resType = cudaResourceTypeLinear;
00804     resDescSigned.res.linear.desc.f = cudaChannelFormatKindSigned;
00805     resDescSigned.res.linear.desc.x = 32; // 32-bits in the X channel
00806     resDescSigned.res.linear.sizeInBytes = tritexsz;
00807     resDescSigned.res.linear.devPtr = *d_triTab;
00808 
00809     cudaTextureDesc texDesc;
00810     memset(&texDesc, 0, sizeof(texDesc));
00811     texDesc.readMode = cudaReadModeElementType;
00812   
00813     cudaCreateTextureObject(&triTexObj, &resDescSigned, &texDesc, NULL);
00814 
00815 
00816     // Build numVertsTexObj 
00817     long numvertsz = 256*sizeof(unsigned int);
00818     cudaMalloc((void**) d_numVertsTab, numvertsz);
00819     cudaMemcpy((void *)*d_numVertsTab, (void *)numVertsTable, numvertsz, cudaMemcpyHostToDevice);
00820     cudaResourceDesc resDescUnsigned;
00821     memset(&resDescUnsigned, 0, sizeof(resDescUnsigned));
00822     resDescUnsigned.resType = cudaResourceTypeLinear;
00823     resDescUnsigned.res.linear.desc.f = cudaChannelFormatKindUnsigned;
00824     resDescUnsigned.res.linear.desc.x = 32; // 32-bits in the X channel
00825     resDescUnsigned.res.linear.sizeInBytes = numvertsz;
00826     resDescUnsigned.res.linear.devPtr = *d_numVertsTab;
00827 
00828     cudaTextureDesc numVertsDesc;
00829     memset(&numVertsDesc, 0, sizeof(numVertsDesc));
00830     numVertsDesc.readMode = cudaReadModeElementType;
00831   
00832     cudaCreateTextureObject(&numVertsTexObj, &resDescUnsigned, &numVertsDesc, NULL);
00833 }
00834 
00835 
00836 
00840 void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) {
00841     // check if data is available
00842     if (!this->setdata)
00843       return;
00844 
00845     int threads = 256;
00846     dim3 grid((unsigned int) (ceil(float(numVoxels) / float(threads))), 1, 1);
00847     // get around maximum grid size of 65535 in each dimension
00848     if (grid.x > 65535) {
00849         grid.y = (unsigned int) (ceil(float(grid.x) / 32768.0f));
00850         grid.x = 32768;
00851     }
00852 
00853     dim3 threads3D(256, 1, 1);
00854     dim3 grid3D((unsigned int) (ceil(float(gridSize.x) / float(threads3D.x))), 
00855                 (unsigned int) (ceil(float(gridSize.y) / float(threads3D.y))), 
00856                 (unsigned int) (ceil(float(gridSize.z) / float(threads3D.z))));
00857 
00858     // calculate number of vertices need per voxel
00859     if (cudacomputemajor >= 2) {
00860       // launch a 3-D grid if we have a new enough device (Fermi or later...)
00861       if (useSubGrid) {
00862         classifyVoxel<1,1><<<grid3D, threads3D>>>(d_voxelVerts, numVertsTexObj,
00863                              d_volume, gridSize, numVoxels, voxelSize, 
00864                              subGridStart, subGridEnd, isoValue);
00865       } else {
00866         classifyVoxel<1,0><<<grid3D, threads3D>>>(d_voxelVerts, numVertsTexObj,
00867                              d_volume, gridSize, numVoxels, voxelSize, 
00868                              subGridStart, subGridEnd, isoValue);
00869       }
00870     } else {
00871       // launch a 2-D grid for older devices
00872       if (useSubGrid) {
00873         classifyVoxel<0,1><<<grid, threads>>>(d_voxelVerts, numVertsTexObj,
00874                              d_volume, gridSize, numVoxels, voxelSize,
00875                              subGridStart, subGridEnd, isoValue);
00876       } else {
00877         classifyVoxel<0,0><<<grid, threads>>>(d_voxelVerts, numVertsTexObj,
00878                              d_volume, gridSize, numVoxels, voxelSize,
00879                              subGridStart, subGridEnd, isoValue);
00880       }
00881     }
00882 
00883     // scan voxel vertex/occupation array (use in-place prefix sum for lower memory consumption)
00884     uint2 lastElement, lastScanElement;
00885     cudaMemcpy((void *) &lastElement, (void *)(d_voxelVerts + numVoxels-1), sizeof(uint2), cudaMemcpyDeviceToHost);
00886 
00887     ThrustScanWrapperUint2(d_voxelVerts, d_voxelVerts, numVoxels);
00888 
00889     // read back values to calculate total number of non-empty voxels
00890     // since we are using an exclusive scan, the total is the last value of
00891     // the scan result plus the last value in the input array
00892     cudaMemcpy((void *) &lastScanElement, (void *) (d_voxelVerts + numVoxels-1), sizeof(uint2), cudaMemcpyDeviceToHost);
00893     activeVoxels = lastElement.y + lastScanElement.y;
00894     // add up total number of vertices
00895     totalVerts = lastElement.x + lastScanElement.x;
00896     totalVerts = totalVerts < maxverts ? totalVerts : maxverts; // min
00897 
00898     if (activeVoxels==0) {
00899       // return if there are no full voxels
00900       totalVerts = 0;
00901       return;
00902     }
00903 
00904     grid.x = (unsigned int) (ceil(float(numVoxels) / float(threads)));
00905     grid.y = grid.z = 1;
00906     // get around maximum grid size of 65535 in each dimension
00907     if (grid.x > 65535) {
00908         grid.y = (unsigned int) (ceil(float(grid.x) / 32768.0f));
00909         grid.x = 32768;
00910     }
00911 
00912     // compact voxel index array
00913     compactVoxels<<<grid, threads>>>(d_compVoxelArray, d_voxelVerts, lastElement.y, numVoxels, numVoxels - 1);
00914 
00915     dim3 grid2((unsigned int) (ceil(float(activeVoxels) / (float) NTHREADS)), 1, 1);
00916     while(grid2.x > 65535) {
00917         grid2.x = (unsigned int) (ceil(float(grid2.x) / 2.0f));
00918         grid2.y *= 2;
00919     }
00920 
00921     grid3 = dim3((unsigned int) (ceil(float(totalVerts) / (float) threads)), 1, 1);
00922     while(grid3.x > 65535) {
00923         grid3.x = (unsigned int) (ceil(float(grid3.x) / 2.0f));
00924         grid3.y *= 2;
00925     }
00926     while(grid3.y > 65535) {
00927         grid3.y = (unsigned int) (ceil(float(grid3.y) / 2.0f));
00928         grid3.z *= 2;
00929     }
00930 
00931     // separate computation of vertices and vertex color/normal for higher occupancy and speed
00932     generateTriangleVerticesSMEM<<<grid2, NTHREADS>>>(vertOut, 
00933                                    d_compVoxelArray, d_voxelVerts, 
00934                                    triTexObj, numVertsTexObj,
00935                                    d_volume, gridSize, voxelSize, 
00936                                    isoValue, activeVoxels, maxverts - 3);
00937 }
00938 
00939 
00940 //
00941 // Generate colors using float3 vertex array format
00942 //
00943 void CUDAMarchingCubes::computeIsosurface(float3* vertOut, float3* normOut, float3* colOut, unsigned int maxverts) {
00944     // check if data is available
00945     if (!this->setdata)
00946       return;
00947 
00948     int threads = 256;
00949     dim3 grid3;
00950     computeIsosurfaceVerts(vertOut, maxverts, grid3);
00951 
00952     float3 gridSizeInv = make_float3(1.0f / float(gridSize.x), 1.0f / float(gridSize.y), 1.0f / float(gridSize.z));
00953     float3 bBoxInv = make_float3(1.0f / bBox.x, 1.0f / bBox.y, 1.0f / bBox.z);
00954     if (this->useColor) {
00955       switch (texformat) {
00956         case RGB3F:
00957           generateTriangleColorNormal<float3, float3, float3><<<grid3, threads>>>(vertOut, colOut, normOut, (float3 *) d_colors, 
00958                   volTexObj, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
00959           break;
00960 
00961         case RGB4U:
00962           generateTriangleColorNormal<float3, uchar4, float3><<<grid3, threads>>>(vertOut, colOut, normOut, (uchar4 *) d_colors, 
00963                   volTexObj, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
00964           break;
00965       }
00966     } else {
00967       generateTriangleNormals<float3><<<grid3, threads>>>(vertOut, normOut, 
00968               volTexObj, gridSizeInv, bBoxInv, totalVerts);
00969     }
00970 
00971     offsetTriangleVertices<<<grid3, threads>>>(vertOut, this->origin, totalVerts - 1);
00972 }
00973 
00974 
00975 //
00976 // Generate colors using uchar4 vertex array format
00977 //
00978 void CUDAMarchingCubes::computeIsosurface(float3* vertOut, float3* normOut, uchar4* colOut, unsigned int maxverts) {
00979     // check if data is available
00980     if (!this->setdata)
00981       return;
00982 
00983     int threads = 256;
00984     dim3 grid3;
00985     computeIsosurfaceVerts(vertOut, maxverts, grid3);
00986 
00987     float3 gridSizeInv = make_float3(1.0f / float(gridSize.x), 1.0f / float(gridSize.y), 1.0f / float(gridSize.z));
00988     float3 bBoxInv = make_float3(1.0f / bBox.x, 1.0f / bBox.y, 1.0f / bBox.z);
00989     if (this->useColor) {
00990       switch (texformat) {
00991         case RGB3F:
00992           generateTriangleColorNormal<uchar4, float3, float3><<<grid3, threads>>>(vertOut, colOut, normOut, (float3 *) d_colors, 
00993                   volTexObj, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
00994           break;
00995 
00996         case RGB4U:
00997           generateTriangleColorNormal<uchar4, uchar4, float3><<<grid3, threads>>>(vertOut, colOut, normOut, (uchar4 *) d_colors, 
00998                   volTexObj, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
00999           break;
01000       }
01001     } else {
01002       generateTriangleNormals<float3><<<grid3, threads>>>(vertOut, normOut, 
01003               volTexObj, gridSizeInv, bBoxInv, totalVerts);
01004     }
01005 
01006     offsetTriangleVertices<<<grid3, threads>>>(vertOut, this->origin, totalVerts - 1);
01007 }
01008 
01009 
01010 //
01011 // Generate normals w/ char format, colors w/ uchar4 vertex array format
01012 //
01013 // Note: The CUDA char3 type is explicitly a signed type
01014 void CUDAMarchingCubes::computeIsosurface(float3* vertOut, char3* normOut, uchar4* colOut, unsigned int maxverts) {
01015     // check if data is available
01016     if (!this->setdata)
01017       return;
01018 
01019     int threads = 256;
01020     dim3 grid3;
01021     computeIsosurfaceVerts(vertOut, maxverts, grid3);
01022 
01023     float3 gridSizeInv = make_float3(1.0f / float(gridSize.x), 1.0f / float(gridSize.y), 1.0f / float(gridSize.z));
01024     float3 bBoxInv = make_float3(1.0f / bBox.x, 1.0f / bBox.y, 1.0f / bBox.z);
01025     if (this->useColor) {
01026       switch (texformat) {
01027         case RGB3F:
01028           generateTriangleColorNormal<uchar4, float3, char3><<<grid3, threads>>>(vertOut, colOut, normOut, (float3 *) d_colors, 
01029                   volTexObj, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
01030           break;
01031 
01032         case RGB4U:
01033           generateTriangleColorNormal<uchar4, uchar4, char3><<<grid3, threads>>>(vertOut, colOut, normOut, (uchar4 *) d_colors, 
01034                   volTexObj, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
01035           break;
01036       }
01037     } else {
01038       generateTriangleNormals<char3><<<grid3, threads>>>(vertOut, normOut, 
01039               volTexObj, gridSizeInv, bBoxInv, totalVerts);
01040     }
01041 
01042     offsetTriangleVertices<<<grid3, threads>>>(vertOut, this->origin, totalVerts - 1);
01043 }
01044 
01045 
01046 bool CUDAMarchingCubes::Initialize(uint3 maxgridsize) {
01047     // check if already initialized
01048     if (initialized) return false;
01049 
01050     // use max grid size initially
01051     maxGridSize = maxgridsize;
01052     gridSize = maxGridSize;
01053     maxNumVoxels = gridSize.x*gridSize.y*gridSize.z;
01054     numVoxels = maxNumVoxels;
01055     
01056     // initialize subgrid dimensions to the entire volume by default
01057     subGridStart = make_uint3(0, 0, 0);
01058     subGridEnd = gridSize - make_uint3(1, 1, 1);
01059 
01060     // allocate textures
01061     allocateTextures(&d_triTable, &d_numVertsTable);
01062 
01063     // allocate device memory
01064     if (cudaMalloc((void**) &d_voxelVerts, sizeof(uint2) * numVoxels) != cudaSuccess) {
01065         Cleanup();
01066         return false;
01067     }
01068     if (cudaMalloc((void**) &d_compVoxelArray, sizeof(unsigned int) * numVoxels) != cudaSuccess) {
01069         Cleanup();
01070         return false;
01071     }
01072 
01073     // success
01074     initialized = true;
01075     return true;
01076 }
01077 
01078 
01079 bool CUDAMarchingCubes::SetVolumeData(float *volume, void *colors, 
01080                                       VolTexFormat vtexformat, uint3 gridsize, 
01081                                       float3 gridOrigin, float3 boundingBox, 
01082                                       bool cudaArray) {
01083   bool newgridsize = false;
01084 
01085   // check if initialize was successful
01086   if (!initialized) return false;
01087 
01088   // check if the grid size matches
01089   if (gridsize.x != gridSize.x ||
01090       gridsize.y != gridSize.y ||
01091       gridsize.z != gridSize.z) {
01092     newgridsize = true;
01093     int nv = gridsize.x*gridsize.y*gridsize.z;
01094     if (nv > maxNumVoxels)
01095       return false;
01096 
01097     gridSize = gridsize;
01098     numVoxels = nv;
01099 
01100     // initialize subgrid dimensions to the entire volume by default
01101     subGridStart = make_uint3(0, 0, 0);
01102     subGridEnd = gridSize - make_uint3(1, 1, 1);
01103   }
01104 
01105   // copy the grid origin, bounding box dimensions, 
01106   // and update dependent variables
01107   origin = gridOrigin;
01108   bBox = boundingBox;
01109   voxelSize = make_float3(bBox.x / gridSize.x,
01110                           bBox.y / gridSize.y,
01111                           bBox.z / gridSize.z);
01112 
01113   // check colors
01114   useColor = colors ? true : false;
01115 
01116   // copy cuda array pointers or create cuda arrays and copy data
01117   if (cudaArray) {
01118     // check ownership flag and free if necessary
01119     if (ownCudaDataArrays) {
01120       if (d_volume) cudaFree(d_volume);
01121       d_volume = NULL;
01122 
01123       if (d_colors) cudaFree(d_colors);
01124       d_colors = NULL;
01125     }
01126 
01127     // copy data pointers
01128     d_volume = volume;
01129     d_colors = colors;
01130     texformat = vtexformat;
01131 
01132     // set ownership flag
01133     ownCudaDataArrays = false;
01134   } else {
01135     // create the volume array (using max size) and copy data
01136     unsigned int size = numVoxels * sizeof(float);
01137     unsigned int maxsize = maxNumVoxels * sizeof(float);
01138 
01139     // check data array allocation
01140     if (!d_volume) {
01141       if (cudaMalloc((void**) &d_volume, maxsize) != cudaSuccess) {
01142         Cleanup();
01143         return false;
01144       }
01145       if (cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) != cudaSuccess) {
01146         Cleanup();
01147         return false;
01148       }
01149     }
01150 
01151     if (colors) {
01152       if (!d_colors) {
01153         // create the color array and copy colors
01154         unsigned int maxtxsize = 0; 
01155         unsigned int txsize = 0;
01156         texformat = vtexformat;
01157         switch (texformat) {
01158           case RGB3F: 
01159             txsize = numVoxels * sizeof(float3);
01160             maxtxsize = maxNumVoxels * sizeof(float3);
01161             break;
01162 
01163           case RGB4U: 
01164             txsize = numVoxels * sizeof(uchar4);
01165             maxtxsize = maxNumVoxels * sizeof(uchar4);
01166             break;
01167         } 
01168 
01169         if (cudaMalloc((void**) &d_colors, maxtxsize) != cudaSuccess) {
01170           Cleanup();
01171           return false;
01172         }
01173         if (cudaMemcpy(d_colors, colors, txsize, cudaMemcpyHostToDevice) != cudaSuccess) {
01174           Cleanup();
01175           return false;
01176         }
01177       }
01178     }
01179 
01180     // set ownership flag
01181     ownCudaDataArrays = true;
01182   }
01183 
01184   // Compute grid extents and channel description for the 3-D array
01185   cudaExtent gridExtent = make_cudaExtent(gridSize.x, gridSize.y, gridSize.z);
01186   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
01187 
01188   // Check to see if existing 3D array allocation matches current grid size,
01189   // deallocate it if not so that we regenerate it with the correct size.
01190   if (d_volumeArray && newgridsize) { 
01191     // destroy tex obj before freeing backing memory allocation
01192     cudaDestroyTextureObject(volTexObj);
01193     volTexObj = 0;
01194 
01195     cudaFreeArray(d_volumeArray);
01196     d_volumeArray = NULL;
01197   }
01198 
01199   // allocate the 3D array if needed
01200   if (!d_volumeArray) { 
01201     // create 3D array
01202     if (cudaMalloc3DArray(&d_volumeArray, &channelDesc, gridExtent) != cudaSuccess) {
01203       Cleanup();
01204       return false;
01205     }
01206   }
01207 
01208   //
01209   // copy data to 3D array and volumetric texture volTexObj
01210   // 
01211   cudaMemcpy3DParms copyParams = {0};
01212   copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_volume, 
01213                                             gridExtent.width*sizeof(float),
01214                                             gridExtent.width,
01215                                             gridExtent.height);
01216   copyParams.dstArray = d_volumeArray;
01217   copyParams.extent   = gridExtent;
01218   copyParams.kind     = cudaMemcpyDeviceToDevice;
01219   if (cudaMemcpy3D(&copyParams) != cudaSuccess) {
01220     Cleanup();
01221     return false;
01222   }
01223 
01224   cudaResourceDesc resDescVol;
01225   memset(&resDescVol, 0, sizeof(resDescVol));
01226   resDescVol.resType = cudaResourceTypeArray;
01227   resDescVol.res.array.array = d_volumeArray;
01228 
01229   cudaTextureDesc volDesc;
01230   memset(&volDesc, 0, sizeof(volDesc));
01231   volDesc.normalizedCoords = 1;
01232   volDesc.filterMode = cudaFilterModeLinear;
01233   volDesc.addressMode[0] = cudaAddressModeClamp;
01234   volDesc.addressMode[1] = cudaAddressModeClamp;
01235   volDesc.addressMode[2] = cudaAddressModeClamp;
01236   volDesc.readMode = cudaReadModeElementType;
01237 
01238   cudaCreateTextureObject(&volTexObj, &resDescVol, &volDesc, NULL);
01239 
01240   // success
01241   setdata = true;
01242 
01243   return true;
01244 }
01245 
01246 
01247 bool CUDAMarchingCubes::SetVolumeData(float *volume, float3 *colors, uint3 gridsize, float3 gridOrigin, float3 boundingBox, bool cudaArray) {
01248   return SetVolumeData(volume, colors, RGB3F, gridsize, gridOrigin, 
01249                        boundingBox, cudaArray);
01250 }
01251 
01252 
01253 bool CUDAMarchingCubes::SetVolumeData(float *volume, uchar4 *colors, uint3 gridsize, float3 gridOrigin, float3 boundingBox, bool cudaArray) {
01254   return SetVolumeData(volume, colors, RGB4U, gridsize, gridOrigin, 
01255                        boundingBox, cudaArray);
01256 }
01257 
01258 
01259 
01260 void CUDAMarchingCubes::SetSubVolume(uint3 start, uint3 end) {
01261   subGridStart = start;
01262   subGridEnd = end;
01263   useSubGrid = true;
01264 
01265   if (subGridEnd.x >= gridSize.x)
01266     subGridEnd.x = gridSize.x - 1;
01267   if (subGridEnd.y >= gridSize.y)
01268     subGridEnd.y = gridSize.y - 1;
01269   if (subGridEnd.z >= gridSize.z)
01270     subGridEnd.z = gridSize.z - 1;
01271 }
01272 
01273 
01274 bool CUDAMarchingCubes::computeIsosurface(float *volume, void *colors, 
01275                                           VolTexFormat vtexformat,
01276                                           uint3 gridsize, float3 gridOrigin, 
01277                                           float3 boundingBox, bool cudaArray, 
01278                                           float3* vertOut, float3* normOut, 
01279                                           float3* colOut, unsigned int maxverts) {
01280     // Setup
01281     if (!Initialize(gridsize))
01282         return false;
01283 
01284     if (!SetVolumeData(volume, colors, vtexformat, gridsize, gridOrigin, boundingBox, cudaArray))
01285         return false;
01286 
01287     // Compute and Render Isosurface
01288     computeIsosurface(vertOut, normOut, colOut, maxverts);
01289 
01290     // Tear down and free resources
01291     Cleanup();
01292 
01293     return true;
01294 }
01295 
01296 
01297 float CUDAMarchingCubes::computeSurfaceArea(float3 *verts, unsigned int triaCount) {
01298     // do nothing for zero triangles
01299     if(triaCount <= 0) return 0.0f;
01300 
01301     // initialize and allocate device arrays
01302     float *d_areas;
01303     unsigned long memSize = sizeof(float) * triaCount;
01304     if (cudaMalloc((void**) &d_areas, memSize) != cudaSuccess) {
01305         return -1.0f;
01306     }
01307 
01308     // compute area for each triangle
01309     const int threads = 256;
01310     dim3 grid(int(ceil(float(triaCount) / float(threads))), 1, 1);
01311 
01312     // get around maximum grid size of 65535 in each dimension
01313     while(grid.x > 65535) {
01314         grid.x = (unsigned int) (ceil(float(grid.x) / 2.0f));
01315         grid.y *= 2;
01316     }
01317     while(grid.y > 65535) {
01318         grid.y = (unsigned int) (ceil(float(grid.y) / 2.0f));
01319         grid.z *= 2;
01320     }
01321 
01322     computeTriangleAreas<<<grid, threads>>>(verts, d_areas, triaCount);
01323 
01324     // use prefix sum to compute total surface area
01325     ThrustScanWrapperArea(d_areas, d_areas, triaCount);
01326 
01327     // readback total surface area
01328     float totalArea;
01329     cudaMemcpy((void *)&totalArea, (void *)(d_areas + triaCount - 1), sizeof(float), cudaMemcpyDeviceToHost);
01330 
01331     cudaFree(d_areas); // clean up
01332     return totalArea;  // return result
01333 }
01334 
01335 
01336