Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDAMarchingCubes.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2011 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAMarchingCubes.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.8 $       $Date: 2011/11/22 21:17:57 $
00014  *
00015  ***************************************************************************
00016  * DESCRIPTION:
00017  *   CUDA Marching Cubes Implementation
00018  *
00019  * LICENSE:
00020  *   UIUC Open Source License
00021  *   http://www.ks.uiuc.edu/Research/vmd/plugins/pluginlicense.html
00022  *
00023  ***************************************************************************/
00024 
00025 //
00026 // Description: This class computes an isosurface for a given density grid
00027 //              using a CUDA Marching Cubes (MC) alorithm. 
00028 //              The implementation is based on the MC demo from the 
00029 //              Nvidia GPU Computing SDK, but has been improved 
00030 //              and extended.  This implementation achieves higher 
00031 //              performance by reducing the number of temporary memory
00032 //              buffers, reduces the number of scan calls by using vector
00033 //              integer types, and allows extraction of per-vertex normals 
00034 //              optionally computes per-vertex colors if provided with a 
00035 //              volumetric texture map.
00036 //
00037 // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de>
00038 //         John Stone <johns@ks.uiuc.edu>
00039 //
00040 // Copyright 2011
00041 //
00042 
00043 #include "CUDAKernels.h"
00044 #define CUDAMARCHINGCUBES_INTERNAL 1
00045 #include "CUDAMarchingCubes.h"
00046 #include <thrust/scan.h>
00047 #include <thrust/functional.h>
00048 
00049 // The number of threads to use for triangle generation 
00050 // (limited by shared memory size)
00051 #define NTHREADS 48
00052 
00053 //
00054 // Various math operators for vector types not already 
00055 // provided by the regular CUDA headers
00056 //
00057 // "+" operator
00058 inline __host__ __device__ float3 operator+(float3 a, float3 b) {
00059   return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
00060 }
00061 inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) {
00062   return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
00063 }
00064 inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) {
00065   return make_uint2(a.x + b.x, a.y + b.y);
00066 }
00067 
00068 // "-" operator
00069 inline __host__ __device__ float3 operator-(float3 a, float3 b) {
00070   return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
00071 }
00072 inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) {
00073   return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
00074 }
00075 
00076 // "*" operator
00077 inline __host__ __device__ float3 operator*(float b, float3 a) {
00078   return make_float3(b * a.x, b * a.y, b * a.z);
00079 }
00080 
00081 // dot()
00082 inline __host__ __device__ float dot(float3 a, float3 b) {
00083   return a.x * b.x + a.y * b.y + a.z * b.z;
00084 }
00085 
00086 // lerp()
00087 inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) {
00088   return a + t*(b-a);
00089 }
00090 
00091 // length()
00092 inline __host__ __device__ float length(float3 v) {
00093   return sqrtf(dot(v, v));
00094 }
00095 
00096 
00097 //
00098 // CUDA textures containing marching cubes look-up tables
00099 // Note: SIMD marching cubes implementations have no need for the edge table
00100 //
00101 texture<int, 1, cudaReadModeElementType> triTex;
00102 texture<unsigned int, 1, cudaReadModeElementType> numVertsTex;
00103 // 3D 24-bit RGB texture
00104 texture<float, 3, cudaReadModeElementType> volumeTex;
00105 
00106 // sample volume data set at a point
00107 __device__ float sampleVolume(float *data, uint3 p, uint3 gridSize) {
00108     // gridPos CAN NEVER BE OUT OF BOUNDS
00109     //p.x = (p.x >= gridSize.x) ? gridSize.x - 1 : p.x;
00110     //p.y = (p.y >= gridSize.y) ? gridSize.y - 1 : p.y;
00111     //p.z = (p.z >= gridSize.z) ? gridSize.z - 1 : p.z;
00112     unsigned int i = (p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x;
00113     return data[i];
00114 }
00115 
00116 
00117 // sample volume data set at a point
00118 __device__ float3 sampleColors(float3 *data, uint3 p, uint3 gridSize) {
00119     // gridPos CAN NEVER BE OUT OF BOUNDS
00120     //p.x = (p.x >= gridSize.x) ? gridSize.x - 1 : p.x;
00121     //p.y = (p.y >= gridSize.y) ? gridSize.y - 1 : p.y;
00122     //p.z = (p.z >= gridSize.z) ? gridSize.z - 1 : p.z;
00123     unsigned int i = (p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x;
00124     return data[i];
00125 }
00126 
00127 
00128 // compute position in 3d grid from 1d index
00129 __device__ uint3 calcGridPos(unsigned int i, uint3 gridSize) {
00130     uint3 gridPos;
00131     unsigned int gridsizexy = gridSize.x * gridSize.y;
00132     gridPos.z = i / gridsizexy;
00133     unsigned int tmp1 = i - (gridsizexy * gridPos.z);
00134     gridPos.y = tmp1 / gridSize.x;
00135     gridPos.x = tmp1 - (gridSize.x * gridPos.y);
00136     return gridPos;
00137 }
00138 
00139 
00140 // compute interpolated vertex along an edge
00141 __device__ float3 vertexInterp(float isolevel, float3 p0, float3 p1, float f0, float f1) {
00142     float t = (isolevel - f0) / (f1 - f0);
00143     return lerp(p0, p1, t);
00144 }
00145 
00146 
00147 // classify voxel based on number of vertices it will generate one thread per two voxels
00148 template <int gridis3d, int subgrid>
00149 __global__ void classifyVoxel(uint2* voxelVerts, float *volume,
00150                               uint3 gridSize, unsigned int numVoxels, float3 voxelSize,
00151                               uint3 subGridStart, uint3 subGridEnd,
00152                               float isoValue) {
00153     uint3 gridPos;
00154     unsigned int i;
00155 
00156     // Compute voxel indices and address using either 2-D or 3-D 
00157     // thread indexing depending on the caller-provided gridis3d parameter
00158     if (gridis3d) {
00159       // Compute voxel index using 3-D thread indexing
00160       // compute 3D grid position
00161       gridPos.x = blockIdx.x * blockDim.x + threadIdx.x;
00162       gridPos.y = blockIdx.y * blockDim.y + threadIdx.y;
00163       gridPos.z = blockIdx.z * blockDim.z + threadIdx.z;
00164 
00165       // safety check
00166       if (gridPos.x >= gridSize.x || 
00167           gridPos.y >= gridSize.y || 
00168           gridPos.z >= gridSize.z)
00169         return;
00170 
00171       // compute 1D grid index
00172       i = gridPos.z*gridSize.x*gridSize.y + gridPos.y*gridSize.x + gridPos.x;
00173     } else {
00174       // Compute voxel index using 2-D thread indexing
00175       unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00176       i = (blockId * blockDim.x) + threadIdx.x;
00177 
00178       // safety check
00179       if (i >= numVoxels)
00180         return;
00181 
00182       // compute current grid position
00183       gridPos = calcGridPos(i, gridSize);
00184     }
00185 
00186     // If we are told to compute the isosurface for only a sub-region
00187     // of the volume, we use a more complex boundary test, otherwise we
00188     // use just the maximum voxel dimension
00189     uint2 numVerts = make_uint2(0, 0); // initialize vertex output to zero
00190     if (subgrid) {
00191       if (gridPos.x < subGridStart.x || 
00192           gridPos.y < subGridStart.y || 
00193           gridPos.z < subGridStart.z ||
00194           gridPos.x >= subGridEnd.x || 
00195           gridPos.y >= subGridEnd.y || 
00196           gridPos.z >= subGridEnd.z) {
00197         voxelVerts[i] = numVerts; // no vertices returned
00198         return;
00199       }
00200     } else {
00201       if (gridPos.x > (gridSize.x - 2) || gridPos.y > (gridSize.y - 2) || gridPos.z > (gridSize.z - 2)) {
00202         voxelVerts[i] = numVerts; // no vertices returned
00203         return;
00204       }
00205     }
00206 
00207     // read field values at neighbouring grid vertices
00208     float field[8];
00209     field[0] = sampleVolume(volume, gridPos, gridSize);
00210     // TODO early exit test for
00211     //if( field[0] < 0.000001f )  {
00212     //    voxelVerts[i] = numVerts;
00213     //    return;
00214     //}
00215     field[1] = sampleVolume(volume, gridPos + make_uint3(1, 0, 0), gridSize);
00216     field[2] = sampleVolume(volume, gridPos + make_uint3(1, 1, 0), gridSize);
00217     field[3] = sampleVolume(volume, gridPos + make_uint3(0, 1, 0), gridSize);
00218     field[4] = sampleVolume(volume, gridPos + make_uint3(0, 0, 1), gridSize);
00219     field[5] = sampleVolume(volume, gridPos + make_uint3(1, 0, 1), gridSize);
00220     field[6] = sampleVolume(volume, gridPos + make_uint3(1, 1, 1), gridSize);
00221     field[7] = sampleVolume(volume, gridPos + make_uint3(0, 1, 1), gridSize);
00222 
00223     // calculate flag indicating if each vertex is inside or outside isosurface
00224     unsigned int cubeindex;
00225     cubeindex =  ((unsigned int) (field[0] < isoValue));
00226     cubeindex += ((unsigned int) (field[1] < isoValue))*2;
00227     cubeindex += ((unsigned int) (field[2] < isoValue))*4;
00228     cubeindex += ((unsigned int) (field[3] < isoValue))*8;
00229     cubeindex += ((unsigned int) (field[4] < isoValue))*16;
00230     cubeindex += ((unsigned int) (field[5] < isoValue))*32;
00231     cubeindex += ((unsigned int) (field[6] < isoValue))*64;
00232     cubeindex += ((unsigned int) (field[7] < isoValue))*128;
00233 
00234     // read number of vertices from texture
00235     numVerts.x = tex1Dfetch(numVertsTex, cubeindex);
00236     numVerts.y = (numVerts.x > 0);
00237 
00238     voxelVerts[i] = numVerts;
00239 }
00240 
00241 
00242 // compact voxel array
00243 __global__ void compactVoxels(unsigned int *compactedVoxelArray, uint2 *voxelOccupied, unsigned int lastVoxel, unsigned int numVoxels, unsigned int numVoxelsp1) {
00244     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00245     unsigned int i = (blockId * blockDim.x) + threadIdx.x;
00246 
00247     if ((i < numVoxels) && ( (i < numVoxelsp1) ? voxelOccupied[i].y < voxelOccupied[i+1].y : lastVoxel) ) {
00248       compactedVoxelArray[ voxelOccupied[i].y ] = i;
00249     }
00250 }
00251 
00252 
00253 // version that calculates no surface normal or color,  only triangle vertices
00254 __global__ void generateTriangleVerticesSMEM(float3 *pos, unsigned int *compactedVoxelArray, uint2 *numVertsScanned, float *volume,
00255                    uint3 gridSize, float3 voxelSize, float isoValue, unsigned int activeVoxels, unsigned int maxVertsM3) {
00256     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00257     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00258     unsigned int i = zOffset * ( blockDim.x * blockDim.y) + (blockId * blockDim.x) + threadIdx.x;
00259 
00260     if (i >= activeVoxels )
00261         return;
00262 
00263     unsigned int voxel = compactedVoxelArray[i];
00264 
00265     // compute position in 3d grid
00266     uint3 gridPos = calcGridPos(voxel, gridSize);
00267 
00268     float3 p;
00269     p.x = gridPos.x * voxelSize.x;
00270     p.y = gridPos.y * voxelSize.y;
00271     p.z = gridPos.z * voxelSize.z;
00272 
00273     // calculate cell vertex positions
00274     float3 v[8];
00275     v[0] = p;
00276     v[1] = p + make_float3(voxelSize.x, 0, 0);
00277     v[2] = p + make_float3(voxelSize.x, voxelSize.y, 0);
00278     v[3] = p + make_float3(0, voxelSize.y, 0);
00279     v[4] = p + make_float3(0, 0, voxelSize.z);
00280     v[5] = p + make_float3(voxelSize.x, 0, voxelSize.z);
00281     v[6] = p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z);
00282     v[7] = p + make_float3(0, voxelSize.y, voxelSize.z);
00283 
00284     float field[8];
00285     field[0] = sampleVolume(volume, gridPos, gridSize);
00286     field[1] = sampleVolume(volume, gridPos + make_uint3(1, 0, 0), gridSize);
00287     field[2] = sampleVolume(volume, gridPos + make_uint3(1, 1, 0), gridSize);
00288     field[3] = sampleVolume(volume, gridPos + make_uint3(0, 1, 0), gridSize);
00289     field[4] = sampleVolume(volume, gridPos + make_uint3(0, 0, 1), gridSize);
00290     field[5] = sampleVolume(volume, gridPos + make_uint3(1, 0, 1), gridSize);
00291     field[6] = sampleVolume(volume, gridPos + make_uint3(1, 1, 1), gridSize);
00292     field[7] = sampleVolume(volume, gridPos + make_uint3(0, 1, 1), gridSize);
00293 
00294     // recalculate flag
00295     unsigned int cubeindex;
00296     cubeindex =  ((unsigned int)(field[0] < isoValue)); 
00297     cubeindex += ((unsigned int)(field[1] < isoValue))*2; 
00298     cubeindex += ((unsigned int)(field[2] < isoValue))*4; 
00299     cubeindex += ((unsigned int)(field[3] < isoValue))*8; 
00300     cubeindex += ((unsigned int)(field[4] < isoValue))*16; 
00301     cubeindex += ((unsigned int)(field[5] < isoValue))*32; 
00302     cubeindex += ((unsigned int)(field[6] < isoValue))*64; 
00303     cubeindex += ((unsigned int)(field[7] < isoValue))*128;
00304 
00305     // find the vertices where the surface intersects the cube 
00306     // Note: SIMD marching cubes implementations have no need
00307     //       for an edge table, because branch divergence eliminates any
00308     //       potential performance gain from only computing the per-edge
00309     //       vertices when indicated by the edgeTable.
00310 
00311     // Use shared memory to keep register pressure under control.
00312     // No need to call __syncthreads() since each thread uses its own
00313     // private shared memory buffer.
00314     __shared__ float3 vertlist[12*NTHREADS];
00315 
00316     vertlist[threadIdx.x] = vertexInterp(isoValue, v[0], v[1], field[0], field[1]);
00317     vertlist[NTHREADS+threadIdx.x] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]);
00318     vertlist[(NTHREADS*2)+threadIdx.x] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]);
00319     vertlist[(NTHREADS*3)+threadIdx.x] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]);
00320     vertlist[(NTHREADS*4)+threadIdx.x] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]);
00321     vertlist[(NTHREADS*5)+threadIdx.x] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]);
00322     vertlist[(NTHREADS*6)+threadIdx.x] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]);
00323     vertlist[(NTHREADS*7)+threadIdx.x] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]);
00324     vertlist[(NTHREADS*8)+threadIdx.x] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]);
00325     vertlist[(NTHREADS*9)+threadIdx.x] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]);
00326     vertlist[(NTHREADS*10)+threadIdx.x] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]);
00327     vertlist[(NTHREADS*11)+threadIdx.x] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]);
00328 
00329     // output triangle vertices
00330     unsigned int numVerts = tex1Dfetch(numVertsTex, cubeindex);
00331     for(int i=0; i<numVerts; i+=3) {
00332         unsigned int index = numVertsScanned[voxel].x + i;
00333 
00334         float3 *vert[3];
00335         int edge;
00336         edge = tex1Dfetch(triTex, (cubeindex*16) + i);
00337         vert[0] = &vertlist[(edge*NTHREADS)+threadIdx.x];
00338 
00339         edge = tex1Dfetch(triTex, (cubeindex*16) + i + 1);
00340         vert[1] = &vertlist[(edge*NTHREADS)+threadIdx.x];
00341 
00342         edge = tex1Dfetch(triTex, (cubeindex*16) + i + 2);
00343         vert[2] = &vertlist[(edge*NTHREADS)+threadIdx.x];
00344 
00345         if (index < maxVertsM3) {
00346             pos[index  ] = *vert[0];
00347             pos[index+1] = *vert[1];
00348             pos[index+2] = *vert[2];
00349         }
00350     }
00351 }
00352 
00353 
00354 __global__ void offsetTriangleVertices(float3 *pos, float3 origin, unsigned int numVertsM1) {
00355     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00356     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00357     unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00358 
00359     if (i > numVertsM1)
00360       return;
00361 
00362     float3 p = pos[i];
00363     p.x += origin.x;
00364     p.y += origin.y;
00365     p.z += origin.z;
00366     pos[i] = p;
00367 }
00368 
00369 
00370 // version that calculates the surface normal for each triangle vertex
00371 __global__ void generateTriangleNormals(float3 *pos, float3 *norm, float3 gridSizeInv, float3 bBoxInv, unsigned int numVerts) {
00372     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00373     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00374     unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00375 
00376     if (i > numVerts - 1)
00377       return;
00378 
00379     float3 n;
00380     float3 p, p1, p2;
00381     // normal calculation using central differences
00382     // TODO
00383     //p = ( pos[i] + make_float3( 1.0f)) * 0.5f;
00384     p = pos[i];
00385     p.x *= bBoxInv.x;
00386     p.y *= bBoxInv.y;
00387     p.z *= bBoxInv.z;
00388     p1 = p + make_float3( gridSizeInv.x * 1.0f, 0.0f, 0.0f);
00389     p2 = p - make_float3( gridSizeInv.x * 1.0f, 0.0f, 0.0f);
00390     n.x = tex3D( volumeTex, p2.x, p2.y, p2.z) - tex3D( volumeTex, p1.x, p1.y, p1.z);
00391     p1 = p + make_float3( 0.0f, gridSizeInv.y * 1.0f, 0.0f);
00392     p2 = p - make_float3( 0.0f, gridSizeInv.y * 1.0f, 0.0f);
00393     n.y = tex3D( volumeTex, p2.x, p2.y, p2.z) - tex3D( volumeTex, p1.x, p1.y, p1.z);
00394     p1 = p + make_float3( 0.0f, 0.0f, gridSizeInv.z * 1.0f);
00395     p2 = p - make_float3( 0.0f, 0.0f, gridSizeInv.z * 1.0f);
00396     n.z = tex3D( volumeTex, p2.x, p2.y, p2.z) - tex3D( volumeTex, p1.x, p1.y, p1.z);
00397 
00398     norm[i] = n;
00399 }
00400 
00401 
00402 // version that calculates the surface normal and color for each triangle vertex
00403 __global__ void generateTriangleColorNormal(float3 *pos, float3 *col, float3 *norm, float3 *colors, uint3 gridSize, float3 gridSizeInv, float3 bBoxInv, unsigned int numVerts) {
00404     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00405     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00406     unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00407 
00408     if (i > numVerts - 1)
00409       return;
00410 
00411     float3 p = pos[i];
00412     p.x *= bBoxInv.x;
00413     p.y *= bBoxInv.y;
00414     p.z *= bBoxInv.z;
00415     // color computation
00416     float3 gridPosF = p;
00417     gridPosF.x *= float( gridSize.x);
00418     gridPosF.y *= float( gridSize.y);
00419     gridPosF.z *= float( gridSize.z);
00420     float3 gridPosFloor;
00421     // Without the offset, rounding errors can occur
00422     // TODO why do we need the offset??
00423     gridPosFloor.x = floorf(gridPosF.x + 0.0001f);
00424     gridPosFloor.y = floorf(gridPosF.y + 0.0001f);
00425     gridPosFloor.z = floorf(gridPosF.z + 0.0001f);
00426     float3 gridPosCeil;
00427     // Without the offset, rounding errors can occur
00428     // TODO why do we need the offset??
00429     gridPosCeil.x = ceilf(gridPosF.x - 0.0001f);
00430     gridPosCeil.y = ceilf(gridPosF.y - 0.0001f);
00431     gridPosCeil.z = ceilf(gridPosF.z - 0.0001f);
00432     uint3 gridPos0;
00433     gridPos0.x = gridPosFloor.x;
00434     gridPos0.y = gridPosFloor.y;
00435     gridPos0.z = gridPosFloor.z;
00436     uint3 gridPos1;
00437     gridPos1.x = gridPosCeil.x;
00438     gridPos1.y = gridPosCeil.y;
00439     gridPos1.z = gridPosCeil.z;
00440 
00441     float3 field[2];
00442     field[0] = sampleColors(colors, gridPos0, gridSize);
00443     field[1] = sampleColors(colors, gridPos1, gridSize);
00444 
00445     float3 tmp = gridPosF - gridPosFloor;
00446     float a = max( max( tmp.x, tmp.y), tmp.z);
00447     float3 c = lerp( field[0], field[1], a);
00448 
00449     float3 p1, p2;
00450     // normal calculation using central differences
00451     float3 n;
00452     p1 = p + make_float3( gridSizeInv.x * 1.0f, 0.0f, 0.0f);
00453     p2 = p - make_float3( gridSizeInv.x * 1.0f, 0.0f, 0.0f);
00454     n.x = tex3D( volumeTex, p2.x, p2.y, p2.z) - tex3D( volumeTex, p1.x, p1.y, p1.z);
00455     p1 = p + make_float3( 0.0f, gridSizeInv.y * 1.0f, 0.0f);
00456     p2 = p - make_float3( 0.0f, gridSizeInv.y * 1.0f, 0.0f);
00457     n.y = tex3D( volumeTex, p2.x, p2.y, p2.z) - tex3D( volumeTex, p1.x, p1.y, p1.z);
00458     p1 = p + make_float3( 0.0f, 0.0f, gridSizeInv.z * 1.0f);
00459     p2 = p - make_float3( 0.0f, 0.0f, gridSizeInv.z * 1.0f);
00460     n.z = tex3D( volumeTex, p2.x, p2.y, p2.z) - tex3D( volumeTex, p1.x, p1.y, p1.z);
00461 
00462     norm[i] = n;
00463     col[i] = c;
00464 }
00465 
00466 
00467 void allocateTextures(int **d_triTable, unsigned int **d_numVertsTable) {
00468     cudaChannelFormatDesc channelDescSigned = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
00469     cudaMalloc((void**) d_triTable, 256*16*sizeof(int));
00470     cudaMemcpy((void *)*d_triTable, (void *)triTable, 256*16*sizeof(int), cudaMemcpyHostToDevice);
00471     cudaBindTexture(0, triTex, *d_triTable, channelDescSigned);
00472 
00473     cudaChannelFormatDesc channelDescUnsigned = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
00474     cudaMalloc((void**) d_numVertsTable, 256*sizeof(unsigned int));
00475     cudaMemcpy((void *)*d_numVertsTable, (void *)numVertsTable, 256*sizeof(unsigned int), cudaMemcpyHostToDevice);
00476     cudaBindTexture(0, numVertsTex, *d_numVertsTable, channelDescUnsigned);
00477 }
00478 
00479 
00480 void bindVolumeTexture( cudaArray *d_vol, cudaChannelFormatDesc desc) {
00481     // set texture parameters
00482     volumeTex.normalized = 1;
00483     volumeTex.filterMode = cudaFilterModeLinear;
00484     //volumeTex.filterMode = cudaFilterModePoint;
00485     volumeTex.addressMode[0] = cudaAddressModeClamp;
00486     volumeTex.addressMode[1] = cudaAddressModeClamp;
00487     volumeTex.addressMode[2] = cudaAddressModeClamp;
00488     // bind array to 3D texture
00489     cudaBindTextureToArray( volumeTex, d_vol, desc);
00490 }
00491 
00492 
00493 #if 0
00494 void ThrustScanWrapper(unsigned int* output, unsigned int* input, unsigned int numElements) {
00495     thrust::exclusive_scan(thrust::device_ptr<unsigned int>(input), 
00496                            thrust::device_ptr<unsigned int>(input + numElements),
00497                            thrust::device_ptr<unsigned int>(output));
00498 }
00499 #endif
00500 
00501 void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
00502     const uint2 zero = make_uint2(0, 0);
00503     thrust::exclusive_scan(thrust::device_ptr<uint2>(input),
00504                            thrust::device_ptr<uint2>(input + numElements),
00505                            thrust::device_ptr<uint2>(output),
00506                            zero);
00507 }
00508 
00509 
00510 void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) {
00511     thrust::inclusive_scan(thrust::device_ptr<float>(input), 
00512                            thrust::device_ptr<float>(input + numElements),
00513                            thrust::device_ptr<float>(output));
00514 }
00515 
00516 
00517 __global__ void computeTriangleAreas( float3 *pos, float *area, unsigned int maxTria) {
00518     unsigned int zOffset = blockIdx.z * gridDim.x * gridDim.y;
00519     unsigned int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
00520     unsigned int i = zOffset + (blockId * blockDim.x) + threadIdx.x;
00521 
00522     // prevent overrunning of array boundary
00523     if (i >= maxTria)
00524       return;
00525 
00526     // get all three triangle vertices
00527     float3 v0 = pos[3*i];
00528     float3 v1 = pos[3*i+1];
00529     float3 v2 = pos[3*i+2];
00530 
00531     // compute edge lengths
00532     float a = length( v0 - v1);
00533     float b = length( v0 - v2);
00534     float c = length( v1 - v2);
00535 
00536     // compute area (Heron's formula)
00537     float rad = ( a + b + c) * ( a + b - c) * ( b + c - a) * ( c + a - b);
00538     // make sure radicand is not negative
00539     rad = rad > 0.0f ? rad : 0.0f;
00540     area[i] = 0.25f * sqrt( rad);
00541 }
00542 
00543 
00545 //
00546 // class CUDAMarchingCubes
00547 //
00549 
00550 CUDAMarchingCubes::CUDAMarchingCubes() {
00551     // initialize values
00552     isoValue = 0.5f;
00553     maxNumVoxels = 0;
00554     numVoxels    = 0;
00555     activeVoxels = 0;
00556     totalVerts   = 0;
00557     d_volume = 0;
00558     d_colors = 0;
00559     d_volumeArray = 0;
00560     d_voxelVerts = 0;
00561     d_numVertsTable = 0;
00562     d_triTable = 0;
00563     useColor = false;
00564     useSubGrid = false;
00565     initialized = false;
00566     setdata = false;
00567     cudadevice = 0;
00568     cudacomputemajor = 0;
00569 
00570     // Query GPU device attributes so we can launch the best kernel type
00571     cudaDeviceProp deviceProp;
00572     memset(&deviceProp, 0, sizeof(cudaDeviceProp));
00573 
00574     if (cudaGetDevice(&cudadevice) != cudaSuccess) {
00575       // XXX do something more useful here...
00576     }
00577 
00578     if (cudaGetDeviceProperties(&deviceProp, cudadevice) != cudaSuccess) {
00579       // XXX do something more useful here...
00580     }
00581 
00582     cudacomputemajor = deviceProp.major;
00583 }
00584 
00585 
00586 CUDAMarchingCubes::~CUDAMarchingCubes() {
00587     Cleanup();
00588 }
00589 
00590 
00591 void CUDAMarchingCubes::Cleanup() {
00592     if( d_triTable ) cudaFree(d_triTable);
00593     if( d_numVertsTable ) cudaFree(d_numVertsTable);
00594     if( d_voxelVerts ) cudaFree(d_voxelVerts);
00595     if( d_compVoxelArray ) cudaFree(d_compVoxelArray);
00596     if( ownCudaDataArrays ) {
00597         if( d_volume ) cudaFree(d_volume);
00598         if( d_colors ) cudaFree(d_colors);
00599     }
00600     if( d_volumeArray) cudaFreeArray(d_volumeArray);
00601 
00602     maxNumVoxels = 0;
00603     numVoxels    = 0;
00604     d_triTable = 0;
00605     d_numVertsTable = 0;
00606     d_voxelVerts = 0;
00607     d_compVoxelArray = 0;
00608     d_volume = 0;
00609     d_colors = 0;
00610     ownCudaDataArrays = false;
00611     d_volumeArray = 0;
00612     initialized = false;
00613     setdata = false;
00614 }
00615 
00616 
00620 void CUDAMarchingCubes::computeIsosurface( float3* vertOut, float3* normOut, float3* colOut, unsigned int maxverts) {
00621     // check if data is available
00622     if( !this->setdata ) return;
00623 
00624     //cudaEvent_t start, stop;
00625     //float time;
00626     //float totalTime = 0.0f;
00627     //cudaEventCreate(&start);
00628     //cudaEventCreate(&stop);
00629 
00630     int threads = 256;
00631     //dim3 grid(int(ceil(float(numVoxels) / float( 2 * threads))), 1, 1); // for Mult2
00632     dim3 grid((unsigned int) (ceil(float(numVoxels) / float(threads))), 1, 1);
00633     // get around maximum grid size of 65535 in each dimension
00634     if (grid.x > 65535) {
00635         grid.y = (unsigned int) (ceil(float( grid.x) / 32768.0f));
00636         grid.x = 32768;
00637     }
00638 
00639     dim3 threads3D( 256, 1, 1);
00640     dim3 grid3D((unsigned int) (ceil(float(gridSize.x) / float(threads3D.x))), 
00641                 (unsigned int) (ceil(float(gridSize.y) / float(threads3D.y))), 
00642                 (unsigned int) (ceil(float(gridSize.z) / float(threads3D.z))));
00643 
00644     //cudaEventRecord( start, 0);
00645 
00646     cudaThreadSynchronize();
00647 
00648     // calculate number of vertices need per voxel
00649     if (cudacomputemajor >= 2) {
00650       // launch a 3-D grid if we have a new enough device (Fermi or later...)
00651       if (useSubGrid) {
00652         classifyVoxel<1,1><<<grid3D, threads3D>>>(d_voxelVerts, d_volume, 
00653                              gridSize, numVoxels, voxelSize, 
00654                              subGridStart, subGridEnd, isoValue);
00655       } else {
00656         classifyVoxel<1,0><<<grid3D, threads3D>>>(d_voxelVerts, d_volume, 
00657                              gridSize, numVoxels, voxelSize, 
00658                              subGridStart, subGridEnd, isoValue);
00659       }
00660     } else {
00661       // launch a 2-D grid for older devices
00662       if (useSubGrid) {
00663         classifyVoxel<0,1><<<grid, threads>>>(d_voxelVerts, d_volume, 
00664                              gridSize, numVoxels, voxelSize,
00665                              subGridStart, subGridEnd, isoValue);
00666       } else {
00667         classifyVoxel<0,0><<<grid, threads>>>(d_voxelVerts, d_volume, 
00668                              gridSize, numVoxels, voxelSize,
00669                              subGridStart, subGridEnd, isoValue);
00670       }
00671     }
00672 
00673     // check for errors
00674     cudaThreadSynchronize();
00675     //printf( "CUDA-MC: launch_classifyVoxel: %s\n", cudaGetErrorString( cudaGetLastError()));
00676     //cudaEventRecord( stop, 0);
00677     //cudaEventSynchronize( stop);
00678     //cudaEventElapsedTime(&time, start, stop);
00679     //printf ("-----------------------------------------------------\n");
00680     //printf ("Time for the launch_classifyVoxel kernel: %f ms\n", time);
00681     //totalTime += time;
00682 
00683     //cudaEventRecord( start, 0);
00684     // scan voxel vertex/occupation array (use in-place prefix sum for lower memory consumption)
00685     uint2 lastElement, lastScanElement;
00686     cudaMemcpy((void *) &lastElement, (void *)(d_voxelVerts + numVoxels-1), sizeof(uint2), cudaMemcpyDeviceToHost);
00687     cudaThreadSynchronize();
00688     ThrustScanWrapperUint2(d_voxelVerts, d_voxelVerts, numVoxels);
00689     // check for errors
00690     cudaThreadSynchronize();
00691     //printf( "CUDA-MC: ThrustScanWrapper I: %s\n", cudaGetErrorString( cudaGetLastError()));
00692     //cudaEventRecord( stop, 0);
00693     //cudaEventSynchronize( stop);
00694     //cudaEventElapsedTime(&time, start, stop);
00695     //printf ("Time for the ThrustScanWrapper kernel: %f ms\n", time);
00696     //totalTime += time;
00697 
00698     // read back values to calculate total number of non-empty voxels
00699     // since we are using an exclusive scan, the total is the last value of
00700     // the scan result plus the last value in the input array
00701     cudaMemcpy((void *) &lastScanElement, (void *) (d_voxelVerts + numVoxels-1), sizeof(uint2), cudaMemcpyDeviceToHost);
00702     activeVoxels = lastElement.y + lastScanElement.y;
00703     // add up total number of vertices
00704     totalVerts = lastElement.x + lastScanElement.x;
00705     totalVerts = totalVerts < maxverts ? totalVerts : maxverts; // min
00706 
00707     if (activeVoxels==0) {
00708         // return if there are no full voxels
00709         totalVerts = 0;
00710         return;
00711     }
00712 
00713     grid.x = (unsigned int) (ceil(float(numVoxels) / float(threads)));
00714     grid.y = grid.z = 1;
00715     // get around maximum grid size of 65535 in each dimension
00716     if (grid.x > 65535) {
00717         grid.y = (unsigned int) (ceil(float(grid.x) / 32768.0f));
00718         grid.x = 32768;
00719     }
00720 
00721     //cudaEventRecord( start, 0);
00722     // compact voxel index array
00723     compactVoxels<<<grid, threads>>>(d_compVoxelArray, d_voxelVerts, lastElement.y, numVoxels, numVoxels + 1);
00724     // check for errors
00725     //cudaThreadSynchronize();
00726     //printf( "CUDA-MC: launch_compactVoxels: %s\n", cudaGetErrorString( cudaGetLastError()));
00727     //cudaEventRecord( stop, 0);
00728     //cudaEventSynchronize( stop);
00729     //cudaEventElapsedTime(&time, start, stop);
00730     //printf ("Time for the launch_compactVoxels kernel: %f ms\n", time);
00731     //totalTime += time;
00732 
00733 
00734     dim3 grid2((unsigned int) (ceil(float(activeVoxels) / (float) NTHREADS)), 1, 1);
00735     while(grid2.x > 65535) {
00736         grid2.x = (unsigned int) (ceil(float(grid2.x) / 2.0f));
00737         grid2.y *= 2;
00738     }
00739 
00740     dim3 grid3((unsigned int) (ceil(float(totalVerts) / (float) threads)), 1, 1);
00741     while(grid3.x > 65535) {
00742         grid3.x = (unsigned int) (ceil(float(grid3.x) / 2.0f));
00743         grid3.y *= 2;
00744     }
00745     while(grid3.y > 65535) {
00746         grid3.y = (unsigned int) (ceil(float(grid3.y) / 2.0f));
00747         grid3.z *= 2;
00748     }
00749 
00750     //cudaEventRecord( start, 0);
00751 
00752     // separate computation of vertices and vertex color/normal for higher occupancy and speed
00753     generateTriangleVerticesSMEM<<<grid2, NTHREADS>>>( vertOut, 
00754         d_compVoxelArray, d_voxelVerts, d_volume, gridSize, voxelSize, 
00755         isoValue, activeVoxels, maxverts - 3);
00756     // check for errors
00757     cudaThreadSynchronize();
00758     //printf( "CUDA-MC: launch_generateTriangleVertices: %s\n", cudaGetErrorString( cudaGetLastError()));
00759 
00760     float3 gridSizeInv = make_float3( 1.0f / float( gridSize.x), 1.0f / float( gridSize.y), 1.0f / float( gridSize.z));
00761     float3 bBoxInv = make_float3( 1.0f / bBox.x, 1.0f / bBox.y, 1.0f / bBox.z);
00762     if( this->useColor ) {
00763         generateTriangleColorNormal<<<grid3, threads>>>(vertOut, colOut, normOut, (float3*)d_colors, this->gridSize, gridSizeInv, bBoxInv, totalVerts);
00764     } else {
00765         generateTriangleNormals<<<grid3, threads>>>(vertOut, normOut, gridSizeInv, bBoxInv, totalVerts);
00766     }
00767 
00768     // check for errors
00769     cudaThreadSynchronize();
00770     //printf( "CUDA-MC: launch_generateTriangleColorNormal: %s\n", cudaGetErrorString( cudaGetLastError()));
00771 
00772     offsetTriangleVertices<<<grid3, threads>>>(vertOut, this->origin, totalVerts - 1);
00773 
00774     // check for errors
00775     cudaThreadSynchronize();
00776     //printf( "CUDA-MC: launch_offsetTriangleVertices: %s\n", cudaGetErrorString( cudaGetLastError()));
00777 
00778     //cudaEventRecord( stop, 0);
00779     //cudaEventSynchronize( stop);
00780     //cudaEventElapsedTime(&time, start, stop);
00781     //printf ("Time for the generateTriangles kernel: %f ms\n", time);
00782     //totalTime += time;
00783 
00784     //printf (" CUDA MC: Total kernel runtime time: %f ms = %.3f fps\n", totalTime, 1000.0f/totalTime);
00785     // check for errors
00786     //cudaThreadSynchronize();
00787     //printf( "compute Marching Cubes: %s\n", cudaGetErrorString( cudaGetLastError()));
00788 }
00789 
00790 
00791 bool CUDAMarchingCubes::Initialize(uint3 maxgridsize) {
00792     // check if already initialized
00793     if (initialized) return false;
00794 
00795     // use max grid size initially
00796     maxGridSize = maxgridsize;
00797     gridSize = maxGridSize;
00798     maxNumVoxels = gridSize.x*gridSize.y*gridSize.z;
00799     numVoxels = maxNumVoxels;
00800     
00801     // initialize subgrid dimensions to the entire volume by default
00802     subGridStart = make_uint3(0, 0, 0);
00803     subGridEnd = gridSize - make_uint3(1, 1, 1);
00804 
00805     // allocate textures
00806     allocateTextures(&d_triTable, &d_numVertsTable);
00807 
00808     // allocate device memory
00809     if (cudaMalloc((void**) &d_voxelVerts, sizeof(uint2) * numVoxels) != cudaSuccess) {
00810         Cleanup();
00811         return false;
00812     }
00813     if (cudaMalloc((void**) &d_compVoxelArray, sizeof(unsigned int) * numVoxels) != cudaSuccess) {
00814         Cleanup();
00815         return false;
00816     }
00817 
00818     // success
00819     initialized = true;
00820     return true;
00821 }
00822 
00823 
00824 bool CUDAMarchingCubes::SetVolumeData(float *volume, float *colors, uint3 gridsize, 
00825                                       float3 gridOrigin, float3 boundingBox, bool cudaArray) {
00826   bool newgridsize = false;
00827 
00828   // check if initialize was successful
00829   if (!initialized) return false;
00830 
00831 #if 0
00832   // return if volume data was already set
00833   if (setdata) return false;
00834   if (d_volume != NULL) return false;
00835 #endif
00836 
00837   // check if the grid size matches
00838   if (gridsize.x != gridSize.x ||
00839       gridsize.y != gridSize.y ||
00840       gridsize.z != gridSize.z) {
00841     newgridsize = true;
00842     int nv = gridsize.x*gridsize.y*gridsize.z;
00843     if (nv > maxNumVoxels)
00844       return false;
00845 
00846     gridSize = gridsize;
00847     numVoxels = nv;
00848 
00849     // initialize subgrid dimensions to the entire volume by default
00850     subGridStart = make_uint3(0, 0, 0);
00851     subGridEnd = gridSize - make_uint3(1, 1, 1);
00852   }
00853 
00854   // copy the grid origin, bounding box dimensions, 
00855   // and update dependent variables
00856   origin = gridOrigin;
00857   bBox = boundingBox;
00858   voxelSize = make_float3(bBox.x / gridSize.x,
00859                           bBox.y / gridSize.y,
00860                           bBox.z / gridSize.z);
00861 
00862   // check colors
00863   useColor = colors ? true : false;
00864 
00865   // copy cuda array pointers or create cuda arrays and copy data
00866   if (cudaArray) {
00867     // check ownership flag and free if necessary
00868     if (ownCudaDataArrays) {
00869       if (d_volume) cudaFree(d_volume);
00870       d_volume = NULL;
00871 
00872       if (d_colors) cudaFree(d_colors);
00873       d_colors = NULL;
00874     }
00875 
00876     // copy data pointers
00877     d_volume = volume;
00878     d_colors = colors;
00879 
00880     // set ownership flag
00881     ownCudaDataArrays = false;
00882   } else {
00883     // create the volume array (using max size) and copy data
00884     unsigned int size = numVoxels * sizeof(float);
00885     unsigned int maxsize = maxNumVoxels * sizeof(float);
00886 
00887     // check data array allocation
00888     if (!d_volume) {
00889       if (cudaMalloc((void**) &d_volume, maxsize) != cudaSuccess) {
00890         Cleanup();
00891         return false;
00892       }
00893       if (cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) != cudaSuccess) {
00894         Cleanup();
00895         return false;
00896       }
00897     }
00898 
00899     if (colors) {
00900       if (!d_colors) {
00901         // create the color array and copy colors
00902         if (cudaMalloc((void**) &d_colors, maxsize*3) != cudaSuccess) {
00903           Cleanup();
00904           return false;
00905         }
00906         if (cudaMemcpy(d_colors, colors, size*3, cudaMemcpyHostToDevice) != cudaSuccess ) {
00907           Cleanup();
00908           return false;
00909         }
00910       }
00911     }
00912 
00913     // set ownership flag
00914     ownCudaDataArrays = true;
00915   }
00916 
00917   // Compute grid extents and channel description for the 3-D array
00918   cudaExtent gridExtent = make_cudaExtent(gridSize.x, gridSize.y, gridSize.z);
00919   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
00920 
00921   // Check to see if existing 3D array allocation matches current grid size,
00922   // deallocate it if not so that we regenerate it with the correct size.
00923   if (d_volumeArray && newgridsize) { 
00924     cudaFreeArray(d_volumeArray);
00925     d_volumeArray = NULL;
00926   }
00927 
00928   // allocate the 3D array if needed
00929   if (!d_volumeArray) { 
00930     // create 3D array
00931     if (cudaMalloc3DArray(&d_volumeArray, &channelDesc, gridExtent) != cudaSuccess) {
00932       Cleanup();
00933       return false;
00934     }
00935   }
00936 
00937   // copy data to 3D array
00938   cudaMemcpy3DParms copyParams = {0};
00939   copyParams.srcPtr   = make_cudaPitchedPtr((void*)d_volume, 0, 0, 0);
00940   copyParams.dstArray = d_volumeArray;
00941   copyParams.extent   = gridExtent;
00942   copyParams.kind     = cudaMemcpyDeviceToDevice;
00943   if (cudaMemcpy3D(&copyParams) != cudaSuccess) {
00944     Cleanup();
00945     return false;
00946   }
00947 
00948   // bind the array to a volume texture
00949   bindVolumeTexture(d_volumeArray, channelDesc);
00950 
00951   // success
00952   setdata = true;
00953 
00954   return true;
00955 }
00956 
00957 
00958 void CUDAMarchingCubes::SetSubVolume(uint3 start, uint3 end) {
00959   subGridStart = start;
00960   subGridEnd = end;
00961   useSubGrid = true;
00962 
00963   if (subGridEnd.x >= gridSize.x)
00964     subGridEnd.x = gridSize.x - 1;
00965   if (subGridEnd.y >= gridSize.y)
00966     subGridEnd.y = gridSize.y - 1;
00967   if (subGridEnd.z >= gridSize.z)
00968     subGridEnd.z = gridSize.z - 1;
00969 }
00970 
00971 
00972 bool CUDAMarchingCubes::computeIsosurface(float *volume, float *colors, 
00973                                           uint3 gridsize, float3 gridOrigin, 
00974                                           float3 boundingBox, bool cudaArray, 
00975                                           float3* vertOut, float3* normOut, 
00976                                           float3* colOut, unsigned int maxverts) {
00977     // Setup
00978     if (!Initialize(gridsize)) {
00979         return false;
00980     }
00981 
00982     if (!SetVolumeData(volume, colors, gridsize, gridOrigin, boundingBox, cudaArray)) {
00983         return false;
00984     }
00985 
00986     // Compute and Render Isosurface
00987     computeIsosurface(vertOut, normOut, colOut, maxverts);
00988 
00989     // Tear down and free resources
00990     Cleanup();
00991 
00992     return true;
00993 }
00994 
00995 
00996 float CUDAMarchingCubes::computeSurfaceArea( float3 *verts, unsigned int triaCount) {
00997     // do nothing for zero triangles
00998     if(triaCount <= 0) return 0.0f;
00999 
01000     // initialize and allocate device arrays
01001     float *d_areas;
01002     unsigned long memSize = sizeof(float) * triaCount;
01003     if (cudaMalloc((void**) &d_areas, memSize) != cudaSuccess) {
01004         return -1.0f;
01005     }
01006 
01007     // check for errors
01008     cudaThreadSynchronize();
01009     //printf( "cudaMalloc: %s\n", cudaGetErrorString( cudaGetLastError()));
01010 
01011     // compute area for each triangle
01012     int threads = 256;
01013     dim3 grid(int(ceil(float(triaCount) / float(threads))), 1, 1);
01014 
01015     // get around maximum grid size of 65535 in each dimension
01016     while(grid.x > 65535) {
01017         grid.x = (unsigned int) (ceil(float(grid.x) / 2.0f));
01018         grid.y *= 2;
01019     }
01020     while(grid.y > 65535) {
01021         grid.y = (unsigned int) (ceil(float(grid.y) / 2.0f));
01022         grid.z *= 2;
01023     }
01024 
01025     computeTriangleAreas<<<grid, threads>>>(verts, d_areas, triaCount);
01026     // check for errors
01027     cudaThreadSynchronize();
01028     //printf( "launch_computeTriangleAreas: %s\n", cudaGetErrorString( cudaGetLastError()));
01029 
01030     // use prefix sum to compute total surface area
01031     ThrustScanWrapperArea( d_areas, d_areas, triaCount);
01032     // check for errors
01033     cudaThreadSynchronize();
01034     //printf( "ThrustScanWrapperArea: %s\n", cudaGetErrorString( cudaGetLastError()));
01035 
01036     // readback total surface area
01037     float totalArea;
01038     cudaMemcpy((void *)&totalArea, (void *)(d_areas + triaCount - 1), sizeof(float), cudaMemcpyDeviceToHost);
01039     // check for errors
01040     cudaThreadSynchronize();
01041     //printf( "cudaMemcpy: %s\n", cudaGetErrorString( cudaGetLastError()));
01042 
01043     // clean up
01044     cudaFree(d_areas);
01045     // check for errors
01046     cudaThreadSynchronize();
01047     //printf( "cudaFree: %s\n", cudaGetErrorString( cudaGetLastError()));
01048 
01049     // return result
01050     return totalArea;
01051 }
01052 
01053 
01054 

Generated on Sat May 26 01:47:50 2012 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002