
CUDAWatershed.cu

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAWatershed.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.34 $        $Date: 2020/02/26 19:26:47 $
00014  *
00015  ***************************************************************************/
00021 #include <cstdio>
00022 
00023 #define WATERSHED_INTERNAL 1
00024 #include "CUDAWatershed.h"
00025 #include "ProfileHooks.h"
00026 
00027 #define px_offset_d    (1)
00028 #define py_offset_d    (width)
00029 #define pz_offset_d    (heightWidth)
00030 #define nx_offset_d    (-1)
00031 #define ny_offset_d    (-width)
00032 #define nz_offset_d    (-heightWidth)
00033 
00034 #define nx_ny_offset_d (-1 - width)
00035 #define nx_py_offset_d (-1 + width)
00036 #define px_py_offset_d (1 + width)
00037 #define px_ny_offset_d (1 - width)
00038 
00039 #define px_pz_offset_d (1 + heightWidth)
00040 #define nx_nz_offset_d (-1 - heightWidth)
00041 #define nx_pz_offset_d (-1 + heightWidth)
00042 #define px_nz_offset_d (1 - heightWidth)
00043 
00044 #define py_pz_offset_d (width + heightWidth)
00045 #define ny_nz_offset_d (-width - heightWidth)
00046 #define ny_pz_offset_d (-width + heightWidth)
00047 #define py_nz_offset_d (width - heightWidth)
00048 
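//
// Note on the offset macros above: for a volume stored x-fastest, a +/-1
// step in x is an index offset of +/-1, a step in y is +/-width, and a step
// in z is +/-heightWidth (width*height).  The diagonal macros are simply
// sums of two axis offsets, e.g. py_pz_offset_d = (width + heightWidth)
// addresses the voxel one step in +y and one step in +z.
//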
00049 //#define GPU_BLOCK_UPDATE
00050 #define GPU_WARP_UPDATE
00051 
00052 #ifdef GPU_BLOCK_UPDATE
00053 
00054 #define BLOCK_X_DIM 128
00055 #define BLOCK_Y_DIM 2
00056 #define BLOCK_Z_DIM 2
00057 
00058 #else
00059 
00060 // BLOCK_X_DIM must be 2^n
00061 #define BLOCK_X_DIM 64
00062 #define BLOCK_Y_DIM 2
00063 #define BLOCK_Z_DIM 2
00064 
00065 #endif
00066 
00067 // 
00068 // GPU-side constant memory and change flag globals used 
00069 // by the neighbor calculation and update kernels
00070 //
00071 __constant__ int neighbor_offsets_d[19];
00072 __device__ unsigned int changes_d;
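// neighbor_offsets_d is a constant-memory table, filled by
// init_neighbor_offsets(), that maps a packed neighbor code back to a 1-D
// index offset.  changes_d is a device-global "work remains" flag:
// update_kernel() sets it to 1 whenever any voxel's group or value changes,
// and the host clears it with cudaMemcpyToSymbol[Async]() before each sweep
// and reads it back with cudaMemcpyFromSymbol() in update_cuda() to decide
// whether another pass is needed.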
00073 
00074 #define INST_DESTROY_GPU(G_T) \
00075 template void destroy_gpu<G_T,float>(watershed_gpu_state_t<G_T,float> &gpu_state);\
00076 template void destroy_gpu<G_T,unsigned short>(watershed_gpu_state_t<G_T,unsigned short> &gpu_state);\
00077 template void destroy_gpu<G_T,unsigned char>(watershed_gpu_state_t<G_T,unsigned char> &gpu_state);
00078 
00079 #define INST_INIT_GPU_ON_DEV(G_T) \
00080 template bool init_gpu_on_device<G_T, float>(watershed_gpu_state_t<G_T,float> &gpu_state, float* image, int imageongpu,\
00081                                              unsigned int w, unsigned int h, unsigned int d);\
00082 template bool init_gpu_on_device<G_T, unsigned short>(watershed_gpu_state_t<G_T,unsigned short> &gpu_state,\
00083                                 unsigned short* image, int imageongpu, unsigned int w, unsigned int h, unsigned int d);\
00084 template bool init_gpu_on_device<G_T, unsigned char>(watershed_gpu_state_t<G_T,unsigned char> &gpu_state,\
00085                                 unsigned char* image, int imageongpu, unsigned int w, unsigned int h, unsigned int d);
00086 
00087 #define INST_INIT_GPU(G_T) \
00088 template bool init_gpu<G_T, float>(state_t<G_T,float> &state, int *eq_and_lower,\
00089                              watershed_gpu_state_t<G_T,float> &gpu_state,\
00090                              unsigned int w, unsigned int h, unsigned int d);\
00091 template bool init_gpu<G_T, unsigned short>(state_t<G_T,unsigned short> &state, int *eq_and_lower,\
00092                              watershed_gpu_state_t<G_T,unsigned short> &gpu_state,\
00093                              unsigned int w, unsigned int h, unsigned int d);\
00094 template bool init_gpu<G_T, unsigned char>(state_t<G_T,unsigned char> &state, int *eq_and_lower,\
00095                              watershed_gpu_state_t<G_T,unsigned char> &gpu_state,\
00096                              unsigned int w, unsigned int h, unsigned int d);
00097 
00098 #define INST_UPDATE_CUDA(G_T) \
00099 template void update_cuda<G_T,float>(watershed_gpu_state_t<G_T,float> &gpu_state, \
00100                                 G_T *final_segments_d);\
00101 template void update_cuda<G_T,unsigned short>(watershed_gpu_state_t<G_T,unsigned short> &gpu_state, \
00102                                 G_T *final_segments_d);\
00103 template void update_cuda<G_T,unsigned char>(watershed_gpu_state_t<G_T,unsigned char> &gpu_state, \
00104                                 G_T *final_segments_d);
00105 
00106 INST_DESTROY_GPU(unsigned long)
00107 INST_DESTROY_GPU(unsigned int)
00108 INST_DESTROY_GPU(unsigned short)
00109 
00110 INST_INIT_GPU_ON_DEV(unsigned long)
00111 INST_INIT_GPU_ON_DEV(unsigned int)
00112 INST_INIT_GPU_ON_DEV(unsigned short)
00113 
00114 INST_INIT_GPU(unsigned long)
00115 INST_INIT_GPU(unsigned int)
00116 INST_INIT_GPU(unsigned short)
00117 
00118 INST_UPDATE_CUDA(unsigned long)
00119 INST_UPDATE_CUDA(unsigned int)
00120 INST_UPDATE_CUDA(unsigned short)
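//
// The INST_* macros above emit explicit template instantiations for the
// float, unsigned short, and unsigned char image types.  For example,
// INST_UPDATE_CUDA(unsigned int) expands to three statements of the form:
//
//   template void update_cuda<unsigned int,float>(
//                   watershed_gpu_state_t<unsigned int,float> &gpu_state,
//                   unsigned int *final_segments_d);
//
// and likewise for the unsigned short and unsigned char image types.
//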
00121 
00122 //
00123 // Explicit template instantiations so that all required variants are 
00124 // compiled and available at link time.
00125 //
00126 // instantiate for: long, float
00127 template void update_cuda<long,float>(watershed_gpu_state_t<long,float> &gpu_state, 
00128                                 long *final_segments_d);
00129 // instantiate for: long, unsigned short
00130 template void update_cuda<long,unsigned short>(watershed_gpu_state_t<long,unsigned short> &gpu_state, 
00131                                 long *final_segments_d);
00132 // instantiate for: long, unsigned char
00133 template void update_cuda<long,unsigned char>(watershed_gpu_state_t<long,unsigned char> &gpu_state, 
00134                                 long *final_segments_d);
00135 
00136 // instantiate for: int, float
00137 template void update_cuda<int,float>(watershed_gpu_state_t<int,float> &gpu_state, 
00138                                int *final_segments_d);
00139 // instantiate for: int, unsigned short
00140 template void update_cuda<int,unsigned short>(watershed_gpu_state_t<int,unsigned short> &gpu_state, 
00141                                int *final_segments_d);
00142 // instantiate for: int, unsigned char
00143 template void update_cuda<int,unsigned char>(watershed_gpu_state_t<int,unsigned char> &gpu_state, 
00144                                int *final_segments_d);
00145 
00146 // instantiate for: short, float
00147 template void update_cuda<short,float>(watershed_gpu_state_t<short,float> &gpu_state,
00148                                  short *final_segments_d);
00149 // instantiate for: short, unsigned short
00150 template void update_cuda<short,unsigned short>(watershed_gpu_state_t<short,unsigned short> &gpu_state,
00151                                  short *final_segments_d);
00152 // instantiate for: short, unsigned char
00153 template void update_cuda<short,unsigned char>(watershed_gpu_state_t<short,unsigned char> &gpu_state,
00154                                  short *final_segments_d);
00155 
00156 
00157 #define DIV_BY_32 5
00158 #define GIGABYTE (1024.0f*1024.0f*1024.0f)
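// DIV_BY_32 is the shift used to convert a thread x index into a warp x
// index (x >> DIV_BY_32 == x / 32, the warp size).  GIGABYTE is just a
// convenience constant, presumably for reporting buffer sizes in
// diagnostic output.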
00159 
00160 template <typename GROUP_T, typename IMAGE_T>
00161 __global__ void update_kernel(GROUP_T* __restrict__ current_group,
00162                               GROUP_T* __restrict__ next_group,
00163                               IMAGE_T* __restrict__ current_value,
00164                               IMAGE_T* __restrict__ next_value,
00165                               int* __restrict__ eq_and_lower,
00166                               unsigned char* __restrict__ current_update,
00167                               unsigned char* __restrict__ next_update,
00168                               const int width,
00169                               const int height,
00170                               const int depth);
00171 
00172 template <typename GROUP_T, typename IMAGE_T>
00173 void init_neighbor_offsets(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state) {
00174   int height = gpu_state.height;
00175   int width = gpu_state.width;
00176   int heightWidth = height * width;
00177   int neighbor_offsets_t[19] = { 0,
00178     px_offset,
00179     py_offset,
00180     pz_offset,
00181     nx_offset,
00182     ny_offset,
00183     nz_offset,
00184 
00185     nx_ny_offset,
00186     nx_py_offset,
00187     px_py_offset,
00188     px_ny_offset,
00189 
00190     px_pz_offset,
00191     nx_nz_offset,
00192     nx_pz_offset,
00193     px_nz_offset,
00194 
00195     py_pz_offset,
00196     ny_nz_offset,
00197     ny_pz_offset,
00198     py_nz_offset,
00199   };  
00200 
00201   cudaMemcpyToSymbolAsync(neighbor_offsets_d, neighbor_offsets_t, sizeof(int) * 19);
00202 }
00203 
00204 
00205 #define CALCULATE_NEIGHBORS_CUDA(offset_str) {\
00206   const long idx_offset = idx + offset_str##_offset_d;\
00207   slope = curr_intensity - image[idx_offset];\
00208   if (slope < smallest_slope) {\
00209     smallest_slope = slope;\
00210     curr_lower = offset_str##_idx;\
00211   } else if (slope >= -FLOAT_DIFF && slope <= FLOAT_DIFF) {\
00212     curr_n_eq |= offset_str;\
00213     if (idx_offset < min_group) {\
00214       min_group = idx_offset;\
00215     }\
00216   } \
00217   }
00218 
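//
// CALCULATE_NEIGHBORS_CUDA(px), for example, expands to a test against the
// +x neighbor: it forms the intensity difference to image[idx + px_offset_d],
// keeps the neighbor with the smallest difference seen so far in curr_lower
// (stored as the px_idx neighbor code), and when the difference is within
// +/-FLOAT_DIFF it sets the px equality bit in curr_n_eq and tracks the
// lowest index of the flat region in min_group.
//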
00219 
00220 // kernel for computing initial neighbor arrays on the GPU
00221 template <typename GROUP_T, typename IMAGE_T>
00222 __global__ void calc_neighbors_kernel(const IMAGE_T* __restrict__ image,
00223                                       GROUP_T* __restrict__ next_group,
00224                                       IMAGE_T* __restrict__ curr_value,
00225                                       IMAGE_T* __restrict__ next_value,
00226                                       int* __restrict__ eq_and_lower,
00227                                       const int width,
00228                                       const int height,
00229                                       const int depth) {
00230 
00231   const int x = blockIdx.x * BLOCK_X_DIM + threadIdx.x;
00232   const int y = blockIdx.y * BLOCK_Y_DIM + threadIdx.y;
00233   const int z = blockIdx.z * BLOCK_Z_DIM + threadIdx.z;
00234   const long idx = z * long(height) * long(width) + y * long(width) + x;
00235   const int heightWidth = height * width;
00236 
00237   if (x >= width || y >= height || z >= depth) {
00238     return;
00239   }
00240 
00241   //
00242   // beginning of neighbor calculation macros 
00243   //
00244   float slope;
00245   float smallest_slope = -FLOAT_DIFF;
00246   IMAGE_T curr_intensity = image[idx];
00247   long min_group = idx; // Group number = idx to start
00248   int curr_n_eq = 0;
00249   long curr_lower = 0;
00250   //TODO change to be actual offset instead of index into offset array
00251 
00252 #ifdef CONNECTED_18
00253   if (x > 0) {
00254     CALCULATE_NEIGHBORS_CUDA(nx);
00255   }
00256   if (x < width-1) {
00257     CALCULATE_NEIGHBORS_CUDA(px);
00258   }
00259 
00260   /* Calculate n_lower and n_eq */
00261   if (y > 0) {
00262     CALCULATE_NEIGHBORS_CUDA(ny);
00263     if (x < width - 1) {
00264       CALCULATE_NEIGHBORS_CUDA(px_ny);
00265     }
00266   }
00267   if (y < height-1) {
00268     CALCULATE_NEIGHBORS_CUDA(py);
00269     if (x < width - 1) {
00270       CALCULATE_NEIGHBORS_CUDA(px_py);
00271     }
00272   }
00273 
00274   if (z > 0) {
00275     CALCULATE_NEIGHBORS_CUDA(nz);
00276     if (x < width - 1) {
00277       CALCULATE_NEIGHBORS_CUDA(px_nz);
00278     }
00279     if (x > 0) {
00280       CALCULATE_NEIGHBORS_CUDA(nx_nz);
00281     }
00282     if (y > 0 ) {
00283       CALCULATE_NEIGHBORS_CUDA(ny_nz);
00284     }
00285     if (y < height - 1) {
00286       CALCULATE_NEIGHBORS_CUDA(py_nz);
00287     }
00288   }
00289   if (z < depth-1) {
00290     CALCULATE_NEIGHBORS_CUDA(pz);
00291     if (x < width - 1) {
00292       CALCULATE_NEIGHBORS_CUDA(px_pz);
00293     }
00294     if (x > 0) {
00295       CALCULATE_NEIGHBORS_CUDA(nx_pz);
00296     }
00297     if (y < height - 1) {
00298       CALCULATE_NEIGHBORS_CUDA(py_pz);
00299     }
00300     if (y > 0) {
00301       CALCULATE_NEIGHBORS_CUDA(ny_pz);
00302     }
00303   }
00304 #else // 6 connected neighbors
00305   if (x > 0) {
00306     CALCULATE_NEIGHBORS_CUDA(nx);
00307   }
00308   if (x < width-1) {
00309     CALCULATE_NEIGHBORS_CUDA(px);
00310   }
00311 
00312   /* Calculate n_lower and n_eq */
00313   if (y > 0) {
00314     CALCULATE_NEIGHBORS_CUDA(ny);
00315   }
00316   if (y < height-1) {
00317     CALCULATE_NEIGHBORS_CUDA(py);
00318   }
00319 
00320   if (z > 0) {
00321     CALCULATE_NEIGHBORS_CUDA(nz);
00322   }
00323   if (z < depth-1) {
00324     CALCULATE_NEIGHBORS_CUDA(pz);
00325   }
00326 #endif // CONNECTED_18
00327 
00328   /* Update current voxel */
00329   eq_and_lower[idx] = MAKE_EQUAL_AND_LOWER(curr_n_eq, curr_lower);
00330   if (curr_lower == 0) {
00331     /* Minimum */
00332     next_group[idx] = min_group;
00333     next_value[idx] = image[idx];
00334     curr_value[idx] = image[idx];
00335   } else {
00336     /* Not minimum */
00337     const long nIdx = idx + neighbor_offsets_d[curr_lower];
00338     next_value[idx] = image[nIdx];
00339     curr_value[idx] = image[nIdx];
00340     next_group[idx] = nIdx;
00341   }
00342 }
00343 
00344 
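// set_gpu_state() uploads the CPU-initialized group, value, and packed
// equal/lower neighbor arrays into the device buffers.  It is only needed
// on the init_gpu() path; init_gpu_on_device() computes the same arrays
// directly on the GPU with calc_neighbors_kernel().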
00345 template <typename GROUP_T, typename IMAGE_T>
00346 void set_gpu_state(state_t<GROUP_T, IMAGE_T> &state, int *eq_and_lower, 
00347                    watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state,
00348                    unsigned long nVoxels) {
00349   cudaMemcpy(gpu_state.segments_d, state.group, 
00350              nVoxels * sizeof(GROUP_T), cudaMemcpyHostToDevice);
00351   cudaMemcpy(gpu_state.current_value_d, state.value, 
00352              nVoxels * sizeof(IMAGE_T), cudaMemcpyHostToDevice);
00353   cudaMemcpy(gpu_state.next_value_d, gpu_state.current_value_d, 
00354              nVoxels * sizeof(IMAGE_T), cudaMemcpyDeviceToDevice);
00355   cudaMemcpy(gpu_state.eq_and_lower_d, eq_and_lower, 
00356              nVoxels * sizeof(int), cudaMemcpyHostToDevice);
00357 }
00358 
00359 
00360 // Completely GPU-based initialization routine, to replace CPU-based initialization
00361 template <typename GROUP_T, typename IMAGE_T>
00362 bool init_gpu_on_device(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state, 
00363                         IMAGE_T* image, int imageongpu,
00364                         unsigned int w, unsigned int h, unsigned int d) {
00365   PROFILE_PUSH_RANGE("CUDAWatershed::init_gpu_on_device()", 0);
00366 
00367   unsigned int changes = 0;
00368   unsigned long nVoxels = long(h) * long(w) * long(d);
00369   gpu_state.width = w;
00370   gpu_state.height = h;
00371   gpu_state.depth = d;
00372   IMAGE_T* image_d;
00373 
00374   cudaMemcpyToSymbolAsync(changes_d, &changes, sizeof(unsigned int));
00375 
00376   if (imageongpu) {
00377     image_d = image;
00378   } else {  
00379     cudaMalloc(&image_d, nVoxels * sizeof(IMAGE_T));
00380     cudaMemcpyAsync(image_d, image, nVoxels * sizeof(IMAGE_T), cudaMemcpyHostToDevice);
00381   }
00382 
00383   cudaMalloc(&gpu_state.segments_d, nVoxels * sizeof(GROUP_T));
00384   cudaMalloc(&gpu_state.current_value_d, nVoxels * sizeof(IMAGE_T));
00385   cudaMalloc(&gpu_state.next_value_d, nVoxels * sizeof(IMAGE_T));
00386   cudaMalloc(&gpu_state.eq_and_lower_d, nVoxels * sizeof(int));
00387 
00388 #ifdef GPU_BLOCK_UPDATE
00389   long t_width = ceil((float)gpu_state.width / BLOCK_X_DIM);
00390   long t_height = ceil((float)gpu_state.height / BLOCK_Y_DIM);
00391   long t_depth = ceil((float)gpu_state.depth / BLOCK_Z_DIM);
00392   long num_blocks = t_width * t_height * t_depth;
00393   cudaMalloc(&gpu_state.next_update_d, num_blocks * sizeof(char));
00394   cudaMalloc(&gpu_state.current_update_d, num_blocks * sizeof(char));
00395   cudaMemsetAsync(gpu_state.next_update_d, 0, num_blocks * sizeof(char));
00396   cudaMemsetAsync(gpu_state.current_update_d, 1, num_blocks * sizeof(char));
00397 #endif // GPU_BLOCK_UPDATE
00398 
00399 #ifdef GPU_WARP_UPDATE
00400   long warp_x_dim = ceil(gpu_state.width/32.0f);
00401   long num_warps = ceil(gpu_state.height/32.0f) * 32L * ceil(gpu_state.depth/32.0f)* 32L * warp_x_dim;
00402   cudaMalloc(&gpu_state.next_update_d, num_warps * sizeof(char));
00403   cudaMalloc(&gpu_state.current_update_d, num_warps * sizeof(char));
00404   cudaMemsetAsync(gpu_state.next_update_d, 0, num_warps * sizeof(char));
00405   cudaMemsetAsync(gpu_state.current_update_d, 1, num_warps * sizeof(char));
00406 #endif // GPU_WARP_UPDATE
00407 
00408   init_neighbor_offsets(gpu_state);
00409 
00410   const dim3 block(BLOCK_X_DIM, BLOCK_Y_DIM, BLOCK_Z_DIM);
00411   size_t x_grid = ceil((float)gpu_state.width/BLOCK_X_DIM);
00412   size_t y_grid = ceil((float)gpu_state.height/BLOCK_Y_DIM);
00413   size_t z_grid = ceil((float)gpu_state.depth/BLOCK_Z_DIM);
00414   const dim3 grid(x_grid, y_grid, z_grid);
00415   calc_neighbors_kernel<<<grid, block>>>(image_d, gpu_state.segments_d, gpu_state.current_value_d,
00416                                          gpu_state.next_value_d, gpu_state.eq_and_lower_d, w, h, d);
00417 
00418 #if 0
00419   cudaDeviceSynchronize();
00420   cudaError_t error = cudaGetLastError();
00421   if (error != cudaSuccess) {
00422     printf("Error initializing device: %s\n", cudaGetErrorString(error));
00423     destroy_gpu(gpu_state);
00424     PROFILE_POP_RANGE();
00425     return false;
00426   }
00427 #endif
00428 
00429   gpu_state.init = true;
00430 
00431   // only free the image if it was a temp copy from the host
00432   if (!imageongpu) 
00433     cudaFree(image_d);
00434 
00435   PROFILE_POP_RANGE();
00436 
00437   return true;
00438 }
00439 
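//
// Sketch of a typical call sequence for the GPU-side initialization path
// (illustrative only; voldata_d stands in for the caller's float density
// map already resident on the GPU, and segments_d for a caller-owned device
// buffer of width*height*depth unsigned ints that receives the result):
//
//   watershed_gpu_state_t<unsigned int, float> gs;
//   if (init_gpu_on_device(gs, voldata_d, 1 /* image already on GPU */,
//                          width, height, depth)) {
//     update_cuda(gs, segments_d);  // iterate until no voxel changes
//     destroy_gpu(gs);              // release all temporary device buffers
//   }
//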
00440 
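// init_gpu() is the host-initialized variant: the caller performs the
// neighbor classification on the CPU first (producing state.group,
// state.value, and the packed eq_and_lower array), and this routine only
// allocates the device buffers and uploads those arrays via set_gpu_state().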
00441 template <typename GROUP_T, typename IMAGE_T>
00442 bool init_gpu(state_t<GROUP_T, IMAGE_T> &state, int *eq_and_lower, 
00443               watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state,
00444               unsigned int w, unsigned int h, unsigned int d) {
00445   PROFILE_MARK("CUDAWatershed::init_gpu()", 0);
00446 
00447   unsigned int changes = 0;
00448   unsigned long nVoxels = long(h) * long(w) * long(d);
00449   gpu_state.width = w;
00450   gpu_state.height = h;
00451   gpu_state.depth = d;
00452 
00453   cudaMalloc(&gpu_state.segments_d, nVoxels * sizeof(GROUP_T));
00454   cudaMalloc(&gpu_state.current_value_d, nVoxels * sizeof(IMAGE_T));
00455   cudaMalloc(&gpu_state.next_value_d, nVoxels * sizeof(IMAGE_T));
00456   cudaMalloc(&gpu_state.eq_and_lower_d, nVoxels * sizeof(int));
00457 
00458 #ifdef GPU_BLOCK_UPDATE
00459   long t_width = ceil((float)gpu_state.width / BLOCK_X_DIM);
00460   long t_height = ceil((float)gpu_state.height / BLOCK_Y_DIM);
00461   long t_depth = ceil((float)gpu_state.depth / BLOCK_Z_DIM);
00462   long num_blocks = t_width * t_height * t_depth;
00463   cudaMalloc(&gpu_state.next_update_d, num_blocks * sizeof(char));
00464   cudaMalloc(&gpu_state.current_update_d, num_blocks * sizeof(char));
00465   cudaMemset(gpu_state.next_update_d, 0, num_blocks * sizeof(char));
00466   cudaMemset(gpu_state.current_update_d, 1, num_blocks * sizeof(char));
00467 #endif // GPU_BLOCK_UPDATE
00468 
00469 #ifdef GPU_WARP_UPDATE
00470   long warp_x_dim = ceil(gpu_state.width/32.0f);
00471   long num_warps = ceil(gpu_state.height/32.0f) * 32L * ceil(gpu_state.depth/32.0f) * 32L * warp_x_dim;
00472   cudaMalloc(&gpu_state.next_update_d, num_warps * sizeof(char));
00473   cudaMalloc(&gpu_state.current_update_d, num_warps * sizeof(char));
00474   cudaMemset(gpu_state.next_update_d, 0, num_warps * sizeof(char));
00475   cudaMemset(gpu_state.current_update_d, 1, num_warps * sizeof(char));
00476 #endif // GPU_WARP_UPDATE
00477 
00478   cudaMemcpyToSymbol(changes_d, &changes, sizeof(unsigned int));
00479   init_neighbor_offsets(gpu_state);
00480   gpu_state.init = true;
00481 
00482 #if 0
00483   cudaDeviceSynchronize();
00484   cudaError_t error = cudaGetLastError();
00485   if (error != cudaSuccess) {
00486     printf("Error initializing device2: %s\n", cudaGetErrorString(error));
00487     destroy_gpu(gpu_state);
00488     return false;
00489   }
00490 #endif
00491 
00492   // copy host-side arrays to the GPU
00493   set_gpu_state(state, eq_and_lower, gpu_state, nVoxels);
00494 
00495   return true;
00496 }
00497 
00498 
00499 template <typename GROUP_T, typename IMAGE_T>
00500 void destroy_gpu(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state) {
00501 #if defined(VERBOSE)
00502   cudaDeviceSynchronize();
00503   cudaError_t error = cudaGetLastError();
00504   if (error != cudaSuccess) {
00505     printf("Error before destroy_gpu: %s\n", cudaGetErrorString(error));
00506   }
00507 #endif
00508 
00509   if (gpu_state.init) {
00510     if (gpu_state.segments_d != NULL) {
00511       cudaFree(gpu_state.segments_d);
00512       gpu_state.segments_d = NULL;
00513     }
00514 
00515     if (gpu_state.current_value_d != NULL) {
00516       cudaFree(gpu_state.current_value_d);
00517       gpu_state.current_value_d = NULL;
00518     }
00519     if (gpu_state.next_value_d != NULL) {
00520       cudaFree(gpu_state.next_value_d);
00521       gpu_state.next_value_d = NULL;
00522     }
00523 
00524 
00525     if (gpu_state.eq_and_lower_d != NULL) {
00526       cudaFree(gpu_state.eq_and_lower_d);
00527       gpu_state.eq_and_lower_d = NULL;
00528     }
00529 
00530 #if defined(GPU_BLOCK_UPDATE) || defined(GPU_WARP_UPDATE)
00531     // free the per-block / per-warp update flag buffers used by either scheme
00532     if (gpu_state.current_update_d != NULL) {
00533       cudaFree(gpu_state.current_update_d);
00534       gpu_state.current_update_d = NULL;
00535     }
00536     if (gpu_state.next_update_d != NULL) {
00537       cudaFree(gpu_state.next_update_d);
00538       gpu_state.next_update_d = NULL;
00539     }
00540 #endif // GPU_BLOCK_UPDATE || GPU_WARP_UPDATE
00548     gpu_state.init = false;
00549   }
00550 
00551 #if defined(VERBOSE)
00552   cudaDeviceSynchronize();
00553   error = cudaGetLastError();
00554   if (error != cudaSuccess) {
00555     printf("Error after destroy_gpu: %s\n", cudaGetErrorString(error));
00556   }
00557 #endif
00558 }
00559 
00560 
00561 template <typename GROUP_T, typename IMAGE_T>
00562 void update_cuda(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state, GROUP_T *final_segments_d) {
00563   const dim3 block(BLOCK_X_DIM, BLOCK_Y_DIM, BLOCK_Z_DIM);
00564   size_t x_grid = ceil((float)gpu_state.width/BLOCK_X_DIM);
00565   size_t y_grid = ceil((float)gpu_state.height/BLOCK_Y_DIM);
00566   size_t z_grid = ceil((float)gpu_state.depth/BLOCK_Z_DIM);
00567   const dim3 grid(x_grid, y_grid, z_grid);
00568   unsigned int changes = 1;
00569   const unsigned int zero = 0;
00570   GROUP_T *current_group_d = gpu_state.segments_d;
00571   GROUP_T *next_group_d = final_segments_d;
00572 
00573 #ifdef VERBOSE
00574   printf("Block =  %d x %d x %d = %d\n", block.x, block.y, block.z, block.x*block.y*block.z);
00575   printf("Grid =  %d x %d x %d = %d\n", grid.x, grid.y, grid.z, grid.x*grid.y*grid.z);
00576   printf("Threads = %d\n", grid.x*grid.y*grid.z*block.x*block.y*block.z);
00577   printf("Image = %d x %d x %d\n", gpu_state.height, gpu_state.width, gpu_state.depth);
00578   printf("Voxels  = %d\n", gpu_state.height * gpu_state.width * gpu_state.depth);
00579 #endif // VERBOSE
00580 
00581   //
00582   // run update_kernel in a loop until no changes are flagged in changes_d
00583   // XXX we may want to modify this scheme to reduce or eliminate the
00584   //     cudaMemcpyToSymbol() calls, by making an "eager" loop that
00585   //     only checks the outcome every N iterations (along with a kernel
00586   //     early exit scheme), by using writes via host-mapped memory, 
00587   //     or a similar technique.
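  //
  //     A rough sketch of the "check every N iterations" idea mentioned
  //     above (not used here; N_SWEEPS would be a small tuning constant):
  //
  //       for (int i=0; i<N_SWEEPS; i++) {
  //         update_kernel<GROUP_T><<<grid, block>>>(/* same args as below */);
  //         // swap the ping-pong buffers exactly as in the loop below
  //       }
  //       cudaMemcpyFromSymbol(&changes, changes_d, sizeof(changes));
  //
  //     trading a few redundant sweeps for fewer host/device round trips.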
00588   while (changes) {
00589     cudaMemcpyToSymbolAsync(changes_d, &zero, sizeof(changes_d));
00590 
00591     update_kernel<GROUP_T><<<grid, block>>>(current_group_d, next_group_d, gpu_state.current_value_d,
00592                                       gpu_state.next_value_d, gpu_state.eq_and_lower_d,
00593                                       gpu_state.current_update_d, gpu_state.next_update_d,
00594                                       gpu_state.width, gpu_state.height, gpu_state.depth);
00595 
00596     SWAP(current_group_d, next_group_d, GROUP_T*);
00597     SWAP(gpu_state.current_value_d, gpu_state.next_value_d, IMAGE_T*);
00598     SWAP(gpu_state.current_update_d, gpu_state.next_update_d, unsigned char*);
00599 
00600     cudaMemcpyFromSymbol(&changes, changes_d, sizeof(changes));
00601 
00602 #if defined(VERBOSE)
00603     cudaDeviceSynchronize();
00604     cudaError_t error = cudaGetLastError();
00605     if (error != cudaSuccess) {
00606       printf("Error in update: %s\n", cudaGetErrorString(error));
00607       destroy_gpu(gpu_state);
00608       exit(-1);
00609     }
00610 #endif // VERBOSE
00611   }
00612 
00613   // copy segment groups if necessary
00614   if (current_group_d != final_segments_d) {
00615     long nVoxels = gpu_state.height * gpu_state.width * gpu_state.depth;
00616     cudaMemcpy(final_segments_d, current_group_d,
00617                nVoxels * sizeof(GROUP_T), cudaMemcpyDeviceToDevice);
00618   }
00619 }
00620 
00621 
00622 #define UPDATE_VOXEL_CUDA(and_value, idx, curr_group, smallest_value) {\
00623   if (n_eq & and_value) {\
00624     const long offset_idx = idx + and_value##_offset_d;\
00625     if (current_value[offset_idx] + FLOAT_DIFF < smallest_value) {\
00626       smallest_value = current_value[offset_idx];\
00627       offset_number = and_value##_idx;\
00628       next_idx = idx + and_value##_offset_d;\
00629     }\
00630     curr_group = current_group[offset_idx] < curr_group ?\
00631     current_group[offset_idx] : curr_group;\
00632   }}
00633 
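//
// UPDATE_VOXEL_CUDA(py, ...), for example, expands to: if the py equality
// bit is set in n_eq, compare the +y plateau neighbor's propagated value
// against the running minimum (updating smallest_value, offset_number, and
// next_idx when it is smaller by more than FLOAT_DIFF), and always keep the
// smaller of the two group numbers in curr_group.
//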
00634 
00635 template <typename GROUP_T, typename IMAGE_T>
00636 __global__ void update_kernel(GROUP_T* __restrict__ current_group,
00637                               GROUP_T* __restrict__ next_group,
00638                               IMAGE_T* __restrict__ current_value,
00639                               IMAGE_T* __restrict__ next_value,
00640                               int* __restrict__ eq_and_lower,
00641                               unsigned char* __restrict__ current_update,
00642                               unsigned char* __restrict__ next_update,
00643                               const int width,
00644                               const int height,
00645                               const int depth) {
00646 
00647   const int tx = blockIdx.x * BLOCK_X_DIM + threadIdx.x;
00648   const int ty = blockIdx.y * BLOCK_Y_DIM + threadIdx.y;
00649   const int tz = blockIdx.z * BLOCK_Z_DIM + threadIdx.z;
00650   const long idx = tz * long(height) * long(width) + ty * long(width) + tx;
00651   const int heightWidth = height * width;
00652 
00653   if (tx >= width || ty >= height || tz >= depth) {
00654     return;
00655   }
00656 
00657 #ifdef GPU_BLOCK_UPDATE
00658   const long block_x = blockIdx.x;
00659   const long block_y = blockIdx.y;
00660   const long block_z = blockIdx.z;
00661   const long block_idx = block_z * gridDim.y * gridDim.x + block_y * gridDim.x + block_x;
00662   if (current_update[block_idx] == 0) {
00663     return;
00664   }
00665   current_update[block_idx] = 0;
00666 #endif // GPU_BLOCK_UPDATE
00667 
00668 #ifdef GPU_WARP_UPDATE
00669   const long warp_x = tx >> DIV_BY_32;
00670   const long warp_y = ty;
00671   const long warp_z = tz;
00672   const long warp_x_dim = (BLOCK_X_DIM * gridDim.x) >> DIV_BY_32;
00673   const long warp_y_dim = BLOCK_Y_DIM * gridDim.y;
00674   const long warp_z_dim = BLOCK_Z_DIM * gridDim.z;
00675   const long warp_idx = warp_z * warp_y_dim * warp_x_dim + warp_y * warp_x_dim + warp_x;
00676 
00677   if (current_update[warp_idx] == 0) {
00678     return;
00679   }
00680   current_update[warp_idx] = 0;
00681 #endif // GPU_WARP_UPDATE
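  // Both early-exit schemes above implement the same idea at different
  // granularity: a region (a thread block, or a 32-voxel span of a warp)
  // is reprocessed only if it or one of its six face neighbors changed in
  // the previous sweep (see the next_update marking at the end of this
  // kernel), so regions that have already converged return immediately.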
00682 
00683   bool update = false;
00684   const int equal_and_lower = eq_and_lower[idx];
00685   const int offset_number = GET_N_LOWER(equal_and_lower);
00686   if (offset_number != 0) {
00687     const long nidx = idx + neighbor_offsets_d[offset_number];
00688     if (next_group[idx] != current_group[nidx] || next_value[idx] != current_value[nidx]) {
00689       next_group[idx] = current_group[nidx];
00690       next_value[idx] = current_value[nidx];
00691       update = true;
00692     }
00693   } else {
00694     const int n_eq = GET_N_EQ(equal_and_lower);
00695     IMAGE_T smallest_value = current_value[idx];
00696     GROUP_T curr_group = current_group[idx];
00697     int offset_number = 0;
00698     long next_idx = 0;
00699 
00700 #ifdef CONNECTED_18
00701     /* +-x */
00702     UPDATE_VOXEL_CUDA(nx, idx, curr_group, smallest_value);
00703     UPDATE_VOXEL_CUDA(px, idx, curr_group, smallest_value);
00704 
00705     /* +-x, -y*/
00706     UPDATE_VOXEL_CUDA(nx_ny, idx, curr_group, smallest_value);
00707     UPDATE_VOXEL_CUDA(   ny, idx, curr_group, smallest_value);
00708     UPDATE_VOXEL_CUDA(px_ny, idx, curr_group, smallest_value);
00709 
00710     /* +-x, +y*/
00711     UPDATE_VOXEL_CUDA(nx_py, idx, curr_group, smallest_value);
00712     UPDATE_VOXEL_CUDA(   py, idx, curr_group, smallest_value);
00713     UPDATE_VOXEL_CUDA(px_py, idx, curr_group, smallest_value);
00714 
00715     /* +-x, +z*/
00716     UPDATE_VOXEL_CUDA(px_pz, idx, curr_group, smallest_value);
00717     UPDATE_VOXEL_CUDA(nx_pz, idx, curr_group, smallest_value);
00718     /* +-y, +z*/
00719     UPDATE_VOXEL_CUDA(py_pz, idx, curr_group, smallest_value);
00720     UPDATE_VOXEL_CUDA(   pz, idx, curr_group, smallest_value);
00721     UPDATE_VOXEL_CUDA(ny_pz, idx, curr_group, smallest_value);
00722 
00723     /* +-x, -z*/
00724     UPDATE_VOXEL_CUDA(nx_nz, idx, curr_group, smallest_value);
00725     UPDATE_VOXEL_CUDA(   nz, idx, curr_group, smallest_value);
00726     UPDATE_VOXEL_CUDA(px_nz, idx, curr_group, smallest_value);
00727     /* +-y, -z*/
00728     UPDATE_VOXEL_CUDA(py_nz, idx, curr_group, smallest_value);
00729     UPDATE_VOXEL_CUDA(ny_nz, idx, curr_group, smallest_value);
00730 
00731 #else // 6 connected neighbors
00732     UPDATE_VOXEL_CUDA(nx, idx, curr_group, smallest_value);
00733     UPDATE_VOXEL_CUDA(px, idx, curr_group, smallest_value);
00734     UPDATE_VOXEL_CUDA(ny, idx, curr_group, smallest_value);
00735     UPDATE_VOXEL_CUDA(py, idx, curr_group, smallest_value);
00736     UPDATE_VOXEL_CUDA(nz, idx, curr_group, smallest_value);
00737     UPDATE_VOXEL_CUDA(pz, idx, curr_group, smallest_value);
00738 #endif // CONNECTED_18
00739 
00740     if (offset_number != 0) {
00741       eq_and_lower[idx] = MAKE_EQUAL_AND_LOWER(n_eq, offset_number);
00742       next_value[idx] = smallest_value;
00743       next_group[idx] = current_group[next_idx];
00744       update = true;
00745     } else if (curr_group != next_group[idx]) {
00746       next_group[idx] = curr_group;
00747       update = true;
00748     }
00749   }
00750 
00751   if (update) {
00752     changes_d = 1;
00753 
00754 #ifdef GPU_BLOCK_UPDATE
00755     next_update[block_idx] = 1;
00756 
00757     if (block_x > 0) {
00758       next_update[block_idx - 1] = 1;
00759     }
00760     if (block_x < gridDim.x - 1) {
00761       next_update[block_idx + 1] = 1;
00762     }
00763     if (block_y > 0) {
00764       next_update[block_idx - gridDim.x] = 1;
00765     }
00766     if (block_y < gridDim.y - 1) {
00767       next_update[block_idx + gridDim.x] = 1;
00768     }
00769     if (block_z > 0) {
00770       next_update[block_idx - (gridDim.x * gridDim.y)] = 1;
00771     }
00772     if (block_z < gridDim.z - 1) {
00773       next_update[block_idx + (gridDim.x * gridDim.y)] = 1;
00774     }
00775 #endif // GPU_BLOCK_UPDATE
00776 
00777 #ifdef GPU_WARP_UPDATE
00778     next_update[warp_idx] = 1;
00779 
00780     if (warp_x > 0) {
00781       next_update[warp_idx - 1] = 1;
00782     }
00783     if (warp_x < warp_x_dim - 1) {
00784       next_update[warp_idx + 1] = 1;
00785     }
00786     if (warp_y > 0) {
00787       next_update[warp_idx - warp_x_dim] = 1;
00788     }
00789     if (warp_y < warp_y_dim - 1) {
00790       next_update[warp_idx + warp_x_dim] = 1;
00791     }
00792     if (warp_z > 0) {
00793       next_update[warp_idx - (warp_x_dim * warp_y_dim)] = 1;
00794     }
00795     if (warp_z < warp_z_dim - 1) {
00796       next_update[warp_idx + (warp_x_dim * warp_y_dim)] = 1;
00797     }
00798 #endif // GPU_WARP_UPDATE
00799   }
00800 }
00801 
