#include <cstdio>

#define WATERSHED_INTERNAL 1
#include "CUDAWatershed.h"
#include "ProfileHooks.h"

// Flat-array offsets to the six face-adjacent neighbors of a voxel.
// These macros expect "width" and "heightWidth" (width * height) to be
// in scope where they are used.
#define px_offset_d (1)
#define py_offset_d (width)
#define pz_offset_d (heightWidth)
#define nx_offset_d (-1)
#define ny_offset_d (-width)
#define nz_offset_d (-heightWidth)

// Offsets to the twelve edge-adjacent neighbors used for 18-connectivity.
#define nx_ny_offset_d (-1 - width)
#define nx_py_offset_d (-1 + width)
#define px_py_offset_d (1 + width)
#define px_ny_offset_d (1 - width)

#define px_pz_offset_d (1 + heightWidth)
#define nx_nz_offset_d (-1 - heightWidth)
#define nx_pz_offset_d (-1 + heightWidth)
#define px_nz_offset_d (1 - heightWidth)

#define py_pz_offset_d (width + heightWidth)
#define ny_nz_offset_d (-width - heightWidth)
#define ny_pz_offset_d (-width + heightWidth)
#define py_nz_offset_d (width - heightWidth)
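
// For reference: a voxel at (x, y, z) in a width x height x depth volume is
// stored at flat index idx = z*height*width + y*width + x, so its +y neighbor
// lives at idx + py_offset_d (= idx + width) and its -z neighbor at
// idx + nz_offset_d (= idx - heightWidth).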

#define GPU_WARP_UPDATE

#ifdef GPU_BLOCK_UPDATE

#define BLOCK_X_DIM 128
#define BLOCK_Y_DIM 2
#define BLOCK_Z_DIM 2

#else

#define BLOCK_X_DIM 64
#define BLOCK_Y_DIM 2
#define BLOCK_Z_DIM 2

#endif
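
// With the defaults above, each thread block covers BLOCK_X_DIM x BLOCK_Y_DIM x
// BLOCK_Z_DIM = 64 x 2 x 2 = 256 voxels (128 x 2 x 2 = 512 when
// GPU_BLOCK_UPDATE is enabled instead of GPU_WARP_UPDATE).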

// Table of flat-index offsets for the (up to) 18 neighbors, indexed by the
// neighbor number extracted via GET_N_LOWER() from eq_and_lower; entry 0
// means "no neighbor".
__constant__ int neighbor_offsets_d[19];

// Set to 1 by update_kernel whenever any voxel changed during a pass.
__device__ unsigned int changes_d;

#define INST_DESTROY_GPU(G_T) \
template void destroy_gpu<G_T,float>(watershed_gpu_state_t<G_T,float> &gpu_state);\
template void destroy_gpu<G_T,unsigned short>(watershed_gpu_state_t<G_T,unsigned short> &gpu_state);\
template void destroy_gpu<G_T,unsigned char>(watershed_gpu_state_t<G_T,unsigned char> &gpu_state);

#define INST_INIT_GPU_ON_DEV(G_T) \
template bool init_gpu_on_device<G_T, float>(watershed_gpu_state_t<G_T,float> &gpu_state, float* image, int imageongpu,\
               unsigned int w, unsigned int h, unsigned int d);\
template bool init_gpu_on_device<G_T, unsigned short>(watershed_gpu_state_t<G_T,unsigned short> &gpu_state,\
               unsigned short* image, int imageongpu, unsigned int w, unsigned int h, unsigned int d);\
template bool init_gpu_on_device<G_T, unsigned char>(watershed_gpu_state_t<G_T,unsigned char> &gpu_state,\
               unsigned char* image, int imageongpu, unsigned int w, unsigned int h, unsigned int d);

#define INST_INIT_GPU(G_T) \
template bool init_gpu<G_T, float>(state_t<G_T,float> &state, int *eq_and_lower,\
               watershed_gpu_state_t<G_T,float> &gpu_state,\
               unsigned int w, unsigned int h, unsigned int d);\
template bool init_gpu<G_T, unsigned short>(state_t<G_T,unsigned short> &state, int *eq_and_lower,\
               watershed_gpu_state_t<G_T,unsigned short> &gpu_state,\
               unsigned int w, unsigned int h, unsigned int d);\
template bool init_gpu<G_T, unsigned char>(state_t<G_T,unsigned char> &state, int *eq_and_lower,\
               watershed_gpu_state_t<G_T,unsigned char> &gpu_state,\
               unsigned int w, unsigned int h, unsigned int d);

#define INST_UPDATE_CUDA(G_T) \
template void update_cuda<G_T,float>(watershed_gpu_state_t<G_T,float> &gpu_state, \
               G_T *final_segments_d);\
template void update_cuda<G_T,unsigned short>(watershed_gpu_state_t<G_T,unsigned short> &gpu_state, \
               G_T *final_segments_d);\
template void update_cuda<G_T,unsigned char>(watershed_gpu_state_t<G_T,unsigned char> &gpu_state, \
               G_T *final_segments_d);

INST_DESTROY_GPU(unsigned long)
INST_DESTROY_GPU(unsigned int)
INST_DESTROY_GPU(unsigned short)

INST_INIT_GPU_ON_DEV(unsigned long)
INST_INIT_GPU_ON_DEV(unsigned int)
INST_INIT_GPU_ON_DEV(unsigned short)

INST_INIT_GPU(unsigned long)
INST_INIT_GPU(unsigned int)
INST_INIT_GPU(unsigned short)

INST_UPDATE_CUDA(unsigned long)
INST_UPDATE_CUDA(unsigned int)
INST_UPDATE_CUDA(unsigned short)

template void update_cuda<long,float>(watershed_gpu_state_t<long,float> &gpu_state,
                                      long *final_segments_d);
template void update_cuda<long,unsigned short>(watershed_gpu_state_t<long,unsigned short> &gpu_state,
                                      long *final_segments_d);
template void update_cuda<long,unsigned char>(watershed_gpu_state_t<long,unsigned char> &gpu_state,
                                      long *final_segments_d);

template void update_cuda<int,float>(watershed_gpu_state_t<int,float> &gpu_state,
                                     int *final_segments_d);
template void update_cuda<int,unsigned short>(watershed_gpu_state_t<int,unsigned short> &gpu_state,
                                     int *final_segments_d);
template void update_cuda<int,unsigned char>(watershed_gpu_state_t<int,unsigned char> &gpu_state,
                                     int *final_segments_d);

template void update_cuda<short,float>(watershed_gpu_state_t<short,float> &gpu_state,
                                       short *final_segments_d);
template void update_cuda<short,unsigned short>(watershed_gpu_state_t<short,unsigned short> &gpu_state,
                                       short *final_segments_d);
template void update_cuda<short,unsigned char>(watershed_gpu_state_t<short,unsigned char> &gpu_state,
                                       short *final_segments_d);
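
// In total, update_cuda is instantiated for GROUP_T in {unsigned long,
// unsigned int, unsigned short, long, int, short} and IMAGE_T in {float,
// unsigned short, unsigned char}; destroy_gpu, init_gpu, and
// init_gpu_on_device are instantiated for the unsigned GROUP_T variants only.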

#define DIV_BY_32 5
#define GIGABYTE (1024.0f*1024.0f*1024.0f)

// Forward declaration; the definition is at the bottom of this file.
template <typename GROUP_T, typename IMAGE_T>
__global__ void update_kernel(GROUP_T* __restrict__ current_group,
                              GROUP_T* __restrict__ next_group,
                              IMAGE_T* __restrict__ current_value,
                              IMAGE_T* __restrict__ next_value,
                              int* __restrict__ eq_and_lower,
                              unsigned char* __restrict__ current_update,
                              unsigned char* __restrict__ next_update,
                              const int width,
                              const int height,
                              const int depth);

// Fill the constant-memory neighbor offset table.  Slot 0 is a placeholder
// ("no neighbor"); slots 1-6 are the face neighbors and slots 7-18 the edge
// neighbors used when CONNECTED_18 is enabled.  The host-side px_offset/...
// macros are expected to mirror the *_offset_d device macros defined above.
template <typename GROUP_T, typename IMAGE_T>
void init_neighbor_offsets(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state) {
  int height = gpu_state.height;
  int width = gpu_state.width;
  int heightWidth = height * width;
  int neighbor_offsets_t[19] = { 0,
                                 px_offset,
                                 py_offset,
                                 pz_offset,
                                 nx_offset,
                                 ny_offset,
                                 nz_offset,

                                 nx_ny_offset,
                                 nx_py_offset,
                                 px_py_offset,
                                 px_ny_offset,

                                 px_pz_offset,
                                 nx_nz_offset,
                                 nx_pz_offset,
                                 px_nz_offset,

                                 py_pz_offset,
                                 ny_nz_offset,
                                 ny_pz_offset,
                                 py_nz_offset,
                               };

  cudaMemcpyToSymbolAsync(neighbor_offsets_d, neighbor_offsets_t, sizeof(int) * 19);
}

// Examine one neighbor of the current voxel: remember the neighbor with the
// steepest slope relative to the current voxel (stored as the "lower"
// neighbor index), or, if the neighbor has essentially the same intensity
// (within FLOAT_DIFF), record it in the equal-neighbor bitmask and keep the
// smallest flat index seen so far as the tentative group.
#define CALCULATE_NEIGHBORS_CUDA(offset_str) {\
  const long idx_offset = idx + offset_str##_offset_d;\
  slope = curr_intensity - image[idx_offset];\
  if (slope < smallest_slope) {\
    smallest_slope = slope;\
    curr_lower = offset_str##_idx;\
  } else if (slope >= -FLOAT_DIFF && slope <= FLOAT_DIFF) {\
    curr_n_eq |= offset_str;\
    if (idx_offset < min_group) {\
      min_group = idx_offset;\
    }\
  } \
}
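
// For example (a sketch, assuming the px bitmask and px_idx constants are
// defined in CUDAWatershed.h), CALCULATE_NEIGHBORS_CUDA(px) expands to
// roughly:
//
//   const long idx_offset = idx + px_offset_d;      // idx + 1
//   slope = curr_intensity - image[idx_offset];
//   if (slope < smallest_slope) {
//     smallest_slope = slope;
//     curr_lower = px_idx;                          // neighbor-table slot
//   } else if (slope >= -FLOAT_DIFF && slope <= FLOAT_DIFF) {
//     curr_n_eq |= px;                              // mark +x as "equal"
//     if (idx_offset < min_group) min_group = idx_offset;
//   }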

// Initialization pass: for every voxel, find its steepest "lower" neighbor
// and the set of equal-valued (within FLOAT_DIFF) neighbors, pack both into
// eq_and_lower, and seed the group and value buffers used by the iterative
// update passes.
template <typename GROUP_T, typename IMAGE_T>
__global__ void calc_neighbors_kernel(const IMAGE_T* __restrict__ image,
                                      GROUP_T* __restrict__ next_group,
                                      IMAGE_T* __restrict__ curr_value,
                                      IMAGE_T* __restrict__ next_value,
                                      int* __restrict__ eq_and_lower,
                                      const int width,
                                      const int height,
                                      const int depth) {

  const int x = blockIdx.x * BLOCK_X_DIM + threadIdx.x;
  const int y = blockIdx.y * BLOCK_Y_DIM + threadIdx.y;
  const int z = blockIdx.z * BLOCK_Z_DIM + threadIdx.z;
  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int heightWidth = height * width;   // used by the *_offset_d macros

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  float slope;
  float smallest_slope = -FLOAT_DIFF;
  IMAGE_T curr_intensity = image[idx];
  long min_group = idx;
  int curr_n_eq = 0;
  long curr_lower = 0;

#ifdef CONNECTED_18
  if (x > 0) {
    CALCULATE_NEIGHBORS_CUDA(nx);
  }
  if (x < width-1) {
    CALCULATE_NEIGHBORS_CUDA(px);
  }

  if (y > 0) {
    CALCULATE_NEIGHBORS_CUDA(ny);
    if (x < width - 1) {
      CALCULATE_NEIGHBORS_CUDA(px_ny);
    }
  }
  if (y < height-1) {
    CALCULATE_NEIGHBORS_CUDA(py);
    if (x < width - 1) {
      CALCULATE_NEIGHBORS_CUDA(px_py);
    }
  }

  if (z > 0) {
    CALCULATE_NEIGHBORS_CUDA(nz);
    if (x < width - 1) {
      CALCULATE_NEIGHBORS_CUDA(px_nz);
    }
    if (x > 0) {
      CALCULATE_NEIGHBORS_CUDA(nx_nz);
    }
    if (y > 0) {
      CALCULATE_NEIGHBORS_CUDA(ny_nz);
    }
    if (y < height - 1) {
      CALCULATE_NEIGHBORS_CUDA(py_nz);
    }
  }
  if (z < depth-1) {
    CALCULATE_NEIGHBORS_CUDA(pz);
    if (x < width - 1) {
      CALCULATE_NEIGHBORS_CUDA(px_pz);
    }
    if (x > 0) {
      CALCULATE_NEIGHBORS_CUDA(nx_pz);
    }
    if (y < height - 1) {
      CALCULATE_NEIGHBORS_CUDA(py_pz);
    }
    if (y > 0) {
      CALCULATE_NEIGHBORS_CUDA(ny_pz);
    }
  }
#else // 6 connected neighbors
  if (x > 0) {
    CALCULATE_NEIGHBORS_CUDA(nx);
  }
  if (x < width-1) {
    CALCULATE_NEIGHBORS_CUDA(px);
  }

  if (y > 0) {
    CALCULATE_NEIGHBORS_CUDA(ny);
  }
  if (y < height-1) {
    CALCULATE_NEIGHBORS_CUDA(py);
  }

  if (z > 0) {
    CALCULATE_NEIGHBORS_CUDA(nz);
  }
  if (z < depth-1) {
    CALCULATE_NEIGHBORS_CUDA(pz);
  }
#endif // CONNECTED_18

  eq_and_lower[idx] = MAKE_EQUAL_AND_LOWER(curr_n_eq, curr_lower);
  if (curr_lower == 0) {
    // No qualifying neighbor (plateau or local extremum): seed the group with
    // the smallest flat index among equal-valued neighbors (idx itself if
    // there were none).
    next_group[idx] = min_group;
    next_value[idx] = image[idx];
    curr_value[idx] = image[idx];
  } else {
    // Follow the steepest neighbor: inherit its flat index as the initial
    // group id and its intensity as this voxel's value.
    const long nIdx = idx + neighbor_offsets_d[curr_lower];
    next_value[idx] = image[nIdx];
    curr_value[idx] = image[nIdx];
    next_group[idx] = nIdx;
  }
}

// Upload host-side group, value, and packed eq_and_lower arrays into the
// GPU buffers, seeding both the current and next value buffers.
template <typename GROUP_T, typename IMAGE_T>
void set_gpu_state(state_t<GROUP_T, IMAGE_T> &state, int *eq_and_lower,
                   watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state,
                   unsigned long nVoxels) {
  cudaMemcpy(gpu_state.segments_d, state.group,
             nVoxels * sizeof(GROUP_T), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_state.current_value_d, state.value,
             nVoxels * sizeof(IMAGE_T), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_state.next_value_d, gpu_state.current_value_d,
             nVoxels * sizeof(IMAGE_T), cudaMemcpyDeviceToDevice);
  cudaMemcpy(gpu_state.eq_and_lower_d, eq_and_lower,
             nVoxels * sizeof(int), cudaMemcpyHostToDevice);
}

// Allocate the GPU buffers and initialize them directly on the device with
// calc_neighbors_kernel.  "image" may already reside on the GPU
// (imageongpu != 0), in which case it is used in place; otherwise it is
// copied into a temporary device buffer.
template <typename GROUP_T, typename IMAGE_T>
bool init_gpu_on_device(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state,
                        IMAGE_T* image, int imageongpu,
                        unsigned int w, unsigned int h, unsigned int d) {
  PROFILE_PUSH_RANGE("CUDAWatershed::init_gpu_on_device()", 0);

  unsigned int changes = 0;
  unsigned long nVoxels = long(h) * long(w) * long(d);
  gpu_state.width = w;
  gpu_state.height = h;
  gpu_state.depth = d;
  IMAGE_T* image_d;

  cudaMemcpyToSymbolAsync(changes_d, &changes, sizeof(unsigned int));

  if (imageongpu) {
    image_d = image;
  } else {
    cudaMalloc(&image_d, nVoxels * sizeof(IMAGE_T));
    cudaMemcpyAsync(image_d, image, nVoxels * sizeof(IMAGE_T), cudaMemcpyHostToDevice);
  }

  cudaMalloc(&gpu_state.segments_d, nVoxels * sizeof(GROUP_T));
  cudaMalloc(&gpu_state.current_value_d, nVoxels * sizeof(IMAGE_T));
  cudaMalloc(&gpu_state.next_value_d, nVoxels * sizeof(IMAGE_T));
  cudaMalloc(&gpu_state.eq_and_lower_d, nVoxels * sizeof(int));

#ifdef GPU_BLOCK_UPDATE
  // One "dirty" flag per thread block; all blocks start out dirty.
  long t_width = ceil((float)gpu_state.width / BLOCK_X_DIM);
  long t_height = ceil((float)gpu_state.height / BLOCK_Y_DIM);
  long t_depth = ceil((float)gpu_state.depth / BLOCK_Z_DIM);
  long num_blocks = t_width * t_height * t_depth;
  cudaMalloc(&gpu_state.next_update_d, num_blocks * sizeof(char));
  cudaMalloc(&gpu_state.current_update_d, num_blocks * sizeof(char));
  cudaMemsetAsync(gpu_state.next_update_d, 0, num_blocks * sizeof(char));
  cudaMemsetAsync(gpu_state.current_update_d, 1, num_blocks * sizeof(char));
#endif // GPU_BLOCK_UPDATE

#ifdef GPU_WARP_UPDATE
  // One "dirty" flag per 32-voxel warp-sized run along x; all start out dirty.
  long warp_x_dim = ceil(gpu_state.width/32.0f);
  long num_warps = ceil(gpu_state.height/32.0f) * 32L * ceil(gpu_state.depth/32.0f) * 32L * warp_x_dim;
  cudaMalloc(&gpu_state.next_update_d, num_warps * sizeof(char));
  cudaMalloc(&gpu_state.current_update_d, num_warps * sizeof(char));
  cudaMemsetAsync(gpu_state.next_update_d, 0, num_warps * sizeof(char));
  cudaMemsetAsync(gpu_state.current_update_d, 1, num_warps * sizeof(char));
#endif // GPU_WARP_UPDATE

  init_neighbor_offsets(gpu_state);

  const dim3 block(BLOCK_X_DIM, BLOCK_Y_DIM, BLOCK_Z_DIM);
  size_t x_grid = ceil((float)gpu_state.width/BLOCK_X_DIM);
  size_t y_grid = ceil((float)gpu_state.height/BLOCK_Y_DIM);
  size_t z_grid = ceil((float)gpu_state.depth/BLOCK_Z_DIM);
  const dim3 grid(x_grid, y_grid, z_grid);
  calc_neighbors_kernel<<<grid, block>>>(image_d, gpu_state.segments_d, gpu_state.current_value_d,
                                         gpu_state.next_value_d, gpu_state.eq_and_lower_d, w, h, d);

#if 0
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("Error initializing device: %s\n", cudaGetErrorString(error));
    destroy_gpu(gpu_state);
    PROFILE_POP_RANGE();
    return false;
  }
#endif

  gpu_state.init = true;

  if (!imageongpu)
    cudaFree(image_d);

  PROFILE_POP_RANGE();

  return true;
}
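
// A minimal host-side usage sketch (disabled; the buffer names and type
// choices here are illustrative only - the real driver code lives outside
// this file):
#if 0
void example_watershed_on_device(float *volume_d, unsigned int w, unsigned int h, unsigned int d) {
  watershed_gpu_state_t<unsigned int, float> gpu_state;
  unsigned int *segments_d = NULL;
  unsigned long nVoxels = (unsigned long)w * h * d;

  cudaMalloc(&segments_d, nVoxels * sizeof(unsigned int));  // result buffer

  // volume_d is assumed to already reside on the GPU (imageongpu == 1).
  if (init_gpu_on_device<unsigned int, float>(gpu_state, volume_d, 1, w, h, d)) {
    update_cuda<unsigned int, float>(gpu_state, segments_d); // iterate to convergence
    destroy_gpu<unsigned int, float>(gpu_state);
  }

  // ... copy or post-process segments_d, then cudaFree(segments_d) ...
}
#endif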

// Allocate the GPU buffers and upload an initialization computed on the host
// (see set_gpu_state), rather than running calc_neighbors_kernel.
template <typename GROUP_T, typename IMAGE_T>
bool init_gpu(state_t<GROUP_T, IMAGE_T> &state, int *eq_and_lower,
              watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state,
              unsigned int w, unsigned int h, unsigned int d) {
  PROFILE_MARK("CUDAWatershed::init_gpu()", 0);

  unsigned int changes = 0;
  unsigned long nVoxels = long(h) * long(w) * long(d);
  gpu_state.width = w;
  gpu_state.height = h;
  gpu_state.depth = d;

  cudaMalloc(&gpu_state.segments_d, nVoxels * sizeof(GROUP_T));
  cudaMalloc(&gpu_state.current_value_d, nVoxels * sizeof(IMAGE_T));
  cudaMalloc(&gpu_state.next_value_d, nVoxels * sizeof(IMAGE_T));
  cudaMalloc(&gpu_state.eq_and_lower_d, nVoxels * sizeof(int));

#ifdef GPU_BLOCK_UPDATE
  long t_width = ceil((float)gpu_state.width / BLOCK_X_DIM);
  long t_height = ceil((float)gpu_state.height / BLOCK_Y_DIM);
  long t_depth = ceil((float)gpu_state.depth / BLOCK_Z_DIM);
  long num_blocks = t_width * t_height * t_depth;
  cudaMalloc(&gpu_state.next_update_d, num_blocks * sizeof(char));
  cudaMalloc(&gpu_state.current_update_d, num_blocks * sizeof(char));
  cudaMemset(gpu_state.next_update_d, 0, num_blocks * sizeof(char));
  cudaMemset(gpu_state.current_update_d, 1, num_blocks * sizeof(char));
#endif // GPU_BLOCK_UPDATE

#ifdef GPU_WARP_UPDATE
  long warp_x_dim = ceil(gpu_state.width/32.0f);
  long num_warps = ceil(gpu_state.height/32.0f) * 32L * ceil(gpu_state.depth/32.0f) * 32L * warp_x_dim;
  cudaMalloc(&gpu_state.next_update_d, num_warps * sizeof(char));
  cudaMalloc(&gpu_state.current_update_d, num_warps * sizeof(char));
  cudaMemset(gpu_state.next_update_d, 0, num_warps * sizeof(char));
  cudaMemset(gpu_state.current_update_d, 1, num_warps * sizeof(char));
#endif // GPU_WARP_UPDATE

  cudaMemcpyToSymbol(changes_d, &changes, sizeof(unsigned int));
  init_neighbor_offsets(gpu_state);
  gpu_state.init = true;

#if 0
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("Error initializing device in init_gpu: %s\n", cudaGetErrorString(error));
    destroy_gpu(gpu_state);
    return false;
  }
#endif

  set_gpu_state(state, eq_and_lower, gpu_state, nVoxels);

  return true;
}

// Free all device buffers owned by gpu_state.
template <typename GROUP_T, typename IMAGE_T>
void destroy_gpu(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state) {
#if defined(VERBOSE)
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("Error before destroy_gpu: %s\n", cudaGetErrorString(error));
  }
#endif

  if (gpu_state.init) {
    if (gpu_state.segments_d != NULL) {
      cudaFree(gpu_state.segments_d);
      gpu_state.segments_d = NULL;
    }

    if (gpu_state.current_value_d != NULL) {
      cudaFree(gpu_state.current_value_d);
      gpu_state.current_value_d = NULL;
    }
    if (gpu_state.next_value_d != NULL) {
      cudaFree(gpu_state.next_value_d);
      gpu_state.next_value_d = NULL;
    }

    if (gpu_state.eq_and_lower_d != NULL) {
      cudaFree(gpu_state.eq_and_lower_d);
      gpu_state.eq_and_lower_d = NULL;
    }

#if defined(GPU_BLOCK_UPDATE) || defined(GPU_WARP_UPDATE)
    // The per-block/per-warp "dirty" flag buffers are allocated by both the
    // block- and warp-granularity update schemes.
    if (gpu_state.current_update_d != NULL) {
      cudaFree(gpu_state.current_update_d);
      gpu_state.current_update_d = NULL;
    }
    if (gpu_state.next_update_d != NULL) {
      cudaFree(gpu_state.next_update_d);
      gpu_state.next_update_d = NULL;
    }
#endif // GPU_BLOCK_UPDATE || GPU_WARP_UPDATE
    gpu_state.init = false;
  }

#if defined(VERBOSE)
  cudaDeviceSynchronize();
  error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("Error after destroy_gpu: %s\n", cudaGetErrorString(error));
  }
#endif
}

// Iterate update_kernel until no voxel changes.  The group/value/update
// buffers are double-buffered and swapped after every pass; the loop exits
// once the device-side changes_d flag stays 0 for a whole pass.
template <typename GROUP_T, typename IMAGE_T>
void update_cuda(watershed_gpu_state_t<GROUP_T, IMAGE_T> &gpu_state, GROUP_T *final_segments_d) {
  const dim3 block(BLOCK_X_DIM, BLOCK_Y_DIM, BLOCK_Z_DIM);
  size_t x_grid = ceil((float)gpu_state.width/BLOCK_X_DIM);
  size_t y_grid = ceil((float)gpu_state.height/BLOCK_Y_DIM);
  size_t z_grid = ceil((float)gpu_state.depth/BLOCK_Z_DIM);
  const dim3 grid(x_grid, y_grid, z_grid);
  unsigned int changes = 1;
  const unsigned int zero = 0;
  GROUP_T *current_group_d = gpu_state.segments_d;
  GROUP_T *next_group_d = final_segments_d;

#ifdef VERBOSE
  printf("Block = %d x %d x %d = %d\n", block.x, block.y, block.z, block.x*block.y*block.z);
  printf("Grid = %d x %d x %d = %d\n", grid.x, grid.y, grid.z, grid.x*grid.y*grid.z);
  printf("Threads = %d\n", grid.x*grid.y*grid.z*block.x*block.y*block.z);
  printf("Image = %d x %d x %d\n", gpu_state.width, gpu_state.height, gpu_state.depth);
  printf("Voxels = %ld\n", long(gpu_state.width) * long(gpu_state.height) * long(gpu_state.depth));
#endif // VERBOSE

  while (changes) {
    cudaMemcpyToSymbolAsync(changes_d, &zero, sizeof(changes_d));

    // Dimensions are passed in (width, height, depth) order to match the
    // kernel signature and the calc_neighbors_kernel launch above.
    update_kernel<GROUP_T><<<grid, block>>>(current_group_d, next_group_d, gpu_state.current_value_d,
                                            gpu_state.next_value_d, gpu_state.eq_and_lower_d,
                                            gpu_state.current_update_d, gpu_state.next_update_d,
                                            gpu_state.width, gpu_state.height, gpu_state.depth);

    SWAP(current_group_d, next_group_d, GROUP_T*);
    SWAP(gpu_state.current_value_d, gpu_state.next_value_d, IMAGE_T*);
    SWAP(gpu_state.current_update_d, gpu_state.next_update_d, unsigned char*);

    // Synchronous copy; also serves as the barrier for the kernel just launched.
    cudaMemcpyFromSymbol(&changes, changes_d, sizeof(changes));

#if defined(VERBOSE)
    cudaDeviceSynchronize();
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
      printf("Error in update: %s\n", cudaGetErrorString(error));
      destroy_gpu(gpu_state);
      exit(-1);
    }
#endif // VERBOSE
  }

  // If the final swap left the newest segmentation in gpu_state.segments_d
  // rather than final_segments_d, copy it over.
  if (current_group_d != final_segments_d) {
    long nVoxels = long(gpu_state.height) * long(gpu_state.width) * long(gpu_state.depth);
    cudaMemcpy(final_segments_d, current_group_d,
               nVoxels * sizeof(GROUP_T), cudaMemcpyDeviceToDevice);
  }
}

// Examine one equal-valued neighbor (selected by the n_eq bitmask): track the
// neighbor with the smallest current value (and where it is), and propagate
// the smallest group id seen among equal neighbors into curr_group.  Relies
// on n_eq, current_value, current_group, offset_number, and next_idx being
// in scope at the expansion site.
#define UPDATE_VOXEL_CUDA(and_value, idx, curr_group, smallest_value) {\
  if (n_eq & and_value) {\
    const long offset_idx = idx + and_value##_offset_d;\
    if (current_value[offset_idx] + FLOAT_DIFF < smallest_value) {\
      smallest_value = current_value[offset_idx];\
      offset_number = and_value##_idx;\
      next_idx = idx + and_value##_offset_d;\
    }\
    curr_group = current_group[offset_idx] < curr_group ?\
                 current_group[offset_idx] : curr_group;\
  }}
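
// For example, UPDATE_VOXEL_CUDA(py, idx, curr_group, smallest_value) tests
// the "py" bit of n_eq and, when set, reads the neighbor at idx + py_offset_d
// (i.e., idx + width), records py_idx in offset_number and that neighbor's
// index in next_idx if its value is smaller, and keeps the minimum of the two
// group ids (a sketch; the py bitmask and py_idx constants are assumed to
// come from CUDAWatershed.h).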

// One relaxation pass over the volume.  Each voxel either follows its
// precomputed steepest ("lower") neighbor, or, on a plateau, adopts the
// smallest value/group among its equal-valued neighbors.  Any change sets
// changes_d and marks the surrounding blocks/warps dirty so they are
// revisited in the next pass.
template <typename GROUP_T, typename IMAGE_T>
__global__ void update_kernel(GROUP_T* __restrict__ current_group,
                              GROUP_T* __restrict__ next_group,
                              IMAGE_T* __restrict__ current_value,
                              IMAGE_T* __restrict__ next_value,
                              int* __restrict__ eq_and_lower,
                              unsigned char* __restrict__ current_update,
                              unsigned char* __restrict__ next_update,
                              const int width,
                              const int height,
                              const int depth) {

  const int tx = blockIdx.x * BLOCK_X_DIM + threadIdx.x;
  const int ty = blockIdx.y * BLOCK_Y_DIM + threadIdx.y;
  const int tz = blockIdx.z * BLOCK_Z_DIM + threadIdx.z;
  const long idx = tz * long(height) * long(width) + ty * long(width) + tx;
  const int heightWidth = height * width;   // used by the *_offset_d macros

  if (tx >= width || ty >= height || tz >= depth) {
    return;
  }

#ifdef GPU_BLOCK_UPDATE
  // Skip thread blocks whose neighborhood did not change last pass.
  const long block_x = blockIdx.x;
  const long block_y = blockIdx.y;
  const long block_z = blockIdx.z;
  const long block_idx = block_z * gridDim.y * gridDim.x + block_y * gridDim.x + block_x;
  if (current_update[block_idx] == 0) {
    return;
  }
  current_update[block_idx] = 0;
#endif // GPU_BLOCK_UPDATE

#ifdef GPU_WARP_UPDATE
  // Skip warp-sized runs of voxels whose neighborhood did not change last pass.
  const long warp_x = tx >> DIV_BY_32;
  const long warp_y = ty;
  const long warp_z = tz;
  const long warp_x_dim = (BLOCK_X_DIM * gridDim.x) >> DIV_BY_32;
  const long warp_y_dim = BLOCK_Y_DIM * gridDim.y;
  const long warp_z_dim = BLOCK_Z_DIM * gridDim.z;
  const long warp_idx = warp_z * warp_y_dim * warp_x_dim + warp_y * warp_x_dim + warp_x;

  if (current_update[warp_idx] == 0) {
    return;
  }
  current_update[warp_idx] = 0;
#endif // GPU_WARP_UPDATE

  bool update = false;
  const int equal_and_lower = eq_and_lower[idx];
  const int offset_number = GET_N_LOWER(equal_and_lower);
  if (offset_number != 0) {
    // This voxel has a steepest neighbor: copy that neighbor's group/value.
    const long nidx = idx + neighbor_offsets_d[offset_number];
    if (next_group[idx] != current_group[nidx] || next_value[idx] != current_value[nidx]) {
      next_group[idx] = current_group[nidx];
      next_value[idx] = current_value[nidx];
      update = true;
    }
  } else {
    // Plateau voxel: scan the equal-valued neighbors for a smaller value or
    // group id.  Note that this local offset_number shadows the const one
    // above and is written by the UPDATE_VOXEL_CUDA macro.
    const int n_eq = GET_N_EQ(equal_and_lower);
    IMAGE_T smallest_value = current_value[idx];
    GROUP_T curr_group = current_group[idx];
    int offset_number = 0;
    long next_idx = 0;

#ifdef CONNECTED_18
    UPDATE_VOXEL_CUDA(nx, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(px, idx, curr_group, smallest_value);

    UPDATE_VOXEL_CUDA(nx_ny, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(   ny, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(px_ny, idx, curr_group, smallest_value);

    UPDATE_VOXEL_CUDA(nx_py, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(   py, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(px_py, idx, curr_group, smallest_value);

    UPDATE_VOXEL_CUDA(px_pz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(nx_pz, idx, curr_group, smallest_value);

    UPDATE_VOXEL_CUDA(py_pz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(   pz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(ny_pz, idx, curr_group, smallest_value);

    UPDATE_VOXEL_CUDA(nx_nz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(   nz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(px_nz, idx, curr_group, smallest_value);

    UPDATE_VOXEL_CUDA(py_nz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(ny_nz, idx, curr_group, smallest_value);

#else // 6 connected neighbors
    UPDATE_VOXEL_CUDA(nx, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(px, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(ny, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(py, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(nz, idx, curr_group, smallest_value);
    UPDATE_VOXEL_CUDA(pz, idx, curr_group, smallest_value);
#endif // CONNECTED_18

    if (offset_number != 0) {
      // Found an equal neighbor with a strictly smaller value: follow it.
      eq_and_lower[idx] = MAKE_EQUAL_AND_LOWER(n_eq, offset_number);
      next_value[idx] = smallest_value;
      next_group[idx] = current_group[next_idx];
      update = true;
    } else if (curr_group != next_group[idx]) {
      // Same value everywhere, but a smaller group id was found: adopt it.
      next_group[idx] = curr_group;
      update = true;
    }
  }

  if (update) {
    changes_d = 1;

#ifdef GPU_BLOCK_UPDATE
    // Mark this block and its six face-adjacent blocks dirty for the next pass.
    next_update[block_idx] = 1;

    if (block_x > 0) {
      next_update[block_idx - 1] = 1;
    }
    if (block_x < gridDim.x - 1) {
      next_update[block_idx + 1] = 1;
    }
    if (block_y > 0) {
      next_update[block_idx - gridDim.x] = 1;
    }
    if (block_y < gridDim.y - 1) {
      next_update[block_idx + gridDim.x] = 1;
    }
    if (block_z > 0) {
      next_update[block_idx - (gridDim.x * gridDim.y)] = 1;
    }
    if (block_z < gridDim.z - 1) {
      next_update[block_idx + (gridDim.x * gridDim.y)] = 1;
    }
#endif // GPU_BLOCK_UPDATE

#ifdef GPU_WARP_UPDATE
    // Mark this warp-sized run and its six face-adjacent runs dirty.
    next_update[warp_idx] = 1;

    if (warp_x > 0) {
      next_update[warp_idx - 1] = 1;
    }
    if (warp_x < warp_x_dim - 1) {
      next_update[warp_idx + 1] = 1;
    }
    if (warp_y > 0) {
      next_update[warp_idx - warp_x_dim] = 1;
    }
    if (warp_y < warp_y_dim - 1) {
      next_update[warp_idx + warp_x_dim] = 1;
    }
    if (warp_z > 0) {
      next_update[warp_idx - (warp_x_dim * warp_y_dim)] = 1;
    }
    if (warp_z < warp_z_dim - 1) {
      next_update[warp_idx + (warp_x_dim * warp_y_dim)] = 1;
    }
#endif // GPU_WARP_UPDATE
  }
}