#define WATERSHED_INTERNAL 1

#include <stdio.h>
#include "CUDAGaussianBlur.h"
#include "Watershed.h"
#include "ProfileHooks.h"
#include <cuda_fp16.h>

// CUERR: optional synchronous CUDA error check after each kernel/API call;
// compiled out by default, enable by changing "#if 0" to "#if 1"
#if 0
#define CUERR { cudaError_t err; \
  cudaDeviceSynchronize(); \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  }}
#else
#define CUERR
#endif

#define KERNEL_MAX_SIZE 4096

// 1-D Gaussian kernel coefficients, cached in fast constant memory
__constant__ float kernel_d_c[KERNEL_MAX_SIZE];

#define BLOCKDIM_X 128
#define BLOCKDIM_Y 1
#define BLOCKDIM_Z 1

// thread block shapes for the X, Y, and Z convolution passes
#define XCONV_BLOCKDIM_X 96
#define XCONV_BLOCKDIM_Y 1
#define XCONV_BLOCKDIM_Z 1

#define YCONV_BLOCKDIM_X 16
#define YCONV_BLOCKDIM_Y 8
#define YCONV_BLOCKDIM_Z 1

#define ZCONV_BLOCKDIM_X 4
#define ZCONV_BLOCKDIM_Y 1
#define ZCONV_BLOCKDIM_Z 32

#define X_PADDING 32

// swap two float pointers, e.g. to ping-pong src/dst buffers between passes
#define SWAPF(a, b) {\
  float* t = a;\
  a = b;\
  b = t;\
}

// explicit instantiations of the host-callable blur entry points
// for each supported voxel type
#define INST_GAUSSIAN_CUDA(I_T) \
template void gaussian1D_x_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
                                     int width, int height, int depth);\
template void gaussian1D_y_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
                                     int width, int height, int depth);\
template void gaussian1D_z_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
                                     int width, int height, int depth);

INST_GAUSSIAN_CUDA(float)
INST_GAUSSIAN_CUDA(unsigned short)
INST_GAUSSIAN_CUDA(unsigned char)

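// --- hedged usage sketch (not from the original file) ----------------------
// One plausible way a caller could chain the three 1-D passes below into a
// full separable 3-D Gaussian blur. The helper name and the ping-pong
// buffering scheme are illustrative assumptions; the real call sites live
// elsewhere in the Watershed code. Kept inside "#if 0" so it does not affect
// compilation.
#if 0
template <typename IMAGE_T>
void gaussian_blur_3d_sketch(IMAGE_T* buf_a_d, IMAGE_T* buf_b_d,
                             float* kernel, int kernel_size,
                             int width, int height, int depth) {
  // upload the 1-D kernel coefficients to constant memory once
  set_gaussian_1D_kernel_cuda(kernel, kernel_size);

  // convolve along each axis in turn, ping-ponging between the two buffers
  gaussian1D_x_cuda(buf_a_d, buf_b_d, kernel_size, width, height, depth);
  gaussian1D_y_cuda(buf_b_d, buf_a_d, kernel_size, width, height, depth);
  gaussian1D_z_cuda(buf_a_d, buf_b_d, kernel_size, width, height, depth);
  // the blurred volume now resides in buf_b_d
}
#endif
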
// store a float convolution result into the destination voxel type,
// rounding to nearest for the integral voxel types
template <typename T>
__device__ __forceinline__ void convert_type_and_assign_val(T* arr, long idx, float val) {
  arr[idx] = roundf(val);
}

// float voxels: assign directly, no rounding needed
template <> __device__ __forceinline__ void convert_type_and_assign_val<float>(float* arr, long idx, float val) {
  arr[idx] = val;
}

#if CUDART_VERSION >= 9000
// half voxels (CUDA 9.0+): implicit float-to-half conversion
template <> __device__ __forceinline__ void convert_type_and_assign_val<half>(half* arr, long idx, float val) {
  arr[idx] = val;
}
#endif

// blocking copy from GPU to host
void copy_array_from_gpu(void* arr, void* arr_d, int bytes) {
  PROFILE_PUSH_RANGE("CUDAGaussianBlur::copy_array_from_gpu()", 5);

  cudaMemcpy(arr, arr_d, bytes, cudaMemcpyDeviceToHost);
  CUERR

  PROFILE_POP_RANGE();
}


// asynchronous copy from host to GPU
void copy_array_to_gpu(void* arr_d, void* arr, int bytes) {
  cudaMemcpyAsync(arr_d, arr, bytes, cudaMemcpyHostToDevice);
  CUERR
}


void* alloc_cuda_array(int bytes) {
  void* t;
  cudaMalloc(&t, bytes);
  CUERR
  return t;
}


void free_cuda_array(void* arr) {
  cudaFree(arr);
}


// upload the 1-D Gaussian kernel coefficients into constant memory,
// clamping oversized kernels to KERNEL_MAX_SIZE
void set_gaussian_1D_kernel_cuda(float* kernel, int kernel_size) {

  if (kernel_size > KERNEL_MAX_SIZE) {
    printf("Warning: exceeded maximum kernel size on GPU.\n");
    kernel_size = KERNEL_MAX_SIZE;
  }

  cudaMemcpyToSymbolAsync(kernel_d_c, kernel, kernel_size * sizeof(float));
  CUERR
}
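
// --- hedged sketch (assumption, not from the original source) --------------
// How a normalized 1-D Gaussian kernel of odd size might be built on the
// host before being handed to set_gaussian_1D_kernel_cuda(). The function
// name and sigma parameterization are illustrative only; requires <math.h>
// for expf(). Kept inside "#if 0" so it does not affect compilation.
#if 0
static void make_gaussian_kernel_sketch(float* kernel, int kernel_size, float sigma) {
  int half = kernel_size / 2;
  float sum = 0.0f;
  for (int i = -half; i <= half; ++i) {
    float w = expf(-(i * i) / (2.0f * sigma * sigma));
    kernel[i + half] = w;
    sum += w;
  }
  for (int i = 0; i < kernel_size; ++i)
    kernel[i] /= sum;  // normalize so the weights sum to 1
}
#endif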


// X-axis convolution using a shared-memory tile with an offset-wide halo
// on each side, clamping reads to the volume edge
template <typename IMAGE_T, const int offset>
__global__ void convolution_x_kernel_s(const IMAGE_T* __restrict__ src_d,
                                       IMAGE_T* __restrict__ dst_d,
                                       int width, int height, int depth) {
  __shared__ IMAGE_T src_s[XCONV_BLOCKDIM_Z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)];
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (y >= height || z >= depth) {
    return;
  }

  // global index of this thread's output voxel
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // global index of the first voxel in this thread's row
  const long g_row_base_idx = z * long(height) * long(width) + y * long(width);

  // starting index of this thread's row in the shared-memory tile
  const int t_row_base_idx = threadIdx.z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)
                             + threadIdx.y * (XCONV_BLOCKDIM_X + offset * 2) + offset;

  // first x coordinate covered by this block
  const int base_x = x - threadIdx.x;

  // cooperatively load the tile plus halo; take the clamping path only
  // when the block touches the volume boundary
  if (base_x - offset < 0 || base_x + XCONV_BLOCKDIM_X + offset >= width) {
    for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
      int x_offset = base_x + i;
      if (x_offset < 0)
        x_offset = 0;
      else if (x_offset >= width)
        x_offset = width-1;
      src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
    }
  } else {
    for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
      int x_offset = base_x + i;
      src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
    }
  }
  if (x >= width)
    return;

  __syncthreads();

  // accumulate the weighted sum over the kernel footprint
  const IMAGE_T* src_s_offset = src_s + t_row_base_idx + threadIdx.x;
  const float* kernel_offset = kernel_d_c + offset;
  float value = 0.0f;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// Y-axis convolution using a shared-memory tile with an offset-wide halo
// above and below, clamping reads to the volume edge
template <typename IMAGE_T, const int offset>
__global__ void convolution_y_kernel_s(const IMAGE_T* __restrict__ src_d,
                                       IMAGE_T* __restrict__ dst_d,
                                       int width, int height, int depth) {
  __shared__ IMAGE_T src_s[YCONV_BLOCKDIM_Z * (YCONV_BLOCKDIM_Y + 2*offset) * YCONV_BLOCKDIM_X];
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || z >= depth) {
    return;
  }

  // global index of this thread's output voxel
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // global index of the first voxel in this thread's column
  const long g_col_base_idx = z * long(height) * long(width) + x;

  // starting index of this thread's column in the shared-memory tile
  const int t_col_base_idx = threadIdx.z * (YCONV_BLOCKDIM_Y + offset*2) * YCONV_BLOCKDIM_X
                             + threadIdx.x + offset * YCONV_BLOCKDIM_X;

  // first y coordinate covered by this block
  const int base_y = y - threadIdx.y;

  // cooperatively load the tile plus halo; take the clamping path only
  // when the block touches the volume boundary
  if (base_y - offset < 0 || base_y + YCONV_BLOCKDIM_Y - 1 + offset >= height) {
    for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
      int y_offset = base_y + i;
      if (y_offset < 0)
        y_offset = 0;
      else if (y_offset >= height)
        y_offset = height-1;
      src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
    }
  } else {
    for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
      int y_offset = base_y + i;
      src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
    }
  }
  if (y >= height)
    return;

  __syncthreads();

  // accumulate the weighted sum, stepping one row per kernel tap
  const IMAGE_T* src_s_offset = src_s + t_col_base_idx + threadIdx.y * YCONV_BLOCKDIM_X;
  const float* kernel_offset = kernel_d_c + offset;
  float value = 0.0f;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i*YCONV_BLOCKDIM_X] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}

// Z-axis convolution using a shared-memory tile with an offset-wide halo
// of planes in front and behind, clamping reads to the volume edge
template <typename IMAGE_T, const int offset>
__global__ void convolution_z_kernel_s(const IMAGE_T* __restrict__ src_d,
                                       IMAGE_T* __restrict__ dst_d,
                                       int width, int height, int depth) {
  __shared__ IMAGE_T src_s[ZCONV_BLOCKDIM_Z * (ZCONV_BLOCKDIM_Y + 2*offset) * ZCONV_BLOCKDIM_X];
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height) {
    return;
  }

  // global index of this thread's output voxel
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // global index of this thread's (x, y) position in the z == 0 plane
  const long g_base_col_idx = y * long(width) + x;

  // starting index of this thread's column in the shared-memory tile
  const int t_base_col_idx = threadIdx.y * ZCONV_BLOCKDIM_X
                             + threadIdx.x + offset * ZCONV_BLOCKDIM_X;
  const int base_z = z - threadIdx.z;
  const long heightWidth = long(height) * long(width);
  float value = 0.0f;

  // cooperatively load the tile plus halo; take the clamping path only
  // when the block touches the volume boundary
  if (base_z - offset < 0 || base_z + ZCONV_BLOCKDIM_Z - 1 + offset >= depth) {
    for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
      int z_offset = base_z + i;
      if (z_offset < 0)
        z_offset = 0;
      else if (z_offset >= depth)
        z_offset = depth-1;
      src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
        = src_d[g_base_col_idx + z_offset*heightWidth];
    }
  } else {
    for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
      int z_offset = base_z + i;
      src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
        = src_d[g_base_col_idx + z_offset*heightWidth];
    }
  }

  if (z >= depth)
    return;

  __syncthreads();

  // accumulate the weighted sum, stepping one xy-plane per kernel tap
  const IMAGE_T* src_s_offset = src_s + t_base_col_idx
                                + threadIdx.z * ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y;
  const float* kernel_offset = kernel_d_c + offset;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// X-axis convolution reading directly from global memory; compile-time
// kernel radius, with edge voxels replicated at the volume boundary
template <typename IMAGE_T, const int offset>
__global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
                                     IMAGE_T* __restrict__ dst_d,
                                     int width, int height, int depth) {
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  // clip the kernel footprint against the volume boundary
  const int offset_neg = x - offset >= 0 ? -offset : -x;
  const int offset_pos = x + offset < width ? offset : width - x - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
    // interior voxel: fully unrolled inner loop
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i] * kernel_offset[i];
    }
  } else {
    // boundary voxel: replicate the edge value for out-of-range taps
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg] * kernel_offset[i];
    }
    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i] * kernel_offset[i];
    }
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// same as above, but with the kernel radius passed at run time;
// used as the fallback for uncommon kernel sizes
template <typename IMAGE_T>
__global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
                                     IMAGE_T* __restrict__ dst_d,
                                     int offset, int width, int height, int depth) {
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = x - offset >= 0 ? -offset : -x;
  const int offset_pos = x + offset < width ? offset : width - x - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // replicate the edge value for out-of-range taps
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg] * kernel_offset[i];
  }
  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i] * kernel_offset[i];
  }
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// Y-axis convolution from global memory; compile-time kernel radius
template <typename IMAGE_T, const int offset>
__global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
                                     IMAGE_T* __restrict__ dst_d,
                                     int width, int height, int depth) {
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = y - offset >= 0 ? -offset : -y;
  const int offset_pos = y + offset < height ? offset : height - y - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
    // interior voxel: fully unrolled inner loop
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i*width] * kernel_offset[i];
    }
  } else {
    // boundary voxel: replicate the edge value for out-of-range taps
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg*width] * kernel_offset[i];
    }
    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i*width] * kernel_offset[i];
    }
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos*width] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// same as above, but with the kernel radius passed at run time
template <typename IMAGE_T>
__global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
                                     IMAGE_T* __restrict__ dst_d,
                                     int offset, int width, int height, int depth) {
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = y - offset >= 0 ? -offset : -y;
  const int offset_pos = y + offset < height ? offset : height - y - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // replicate the edge value for out-of-range taps
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg*width] * kernel_offset[i];
  }
  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i*width] * kernel_offset[i];
  }
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos*width] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// Z-axis convolution from global memory with run-time kernel radius
template <typename IMAGE_T>
__global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
                                     IMAGE_T* __restrict__ dst_d,
                                     int offset, int width, int height, int depth) {
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  // idx must be long, not int, to avoid overflow on large volumes
  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = z - offset >= 0 ? -offset : -z;
  const int offset_pos = z + offset < depth ? offset : depth - z - 1;
  const long heightWidth = long(height) * long(width);
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // replicate the edge value for out-of-range taps
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
  }
  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i*heightWidth] * kernel_offset[i];
  }
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


// Z-axis convolution from global memory; compile-time kernel radius
template <typename IMAGE_T, const int offset>
__global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
                                     IMAGE_T* __restrict__ dst_d,
                                     int width, int height, int depth) {
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = z - offset >= 0 ? -offset : -z;
  const int offset_pos = z + offset < depth ? offset : depth - z - 1;
  const long heightWidth = long(height) * long(width);
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
    // interior voxel: fully unrolled inner loop
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i*heightWidth] * kernel_offset[i];
    }
  } else {
    // boundary voxel: replicate the edge value for out-of-range taps
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
    }
    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i*heightWidth] * kernel_offset[i];
    }
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}

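// --- hedged testing sketch (assumption, not from the original source) ------
// A minimal CPU reference for one X pass, useful for validating the kernels
// above on a small volume. It mirrors the GPU clamp-to-edge boundary rule;
// results for integral voxel types would additionally need the rounding
// applied by convert_type_and_assign_val(). Kept inside "#if 0".
#if 0
template <typename IMAGE_T>
static void convolve_x_reference(const IMAGE_T* src, float* dst, const float* kernel,
                                 int offset, int width, int height, int depth) {
  for (long z = 0; z < depth; ++z)
    for (long y = 0; y < height; ++y)
      for (long x = 0; x < width; ++x) {
        float v = 0.0f;
        for (int i = -offset; i <= offset; ++i) {
          long xi = x + i;
          if (xi < 0) xi = 0;                    // clamp-to-edge, matching the GPU path
          else if (xi >= width) xi = width - 1;
          v += src[(z * height + y) * width + xi] * kernel[i + offset];
        }
        dst[(z * height + y) * width + x] = v;
      }
}
#endif
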
// host-side wrapper for the X-axis Gaussian pass: dispatches to a fully
// unrolled shared-memory kernel for the common odd sizes 3-21, and falls
// back to the general global-memory kernel otherwise
template <typename IMAGE_T>
void gaussian1D_x_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + XCONV_BLOCKDIM_X - 1)/XCONV_BLOCKDIM_X;
  long y_grid = (height + XCONV_BLOCKDIM_Y - 1)/XCONV_BLOCKDIM_Y;
  long z_grid = (depth + XCONV_BLOCKDIM_Z - 1)/XCONV_BLOCKDIM_Z;
  dim3 block(XCONV_BLOCKDIM_X, XCONV_BLOCKDIM_Y, XCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  // the template argument is the kernel radius, i.e. kernel_size/2
  switch (kernel_size) {
    case 3:
      convolution_x_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_x_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_x_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_x_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_x_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_x_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_x_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_x_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_x_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_x_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_x_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR
}


// host-side wrapper for the Y-axis Gaussian pass; same dispatch scheme
// as gaussian1D_x_cuda
template <typename IMAGE_T>
void gaussian1D_y_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + YCONV_BLOCKDIM_X - 1)/YCONV_BLOCKDIM_X;
  long y_grid = (height + YCONV_BLOCKDIM_Y - 1)/YCONV_BLOCKDIM_Y;
  long z_grid = (depth + YCONV_BLOCKDIM_Z - 1)/YCONV_BLOCKDIM_Z;
  dim3 block(YCONV_BLOCKDIM_X, YCONV_BLOCKDIM_Y, YCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  switch (kernel_size) {
    case 3:
      convolution_y_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_y_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_y_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_y_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_y_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_y_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_y_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_y_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_y_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_y_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_y_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR
}


// host-side wrapper for the Z-axis Gaussian pass; same dispatch scheme
// as gaussian1D_x_cuda
template <typename IMAGE_T>
void gaussian1D_z_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + ZCONV_BLOCKDIM_X - 1)/ZCONV_BLOCKDIM_X;
  long y_grid = (height + ZCONV_BLOCKDIM_Y - 1)/ZCONV_BLOCKDIM_Y;
  long z_grid = (depth + ZCONV_BLOCKDIM_Z - 1)/ZCONV_BLOCKDIM_Z;
  dim3 block(ZCONV_BLOCKDIM_X, ZCONV_BLOCKDIM_Y, ZCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  switch (kernel_size) {
    case 3:
      convolution_z_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_z_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_z_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_z_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_z_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_z_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_z_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_z_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_z_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_z_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_z_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR
}