
CUDAGaussianBlur.cu

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAGaussianBlur.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.18 $        $Date: 2020/02/26 20:31:15 $
00014  *
00015  ***************************************************************************/
00024 #define WATERSHED_INTERNAL 1
00025 
00026 #include <stdio.h>
00027 #include "CUDAGaussianBlur.h"
00028 #include "Watershed.h"
00029 #include "ProfileHooks.h"
00030 #include <cuda_fp16.h>
00031 
00032 #if 0 
00033 #define CUERR { cudaError_t err; \
00034   cudaDeviceSynchronize(); \
00035   if ((err = cudaGetLastError()) != cudaSuccess) { \
00036   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00037   }}
00038 #else
00039 #define CUERR
00040 #endif
00041 
00042 #define KERNEL_MAX_SIZE 4096
00043 
00044 __constant__ float kernel_d_c[KERNEL_MAX_SIZE];
00045 
00046 #define BLOCKDIM_X 128
00047 #define BLOCKDIM_Y 1
00048 #define BLOCKDIM_Z 1
00049 
00050 // block dimensions optimized for kernel size 21
00051 #define XCONV_BLOCKDIM_X 96
00052 #define XCONV_BLOCKDIM_Y 1
00053 #define XCONV_BLOCKDIM_Z 1
00054 
00055 #define YCONV_BLOCKDIM_X 16
00056 #define YCONV_BLOCKDIM_Y 8
00057 #define YCONV_BLOCKDIM_Z 1
00058 
00059 #define ZCONV_BLOCKDIM_X 4
00060 #define ZCONV_BLOCKDIM_Y 1
00061 #define ZCONV_BLOCKDIM_Z 32
00062 
00063 #define X_PADDING 32
00064 
00065 #define SWAPF(a, b) {\
00066   float* t = a;\
00067   a = b;\
00068   b = t;\
00069 }
00070 
00071 #define INST_GAUSSIAN_CUDA(I_T) \
00072 template void gaussian1D_x_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
00073                                        int width, int height, int depth);\
00074 template void gaussian1D_y_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
00075                                        int width, int height, int depth);\
00076 template void gaussian1D_z_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
00077                                        int width, int height, int depth);
00078 
00079 INST_GAUSSIAN_CUDA(float)
00080 //INST_GAUSSIAN_CUDA(half)
00081 INST_GAUSSIAN_CUDA(unsigned short)
00082 INST_GAUSSIAN_CUDA(unsigned char)
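
// A hedged usage sketch (not part of the original file) showing how the entry points
// declared in CUDAGaussianBlur.h and defined below compose into one separable 3-D
// Gaussian blur pass.  The function name and buffer names are hypothetical; in VMD
// the real calling logic lives in the host-side blur code, not in this file.
static void example_blur_pass(float *host_vol, float *kernel, int kernel_size,
                              int width, int height, int depth) {
  long nvoxels = long(width) * long(height) * long(depth);
  int bytes = int(nvoxels * sizeof(float));
  float *src_d = (float *) alloc_cuda_array(bytes);
  float *dst_d = (float *) alloc_cuda_array(bytes);

  copy_array_to_gpu(src_d, host_vol, bytes);
  set_gaussian_1D_kernel_cuda(kernel, kernel_size);

  // one blur pass = x, y, and z convolutions, ping-ponging the buffers with SWAPF
  gaussian1D_x_cuda(src_d, dst_d, kernel_size, width, height, depth);
  SWAPF(src_d, dst_d);
  gaussian1D_y_cuda(src_d, dst_d, kernel_size, width, height, depth);
  SWAPF(src_d, dst_d);
  gaussian1D_z_cuda(src_d, dst_d, kernel_size, width, height, depth);

  copy_array_from_gpu(host_vol, dst_d, bytes);  // result of the final pass is in dst_d
  free_cuda_array(src_d);
  free_cuda_array(dst_d);
}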
00083 
00084 // Function that converts val to the appropriate type and then assigns it to arr[idx].
00085 // This is necessary for converting a real type to an integer type so that rounding is handled correctly.
00086 template <typename T>
00087 __device__ __forceinline__ void convert_type_and_assign_val(T* arr, long idx, float val) {
00088   arr[idx] = roundf(val);
00089 }
00090 
00091 template <> __device__ __forceinline__ void convert_type_and_assign_val<float>(float* arr, long idx, float val) {
00092   arr[idx] = val;
00093 }
00094 
00095 #if CUDART_VERSION >= 9000
00096 // half-precision only became mainstream with CUDA 9.x and later versions
00097 template <> __device__ __forceinline__ void convert_type_and_assign_val<half>(half* arr, long idx, float val) {
00098   arr[idx] = val;
00099 }
00100 #endif
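
// Illustrative note (not part of the original file): the roundf() in the generic
// template matters for the integer image types.  With a filtered value of, say,
// 127.6f, rounding stores 128 in an unsigned char voxel, whereas a plain assignment
// would truncate to 127 and bias the blur slightly downward.  The float and half
// specializations above simply assign the value unchanged.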
00101 
00102 void copy_array_from_gpu(void* arr, void* arr_d, int bytes) {
00103   PROFILE_PUSH_RANGE("CUDAGaussianBlur::copy_array_from_gpu()", 5);
00104  
00105   cudaMemcpy(arr, arr_d, bytes, cudaMemcpyDeviceToHost);
00106   CUERR // check and clear any existing errors
00107 
00108   PROFILE_POP_RANGE();
00109 }
00110 
00111 
00112 void copy_array_to_gpu(void* arr_d, void* arr, int bytes) {
00113   cudaMemcpyAsync(arr_d, arr, bytes, cudaMemcpyHostToDevice);
00114   CUERR // check and clear any existing errors
00115 }
00116 
00117 
00118 void* alloc_cuda_array(int bytes) {
00119   void* t;
00120   cudaMalloc(&t, bytes);
00121   CUERR // check and clear any existing errors
00122   return t;
00123 }
00124 
00125 
00126 void free_cuda_array(void* arr) {
00127   cudaFree(arr);
00128 }
00129 
00130 
00131 void set_gaussian_1D_kernel_cuda(float* kernel, int kernel_size) {
00132 
00133   if (kernel_size > KERNEL_MAX_SIZE) {
00134     printf("Warning: exceeded maximum kernel size on GPU.\n");
00135     kernel_size = KERNEL_MAX_SIZE;
00136   }
00137 
00138   cudaMemcpyToSymbolAsync(kernel_d_c, kernel, kernel_size * sizeof(float));
00139   CUERR // check and clear any existing errors
00140 }
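
// A hedged sketch (not part of the original file) of how the 1-D kernel uploaded by
// set_gaussian_1D_kernel_cuda() can be built on the host: a normalized Gaussian
// sampled at integer offsets from the center tap, matching the kernel_d_c layout the
// convolution kernels index as kernel_d_c[offset + i].  The function name and the
// choice of sigma are hypothetical, and expf() assumes <math.h> is available.
static void example_build_gaussian_kernel(float *kernel, int kernel_size, float sigma) {
  int offset = kernel_size / 2;
  float sum = 0.0f;
  for (int i = 0; i < kernel_size; ++i) {
    float x = float(i - offset);
    kernel[i] = expf(-(x * x) / (2.0f * sigma * sigma));
    sum += kernel[i];
  }
  for (int i = 0; i < kernel_size; ++i) {
    kernel[i] /= sum;   // normalize so the taps sum to 1 and overall intensity is preserved
  }
}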
00141 
00142 
00143 template <typename IMAGE_T, const int offset>
00144 __global__ void convolution_x_kernel_s(const IMAGE_T* __restrict__ src_d,
00145     IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
00146   __shared__ IMAGE_T src_s[XCONV_BLOCKDIM_Z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)];
00147   const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
00148   const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
00149   const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;
00150 
00151   if (y >= height || z >= depth) {
00152     return;
00153   }
00154 
00155   // global idx
00156   const long idx = z * long(height) * long(width) + y * long(width) + x;
00157 
00158   // idx of first entry in row (x = 0)
00159   const long g_row_base_idx = z * long(height) * long(width) + y * long(width);
00160 
00161   // idx of first entry in shared mem array row
00162   const int t_row_base_idx = threadIdx.z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)
00163                  + threadIdx.y * (XCONV_BLOCKDIM_X + offset*2) + offset;
00164 
00165   // idx of first x coord in this block
00166   const int base_x = x - threadIdx.x;
00167 
00168   // Check if we need to deal with edge cases
00169   if (base_x - offset < 0 || base_x + XCONV_BLOCKDIM_X + offset >= width) {
00170     for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
00171         int x_offset = base_x + i;
00172         if (x_offset < 0) // left edge case
00173             x_offset = 0;
00174         else if (x_offset >= width) // right edge case
00175             x_offset = width-1;
00176         src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
00177     }
00178   } else {
00179     for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
00180         int x_offset = base_x + i;
00181         src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
00182     }
00183   }
00184   if (x >= width)
00185     return;
00186 
00187   // XXX is this safe to do with early returns on all GPUs?
00188   __syncthreads();
00189 
00190   const IMAGE_T* src_s_offset = src_s + t_row_base_idx + threadIdx.x;
00191   const float* kernel_offset = kernel_d_c + offset;
00192   float value = 0.0f;
00193 #pragma unroll
00194   for (int i = -offset; i <= offset; ++i) {
00195     value += src_s_offset[i] * kernel_offset[i];
00196   }
00197 
00198   convert_type_and_assign_val(dst_d, idx, value);
00199 }
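
// Worked example for the shared-memory tile above (illustrative): for the largest
// templated case, kernel_size 21, offset is 10, so each block stages
// XCONV_BLOCKDIM_X + 2*offset = 96 + 20 = 116 IMAGE_T values per row -- the 96
// output pixels plus a 10-pixel halo on each side -- and every thread then reads its
// 21 taps from shared memory instead of global memory.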
00200 
00201 
00202 
00203 template <typename IMAGE_T, const int offset>
00204 __global__ void convolution_y_kernel_s(const IMAGE_T* __restrict__ src_d,
00205     IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
00206   __shared__ IMAGE_T src_s[YCONV_BLOCKDIM_Z * (YCONV_BLOCKDIM_Y + 2*offset) * YCONV_BLOCKDIM_X];
00207   const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
00208   const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
00209   const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;
00210 
00211   if (x >= width || z >= depth) {
00212     return;
00213   }
00214 
00215   // global idx
00216   const long idx = z * long(height) * long(width) + y * long(width) + x;
00217 
00218   // idx of first entry in column (y = 0)
00219   const long g_col_base_idx = z * long(height) * long(width) + x;
00220 
00221   // idx of first entry in shared mem array column
00222   const int t_col_base_idx = threadIdx.z * (YCONV_BLOCKDIM_Y +offset*2) * YCONV_BLOCKDIM_X
00223                  + threadIdx.x + offset * YCONV_BLOCKDIM_X;
00224 
00225   // idx of first y coord in this block
00226   const int base_y = y - threadIdx.y;
00227 
00228   // Check if we need to deal with edge cases
00229   if (base_y - offset < 0 || base_y + YCONV_BLOCKDIM_Y - 1 + offset >= height) {
00230     for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
00231         int y_offset = base_y + i;
00232         if (y_offset < 0) // top edge case
00233             y_offset = 0;
00234         else if (y_offset >= height) // bottom edge case
00235             y_offset = height-1;
00236         src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
00237     }
00238   } else {
00239     for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
00240         int y_offset = base_y + i;
00241         src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
00242     }
00243   }
00244   if (y >= height)
00245     return;
00246 
00247   // XXX is this safe to do with early returns on all GPUs?
00248   __syncthreads();
00249 
00250   const IMAGE_T* src_s_offset = src_s + t_col_base_idx + threadIdx.y * YCONV_BLOCKDIM_X;
00251   const float* kernel_offset = kernel_d_c + offset;
00252   float value = 0.0f;
00253 #pragma unroll
00254   for (int i = -offset; i <= offset; ++i) {
00255     value += src_s_offset[i*YCONV_BLOCKDIM_X] * kernel_offset[i];
00256   }
00257 
00258   convert_type_and_assign_val(dst_d, idx, value);
00259 }
00260 
00261 
00262 template <typename IMAGE_T, const int offset>
00263 __global__ void convolution_z_kernel_s(const IMAGE_T* __restrict__ src_d,
00264     IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
00265   __shared__ IMAGE_T src_s[ZCONV_BLOCKDIM_Z * (ZCONV_BLOCKDIM_Y + 2*offset) * ZCONV_BLOCKDIM_X];
00266   const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
00267   const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
00268   const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;
00269 
00270   if (x >= width || y >= height) {
00271     return;
00272   }
00273 
00274   // global idx
00275   const long idx = z * long(height) * long(width) + y * long(width) + x;
00276 
00277   // idx of first entry in column (z = 0)
00278   const long g_base_col_idx = y * long(width) + x;
00279 
00280   // idx of first entry in shared mem array column
00281   const int t_base_col_idx = threadIdx.y * ZCONV_BLOCKDIM_X 
00282                              + threadIdx.x + offset * ZCONV_BLOCKDIM_X;
00283   const int base_z = z - threadIdx.z;
00284   const long heightWidth = long(height) * long(width);
00285   float value = 0.0f;
00286 
00287   // Check if we need to deal with edge cases
00288   if (base_z - offset < 0 || base_z + ZCONV_BLOCKDIM_Z - 1 + offset >= depth) {
00289     for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
00290         int z_offset = base_z + i;
00291         if (z_offset < 0) // front edge case
00292             z_offset = 0;
00293         else if (z_offset >= depth) // back edge case
00294             z_offset = depth-1;
00295         src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
00296               = src_d[g_base_col_idx + z_offset*heightWidth];
00297     }
00298   } else {
00299     for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
00300         int z_offset = base_z + i;
00301         src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
00302               = src_d[g_base_col_idx + z_offset*heightWidth];
00303     }
00304   }
00305 
00306   if (z >= depth)
00307     return;
00308 
00309   __syncthreads();
00310 
00311   const IMAGE_T* src_s_offset = src_s + t_base_col_idx + threadIdx.z
00312                               * ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y;
00313   const float* kernel_offset = kernel_d_c + offset;
00314 #pragma unroll
00315   for (int i = -offset; i <= offset; ++i) {
00316     value += src_s_offset[i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y] * kernel_offset[i];
00317   }
00318 
00319   convert_type_and_assign_val(dst_d, idx, value);
00320 }
00321 
00322 
00323 template <typename IMAGE_T, const int offset>
00324 __global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
00325     IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
00326   const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
00327   const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
00328   const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;
00329 
00330   if (x >= width || y >= height || z >= depth) {
00331     return;
00332   }
00333 
00334   const long idx = z * long(height) * long(width) + y * long(width) + x;
00335   const int offset_neg = x - offset >= 0 ? -offset : -x;
00336   const int offset_pos = x + offset < width ? offset : width - x - 1;
00337   const float* kernel_offset = kernel_d_c + offset;
00338   const IMAGE_T* src_idx = src_d + idx;
00339   float value = 0.0f;
00340 
00341   if (offset_neg == -offset && offset_pos == offset) {
00342 #pragma unroll
00343     for (int i = -offset; i <= offset; ++i) {
00344       value += src_idx[i] * kernel_offset[i];
00345     }
00346   } else {
00347     // Handle boundary condition
00348     for (int i = -offset; i < offset_neg; ++i) {
00349       value += src_idx[offset_neg] * kernel_offset[i];
00350     }
00351 
00352     for (int i = offset_neg; i < offset_pos; ++i) {
00353       value += src_idx[i] * kernel_offset[i];
00354     }
00355 
00356     // Handle boundary condition
00357     for (int i = offset_pos; i <= offset; ++i) {
00358       value += src_idx[offset_pos] * kernel_offset[i];
00359     }
00360   }
00361 
00362   convert_type_and_assign_val(dst_d, idx, value);
00363 }
00364 
00365 
00366 template <typename IMAGE_T>
00367 __global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
00368     IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
00369   const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
00370   const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
00371   const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;
00372 
00373   if (x >= width || y >= height || z >= depth) {
00374     return;
00375   }
00376 
00377   const long idx = z * long(height) * long(width) + y * long(width) + x;
00378   const int offset_neg = x - offset >= 0 ? -offset : -x;
00379   const int offset_pos = x + offset < width ? offset : width - x - 1;
00380   const float* kernel_offset = kernel_d_c + offset;
00381   const IMAGE_T* src_idx = src_d + idx;
00382   float value = 0.0f;
00383 
00384   // Handle boundary condition
00385   for (int i = -offset; i < offset_neg; ++i) {
00386     value += src_idx[offset_neg] * kernel_offset[i];
00387   }
00388 
00389   for (int i = offset_neg; i < offset_pos; ++i) {
00390     value += src_idx[i] * kernel_offset[i];
00391   }
00392 
00393   // Handle boundary condition
00394   for (int i = offset_pos; i <= offset; ++i) {
00395     value += src_idx[offset_pos] * kernel_offset[i];
00396   }
00397 
00398   convert_type_and_assign_val(dst_d, idx, value);
00399 }
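
// Worked example of the clamp-to-edge handling above (illustrative): with offset 10
// and a voxel at x = 3, offset_neg becomes -3, so taps i = -10..-4 all read
// src_idx[-3] (the x = 0 voxel) while taps i = -3..10 read their true neighbors.
// The border value is replicated instead of reading out of bounds.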
00400 
00401 
00402 template <typename IMAGE_T, const int offset>
00403 __global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
00404     IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
00405   const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
00406   const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
00407   const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;
00408 
00409   if (x >= width || y >= height || z >= depth) {
00410     return;
00411   }
00412 
00413   const long idx = z * long(height) * long(width) + y * long(width) + x;
00414   const int offset_neg = y - offset >= 0 ? -offset : -y;
00415   const int offset_pos = y + offset < height ? offset : height - y - 1;
00416   const float* kernel_offset = kernel_d_c + offset;
00417   const IMAGE_T* src_idx = src_d + idx;
00418   float value = 0.0f;
00419 
00420   if (offset_neg == -offset && offset_pos == offset) {
00421 #pragma unroll
00422     for (int i = -offset; i <= offset; ++i) {
00423       value += src_idx[i*width] * kernel_offset[i];
00424     }
00425   } else {
00426     // Handle boundary condition
00427     for (int i = -offset; i < offset_neg; ++i) {
00428       value += src_idx[offset_neg*width] * kernel_offset[i];
00429     }
00430 
00431     for (int i = offset_neg; i < offset_pos; ++i) {
00432       value += src_idx[i*width] * kernel_offset[i];
00433     }
00434 
00435     // Handle boundary condition
00436     for (int i = offset_pos; i <= offset; ++i) {
00437       value += src_idx[offset_pos*width] * kernel_offset[i];
00438     }
00439   }
00440 
00441   convert_type_and_assign_val(dst_d, idx, value);
00442 }
00443 
00444 
00445 template <typename IMAGE_T>
00446 __global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
00447     IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
00448   const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
00449   const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
00450   const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;
00451 
00452   if (x >= width || y >= height || z >= depth) {
00453     return;
00454   }
00455 
00456   const long idx = z * long(height) * long(width) + y * long(width) + x;
00457   const int offset_neg = y - offset >= 0 ? -offset : -y;
00458   const int offset_pos = y + offset < height ? offset : height - y - 1;
00459   const float* kernel_offset = kernel_d_c + offset;
00460   const IMAGE_T* src_idx = src_d + idx;
00461   float value = 0.0f;
00462 
00463   // Handle boundary condition
00464   for (int i = -offset; i < offset_neg; ++i) {
00465     value += src_idx[offset_neg*width] * kernel_offset[i];
00466   }
00467 
00468   for (int i = offset_neg; i < offset_pos; ++i) {
00469     value += src_idx[i*width] * kernel_offset[i];
00470   }
00471 
00472   // Handle boundary condition
00473   for (int i = offset_pos; i <= offset; ++i) {
00474     value += src_idx[offset_pos*width] * kernel_offset[i];
00475   }
00476 
00477   convert_type_and_assign_val(dst_d, idx, value);
00478 }
00479 
00480 
00481 template <typename IMAGE_T>
00482 __global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
00483     IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
00484   const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
00485   const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
00486   const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;
00487 
00488   if (x >= width || y >= height || z >= depth) {
00489     return;
00490   }
00491 
00492   const long idx = z * long(height) * long(width) + y * long(width) + x;
00493   const int offset_neg = z - offset >= 0 ? -offset : -z;
00494   const int offset_pos = z + offset < depth ? offset : depth - z - 1;
00495   const long heightWidth = long(height) * long(width);
00496   const float* kernel_offset = kernel_d_c + offset;
00497   const IMAGE_T* src_idx = src_d + idx;
00498   float value = 0.0f;
00499 
00500   // Handle boundary condition
00501   for (int i = -offset; i < offset_neg; ++i) {
00502     value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
00503   }
00504 
00505   for (int i = offset_neg; i < offset_pos; ++i) {
00506     value += src_idx[i*heightWidth] * kernel_offset[i];
00507   }
00508 
00509   // Handle boundary condition
00510   for (int i = offset_pos; i <= offset; ++i) {
00511     value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
00512   }
00513 
00514   convert_type_and_assign_val(dst_d, idx, value);
00515 }
00516 
00517 
00518 template <typename IMAGE_T, const int offset>
00519 __global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
00520     IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
00521   const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
00522   const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
00523   const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;
00524 
00525   if (x >= width || y >= height || z >= depth) {
00526     return;
00527   }
00528 
00529   const long idx = z * long(height) * long(width) + y * long(width) + x;
00530   const int offset_neg = z - offset >= 0 ? -offset : -z;
00531   const int offset_pos = z + offset < depth ? offset : depth - z - 1;
00532   const long heightWidth = long(height) * long(width);
00533   const float* kernel_offset = kernel_d_c + offset;
00534   const IMAGE_T* src_idx = src_d + idx;
00535   float value = 0.0f;
00536 
00537   if (offset_neg == -offset && offset_pos == offset) {
00538 #pragma unroll
00539     for (int i = -offset; i <= offset; ++i) {
00540       value += src_idx[i*heightWidth] * kernel_offset[i];
00541     }
00542   } else {
00543     // Handle boundary condition
00544     for (int i = -offset; i < offset_neg; ++i) {
00545       value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
00546     }
00547 
00548     for (int i = offset_neg; i < offset_pos; ++i) {
00549       value += src_idx[i*heightWidth] * kernel_offset[i];
00550     }
00551 
00552     // Handle boundary condition
00553     for (int i = offset_pos; i <= offset; ++i) {
00554       value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
00555     }
00556   }
00557 
00558 
00559   convert_type_and_assign_val(dst_d, idx, value);
00560 }
00561 
00562 
00563 template <typename IMAGE_T>
00564 void gaussian1D_x_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
00565                        int width, int height, int depth) {
00566   long x_grid = (width + XCONV_BLOCKDIM_X - 1)/XCONV_BLOCKDIM_X;
00567   long y_grid = (height + XCONV_BLOCKDIM_Y - 1)/XCONV_BLOCKDIM_Y;
00568   long z_grid = (depth + XCONV_BLOCKDIM_Z - 1)/XCONV_BLOCKDIM_Z;
00569   dim3 block(XCONV_BLOCKDIM_X, XCONV_BLOCKDIM_Y, XCONV_BLOCKDIM_Z);
00570   dim3 grid(x_grid, y_grid, z_grid);
00571 
00572   // XXX TODO
00573   // This comment applies to all of the gaussian1D_*_cuda functions.
00574   // We should probably pick a few supported kernel sizes that are farther
00575   // apart, such as 7, 15, 25, 35, 47, 59, etc., and round each blur's kernel
00576   // size up to the nearest supported size.  Since sigma roughly doubles after
00577   // each round of blurring, the current scheme is inefficient: only a few of
00578   // the templated cases are used before falling through to the default case.
00579   // If we build the kernel with the same sigma but a larger kernel_size, it
00580   // should produce the same (or slightly more accurate) results, and the
00581   // performance gained from a templated kernel far outweighs the cost of a
00582   // slightly larger-than-needed kernel_size.
00583   // (An illustrative sketch of this idea follows this function.)
00584 
00585   switch (kernel_size) {
00586     case 3:
00587     convolution_x_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
00588                                           width, height, depth);
00589     break;
00590     case 5:
00591     convolution_x_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
00592                                           width, height, depth);
00593     break;
00594     case 7:
00595     convolution_x_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
00596                                           width, height, depth);
00597     break;
00598     case 9:
00599     convolution_x_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
00600                                           width, height, depth);
00601     break;
00602     case 11:
00603     convolution_x_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
00604                                           width, height, depth);
00605     break;
00606     case 13:
00607     convolution_x_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
00608                                           width, height, depth);
00609     break;
00610     case 15:
00611     convolution_x_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
00612                                           width, height, depth);
00613     break;
00614     case 17:
00615     convolution_x_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
00616                                           width, height, depth);
00617     break;
00618     case 19:
00619     convolution_x_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
00620                                           width, height, depth);
00621     break;
00622     case 21:
00623     convolution_x_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
00624                                           width, height, depth);
00625     break;
00626     default:
00627     convolution_x_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
00628                                kernel_size/2, width, height, depth);
00629     break;
00630   }
00631 
00632   CUERR // check and clear any existing errors
00633 }
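
// An illustrative sketch (not part of the original file) of the TODO noted in
// gaussian1D_x_cuda() above: round the requested kernel size up to the nearest size
// that already has a templated shared-memory kernel, so the fast path is taken more
// often.  The helper name is hypothetical and it only rounds within the sizes
// templated in this file; building the kernel with the same sigma but a slightly
// larger size yields the same (or marginally more accurate) result.
static int example_round_up_kernel_size(int kernel_size) {
  static const int supported[] = { 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 };
  for (size_t i = 0; i < sizeof(supported) / sizeof(supported[0]); ++i) {
    if (kernel_size <= supported[i])
      return supported[i];
  }
  return kernel_size;  // too large for the templated cases; use the general kernel
}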
00634 
00635 
00636 template <typename IMAGE_T>
00637 void gaussian1D_y_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
00638                        int width, int height, int depth) {
00639   long x_grid = (width + YCONV_BLOCKDIM_X - 1)/YCONV_BLOCKDIM_X;
00640   long y_grid = (height + YCONV_BLOCKDIM_Y - 1)/YCONV_BLOCKDIM_Y;
00641   long z_grid = (depth + YCONV_BLOCKDIM_Z - 1)/YCONV_BLOCKDIM_Z;
00642   dim3 block(YCONV_BLOCKDIM_X, YCONV_BLOCKDIM_Y, YCONV_BLOCKDIM_Z);
00643   dim3 grid(x_grid, y_grid, z_grid);
00644 
00645   switch (kernel_size) {
00646     case 3:
00647     convolution_y_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
00648                                           width, height, depth);
00649     break;
00650     case 5:
00651     convolution_y_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
00652                                           width, height, depth);
00653     break;
00654     case 7:
00655     convolution_y_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
00656                                           width, height, depth);
00657     break;
00658     case 9:
00659     convolution_y_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
00660                                           width, height, depth);
00661     break;
00662     case 11:
00663     convolution_y_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
00664                                           width, height, depth);
00665     break;
00666     case 13:
00667     convolution_y_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
00668                                           width, height, depth);
00669     break;
00670     case 15:
00671     convolution_y_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
00672                                           width, height, depth);
00673     break;
00674     case 17:
00675     convolution_y_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
00676                                           width, height, depth);
00677     break;
00678     case 19:
00679     convolution_y_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
00680                                           width, height, depth);
00681     break;
00682     case 21:
00683     convolution_y_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
00684                                           width, height, depth);
00685     break;
00686     default:
00687     convolution_y_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
00688                                kernel_size/2, width, height, depth);
00689     break;
00690   }
00691 
00692   CUERR // check and clear any existing errors
00693 }
00694 
00695 
00696 template <typename IMAGE_T>
00697 void gaussian1D_z_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
00698                        int width, int height, int depth) {
00699   long x_grid = (width + ZCONV_BLOCKDIM_X - 1)/ZCONV_BLOCKDIM_X;
00700   long y_grid = (height + ZCONV_BLOCKDIM_Y - 1)/ZCONV_BLOCKDIM_Y;
00701   long z_grid = (depth + ZCONV_BLOCKDIM_Z - 1)/ZCONV_BLOCKDIM_Z;
00702   dim3 block(ZCONV_BLOCKDIM_X, ZCONV_BLOCKDIM_Y, ZCONV_BLOCKDIM_Z);
00703   dim3 grid(x_grid, y_grid, z_grid);
00704 
00705   switch (kernel_size) {
00706     case 3:
00707     convolution_z_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
00708                                           width, height, depth);
00709     break;
00710     case 5:
00711     convolution_z_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
00712                                           width, height, depth);
00713     break;
00714     case 7:
00715     convolution_z_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
00716                                           width, height, depth);
00717     break;
00718     case 9:
00719     convolution_z_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
00720                                           width, height, depth);
00721     break;
00722     case 11:
00723     convolution_z_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
00724                                           width, height, depth);
00725     break;
00726     case 13:
00727     convolution_z_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
00728                                           width, height, depth);
00729     break;
00730     case 15:
00731     convolution_z_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
00732                                           width, height, depth);
00733     break;
00734     case 17:
00735     convolution_z_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
00736                                           width, height, depth);
00737     break;
00738     case 19:
00739     convolution_z_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
00740                                           width, height, depth);
00741     break;
00742     case 21:
00743     convolution_z_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
00744                                           width, height, depth);
00745     break;
00746     default:
00747     convolution_z_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
00748                                kernel_size/2, width, height, depth);
00749     break;
00750   }
00751 
00752   CUERR // check and clear any existing errors
00753 }
00754 
00755 
