GaussianBlur.C Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: GaussianBlur.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.27 $        $Date: 2020/10/29 03:16:18 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   Perform Gaussian blur filtering on 3-D volumes. Primarily for use in 
00019  *   scale-space filtering for 3-D image segmentation of Cryo-EM density maps.
00020  ***************************************************************************/
00021 
00022 #include <string.h>
00023 #include <stdlib.h>
00024 #include <stdio.h>
00025 #include <math.h>
00026 #include <climits>
00027 #include "GaussianBlur.h"
00028 #include "CUDAGaussianBlur.h"
00029 #include "ProfileHooks.h"
00030 
00031 #define PI_F (3.14159265359f)
00032 
00033 template <typename IMAGE_T>
00034 GaussianBlur<IMAGE_T>::GaussianBlur(IMAGE_T* img, int w, int h, int d, bool use_cuda) {
00035   host_image_needs_update = 0;
00036   height = h;
00037   width = w;
00038   depth = d;
00039   image_d = NULL;
00040   image = NULL;
00041   scratch = NULL;
00042   heightWidth = long(height) * long(width);
00043   nVoxels = heightWidth * long(depth);
00044   cuda = use_cuda;
00045 #ifndef VMDCUDA
00046   cuda = false;
00047 #endif
00048 
00049   image = new IMAGE_T[nVoxels];
00050 #ifdef VMDCUDA
00051   if (cuda) {
00052     image_d = (IMAGE_T*) alloc_cuda_array(nVoxels * sizeof(IMAGE_T));
00053     scratch_d = (IMAGE_T*) alloc_cuda_array(nVoxels * sizeof(IMAGE_T));
00054     if (image_d == NULL || scratch_d == NULL) {
00055       cuda = false;
00056       free_cuda_array(image_d);
00057       free_cuda_array(scratch_d);
00058     } else {
00059       copy_array_to_gpu(image_d, img, nVoxels * sizeof(IMAGE_T));
00060       copy_array_to_gpu(scratch_d, img, nVoxels * sizeof(IMAGE_T));
00061     }
00062   } 
00063   if (!cuda)
00064 #endif
00065   {
00066     scratch = new IMAGE_T[nVoxels];
00067     memcpy(image, img, nVoxels * sizeof(IMAGE_T));
00068   }
00069 }
00070 
00071 
00072 template <typename IMAGE_T>
00073 GaussianBlur<IMAGE_T>::~GaussianBlur() {
00074   delete [] image;
00075 #ifdef VMDCUDA
00076   if (cuda) {
00077     free_cuda_array(image_d);
00078     free_cuda_array(scratch_d);
00079   } else
00080 #endif
00081     delete [] scratch;
00082 }
00083 
00084 
00085 template <typename IMAGE_T>
00086 IMAGE_T* GaussianBlur<IMAGE_T>::get_image() {
00087 #ifdef VMDCUDA
00088   if (host_image_needs_update) {
00089     copy_array_from_gpu(image, image_d, nVoxels * sizeof(IMAGE_T));
00090     host_image_needs_update = 0;
00091   }
00092 #endif
00093 
00094   return image;
00095 }
00096 
00097 
00098 template <typename IMAGE_T>
00099 IMAGE_T* GaussianBlur<IMAGE_T>::get_image_d() {
00100   return image_d;
00101 }
00102 
00103 // This is an arbitarily  "max" size for the CPU, but
00104 // is the max for the GPU. Additionally we probably never want
00105 // to run a blur with a kernel_size this larger anyways as it is likely
00106 // larger than the image. I think it makes sense to cap the sigma to a certian
00107 // large value. That would have to be done before this function is called though.
00108 #define MAX_KERNEL_SIZE 4096
00109 
00110 template <typename IMAGE_T>
00111 int GaussianBlur<IMAGE_T>::getKernelSizeForSigma(float sigma) {
00112   int size = int(ceilf(sigma * 6));
00113   if (size % 2 == 0) {
00114     ++size;
00115   }
00116   if (size < 3) {
00117     size = 3;
00118   }
00119   if (size >=  MAX_KERNEL_SIZE) {
00120     size = MAX_KERNEL_SIZE;
00121   }
00122   return size;
00123 }
00124 
00125 
00126 template <typename IMAGE_T>
00127 void GaussianBlur<IMAGE_T>::fillGaussianBlurKernel3D(float sigma, int size, float* kernel) {
00128   float sigma2 = sigma * sigma;
00129   int middle = size / 2;
00130   for (int z = 0; z < size; z++) {
00131     for (int y = 0; y < size; y++) {
00132       for (int x = 0; x < size; x++) {
00133         const int idx = z * size * size + y * size + x;
00134         float x_dist = float(middle - x);
00135         float y_dist = float(middle - y);
00136         float z_dist = float(middle - z);
00137         float distance2 = x_dist * x_dist + y_dist * y_dist + z_dist * z_dist;
00138         float s = 1.0f / (sigma * sqrtf(2.0f * PI_F)) * expf(-distance2 / (2.0f * sigma2));
00139         kernel[idx] = s;
00140       }
00141     }
00142   }
00143 }
00144 
00145 
00146 template <typename IMAGE_T>
00147 void GaussianBlur<IMAGE_T>::fillGaussianBlurKernel1D(float sigma, int size, float* kernel) {
00148   float sigma2 = sigma * sigma;
00149   int middle = size / 2;
00150   for (int i = 0; i < size; ++i) {
00151     float distance = float (middle - i);
00152     float distance2 = distance * distance;
00153     float s = 1.0f / (sigma * sqrtf(2.0f*PI_F)) * expf(-distance2 / (2.0f * sigma2));
00154     kernel[i] = s;
00155   }
00156 }
00157 
00158 
00159 #define SWAPT(a, b) {\
00160   IMAGE_T* t = a;\
00161   a = b;\
00162   b = t;\
00163 }
00164 
00165 
00166 template <typename IMAGE_T>
00167 void GaussianBlur<IMAGE_T>::blur(float sigma) {
00168 #ifdef VMDCUDA
00169   if (cuda) {
00170     blur_cuda(sigma);
00171   } else
00172 #endif
00173     blur_cpu(sigma);
00174 }
00175 
00176 
00177 #if defined(VMDCUDA)
00178 template <typename IMAGE_T>
00179 void GaussianBlur<IMAGE_T>::blur_cuda(float sigma) {
00180   PROFILE_PUSH_RANGE("GaussianBlur<IMAGE_T>::blur_cuda()", 7);
00181 
00182   if (true) {  //TODO temp hack to test 3D gaussian kernel
00183     int size = getKernelSizeForSigma(sigma);
00184     float *kernel = new float[size];
00185     fillGaussianBlurKernel1D(sigma, size, kernel);
00186     set_gaussian_1D_kernel_cuda(kernel, size);
00187 
00188     gaussian1D_x_cuda(image_d, scratch_d, size, width, height, depth);
00189     SWAPT(image_d, scratch_d);
00190 
00191     gaussian1D_y_cuda(image_d, scratch_d, size, width, height, depth);
00192     SWAPT(image_d, scratch_d);
00193 
00194     gaussian1D_z_cuda(image_d, scratch_d, size, width, height, depth);
00195     SWAPT(image_d, scratch_d);
00196 
00197     // don't copy back to the host until we are absolutely forced to...
00198     host_image_needs_update = 1;
00199 
00200     delete [] kernel;
00201   } else {
00202     int size = getKernelSizeForSigma(sigma);
00203     float *kernel = new float[size * size * size];
00204     fillGaussianBlurKernel3D(sigma, size, kernel);
00205     delete [] kernel;
00206   }
00207 
00208   PROFILE_POP_RANGE();
00209 }
00210 #endif
00211 
00212 #if 1
00213 template <typename T>
00214 static inline void assign_float_to_dest(float val, T* dest) {
00215   // truncation is not great for quantized integer representations,
00216   // but it is at least portable...
00217   dest[0] = T(val);
00218 }
00219 #else
00220 template <typename T>
00221 static inline void assign_float_to_dest(float val, T* dest) {
00222   dest[0] = roundf(val);
00223 }
00224 
00225 // this case causes problem with older compilers
00226 template <> inline void assign_float_to_dest<float>(float val, float* dest) {
00227   dest[0] = val;
00228 }
00229 #endif
00230 
00231 template <typename IMAGE_T>
00232 void GaussianBlur<IMAGE_T>::blur_cpu(float sigma) {
00233   int x, y, z;
00234   int size = getKernelSizeForSigma(sigma);
00235   float *kernel = new float[size];
00236   fillGaussianBlurKernel1D(sigma, size, kernel);
00237   const int offset = size >> 1;
00238   SWAPT(scratch, image);
00239 
00240   /* X kernel */
00241 #ifdef USE_OMP
00242 #pragma omp parallel for schedule(static)
00243 #endif
00244   for (z = 0; z < depth; ++z) {
00245     for (y = 0; y < height; ++y) {
00246       for (x = 0; x < width; ++x) {
00247         const long idx = z*heightWidth + y*long(width) + x;
00248         const int new_offset_neg = x - offset >= 0 ? -offset :  -x;
00249         const int new_offset_pos = x + offset < width ? offset : width - x - 1;
00250         float value = 0.0f;
00251         int i;
00252         for (i = -offset; i < new_offset_neg; ++i) {
00253           value += scratch[idx + new_offset_neg] * kernel[i+offset];
00254         }
00255 
00256         for (; i <= new_offset_pos; ++i) {
00257           value += scratch[idx + i] * kernel[i+offset];
00258         }
00259 
00260         for (; i <= offset; ++i) {
00261           value += scratch[idx + new_offset_pos] * kernel[i+offset];
00262         }
00263 
00264         assign_float_to_dest(value, &image[idx]);
00265       }
00266     }
00267   }
00268   SWAPT(scratch, image);
00269 
00270   /* Y kernel */
00271 #ifdef USE_OMP
00272 #pragma omp parallel for schedule(static)
00273 #endif
00274   for (z = 0; z < depth; ++z) {
00275     for (y = 0; y < height; ++y) {
00276       const int new_offset_neg = y - offset >= 0 ? -offset :  -y;
00277       const int new_offset_pos = y + offset < height ? offset : height - y - 1;
00278       for (x = 0; x < width; ++x) {
00279         const long idx = z*heightWidth + y*long(width) + x;
00280         float value = 0.0f;
00281         int i;
00282 
00283         for (i = -offset; i < new_offset_neg; ++i) {
00284           value += scratch[idx + new_offset_neg*width] * kernel[i+offset];
00285         }
00286 
00287         for (; i <= new_offset_pos; ++i) {
00288           value += scratch[idx + i*width] * kernel[i+offset];
00289         }
00290 
00291         for (; i <= offset; ++i) {
00292           value += scratch[idx + new_offset_pos*width] * kernel[i+offset];
00293         }
00294 
00295         assign_float_to_dest(value, &image[idx]);
00296       }
00297     }
00298   }
00299   SWAPT(scratch, image);
00300 
00301   /* Z kernel */
00302 #ifdef USE_OMP
00303 #pragma omp parallel for schedule(static)
00304 #endif
00305   for (z = 0; z < depth; ++z) {
00306     const int new_offset_neg = z - offset >= 0 ? -offset :  -z;
00307     const int new_offset_pos = z + offset < depth ? offset : depth - z - 1;
00308     for (y = 0; y < height; ++y) {
00309       for (x = 0; x < width; ++x) {
00310         const long idx = z*heightWidth + y*long(width) + x;
00311         float value = 0.0f;
00312         int i;
00313 
00314         for (i = -offset; i < new_offset_neg; ++i) {
00315           value += scratch[idx + new_offset_neg*heightWidth] * kernel[i+offset];
00316         }
00317 
00318         for (; i <= new_offset_pos; ++i) {
00319             value += scratch[idx + i*heightWidth] * kernel[i+offset];
00320         }
00321 
00322         for (; i <= offset; ++i) {
00323           value += scratch[idx + new_offset_pos*heightWidth] * kernel[i+offset];
00324         }
00325 
00326         assign_float_to_dest(value, &image[idx]);
00327       }
00328     }
00329   }
00330 
00331   delete [] kernel;
00332 }
00333 
00334 template class GaussianBlur<float>;
00335 template class GaussianBlur<unsigned short>;
00336 template class GaussianBlur<unsigned char>;