00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <string.h>
00023 #include <stdlib.h>
00024 #include <stdio.h>
00025 #include <math.h>
00026 #include <climits>
00027 #include "GaussianBlur.h"
00028 #include "CUDAGaussianBlur.h"
00029 #include "ProfileHooks.h"
00030
00031 #define PI_F (3.14159265359f)
00032
00033 template <typename IMAGE_T>
00034 GaussianBlur<IMAGE_T>::GaussianBlur(IMAGE_T* img, int w, int h, int d, bool use_cuda) {
00035 host_image_needs_update = 0;
00036 height = h;
00037 width = w;
00038 depth = d;
00039 image_d = NULL;
00040 image = NULL;
00041 scratch = NULL;
00042 heightWidth = long(height) * long(width);
00043 nVoxels = heightWidth * long(depth);
00044 cuda = use_cuda;
00045 #ifndef VMDCUDA
00046 cuda = false;
00047 #endif
00048
00049 image = new IMAGE_T[nVoxels];
00050 #ifdef VMDCUDA
00051 if (cuda) {
00052 image_d = (IMAGE_T*) alloc_cuda_array(nVoxels * sizeof(IMAGE_T));
00053 scratch_d = (IMAGE_T*) alloc_cuda_array(nVoxels * sizeof(IMAGE_T));
00054 if (image_d == NULL || scratch_d == NULL) {
00055 cuda = false;
00056 free_cuda_array(image_d);
00057 free_cuda_array(scratch_d);
00058 } else {
00059 copy_array_to_gpu(image_d, img, nVoxels * sizeof(IMAGE_T));
00060 copy_array_to_gpu(scratch_d, img, nVoxels * sizeof(IMAGE_T));
00061 }
00062 }
00063 if (!cuda)
00064 #endif
00065 {
00066 scratch = new IMAGE_T[nVoxels];
00067 memcpy(image, img, nVoxels * sizeof(IMAGE_T));
00068 }
00069 }
00070
00071
00072 template <typename IMAGE_T>
00073 GaussianBlur<IMAGE_T>::~GaussianBlur() {
00074 delete [] image;
00075 #ifdef VMDCUDA
00076 if (cuda) {
00077 free_cuda_array(image_d);
00078 free_cuda_array(scratch_d);
00079 } else
00080 #endif
00081 delete [] scratch;
00082 }
00083
00084
00085 template <typename IMAGE_T>
00086 IMAGE_T* GaussianBlur<IMAGE_T>::get_image() {
00087 #ifdef VMDCUDA
00088 if (host_image_needs_update) {
00089 copy_array_from_gpu(image, image_d, nVoxels * sizeof(IMAGE_T));
00090 host_image_needs_update = 0;
00091 }
00092 #endif
00093
00094 return image;
00095 }
00096
00097
00098 template <typename IMAGE_T>
00099 IMAGE_T* GaussianBlur<IMAGE_T>::get_image_d() {
00100 return image_d;
00101 }
00102
00103
00104
00105
00106
00107
00108 #define MAX_KERNEL_SIZE 4096
00109
00110 template <typename IMAGE_T>
00111 int GaussianBlur<IMAGE_T>::getKernelSizeForSigma(float sigma) {
00112 int size = int(ceilf(sigma * 6));
00113 if (size % 2 == 0) {
00114 ++size;
00115 }
00116 if (size < 3) {
00117 size = 3;
00118 }
00119 if (size >= MAX_KERNEL_SIZE) {
00120 size = MAX_KERNEL_SIZE;
00121 }
00122 return size;
00123 }
00124
00125
00126 template <typename IMAGE_T>
00127 void GaussianBlur<IMAGE_T>::fillGaussianBlurKernel3D(float sigma, int size, float* kernel) {
00128 float sigma2 = sigma * sigma;
00129 int middle = size / 2;
00130 for (int z = 0; z < size; z++) {
00131 for (int y = 0; y < size; y++) {
00132 for (int x = 0; x < size; x++) {
00133 const int idx = z * size * size + y * size + x;
00134 float x_dist = float(middle - x);
00135 float y_dist = float(middle - y);
00136 float z_dist = float(middle - z);
00137 float distance2 = x_dist * x_dist + y_dist * y_dist + z_dist * z_dist;
00138 float s = 1.0f / (sigma * sqrtf(2.0f * PI_F)) * expf(-distance2 / (2.0f * sigma2));
00139 kernel[idx] = s;
00140 }
00141 }
00142 }
00143 }
00144
00145
00146 template <typename IMAGE_T>
00147 void GaussianBlur<IMAGE_T>::fillGaussianBlurKernel1D(float sigma, int size, float* kernel) {
00148 float sigma2 = sigma * sigma;
00149 int middle = size / 2;
00150 for (int i = 0; i < size; ++i) {
00151 float distance = float (middle - i);
00152 float distance2 = distance * distance;
00153 float s = 1.0f / (sigma * sqrtf(2.0f*PI_F)) * expf(-distance2 / (2.0f * sigma2));
00154 kernel[i] = s;
00155 }
00156 }
00157
00158
00159 #define SWAPT(a, b) {\
00160 IMAGE_T* t = a;\
00161 a = b;\
00162 b = t;\
00163 }
00164
00165
00166 template <typename IMAGE_T>
00167 void GaussianBlur<IMAGE_T>::blur(float sigma) {
00168 #ifdef VMDCUDA
00169 if (cuda) {
00170 blur_cuda(sigma);
00171 } else
00172 #endif
00173 blur_cpu(sigma);
00174 }
00175
00176
00177 #if defined(VMDCUDA)
00178 template <typename IMAGE_T>
00179 void GaussianBlur<IMAGE_T>::blur_cuda(float sigma) {
00180 PROFILE_PUSH_RANGE("GaussianBlur<IMAGE_T>::blur_cuda()", 7);
00181
00182 if (true) {
00183 int size = getKernelSizeForSigma(sigma);
00184 float *kernel = new float[size];
00185 fillGaussianBlurKernel1D(sigma, size, kernel);
00186 set_gaussian_1D_kernel_cuda(kernel, size);
00187
00188 gaussian1D_x_cuda(image_d, scratch_d, size, width, height, depth);
00189 SWAPT(image_d, scratch_d);
00190
00191 gaussian1D_y_cuda(image_d, scratch_d, size, width, height, depth);
00192 SWAPT(image_d, scratch_d);
00193
00194 gaussian1D_z_cuda(image_d, scratch_d, size, width, height, depth);
00195 SWAPT(image_d, scratch_d);
00196
00197
00198 host_image_needs_update = 1;
00199
00200 delete [] kernel;
00201 } else {
00202 int size = getKernelSizeForSigma(sigma);
00203 float *kernel = new float[size * size * size];
00204 fillGaussianBlurKernel3D(sigma, size, kernel);
00205 delete [] kernel;
00206 }
00207
00208 PROFILE_POP_RANGE();
00209 }
00210 #endif
00211
00212 #if 1
00213 template <typename T>
00214 static inline void assign_float_to_dest(float val, T* dest) {
00215
00216
00217 dest[0] = T(val);
00218 }
00219 #else
00220 template <typename T>
00221 static inline void assign_float_to_dest(float val, T* dest) {
00222 dest[0] = roundf(val);
00223 }
00224
00225
00226 template <> inline void assign_float_to_dest<float>(float val, float* dest) {
00227 dest[0] = val;
00228 }
00229 #endif
00230
00231 template <typename IMAGE_T>
00232 void GaussianBlur<IMAGE_T>::blur_cpu(float sigma) {
00233 int x, y, z;
00234 int size = getKernelSizeForSigma(sigma);
00235 float *kernel = new float[size];
00236 fillGaussianBlurKernel1D(sigma, size, kernel);
00237 const int offset = size >> 1;
00238 SWAPT(scratch, image);
00239
00240
00241 #ifdef USE_OMP
00242 #pragma omp parallel for schedule(static)
00243 #endif
00244 for (z = 0; z < depth; ++z) {
00245 for (y = 0; y < height; ++y) {
00246 for (x = 0; x < width; ++x) {
00247 const long idx = z*heightWidth + y*long(width) + x;
00248 const int new_offset_neg = x - offset >= 0 ? -offset : -x;
00249 const int new_offset_pos = x + offset < width ? offset : width - x - 1;
00250 float value = 0.0f;
00251 int i;
00252 for (i = -offset; i < new_offset_neg; ++i) {
00253 value += scratch[idx + new_offset_neg] * kernel[i+offset];
00254 }
00255
00256 for (; i <= new_offset_pos; ++i) {
00257 value += scratch[idx + i] * kernel[i+offset];
00258 }
00259
00260 for (; i <= offset; ++i) {
00261 value += scratch[idx + new_offset_pos] * kernel[i+offset];
00262 }
00263
00264 assign_float_to_dest(value, &image[idx]);
00265 }
00266 }
00267 }
00268 SWAPT(scratch, image);
00269
00270
00271 #ifdef USE_OMP
00272 #pragma omp parallel for schedule(static)
00273 #endif
00274 for (z = 0; z < depth; ++z) {
00275 for (y = 0; y < height; ++y) {
00276 const int new_offset_neg = y - offset >= 0 ? -offset : -y;
00277 const int new_offset_pos = y + offset < height ? offset : height - y - 1;
00278 for (x = 0; x < width; ++x) {
00279 const long idx = z*heightWidth + y*long(width) + x;
00280 float value = 0.0f;
00281 int i;
00282
00283 for (i = -offset; i < new_offset_neg; ++i) {
00284 value += scratch[idx + new_offset_neg*width] * kernel[i+offset];
00285 }
00286
00287 for (; i <= new_offset_pos; ++i) {
00288 value += scratch[idx + i*width] * kernel[i+offset];
00289 }
00290
00291 for (; i <= offset; ++i) {
00292 value += scratch[idx + new_offset_pos*width] * kernel[i+offset];
00293 }
00294
00295 assign_float_to_dest(value, &image[idx]);
00296 }
00297 }
00298 }
00299 SWAPT(scratch, image);
00300
00301
00302 #ifdef USE_OMP
00303 #pragma omp parallel for schedule(static)
00304 #endif
00305 for (z = 0; z < depth; ++z) {
00306 const int new_offset_neg = z - offset >= 0 ? -offset : -z;
00307 const int new_offset_pos = z + offset < depth ? offset : depth - z - 1;
00308 for (y = 0; y < height; ++y) {
00309 for (x = 0; x < width; ++x) {
00310 const long idx = z*heightWidth + y*long(width) + x;
00311 float value = 0.0f;
00312 int i;
00313
00314 for (i = -offset; i < new_offset_neg; ++i) {
00315 value += scratch[idx + new_offset_neg*heightWidth] * kernel[i+offset];
00316 }
00317
00318 for (; i <= new_offset_pos; ++i) {
00319 value += scratch[idx + i*heightWidth] * kernel[i+offset];
00320 }
00321
00322 for (; i <= offset; ++i) {
00323 value += scratch[idx + new_offset_pos*heightWidth] * kernel[i+offset];
00324 }
00325
00326 assign_float_to_dest(value, &image[idx]);
00327 }
00328 }
00329 }
00330
00331 delete [] kernel;
00332 }
00333
00334 template class GaussianBlur<float>;
00335 template class GaussianBlur<unsigned short>;
00336 template class GaussianBlur<unsigned char>;