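//
// CUDA kernels and host wrappers for segmentation post-processing:
// compacting (renumbering) group IDs, finding per-group maximum voxel
// values and their indices, and merging groups by hill-climbing from each
// group's maximum toward neighboring maxima.
//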
#include <stdio.h>
#include <limits.h>   // INT_MAX
#include "CUDAParPrefixOps.h"
#include "CUDASegmentation.h"

#if 0
#define CUERR { cudaError_t err; \
  cudaDeviceSynchronize(); \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
    printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  }}
#else
#define CUERR
#endif

#define GROUP_BLOCK_SIZE 1024
#define VOXEL_BLOCK_SIZE 1024


// Total number of groups found during compaction; written by
// update_group_map() on the device and copied back to the host with
// cudaMemcpyFromSymbol().
__device__ unsigned long nGroups_d;

void free_gpu_temp_storage(gpuseg_temp_storage *tmp) {
  if (tmp == NULL)
    return;

  if (tmp->tmp_d != NULL) {
    cudaFree(tmp->tmp_d);
  }
  tmp->tmp_d = NULL;
  tmp->sz = 0;
}

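// Rewrite every voxel's group ID through the group_map_d lookup table
// (grid-stride loop over all voxels).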
template <typename GROUP_T>
__global__ void update_groups_from_map(GROUP_T* __restrict__ groups_d,
                                       const GROUP_T* __restrict__ group_map_d,
                                       const long nVoxels) {
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T old_group = groups_d[i];
    groups_d[i] = group_map_d[old_group];
  }
}

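// Mark every group ID that actually occurs in groups_d by setting its
// entry in group_map_d to 1.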
template <typename GROUP_T>
__global__ void init_group_map(const GROUP_T* __restrict__ groups_d,
                               GROUP_T* __restrict__ group_map_d,
                               const long nVoxels) {
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_id = groups_d[i];
    group_map_d[group_id] = 1;
  }
}

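// Convert the 0/1 occupancy flags in group_map_d into a dense renumbering
// using the exclusive prefix sum in scan_results, and record the new total
// group count in nGroups_d.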
template <typename GROUP_T>
__global__ void update_group_map(GROUP_T* __restrict__ group_map_d,
                                 const GROUP_T* __restrict__ scan_results,
                                 const long nGroups) {
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;

  for (long i = idx; i < nGroups; i += gridDim.x * blockDim.x) {
    if (i == nGroups - 1) {
      nGroups_d = group_map_d[i] + scan_results[i];
    }
    if (group_map_d[i] == 1) {
      group_map_d[i] = scan_results[i];
    }
  }
}

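// Compact a sparse set of group IDs into the dense range [0, nGroups).
// The caller passes the current group count; the compacted count is
// returned.  Optional gpuseg_temp_storage structs allow scratch buffers to
// be reused across calls.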
template <typename GROUP_T>
long sequentialize_groups_cuda(GROUP_T* groups_d, GROUP_T* group_map_d,
                               unsigned long nVoxels, unsigned long nGroups,
                               gpuseg_temp_storage *tmp,
                               gpuseg_temp_storage *scanwork) {
  int num_voxel_blocks = (nVoxels + VOXEL_BLOCK_SIZE-1) / VOXEL_BLOCK_SIZE;
  int num_group_blocks = (nGroups + GROUP_BLOCK_SIZE-1) / GROUP_BLOCK_SIZE;
  GROUP_T* scanout_d = NULL;
  void *scanwork_d = NULL;
  long scanwork_sz = 0;

  if (num_voxel_blocks > 65535)
    num_voxel_blocks = 65535;
  if (num_group_blocks > 65535)
    num_group_blocks = 65535;

  CUERR

  // obtain a scan output buffer, reusing caller-provided temp storage
  // when it is present and large enough
  if (tmp != NULL) {
    if (tmp->tmp_d == NULL || tmp->sz < nGroups * long(sizeof(GROUP_T))) {
      if (tmp->tmp_d != NULL)
        cudaFree(tmp->tmp_d);
      tmp->tmp_d = NULL;
      tmp->sz = 0;

      tmp->sz = nGroups * sizeof(GROUP_T);
      cudaMalloc(&tmp->tmp_d, tmp->sz);
    }
    scanout_d = static_cast<GROUP_T*>(tmp->tmp_d);
  } else {
    // no temp storage struct provided, allocate and free on every call
    cudaMalloc(&scanout_d, nGroups * sizeof(GROUP_T));
  }
  CUERR

#if defined(VMDUSECUB)
  // CUB-based scans need their own workspace; size and (re)allocate it
  if (scanwork != NULL) {
    long sz = dev_excl_scan_sum_tmpsz(group_map_d, nGroups, scanout_d,
                                      GROUP_T(0));
    if (scanwork->tmp_d == NULL || scanwork->sz < sz) {
      if (scanwork->tmp_d != NULL)
        cudaFree(scanwork->tmp_d);
      scanwork->tmp_d = NULL;

      scanwork->sz = sz;
      cudaMalloc(&scanwork->tmp_d, scanwork->sz);
    }

    scanwork_d = scanwork->tmp_d;
    scanwork_sz = scanwork->sz;
  }
#endif

  CUERR

  // clear the group map, then flag each group ID that is actually in use
  cudaMemsetAsync(group_map_d, 0, nGroups * sizeof(GROUP_T));
  CUERR

  init_group_map<GROUP_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(groups_d, group_map_d, nVoxels);
  CUERR

  // exclusive prefix sum over the occupancy flags yields the new group IDs
  dev_excl_scan_sum(group_map_d, nGroups, scanout_d,
                    scanwork_d, scanwork_sz, GROUP_T(0));
  CUERR

  // convert the flags into the dense renumbering and record the new count
  update_group_map<GROUP_T><<<num_group_blocks, GROUP_BLOCK_SIZE>>>(group_map_d, scanout_d, nGroups);
  CUERR

  // relabel all voxels through the new map
  update_groups_from_map<<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(groups_d, group_map_d, nVoxels);
  CUERR

  // fetch the compacted group count computed on the device
  cudaMemcpyFromSymbol(&nGroups, nGroups_d, sizeof(unsigned long));

  if (tmp == NULL) {
    cudaFree(scanout_d);
  }

  return nGroups;
}


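// Convert a voxel value to an int whose ordering matches the original
// value's ordering, so that integer atomicMax can be used on float data.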
template <typename IN_T>
__device__ void getOrderedInt( IN_T input , int* retAddr) {
  retAddr[0] = (int)input;
}

template <> __device__ void getOrderedInt<float>( float input , int* retAddr) {
  int intVal = __float_as_int( input );
  if (intVal < 0) {
    intVal ^= 0x7FFFFFFF;
  }
  retAddr[0] = intVal;
}


// host-side equivalent of getOrderedInt(), used to compute init values
template <typename IN_T>
int getOrderedIntHost(IN_T floatVal) {
  return (int)floatVal;
}

template <> int getOrderedIntHost<float>(float floatVal) {
  int intVal = *((int *) &floatVal);
  if (intVal < 0) {
    intVal ^= 0x7FFFFFFF;
  }
  return intVal;
}

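// Initialize the per-group running maxima to -INT_MAX.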
template <typename IMAGE_T>
__global__ void init_max_array(int* max_values, long nGroups) {
  long idx = blockIdx.x * blockDim.x + threadIdx.x;

  for (long i=idx; i<nGroups; i+=gridDim.x * blockDim.x) {
    getOrderedInt((int)-INT_MAX, &max_values[i]);
  }
}

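// Baseline per-group maximum search: every voxel issues a global-memory
// atomicMax on its group's entry.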
template <typename GROUP_T, typename IMAGE_T>
__global__ void find_max_values(const GROUP_T* __restrict__ groups_d,
                                const IMAGE_T* __restrict__ image_d,
                                int* __restrict__ max_values,
                                const long nVoxels) {
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;

  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_num = groups_d[i];
    int voxel_value;
    getOrderedInt(image_d[i], &voxel_value);

    atomicMax(&max_values[group_num], voxel_value);
  }
}

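// Per-group maximum search with a warp-level pre-reduction: each warp first
// determines which of its lanes holds the winning value for each group it
// touches, and only those lanes issue the global atomicMax, reducing atomic
// traffic when neighboring voxels share a group.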
template <typename GROUP_T, typename IMAGE_T>
__global__ void find_max_values_warpred(const GROUP_T* __restrict__ groups_d,
                                        const IMAGE_T* __restrict__ image_d,
                                        int* __restrict__ max_values,
                                        const long nVoxels) {
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;

  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_num = groups_d[i];
    int voxel_value;
    getOrderedInt(image_d[i], &voxel_value);
#if __CUDA_ARCH__ >= 300
    // Use warp shuffles to find, for each group represented in the warp,
    // the lane holding the largest value.
    int mylaneid = threadIdx.x % warpSize;
    int maxlaneid = mylaneid;
    for (int lane = 0; lane<warpSize; lane++) {
      int grpmatched = (group_num == __shfl_sync(0xffffffff, group_num, lane, warpSize));
      int laneval = __shfl_sync(0xffffffff, voxel_value, lane, warpSize);
      if (grpmatched) {
        if (laneval > voxel_value)
          maxlaneid = lane;

        // break ties in favor of the lowest lane ID
        if ((laneval == voxel_value) && (lane < mylaneid))
          maxlaneid = lane;
      }
    }

    // only the winning lane for each group performs the atomic update
    if (maxlaneid == mylaneid)
      atomicMax(&max_values[group_num], voxel_value);
#else
    // fallback for old hardware: plain per-voxel atomics
    atomicMax(&max_values[group_num], voxel_value);
#endif
  }
}


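// Per-group maximum search using a block-private shared memory copy of the
// max array; requires nGroups <= blockDim.x.  Each block reduces into
// shared memory first and then merges its results into global memory with
// one atomicMax per group.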
template <typename GROUP_T, typename IMAGE_T>
__global__ void find_max_values_shm(const GROUP_T* __restrict__ groups_d,
                                    const IMAGE_T* __restrict__ image_d,
                                    int* __restrict__ max_values,
                                    const long nVoxels,
                                    int initval, long nGroups) {
  extern __shared__ int max_vals_shm[];
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;

  // initialize the block-local max array before any thread updates it
  max_vals_shm[threadIdx.x] = initval;
  __syncthreads();

  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_num = groups_d[i];
    int voxel_value;
    getOrderedInt(image_d[i], &voxel_value);
    atomicMax(&max_vals_shm[group_num], voxel_value);
  }
  __syncthreads();

  // merge the block-local results into the global max array
  if (threadIdx.x < nGroups) {
    atomicMax(&max_values[threadIdx.x], max_vals_shm[threadIdx.x]);
  }
}

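// Variant of find_max_values_shm for nGroups <= 2*blockDim.x, using a
// shared memory array twice the block size.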
template <typename GROUP_T, typename IMAGE_T>
__global__ void find_max_values_shm_2xl(const GROUP_T* __restrict__ groups_d,
                                        const IMAGE_T* __restrict__ image_d,
                                        int* __restrict__ max_values,
                                        const long nVoxels,
                                        int initval, long nGroups) {
  extern __shared__ int max_vals_shm[];
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;

  // each thread initializes two shared memory entries
  int gdx = threadIdx.x;
  max_vals_shm[gdx] = initval;
  gdx = blockDim.x + threadIdx.x;
  max_vals_shm[gdx] = initval;
  __syncthreads();

  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_num = groups_d[i];
    int voxel_value;
    getOrderedInt(image_d[i], &voxel_value);
    atomicMax(&max_vals_shm[group_num], voxel_value);
  }
  __syncthreads();

  // merge the block-local results into the global max array
  gdx = threadIdx.x;
  if (gdx < nGroups) {
    atomicMax(&max_values[gdx], max_vals_shm[gdx]);
  }
  gdx = blockDim.x + threadIdx.x;
  if (gdx < nGroups) {
    atomicMax(&max_values[gdx], max_vals_shm[gdx]);
  }
}

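// Variant of find_max_values_shm for nGroups <= 4*blockDim.x, using a
// shared memory array four times the block size.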
template <typename GROUP_T, typename IMAGE_T>
__global__ void find_max_values_shm_4xl(const GROUP_T* __restrict__ groups_d,
                                        const IMAGE_T* __restrict__ image_d,
                                        int* __restrict__ max_values,
                                        const long nVoxels,
                                        int initval, long nGroups) {
  extern __shared__ int max_vals_shm[];
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;

  // each thread initializes four shared memory entries
  int gdx = threadIdx.x;
  max_vals_shm[gdx] = initval;
  gdx = blockDim.x + threadIdx.x;
  max_vals_shm[gdx] = initval;
  gdx = 2*blockDim.x + threadIdx.x;
  max_vals_shm[gdx] = initval;
  gdx = 3*blockDim.x + threadIdx.x;
  max_vals_shm[gdx] = initval;
  __syncthreads();

  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_num = groups_d[i];
    int voxel_value;
    getOrderedInt(image_d[i], &voxel_value);
    atomicMax(&max_vals_shm[group_num], voxel_value);
  }
  __syncthreads();

  // merge the block-local results into the global max array
  gdx = threadIdx.x;
  if (gdx < nGroups) {
    atomicMax(&max_values[gdx], max_vals_shm[gdx]);
  }
  gdx = blockDim.x + threadIdx.x;
  if (gdx < nGroups) {
    atomicMax(&max_values[gdx], max_vals_shm[gdx]);
  }
  gdx = 2*blockDim.x + threadIdx.x;
  if (gdx < nGroups) {
    atomicMax(&max_values[gdx], max_vals_shm[gdx]);
  }
  gdx = 3*blockDim.x + threadIdx.x;
  if (gdx < nGroups) {
    atomicMax(&max_values[gdx], max_vals_shm[gdx]);
  }
}


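// For each voxel whose value equals its group's maximum, record the voxel
// index as that group's max-value index (if several voxels tie, whichever
// writes last wins).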
template <typename GROUP_T, typename IMAGE_T>
__global__ void update_group_idx(const GROUP_T* __restrict__ groups_d,
                                 const IMAGE_T* __restrict__ image_d,
                                 const int* __restrict__ max_values,
                                 unsigned long* __restrict__ group_idx,
                                 const unsigned long nVoxels) {
  long idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (long i=idx; i<nVoxels; i+=gridDim.x * blockDim.x) {
    const GROUP_T group_num = groups_d[i];
    int max_val;
    getOrderedInt(image_d[i], &max_val);
    if (max_values[group_num] == max_val) {
      group_idx[group_num] = i;
    }
  }
}

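// Find, for each group, the linear index of a voxel holding the group's
// maximum value.  Chooses among several kernels depending on how many
// groups there are relative to the block size and the voxel count.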
template <typename GROUP_T, typename IMAGE_T>
void find_groups_max_idx_cuda(GROUP_T* groups_d, IMAGE_T* image_d, unsigned long* max_idx, unsigned long nVoxels,
                              unsigned long nGroups, gpuseg_temp_storage *tmp) {
  int num_voxel_blocks = (nVoxels + VOXEL_BLOCK_SIZE-1) / VOXEL_BLOCK_SIZE;
  int num_group_blocks = (nGroups + GROUP_BLOCK_SIZE-1) / GROUP_BLOCK_SIZE;
  int *max_values;

  if (num_voxel_blocks > 65535)
    num_voxel_blocks = 65535;
  if (num_group_blocks > 65535)
    num_group_blocks = 65535;

  // obtain the per-group max buffer, reusing caller-provided temp storage
  // when it is present and large enough
  if (tmp != NULL) {
    if (tmp->tmp_d == NULL || tmp->sz < nGroups * long(sizeof(int))) {
      if (tmp->tmp_d != NULL)
        cudaFree(tmp->tmp_d);
      tmp->tmp_d = NULL;
      tmp->sz = 0;

      cudaMalloc(&tmp->tmp_d, nGroups * sizeof(int));
      tmp->sz = nGroups * sizeof(int);
    }
    max_values = static_cast<int *>(tmp->tmp_d);
  } else {
    // no temp storage struct provided, allocate and free on every call
    cudaMalloc(&max_values, nGroups * sizeof(int));
  }

  CUERR

  init_max_array<IMAGE_T><<<num_group_blocks, GROUP_BLOCK_SIZE>>>(max_values, nGroups);
  CUERR

  // pick the max-search kernel: shared-memory variants when the group count
  // is small enough, warp-level reduction when groups are reused across
  // many voxels, and plain global atomics otherwise
  int initval = getOrderedIntHost((int)-INT_MAX);
  if (nGroups <= VOXEL_BLOCK_SIZE) {
    find_max_values_shm<GROUP_T, IMAGE_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE, VOXEL_BLOCK_SIZE*sizeof(int)>>>(groups_d, image_d, max_values, nVoxels, initval, nGroups);
#if 1
  } else if (nGroups <= 2*VOXEL_BLOCK_SIZE) {
    find_max_values_shm_2xl<GROUP_T, IMAGE_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE, 2*VOXEL_BLOCK_SIZE*sizeof(int)>>>(groups_d, image_d, max_values, nVoxels, initval, nGroups);
  } else if (nGroups <= 4*VOXEL_BLOCK_SIZE) {
    find_max_values_shm_4xl<GROUP_T, IMAGE_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE, 4*VOXEL_BLOCK_SIZE*sizeof(int)>>>(groups_d, image_d, max_values, nVoxels, initval, nGroups);
#endif
  } else if (nGroups < nVoxels) {
    find_max_values_warpred<GROUP_T, IMAGE_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(groups_d, image_d, max_values, nVoxels);
  } else {
    find_max_values<GROUP_T, IMAGE_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(groups_d, image_d, max_values, nVoxels);
  }
  CUERR

  update_group_idx<GROUP_T, IMAGE_T><<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(groups_d, image_d, max_values, max_idx, nVoxels);
  CUERR

  if (tmp == NULL) {
    cudaFree(max_values);
  }
}


#define NUM_N 18

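// Merge groups by hill climbing: each block takes a group (grid-stride over
// groups), starts at that group's maximum voxel, and repeatedly steps to the
// largest of the 18 face/edge neighbors until no neighbor is larger; the
// group is then mapped to the group owning the final voxel.  Threads
// 0..NUM_N of each block cooperate, one neighbor offset per thread.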
template <typename GROUP_T, typename IMAGE_T>
__global__ void hill_climb_kernel(IMAGE_T* image_d, unsigned long* max_int_d, GROUP_T* group_map_d, GROUP_T* groups_d,
                                  int height, int width, int depth, unsigned long nGroups) {
  __shared__ IMAGE_T neighbor_vals[NUM_N + 1];
  int tIdx = threadIdx.x;
  if (tIdx >= NUM_N + 1) {
    return;
  }
  for (int g = blockIdx.x; g < nGroups; g += gridDim.x) {
    long curr_idx;
    long new_idx = max_int_d[g];
    do {
      // offset 0 is the current voxel; the rest are its 18 face and edge
      // neighbors in the 3-D grid
      int offsets[NUM_N + 1] = { 0, 1, -1, width, -width, height*width, -height*width,
                                 1 + width, 1 - width, -1 + width, -1 - width,
                                 1 + height*width, 1 - height*width, -1 + height*width, -1 - height*width,
                                 width + height*width, width - height*width, -width + height*width, -width - height*width};
      curr_idx = new_idx;
      int z = new_idx / (height * width);
      int r = new_idx % (height * width);
      int y = r / width;
      int x = r % width;

      // zero out offsets that would step outside the volume
      if (x >= width - 1) {
        offsets[1] = 0;
        offsets[7] = 0;
        offsets[8] = 0;
        offsets[11] = 0;
        offsets[12] = 0;
      }

      if (x <= 0) {
        offsets[2] = 0;
        offsets[9] = 0;
        offsets[10] = 0;
        offsets[13] = 0;
        offsets[14] = 0;
      }

      if (y >= height - 1) {
        offsets[3] = 0;
        offsets[7] = 0;
        offsets[9] = 0;
        offsets[15] = 0;
        offsets[16] = 0;
      }

      if (y <= 0) {
        offsets[4] = 0;
        offsets[8] = 0;
        offsets[10] = 0;
        offsets[17] = 0;
        offsets[18] = 0;
      }

      if (z >= depth - 1) {
        offsets[5] = 0;
        offsets[11] = 0;
        offsets[13] = 0;
        offsets[15] = 0;
        offsets[17] = 0;
      }

      if (z <= 0) {
        offsets[6] = 0;
        offsets[12] = 0;
        offsets[14] = 0;
        offsets[16] = 0;
        offsets[18] = 0;
      }

      // each thread fetches one neighbor value
      int offset = offsets[tIdx];
      neighbor_vals[tIdx] = image_d[curr_idx + offset];
      __syncthreads();

      // move to the largest neighbor, if any exceeds the current voxel
      IMAGE_T curr_val = neighbor_vals[0];
      int new_offset = 0;
      for (int i = 1; i <= NUM_N; ++i) {
        if (neighbor_vals[i] > curr_val) {
          curr_val = neighbor_vals[i];
          new_offset = offsets[i];
        }
      }
      new_idx += new_offset;
      // all threads must finish reading neighbor_vals before the next
      // iteration overwrites it
      __syncthreads();
    } while (curr_idx != new_idx);

    if (tIdx == 0) {
      group_map_d[g] = groups_d[new_idx];
    }
  }
}


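// Host wrapper: hill-climb from each group's maximum and then relabel all
// voxels through the resulting group map.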
template <typename GROUP_T, typename IMAGE_T>
void hill_climb_merge_cuda(GROUP_T* groups_d, IMAGE_T* image_d, unsigned long* max_idx_d, GROUP_T* group_map_d,
                           int height, int width, int depth, unsigned long nGroups) {
  long nVoxels = long(height) * long(width) * long(depth);
  int num_voxel_blocks = (nVoxels + VOXEL_BLOCK_SIZE-1) / VOXEL_BLOCK_SIZE;
  int nBlocks = nGroups > 65535 ? 65535 : nGroups;

  CUERR

  hill_climb_kernel<<<nBlocks, 32>>>(image_d, max_idx_d, group_map_d, groups_d,
                                     height, width, depth, nGroups);

  CUERR
  update_groups_from_map<<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(groups_d, group_map_d, nVoxels);

  CUERR
}

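// Variant used after watershed: hill-climb over new_segments_d to build the
// group map, then relabel segments_d through it.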
template <typename GROUP_T, typename IMAGE_T>
void watershed_hill_climb_merge_cuda(GROUP_T* segments_d, GROUP_T* new_segments_d, IMAGE_T* image_d, GROUP_T* group_map_d,
                                     unsigned long* max_idx_d, long height, long width, long depth, unsigned long nGroups) {
  long nVoxels = long(height) * long(width) * long(depth);
  int num_voxel_blocks = (nVoxels + VOXEL_BLOCK_SIZE-1) / VOXEL_BLOCK_SIZE;
  int nBlocks = nGroups > 65535 ? 65535 : nGroups;

  CUERR

  hill_climb_kernel<<<nBlocks, 32>>>(image_d, max_idx_d, group_map_d, new_segments_d,
                                     height, width, depth, nGroups);

  CUERR
  update_groups_from_map<<<num_voxel_blocks, VOXEL_BLOCK_SIZE>>>(segments_d, group_map_d, nVoxels);

  CUERR
}

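// Copy an array while converting between element types (grid-stride loop).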
template <typename IN_T, typename OUT_T>
__global__ void copy_and_convert_kernel(IN_T* in, OUT_T* out, long num_elements) {
  const long idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (long i=idx; i<num_elements; i+=gridDim.x * blockDim.x) {
    out[i] = (OUT_T)in[i];
  }
}

template <typename IN_T, typename OUT_T>
void copy_and_convert_type_cuda(IN_T* in, OUT_T* out, long num_elements) {
  int nBlocks = (num_elements + VOXEL_BLOCK_SIZE-1) / VOXEL_BLOCK_SIZE;
  nBlocks = nBlocks > 65535 ? 65535 : nBlocks;

  copy_and_convert_kernel<<<nBlocks, 32>>>(in, out, num_elements);
}

// explicit template instantiations for the supported type combinations

#define COPY_CONV_INST(T) \
  template void copy_and_convert_type_cuda<T,unsigned long>(T* in, unsigned long* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,unsigned int>(T* in, unsigned int* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,unsigned short>(T* in, unsigned short* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,unsigned char>(T* in, unsigned char* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,long>(T* in, long* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,int>(T* in, int* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,short>(T* in, short* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,char>(T* in, char* out, long num_elements);\
  template void copy_and_convert_type_cuda<T,float>(T* in, float* out, long num_elements);

#define INST_SEQ_GROUPS_CUDA(G_T) \
  template long sequentialize_groups_cuda<G_T>(G_T* groups_d, G_T* group_map_d,\
                                               unsigned long nVoxels, unsigned long nGroups,\
                                               gpuseg_temp_storage *tmp,\
                                               gpuseg_temp_storage *scanwork);

#define INST_FIND_MAX_IDX_CUDA(G_T) \
  template void find_groups_max_idx_cuda<G_T, float>(G_T* groups_d, float* image_d, unsigned long* max_idx,\
                                                     unsigned long nVoxels, unsigned long nGroups,\
                                                     gpuseg_temp_storage *tmp);\
  template void find_groups_max_idx_cuda<G_T, unsigned short>(G_T* groups_d, unsigned short* image_d, unsigned long* max_idx,\
                                                              unsigned long nVoxels, unsigned long nGroups,\
                                                              gpuseg_temp_storage *tmp);\
  template void find_groups_max_idx_cuda<G_T, unsigned char>(G_T* groups_d, unsigned char* image_d, unsigned long* max_idx,\
                                                             unsigned long nVoxels, unsigned long nGroups,\
                                                             gpuseg_temp_storage *tmp);

#define INST_HILL_CLIMB_MERGE_CUDA(G_T) \
  template void hill_climb_merge_cuda<G_T, float>(G_T* groups_d, float* image_d, unsigned long* max_idx_d,\
                                                  G_T* group_map_d, int height, int width, int depth, unsigned long nGroups);\
  template void hill_climb_merge_cuda<G_T, unsigned short>(G_T* groups_d, unsigned short* image_d, unsigned long* max_idx_d,\
                                                           G_T* group_map_d, int height, int width, int depth, unsigned long nGroups);\
  template void hill_climb_merge_cuda<G_T, unsigned char>(G_T* groups_d, unsigned char* image_d, unsigned long* max_idx_d,\
                                                          G_T* group_map_d, int height, int width, int depth, unsigned long nGroups);

#define INST_WATERSHED_HILL_CLIMB_MERGE_CUDA(G_T) \
  template void watershed_hill_climb_merge_cuda<G_T, float>(G_T* segments_d, G_T* new_segments_d, float* image_d, G_T* group_map_d,\
                                                            unsigned long* max_idx_d, long height, long width, long depth, unsigned long nGroups);\
  template void watershed_hill_climb_merge_cuda<G_T, unsigned short>(G_T* segments_d, G_T* new_segments_d,\
                                                                     unsigned short* image_d, G_T* group_map_d,\
                                                                     unsigned long* max_idx_d, long height, long width, long depth, unsigned long nGroups);\
  template void watershed_hill_climb_merge_cuda<G_T, unsigned char>(G_T* segments_d, G_T* new_segments_d,\
                                                                    unsigned char* image_d, G_T* group_map_d,\
                                                                    unsigned long* max_idx_d, long height, long width, long depth, unsigned long nGroups);


COPY_CONV_INST(long)
COPY_CONV_INST(unsigned long)
COPY_CONV_INST(int)
COPY_CONV_INST(unsigned int)
COPY_CONV_INST(short)
COPY_CONV_INST(unsigned short)
COPY_CONV_INST(char)
COPY_CONV_INST(unsigned char)
COPY_CONV_INST(float)

INST_SEQ_GROUPS_CUDA(unsigned long)
INST_SEQ_GROUPS_CUDA(unsigned int)
INST_SEQ_GROUPS_CUDA(unsigned short)

INST_FIND_MAX_IDX_CUDA(unsigned long)
INST_FIND_MAX_IDX_CUDA(unsigned int)
INST_FIND_MAX_IDX_CUDA(unsigned short)

INST_HILL_CLIMB_MERGE_CUDA(unsigned long)
INST_HILL_CLIMB_MERGE_CUDA(unsigned int)
INST_HILL_CLIMB_MERGE_CUDA(unsigned short)

INST_WATERSHED_HILL_CLIMB_MERGE_CUDA(unsigned long)
INST_WATERSHED_HILL_CLIMB_MERGE_CUDA(unsigned int)
INST_WATERSHED_HILL_CLIMB_MERGE_CUDA(unsigned short)
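

#if 0
// Illustrative sketch (not compiled): one plausible way the host wrappers
// above could be chained for a merge pass over an existing segmentation.
// The device buffers (groups_d, image_d, group_map_d, max_idx_d) and the
// initial nGroups value are assumed to have been allocated and populated by
// the caller; the function name and buffer names are placeholders, not part
// of this file's API.
static unsigned long example_merge_pass(unsigned int *groups_d, float *image_d,
                                        unsigned int *group_map_d,
                                        unsigned long *max_idx_d,
                                        int height, int width, int depth,
                                        unsigned long nGroups) {
  long nVoxels = long(height) * long(width) * long(depth);

  // reusable scratch buffers, grown on demand by the routines above
  gpuseg_temp_storage tmp;
  tmp.tmp_d = NULL;
  tmp.sz = 0;
  gpuseg_temp_storage scanwork;
  scanwork.tmp_d = NULL;
  scanwork.sz = 0;

  // 1) compact group IDs into the dense range [0, nGroups)
  nGroups = sequentialize_groups_cuda(groups_d, group_map_d, nVoxels, nGroups,
                                      &tmp, &scanwork);

  // 2) locate the maximum-value voxel of each group
  find_groups_max_idx_cuda(groups_d, image_d, max_idx_d, nVoxels, nGroups, &tmp);

  // 3) merge groups whose maxima hill-climb into a neighboring group,
  //    then compact the surviving group IDs again
  hill_climb_merge_cuda(groups_d, image_d, max_idx_d, group_map_d,
                        height, width, depth, nGroups);
  nGroups = sequentialize_groups_cuda(groups_d, group_map_d, nVoxels, nGroups,
                                      &tmp, &scanwork);

  free_gpu_temp_storage(&tmp);
  free_gpu_temp_storage(&scanwork);
  return nGroups;
}
#endif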