00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00022 #include <stdlib.h>
00023 #include "CUDAParPrefixOps.h"
00024 
00025 #if defined(VMDUSECUB)
00026 #include <cub/cub.cuh>
00027 #else
00028 #include <thrust/scan.h>
00029 #include <thrust/execution_policy.h>
00030 #endif
00031 
00032 
00033 #if 0
00034 #define CUERR { cudaError_t err; \
00035   cudaDeviceSynchronize(); \
00036   if ((err = cudaGetLastError()) != cudaSuccess) { \
00037   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00038   }}
00039 #else
00040 #define CUERR
00041 #endif
00042 
00043 #if defined(VMDUSECUB)
00044 
00045 
00046 struct VMDScanSum {
00047   template <typename T>
00048   
00049   __device__ __forceinline__
00050   T operator()(const T &a, const T &b) const {
00051     return a + b;
00052   }
00053 };
00054 
00055 
00056 
00057 
00058 template <typename T>
00059 long dev_excl_scan_sum_tmpsz(T *in_d, long nitems, T *out_d, T ival) {
00060   size_t tsz = 0;
00061   VMDScanSum sum_op;
00062   cub::DeviceScan::ExclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00063                                  sum_op, ival, nitems);
00064   return (long) tsz;
00065 }
00066 
00067 template <typename T>
00068 void dev_excl_scan_sum(T *in_d, long nitems, T *out_d,
00069                        void *scanwork_d, long tlsz, T ival) {
00070   VMDScanSum sum_op;
00071   int autoallocate=0;
00072   size_t tsz = tlsz;
00073 
00074   if (scanwork_d == NULL) {
00075     autoallocate=1;
00076     cub::DeviceScan::ExclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00077                                    sum_op, ival, nitems);
00078 
00079     cudaMalloc(&scanwork_d, tsz); 
00080   }
00081 
00082   cub::DeviceScan::ExclusiveScan(scanwork_d, tsz, in_d, out_d, 
00083                                  sum_op, ival, nitems);
00084   if (autoallocate)
00085     cudaFree(scanwork_d);
00086 }
00087 
00088 
00089 
00090 
00091 
00092 template <typename T>
00093 long dev_incl_scan_sum_tmpsz(T *in_d, long nitems, T *out_d) {
00094   size_t tsz = 0;
00095   VMDScanSum sum_op;
00096   cub::DeviceScan::InclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00097                                  sum_op, nitems);
00098   return (long) tsz;
00099 }
00100 
00101 
00102 template <typename T>
00103 void dev_incl_scan_sum(T *in_d, long nitems, T *out_d,
00104                        void *scanwork_d, long tlsz) {
00105   VMDScanSum sum_op;
00106   int autoallocate=0;
00107   size_t tsz = tlsz;
00108 
00109   if (scanwork_d == NULL) {
00110     autoallocate=1;
00111     cub::DeviceScan::InclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00112                                    sum_op, nitems);
00113 
00114     cudaMalloc(&scanwork_d, tsz); 
00115   }
00116 
00117   cub::DeviceScan::InclusiveScan(scanwork_d, tsz, in_d, out_d, 
00118                                  sum_op, nitems);
00119   if (autoallocate)
00120     cudaFree(scanwork_d);
00121 }
00122 
00123 #else
00124 
00125 
00126 
00127 
00128 
00129 template <typename T>
00130 long dev_excl_scan_sum_tmpsz(T *in_d, long nGroups, T *out_d, T ival) {
00131   return 0;
00132 }
00133 
00134 
00135 template <typename T>
00136 void dev_excl_scan_sum(T *in_d, long nGroups, T *out_d,
00137                        void *scanwork_d, long tsz, T ival) {
00138   
00139   
00140   
00141   
00142   
00143   
00144   
00145   
00146   thrust::exclusive_scan(thrust::cuda::par, in_d, in_d + nGroups, out_d);
00147 }
00148 
00149 
00150 
00151 
00152 
00153 template <typename T>
00154 long dev_incl_scan_sum_tmpsz(T *in_d, long nGroups, T *out_d) {
00155   return 0;
00156 }
00157 
00158 template <typename T>
00159 void dev_incl_scan_sum(T *in_d, long nGroups, T *out_d,
00160                        void *scanwork_d, long tsz) {
00161   
00162   
00163   
00164   
00165   
00166   
00167   
00168   
00169   thrust::inclusive_scan(thrust::cuda::par, in_d, in_d + nGroups, out_d);
00170 }
00171 
00172 
00173 #endif
00174 
00175 
00176 
00177 
00178 
00179 
00180 #define INST_DEV_EXCL_SCAN_SUM_TMPSZ(T) template long dev_excl_scan_sum_tmpsz<T>(T*, long, T*, T);
00181 #define INST_DEV_EXCL_SCAN_SUM(T) template void dev_excl_scan_sum<T>(T*, long, T*, void*, long, T);
00182 
00183 INST_DEV_EXCL_SCAN_SUM_TMPSZ(long)
00184 INST_DEV_EXCL_SCAN_SUM_TMPSZ(int)
00185 INST_DEV_EXCL_SCAN_SUM_TMPSZ(short)
00186 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned long)
00187 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned int)
00188 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned short)
00189 
00190 INST_DEV_EXCL_SCAN_SUM(long)
00191 INST_DEV_EXCL_SCAN_SUM(int)
00192 INST_DEV_EXCL_SCAN_SUM(short)
00193 INST_DEV_EXCL_SCAN_SUM(unsigned long)
00194 INST_DEV_EXCL_SCAN_SUM(unsigned int)
00195 INST_DEV_EXCL_SCAN_SUM(unsigned short)
00196 
00197 
00198 #if 0
00199 inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) {
00200   return make_uint2(a.x + b.x, a.y + b.y);
00201 }
00202 
00203 template long dev_excl_scan_sum_tmpsz<uint2>(uint2*, long, uint2*, uint2);
00204 template void dev_excl_scan_sum<uint2>(uint2*, long, uint2*, void*, long, uint2);
00205 #endif
00206 
00207 template void dev_incl_scan_sum<float>(float*, long, float*, void*, long);
00208 
00209