Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

CUDAParPrefixOps.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAParPrefixOps.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.11 $        $Date: 2020/02/26 20:16:56 $
00014  *
00015  ***************************************************************************/
00022 #include <stdlib.h>
00023 #include "CUDAParPrefixOps.h"
00024 
00025 #if defined(VMDUSECUB)
00026 #include <cub/cub.cuh>
00027 #else
00028 #include <thrust/scan.h>
00029 #include <thrust/execution_policy.h>
00030 #endif
00031 
00032 
00033 #if 0
00034 #define CUERR { cudaError_t err; \
00035   cudaDeviceSynchronize(); \
00036   if ((err = cudaGetLastError()) != cudaSuccess) { \
00037   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00038   }}
00039 #else
00040 #define CUERR
00041 #endif
00042 
00043 #if defined(VMDUSECUB)
00044 
00045 // VMDScanSum functor
00046 struct VMDScanSum {
00047   template <typename T>
00048   //  CUB_RUNTIME_FUNCTION __forceinline__
00049   __device__ __forceinline__
00050   T operator()(const T &a, const T &b) const {
00051     return a + b;
00052   }
00053 };
00054 
00055 // 
00056 // Exclusive scan
00057 //
00058 template <typename T>
00059 long dev_excl_scan_sum_tmpsz(T *in_d, long nitems, T *out_d, T ival) {
00060   size_t tsz = 0;
00061   VMDScanSum sum_op;
00062   cub::DeviceScan::ExclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00063                                  sum_op, ival, nitems);
00064   return (long) tsz;
00065 }
00066 
00067 template <typename T>
00068 void dev_excl_scan_sum(T *in_d, long nitems, T *out_d,
00069                        void *scanwork_d, long tlsz, T ival) {
00070   VMDScanSum sum_op;
00071   int autoallocate=0;
00072   size_t tsz = tlsz;
00073 
00074   if (scanwork_d == NULL) {
00075     autoallocate=1;
00076     cub::DeviceScan::ExclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00077                                    sum_op, ival, nitems);
00078 //    printf("One-time alloc scan tmp size: %ld\n", tsz);
00079     cudaMalloc(&scanwork_d, tsz); 
00080   }
00081 
00082   cub::DeviceScan::ExclusiveScan(scanwork_d, tsz, in_d, out_d, 
00083                                  sum_op, ival, nitems);
00084   if (autoallocate)
00085     cudaFree(scanwork_d);
00086 }
00087 
00088 
00089 // 
00090 // Inclusive scan
00091 //
00092 template <typename T>
00093 long dev_incl_scan_sum_tmpsz(T *in_d, long nitems, T *out_d) {
00094   size_t tsz = 0;
00095   VMDScanSum sum_op;
00096   cub::DeviceScan::InclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00097                                  sum_op, nitems);
00098   return (long) tsz;
00099 }
00100 
00101 
00102 template <typename T>
00103 void dev_incl_scan_sum(T *in_d, long nitems, T *out_d,
00104                        void *scanwork_d, long tlsz) {
00105   VMDScanSum sum_op;
00106   int autoallocate=0;
00107   size_t tsz = tlsz;
00108 
00109   if (scanwork_d == NULL) {
00110     autoallocate=1;
00111     cub::DeviceScan::InclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL, 
00112                                    sum_op, nitems);
00113 //    printf("One-time alloc scan tmp size: %ld\n", tsz);
00114     cudaMalloc(&scanwork_d, tsz); 
00115   }
00116 
00117   cub::DeviceScan::InclusiveScan(scanwork_d, tsz, in_d, out_d, 
00118                                  sum_op, nitems);
00119   if (autoallocate)
00120     cudaFree(scanwork_d);
00121 }
00122 
00123 #else
00124 
00125 
00126 // 
00127 // Exclusive scan
00128 //
00129 template <typename T>
00130 long dev_excl_scan_sum_tmpsz(T *in_d, long nGroups, T *out_d, T ival) {
00131   return 0;
00132 }
00133 
00134 
00135 template <typename T>
00136 void dev_excl_scan_sum(T *in_d, long nGroups, T *out_d,
00137                        void *scanwork_d, long tsz, T ival) {
00138   // Prefix scan
00139   // XXX thrust is performing allocation/deallocations per-invocation,
00140   //     which shows up on the SP profile collections as taking a surprising
00141   //     amount of time.  We need to find a way to provide it with completely
00142   //     pre-allocated temp workspaces if at all possible.
00143   //     One Thrust cached allocation scheme (which works for GCC > 4.4) 
00144   //     is described in an example here:
00145   //       https://github.com/thrust/thrust/blob/master/examples/cuda/custom_temporary_allocation.cu
00146   thrust::exclusive_scan(thrust::cuda::par, in_d, in_d + nGroups, out_d);
00147 }
00148 
00149 
00150 // 
00151 // Inclusive scan
00152 //
00153 template <typename T>
00154 long dev_incl_scan_sum_tmpsz(T *in_d, long nGroups, T *out_d) {
00155   return 0;
00156 }
00157 
00158 template <typename T>
00159 void dev_incl_scan_sum(T *in_d, long nGroups, T *out_d,
00160                        void *scanwork_d, long tsz) {
00161   // Prefix scan
00162   // XXX thrust is performing allocation/deallocations per-invocation,
00163   //     which shows up on the SP profile collections as taking a surprising
00164   //     amount of time.  We need to find a way to provide it with completely
00165   //     pre-allocated temp workspaces if at all possible.
00166   //     One Thrust cached allocation scheme (which works for GCC > 4.4) 
00167   //     is described in an example here:
00168   //       https://github.com/thrust/thrust/blob/master/examples/cuda/custom_temporary_allocation.cu
00169   thrust::inclusive_scan(thrust::cuda::par, in_d, in_d + nGroups, out_d);
00170 }
00171 
00172 
00173 #endif
00174 
00175 
00176 
00177 //
00178 // Force instantiation of required templates...
00179 //
00180 #define INST_DEV_EXCL_SCAN_SUM_TMPSZ(T) template long dev_excl_scan_sum_tmpsz<T>(T*, long, T*, T);
00181 #define INST_DEV_EXCL_SCAN_SUM(T) template void dev_excl_scan_sum<T>(T*, long, T*, void*, long, T);
00182 
00183 INST_DEV_EXCL_SCAN_SUM_TMPSZ(long)
00184 INST_DEV_EXCL_SCAN_SUM_TMPSZ(int)
00185 INST_DEV_EXCL_SCAN_SUM_TMPSZ(short)
00186 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned long)
00187 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned int)
00188 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned short)
00189 
00190 INST_DEV_EXCL_SCAN_SUM(long)
00191 INST_DEV_EXCL_SCAN_SUM(int)
00192 INST_DEV_EXCL_SCAN_SUM(short)
00193 INST_DEV_EXCL_SCAN_SUM(unsigned long)
00194 INST_DEV_EXCL_SCAN_SUM(unsigned int)
00195 INST_DEV_EXCL_SCAN_SUM(unsigned short)
00196 
00197 
00198 #if 0
00199 inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) {
00200   return make_uint2(a.x + b.x, a.y + b.y);
00201 }
00202 
00203 template long dev_excl_scan_sum_tmpsz<uint2>(uint2*, long, uint2*, uint2);
00204 template void dev_excl_scan_sum<uint2>(uint2*, long, uint2*, void*, long, uint2);
00205 #endif
00206 
00207 template void dev_incl_scan_sum<float>(float*, long, float*, void*, long);
00208 
00209 

Generated on Thu Mar 28 02:43:03 2024 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002