Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

CUDASort.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDASort.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.6 $        $Date: 2020/02/26 19:26:47 $
00014  *
00015  ***************************************************************************/
00021 #include <stdlib.h>
00022 #include "CUDASort.h"
00023 
00024 #if defined(VMDUSECUB)
00025 #include <cub/cub.cuh>
00026 #else
00027 #include <thrust/sort.h> // need thrust sorting primitives
00028 #include <thrust/device_ptr.h> // need thrust sorting primitives
00029 #include <thrust/execution_policy.h>
00030 #endif
00031 
00032 
00033 #if 0
00034 #define CUERR { cudaError_t err; \
00035   cudaDeviceSynchronize(); \
00036   if ((err = cudaGetLastError()) != cudaSuccess) { \
00037   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00038   }}
00039 #else
00040 #define CUERR
00041 #endif
00042 
00043 #if defined(VMDUSECUB)
00044 
//
// Ascending key-value radix sort
//
template <typename KeyT, typename ValT>
long dev_radix_sort_by_key_tmpsz(KeyT *keys_d, ValT *vals_d, long nitems) {
  // Ask CUB how large a temporary workspace a key-value radix sort of
  // 'nitems' pairs requires.  Passing a NULL workspace pointer puts
  // SortPairs() into its size-query mode: it stores the required byte
  // count into the size argument and returns without sorting.
  size_t workspace_bytes = 0;
  cub::DeviceRadixSort::SortPairs(NULL, workspace_bytes,
                                  (const KeyT *) NULL, (KeyT *) NULL,
                                  (const ValT *) NULL, (ValT *) NULL,
                                  nitems, 0, sizeof(KeyT) * 8, 0, false);
  return (long) workspace_bytes;
}
00057 
00058 
template <typename KeyT, typename ValT>
int dev_radix_sort_by_key(KeyT *keys_d, ValT *vals_d, long nitems,
                          KeyT *keyswork_d, ValT *valswork_d,
                          void *sortwork_d, long tlsz,
                          KeyT min_key, KeyT max_key) {
  // Ascending key-value radix sort of nitems pairs; keys_d/vals_d hold
  // the sorted results on return.  Callers may provide pre-allocated
  // double-buffer work areas (keyswork_d/valswork_d) and a CUB temp
  // workspace (sortwork_d of tlsz bytes, as reported by
  // dev_radix_sort_by_key_tmpsz()); any that are NULL are allocated and
  // freed per-call.  A non-zero max_key (optionally with a non-zero
  // min_key) allows the range of sorted bit columns to be trimmed.
  // Returns 0 on success, -1 if a device memory allocation fails.
  int tmp_autoallocate=0;
  int keys_autoallocate=0;
  int vals_autoallocate=0;
  size_t tsz = tlsz;

  if (keyswork_d == NULL) {
    keys_autoallocate=1;
    if (cudaMalloc(&keyswork_d, nitems * sizeof(KeyT)) != cudaSuccess)
      return -1;
  }

  if (valswork_d == NULL) {
    vals_autoallocate=1;
    if (cudaMalloc(&valswork_d, nitems * sizeof(ValT)) != cudaSuccess) {
      if (keys_autoallocate)
        cudaFree(keyswork_d);
      return -1;
    }
  }

  if (sortwork_d == NULL) {
    tmp_autoallocate=1;
    // size query: with a NULL workspace pointer SortPairs() only stores
    // the required temp size into tsz and returns
    cub::DeviceRadixSort::SortPairs(NULL, tsz, 
                                    (const KeyT *) NULL, (KeyT *) NULL,
                                    (const ValT *) NULL, (ValT *) NULL,
                                    nitems, 0, sizeof(KeyT) * 8, 0, false);
    if (cudaMalloc(&sortwork_d, tsz) != cudaSuccess) {
      if (keys_autoallocate)
        cudaFree(keyswork_d);
      if (vals_autoallocate)
        cudaFree(valswork_d);
      return -1;
    }
  }


  //
  // One of the benefits of the bitwise nature of the stages in 
  // a radix sort is that we can trim the range of bits to only
  // those that are used within the input.  This reduces the number
  // of radix sort stages, thereby improving performance.  Since the
  // CUB implementation works on unsigned integral key types, we can 
  // determine the starting and ending bit positions to sort on if we
  // know the range of the input values.  If we don't know the precise
  // range of input values, we can still benefit from any upper or lower
  // bound value.  Worst case we use all of the bits in the key type, so
  // in that case it would be helpful to use a narrow key type.
  // 

  // default, worst-case scenario is that we must sort on all bit columns
  int begin_bit = 0;
  int end_bit = sizeof(KeyT) * 8;

  // if the caller provided a non-zero max key, we'll compute begin/end
  // bit columns from the LSB and MSB of the provided min/max keys
  if (max_key != ((KeyT) 0)) {
    // compute LSB from min key: (less | min_key) ^ less isolates the
    // lowest set bit of min_key.  When min_key is zero this expression
    // yields zero and log2(0) is undefined, so in that case the sort
    // must begin at bit column 0.
    int lsb = 0;
    if (min_key != ((KeyT) 0)) {
      KeyT less = min_key - 1;
      KeyT lsb_val = (less | min_key) ^ less;
      lsb = int(log2((double) lsb_val));
    }

    int msb = int(log2((double) max_key)+0.5); // compute MSB from max key

    begin_bit = lsb;   // CUB starts sort on LSB bit column
    end_bit = msb + 1; // CUB sorts up to one bit beyond the MSB bit column
  }

  // XXX radix sort doesn't occur in-place, it needs to bounce back and 
  // forth between double-buffered work areas in multiple sweeps.
  // The real workspace pointer must be passed here: a NULL pointer
  // would put SortPairs() into size-query mode and no sort would occur.
  cub::DeviceRadixSort::SortPairs(sortwork_d, tsz, 
                                  keys_d, keyswork_d, vals_d, valswork_d,
                                  nitems, begin_bit, end_bit, 0, false);

  // copy results from output work area 
  cudaMemcpyAsync(keys_d, keyswork_d, nitems * sizeof(KeyT), cudaMemcpyDeviceToDevice, 0);
  cudaMemcpyAsync(vals_d, valswork_d, nitems * sizeof(ValT), cudaMemcpyDeviceToDevice, 0);

  if (tmp_autoallocate) {
    cudaFree(sortwork_d);
  }

  cudaStreamSynchronize(0);

  if (keys_autoallocate) {
    cudaFree(keyswork_d);
  }

  if (vals_autoallocate) {
    cudaFree(valswork_d);
  }

  return 0;
}
00149 
00150 
00151 #else
00152 
//
// Ascending key-value radix sort
//
template <typename KeyT, typename ValT>
long dev_radix_sort_by_key_tmpsz(KeyT *keys_d, ValT *vals_d, long nitems) {
  // The Thrust build manages its own temporary storage internally,
  // so no caller-provided workspace is needed.
  (void) keys_d;
  (void) vals_d;
  (void) nitems;
  return 0;
}
00160 
00161 
template <typename KeyT, typename ValT>
int dev_radix_sort_by_key(KeyT *keys_d, ValT *vals_d, long nitems,
                          KeyT *keyswork_d, ValT *valswork_d,
                          void *sortwork_d, long tsz,
                          KeyT min_key, KeyT max_key) {
  // Ascending key-value sort via Thrust; the work-area and key-range
  // parameters are accepted for interface compatibility with the CUB
  // build but are unused here.  Returns 0 on success, -1 on failure.
  // Thrust: device pointers are wrapped with vector iterators
  try {
    // It is common to encounter thrust memory allocation issues, so
    // we have to catch thrown exceptions here, otherwise we're guaranteed
    // to eventually have a crash.  If we get a failure, we have to bomb
    // out entirely and fall back to the CPU.

    // XXX thrust is performing allocation/deallocations per-invocation,
    //     which shows up on the SP profile collections as taking a surprising
    //     amount of time.  We need to find a way to provide it with completely
    //     pre-allocated temp workspaces if at all possible.
    //     One Thrust cached allocation scheme (which works for GCC > 4.4)
    //     is described in an example here:
    //       https://github.com/thrust/thrust/blob/master/examples/cuda/custom_temporary_allocation.cu

    // Wrap with the template key/value types rather than hard-coded
    // unsigned int, so instantiations for other types sort correctly
    // instead of silently reinterpreting device memory.
    thrust::sort_by_key(thrust::device_ptr<KeyT>(keys_d),
                        thrust::device_ptr<KeyT>(keys_d + nitems),
                        thrust::device_ptr<ValT>(vals_d));
  }
  catch (const std::bad_alloc &) {
    printf("CUDA Thrust memory allocation failed: %s line %d\n", __FILE__, __LINE__);
    return -1;
  }
  catch (const thrust::system::system_error &) {
    printf("CUDA Thrust sort_by_key() failed: %s line %d\n", __FILE__, __LINE__);
    return -1;
  }

  return 0;
}
00196 
00197 #endif
00198 
00199 
00200 
//
// Force instantiation of required templates...
//
// Each macro expands to one explicit template instantiation so the
// definitions above are emitted into this translation unit.  VMD
// currently only sorts (unsigned int key, unsigned int value) pairs;
// add further INST_* lines here if other key/value types are needed.
#define INST_DEV_RADIX_SORT_BY_KEY_TMPSZ(KT, VT) template long dev_radix_sort_by_key_tmpsz<KT, VT>(KT*, VT*, long);
#define INST_DEV_RADIX_SORT_BY_KEY(KT, VT) template int dev_radix_sort_by_key<KT, VT>(KT*, VT*, long, KT*, VT*, void *, long, KT, KT);

INST_DEV_RADIX_SORT_BY_KEY_TMPSZ(unsigned int, unsigned int);
INST_DEV_RADIX_SORT_BY_KEY(unsigned int, unsigned int);
00209 
00210 

Generated on Thu Apr 25 02:42:28 2024 for VMD (current) by doxygen 1.2.14, written by Dimitri van Heesch, © 1997-2002