CudaUtils.h

#ifndef CUDAUTILS_H
#define CUDAUTILS_H

#ifdef NAMD_CUDA
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define WARPSIZE 32

#define WARP_FULL_MASK 0xffffffff

#if (__CUDACC_VER_MAJOR__ >= 9)
#define NAMD_USE_COOPERATIVE_GROUPS
#endif

#ifdef NAMD_USE_COOPERATIVE_GROUPS
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor_sync(MASK, VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl_sync(MASK, VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P)     __all_sync(MASK, P)
  #define WARP_ANY(MASK, P)     __any_sync(MASK, P)
  #define WARP_BALLOT(MASK, P)  __ballot_sync(MASK, P)
  #define WARP_SYNC(MASK)       __syncwarp(MASK)
  #define BLOCK_SYNC            __barrier_sync(0)
#else
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor(VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl(VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P)     __all(P)
  #define WARP_ANY(MASK, P)     __any(P)
  #define WARP_BALLOT(MASK, P)  __ballot(P)
  #define WARP_SYNC(MASK)
  #define BLOCK_SYNC            __syncthreads()
#endif
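//----------------------------------------------------------------------------------------
//
// Illustrative sketch, not part of the original header: a warp-wide sum
// reduction built from the macros above. The MASK argument is simply ignored
// by the pre-CUDA-9 branch, so the same code compiles against either set of
// intrinsics.
//
#ifdef __CUDACC__
__device__ __forceinline__ float warp_sum_example(float val) {
  // Butterfly reduction: after log2(WARPSIZE) steps every lane holds the total
  for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
    val += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val, offset, WARPSIZE);
  return val;
}
#endif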
/*
// Define float3 + float3 operation
__host__ __device__ inline float3 operator+(const float3 a, const float3 b) {
  float3 c;
  c.x = a.x + b.x;
  c.y = a.y + b.y;
  c.z = a.z + b.z;
  return c;
}
*/

//
// CUDA static assert, copied from Facebook FFT sources. Remove once nvcc has C++11.
//
template <bool>
struct CudaStaticAssert;

template <>
struct CudaStaticAssert<true> {
};

#define cuda_static_assert(expr) \
  (CudaStaticAssert<(expr) != 0>())
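//
// Illustrative usage, not part of the original header: a false condition
// selects the undefined CudaStaticAssert<false> specialization, so the
// assertion fails at compile time.
//
//   cuda_static_assert(sizeof(float) == 4);   // OK
//   cuda_static_assert(sizeof(float) == 8);   // compile-time error
//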
void cudaDie(const char *msg, cudaError_t err=cudaSuccess);

void cudaNAMD_bug(const char *msg);

//
// Error checking wrapper for CUDA
//
#define cudaCheck(stmt) do {                                       \
  cudaError_t err = stmt;                                          \
  if (err != cudaSuccess) {                                        \
    char msg[128];                                                 \
    snprintf(msg, sizeof(msg), "%s in file %s, function %s\n",     \
             #stmt, __FILE__, __FUNCTION__);                       \
    cudaDie(msg, err);                                             \
  }                                                                \
} while(0)
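//
// Illustrative usage, not part of the original header: wrap any call that
// returns cudaError_t; on failure the failing statement, file, and function
// are reported through cudaDie().
//
//   float *d_buf;
//   cudaCheck(cudaMalloc((void **)&d_buf, 1024*sizeof(float)));
//   cudaCheck(cudaMemset(d_buf, 0, 1024*sizeof(float)));
//   cudaCheck(cudaFree(d_buf));
//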
#ifdef __CUDACC__
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// native implementation available
#else
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
//
// Double precision atomicAdd, copied from the CUDA C Programming Guide (ver 5.0)
//
static __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
        __double_as_longlong(val +
           __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
#endif
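//
// Illustrative sketch, not part of the original header: with the fallback
// above, double-precision accumulation can be written uniformly; sm_60+
// uses the native instruction, older architectures the CAS loop.
//
#ifdef __CUDACC__
__global__ void atomic_sum_example(const double* in, int n, double* result) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(result, in[i]);
}
#endif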
void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT);
void clear_device_array_T(void *data, const int ndata, const size_t sizeofT);

template <class T>
void clear_device_array(T *data, const int ndata, cudaStream_t stream=0) {
  clear_device_array_async_T(data, ndata, stream, sizeof(T));
}

template <class T>
void clear_device_array_sync(T *data, const int ndata) {
  clear_device_array_T(data, ndata, sizeof(T));
}

void allocate_host_T(void **pp, const int len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate page-locked host memory
// pp = memory pointer
// len = length of the array
//
template <class T>
void allocate_host(T **pp, const int len) {
  allocate_host_T((void **)pp, len, sizeof(T));
}


void allocate_device_T(void **pp, const int len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate GPU memory
// pp = memory pointer
// len = length of the array
//
template <class T>
void allocate_device(T **pp, const int len) {
  allocate_device_T((void **)pp, len, sizeof(T));
}


void deallocate_device_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate GPU memory
// pp = memory pointer
//
template <class T>
void deallocate_device(T **pp) {
  deallocate_device_T((void **)pp);
}
//----------------------------------------------------------------------------------------

bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate device memory
// pp = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation the new length will be fac*newlen
//
// Returns true if reallocation happened
//
template <class T>
bool reallocate_device(T **pp, int *curlen, const int newlen, const float fac=1.0f) {
  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
}
//----------------------------------------------------------------------------------------
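//
// Illustrative usage, not part of the original header: a grow-only buffer
// pattern (assuming, as the comment above suggests, that reallocation only
// happens when newlen exceeds the current capacity, with fac > 1
// over-allocating to amortize repeated growth).
//
//   int *d_list = NULL;
//   int d_list_len = 0;
//   reallocate_device<int>(&d_list, &d_list_len, 1000, 1.5f);  // allocates
//   reallocate_device<int>(&d_list, &d_list_len, 500, 1.5f);   // no-op, fits
//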
bool reallocate_host_T(void **pp, int *curlen, const int newlen, const float fac,
                       const unsigned int flag, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate pinned host memory
// pp = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation the new length will be fac*newlen
// flag = allocation type:
//        cudaHostAllocDefault = default type, emulates cudaMallocHost
//        cudaHostAllocMapped  = maps allocation into CUDA address space
//
// Returns true if reallocation happened
//
template <class T>
bool reallocate_host(T **pp, int *curlen, const int newlen,
                     const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) {
  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
}

void deallocate_host_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate page-locked host memory
// pp = memory pointer
//
template <class T>
void deallocate_host(T **pp) {
  deallocate_host_T((void **)pp);
}
//----------------------------------------------------------------------------------------

void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
           const size_t sizeofT);
void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
     const size_t sizeofT);
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
           const size_t sizeofT);
void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT);

void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
           const size_t sizeofT);
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT);

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device
//
template <class T>
void copy_HtoD(const T *h_array, T *d_array, int array_len, cudaStream_t stream=0) {
  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device using synchronous calls
//
template <class T>
void copy_HtoD_sync(const T *h_array, T *d_array, int array_len) {
  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host
//
template <class T>
void copy_DtoH(const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0) {
  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host using synchronous calls
//
template <class T>
void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len) {
  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device
//
template <class T>
void copy_DtoD(const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) {
  copy_DtoD_async_T(d_src, d_dst, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device using synchronous calls
//
template <class T>
void copy_DtoD_sync(const T *d_src, T *d_dst, const int array_len) {
  copy_DtoD_T(d_src, d_dst, array_len, sizeof(T));
}
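//----------------------------------------------------------------------------------------
//
// Illustrative sketch, not part of the original header: a typical round trip
// with the helpers above. The element type is inferred, so sizeof bookkeeping
// and cudaMemcpy kind flags never appear at call sites. Note that the async
// copies require a synchronization before the host buffer is read.
//
inline void copy_roundtrip_example(cudaStream_t stream) {
  const int n = 1024;
  float *h_buf, *d_buf;
  allocate_host<float>(&h_buf, n);            // pinned host memory
  allocate_device<float>(&d_buf, n);
  for (int i = 0; i < n; i++) h_buf[i] = (float)i;
  copy_HtoD<float>(h_buf, d_buf, n, stream);  // asynchronous upload
  // ... launch kernels on 'stream' here ...
  copy_DtoH<float>(d_buf, h_buf, n, stream);  // asynchronous download
  cudaCheck(cudaStreamSynchronize(stream));   // wait before touching h_buf
  deallocate_device<float>(&d_buf);
  deallocate_host<float>(&h_buf);
}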
//----------------------------------------------------------------------------------------
//
// Copies memory between two peer devices Device -> Device
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
  const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
  const size_t sizeofT);

template <class T>
void copy_PeerDtoD(const int src_dev, const int dst_dev,
  const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) {
  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Host -> Device
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  cudaStream_t stream=0) {
  copy3D_HtoD_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
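//
// Illustrative usage, not part of the original header: copy a 64^3 sub-block
// starting at element offset (8,8,8) of a 128x128xN host volume into the
// origin of a 64x64xN device volume. Offsets and extents are element counts;
// the wrappers pass sizeof(T) down so any byte scaling happens internally.
//
//   copy3D_HtoD<float>(h_grid, d_grid,
//     8, 8, 8,         // src_x0, src_y0, src_z0
//     128, 128,        // src_xsize, src_ysize
//     0, 0, 0,         // dst_x0, dst_y0, dst_z0
//     64, 64,          // dst_xsize, dst_ysize
//     64, 64, 64,      // width, height, depth
//     stream);
//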
//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Host
//
void copy3D_DtoH_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  cudaStream_t stream=0) {
  copy3D_DtoH_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Device
//
void copy3D_DtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  cudaStream_t stream=0) {
  copy3D_DtoD_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block between two peer devices Device -> Device
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
  void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
  T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  cudaStream_t stream=0) {
  copy3D_PeerDtoD_T(src_dev, dst_dev,
    src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}

#endif // NAMD_CUDA

#endif // CUDAUTILS_H
