| version 1.2 | version 1.3 |
|---|
| |
| #define CUDAUTILS_H | #define CUDAUTILS_H |
| | |
| #ifdef NAMD_CUDA | #ifdef NAMD_CUDA |
| | #include <stdio.h> |
| #include <cuda.h> | #include <cuda.h> |
| #include <cuda_runtime.h> | #include <cuda_runtime.h> |
| | |
| | #define WARPSIZE 32 |
| | |
| | // Texture objects require Kepler (3.0) hardware and cuda compiler 5.0 |
| | // If your setup does not support texture objects (such as Mac OS X), |
| | // set compiler flag DISABLE_CUDA_TEXTURE_OBJECTS |
| | |
| | /* |
| | // Define float3 + float3 operation |
| | __host__ __device__ inline float3 operator+(const float3 a, const float3 b) { |
| | float3 c; |
| | c.x = a.x + b.x; |
| | c.y = a.y + b.y; |
| | c.z = a.z + b.z; |
| | return c; |
| | } |
| | */ |
| | |
| // | // |
| // Cuda static assert, copied from Facebook FFT sources. Remove once nvcc has c++11 | // Cuda static assert, copied from Facebook FFT sources. Remove once nvcc has c++11 |
| // | // |
| |
| | |
| void cudaDie(const char *msg, cudaError_t err=cudaSuccess); | void cudaDie(const char *msg, cudaError_t err=cudaSuccess); |
| | |
| | void cudaNAMD_bug(const char *msg); |
| | |
| // | // |
| // Error checking wrapper for CUDA | // Error checking wrapper for CUDA |
| // | // |
| |
| } \ | } \ |
| } while(0) | } while(0) |
| | |
| void deallocate_device_T(void **pp); | #ifdef __CUDACC__ |
| | #if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 ) |
| | // native implementation available |
| | #else |
| | #if __CUDA_ARCH__ >= 600 |
| | #error using CAS implementation of double atomicAdd |
| | #endif |
| | // |
| | // Double precision atomicAdd, copied from CUDA_C_Programming_Guide.pdf (ver 5.0) |
| | // |
| | static __device__ double atomicAdd(double* address, double val) { |
| | unsigned long long int* address_as_ull = (unsigned long long int*)address; |
| | unsigned long long int old = *address_as_ull, assumed; |
| | do { |
| | assumed = old; |
| | old = atomicCAS(address_as_ull, assumed, |
| | __double_as_longlong(val + |
| | __longlong_as_double(assumed))); |
| | } while (assumed != old); |
| | return __longlong_as_double(old); |
| | } |
| | #endif |
| | #endif |
| | |
| | void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT); |
| | void clear_device_array_T(void *data, const int ndata, const size_t sizeofT); |
| | |
| | template <class T> |
| | void clear_device_array(T *data, const int ndata, cudaStream_t stream=0) { |
| | clear_device_array_async_T(data, ndata, stream, sizeof(T)); |
| | } |
| | |
| | template <class T> |
| | void clear_device_array_sync(T *data, const int ndata) { |
| | clear_device_array_T(data, ndata, sizeof(T)); |
| | } |
| | |
| | void allocate_host_T(void **pp, const int len, const size_t sizeofT); |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Deallocate gpu memory | // Allocate page-locked host memory |
| // pp = memory pointer | // pp = memory pointer |
| | // len = length of the array |
| // | // |
| template <class T> | template <class T> |
| void deallocate_device(T **pp) { | void allocate_host(T **pp, const int len) { |
| deallocate_device_T((void **)pp); | allocate_host_T((void **)pp, len, sizeof(T)); |
| } | } |
| | |
| | |
| | void allocate_device_T(void **pp, const int len, const size_t sizeofT); |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| | // |
| | // Allocate gpu memory |
| | // pp = memory pointer |
| | // len = length of the array |
| | // |
| | template <class T> |
| | void allocate_device(T **pp, const int len) { |
| | allocate_device_T((void **)pp, len, sizeof(T)); |
| | } |
| | |
| bool resize_device_T(void **pp, int *curlen, const int cur_size, const int new_size, | |
| const float fac, const size_t sizeofT); | |
| | |
| | void deallocate_device_T(void **pp); |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Allocate & re-allocate GPU memory, preserves content | // Deallocate gpu memory |
| // pp = memory pointer | // pp = memory pointer |
| // curlen = current length of the array | |
| // cur_size = current size of the data content | |
| // new_size = new size | |
| // fac = extra space allocation factor: in case of re-allocation new length will be fac*new_size | |
| // | // |
| template <class T> | template <class T> |
| bool resize_device(T **pp, int *curlen, const int cur_size, const int new_size, const float fac) { | void deallocate_device(T **pp) { |
| return resize_device_T((void **)pp, curlen, cur_size, new_size, fac, sizeof(T)); | deallocate_device_T((void **)pp); |
| } | } |
| | //---------------------------------------------------------------------------------------- |
| | |
| bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT); | bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT); |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| |
| return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T)); | return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T)); |
| } | } |
| | |
| bool resize_host_T(void **pp, int *curlen, const int cur_size, const int new_size, | void deallocate_host_T(void **pp); |
| const float fac, const unsigned int flag, const size_t sizeofT); | |
| | |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Allocate & re-allocate host memory, preserves content | // Deallocate page-locked host memory |
| // pp = memory pointer | // pp = memory pointer |
| // curlen = current length of the array | |
| // cur_size = current size of the data content | |
| // new_size = new size | |
| // fac = extra space allocation factor: in case of re-allocation new length will be fac*new_size | |
| // flag = allocation type: | |
| // cudaHostAllocDefault = default type, emulates cudaMallocHost | |
| // cudaHostAllocMapped = maps allocation into CUDA address space | |
| // | // |
| template <class T> | template <class T> |
| bool resize_host(T **pp, int *curlen, const int cur_size, const int new_size, | void deallocate_host(T **pp) { |
| const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) { | deallocate_host_T((void **)pp); |
| return resize_host_T((void **)pp, curlen, cur_size, new_size, fac, flag, sizeof(T)); | |
| } | } |
| | //---------------------------------------------------------------------------------------- |
| | |
| | void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream, |
| | const size_t sizeofT); |
| | void copy_HtoD_T(const void *h_array, void *d_array, int array_len, |
| | const size_t sizeofT); |
| | void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream, |
| | const size_t sizeofT); |
| | void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT); |
| | |
| | void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, |
| | const size_t sizeofT); |
| | void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT); |
| | |
| void deallocate_host_T(void **pp); | |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Deallocate page-locked host memory | // Copies memory Host -> Device |
| // pp = memory pointer | |
| // | // |
| template <class T> | template <class T> |
| void deallocate_host(T **pp) { | void copy_HtoD(const T *h_array, T *d_array, int array_len, cudaStream_t stream=0) { |
| deallocate_host_T((void **)pp); | copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory Host -> Device using synchronous calls |
| | // |
| | template <class T> |
| | void copy_HtoD_sync(const T *h_array, T *d_array, int array_len) { |
| | copy_HtoD_T(h_array, d_array, array_len, sizeof(T)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory Device -> Host |
| | // |
| | template <class T> |
| | void copy_DtoH(const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0) { |
| | copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T)); |
| | } |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory Device -> Host using synchronous calls |
| | // |
| | template <class T> |
| | void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len) { |
| | copy_DtoH_T(d_array, h_array, array_len, sizeof(T)); |
| | } |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory Device -> Device |
| | // |
| | template <class T> |
| | void copy_DtoD(const T *d_src, T *h_dst, const int array_len, cudaStream_t stream=0) { |
| | copy_DtoD_async_T(d_src, h_dst, array_len, stream, sizeof(T)); |
| | } |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory Device -> Device using synchronous calls |
| | // |
| | template <class T> |
| | void copy_DtoD_sync(const T *d_src, T *h_dst, const int array_len) { |
| | copy_DtoD_T(d_src, h_dst, array_len, sizeof(T)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory between two peer devices Device -> Device |
| | // |
| | void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, |
| | const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, |
| | const size_t sizeofT); |
| | |
| | template <class T> |
| | void copy_PeerDtoD(const int src_dev, const int dst_dev, |
| | const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) { |
| | copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T)); |
| } | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block Host -> Device |
| | // |
| | void copy3D_HtoD_T(void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream); |
| | |
| | template <class T> |
| | void copy3D_HtoD(T* src_data, T* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | cudaStream_t stream=0) { |
| | copy3D_HtoD_T(src_data, dst_data, |
| | src_x0, src_y0, src_z0, |
| | src_xsize, src_ysize, |
| | dst_x0, dst_y0, dst_z0, |
| | dst_xsize, dst_ysize, |
| | width, height, depth, |
| | sizeof(T), stream); |
| | } |
| | |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block Device -> Host |
| | // |
| | void copy3D_DtoH_T(void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream); |
| | |
| | template <class T> |
| | void copy3D_DtoH(T* src_data, T* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | cudaStream_t stream=0) { |
| | copy3D_DtoH_T(src_data, dst_data, |
| | src_x0, src_y0, src_z0, |
| | src_xsize, src_ysize, |
| | dst_x0, dst_y0, dst_z0, |
| | dst_xsize, dst_ysize, |
| | width, height, depth, |
| | sizeof(T), stream); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block Device -> Device |
| | // |
| | void copy3D_DtoD_T(void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream); |
| | |
| | template <class T> |
| | void copy3D_DtoD(T* src_data, T* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | cudaStream_t stream=0) { |
| | copy3D_DtoD_T(src_data, dst_data, |
| | src_x0, src_y0, src_z0, |
| | src_xsize, src_ysize, |
| | dst_x0, dst_y0, dst_z0, |
| | dst_xsize, dst_ysize, |
| | width, height, depth, |
| | sizeof(T), stream); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block between two peer devices Device -> Device |
| | // |
| | void copy3D_PeerDtoD_T(int src_dev, int dst_dev, |
| | void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream); |
| | |
| | template <class T> |
| | void copy3D_PeerDtoD(int src_dev, int dst_dev, |
| | T* src_data, T* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | cudaStream_t stream=0) { |
| | copy3D_PeerDtoD_T(src_dev, dst_dev, |
| | src_data, dst_data, |
| | src_x0, src_y0, src_z0, |
| | src_xsize, src_ysize, |
| | dst_x0, dst_y0, dst_z0, |
| | dst_xsize, dst_ysize, |
| | width, height, depth, |
| | sizeof(T), stream); |
| | } |
| | |
| #endif // NAMD_CUDA | #endif // NAMD_CUDA |
| | |