4 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
8 #include <cuda_runtime.h>
13 #include <hip/hip_runtime.h>
16 typedef unsigned long long int WarpMask;
19 #define FORCE_ENERGY_TABLE_SIZE 4096
21 #define WARP_FULL_MASK 0xffffffff
26 #define ATOMIC_BINS WARPSIZE
29 #if (__CUDACC_VER_MAJOR__ >= 9)
30 #define NAMD_USE_COOPERATIVE_GROUPS
33 #ifdef NAMD_USE_COOPERATIVE_GROUPS
34 #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
35 __shfl_xor_sync(MASK, VAR, LANE, SIZE)
36 #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
37 __shfl_up_sync(MASK, VAR, DELTA, SIZE)
38 #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
39 __shfl_down_sync(MASK, VAR, DELTA, SIZE)
40 #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
41 __shfl_sync(MASK, VAR, LANE, SIZE)
42 #define WARP_ALL(MASK, P) __all_sync(MASK, P)
43 #define WARP_ANY(MASK, P) __any_sync(MASK, P)
44 #define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
45 #define WARP_SYNC(MASK) __syncwarp(MASK)
46 #define BLOCK_SYNC __barrier_sync(0)
48 #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
49 __shfl_xor(VAR, LANE, SIZE)
50 #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
51 __shfl_up(VAR, DELTA, SIZE)
52 #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
53 __shfl_down(VAR, DELTA, SIZE)
54 #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
55 __shfl(VAR, LANE, SIZE)
56 #define WARP_ALL(MASK, P) __all(P)
57 #define WARP_ANY(MASK, P) __any(P)
58 #define WARP_BALLOT(MASK, P) __ballot(P)
59 #define WARP_SYNC(MASK)
60 #define BLOCK_SYNC __syncthreads()
85 #define cuda_static_assert(expr) \
86 (CudaStaticAssert<(expr) != 0>())
88 void cudaDie(
const char *msg, cudaError_t err=cudaSuccess);
95 #define cudaCheck(stmt) do { \
96 cudaError_t err = stmt; \
97 if (err != cudaSuccess) { \
99 sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
105 #if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
108 #if __CUDA_ARCH__ >= 600
109 #error using CAS implementation of double atomicAdd
114 static __device__
double atomicAdd(
double* address,
double val) {
115 unsigned long long int* address_as_ull = (
unsigned long long int*)address;
116 unsigned long long int old = *address_as_ull, assumed;
119 old = atomicCAS(address_as_ull, assumed,
120 __double_as_longlong(val +
121 __longlong_as_double(assumed)));
122 }
while (assumed != old);
123 return __longlong_as_double(old);
179 bool reallocate_device_T(
void **pp,
int *curlen,
const int newlen,
const float fac,
const size_t sizeofT);
195 bool reallocate_host_T(
void **pp,
int *curlen,
const int newlen,
const float fac,
196 const unsigned int flag,
const size_t sizeofT);
212 const float fac=1.0f,
const unsigned int flag=cudaHostAllocDefault) {
229 const size_t sizeofT);
230 void copy_HtoD_T(
const void *h_array,
void *d_array,
int array_len,
231 const size_t sizeofT);
233 const size_t sizeofT);
234 void copy_DtoH_T(
const void *d_array,
void *h_array,
const int array_len,
const size_t sizeofT);
237 const size_t sizeofT);
238 void copy_DtoD_T(
const void *d_src,
void *d_dst,
const int array_len,
const size_t sizeofT);
255 copy_HtoD_T(h_array, d_array, array_len,
sizeof(T));
263 void copy_DtoH(
const T *d_array, T *h_array,
const int array_len, cudaStream_t
stream=0) {
272 copy_DtoH_T(d_array, h_array, array_len,
sizeof(T));
279 void copy_DtoD(
const T *d_src, T *h_dst,
const int array_len, cudaStream_t
stream=0) {
296 const void *d_src,
void *d_dst,
const int array_len, cudaStream_t
stream,
297 const size_t sizeofT);
301 const T *d_src, T *d_dst,
const int array_len, cudaStream_t
stream=0) {
310 int src_x0,
int src_y0,
int src_z0,
311 size_t src_xsize,
size_t src_ysize,
312 int dst_x0,
int dst_y0,
int dst_z0,
313 size_t dst_xsize,
size_t dst_ysize,
314 size_t width,
size_t height,
size_t depth,
315 size_t sizeofT, cudaStream_t
stream);
319 int src_x0,
int src_y0,
int src_z0,
320 size_t src_xsize,
size_t src_ysize,
321 int dst_x0,
int dst_y0,
int dst_z0,
322 size_t dst_xsize,
size_t dst_ysize,
323 size_t width,
size_t height,
size_t depth,
326 src_x0, src_y0, src_z0,
327 src_xsize, src_ysize,
328 dst_x0, dst_y0, dst_z0,
329 dst_xsize, dst_ysize,
330 width, height, depth,
339 int src_x0,
int src_y0,
int src_z0,
340 size_t src_xsize,
size_t src_ysize,
341 int dst_x0,
int dst_y0,
int dst_z0,
342 size_t dst_xsize,
size_t dst_ysize,
343 size_t width,
size_t height,
size_t depth,
344 size_t sizeofT, cudaStream_t
stream);
348 int src_x0,
int src_y0,
int src_z0,
349 size_t src_xsize,
size_t src_ysize,
350 int dst_x0,
int dst_y0,
int dst_z0,
351 size_t dst_xsize,
size_t dst_ysize,
352 size_t width,
size_t height,
size_t depth,
355 src_x0, src_y0, src_z0,
356 src_xsize, src_ysize,
357 dst_x0, dst_y0, dst_z0,
358 dst_xsize, dst_ysize,
359 width, height, depth,
368 int src_x0,
int src_y0,
int src_z0,
369 size_t src_xsize,
size_t src_ysize,
370 int dst_x0,
int dst_y0,
int dst_z0,
371 size_t dst_xsize,
size_t dst_ysize,
372 size_t width,
size_t height,
size_t depth,
373 size_t sizeofT, cudaStream_t
stream);
377 int src_x0,
int src_y0,
int src_z0,
378 size_t src_xsize,
size_t src_ysize,
379 int dst_x0,
int dst_y0,
int dst_z0,
380 size_t dst_xsize,
size_t dst_ysize,
381 size_t width,
size_t height,
size_t depth,
384 src_x0, src_y0, src_z0,
385 src_xsize, src_ysize,
386 dst_x0, dst_y0, dst_z0,
387 dst_xsize, dst_ysize,
388 width, height, depth,
397 void* src_data,
void* dst_data,
398 int src_x0,
int src_y0,
int src_z0,
399 size_t src_xsize,
size_t src_ysize,
400 int dst_x0,
int dst_y0,
int dst_z0,
401 size_t dst_xsize,
size_t dst_ysize,
402 size_t width,
size_t height,
size_t depth,
403 size_t sizeofT, cudaStream_t
stream);
407 T* src_data, T* dst_data,
408 int src_x0,
int src_y0,
int src_z0,
409 size_t src_xsize,
size_t src_ysize,
410 int dst_x0,
int dst_y0,
int dst_z0,
411 size_t dst_xsize,
size_t dst_ysize,
412 size_t width,
size_t height,
size_t depth,
416 src_x0, src_y0, src_z0,
417 src_xsize, src_ysize,
418 dst_x0, dst_y0, dst_z0,
419 dst_xsize, dst_ysize,
420 width, height, depth,
426 #endif // CUDAUTILS_H
void copy3D_DtoD(T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
void deallocate_device_T(void **pp)
void copy3D_DtoH(T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
void copy_HtoD(const T *h_array, T *d_array, int array_len, cudaStream_t stream=0)
void deallocate_host(T **pp)
void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT)
void copy3D_HtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void allocate_device(T **pp, const int len)
void deallocate_device(T **pp)
void allocate_device_T(void **pp, const int len, const size_t sizeofT)
void copy_DtoH(const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0)
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT)
void copy3D_HtoD(T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
void allocate_host(T **pp, const int len)
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
__thread cudaStream_t stream
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream, const size_t sizeofT)
void copy_DtoD(const T *d_src, T *h_dst, const int array_len, cudaStream_t stream=0)
void copy_DtoD_sync(const T *d_src, T *h_dst, const int array_len)
bool reallocate_host_T(void **pp, int *curlen, const int newlen, const float fac, const unsigned int flag, const size_t sizeofT)
void copy3D_PeerDtoD(int src_dev, int dst_dev, T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
bool reallocate_host(T **pp, int *curlen, const int newlen, const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault)
void copy3D_DtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void clear_device_array_sync(T *data, const int ndata)
void copy_HtoD_T(const void *h_array, void *d_array, int array_len, const size_t sizeofT)
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT)
void copy_HtoD_sync(const T *h_array, T *d_array, int array_len)
void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len)
void cudaNAMD_bug(const char *msg)
void clear_device_array(T *data, const int ndata, cudaStream_t stream=0)
void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
bool reallocate_device(T **pp, int *curlen, const int newlen, const float fac=1.0f)
bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT)
void copy_PeerDtoD(const int src_dev, const int dst_dev, const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0)
void copy3D_PeerDtoD_T(int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void copy3D_DtoH_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void clear_device_array_T(void *data, const int ndata, const size_t sizeofT)
void deallocate_host_T(void **pp)
void allocate_host_T(void **pp, const int len, const size_t sizeofT)
void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream, const size_t sizeofT)