#ifndef CUDAUTILS_H
#define CUDAUTILS_H

#include <stdio.h>

#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#define WARPSIZE 32
#define BOUNDINGBOXSIZE 32
#endif

#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#ifdef NAMD_NAVI_BUILD
#define WARPSIZE 32
#define BOUNDINGBOXSIZE 32
#else
#define WARPSIZE 64
#define BOUNDINGBOXSIZE 32
#endif
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define FORCE_ENERGY_TABLE_SIZE 4096

#define COPY_CUDATENSOR(S,D) \
  D.xx = S.xx; \
  D.xy = S.xy; \
  D.xz = S.xz; \
  D.yx = S.yx; \
  D.yy = S.yy; \
  D.yz = S.yz; \
  D.zx = S.zx; \
  D.zy = S.zy; \
  D.zz = S.zz

#define COPY_CUDAVECTOR(S,D) \
  D.x = S.x; \
  D.y = S.y; \
  D.z = S.z

#define PRINT_CUDATENSOR(T, SS) \
  SS << T.xx << " " << T.xy << " " << T.xz << " " << T.yx << " " << \
  T.yy << " " << T.yz << " " << T.zx << " " << T.zy << " " << T.zz << \
  std::endl

#define ATOMIC_BINS WARPSIZE

struct cudaTensor {
  double xx, xy, xz;
  double yx, yy, yz;
  double zx, zy, zz;

  __forceinline__ __device__
  cudaTensor& operator+=(const cudaTensor& other) {
    xx += other.xx; xy += other.xy; xz += other.xz;
    yx += other.yx; yy += other.yy; yz += other.yz;
    zx += other.zx; zy += other.zy; zz += other.zz;
    return *this;
  }

  __forceinline__ __device__
  friend cudaTensor operator+(const cudaTensor& t1, const cudaTensor& t2) {
    cudaTensor t;
    t.xx = t1.xx + t2.xx; t.xy = t1.xy + t2.xy; t.xz = t1.xz + t2.xz;
    t.yx = t1.yx + t2.yx; t.yy = t1.yy + t2.yy; t.yz = t1.yz + t2.yz;
    t.zx = t1.zx + t2.zx; t.zy = t1.zy + t2.zy; t.zz = t1.zz + t2.zz;
    return t;
  }
};
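// Usage sketch (illustrative only, not part of the header's API): summing
// per-element tensor contributions with cudaTensor::operator+= defined above.
// sumTensors, in, out, and n are hypothetical names.
//
//   __global__ void sumTensors(const cudaTensor* in, cudaTensor* out, int n) {
//     if (threadIdx.x == 0 && blockIdx.x == 0) {
//       cudaTensor acc = in[0];
//       for (int i = 1; i < n; i++) acc += in[i];  // elementwise accumulation
//       *out = acc;
//     }
//   }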
//#define FEP_BONDED_CUDA_DEBUG
#ifdef FEP_BONDED_CUDA_DEBUG
#include <iostream>
#endif

#define WARP_FULL_MASK 0xffffffff

#if (__CUDACC_VER_MAJOR__ >= 9)
#define NAMD_USE_COOPERATIVE_GROUPS
#endif

#ifdef NAMD_USE_COOPERATIVE_GROUPS
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
  __shfl_xor_sync(MASK, VAR, LANE, SIZE)
#define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
  __shfl_up_sync(MASK, VAR, DELTA, SIZE)
#define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
  __shfl_down_sync(MASK, VAR, DELTA, SIZE)
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
  __shfl_sync(MASK, VAR, LANE, SIZE)
#define WARP_ALL(MASK, P)    __all_sync(MASK, P)
#define WARP_ANY(MASK, P)    __any_sync(MASK, P)
#define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
#define WARP_SYNC(MASK)      __syncwarp(MASK)
#define BLOCK_SYNC           __barrier_sync(0)
#else
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
  __shfl_xor(VAR, LANE, SIZE)
#define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
  __shfl_up(VAR, DELTA, SIZE)
#define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
  __shfl_down(VAR, DELTA, SIZE)
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
  __shfl(VAR, LANE, SIZE)
#ifdef NAMD_HIP
#define NAMD_WARP_ALL(MASK, P)    __all(P)
#define NAMD_WARP_ANY(MASK, P)    __any(P)
#define NAMD_WARP_BALLOT(MASK, P) __ballot(P)
#define NAMD_WARP_SYNC(MASK)
#define BLOCK_SYNC __syncthreads()
#else
#define WARP_ALL(MASK, P)    __all(P)
#define WARP_ANY(MASK, P)    __any(P)
#define WARP_BALLOT(MASK, P) __ballot(P)
#define WARP_SYNC(MASK)
#define BLOCK_SYNC __syncthreads()
#endif
#endif

#if !defined(NAMD_HIP)
#define NAMD_WARP_SYNC(MASK) WARP_SYNC(MASK)
#endif
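// Usage sketch (illustrative only): a warp-wide sum reduction built on the
// portability macros above. warpSum is a hypothetical name; the sketch
// assumes every lane in the warp is active (WARP_FULL_MASK) and uses the
// WARPSIZE macro defined earlier in this header.
//
//   __device__ double warpSum(double v) {
//     for (int offset = WARPSIZE/2; offset > 0; offset /= 2) {
//       v += WARP_SHUFFLE_DOWN(WARP_FULL_MASK, v, offset, WARPSIZE);
//     }
//     return v;  // lane 0 now holds the warp total
//   }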
//
// CUDA static assert
//
template <bool>
struct CudaStaticAssert;

template <>
struct CudaStaticAssert<true> {};

#define cuda_static_assert(expr) \
  (CudaStaticAssert<(expr) != 0>())

void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
void curandDie(const char *msg, int err=0);
void cudaNAMD_bug(const char *msg);
//
// Error checking wrappers: evaluate the statement and abort with the file,
// function, and line of the failing call.
//
#define cudaCheck(stmt) do { \
  cudaError_t err = stmt; \
  if (err != cudaSuccess) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \
    cudaDie(msg, err); \
  } \
} while(0)

#define curandCheck(stmt) do { \
  curandStatus_t err = stmt; \
  if (err != CURAND_STATUS_SUCCESS) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \
    curandDie(msg, (int)err); \
  } \
} while(0)
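// Usage sketch (illustrative only): wrap every runtime call so failures are
// reported with their source location. d_buf and n are hypothetical names.
//
//   float* d_buf;
//   cudaCheck(cudaMalloc(&d_buf, n * sizeof(float)));
//   cudaCheck(cudaMemset(d_buf, 0, n * sizeof(float)));
//   cudaCheck(cudaFree(d_buf));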
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// CUDA 8.0 on sm_60 and newer provides a native double-precision atomicAdd().
#else
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
// Double-precision atomicAdd implemented with a compare-and-swap loop,
// for architectures without native support.
static __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the bits as integers so atomicCAS can publish the updated
    // sum; retry if another thread changed the value in the meantime.
    old = atomicCAS(address_as_ull, assumed,
      __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
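// Usage sketch (illustrative only): many threads accumulating into one
// double-precision total. This works on all architectures because the
// fallback above supplies atomicAdd(double*, double) where the hardware
// lacks it. sumKernel, in, total, and n are hypothetical names.
//
//   __global__ void sumKernel(const double* in, double* total, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) atomicAdd(total, in[i]);
//   }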
//----------------------------------------------------------------------------------------
// Allocation and deallocation
//----------------------------------------------------------------------------------------
void allocate_host_T(void **pp, const size_t len, const size_t sizeofT);
template <class T>
void allocate_host(T **pp, const size_t len) {
  allocate_host_T((void **)pp, len, sizeof(T));
}

void allocate_device_T(void **pp, const size_t len, const size_t sizeofT);
template <class T>
void allocate_device(T **pp, const size_t len) {
  allocate_device_T((void **)pp, len, sizeof(T));
}

void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream);
template <class T>
void allocate_device_async(T **pp, const size_t len, cudaStream_t stream) {
  allocate_device_T_async((void **)pp, len, sizeof(T), stream);
}

void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT);
template <class T>
void allocate_device_managed(T **pp, const size_t len) {
  allocate_device_T_managed((void **)pp, len, sizeof(T));
}

void deallocate_host_T(void **pp);
template <class T>
void deallocate_host(T **pp) {
  deallocate_host_T((void **)pp);
}

void deallocate_device_T(void **pp);
template <class T>
void deallocate_device(T **pp) {
  deallocate_device_T((void **)pp);
}

void deallocate_device_T_async(void **pp, cudaStream_t stream);
template <class T>
void deallocate_device_async(T **pp, cudaStream_t stream) {
  deallocate_device_T_async((void **)pp, stream);
}

//----------------------------------------------------------------------------------------
// Clearing device arrays
//----------------------------------------------------------------------------------------
void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT);
void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT);
template <class T>
void clear_device_array(T *data, const size_t ndata, cudaStream_t stream=0) {
  clear_device_array_async_T(data, ndata, stream, sizeof(T));
}
template <class T>
void clear_device_array_sync(T *data, const size_t ndata) {
  clear_device_array_T(data, ndata, sizeof(T));
}
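// Usage sketch (illustrative only): the typed template wrappers hide the
// sizeof(T) bookkeeping of the *_T entry points. h_x, d_x, and n are
// hypothetical names.
//
//   float *h_x, *d_x;
//   allocate_host(&h_x, n);    // host-side buffer
//   allocate_device(&d_x, n);  // device buffer
//   clear_device_array(d_x, n);
//   // ... use the buffers ...
//   deallocate_device(&d_x);
//   deallocate_host(&h_x);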
//----------------------------------------------------------------------------------------
// Reallocation: grows the buffer only when newlen exceeds the current
// length, padding the new size by the factor fac. Returns true if memory
// was actually reallocated.
//----------------------------------------------------------------------------------------
bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen,
  const float fac, const size_t sizeofT);
template <class T>
bool reallocate_device(T **pp, size_t *curlen, const size_t newlen, const float fac=1.0f) {
  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
}
bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen,
  const float fac, const unsigned int flag, const size_t sizeofT);
template <class T>
bool reallocate_host(T **pp, size_t *curlen, const size_t newlen,
    const float fac=1.0f,
    const unsigned int flag=cudaHostAllocDefault) {
  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
}
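// Usage sketch (illustrative only): a device buffer grown on demand with
// 50% headroom. d_buf, bufLen, and numAtoms are hypothetical names.
//
//   float* d_buf = NULL;
//   size_t bufLen = 0;
//   reallocate_device(&d_buf, &bufLen, numAtoms, 1.5f);
//   // d_buf now holds at least numAtoms elements; bufLen was updated.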
//----------------------------------------------------------------------------------------
// Copies between host and device
//----------------------------------------------------------------------------------------
void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len,
  cudaStream_t stream, const size_t sizeofT);
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len,
  const size_t sizeofT);
void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len,
  cudaStream_t stream, const size_t sizeofT);
void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len,
  const size_t sizeofT);
void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len,
  cudaStream_t stream, const size_t sizeofT);
void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len,
  const size_t sizeofT);
template <class T>
void copy_HtoD(const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0) {
  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
}

template <class T>
void copy_HtoD_sync(const T *h_array, T *d_array, size_t array_len) {
  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
}
template <class T>
void copy_DtoH(const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0) {
  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
}

template <class T>
void copy_DtoH_sync(const T *d_array, T *h_array, const size_t array_len) {
  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
}
template <class T>
void copy_DtoD(const T *d_src, T *h_dst, const size_t array_len, cudaStream_t stream=0) {
  copy_DtoD_async_T(d_src, h_dst, array_len, stream, sizeof(T));
}

template <class T>
void copy_DtoD_sync(const T *d_src, T *h_dst, const size_t array_len) {
  copy_DtoD_T(d_src, h_dst, array_len, sizeof(T));
}
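// Usage sketch (illustrative only): an asynchronous round trip on a stream.
// h_x, d_x, n, and stream are hypothetical names.
//
//   copy_HtoD(h_x, d_x, n, stream);   // enqueue host-to-device copy
//   // ... launch kernels on the same stream ...
//   copy_DtoH(d_x, h_x, n, stream);   // enqueue device-to-host copy
//   cudaCheck(cudaStreamSynchronize(stream));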
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
  const void *d_src, void *d_dst,
  const size_t array_len, cudaStream_t stream,
  const size_t sizeofT);

template <class T>
void copy_PeerDtoD(const int src_dev, const int dst_dev,
    const T *d_src, T *d_dst,
    const size_t array_len, cudaStream_t stream=0) {
  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
// 3D copies of sub-volumes between (xsize, ysize, *)-shaped arrays
//----------------------------------------------------------------------------------------
void copy3D_HtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_HtoD_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
void copy3D_DtoH_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_DtoH_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
void copy3D_DtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_DtoD_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
  void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
    T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_PeerDtoD_T(src_dev, dst_dev, src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
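// Usage sketch (illustrative only): copying an interior 8x8x8 block out of a
// 64x64x64 host volume into a packed 8x8x8 device buffer. h_vol, d_block,
// and stream are hypothetical names; origins are element indices and sizes
// are in elements.
//
//   copy3D_HtoD(h_vol, d_block,
//     28, 28, 28,   // source origin (x0, y0, z0)
//     64, 64,       // source x/y dimensions
//     0, 0, 0,      // destination origin
//     8, 8,         // destination x/y dimensions
//     8, 8, 8,      // width, height, depth of the copied region
//     stream);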
#endif // NAMD_CUDA || NAMD_HIP

#endif // CUDAUTILS_H