#ifndef CUDAUTILS_H
#define CUDAUTILS_H

#include <stdio.h>  // sprintf, used by the check macros below

#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif

#define BOUNDINGBOXSIZE 32

// CUDA toolkits through 12.8 bundle CCCL major version 2; later toolkits
// report it via <cuda/std/version>.
#if (__CUDACC_VER_MAJOR__ < 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ <= 8)
#define NAMD_CCCL_MAJOR_VERSION 2
#else
#include <cuda/std/version>
#define NAMD_CCCL_MAJOR_VERSION CCCL_MAJOR_VERSION
#endif

#ifdef NAMD_NAVI_BUILD
#define BOUNDINGBOXSIZE 32
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define FORCE_ENERGY_TABLE_SIZE 4096

// Copy the nine components of a cudaTensor from S into D.
#define COPY_CUDATENSOR(S,D) \
  D.xx = S.xx; \
  D.xy = S.xy; \
  D.xz = S.xz; \
  D.yx = S.yx; \
  D.yy = S.yy; \
  D.yz = S.yz; \
  D.zx = S.zx; \
  D.zy = S.zy; \
  D.zz = S.zz

// Copy the three components of a cudaVector from S into D.
#define COPY_CUDAVECTOR(S,D) \
  D.x = S.x; \
  D.y = S.y; \
  D.z = S.z

// Stream the nine tensor components, space-separated, to SS.
#define PRINT_CUDATENSOR(T, SS) \
  SS << T.xx << " " << T.xy << " " << T.xz << " " << T.yx << " " << \
  T.yy << " " << T.yz << " " << T.zx << " " << T.zy << " " << T.zz << \
  std::endl

#define ATOMIC_BINS WARPSIZE

struct cudaTensor {
  // Component type assumed: double (the member declarations themselves
  // are not recoverable; names follow COPY_CUDATENSOR above).
  double xx, xy, xz;
  double yx, yy, yz;
  double zx, zy, zz;

  __forceinline__ __device__
  cudaTensor& operator+=(const cudaTensor& other) {
    this->xx += other.xx;
    this->xy += other.xy;
    this->xz += other.xz;
    this->yx += other.yx;
    this->yy += other.yy;
    this->yz += other.yz;
    this->zx += other.zx;
    this->zy += other.zy;
    this->zz += other.zz;
    return *this;
  }
  // Component-wise tensor sum; body reconstructed to mirror operator+= above.
  __forceinline__ __device__
  friend cudaTensor operator+(const cudaTensor& t1, const cudaTensor& t2) {
    cudaTensor t;
    t.xx = t1.xx + t2.xx;
    t.xy = t1.xy + t2.xy;
    t.xz = t1.xz + t2.xz;
    t.yx = t1.yx + t2.yx;
    t.yy = t1.yy + t2.yy;
    t.yz = t1.yz + t2.yz;
    t.zx = t1.zx + t2.zx;
    t.zy = t1.zy + t2.zy;
    t.zz = t1.zz + t2.zz;
    return t;
  }
};
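//
// Illustrative sketch (not part of the original header): the operators above
// let per-thread tensor contributions, e.g. virial terms, be accumulated
// component-wise:
//
//   cudaTensor virial = {};             // zero all nine components
//   virial += localContribution;        // operator+= sums component-wise
//   cudaTensor total = virial + other;  // operator+ returns the sum
//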
#define FEP_BONDED_CUDA_DEBUG
#ifdef FEP_BONDED_CUDA_DEBUG
// ...
#endif

#define WARP_FULL_MASK 0xffffffff

// CUDA 9 introduced the explicitly synchronizing *_sync warp primitives.
#if (__CUDACC_VER_MAJOR__ >= 9)
#define NAMD_USE_COOPERATIVE_GROUPS
#endif

#ifdef NAMD_USE_COOPERATIVE_GROUPS
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
  __shfl_xor_sync(MASK, VAR, LANE, SIZE)
#define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
  __shfl_up_sync(MASK, VAR, DELTA, SIZE)
#define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
  __shfl_down_sync(MASK, VAR, DELTA, SIZE)
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
  __shfl_sync(MASK, VAR, LANE, SIZE)
#define WARP_ALL(MASK, P) __all_sync(MASK, P)
#define WARP_ANY(MASK, P) __any_sync(MASK, P)
#define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
#define WARP_SYNC(MASK) __syncwarp(MASK)
#define BLOCK_SYNC __barrier_sync(0)
#else
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
  __shfl_xor(VAR, LANE, SIZE)
#define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
  __shfl_up(VAR, DELTA, SIZE)
#define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
  __shfl_down(VAR, DELTA, SIZE)
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
  __shfl(VAR, LANE, SIZE)
// Branch split below is assumed: the NAMD_WARP_* aliases belong to HIP builds.
#ifdef NAMD_HIP
#define NAMD_WARP_ALL(MASK, P) __all(P)
#define NAMD_WARP_ANY(MASK, P) __any(P)
#define NAMD_WARP_BALLOT(MASK, P) __ballot(P)
#define NAMD_WARP_SYNC(MASK)
#define BLOCK_SYNC __syncthreads()
#else
#define WARP_ALL(MASK, P) __all(P)
#define WARP_ANY(MASK, P) __any(P)
#define WARP_BALLOT(MASK, P) __ballot(P)
#define WARP_SYNC(MASK)
#define BLOCK_SYNC __syncthreads()
#endif
#endif

#if !defined(NAMD_HIP)
#define NAMD_WARP_SYNC(MASK) WARP_SYNC(MASK)
#endif

// Compile-time assertion: CudaStaticAssert<false> has no definition, so
// cuda_static_assert(expr) fails to compile when expr is false.
// (Template pair assumed; only the macro itself was recoverable.)
template <bool b> struct CudaStaticAssert;
template <> struct CudaStaticAssert<true> {};
#define cuda_static_assert(expr) \
  (CudaStaticAssert<(expr) != 0>())

void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
void curandDie(const char *msg, int err=0);
void cudaNAMD_bug(const char *msg);
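//
// Illustrative sketch (not part of the original header): the WARP_SHUFFLE_*
// macros above abstract over the pre- and post-CUDA-9 shuffle intrinsics, so
// a warp-wide butterfly reduction can be written once. WARPSIZE is defined
// elsewhere in the NAMD build:
//
//   __device__ __forceinline__ double warpSum(double v) {
//     for (int m = WARPSIZE/2; m > 0; m >>= 1)
//       v += WARP_SHUFFLE_XOR(WARP_FULL_MASK, v, m, WARPSIZE);
//     return v;  // every lane now holds the warp's total
//   }
//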
// Evaluate stmt and abort with file/function/line context on failure.
// (The msg buffer size and the cudaDie call are reconstructed from the
// parallel curandCheck macro.)
#define cudaCheck(stmt) do { \
  cudaError_t err = stmt; \
  if (err != cudaSuccess) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
    cudaDie(msg, err); \
  } \
} while(0)

#define curandCheck(stmt) do { \
  curandStatus_t err = stmt; \
  if (err != CURAND_STATUS_SUCCESS) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
    curandDie(msg, (int)err); \
  } \
} while(0)

// Double-precision atomicAdd: hardware support arrived with CUDA 8 on sm_60;
// otherwise fall back to the documented compare-and-swap (CAS) emulation.
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// Native double atomicAdd is available.
#else
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
static __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
        __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
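//
// Illustrative sketch (hypothetical names, not part of the original header):
// cudaCheck() wraps any runtime call, and the atomicAdd() above supplies a
// CAS fallback for double on pre-sm_60 devices:
//
//   __global__ void accumulate(double *sum, const double *val, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) atomicAdd(sum, val[i]);  // native on sm_60+, CAS otherwise
//   }
//
//   double *d_sum;
//   cudaCheck(cudaMalloc(&d_sum, sizeof(double)));
//   accumulate<<<(n + 127)/128, 128>>>(d_sum, d_val, n);
//   cudaCheck(cudaGetLastError());
//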
//
// Allocation, deallocation, and clearing. The void-pointer *_T functions do
// the work; the typed templates forward sizeof(T). (Template bodies are
// reconstructed following the same forwarding pattern as the copy wrappers
// below.)
//
void allocate_host_T(void **pp, const size_t len, const size_t sizeofT);
void allocate_device_T(void **pp, const size_t len, const size_t sizeofT);
void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT);
void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream);
void deallocate_device_T(void **pp);
void deallocate_device_T_async(void **pp, cudaStream_t stream);
void deallocate_host_T(void **pp);
void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT);
void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT);

template <class T>
void allocate_host(T **pp, const size_t len) {
  allocate_host_T((void **)pp, len, sizeof(T));
}

template <class T>
void allocate_device(T **pp, const size_t len) {
  allocate_device_T((void **)pp, len, sizeof(T));
}

template <class T>
void allocate_device_managed(T **pp, const size_t len) {
  allocate_device_T_managed((void **)pp, len, sizeof(T));
}

template <class T>
void allocate_device_async(T **pp, const size_t len, cudaStream_t stream) {
  allocate_device_T_async((void **)pp, len, sizeof(T), stream);
}

template <class T>
void deallocate_device(T **pp) {
  deallocate_device_T((void **)pp);
}

template <class T>
void deallocate_device_async(T **pp, cudaStream_t stream) {
  deallocate_device_T_async((void **)pp, stream);
}

template <class T>
void deallocate_host(T **pp) {
  deallocate_host_T((void **)pp);
}

template <class T>
void clear_device_array(T *data, const size_t ndata, cudaStream_t stream=0) {
  clear_device_array_async_T(data, ndata, stream, sizeof(T));
}

template <class T>
void clear_device_array_sync(T *data, const size_t ndata) {
  clear_device_array_T(data, ndata, sizeof(T));
}
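//
// Illustrative usage sketch (hypothetical buffer names; clear_device_array
// is assumed to zero-fill, in the manner of cudaMemset):
//
//   float *d_buf = NULL;
//   allocate_device<float>(&d_buf, 1024);   // allocate_device_T + sizeof(float)
//   clear_device_array<float>(d_buf, 1024); // async zero-fill, default stream
//   deallocate_device<float>(&d_buf);
//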
//
// Reallocation: (re)allocates only when the tracked capacity *curlen is
// below newlen, over-allocating by the factor fac; returns true if a new
// buffer was allocated.
//
bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen,
  const float fac, const size_t sizeofT);

template <class T>
bool reallocate_device(T **pp, size_t *curlen, const size_t newlen, const float fac=1.0f) {
  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
}
bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen,
  const float fac, const unsigned int flag, const size_t sizeofT);

template <class T>
bool reallocate_host(T **pp, size_t *curlen, const size_t newlen,
    const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) {
  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
}
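//
// Illustrative sketch: keeping a capacity-tracked buffer large enough while
// amortizing regrowth (hypothetical names):
//
//   static float *d_arr = NULL;
//   static size_t d_arr_len = 0;
//   // Ensures capacity for n elements, over-allocating to n * 1.5f;
//   // returns true when a fresh allocation was made.
//   reallocate_device<float>(&d_arr, &d_arr_len, n, 1.5f);
//
//   // Pinned-host variant, selecting the cudaHostAlloc flag:
//   reallocate_host<float>(&h_arr, &h_arr_len, n, 1.5f, cudaHostAllocMapped);
//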
//
// Host/device and device/device copies; the _async variants run on the
// given stream, the plain variants block.
//
void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len,
  cudaStream_t stream, const size_t sizeofT);
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len,
  const size_t sizeofT);
void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len,
  cudaStream_t stream, const size_t sizeofT);
void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len,
  const size_t sizeofT);
void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len,
  cudaStream_t stream, const size_t sizeofT);
void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len,
  const size_t sizeofT);
template <class T>
void copy_HtoD(const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0) {
  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
}

template <class T>
void copy_HtoD_sync(const T *h_array, T *d_array, size_t array_len) {
  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
}

template <class T>
void copy_DtoH(const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0) {
  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
}

template <class T>
void copy_DtoH_sync(const T *d_array, T *h_array, const size_t array_len) {
  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
}
// (The destination parameter was misleadingly named h_dst in the original;
// it is device memory, so it is named d_dst here.)
template <class T>
void copy_DtoD(const T *d_src, T *d_dst, const size_t array_len, cudaStream_t stream=0) {
  copy_DtoD_async_T(d_src, d_dst, array_len, stream, sizeof(T));
}

template <class T>
void copy_DtoD_sync(const T *d_src, T *d_dst, const size_t array_len) {
  copy_DtoD_T(d_src, d_dst, array_len, sizeof(T));
}
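//
// Illustrative round trip with the typed wrappers (hypothetical names):
//
//   copy_HtoD<float>(h_in, d_in, n, stream);  // async H->D on 'stream'
//   // ... kernels on 'stream' consume d_in and produce d_out ...
//   copy_DtoH_sync<float>(d_out, h_out, n);   // blocking D->H copy
//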
//
// Peer-to-peer copy between devices.
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
  const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
  const size_t sizeofT);
template <class T>
void copy_PeerDtoD(const int src_dev, const int dst_dev,
    const T *d_src, T *d_dst, const size_t array_len, cudaStream_t stream=0) {
  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
}
//
// 3D sub-volume copies: (x0,y0,z0) is the copy origin within each array,
// (xsize,ysize) the allocated x/y extents of each array, and
// width/height/depth the extent of the copied region.
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_HtoD_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
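//
// Illustrative sketch: copying an 8 x 8 x 8 sub-block out of a 16 x 16 x 16
// host volume into the origin of a device volume (hypothetical names):
//
//   copy3D_HtoD<float>(h_vol, d_vol,
//       4, 4, 4,    // source origin
//       16, 16,     // source x/y allocation sizes (elements)
//       0, 0, 0,    // destination origin
//       16, 16,     // destination x/y allocation sizes
//       8, 8, 8,    // width, height, depth of the copied region
//       stream);
//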
void copy3D_DtoH_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_DtoH_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
void copy3D_DtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_DtoD_T(src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
//
// Peer-to-peer 3D sub-volume copy between devices.
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
  void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream);
template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
    T* src_data, T* dst_data,
    int src_x0, int src_y0, int src_z0,
    size_t src_xsize, size_t src_ysize,
    int dst_x0, int dst_y0, int dst_z0,
    size_t dst_xsize, size_t dst_ysize,
    size_t width, size_t height, size_t depth,
    cudaStream_t stream=0) {
  copy3D_PeerDtoD_T(src_dev, dst_dev, src_data, dst_data,
    src_x0, src_y0, src_z0,
    src_xsize, src_ysize,
    dst_x0, dst_y0, dst_z0,
    dst_xsize, dst_ysize,
    width, height, depth,
    sizeof(T), stream);
}
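//
// Illustrative sketch: a peer-to-peer sub-volume copy between two GPUs
// (hypothetical names; peer access is assumed to be enabled already):
//
//   copy3D_PeerDtoD<float>(0 /*src_dev*/, 1 /*dst_dev*/, d_vol0, d_vol1,
//       0, 0, 0,  16, 16,
//       0, 0, 0,  16, 16,
//       16, 16, 16, stream);
//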
#endif // NAMD_CUDA || NAMD_HIP

#endif // CUDAUTILS_H