#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "HipDefines.h"
#include <iostream>

Classes
struct	cudaTensor

struct	cudaVector

struct	CudaMInfo

struct	CudaStaticAssert< bool >

struct	CudaStaticAssert< true >

struct	CudaNBConstants

Macros
#define	WARPSIZE 32

#define	BOUNDINGBOXSIZE 32

#define	NAMD_CCCL_MAJOR_VERSION 2

#define	FORCE_ENERGY_TABLE_SIZE 4096

#define	COPY_CUDATENSOR(S, D)

#define	COPY_CUDAVECTOR(S, D)

#define	PRINT_CUDATENSOR(T, SS)

#define	ATOMIC_BINS 1

#define	FEP_BONDED_CUDA_DEBUG

#define	WARP_FULL_MASK 0xffffffff

#define	WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) __shfl_xor(VAR, LANE, SIZE)

#define	WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) __shfl_up(VAR, DELTA, SIZE)

#define	WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) __shfl_down(VAR, DELTA, SIZE)

#define	WARP_SHUFFLE(MASK, VAR, LANE, SIZE) __shfl(VAR, LANE, SIZE)

#define	WARP_ALL(MASK, P) __all(P)

#define	WARP_ANY(MASK, P) __any(P)

#define	WARP_BALLOT(MASK, P) __ballot(P)

#define	WARP_SYNC(MASK)

#define	BLOCK_SYNC __syncthreads()

#define	NAMD_WARP_SYNC(MASK) WARP_SYNC(MASK)

#define	cuda_static_assert(expr) (CudaStaticAssert<(expr) != 0>())

#define	cudaCheck(stmt)

#define	curandCheck(stmt)

Typedefs
typedef unsigned int	WarpMask

typedef double	BigReal

Functions
void	cudaDie (const char *msg, cudaError_t err=cudaSuccess)

void	curandDie (const char *msg, int err=0)

void	cudaNAMD_bug (const char *msg)

void	clear_device_array_async_T (void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT)

void	clear_device_array_T (void *data, const size_t ndata, const size_t sizeofT)

template<class T >
void	clear_device_array (T *data, const size_t ndata, cudaStream_t stream=0)

template<class T >
void	clear_device_array_sync (T *data, const size_t ndata)

void	allocate_host_T (void **pp, const size_t len, const size_t sizeofT)

template<class T >
void	allocate_host (T **pp, const size_t len)

void	allocate_device_T (void **pp, const size_t len, const size_t sizeofT)

void	allocate_device_T_managed (void **pp, const size_t len, const size_t sizeofT)

void	allocate_device_T_async (void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream)

template<class T >
void	allocate_device (T **pp, const size_t len)

template<class T >
void	allocate_device_managed (T **pp, const size_t len)

template<class T >
void	allocate_device_async (T **pp, const size_t len, cudaStream_t stream)

void	deallocate_device_T (void **pp)

void	deallocate_device_T_async (void **pp, cudaStream_t stream)

template<class T >
void	deallocate_device (T **pp)

template<class T >
void	deallocate_device_async (T **pp, cudaStream_t stream)

bool	reallocate_device_T (void **pp, size_t *curlen, const size_t newlen, const float fac, const size_t sizeofT)

template<class T >
bool	reallocate_device (T **pp, size_t *curlen, const size_t newlen, const float fac=1.0f)

bool	reallocate_host_T (void **pp, size_t *curlen, const size_t newlen, const float fac, const unsigned int flag, const size_t sizeofT)

template<class T >
bool	reallocate_host (T **pp, size_t *curlen, const size_t newlen, const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault)

void	deallocate_host_T (void **pp)

template<class T >
void	deallocate_host (T **pp)

void	copy_HtoD_async_T (const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, const size_t sizeofT)

void	copy_HtoD_T (const void *h_array, void *d_array, size_t array_len, const size_t sizeofT)

void	copy_DtoH_async_T (const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream, const size_t sizeofT)

void	copy_DtoH_T (const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT)

void	copy_DtoD_async_T (const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream, const size_t sizeofT)

void	copy_DtoD_T (const void *d_src, void *d_dst, const size_t array_len, const size_t sizeofT)

template<class T >
void	copy_HtoD (const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0)

template<class T >
void	copy_HtoD_sync (const T *h_array, T *d_array, size_t array_len)

template<class T >
void	copy_DtoH (const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0)

template<class T >
void	copy_DtoH_sync (const T *d_array, T *h_array, const size_t array_len)

template<class T >
void	copy_DtoD (const T *d_src, T *h_dst, const size_t array_len, cudaStream_t stream=0)

template<class T >
void	copy_DtoD_sync (const T *d_src, T *h_dst, const size_t array_len)

void	copy_PeerDtoD_async_T (const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream, const size_t sizeofT)

template<class T >
void	copy_PeerDtoD (const int src_dev, const int dst_dev, const T *d_src, T *d_dst, const size_t array_len, cudaStream_t stream=0)

void	copy3D_HtoD_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)

template<class T >
void	copy3D_HtoD (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)

void	copy3D_DtoH_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)

template<class T >
void	copy3D_DtoH (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)

void	copy3D_DtoD_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)

template<class T >
void	copy3D_DtoD (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)

void	copy3D_PeerDtoD_T (int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)

template<class T >
void	copy3D_PeerDtoD (int src_dev, int dst_dev, T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)

Macro Definition Documentation

◆ ATOMIC_BINS

#define ATOMIC_BINS 1

Definition at line 79 of file CudaUtils.h.

Referenced by CudaComputeNonbonded::initialize().

◆ BLOCK_SYNC

#define BLOCK_SYNC __syncthreads()

Definition at line 201 of file CudaUtils.h.

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel().

◆ BOUNDINGBOXSIZE

#define BOUNDINGBOXSIZE 32

Definition at line 18 of file CudaUtils.h.

◆ COPY_CUDATENSOR

#define COPY_CUDATENSOR	(	S,
		D
	)

Value:

D.xx = S.xx; \
  D.xy = S.xy; \
  D.xz = S.xz; \
  D.yx = S.yx; \
  D.yy = S.yy; \
  D.yz = S.yz; \
  D.zx = S.zx; \
  D.zy = S.zy; \
  D.zz = S.zz

Definition at line 51 of file CudaUtils.h.

◆ COPY_CUDAVECTOR

#define COPY_CUDAVECTOR	(	S,
		D
	)

Value:

D.x = S.x; \
  D.y = S.y; \
  D.z = S.z

Definition at line 62 of file CudaUtils.h.

◆ cuda_static_assert

#define cuda_static_assert ( expr ) (CudaStaticAssert<(expr) != 0>())

Definition at line 231 of file CudaUtils.h.

◆ cudaCheck

#define cudaCheck ( stmt )

Value:

do {                                 \
        cudaError_t err = stmt;                            \
  if (err != cudaSuccess) {                          \
        char msg[256];  \
          sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
          cudaDie(msg, err); \
  }                                                  \
} while(0)

Definition at line 242 of file CudaUtils.h.

◆ curandCheck

#define curandCheck ( stmt )

Value:

do {                                 \
    curandStatus_t  err = stmt;                                   \
    if (err != CURAND_STATUS_SUCCESS) {                           \
      char msg[256];                                                    \
      sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
      curandDie(msg, (int)err);                                   \
    }                                                                   \
  } while(0)

Definition at line 251 of file CudaUtils.h.

◆ FEP_BONDED_CUDA_DEBUG

#define FEP_BONDED_CUDA_DEBUG

Definition at line 136 of file CudaUtils.h.

◆ FORCE_ENERGY_TABLE_SIZE

#define FORCE_ENERGY_TABLE_SIZE 4096

Definition at line 49 of file CudaUtils.h.

Referenced by CudaNonbondedTables::CudaNonbondedTables().

◆ NAMD_CCCL_MAJOR_VERSION

#define NAMD_CCCL_MAJOR_VERSION 2

Definition at line 23 of file CudaUtils.h.

◆ NAMD_WARP_SYNC

#define NAMD_WARP_SYNC ( MASK ) WARP_SYNC(MASK)

Definition at line 206 of file CudaUtils.h.

◆ PRINT_CUDATENSOR

#define PRINT_CUDATENSOR	(	T,
		SS
	)

Value:

SS << T.xx << " " << T.xy << " " << T.xz << " " << T.yx << " " << \
        T.yy << " " << T.yz << " " << T.zx << " " << T.zy << " " << T.zz << \
        std::endl;

Definition at line 67 of file CudaUtils.h.

◆ WARP_ALL

#define WARP_ALL	(	MASK,
		P
	)	__all(P)

Definition at line 197 of file CudaUtils.h.

◆ WARP_ANY

#define WARP_ANY	(	MASK,
		P
	)	__any(P)

Definition at line 198 of file CudaUtils.h.

◆ WARP_BALLOT

#define WARP_BALLOT	(	MASK,
		P
	)	__ballot(P)

Definition at line 199 of file CudaUtils.h.

◆ WARP_FULL_MASK

#define WARP_FULL_MASK 0xffffffff

Definition at line 158 of file CudaUtils.h.

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel().

◆ WARP_SHUFFLE

#define WARP_SHUFFLE	(	MASK,
		VAR,
		LANE,
		SIZE
	)	__shfl(VAR, LANE, SIZE)

Definition at line 187 of file CudaUtils.h.

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel().

◆ WARP_SHUFFLE_DOWN

#define WARP_SHUFFLE_DOWN	(	MASK,
		VAR,
		DELTA,
		SIZE
	)	__shfl_down(VAR, DELTA, SIZE)

Definition at line 185 of file CudaUtils.h.

◆ WARP_SHUFFLE_UP

#define WARP_SHUFFLE_UP	(	MASK,
		VAR,
		DELTA,
		SIZE
	)	__shfl_up(VAR, DELTA, SIZE)

Definition at line 183 of file CudaUtils.h.

◆ WARP_SHUFFLE_XOR

#define WARP_SHUFFLE_XOR	(	MASK,
		VAR,
		LANE,
		SIZE
	)	__shfl_xor(VAR, LANE, SIZE)

Definition at line 181 of file CudaUtils.h.

◆ WARP_SYNC

#define WARP_SYNC ( MASK )

Definition at line 200 of file CudaUtils.h.

◆ WARPSIZE

#define WARPSIZE 32

Definition at line 17 of file CudaUtils.h.

Referenced by CudaPmeOneDevice::compute(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), and ComputeBondedCUDAKernel::warpAlign().

Typedef Documentation

◆ BigReal

typedef double BigReal

Definition at line 75 of file CudaUtils.h.

◆ WarpMask

typedef unsigned int WarpMask

Definition at line 19 of file CudaUtils.h.

Function Documentation

◆ allocate_device()

template<class T >

void allocate_device	(	T **	pp,
		const size_t	len
	)

Definition at line 320 of file CudaUtils.h.

References allocate_device_T().

                                                {
   allocate_device_T((void **)pp, len, sizeof(T));
 }

◆ allocate_device_async()

template<class T >

void allocate_device_async	(	T **	pp,
		const size_t	len,
		cudaStream_t	stream
	)

Definition at line 330 of file CudaUtils.h.

References allocate_device_T_async().

Referenced by ComputeLonepairsCUDA::updateAtoms().

                                                                           {
   allocate_device_T_async((void **)pp, len, sizeof(T), stream);
 }

◆ allocate_device_managed()

template<class T >

void allocate_device_managed	(	T **	pp,
		const size_t	len
	)

Definition at line 325 of file CudaUtils.h.

References allocate_device_T_managed().

                                                        {
   allocate_device_T_managed((void **)pp, len, sizeof(T));
 }

◆ allocate_device_T()

void allocate_device_T	(	void **	pp,
		const size_t	len,
		const size_t	sizeofT
	)

Definition at line 97 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_device(), allocate_device_T_async(), and bindTextureObject().

                                                                           {
   cudaCheck(cudaMalloc(pp, sizeofT*len));
 }

◆ allocate_device_T_async()

void allocate_device_T_async	(	void **	pp,
		const size_t	len,
		const size_t	sizeofT,
		cudaStream_t	stream
	)

Definition at line 105 of file CudaUtils.C.

References allocate_device_T(), and cudaCheck.

Referenced by allocate_device_async().

                                                                                                     {
 #if (CUDART_VERSION >= 11020)
   cudaCheck(cudaMallocAsync(pp, sizeofT*len, stream));
 #else
   allocate_device_T(pp, len, sizeofT);
 #endif
 }

◆ allocate_device_T_managed()

void allocate_device_T_managed	(	void **	pp,
		const size_t	len,
		const size_t	sizeofT
	)

Definition at line 101 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_device_managed().

                                                                                  {
   cudaCheck(cudaMallocManaged(pp, sizeofT*len));
 }

◆ allocate_host()

template<class T >

void allocate_host	(	T **	pp,
		const size_t	len
	)

Definition at line 305 of file CudaUtils.h.

References allocate_host_T().

                                              {
   allocate_host_T((void **)pp, len, sizeof(T));
 }

◆ allocate_host_T()

void allocate_host_T	(	void **	pp,
		const size_t	len,
		const size_t	sizeofT
	)

Definition at line 87 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_host().

                                                                         {
   cudaCheck(cudaMallocHost(pp, sizeofT*len));
 }

◆ clear_device_array()

template<class T >

void clear_device_array	(	T *	data,
		const size_t	ndata,
		cudaStream_t	stream = `0`
	)

Definition at line 288 of file CudaUtils.h.

References clear_device_array_async_T().

                                                                             {
   clear_device_array_async_T(data, ndata, stream, sizeof(T));
 }

◆ clear_device_array_async_T()

void clear_device_array_async_T	(	void *	data,
		const size_t	ndata,
		cudaStream_t	stream,
		const size_t	sizeofT
	)

Definition at line 73 of file CudaUtils.C.

References cudaCheck.

Referenced by clear_device_array().

                                                                                                            {
   cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
 }

◆ clear_device_array_sync()

template<class T >

void clear_device_array_sync	(	T *	data,
		const size_t	ndata
	)

Definition at line 293 of file CudaUtils.h.

References clear_device_array_T().

                                                           {
   clear_device_array_T(data, ndata, sizeof(T));
 }

◆ clear_device_array_T()

void clear_device_array_T	(	void *	data,
		const size_t	ndata,
		const size_t	sizeofT
	)

Definition at line 77 of file CudaUtils.C.

References cudaCheck.

Referenced by clear_device_array_sync().

                                                                                 {
   cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
 }

◆ copy3D_DtoD()

template<class T >

void copy3D_DtoD	(	T *	src_data,
		T *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		cudaStream_t	stream = `0`
	)

Definition at line 549 of file CudaUtils.h.

References copy3D_DtoD_T().

                          {
   copy3D_DtoD_T(src_data, dst_data,
     src_x0, src_y0, src_z0,
     src_xsize, src_ysize,
     dst_x0, dst_y0, dst_z0,
     dst_xsize, dst_ysize,
     width, height, depth,
     sizeof(T), stream);
 }

◆ copy3D_DtoD_T()

void copy3D_DtoD_T	(	void *	src_data,
		void *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		size_t	sizeofT,
		cudaStream_t	stream
	)

Definition at line 319 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_DtoD().

                                        {
   cudaMemcpy3DParms parms = {0};
 
   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
 
   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
 
   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
   parms.kind = cudaMemcpyDeviceToDevice;
 
   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
 }

◆ copy3D_DtoH()

template<class T >

void copy3D_DtoH	(	T *	src_data,
		T *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		cudaStream_t	stream = `0`
	)

Definition at line 520 of file CudaUtils.h.

References copy3D_DtoH_T().

                          {
   copy3D_DtoH_T(src_data, dst_data,
     src_x0, src_y0, src_z0,
     src_xsize, src_ysize,
     dst_x0, dst_y0, dst_z0,
     dst_xsize, dst_ysize,
     width, height, depth,
     sizeof(T), stream);
 }

◆ copy3D_DtoH_T()

void copy3D_DtoH_T	(	void *	src_data,
		void *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		size_t	sizeofT,
		cudaStream_t	stream
	)

Definition at line 294 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_DtoH().

                                        {
   cudaMemcpy3DParms parms = {0};
 
   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
 
   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
 
   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
   parms.kind = cudaMemcpyDeviceToHost;
 
   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
 }

◆ copy3D_HtoD()

template<class T >

void copy3D_HtoD	(	T *	src_data,
		T *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		cudaStream_t	stream = `0`
	)

Definition at line 491 of file CudaUtils.h.

References copy3D_HtoD_T().

                          {
   copy3D_HtoD_T(src_data, dst_data,
     src_x0, src_y0, src_z0,
     src_xsize, src_ysize,
     dst_x0, dst_y0, dst_z0,
     dst_xsize, dst_ysize,
     width, height, depth,
     sizeof(T), stream);
 }

◆ copy3D_HtoD_T()

void copy3D_HtoD_T	(	void *	src_data,
		void *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		size_t	sizeofT,
		cudaStream_t	stream
	)

Definition at line 269 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_HtoD().

                                        {
   cudaMemcpy3DParms parms = {0};
 
   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
 
   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
 
   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
   parms.kind = cudaMemcpyHostToDevice;
 
   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
 }

◆ copy3D_PeerDtoD()

template<class T >

void copy3D_PeerDtoD	(	int	src_dev,
		int	dst_dev,
		T *	src_data,
		T *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		cudaStream_t	stream = `0`
	)

Definition at line 579 of file CudaUtils.h.

References copy3D_PeerDtoD_T().

                          {
   copy3D_PeerDtoD_T(src_dev, dst_dev,
     src_data, dst_data,
     src_x0, src_y0, src_z0,
     src_xsize, src_ysize,
     dst_x0, dst_y0, dst_z0,
     dst_xsize, dst_ysize,
     width, height, depth,
     sizeof(T), stream);
 }

◆ copy3D_PeerDtoD_T()

void copy3D_PeerDtoD_T	(	int	src_dev,
		int	dst_dev,
		void *	src_data,
		void *	dst_data,
		int	src_x0,
		int	src_y0,
		int	src_z0,
		size_t	src_xsize,
		size_t	src_ysize,
		int	dst_x0,
		int	dst_y0,
		int	dst_z0,
		size_t	dst_xsize,
		size_t	dst_ysize,
		size_t	width,
		size_t	height,
		size_t	depth,
		size_t	sizeofT,
		cudaStream_t	stream
	)

Definition at line 344 of file CudaUtils.C.

References cudaCheck, and cudaDie().

Referenced by copy3D_PeerDtoD().

                                        {
 #ifdef NAMD_HIP
 // TODO-HIP: Is a workaround possible? cudaMemcpy3D+cudaMemcpyPeer+cudaMemcpy3D
    cudaDie("cudaMemcpy3DPeerAsync is not supported by HIP");
 #else
   cudaMemcpy3DPeerParms parms = {0};
 
   parms.srcDevice = src_dev;
   parms.dstDevice = dst_dev;
 
   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
 
   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
 
   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
 
   cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
 #endif
 }

◆ copy_DtoD()

template<class T >

void copy_DtoD	(	const T *	d_src,
		T *	h_dst,
		const size_t	array_len,
		cudaStream_t	stream = `0`
	)

Definition at line 452 of file CudaUtils.h.

References copy_DtoD_async_T().

                                                                                         {
   copy_DtoD_async_T(d_src, h_dst, array_len, stream, sizeof(T));
 }

◆ copy_DtoD_async_T()

void copy_DtoD_async_T	(	const void *	d_src,
		void *	d_dst,
		const size_t	array_len,
		cudaStream_t	stream,
		const size_t	sizeofT
	)

Definition at line 246 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoD().

                                  {
   cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
 }

◆ copy_DtoD_sync()

template<class T >

void copy_DtoD_sync	(	const T *	d_src,
		T *	h_dst,
		const size_t	array_len
	)

Definition at line 460 of file CudaUtils.h.

References copy_DtoD_T().

                                                                       {
   copy_DtoD_T(d_src, h_dst, array_len, sizeof(T));
 }

◆ copy_DtoD_T()

void copy_DtoD_T	(	const void *	d_src,
		void *	d_dst,
		const size_t	array_len,
		const size_t	sizeofT
	)

Definition at line 251 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoD_sync().

                                                                                                {
   cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
 }

◆ copy_DtoH()

template<class T >

void copy_DtoH	(	const T *	d_array,
		T *	h_array,
		const size_t	array_len,
		cudaStream_t	stream = `0`
	)

Definition at line 436 of file CudaUtils.h.

References copy_DtoH_async_T().

                                                                                             {
   copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
 }

◆ copy_DtoH_async_T()

void copy_DtoH_async_T	(	const void *	d_array,
		void *	h_array,
		const size_t	array_len,
		cudaStream_t	stream,
		const size_t	sizeofT
	)

Definition at line 233 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoH().

                                  {
   cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
 }

◆ copy_DtoH_sync()

template<class T >

void copy_DtoH_sync	(	const T *	d_array,
		T *	h_array,
		const size_t	array_len
	)

Definition at line 444 of file CudaUtils.h.

References copy_DtoH_T().

                                                                           {
   copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
 }

◆ copy_DtoH_T()

void copy_DtoH_T	(	const void *	d_array,
		void *	h_array,
		const size_t	array_len,
		const size_t	sizeofT
	)

Definition at line 238 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoH_sync().

                                                                                                    {
   cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
 }

◆ copy_HtoD()

template<class T >

void copy_HtoD	(	const T *	h_array,
		T *	d_array,
		size_t	array_len,
		cudaStream_t	stream = `0`
	)

Definition at line 418 of file CudaUtils.h.

References copy_HtoD_async_T().

Referenced by ComputeLonepairsCUDA::updateAtoms().

                                                                                       {
   copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
 }

◆ copy_HtoD_async_T()

void copy_HtoD_async_T	(	const void *	h_array,
		void *	d_array,
		size_t	array_len,
		cudaStream_t	stream,
		const size_t	sizeofT
	)

Definition at line 219 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_HtoD().

                                  {
   cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
 }

◆ copy_HtoD_sync()

template<class T >

void copy_HtoD_sync	(	const T *	h_array,
		T *	d_array,
		size_t	array_len
	)

Definition at line 427 of file CudaUtils.h.

References copy_HtoD_T().

                                                                     {
   copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
 }

◆ copy_HtoD_T()

void copy_HtoD_T	(	const void *	h_array,
		void *	d_array,
		size_t	array_len,
		const size_t	sizeofT
	)

Definition at line 224 of file CudaUtils.C.

References cudaCheck.

Referenced by bindTextureObject(), and copy_HtoD_sync().

                            {
   cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
 }

◆ copy_PeerDtoD()

template<class T >

void copy_PeerDtoD	(	const int	src_dev,
		const int	dst_dev,
		const T *	d_src,
		T *	d_dst,
		const size_t	array_len,
		cudaStream_t	stream = `0`
	)

Definition at line 473 of file CudaUtils.h.

References copy_PeerDtoD_async_T().

                                                                            {
   copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
 }

◆ copy_PeerDtoD_async_T()

void copy_PeerDtoD_async_T	(	const int	src_dev,
		const int	dst_dev,
		const void *	d_src,
		void *	d_dst,
		const size_t	array_len,
		cudaStream_t	stream,
		const size_t	sizeofT
	)

Definition at line 259 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_PeerDtoD().

                         {
   cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
 }

◆ cudaDie()

void cudaDie	(	const char *	msg,
		cudaError_t	err = `cudaSuccess`
	)

Definition at line 9 of file CudaUtils.C.

References NAMD_die().

Referenced by copy3D_PeerDtoD_T(), cuda_check_pme_charges(), cuda_check_pme_forces(), and DeviceCUDA::initialize().

                                                {
   char host[128];
   gethostname(host, 128);  host[127] = 0;
   char devstr[128] = "";
   int devnum;
   if ( cudaGetDevice(&devnum) == cudaSuccess ) {
     sprintf(devstr, " device %d", devnum);
   }
   cudaDeviceProp deviceProp;
   if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
     sprintf(devstr, " device %d pci %x:%x:%x", devnum,
       deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
   }
   char errmsg[1024];
   if (err == cudaSuccess) {
     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
   } else {
     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));    
   }
   NAMD_die(errmsg);
 }

◆ cudaNAMD_bug()

void cudaNAMD_bug ( const char * msg )

Definition at line 53 of file CudaUtils.C.

References NAMD_bug().

Referenced by CudaFFTCompute::backward(), and CudaFFTCompute::forward().

 {NAMD_bug(msg);}

NAMD_bug

void NAMD_bug(const char *err_msg)

Definition: common.C:196

◆ curandDie()

void curandDie	(	const char *	msg,
		int	err = `0`
	)

Definition at line 31 of file CudaUtils.C.

References NAMD_die().

                                          {
   char host[128];
   gethostname(host, 128);  host[127] = 0;
   char devstr[128] = "";
   int devnum;
   if ( cudaGetDevice(&devnum) == cudaSuccess ) {
     sprintf(devstr, " device %d", devnum);
   }
   cudaDeviceProp deviceProp;
   if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
     sprintf(devstr, " device %d pci %x:%x:%x", devnum,
       deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
   }
   char errmsg[1024];
   if (err == cudaSuccess) {
     sprintf(errmsg,"CUDA cuRAND error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
   } else {
     sprintf(errmsg,"CUDA cuRAND error %s on Pe %d (%s%s): status value %d", msg, CkMyPe(), host, devstr, err);    
   }
   NAMD_die(errmsg);
 }

◆ deallocate_device()

template<class T >

void deallocate_device ( T ** pp )

Definition at line 342 of file CudaUtils.h.

References deallocate_device_T().

Referenced by ComputeLonepairsCUDA::~ComputeLonepairsCUDA(), and CudaNonbondedTables::~CudaNonbondedTables().

                                {
   deallocate_device_T((void **)pp);
 }

◆ deallocate_device_async()

template<class T >

void deallocate_device_async	(	T **	pp,
		cudaStream_t	stream
	)

Definition at line 346 of file CudaUtils.h.

References deallocate_device_T_async().

Referenced by ComputeLonepairsCUDA::updateAtoms().

                                                           {
   deallocate_device_T_async((void **)pp, stream);
 }

◆ deallocate_device_T()

void deallocate_device_T ( void ** pp )

Definition at line 118 of file CudaUtils.C.

References cudaCheck.

Referenced by deallocate_device(), and deallocate_device_T_async().

                                     {
   
   if (*pp != NULL) {
     cudaCheck(cudaFree((void *)(*pp)));
     *pp = NULL;
   }
 
 }

◆ deallocate_device_T_async()

void deallocate_device_T_async	(	void **	pp,
		cudaStream_t	stream
	)

Definition at line 127 of file CudaUtils.C.

References cudaCheck, and deallocate_device_T().

Referenced by deallocate_device_async().

                                                                {
 #if (CUDART_VERSION >= 11020)
   if (*pp != NULL) {
     cudaCheck(cudaFreeAsync((void *)(*pp), stream));
     *pp = NULL;
   }
 #else
   deallocate_device_T(pp);
 #endif
 }

◆ deallocate_host()

template<class T >

void deallocate_host ( T ** pp )

Definition at line 396 of file CudaUtils.h.

References deallocate_host_T().

Referenced by CudaComputeNonbonded::~CudaComputeNonbonded().

                              {
   // Typed convenience wrapper: reinterprets the T** handle as void** and
   // forwards it unchanged to the untyped deallocate_host_T().
   deallocate_host_T((void **)pp);
 }

◆ deallocate_host_T()

void deallocate_host_T ( void ** pp )

Definition at line 142 of file CudaUtils.C.

References cudaCheck.

Referenced by deallocate_host().

                                   {
   // Releases the buffer referenced by *pp with cudaFreeHost() and resets the
   // caller's pointer to NULL. Does nothing when *pp is already NULL.
   // pp itself is dereferenced unconditionally, so it must be a valid pointer.
   
   if (*pp != NULL) {
     // The cudaFreeHost() result is passed through the cudaCheck macro.
     cudaCheck(cudaFreeHost((void *)(*pp)));
     // Clear the handle so a repeated call on the same pointer is a no-op.
     *pp = NULL;
   }
 
 }

◆ reallocate_device()

template<class T >

bool reallocate_device	(	T **	pp,
		size_t *	curlen,
		const size_t	newlen,
		const float	fac = `1.0f`
	)

Definition at line 364 of file CudaUtils.h.

References reallocate_device_T().

                                                                                           {
   // Typed convenience wrapper: forwards all arguments to reallocate_device_T(),
   // supplying sizeof(T) as the element size, and returns its result.
   return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
 }

◆ reallocate_device_T()

bool reallocate_device_T	(	void **	pp,
		size_t *	curlen,
		const size_t	newlen,
		const float	fac,
		const size_t	sizeofT
	)

Definition at line 161 of file CudaUtils.C.

References cudaCheck.

Referenced by reallocate_device().

                                                                                                                 {
   // Ensures *pp refers to a cudaMalloc() buffer of at least `newlen` elements
   // of `sizeofT` bytes each. *curlen is the current capacity in elements and
   // is updated whenever a new buffer is allocated.
   // Returns true if a new buffer was allocated, false if the existing one
   // was kept. The old contents are NOT copied into the new buffer.
 
   // Existing buffer is too small: release it so the branch below reallocates.
   if (*pp != NULL && *curlen < newlen) {
     cudaCheck(cudaFree((void *)(*pp)));
     *pp = NULL;
   }
 
   // No buffer (never allocated, or just freed above): allocate a fresh one.
   if (*pp == NULL) {
     if (fac > 1.0f) {
       // Over-allocate by the factor `fac`; the product is computed in double
       // and truncated to size_t.
       *curlen = (size_t)(((double)(newlen))*(double)fac);
     } else {
       // Factors <= 1.0f are ignored; allocate exactly newlen elements.
       *curlen = newlen;
     }
     cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
     return true;
   }
 
   // Existing buffer already holds at least newlen elements; left untouched.
   return false;
 }

◆ reallocate_host()

template<class T >

bool reallocate_host	(	T **	pp,
		size_t *	curlen,
		const size_t	newlen,
		const float	fac = `1.0f`,
		const unsigned int	flag = `cudaHostAllocDefault`
	)

Definition at line 384 of file CudaUtils.h.

References reallocate_host_T().

                                                                                          {
   // Typed convenience wrapper: forwards all arguments to reallocate_host_T(),
   // supplying sizeof(T) as the element size, and returns its result.
   return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
 }

◆ reallocate_host_T()

bool reallocate_host_T	(	void **	pp,
		size_t *	curlen,
		const size_t	newlen,
		const float	fac,
		const unsigned int	flag,
		const size_t	sizeofT
	)

Definition at line 194 of file CudaUtils.C.

References cudaCheck.

Referenced by reallocate_host().

                                                                                        {
   // Ensures *pp refers to a cudaHostAlloc() buffer of at least `newlen`
   // elements of `sizeofT` bytes each. *curlen is the current capacity in
   // elements and is updated whenever a new buffer is allocated.
   // Returns true if a new buffer was allocated, false if the existing one
   // was kept. The old contents are NOT copied into the new buffer.
 
   // Existing buffer is too small: release it so the branch below reallocates.
   if (*pp != NULL && *curlen < newlen) {
     cudaCheck(cudaFreeHost((void *)(*pp)));
     *pp = NULL;
   }
 
   // No buffer (never allocated, or just freed above): allocate a fresh one.
   if (*pp == NULL) {
     if (fac > 1.0f) {
       // Over-allocate by the factor `fac`; the product is computed in double
       // and truncated to size_t.
       *curlen = (size_t)(((double)(newlen))*(double)fac);
     } else {
       // Factors <= 1.0f are ignored; allocate exactly newlen elements.
       *curlen = newlen;
     }
     // `flag` is handed to cudaHostAlloc() as its allocation-flags argument.
     cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
     return true;
   }
 
   // Existing buffer already holds at least newlen elements; left untouched.
   return false;
 }

Classes

Macros

Typedefs

Functions

Macro Definition Documentation

◆ ATOMIC_BINS

◆ BLOCK_SYNC

◆ BOUNDINGBOXSIZE

◆ COPY_CUDATENSOR

◆ COPY_CUDAVECTOR

◆ cuda_static_assert

◆ cudaCheck

◆ curandCheck

◆ FEP_BONDED_CUDA_DEBUG

◆ FORCE_ENERGY_TABLE_SIZE

◆ NAMD_CCCL_MAJOR_VERSION

◆ NAMD_WARP_SYNC

◆ PRINT_CUDATENSOR

◆ WARP_ALL

◆ WARP_ANY

◆ WARP_BALLOT

◆ WARP_FULL_MASK

◆ WARP_SHUFFLE

◆ WARP_SHUFFLE_DOWN

◆ WARP_SHUFFLE_UP

◆ WARP_SHUFFLE_XOR

◆ WARP_SYNC

◆ WARPSIZE

Typedef Documentation

◆ BigReal

◆ WarpMask

Function Documentation

◆ allocate_device()

◆ allocate_device_async()

◆ allocate_device_managed()

◆ allocate_device_T()

◆ allocate_device_T_async()

◆ allocate_device_T_managed()

◆ allocate_host()

◆ allocate_host_T()

◆ clear_device_array()

◆ clear_device_array_async_T()

◆ clear_device_array_sync()

◆ clear_device_array_T()

◆ copy3D_DtoD()

◆ copy3D_DtoD_T()

◆ copy3D_DtoH()

◆ copy3D_DtoH_T()

◆ copy3D_HtoD()

◆ copy3D_HtoD_T()

◆ copy3D_PeerDtoD()

◆ copy3D_PeerDtoD_T()

◆ copy_DtoD()

◆ copy_DtoD_async_T()

◆ copy_DtoD_sync()

◆ copy_DtoD_T()

◆ copy_DtoH()

◆ copy_DtoH_async_T()

◆ copy_DtoH_sync()

◆ copy_DtoH_T()

◆ copy_HtoD()

◆ copy_HtoD_async_T()

◆ copy_HtoD_sync()

◆ copy_HtoD_T()

◆ copy_PeerDtoD()

◆ copy_PeerDtoD_async_T()

◆ cudaDie()

◆ cudaNAMD_bug()

◆ curandDie()

◆ deallocate_device()

◆ deallocate_device_async()

◆ deallocate_device_T()

◆ deallocate_device_T_async()

◆ deallocate_host()

◆ deallocate_host_T()

◆ reallocate_device()

◆ reallocate_device_T()

◆ reallocate_host()

◆ reallocate_host_T()