NAMD
Classes | Macros | Typedefs | Functions
CudaUtils.h File Reference
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

Go to the source code of this file.

Classes

struct  CudaStaticAssert< bool >
 
struct  CudaStaticAssert< true >
 

Macros

#define WARPSIZE   32
 
#define FORCE_ENERGY_TABLE_SIZE   4096
 
#define WARP_FULL_MASK   0xffffffff
 
#define ATOMIC_BINS   1
 
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)   __shfl_xor(VAR, LANE, SIZE)
 
#define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE)   __shfl_up(VAR, DELTA, SIZE)
 
#define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE)   __shfl_down(VAR, DELTA, SIZE)
 
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)   __shfl(VAR, LANE, SIZE)
 
#define WARP_ALL(MASK, P)   __all(P)
 
#define WARP_ANY(MASK, P)   __any(P)
 
#define WARP_BALLOT(MASK, P)   __ballot(P)
 
#define WARP_SYNC(MASK)
 
#define BLOCK_SYNC   __syncthreads()
 
#define cuda_static_assert(expr)   (CudaStaticAssert<(expr) != 0>())
 
#define cudaCheck(stmt)
 

Typedefs

typedef unsigned int WarpMask
 

Functions

void cudaDie (const char *msg, cudaError_t err=cudaSuccess)
 
void cudaNAMD_bug (const char *msg)
 
void clear_device_array_async_T (void *data, const int ndata, cudaStream_t stream, const size_t sizeofT)
 
void clear_device_array_T (void *data, const int ndata, const size_t sizeofT)
 
template<class T >
void clear_device_array (T *data, const int ndata, cudaStream_t stream=0)
 
template<class T >
void clear_device_array_sync (T *data, const int ndata)
 
void allocate_host_T (void **pp, const int len, const size_t sizeofT)
 
template<class T >
void allocate_host (T **pp, const int len)
 
void allocate_device_T (void **pp, const int len, const size_t sizeofT)
 
template<class T >
void allocate_device (T **pp, const int len)
 
void deallocate_device_T (void **pp)
 
template<class T >
void deallocate_device (T **pp)
 
bool reallocate_device_T (void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT)
 
template<class T >
bool reallocate_device (T **pp, int *curlen, const int newlen, const float fac=1.0f)
 
bool reallocate_host_T (void **pp, int *curlen, const int newlen, const float fac, const unsigned int flag, const size_t sizeofT)
 
template<class T >
bool reallocate_host (T **pp, int *curlen, const int newlen, const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault)
 
void deallocate_host_T (void **pp)
 
template<class T >
void deallocate_host (T **pp)
 
void copy_HtoD_async_T (const void *h_array, void *d_array, int array_len, cudaStream_t stream, const size_t sizeofT)
 
void copy_HtoD_T (const void *h_array, void *d_array, int array_len, const size_t sizeofT)
 
void copy_DtoH_async_T (const void *d_array, void *h_array, const int array_len, cudaStream_t stream, const size_t sizeofT)
 
void copy_DtoH_T (const void *d_array, void *h_array, const int array_len, const size_t sizeofT)
 
void copy_DtoD_async_T (const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
 
void copy_DtoD_T (const void *d_src, void *d_dst, const int array_len, const size_t sizeofT)
 
template<class T >
void copy_HtoD (const T *h_array, T *d_array, int array_len, cudaStream_t stream=0)
 
template<class T >
void copy_HtoD_sync (const T *h_array, T *d_array, int array_len)
 
template<class T >
void copy_DtoH (const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0)
 
template<class T >
void copy_DtoH_sync (const T *d_array, T *h_array, const int array_len)
 
template<class T >
void copy_DtoD (const T *d_src, T *h_dst, const int array_len, cudaStream_t stream=0)
 
template<class T >
void copy_DtoD_sync (const T *d_src, T *h_dst, const int array_len)
 
void copy_PeerDtoD_async_T (const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
 
template<class T >
void copy_PeerDtoD (const int src_dev, const int dst_dev, const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0)
 
void copy3D_HtoD_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
 
template<class T >
void copy3D_HtoD (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
 
void copy3D_DtoH_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
 
template<class T >
void copy3D_DtoH (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
 
void copy3D_DtoD_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
 
template<class T >
void copy3D_DtoD (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
 
void copy3D_PeerDtoD_T (int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
 
template<class T >
void copy3D_PeerDtoD (int src_dev, int dst_dev, T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
 

Macro Definition Documentation

#define ATOMIC_BINS   1
#define BLOCK_SYNC   __syncthreads()

Definition at line 60 of file CudaUtils.h.

#define cuda_static_assert (   expr)    (CudaStaticAssert<(expr) != 0>())

Definition at line 85 of file CudaUtils.h.

Referenced by reduceVariables().

#define cudaCheck (   stmt)
Value:
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
char msg[256]; \
sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
cudaDie(msg, err); \
} \
} while(0)
if(ComputeNonbondedUtil::goMethod==2)
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
Definition: CudaUtils.C:9

Definition at line 95 of file CudaUtils.h.

Referenced by allocate_device_T(), allocate_host_T(), CudaFFTCompute::backward(), batchTranspose_xyz_yzx(), batchTranspose_xyz_zxy(), CudaComputeNonbondedKernel::bindExclusions(), bindTextureObject(), ComputeBondedCUDAKernel::bondedForce(), CudaTileListKernel::buildTileLists(), clear_device_array_async_T(), clear_device_array_T(), ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(), copy3D_DtoD_T(), copy3D_DtoH_T(), copy3D_HtoD_T(), copy3D_PeerDtoD_T(), copy_DtoD_async_T(), copy_DtoD_T(), copy_DtoH_async_T(), copy_DtoH_T(), copy_HtoD_async_T(), copy_HtoD_T(), copy_PeerDtoD_async_T(), CudaPmeRealSpaceCompute::copyAtoms(), CudaPmeTranspose::copyDataDeviceToDevice(), CudaPmeTranspose::copyDataDeviceToHost(), CudaPmeTranspose::copyDataHostToDevice(), createStream(), CudaComputeGBISKernel::CudaComputeGBISKernel(), CudaComputeNonbonded::CudaComputeNonbonded(), CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(), CudaNonbondedTables::CudaNonbondedTables(), CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(), CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(), CudaPmeTranspose::CudaPmeTranspose(), CudaTileListKernel::CudaTileListKernel(), deallocate_device_T(), deallocate_host_T(), ComputeNonbondedCUDA::doWork(), CudaPmeKSpaceCompute::energyAndVirialSetCallback(), CudaComputeNonbonded::finishReductions(), CudaTileListKernel::finishTileList(), CudaFFTCompute::forward(), gather_force(), CudaPmeRealSpaceCompute::gatherForce(), CudaPmeRealSpaceCompute::gatherForceSetCallback(), CudaComputeGBISKernel::GBISphase1(), CudaComputeGBISKernel::GBISphase2(), CudaComputeGBISKernel::GBISphase3(), DeviceCUDA::getMaxNumBlocks(), DeviceCUDA::getMaxNumThreads(), DeviceCUDA::initialize(), CudaComputeNonbonded::initialize(), ComputePmeCUDADevice::initialize(), ComputePmeCUDAMgr::initialize_pencils(), CudaPmePencilXY::initializeDevice(), CudaPmePencilX::initializeDevice(), CudaPmePencilY::initializeDevice(), CudaPmePencilZ::initializeDevice(), CudaComputeNonbonded::launchWork(), CudaComputeNonbondedKernel::nonbondedForce(), read_CUDA_ARCH(), reallocate_device_T(), reallocate_host_T(), CudaComputeNonbondedKernel::reduceVirialEnergy(), CudaTileListKernel::reSortTileLists(), scalar_sum(), CudaPmeTranspose::setDataPtrsYZX(), CudaPmeTranspose::setDataPtrsZXY(), CudaPmeKSpaceCompute::solve(), spread_charge(), CudaPmeRealSpaceCompute::spreadCharge(), transpose_xyz_yzx(), transpose_xyz_zxy(), CudaPmeTranspose::transposeXYZtoYZX(), CudaPmeTranspose::transposeXYZtoZXY(), CudaPmeRealSpaceCompute::waitGatherForceDone(), CudaPmeTranspose::waitStreamSynchronize(), writeComplexToDisk(), writeRealToDisk(), ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel(), ComputePmeCUDADevice::~ComputePmeCUDADevice(), ComputePmeCUDAMgr::~ComputePmeCUDAMgr(), CudaComputeGBISKernel::~CudaComputeGBISKernel(), CudaComputeNonbonded::~CudaComputeNonbonded(), CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel(), CudaFFTCompute::~CudaFFTCompute(), CudaNonbondedTables::~CudaNonbondedTables(), CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute(), CudaPmePencilX::~CudaPmePencilX(), CudaPmePencilXY::~CudaPmePencilXY(), CudaPmePencilY::~CudaPmePencilY(), CudaPmePencilZ::~CudaPmePencilZ(), CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute(), CudaPmeTranspose::~CudaPmeTranspose(), and CudaTileListKernel::~CudaTileListKernel().

#define FORCE_ENERGY_TABLE_SIZE   4096

Definition at line 19 of file CudaUtils.h.

Referenced by CudaNonbondedTables::CudaNonbondedTables(), and sampleTableTex().

#define WARP_ALL (   MASK,
 
)    __all(P)

Definition at line 56 of file CudaUtils.h.

Referenced by storeForces().

#define WARP_ANY (   MASK,
 
)    __any(P)

Definition at line 57 of file CudaUtils.h.

Referenced by void().

#define WARP_BALLOT (   MASK,
 
)    __ballot(P)

Definition at line 58 of file CudaUtils.h.

Referenced by gather_force(), repackTileListsKernel(), and storeForces().

#define WARP_FULL_MASK   0xffffffff
#define WARP_SHUFFLE (   MASK,
  VAR,
  LANE,
  SIZE 
)    __shfl(VAR, LANE, SIZE)
#define WARP_SHUFFLE_DOWN (   MASK,
  VAR,
  DELTA,
  SIZE 
)    __shfl_down(VAR, DELTA, SIZE)

Definition at line 52 of file CudaUtils.h.

#define WARP_SHUFFLE_UP (   MASK,
  VAR,
  DELTA,
  SIZE 
)    __shfl_up(VAR, DELTA, SIZE)

Definition at line 50 of file CudaUtils.h.

#define WARP_SHUFFLE_XOR (   MASK,
  VAR,
  LANE,
  SIZE 
)    __shfl_xor(VAR, LANE, SIZE)
#define WARP_SYNC (   MASK)
#define WARPSIZE   32

Typedef Documentation

typedef unsigned int WarpMask

Definition at line 11 of file CudaUtils.h.

Function Documentation

template<class T >
void allocate_device ( T **  pp,
const int  len 
)

Definition at line 162 of file CudaUtils.h.

References allocate_device_T().

162  {
163  allocate_device_T((void **)pp, len, sizeof(T));
164 }
void allocate_device_T(void **pp, const int len, const size_t sizeofT)
Definition: CudaUtils.C:75
void allocate_device_T ( void **  pp,
const int  len,
const size_t  sizeofT 
)

Definition at line 75 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_device(), and bindTextureObject().

75  {
76  cudaCheck(cudaMalloc(pp, sizeofT*len));
77 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void allocate_host ( T **  pp,
const int  len 
)

Definition at line 149 of file CudaUtils.h.

References allocate_host_T().

149  {
150  allocate_host_T((void **)pp, len, sizeof(T));
151 }
void allocate_host_T(void **pp, const int len, const size_t sizeofT)
Definition: CudaUtils.C:65
void allocate_host_T ( void **  pp,
const int  len,
const size_t  sizeofT 
)

Definition at line 65 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_host().

65  {
66  cudaCheck(cudaMallocHost(pp, sizeofT*len));
67 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void clear_device_array ( T *  data,
const int  ndata,
cudaStream_t  stream = 0 
)

Definition at line 132 of file CudaUtils.h.

References clear_device_array_async_T(), and stream.

132  {
133  clear_device_array_async_T(data, ndata, stream, sizeof(T));
134 }
__thread cudaStream_t stream
void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:51
void clear_device_array_async_T ( void data,
const int  ndata,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 51 of file CudaUtils.C.

References cudaCheck.

Referenced by clear_device_array().

51  {
52  cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
53 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void clear_device_array_sync ( T *  data,
const int  ndata 
)

Definition at line 137 of file CudaUtils.h.

References clear_device_array_T().

137  {
138  clear_device_array_T(data, ndata, sizeof(T));
139 }
void clear_device_array_T(void *data, const int ndata, const size_t sizeofT)
Definition: CudaUtils.C:55
void clear_device_array_T ( void data,
const int  ndata,
const size_t  sizeofT 
)

Definition at line 55 of file CudaUtils.C.

References cudaCheck.

Referenced by clear_device_array_sync().

55  {
56  cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
57 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy3D_DtoD ( T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 376 of file CudaUtils.h.

References copy3D_DtoD_T(), and stream.

382  {
383  copy3D_DtoD_T(src_data, dst_data,
384  src_x0, src_y0, src_z0,
385  src_xsize, src_ysize,
386  dst_x0, dst_y0, dst_z0,
387  dst_xsize, dst_ysize,
388  width, height, depth,
389  sizeof(T), stream);
390 }
__thread cudaStream_t stream
void copy3D_DtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:285
void copy3D_DtoD_T ( void src_data,
void dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 285 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_DtoD().

291  {
292  cudaMemcpy3DParms parms = {0};
293 
294  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
295  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
296 
297  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
298  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
299 
300  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
301  parms.kind = cudaMemcpyDeviceToDevice;
302 
303 #ifdef NAMD_CUDA
304  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
305 #else
306  //TODO-HIP: remove ifdef when HIP implements cudaMemcpy3DAsync
307  cudaCheck(hipMemcpy3D(&parms));
308 #endif
309 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy3D_DtoH ( T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 347 of file CudaUtils.h.

References copy3D_DtoH_T(), and stream.

353  {
354  copy3D_DtoH_T(src_data, dst_data,
355  src_x0, src_y0, src_z0,
356  src_xsize, src_ysize,
357  dst_x0, dst_y0, dst_z0,
358  dst_xsize, dst_ysize,
359  width, height, depth,
360  sizeof(T), stream);
361 }
__thread cudaStream_t stream
void copy3D_DtoH_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:255
void copy3D_DtoH_T ( void src_data,
void dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 255 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_DtoH().

261  {
262  cudaMemcpy3DParms parms = {0};
263 
264  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
265  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
266 
267  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
268  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
269 
270  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
271  parms.kind = cudaMemcpyDeviceToHost;
272 
273 #ifdef NAMD_CUDA
274  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
275 #else
276  //TODO-HIP: remove ifdef when HIP implements cudaMemcpy3DAsync
277  cudaCheck(hipMemcpy3D(&parms));
278 #endif
279 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy3D_HtoD ( T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 318 of file CudaUtils.h.

References copy3D_HtoD_T(), and stream.

324  {
325  copy3D_HtoD_T(src_data, dst_data,
326  src_x0, src_y0, src_z0,
327  src_xsize, src_ysize,
328  dst_x0, dst_y0, dst_z0,
329  dst_xsize, dst_ysize,
330  width, height, depth,
331  sizeof(T), stream);
332 }
void copy3D_HtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:225
__thread cudaStream_t stream
void copy3D_HtoD_T ( void src_data,
void dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 225 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_HtoD().

231  {
232  cudaMemcpy3DParms parms = {0};
233 
234  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
235  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
236 
237  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
238  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
239 
240  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
241 
242  parms.kind = cudaMemcpyHostToDevice;
243 #ifdef NAMD_CUDA
244  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
245 #else
246  //TODO-HIP: remove ifdef when HIP implements cudaMemcpy3DAsync
247  cudaCheck(hipMemcpy3D(&parms));
248 #endif
249 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy3D_PeerDtoD ( int  src_dev,
int  dst_dev,
T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 406 of file CudaUtils.h.

References copy3D_PeerDtoD_T(), and stream.

413  {
414  copy3D_PeerDtoD_T(src_dev, dst_dev,
415  src_data, dst_data,
416  src_x0, src_y0, src_z0,
417  src_xsize, src_ysize,
418  dst_x0, dst_y0, dst_z0,
419  dst_xsize, dst_ysize,
420  width, height, depth,
421  sizeof(T), stream);
422 }
__thread cudaStream_t stream
void copy3D_PeerDtoD_T(int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:315
void copy3D_PeerDtoD_T ( int  src_dev,
int  dst_dev,
void src_data,
void dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 315 of file CudaUtils.C.

References cudaCheck, and cudaDie().

Referenced by copy3D_PeerDtoD().

322  {
323 #ifdef NAMD_HIP
324 // TODO-HIP: Is a workaround possible? cudaMemcpy3D+cudaMemcpyPeer+cudaMemcpy3D
325  cudaDie("cudaMemcpy3DPeerAsync is not supported by HIP");
326 #else
327  cudaMemcpy3DPeerParms parms = {0};
328 
329  parms.srcDevice = src_dev;
330  parms.dstDevice = dst_dev;
331 
332  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
333  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
334 
335  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
336  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
337 
338  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
339 
340  cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
341 #endif
342 }
__thread cudaStream_t stream
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
Definition: CudaUtils.C:9
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_DtoD ( const T *  d_src,
T *  h_dst,
const int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 279 of file CudaUtils.h.

References copy_DtoD_async_T(), and stream.

279  {
280  copy_DtoD_async_T(d_src, h_dst, array_len, stream, sizeof(T));
281 }
__thread cudaStream_t stream
void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:202
void copy_DtoD_async_T ( const void d_src,
void d_dst,
const int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 202 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoD().

203  {
204  cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
205 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_DtoD_sync ( const T *  d_src,
T *  h_dst,
const int  array_len 
)

Definition at line 287 of file CudaUtils.h.

References copy_DtoD_T().

287  {
288  copy_DtoD_T(d_src, h_dst, array_len, sizeof(T));
289 }
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT)
Definition: CudaUtils.C:207
void copy_DtoD_T ( const void d_src,
void d_dst,
const int  array_len,
const size_t  sizeofT 
)

Definition at line 207 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoD_sync().

207  {
208  cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
209 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_DtoH ( const T *  d_array,
T *  h_array,
const int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 263 of file CudaUtils.h.

References copy_DtoH_async_T(), and stream.

263  {
264  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
265 }
__thread cudaStream_t stream
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:189
void copy_DtoH_async_T ( const void d_array,
void h_array,
const int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 189 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoH().

190  {
191  cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
192 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_DtoH_sync ( const T *  d_array,
T *  h_array,
const int  array_len 
)

Definition at line 271 of file CudaUtils.h.

References copy_DtoH_T().

271  {
272  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
273 }
void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT)
Definition: CudaUtils.C:194
void copy_DtoH_T ( const void d_array,
void h_array,
const int  array_len,
const size_t  sizeofT 
)

Definition at line 194 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoH_sync().

194  {
195  cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
196 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_HtoD ( const T *  h_array,
T *  d_array,
int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 245 of file CudaUtils.h.

References copy_HtoD_async_T(), and stream.

245  {
246  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
247 }
__thread cudaStream_t stream
void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:175
void copy_HtoD_async_T ( const void h_array,
void d_array,
int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 175 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_HtoD().

176  {
177  cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
178 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_HtoD_sync ( const T *  h_array,
T *  d_array,
int  array_len 
)

Definition at line 254 of file CudaUtils.h.

References copy_HtoD_T().

254  {
255  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
256 }
void copy_HtoD_T(const void *h_array, void *d_array, int array_len, const size_t sizeofT)
Definition: CudaUtils.C:180
void copy_HtoD_T ( const void h_array,
void d_array,
int  array_len,
const size_t  sizeofT 
)

Definition at line 180 of file CudaUtils.C.

References cudaCheck.

Referenced by bindTextureObject(), and copy_HtoD_sync().

181  {
182  cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
183 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void copy_PeerDtoD ( const int  src_dev,
const int  dst_dev,
const T *  d_src,
T *  d_dst,
const int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 300 of file CudaUtils.h.

References copy_PeerDtoD_async_T(), and stream.

301  {
302  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
303 }
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:215
__thread cudaStream_t stream
void copy_PeerDtoD_async_T ( const int  src_dev,
const int  dst_dev,
const void d_src,
void d_dst,
const int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 215 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_PeerDtoD().

217  {
218  cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
219 }
__thread cudaStream_t stream
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
void cudaDie ( const char *  msg,
cudaError_t  err = cudaSuccess 
)

Definition at line 9 of file CudaUtils.C.

References NAMD_die().

9  {
10  char host[128];
11  gethostname(host, 128); host[127] = 0;
12  char devstr[128] = "";
13  int devnum;
14  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
15  sprintf(devstr, " device %d", devnum);
16  }
17  cudaDeviceProp deviceProp;
18  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
19  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
20  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
21  }
22  char errmsg[1024];
23  if (err == cudaSuccess) {
24  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
25  } else {
26  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
27  }
28  NAMD_die(errmsg);
29 }
void NAMD_die(const char *err_msg)
Definition: common.C:85
void cudaNAMD_bug ( const char *  msg)

Definition at line 31 of file CudaUtils.C.

References NAMD_bug().

Referenced by CudaFFTCompute::backward(), CudaFFTCompute::forward(), gather_force(), and spread_charge().

31 {NAMD_bug(msg);}
void NAMD_bug(const char *err_msg)
Definition: common.C:129
template<class T >
void deallocate_device ( T **  pp)

Definition at line 174 of file CudaUtils.h.

References deallocate_device_T().

174  {
175  deallocate_device_T((void **)pp);
176 }
void deallocate_device_T(void **pp)
Definition: CudaUtils.C:84
void deallocate_device_T ( void **  pp)

Definition at line 84 of file CudaUtils.C.

References cudaCheck.

Referenced by deallocate_device().

84  {
85 
86  if (*pp != NULL) {
87  cudaCheck(cudaFree((void *)(*pp)));
88  *pp = NULL;
89  }
90 
91 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
void deallocate_host ( T **  pp)

Definition at line 223 of file CudaUtils.h.

References deallocate_host_T().

223  {
224  deallocate_host_T((void **)pp);
225 }
void deallocate_host_T(void **pp)
Definition: CudaUtils.C:98
void deallocate_host_T ( void **  pp)

Definition at line 98 of file CudaUtils.C.

References cudaCheck.

Referenced by deallocate_host().

98  {
99 
100  if (*pp != NULL) {
101  cudaCheck(cudaFreeHost((void *)(*pp)));
102  *pp = NULL;
103  }
104 
105 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
bool reallocate_device ( T **  pp,
int *  curlen,
const int  newlen,
const float  fac = 1.0f 
)

Definition at line 191 of file CudaUtils.h.

References reallocate_device_T().

191  {
192  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
193 }
bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT)
Definition: CudaUtils.C:117
bool reallocate_device_T ( void **  pp,
int *  curlen,
const int  newlen,
const float  fac,
const size_t  sizeofT 
)

Definition at line 117 of file CudaUtils.C.

References cudaCheck.

Referenced by reallocate_device().

117  {
118 
119  if (*pp != NULL && *curlen < newlen) {
120  cudaCheck(cudaFree((void *)(*pp)));
121  *pp = NULL;
122  }
123 
124  if (*pp == NULL) {
125  if (fac > 1.0f) {
126  *curlen = (int)(((double)(newlen))*(double)fac);
127  } else {
128  *curlen = newlen;
129  }
130  cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
131  return true;
132  }
133 
134  return false;
135 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
template<class T >
bool reallocate_host ( T **  pp,
int *  curlen,
const int  newlen,
const float  fac = 1.0f,
const unsigned int  flag = cudaHostAllocDefault 
)

Definition at line 211 of file CudaUtils.h.

References reallocate_host_T().

212  {
213  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
214 }
bool reallocate_host_T(void **pp, int *curlen, const int newlen, const float fac, const unsigned int flag, const size_t sizeofT)
Definition: CudaUtils.C:150
bool reallocate_host_T ( void **  pp,
int *  curlen,
const int  newlen,
const float  fac,
const unsigned int  flag,
const size_t  sizeofT 
)

Definition at line 150 of file CudaUtils.C.

References cudaCheck.

Referenced by reallocate_host().

151  {
152 
153  if (*pp != NULL && *curlen < newlen) {
154  cudaCheck(cudaFreeHost((void *)(*pp)));
155  *pp = NULL;
156  }
157 
158  if (*pp == NULL) {
159  if (fac > 1.0f) {
160  *curlen = (int)(((double)(newlen))*(double)fac);
161  } else {
162  *curlen = newlen;
163  }
164  cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
165  return true;
166  }
167 
168  return false;
169 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95