CudaUtils.h File Reference

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

Go to the source code of this file.

Classes

struct  CudaStaticAssert< true >

Defines

#define WARPSIZE   32
#define WARP_FULL_MASK   0xffffffff
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)   __shfl_xor(VAR, LANE, SIZE)
#define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE)   __shfl_up(VAR, DELTA, SIZE)
#define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE)   __shfl_down(VAR, DELTA, SIZE)
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)   __shfl(VAR, LANE, SIZE)
#define WARP_ALL(MASK, P)   __all(P)
#define WARP_ANY(MASK, P)   __any(P)
#define WARP_BALLOT(MASK, P)   __ballot(P)
#define WARP_SYNC(MASK)
#define BLOCK_SYNC   __syncthreads()
#define cuda_static_assert(expr)   (CudaStaticAssert<(expr) != 0>())
#define cudaCheck(stmt)

Functions

void cudaDie (const char *msg, cudaError_t err=cudaSuccess)
void cudaNAMD_bug (const char *msg)
void clear_device_array_async_T (void *data, const int ndata, cudaStream_t stream, const size_t sizeofT)
void clear_device_array_T (void *data, const int ndata, const size_t sizeofT)
template<class T>
void clear_device_array (T *data, const int ndata, cudaStream_t stream=0)
template<class T>
void clear_device_array_sync (T *data, const int ndata)
void allocate_host_T (void **pp, const int len, const size_t sizeofT)
template<class T>
void allocate_host (T **pp, const int len)
void allocate_device_T (void **pp, const int len, const size_t sizeofT)
template<class T>
void allocate_device (T **pp, const int len)
void deallocate_device_T (void **pp)
template<class T>
void deallocate_device (T **pp)
bool reallocate_device_T (void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT)
template<class T>
bool reallocate_device (T **pp, int *curlen, const int newlen, const float fac=1.0f)
bool reallocate_host_T (void **pp, int *curlen, const int newlen, const float fac, const unsigned int flag, const size_t sizeofT)
template<class T>
bool reallocate_host (T **pp, int *curlen, const int newlen, const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault)
void deallocate_host_T (void **pp)
template<class T>
void deallocate_host (T **pp)
void copy_HtoD_async_T (const void *h_array, void *d_array, int array_len, cudaStream_t stream, const size_t sizeofT)
void copy_HtoD_T (const void *h_array, void *d_array, int array_len, const size_t sizeofT)
void copy_DtoH_async_T (const void *d_array, void *h_array, const int array_len, cudaStream_t stream, const size_t sizeofT)
void copy_DtoH_T (const void *d_array, void *h_array, const int array_len, const size_t sizeofT)
void copy_DtoD_async_T (const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
void copy_DtoD_T (const void *d_src, void *d_dst, const int array_len, const size_t sizeofT)
template<class T>
void copy_HtoD (const T *h_array, T *d_array, int array_len, cudaStream_t stream=0)
template<class T>
void copy_HtoD_sync (const T *h_array, T *d_array, int array_len)
template<class T>
void copy_DtoH (const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0)
template<class T>
void copy_DtoH_sync (const T *d_array, T *h_array, const int array_len)
template<class T>
void copy_DtoD (const T *d_src, T *h_dst, const int array_len, cudaStream_t stream=0)
template<class T>
void copy_DtoD_sync (const T *d_src, T *h_dst, const int array_len)
void copy_PeerDtoD_async_T (const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
template<class T>
void copy_PeerDtoD (const int src_dev, const int dst_dev, const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0)
void copy3D_HtoD_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
template<class T>
void copy3D_HtoD (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
void copy3D_DtoH_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
template<class T>
void copy3D_DtoH (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
void copy3D_DtoD_T (void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
template<class T>
void copy3D_DtoD (T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
void copy3D_PeerDtoD_T (int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
template<class T>
void copy3D_PeerDtoD (int src_dev, int dst_dev, T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)


Define Documentation

#define BLOCK_SYNC   __syncthreads()

Definition at line 44 of file CudaUtils.h.

Referenced by bondedForcesKernel(), calcTileListPosKernel(), gather_force(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), localSort(), modifiedExclusionForcesKernel(), scalar_sum_kernel(), spread_charge_kernel(), transpose_xyz_yzx_device(), and transpose_xyz_zxy_device().

#define cuda_static_assert ( expr   )     (CudaStaticAssert<(expr) != 0>())

Definition at line 69 of file CudaUtils.h.

Referenced by reduceVariables().

#define cudaCheck ( stmt   ) 

Value:

do {                                 \
        cudaError_t err = stmt;                            \
  if (err != cudaSuccess) {                          \
        char msg[256];  \
          sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
          cudaDie(msg, err); \
  }                                                  \
} while(0)

Definition at line 79 of file CudaUtils.h.

Referenced by allocate_device_T(), allocate_host_T(), CudaFFTCompute::backward(), batchTranspose_xyz_yzx(), batchTranspose_xyz_zxy(), CudaComputeNonbondedKernel::bindExclusions(), bindTextureObject(), ComputeBondedCUDAKernel::bondedForce(), CudaTileListKernel::buildTileLists(), clear_device_array_async_T(), clear_device_array_T(), ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(), copy3D_DtoD_T(), copy3D_DtoH_T(), copy3D_HtoD_T(), copy3D_PeerDtoD_T(), copy_DtoD_async_T(), copy_DtoD_T(), copy_DtoH_async_T(), copy_DtoH_T(), copy_HtoD_async_T(), copy_HtoD_T(), copy_PeerDtoD_async_T(), CudaPmeRealSpaceCompute::copyAtoms(), CudaPmeTranspose::copyDataDeviceToDevice(), CudaPmeTranspose::copyDataDeviceToHost(), CudaPmeTranspose::copyDataHostToDevice(), createStream(), CudaComputeNonbonded::CudaComputeNonbonded(), CudaNonbondedTables::CudaNonbondedTables(), CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(), CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(), CudaPmeTranspose::CudaPmeTranspose(), deallocate_device_T(), deallocate_host_T(), doStreaming(), ComputeNonbondedCUDA::doWork(), CudaPmeKSpaceCompute::energyAndVirialSetCallback(), CudaComputeNonbonded::finishReductions(), CudaTileListKernel::finishTileList(), CudaFFTCompute::forward(), gather_force(), CudaPmeRealSpaceCompute::gatherForce(), CudaPmeRealSpaceCompute::gatherForceSetCallback(), DeviceCUDA::getMaxNumBlocks(), DeviceCUDA::getMaxNumThreads(), DeviceCUDA::initialize(), CudaComputeNonbonded::initialize(), ComputePmeCUDADevice::initialize(), ComputePmeCUDAMgr::initialize_pencils(), CudaPmePencilZ::initializeDevice(), CudaPmePencilY::initializeDevice(), CudaPmePencilX::initializeDevice(), CudaPmePencilXY::initializeDevice(), CudaComputeNonbonded::launchWork(), CudaComputeNonbondedKernel::nonbondedForce(), read_CUDA_ARCH(), reallocate_device_T(), reallocate_host_T(), CudaComputeNonbondedKernel::reduceVirialEnergy(), CudaTileListKernel::reSortTileLists(), scalar_sum(), CudaPmeTranspose::setDataPtrsYZX(), CudaPmeTranspose::setDataPtrsZXY(), CudaPmeKSpaceCompute::solve(), spread_charge(), CudaPmeRealSpaceCompute::spreadCharge(), transpose_xyz_yzx(), transpose_xyz_zxy(), CudaPmeTranspose::transposeXYZtoYZX(), CudaPmeTranspose::transposeXYZtoZXY(), CudaPmeRealSpaceCompute::waitGatherForceDone(), CudaPmeTranspose::waitStreamSynchronize(), writeComplexToDisk(), writeRealToDisk(), ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel(), ComputePmeCUDADevice::~ComputePmeCUDADevice(), ComputePmeCUDAMgr::~ComputePmeCUDAMgr(), CudaComputeNonbonded::~CudaComputeNonbonded(), CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel(), CudaFFTCompute::~CudaFFTCompute(), CudaNonbondedTables::~CudaNonbondedTables(), CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute(), CudaPmePencilX::~CudaPmePencilX(), CudaPmePencilXY::~CudaPmePencilXY(), CudaPmePencilY::~CudaPmePencilY(), CudaPmePencilZ::~CudaPmePencilZ(), CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute(), CudaPmeTranspose::~CudaPmeTranspose(), and CudaTileListKernel::~CudaTileListKernel().

#define WARP_ALL ( MASK,
 )     __all(P)

Definition at line 40 of file CudaUtils.h.

#define WARP_ANY ( MASK,
 )     __any(P)

Definition at line 41 of file CudaUtils.h.

#define WARP_BALLOT ( MASK,
 )     __ballot(P)

Definition at line 42 of file CudaUtils.h.

Referenced by gather_force(), and repackTileListsKernel().

#define WARP_FULL_MASK   0xffffffff

Definition at line 11 of file CudaUtils.h.

Referenced by bondedForcesKernel(), buildTileListsBBKernel(), cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), cuda_pme_forces_dev(), gather_force(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), modifiedExclusionForcesKernel(), reduceVariables(), repackTileListsKernel(), scalar_sum_kernel(), GBISResults< 3 >::shuffleNext(), GBISInput< 3 >::shuffleNext(), GBISResults< 2 >::shuffleNext(), GBISInput< 2 >::shuffleNext(), GBISResults< 1 >::shuffleNext(), GBISInput< 1 >::shuffleNext(), and shuffleNext().

#define WARP_SHUFFLE ( MASK,
VAR,
LANE,
SIZE   )     __shfl(VAR, LANE, SIZE)

Definition at line 38 of file CudaUtils.h.

Referenced by gather_force(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), scalar_sum_kernel(), GBISResults< 3 >::shuffleNext(), GBISInput< 3 >::shuffleNext(), GBISResults< 2 >::shuffleNext(), GBISInput< 2 >::shuffleNext(), GBISResults< 1 >::shuffleNext(), GBISInput< 1 >::shuffleNext(), and shuffleNext().

#define WARP_SHUFFLE_DOWN ( MASK,
VAR,
DELTA,
SIZE   )     __shfl_down(VAR, DELTA, SIZE)

Definition at line 36 of file CudaUtils.h.

#define WARP_SHUFFLE_UP ( MASK,
VAR,
DELTA,
SIZE   )     __shfl_up(VAR, DELTA, SIZE)

Definition at line 34 of file CudaUtils.h.

#define WARP_SHUFFLE_XOR ( MASK,
VAR,
LANE,
SIZE   )     __shfl_xor(VAR, LANE, SIZE)

Definition at line 32 of file CudaUtils.h.

Referenced by bondedForcesKernel(), modifiedExclusionForcesKernel(), and reduceVariables().

#define WARP_SYNC ( MASK   ) 

Definition at line 43 of file CudaUtils.h.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define WARPSIZE   32

Definition at line 9 of file CudaUtils.h.


Function Documentation

template<class T>
void allocate_device ( T **  pp,
const int  len 
)

Definition at line 146 of file CudaUtils.h.

References allocate_device_T().

00146                                             {
00147   allocate_device_T((void **)pp, len, sizeof(T));
00148 }

void allocate_device_T ( void **  pp,
const int  len,
const size_t  sizeofT 
)

Definition at line 81 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_device().

00081                                                                        {
00082   cudaCheck(cudaMalloc(pp, sizeofT*len));
00083 }

template<class T>
void allocate_host ( T **  pp,
const int  len 
)

Definition at line 133 of file CudaUtils.h.

References allocate_host_T().

00133                                           {
00134   allocate_host_T((void **)pp, len, sizeof(T));
00135 }

void allocate_host_T ( void **  pp,
const int  len,
const size_t  sizeofT 
)

Definition at line 71 of file CudaUtils.C.

References cudaCheck.

Referenced by allocate_host().

00071                                                                      {
00072   cudaCheck(cudaMallocHost(pp, sizeofT*len));
00073 }

template<class T>
void clear_device_array ( T *  data,
const int  ndata,
cudaStream_t  stream = 0 
)

Definition at line 116 of file CudaUtils.h.

References clear_device_array_async_T(), and stream.

00116                                                                          {
00117   clear_device_array_async_T(data, ndata, stream, sizeof(T));
00118 }

void clear_device_array_async_T ( void *  data,
const int  ndata,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 57 of file CudaUtils.C.

References cudaCheck.

Referenced by clear_device_array().

00057                                                                                                         {
00058   cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
00059 }

template<class T>
void clear_device_array_sync ( T *  data,
const int  ndata 
)

Definition at line 121 of file CudaUtils.h.

References clear_device_array_T().

00121                                                        {
00122   clear_device_array_T(data, ndata, sizeof(T));
00123 }

void clear_device_array_T ( void *  data,
const int  ndata,
const size_t  sizeofT 
)

Definition at line 61 of file CudaUtils.C.

References cudaCheck.

Referenced by clear_device_array_sync().

00061                                                                              {
00062   cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
00063 }

template<class T>
void copy3D_DtoD ( T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 360 of file CudaUtils.h.

References copy3D_DtoD_T(), and stream.

00366                          {
00367   copy3D_DtoD_T(src_data, dst_data,
00368     src_x0, src_y0, src_z0,
00369     src_xsize, src_ysize,
00370     dst_x0, dst_y0, dst_z0,
00371     dst_xsize, dst_ysize,
00372     width, height, depth,
00373     sizeof(T), stream);
00374 }

void copy3D_DtoD_T ( void *  src_data,
void *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 281 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_DtoD().

00287                                        {
00288   cudaMemcpy3DParms parms = {0};
00289 
00290   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00291   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00292 
00293   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00294   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00295 
00296   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00297   parms.kind = cudaMemcpyDeviceToDevice;
00298 
00299   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00300 }

template<class T>
void copy3D_DtoH ( T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 331 of file CudaUtils.h.

References copy3D_DtoH_T(), and stream.

00337                          {
00338   copy3D_DtoH_T(src_data, dst_data,
00339     src_x0, src_y0, src_z0,
00340     src_xsize, src_ysize,
00341     dst_x0, dst_y0, dst_z0,
00342     dst_xsize, dst_ysize,
00343     width, height, depth,
00344     sizeof(T), stream);
00345 }

void copy3D_DtoH_T ( void *  src_data,
void *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 256 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_DtoH().

00262                                        {
00263   cudaMemcpy3DParms parms = {0};
00264 
00265   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00266   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00267 
00268   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00269   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00270 
00271   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00272   parms.kind = cudaMemcpyDeviceToHost;
00273 
00274   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00275 }

template<class T>
void copy3D_HtoD ( T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 302 of file CudaUtils.h.

References copy3D_HtoD_T(), and stream.

00308                          {
00309   copy3D_HtoD_T(src_data, dst_data,
00310     src_x0, src_y0, src_z0,
00311     src_xsize, src_ysize,
00312     dst_x0, dst_y0, dst_z0,
00313     dst_xsize, dst_ysize,
00314     width, height, depth,
00315     sizeof(T), stream);
00316 }

void copy3D_HtoD_T ( void *  src_data,
void *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 231 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_HtoD().

00237                                        {
00238   cudaMemcpy3DParms parms = {0};
00239 
00240   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00241   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00242 
00243   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00244   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00245 
00246   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00247   parms.kind = cudaMemcpyHostToDevice;
00248 
00249   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00250 }

template<class T>
void copy3D_PeerDtoD ( int  src_dev,
int  dst_dev,
T *  src_data,
T *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
cudaStream_t  stream = 0 
)

Definition at line 390 of file CudaUtils.h.

References copy3D_PeerDtoD_T(), and stream.

00397                          {
00398   copy3D_PeerDtoD_T(src_dev, dst_dev,
00399     src_data, dst_data,
00400     src_x0, src_y0, src_z0,
00401     src_xsize, src_ysize,
00402     dst_x0, dst_y0, dst_z0,
00403     dst_xsize, dst_ysize,
00404     width, height, depth,
00405     sizeof(T), stream);
00406 }

void copy3D_PeerDtoD_T ( int  src_dev,
int  dst_dev,
void *  src_data,
void *  dst_data,
int  src_x0,
int  src_y0,
int  src_z0,
size_t  src_xsize,
size_t  src_ysize,
int  dst_x0,
int  dst_y0,
int  dst_z0,
size_t  dst_xsize,
size_t  dst_ysize,
size_t  width,
size_t  height,
size_t  depth,
size_t  sizeofT,
cudaStream_t  stream 
)

Definition at line 306 of file CudaUtils.C.

References cudaCheck.

Referenced by copy3D_PeerDtoD().

00313                                        {
00314   cudaMemcpy3DPeerParms parms = {0};
00315 
00316   parms.srcDevice = src_dev;
00317   parms.dstDevice = dst_dev;
00318 
00319   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00320   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00321 
00322   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00323   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00324 
00325   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00326 
00327   cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
00328 }

template<class T>
void copy_DtoD ( const T *  d_src,
T *  h_dst,
const int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 263 of file CudaUtils.h.

References copy_DtoD_async_T(), and stream.

00263                                                                                      {
00264   copy_DtoD_async_T(d_src, h_dst, array_len, stream, sizeof(T));
00265 }

void copy_DtoD_async_T ( const void *  d_src,
void *  d_dst,
const int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 208 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoD().

00209                                  {
00210   cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
00211 }

template<class T>
void copy_DtoD_sync ( const T *  d_src,
T *  h_dst,
const int  array_len 
)

Definition at line 271 of file CudaUtils.h.

References copy_DtoD_T().

00271                                                                    {
00272   copy_DtoD_T(d_src, h_dst, array_len, sizeof(T));
00273 }

void copy_DtoD_T ( const void *  d_src,
void *  d_dst,
const int  array_len,
const size_t  sizeofT 
)

Definition at line 213 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoD_sync().

00213                                                                                             {
00214   cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
00215 }

template<class T>
void copy_DtoH ( const T *  d_array,
T *  h_array,
const int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 247 of file CudaUtils.h.

References copy_DtoH_async_T(), and stream.

00247                                                                                          {
00248   copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
00249 }

void copy_DtoH_async_T ( const void *  d_array,
void *  h_array,
const int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 195 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoH().

00196                                  {
00197   cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
00198 }

template<class T>
void copy_DtoH_sync ( const T *  d_array,
T *  h_array,
const int  array_len 
)

Definition at line 255 of file CudaUtils.h.

References copy_DtoH_T().

00255                                                                        {
00256   copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
00257 }

void copy_DtoH_T ( const void *  d_array,
void *  h_array,
const int  array_len,
const size_t  sizeofT 
)

Definition at line 200 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_DtoH_sync().

00200                                                                                                 {
00201   cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
00202 }

template<class T>
void copy_HtoD ( const T *  h_array,
T *  d_array,
int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 229 of file CudaUtils.h.

References copy_HtoD_async_T(), and stream.

00229                                                                                    {
00230   copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
00231 }

void copy_HtoD_async_T ( const void *  h_array,
void *  d_array,
int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 181 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_HtoD().

00182                                  {
00183   cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
00184 }

template<class T>
void copy_HtoD_sync ( const T *  h_array,
T *  d_array,
int  array_len 
)

Definition at line 238 of file CudaUtils.h.

References copy_HtoD_T().

00238                                                                  {
00239   copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
00240 }

void copy_HtoD_T ( const void *  h_array,
void *  d_array,
int  array_len,
const size_t  sizeofT 
)

Definition at line 186 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_HtoD_sync().

00187                            {
00188   cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
00189 }

template<class T>
void copy_PeerDtoD ( const int  src_dev,
const int  dst_dev,
const T *  d_src,
T *  d_dst,
const int  array_len,
cudaStream_t  stream = 0 
)

Definition at line 284 of file CudaUtils.h.

References copy_PeerDtoD_async_T(), and stream.

00285                                                                         {
00286   copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
00287 }

void copy_PeerDtoD_async_T ( const int  src_dev,
const int  dst_dev,
const void *  d_src,
void *  d_dst,
const int  array_len,
cudaStream_t  stream,
const size_t  sizeofT 
)

Definition at line 221 of file CudaUtils.C.

References cudaCheck.

Referenced by copy_PeerDtoD().

00223                         {
00224   cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
00225 }

void cudaDie ( const char *  msg,
cudaError_t  err = cudaSuccess 
)

Definition at line 9 of file CudaUtils.C.

References NAMD_die().

00009                                                {
00010   char host[128];
00011   gethostname(host, 128);  host[127] = 0;
00012   char devstr[128] = "";
00013   int devnum;
00014   if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00015     sprintf(devstr, " device %d", devnum);
00016   }
00017   cudaDeviceProp deviceProp;
00018   if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
00019     sprintf(devstr, " device %d pci %x:%x:%x", devnum,
00020       deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00021   }
00022   char errmsg[1024];
00023   if (err == cudaSuccess) {
00024     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
00025   } else {
00026     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));    
00027   }
00028   NAMD_die(errmsg);
00029 }

void cudaNAMD_bug ( const char *  msg  ) 

Definition at line 31 of file CudaUtils.C.

References NAMD_bug().

Referenced by CudaFFTCompute::backward(), CudaFFTCompute::forward(), gather_force(), and spread_charge().

00031 {NAMD_bug(msg);}

template<class T>
void deallocate_device ( T **  pp  ) 

Definition at line 158 of file CudaUtils.h.

References deallocate_device_T().

00158                                {
00159   deallocate_device_T((void **)pp);
00160 }

void deallocate_device_T ( void **  pp  ) 

Definition at line 90 of file CudaUtils.C.

References cudaCheck.

Referenced by deallocate_device().

00090                                     {
00091   
00092   if (*pp != NULL) {
00093     cudaCheck(cudaFree((void *)(*pp)));
00094     *pp = NULL;
00095   }
00096 
00097 }

template<class T>
void deallocate_host ( T **  pp  ) 

Definition at line 207 of file CudaUtils.h.

References deallocate_host_T().

00207                              {
00208   deallocate_host_T((void **)pp);
00209 }

void deallocate_host_T ( void **  pp  ) 

Definition at line 104 of file CudaUtils.C.

References cudaCheck.

Referenced by deallocate_host().

00104                                   {
00105   
00106   if (*pp != NULL) {
00107     cudaCheck(cudaFreeHost((void *)(*pp)));
00108     *pp = NULL;
00109   }
00110 
00111 }

template<class T>
bool reallocate_device ( T **  pp,
int *  curlen,
const int  newlen,
const float  fac = 1.0f 
)

Definition at line 175 of file CudaUtils.h.

References reallocate_device_T().

00175                                                                                     {
00176   return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
00177 }

bool reallocate_device_T ( void **  pp,
int *  curlen,
const int  newlen,
const float  fac,
const size_t  sizeofT 
)

Definition at line 123 of file CudaUtils.C.

References cudaCheck, and f.

Referenced by reallocate_device().

00123                                                                                                           {
00124 
00125   if (*pp != NULL && *curlen < newlen) {
00126     cudaCheck(cudaFree((void *)(*pp)));
00127     *pp = NULL;
00128   }
00129 
00130   if (*pp == NULL) {
00131     if (fac > 1.0f) {
00132       *curlen = (int)(((double)(newlen))*(double)fac);
00133     } else {
00134       *curlen = newlen;
00135     }
00136     cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
00137     return true;
00138   }
00139 
00140   return false;
00141 }

template<class T>
bool reallocate_host ( T **  pp,
int *  curlen,
const int  newlen,
const float  fac = 1.0f,
const unsigned int  flag = cudaHostAllocDefault 
)

Definition at line 195 of file CudaUtils.h.

References reallocate_host_T().

00196                                                                                          {
00197   return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
00198 }

bool reallocate_host_T ( void **  pp,
int *  curlen,
const int  newlen,
const float  fac,
const unsigned int  flag,
const size_t  sizeofT 
)

Definition at line 156 of file CudaUtils.C.

References cudaCheck, and f.

Referenced by reallocate_host().

00157                                                                                        {
00158 
00159   if (*pp != NULL && *curlen < newlen) {
00160     cudaCheck(cudaFreeHost((void *)(*pp)));
00161     *pp = NULL;
00162   }
00163 
00164   if (*pp == NULL) {
00165     if (fac > 1.0f) {
00166       *curlen = (int)(((double)(newlen))*(double)fac);
00167     } else {
00168       *curlen = newlen;
00169     }
00170     cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
00171     return true;
00172   }
00173 
00174   return false;
00175 }


Generated on Fri Oct 19 01:17:17 2018 for NAMD by  doxygen 1.4.7