#if defined(NAMD_CUDA) || defined(NAMD_HIP)
//
// Report a fatal CUDA runtime error and terminate the run via NAMD_die().
//
// msg - caller-supplied description of the failing operation
// err - CUDA status code; cudaSuccess means "no status code to report"
//
// The message includes host name, Charm++ PE rank, device index and PCI
// location so the failing GPU can be identified on multi-GPU clusters.
// NOTE(review): several lines of this function were lost in extraction;
// declarations reconstructed from the surviving fragments — confirm vs. VCS.
void cudaDie(const char *msg, cudaError_t err) {
  char host[128];
  gethostname(host, 128); host[127] = 0;
  char devstr[128] = "";
  int devnum;
  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
    snprintf(devstr, sizeof(devstr), " device %d", devnum);
    cudaDeviceProp deviceProp;
    if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
      // PCI triple identifies the physical GPU even if device numbering varies
      snprintf(devstr, sizeof(devstr), " device %d pci %x:%x:%x", devnum,
        deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    }
  }
  char errmsg[1024];
  // snprintf rather than sprintf: msg is caller-controlled and could
  // overflow a fixed-size buffer.
  if (err == cudaSuccess) {
    snprintf(errmsg, sizeof(errmsg),
      "CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
  } else {
    snprintf(errmsg, sizeof(errmsg),
      "CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr,
      cudaGetErrorString(err));
  }
  NAMD_die(errmsg);
}
//
// Report a fatal cuRAND error and terminate the run via NAMD_die().
//
// msg - caller-supplied description of the failing cuRAND operation
// err - cuRAND status value (int, not cudaError_t); 0/cudaSuccess means
//       "no status value to report"
//
// NOTE(review): signature and several lines reconstructed from fragments
// and the file's declaration index — confirm against VCS history.
void curandDie(const char *msg, int err) {
  char host[128];
  gethostname(host, 128); host[127] = 0;
  char devstr[128] = "";
  int devnum;
  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
    snprintf(devstr, sizeof(devstr), " device %d", devnum);
    cudaDeviceProp deviceProp;
    if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
      // PCI triple identifies the physical GPU even if device numbering varies
      snprintf(devstr, sizeof(devstr), " device %d pci %x:%x:%x", devnum,
        deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    }
  }
  char errmsg[1024];
  // cuRAND has no error-string API here, so the raw status value is printed.
  // snprintf rather than sprintf: msg is caller-controlled.
  if (err == cudaSuccess) {
    snprintf(errmsg, sizeof(errmsg),
      "CUDA cuRAND error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
  } else {
    snprintf(errmsg, sizeof(errmsg),
      "CUDA cuRAND error %s on Pe %d (%s%s): status value %d", msg, CkMyPe(),
      host, devstr, err);
  }
  NAMD_die(errmsg);
}
//
// Touch the CUDA runtime early so the device context is created while the
// process/thread still has its intended CPU affinity and NUMA placement.
// Initialization failure is reported to stderr but is not fatal here;
// later cudaCheck() calls will fail loudly if the device is unusable.
//
// NOTE(review): enclosing guard and local declarations were lost in
// extraction; reconstructed from the surviving fragments — confirm vs. VCS.
void cuda_affinity_initialize() {
  int devcnt = 0;
  cudaError_t err = cudaGetDeviceCount(&devcnt);
  if ( err == cudaSuccess && devcnt > 0 ) {
    if ( err == cudaSuccess ) err = cudaSetDevice(0);
    if ( err == cudaSuccess ) err = cudaSetDeviceFlags(cudaDeviceMapHost);
    // A tiny allocation forces context creation now; the 4 bytes are
    // intentionally never freed (one-time, process-lifetime cost).
    void *dummy = NULL;
    if ( err == cudaSuccess ) err = cudaMalloc(&dummy, 4);
  }
  if ( err != cudaSuccess ) {
    char host[128];
    gethostname(host, 128); host[127] = 0;
    fprintf(stderr,
      "CUDA initialization error on %s: %s\n", host, cudaGetErrorString(err));
  }
}
//
// Asynchronously zero ndata elements of sizeofT bytes each in device memory,
// ordered on the given stream. Byte-wise memset: valid only for an all-zero
// fill pattern.
void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT) {
  cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
}
//
// Synchronously zero ndata elements of sizeofT bytes each in device memory.
// Byte-wise memset: valid only for an all-zero fill pattern.
void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT) {
  cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
}
//
// Allocate pinned (page-locked) host memory for len elements of sizeofT
// bytes each; result is written through pp. Pinned memory enables true
// asynchronous host<->device copies.
void allocate_host_T(void **pp, const size_t len, const size_t sizeofT) {
  cudaCheck(cudaMallocHost(pp, sizeofT*len));
}
//
// Allocate unified (managed) memory for len elements of sizeofT bytes each;
// result is written through pp. Accessible from both host and device.
void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT) {
  cudaCheck(cudaMallocManaged(pp, sizeofT*len));
}
//
// Allocate device memory for len elements of sizeofT bytes each, ordered on
// the given stream when the stream-ordered allocator is available
// (CUDA 11.2+); result is written through pp.
void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream) {
#if (CUDART_VERSION >= 11020)
  cudaCheck(cudaMallocAsync(pp, sizeofT*len, stream));
#else
  // NOTE(review): the #else branch was lost in extraction; a synchronous
  // cudaMalloc fallback is the conventional choice — confirm vs. VCS.
  cudaCheck(cudaMalloc(pp, sizeofT*len));
#endif
}
//
// Free device memory previously allocated by allocate_device_T_async,
// ordered on the given stream when the stream-ordered allocator is
// available (CUDA 11.2+). Resets *pp to NULL; no-op on a NULL pointer.
//
// NOTE(review): the surrounding null-guard and pointer reset were lost in
// extraction and are reconstructed here — confirm vs. VCS.
void deallocate_device_T_async(void **pp, cudaStream_t stream) {
  if (*pp != NULL) {
#if (CUDART_VERSION >= 11020)
    cudaCheck(cudaFreeAsync((void *)(*pp), stream));
#else
    cudaCheck(cudaFree((void *)(*pp)));
#endif
    *pp = NULL;
  }
}
//
// Ensure a device buffer can hold newlen elements of sizeofT bytes each.
//
// pp     - in/out device pointer (may point to NULL for first allocation)
// curlen - in/out current capacity in elements
// newlen - required capacity in elements
// fac    - overallocation factor (> 1.0f grows capacity to newlen*fac,
//          amortizing repeated growth)
//
// Returns true if (re)allocation occurred. Existing contents are NOT
// preserved across a reallocation.
//
// NOTE(review): the free/branch/return lines were lost in extraction and
// are reconstructed from the surviving fragments — confirm vs. VCS.
bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen,
  const float fac, const size_t sizeofT) {

  if (*pp != NULL && *curlen < newlen) {
    // too small: discard the old buffer before allocating a bigger one
    cudaCheck(cudaFree((void *)(*pp)));
    *pp = NULL;
  }

  if (*pp == NULL) {
    if (fac > 1.0f) {
      *curlen = (size_t)(((double)(newlen))*(double)fac);
    } else {
      *curlen = newlen;
    }
    cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
    return true;
  }

  return false;
}
//
// Ensure a pinned host buffer can hold newlen elements of sizeofT bytes each.
//
// pp     - in/out host pointer (may point to NULL for first allocation)
// curlen - in/out current capacity in elements
// newlen - required capacity in elements
// fac    - overallocation factor (> 1.0f grows capacity to newlen*fac)
// flag   - cudaHostAlloc flags (e.g. cudaHostAllocMapped)
//
// Returns true if (re)allocation occurred. Existing contents are NOT
// preserved across a reallocation.
//
// NOTE(review): the signature's first line and the free/branch/return lines
// were lost in extraction; reconstructed from the fragments and the file's
// declaration index — confirm vs. VCS.
bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen,
  const float fac, const unsigned int flag, const size_t sizeofT) {

  if (*pp != NULL && *curlen < newlen) {
    // too small: discard the old buffer before allocating a bigger one
    cudaCheck(cudaFreeHost((void *)(*pp)));
    *pp = NULL;
  }

  if (*pp == NULL) {
    if (fac > 1.0f) {
      *curlen = (size_t)(((double)(newlen))*(double)fac);
    } else {
      *curlen = newlen;
    }
    cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
    return true;
  }

  return false;
}
//
// Asynchronous host-to-device copy of array_len elements of sizeofT bytes
// each, ordered on the given stream. For true asynchrony h_array must be
// pinned memory.
void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len,
  cudaStream_t stream, const size_t sizeofT) {
  cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len,
    cudaMemcpyHostToDevice, stream));
}
//
// Blocking host-to-device copy of array_len elements of sizeofT bytes each.
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len,
    cudaMemcpyHostToDevice));
}
//
// Asynchronous device-to-host copy of array_len elements of sizeofT bytes
// each, ordered on the given stream. For true asynchrony h_array must be
// pinned memory.
void copy_DtoH_async_T(const void *d_array, void *h_array,
  const size_t array_len, cudaStream_t stream, const size_t sizeofT) {
  cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len,
    cudaMemcpyDeviceToHost, stream));
}
//
// Blocking device-to-host copy of array_len elements of sizeofT bytes each.
void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len,
    cudaMemcpyDeviceToHost));
}
//
// Asynchronous device-to-device copy of array_len elements of sizeofT bytes
// each, ordered on the given stream.
void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len,
  cudaStream_t stream, const size_t sizeofT) {
  cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len,
    cudaMemcpyDeviceToDevice, stream));
}
//
// Blocking device-to-device copy of array_len elements of sizeofT bytes each.
void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len,
    cudaMemcpyDeviceToDevice));
}
//
// Asynchronous peer device-to-device copy of array_len elements of sizeofT
// bytes each, from src_dev to dst_dev, ordered on the given stream.
// NOTE(review): the first signature line was lost in extraction; recovered
// from the file's declaration index.
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
  const void *d_src, void *d_dst, const size_t array_len,
  cudaStream_t stream, const size_t sizeofT) {
  cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev,
    sizeofT*array_len, stream));
}
//
// Asynchronous 3D host-to-device copy of a width x height x depth region of
// sizeofT-byte elements, from offset (src_x0,src_y0,src_z0) in a pitched
// source of row length src_xsize and slice height src_ysize, to offset
// (dst_x0,dst_y0,dst_z0) in a pitched destination of row length dst_xsize
// and slice height dst_ysize. x offsets and width are in elements; the x
// components passed to CUDA are scaled to bytes, as cudaMemcpy3D requires.
// NOTE(review): the first signature line was lost in extraction; recovered
// from the file's declaration index.
void copy3D_HtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
  cudaMemcpy3DParms parms = {0};
  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  parms.kind = cudaMemcpyHostToDevice;
  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
}
//
// Asynchronous 3D device-to-host copy of a width x height x depth region of
// sizeofT-byte elements; same offset/pitch conventions as copy3D_HtoD_T
// (x offsets and width in elements, scaled to bytes for CUDA).
// NOTE(review): the first signature line was lost in extraction; recovered
// from the file's declaration index.
void copy3D_DtoH_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
  cudaMemcpy3DParms parms = {0};
  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  parms.kind = cudaMemcpyDeviceToHost;
  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
}
//
// Asynchronous 3D device-to-device copy of a width x height x depth region
// of sizeofT-byte elements; same offset/pitch conventions as copy3D_HtoD_T
// (x offsets and width in elements, scaled to bytes for CUDA).
// NOTE(review): the first signature line was lost in extraction; recovered
// from the file's declaration index.
void copy3D_DtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
  cudaMemcpy3DParms parms = {0};
  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  parms.kind = cudaMemcpyDeviceToDevice;
  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
}
//
// Asynchronous 3D peer device-to-device copy of a width x height x depth
// region of sizeofT-byte elements from src_dev to dst_dev; same offset/pitch
// conventions as copy3D_HtoD_T (x offsets and width in elements, scaled to
// bytes for CUDA). Dies immediately under HIP, which has no 3D peer copy.
//
// NOTE(review): the signature line and the preprocessor guard were lost in
// extraction; the #ifdef NAMD_HIP / #else split is reconstructed from the
// visible cudaDie call and its message — confirm vs. VCS.
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
  void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
#ifdef NAMD_HIP
  cudaDie("cudaMemcpy3DPeerAsync is not supported by HIP");
#else
  cudaMemcpy3DPeerParms parms = {0};
  parms.srcDevice = src_dev;
  parms.dstDevice = dst_dev;
  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
#endif
}
bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const unsigned int flag, const size_t sizeofT)
void cuda_affinity_initialize()
void allocate_device_T(void **pp, const size_t len, const size_t sizeofT)
void deallocate_device_T(void **pp)
void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len, const size_t sizeofT)
void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT)
void allocate_host_T(void **pp, const size_t len, const size_t sizeofT)
void copy3D_HtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void cudaDie(const char *msg, cudaError_t err)
void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT)
void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, const size_t sizeofT)
void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT)
void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream, const size_t sizeofT)
void curandDie(const char *msg, int err)
void NAMD_bug(const char *err_msg)
void copy3D_DtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream, const size_t sizeofT)
void NAMD_die(const char *err_msg)
void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream)
void cudaNAMD_bug(const char *msg)
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, const size_t sizeofT)
void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream, const size_t sizeofT)
void copy3D_PeerDtoD_T(int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void copy3D_DtoH_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT)
void deallocate_host_T(void **pp)
void deallocate_device_T_async(void **pp, cudaStream_t stream)
bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const size_t sizeofT)