NAMD
CudaUtils.C
Go to the documentation of this file.
1 
2 #include <stdio.h>
3 #include "common.h"
4 #include "charm++.h"
5 #include "CudaUtils.h"
6 
7 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
8 
9 void cudaDie(const char *msg, cudaError_t err) {
10  char host[128];
11  gethostname(host, 128); host[127] = 0;
12  char devstr[128] = "";
13  int devnum;
14  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
15  sprintf(devstr, " device %d", devnum);
16  }
17  cudaDeviceProp deviceProp;
18  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
19  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
20  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
21  }
22  char errmsg[1024];
23  if (err == cudaSuccess) {
24  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
25  } else {
26  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
27  }
28  NAMD_die(errmsg);
29 }
30 
31 void curandDie(const char *msg, int err) {
32  char host[128];
33  gethostname(host, 128); host[127] = 0;
34  char devstr[128] = "";
35  int devnum;
36  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
37  sprintf(devstr, " device %d", devnum);
38  }
39  cudaDeviceProp deviceProp;
40  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
41  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
42  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
43  }
44  char errmsg[1024];
45  if (err == cudaSuccess) {
46  sprintf(errmsg,"CUDA cuRAND error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
47  } else {
48  sprintf(errmsg,"CUDA cuRAND error %s on Pe %d (%s%s): status value %d", msg, CkMyPe(), host, devstr, err);
49  }
50  NAMD_die(errmsg);
51 }
52 
53 void cudaNAMD_bug(const char *msg) {NAMD_bug(msg);}
54 
56  int devcnt = 0;
57  cudaError_t err = cudaGetDeviceCount(&devcnt);
58  if ( devcnt == 1 ) { // only one device so it must be ours
59  int *dummy;
60  if ( err == cudaSuccess ) err = cudaSetDevice(0);
61  if ( err == cudaSuccess ) err = cudaSetDeviceFlags(cudaDeviceMapHost);
62  if ( err == cudaSuccess ) err = cudaMalloc(&dummy, 4);
63  }
64  if ( err != cudaSuccess ) {
65  char host[128];
66  gethostname(host, 128); host[127] = 0;
67  fprintf(stderr,"CUDA initialization error on %s: %s\n", host, cudaGetErrorString(err));
68  }
69 }
70 
71 //----------------------------------------------------------------------------------------
72 
73 void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT) {
74  cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
75 }
76 
77 void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT) {
78  cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
79 }
80 
81 //----------------------------------------------------------------------------------------
82 //
83 // Allocate page-locked host memory
84 // pp = memory pointer
85 // len = length of the array
86 //
87 void allocate_host_T(void **pp, const size_t len, const size_t sizeofT) {
88  cudaCheck(cudaMallocHost(pp, sizeofT*len));
89 }
90 
91 //----------------------------------------------------------------------------------------
92 //
93 // Allocate gpu memory
94 // pp = memory pointer
95 // len = length of the array
96 //
97 void allocate_device_T(void **pp, const size_t len, const size_t sizeofT) {
98  cudaCheck(cudaMalloc(pp, sizeofT*len));
99 }
100 
101 void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT){
102  cudaCheck(cudaMallocManaged(pp, sizeofT*len));
103 }
104 
105 void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream){
106 #if (CUDART_VERSION >= 11020)
107  cudaCheck(cudaMallocAsync(pp, sizeofT*len, stream));
108 #else
109  allocate_device_T(pp, len, sizeofT);
110 #endif
111 }
112 
113 //----------------------------------------------------------------------------------------
114 //
115 // Deallocate gpu memory
116 // pp = memory pointer
117 //
118 void deallocate_device_T(void **pp) {
119 
120  if (*pp != NULL) {
121  cudaCheck(cudaFree((void *)(*pp)));
122  *pp = NULL;
123  }
124 
125 }
126 
127 void deallocate_device_T_async(void **pp, cudaStream_t stream) {
128 #if (CUDART_VERSION >= 11020)
129  if (*pp != NULL) {
130  cudaCheck(cudaFreeAsync((void *)(*pp), stream));
131  *pp = NULL;
132  }
133 #else
135 #endif
136 }
137 //----------------------------------------------------------------------------------------
138 //
139 // Deallocate page-locked host memory
140 // pp = memory pointer
141 //
142 void deallocate_host_T(void **pp) {
143 
144  if (*pp != NULL) {
145  cudaCheck(cudaFreeHost((void *)(*pp)));
146  *pp = NULL;
147  }
148 
149 }
150 
151 //----------------------------------------------------------------------------------------
152 //
153 // Allocate & re-allocate device memory
154 // pp = memory pointer
155 // curlen = current length of the array
156 // newlen = new required length of the array
157 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
158 //
159 // returns true if reallocation happened
160 //
161 bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const size_t sizeofT) {
162 
163  if (*pp != NULL && *curlen < newlen) {
164  cudaCheck(cudaFree((void *)(*pp)));
165  *pp = NULL;
166  }
167 
168  if (*pp == NULL) {
169  if (fac > 1.0f) {
170  *curlen = (size_t)(((double)(newlen))*(double)fac);
171  } else {
172  *curlen = newlen;
173  }
174  cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
175  return true;
176  }
177 
178  return false;
179 }
180 
181 //----------------------------------------------------------------------------------------
182 //
183 // Allocate & re-allocate page-locked host memory
184 // pp = memory pointer
185 // curlen = current length of the array
186 // newlen = new required length of the array
187 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
188 // flag = allocation type:
189 // cudaHostAllocDefault = default type, emulates cudaMallocHost
190 // cudaHostAllocMapped = maps allocation into CUDA address space
191 //
192 // returns true if reallocation happened
193 //
194 bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen,
195  const float fac, const unsigned int flag, const size_t sizeofT) {
196 
197  if (*pp != NULL && *curlen < newlen) {
198  cudaCheck(cudaFreeHost((void *)(*pp)));
199  *pp = NULL;
200  }
201 
202  if (*pp == NULL) {
203  if (fac > 1.0f) {
204  *curlen = (size_t)(((double)(newlen))*(double)fac);
205  } else {
206  *curlen = newlen;
207  }
208  cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
209  return true;
210  }
211 
212  return false;
213 }
214 
215 //----------------------------------------------------------------------------------------
216 //
217 // Copies memory Host -> Device
218 //
219 void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream,
220  const size_t sizeofT) {
221  cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
222 }
223 
224 void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len,
225  const size_t sizeofT) {
226  cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
227 }
228 
229 //----------------------------------------------------------------------------------------
230 //
231 // Copies memory Device -> Host
232 //
233 void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream,
234  const size_t sizeofT) {
235  cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
236 }
237 
238 void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT) {
239  cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
240 }
241 
242 //----------------------------------------------------------------------------------------
243 //
244 // Copies memory Device -> Device
245 //
246 void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
247  const size_t sizeofT) {
248  cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
249 }
250 
251 void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len, const size_t sizeofT) {
252  cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
253 }
254 
255 //----------------------------------------------------------------------------------------
256 //
257 // Copies memory between two devices Device -> Device
258 //
259 void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
260  const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
261  const size_t sizeofT) {
262  cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
263 }
264 
265 //----------------------------------------------------------------------------------------
266 //
267 // Copies 3D memory block Host -> Device
268 //
269 void copy3D_HtoD_T(void* src_data, void* dst_data,
270  int src_x0, int src_y0, int src_z0,
271  size_t src_xsize, size_t src_ysize,
272  int dst_x0, int dst_y0, int dst_z0,
273  size_t dst_xsize, size_t dst_ysize,
274  size_t width, size_t height, size_t depth,
275  size_t sizeofT, cudaStream_t stream) {
276  cudaMemcpy3DParms parms = {0};
277 
278  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
279  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
280 
281  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
282  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
283 
284  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
285  parms.kind = cudaMemcpyHostToDevice;
286 
287  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
288 }
289 
290 //----------------------------------------------------------------------------------------
291 //
292 // Copies 3D memory block Device -> Host
293 //
294 void copy3D_DtoH_T(void* src_data, void* dst_data,
295  int src_x0, int src_y0, int src_z0,
296  size_t src_xsize, size_t src_ysize,
297  int dst_x0, int dst_y0, int dst_z0,
298  size_t dst_xsize, size_t dst_ysize,
299  size_t width, size_t height, size_t depth,
300  size_t sizeofT, cudaStream_t stream) {
301  cudaMemcpy3DParms parms = {0};
302 
303  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
304  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
305 
306  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
307  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
308 
309  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
310  parms.kind = cudaMemcpyDeviceToHost;
311 
312  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
313 }
314 
315 //----------------------------------------------------------------------------------------
316 //
317 // Copies 3D memory block Device -> Device
318 //
319 void copy3D_DtoD_T(void* src_data, void* dst_data,
320  int src_x0, int src_y0, int src_z0,
321  size_t src_xsize, size_t src_ysize,
322  int dst_x0, int dst_y0, int dst_z0,
323  size_t dst_xsize, size_t dst_ysize,
324  size_t width, size_t height, size_t depth,
325  size_t sizeofT, cudaStream_t stream) {
326  cudaMemcpy3DParms parms = {0};
327 
328  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
329  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
330 
331  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
332  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
333 
334  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
335  parms.kind = cudaMemcpyDeviceToDevice;
336 
337  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
338 }
339 
340 //----------------------------------------------------------------------------------------
341 //
342 // Copies 3D memory block between devices Device -> Device
343 //
344 void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
345  void* src_data, void* dst_data,
346  int src_x0, int src_y0, int src_z0,
347  size_t src_xsize, size_t src_ysize,
348  int dst_x0, int dst_y0, int dst_z0,
349  size_t dst_xsize, size_t dst_ysize,
350  size_t width, size_t height, size_t depth,
351  size_t sizeofT, cudaStream_t stream) {
352 #ifdef NAMD_HIP
353 // TODO-HIP: Is a workaround possible? cudaMemcpy3D+cudaMemcpyPeer+cudaMemcpy3D
354  cudaDie("cudaMemcpy3DPeerAsync is not supported by HIP");
355 #else
356  cudaMemcpy3DPeerParms parms = {0};
357 
358  parms.srcDevice = src_dev;
359  parms.dstDevice = dst_dev;
360 
361  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
362  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
363 
364  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
365  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
366 
367  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
368 
369  cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
370 #endif
371 }
372 #endif // NAMD_CUDA
bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const unsigned int flag, const size_t sizeofT)
Definition: CudaUtils.C:194
void cuda_affinity_initialize()
Definition: CudaUtils.C:55
void allocate_device_T(void **pp, const size_t len, const size_t sizeofT)
Definition: CudaUtils.C:97
void deallocate_device_T(void **pp)
Definition: CudaUtils.C:118
void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len, const size_t sizeofT)
Definition: CudaUtils.C:251
void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT)
Definition: CudaUtils.C:238
void allocate_host_T(void **pp, const size_t len, const size_t sizeofT)
Definition: CudaUtils.C:87
void copy3D_HtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:269
void cudaDie(const char *msg, cudaError_t err)
Definition: CudaUtils.C:9
void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:73
void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:219
void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT)
Definition: CudaUtils.C:101
void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:233
void curandDie(const char *msg, int err)
Definition: CudaUtils.C:31
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void copy3D_DtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:319
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:259
void NAMD_die(const char *err_msg)
Definition: common.C:147
void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:105
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:53
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, const size_t sizeofT)
Definition: CudaUtils.C:224
void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:246
void copy3D_PeerDtoD_T(int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:344
void copy3D_DtoH_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:294
void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT)
Definition: CudaUtils.C:77
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
void deallocate_host_T(void **pp)
Definition: CudaUtils.C:142
void deallocate_device_T_async(void **pp, cudaStream_t stream)
Definition: CudaUtils.C:127
bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const size_t sizeofT)
Definition: CudaUtils.C:161