NAMD
CudaUtils.C
#include <stdio.h>
#include "common.h"
#include "charm++.h"
#include "CudaUtils.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

void cudaDie(const char *msg, cudaError_t err) {
  char host[128];
  gethostname(host, 128); host[127] = 0;
  char devstr[128] = "";
  int devnum;
  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
    sprintf(devstr, " device %d", devnum);
  }
  cudaDeviceProp deviceProp;
  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
    sprintf(devstr, " device %d pci %x:%x:%x", devnum,
      deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
  }
  char errmsg[1024];
  if (err == cudaSuccess) {
    sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
  } else {
    sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
  }
  NAMD_die(errmsg);
}

void cudaNAMD_bug(const char *msg) { NAMD_bug(msg); }
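// Usage note (not part of the original source): the error paths above are normally
// reached through the cudaCheck() macro from CudaUtils.h, which evaluates a CUDA
// runtime call and forwards any failure to cudaDie(). A minimal sketch of the
// intended pattern, assuming the macro's usual form:
//
//   cudaCheck(cudaStreamSynchronize(stream));                          // dies with host/device info on failure
//   cudaCheck(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost));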

void cuda_affinity_initialize() {
  int devcnt = 0;
  cudaError_t err = cudaGetDeviceCount(&devcnt);
  if ( devcnt == 1 ) {  // only one device so it must be ours
    int *dummy;
    if ( err == cudaSuccess ) err = cudaSetDevice(0);
    if ( err == cudaSuccess ) err = cudaSetDeviceFlags(cudaDeviceMapHost);
    if ( err == cudaSuccess ) err = cudaMalloc(&dummy, 4);
  }
  if ( err != cudaSuccess ) {
    char host[128];
    gethostname(host, 128); host[127] = 0;
    fprintf(stderr,"CUDA initialization error on %s: %s\n", host, cudaGetErrorString(err));
  }
}

//----------------------------------------------------------------------------------------

void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT) {
  cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
}

void clear_device_array_T(void *data, const int ndata, const size_t sizeofT) {
  cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
}

//----------------------------------------------------------------------------------------
//
// Allocate page-locked host memory
// pp = memory pointer
// len = length of the array
//
void allocate_host_T(void **pp, const int len, const size_t sizeofT) {
  cudaCheck(cudaMallocHost(pp, sizeofT*len));
}

//----------------------------------------------------------------------------------------
//
// Allocate gpu memory
// pp = memory pointer
// len = length of the array
//
void allocate_device_T(void **pp, const int len, const size_t sizeofT) {
  cudaCheck(cudaMalloc(pp, sizeofT*len));
}

//----------------------------------------------------------------------------------------
//
// Deallocate gpu memory
// pp = memory pointer
//
void deallocate_device_T(void **pp) {
  if (*pp != NULL) {
    cudaCheck(cudaFree((void *)(*pp)));
    *pp = NULL;
  }
}

//----------------------------------------------------------------------------------------
//
// Deallocate page-locked host memory
// pp = memory pointer
//
void deallocate_host_T(void **pp) {
  if (*pp != NULL) {
    cudaCheck(cudaFreeHost((void *)(*pp)));
    *pp = NULL;
  }
}

//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate device memory
// pp = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
//
// returns true if reallocation happened
//
bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT) {

  if (*pp != NULL && *curlen < newlen) {
    cudaCheck(cudaFree((void *)(*pp)));
    *pp = NULL;
  }

  if (*pp == NULL) {
    if (fac > 1.0f) {
      *curlen = (int)(((double)(newlen))*(double)fac);
    } else {
      *curlen = newlen;
    }
    cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
    return true;
  }

  return false;
}
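// Usage sketch (not from the original source): a typical grow-only device buffer
// built on reallocate_device_T, with an illustrative 1.5x over-allocation factor.
// The pointer and length names here are hypothetical.
//
//   static float *d_forces = NULL;
//   static int d_forces_len = 0;
//   // Re-allocates only when natoms exceeds the current capacity.
//   reallocate_device_T((void **)&d_forces, &d_forces_len, natoms, 1.5f, sizeof(float));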

//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate page-locked host memory
// pp = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
// flag = allocation type:
//        cudaHostAllocDefault = default type, emulates cudaMallocHost
//        cudaHostAllocMapped  = maps allocation into CUDA address space
//
// returns true if reallocation happened
//
bool reallocate_host_T(void **pp, int *curlen, const int newlen,
  const float fac, const unsigned int flag, const size_t sizeofT) {

  if (*pp != NULL && *curlen < newlen) {
    cudaCheck(cudaFreeHost((void *)(*pp)));
    *pp = NULL;
  }

  if (*pp == NULL) {
    if (fac > 1.0f) {
      *curlen = (int)(((double)(newlen))*(double)fac);
    } else {
      *curlen = newlen;
    }
    cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
    return true;
  }

  return false;
}
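// Usage sketch (assumption, not part of the original source): requesting a mapped
// host allocation and obtaining its device-side alias. Mapped pointers require the
// device to have been initialized with the cudaDeviceMapHost flag (see
// cuda_affinity_initialize above); the buffer names are hypothetical.
//
//   static int *h_flags = NULL;
//   static int h_flags_len = 0;
//   reallocate_host_T((void **)&h_flags, &h_flags_len, n, 1.2f, cudaHostAllocMapped, sizeof(int));
//   int *d_flags;
//   cudaCheck(cudaHostGetDevicePointer((void **)&d_flags, h_flags, 0));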

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device
//
void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
}

void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host
//
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
}

void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT) {
  cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device
//
void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
}

void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) {
  cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
}
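// Usage sketch (not from the original source): the async variants only overlap with
// other work when the host buffer is page-locked (e.g. from allocate_host_T) and the
// caller synchronizes the stream before reading the result. Names are hypothetical.
//
//   copy_DtoH_async_T(d_energy, h_energy, nvals, stream, sizeof(double));
//   cudaCheck(cudaStreamSynchronize(stream));   // h_energy is valid after this point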

//----------------------------------------------------------------------------------------
//
// Copies memory between two devices Device -> Device
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
  const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
  const size_t sizeofT) {
  cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
}
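// Note (assumption, not part of the original source): peer copies are fastest when
// peer access has been enabled between the two devices; otherwise the runtime may
// stage the transfer through host memory. A minimal sketch of the check:
//
//   int can_access = 0;
//   cudaCheck(cudaDeviceCanAccessPeer(&can_access, dst_dev, src_dev));   // can dst_dev reach src_dev directly?
//   if (can_access) cudaCheck(cudaDeviceEnablePeerAccess(src_dev, 0));   // called once, from dst_dev's context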

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Host -> Device
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
  cudaMemcpy3DParms parms = {0};

  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);

  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);

  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  parms.kind = cudaMemcpyHostToDevice;

#ifdef NAMD_CUDA
  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
#else
  // TODO-HIP: remove ifdef when HIP implements cudaMemcpy3DAsync
  cudaCheck(hipMemcpy3D(&parms));
#endif
}
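// Worked example (assumption, not part of the original source): copying a 4x4x4
// sub-block of floats from offset (2,2,2) of a 16x16x16 host array into the origin
// of a 4x4x4 device array. Pointer names are hypothetical.
//
//   copy3D_HtoD_T(h_grid, d_block,
//                 2, 2, 2,           // source offset (x0,y0,z0) in elements
//                 16, 16,            // source x and y allocation sizes
//                 0, 0, 0,           // destination offset
//                 4, 4,              // destination x and y allocation sizes
//                 4, 4, 4,           // width, height, depth of the copied block
//                 sizeof(float), stream);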

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Host
//
void copy3D_DtoH_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
  cudaMemcpy3DParms parms = {0};

  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);

  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);

  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  parms.kind = cudaMemcpyDeviceToHost;

#ifdef NAMD_CUDA
  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
#else
  // TODO-HIP: remove ifdef when HIP implements cudaMemcpy3DAsync
  cudaCheck(hipMemcpy3D(&parms));
#endif
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Device
//
void copy3D_DtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
  cudaMemcpy3DParms parms = {0};

  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);

  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);

  parms.extent = make_cudaExtent(sizeofT*width, height, depth);
  parms.kind = cudaMemcpyDeviceToDevice;

#ifdef NAMD_CUDA
  cudaCheck(cudaMemcpy3DAsync(&parms, stream));
#else
  // TODO-HIP: remove ifdef when HIP implements cudaMemcpy3DAsync
  cudaCheck(hipMemcpy3D(&parms));
#endif
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block between two devices Device -> Device
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
  void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, cudaStream_t stream) {
#ifdef NAMD_HIP
  // TODO-HIP: Is a workaround possible? cudaMemcpy3D+cudaMemcpyPeer+cudaMemcpy3D
  cudaDie("cudaMemcpy3DPeerAsync is not supported by HIP");
#else
  cudaMemcpy3DPeerParms parms = {0};

  parms.srcDevice = src_dev;
  parms.dstDevice = dst_dev;

  parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
  parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);

  parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
  parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);

  parms.extent = make_cudaExtent(sizeofT*width, height, depth);

  cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
#endif
}

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)