NAMD — CudaUtils.h
Source listing of this file (CUDA/HIP utility macros and host-side memory/transfer helpers).
#ifndef CUDAUTILS_H
#define CUDAUTILS_H

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#include <stdio.h>
#ifdef NAMD_CUDA
#include <cuda.h>
#include <cuda_runtime.h>

#define WARPSIZE 32
typedef unsigned int WarpMask;
#else
#include <hip/hip_runtime.h>
#include "HipDefines.h"
#define WARPSIZE 64
typedef unsigned long long int WarpMask;
#endif

#define FORCE_ENERGY_TABLE_SIZE 4096

// Full-warp participation mask for the *_sync warp intrinsics.
// Must carry one bit per lane: 32 bits on CUDA, but 64 bits on HIP where
// WARPSIZE is 64 and WarpMask is a 64-bit integer (a 32-bit literal would
// silently exclude the upper 32 lanes).
#ifdef NAMD_CUDA
#define WARP_FULL_MASK 0xffffffff
#else
#define WARP_FULL_MASK 0xffffffffffffffffull
#endif

#ifdef NAMD_CUDA
#define ATOMIC_BINS 1
#else
// Spread atomic accumulation across one bin per lane to reduce contention.
#define ATOMIC_BINS WARPSIZE
#endif

// CUDA 9 removed implicit warp synchrony; use the explicit-mask *_sync
// warp intrinsics from that toolkit version on.
#if (__CUDACC_VER_MAJOR__ >= 9)
#define NAMD_USE_COOPERATIVE_GROUPS
#endif
32 
// Warp-level primitive wrappers. The MASK argument names the set of lanes
// expected to participate; it is honored by the *_sync variants (CUDA 9+)
// and ignored by the legacy variants below.
#ifdef NAMD_USE_COOPERATIVE_GROUPS
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor_sync(MASK, VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl_sync(MASK, VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P) __all_sync(MASK, P)
  #define WARP_ANY(MASK, P) __any_sync(MASK, P)
  #define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
  #define WARP_SYNC(MASK) __syncwarp(MASK)
  #define BLOCK_SYNC __barrier_sync(0)
#else
  // Pre-CUDA-9 path: the mask-less intrinsics assume implicit warp
  // synchrony, so MASK is discarded and WARP_SYNC expands to nothing.
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor(VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl(VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P) __all(P)
  #define WARP_ANY(MASK, P) __any(P)
  #define WARP_BALLOT(MASK, P) __ballot(P)
  #define WARP_SYNC(MASK)
  #define BLOCK_SYNC __syncthreads()
#endif


/*
// Define float3 + float3 operation
__host__ __device__ inline float3 operator+(const float3 a, const float3 b) {
  float3 c;
  c.x = a.x + b.x;
  c.y = a.y + b.y;
  c.z = a.z + b.z;
  return c;
}
*/
74 
//
// Cuda static assert, copied from Facebook FFT sources. Remove once nvcc has c++11
//
// The primary template is declared but never defined; only the <true>
// specialization is complete, so instantiating CudaStaticAssert<false>
// is a compile-time error.
template <bool>
struct CudaStaticAssert;

template <>
struct CudaStaticAssert<true> {
};

// Creates a temporary whose type is incomplete when (expr) is false,
// failing compilation at the assertion site.
#define cuda_static_assert(expr) \
  (CudaStaticAssert<(expr) != 0>())
// Abort with a CUDA error message (err optional); defined in CudaUtils.C.
void cudaDie(const char *msg, cudaError_t err=cudaSuccess);

// Report an internal NAMD error from CUDA code; defined in CudaUtils.C.
void cudaNAMD_bug(const char *msg);

//
// Error checking wrapper for CUDA
//
// Use snprintf so a long __FILE__ path cannot overflow the fixed buffer.
#define cudaCheck(stmt) do { \
  cudaError_t err = stmt; \
  if (err != cudaSuccess) { \
    char msg[256]; \
    snprintf(msg, sizeof(msg), "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
    cudaDie(msg, err); \
  } \
} while(0)
103 
#ifdef __CUDACC__
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// native implementation available
#else
// Guard: defining our own atomicAdd(double*) on SM60+ would clash with the
// native builtin, so refuse to compile that combination.
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
//
// Double precision atomicAdd, copied from CUDA_C_Programming_Guide.pdf (ver 5.0)
//
// Emulates a double-precision atomic add with a 64-bit integer
// compare-and-swap loop; retries until no other thread has modified
// the word between the read and the CAS. Returns the previous value.
static __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val +
                                         __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
#endif
127 
// Type-erased zero-fill primitives; defined in CudaUtils.C.
void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT);
void clear_device_array_T(void *data, const int ndata, const size_t sizeofT);

//
// Zero ndata elements of device array asynchronously on the given stream.
//
template <class T>
void clear_device_array(T *data, const int ndata, cudaStream_t stream=0) {
  clear_device_array_async_T(data, ndata, stream, sizeof(T));
}

//
// Zero ndata elements of device array with a synchronous call.
//
template <class T>
void clear_device_array_sync(T *data, const int ndata) {
  clear_device_array_T(data, ndata, sizeof(T));
}
140 
void allocate_host_T(void **pp, const int len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate page-locked host memory
// pp = memory pointer
// len = length of the array
//
template <class T>
void allocate_host(T **pp, const int len) {
  allocate_host_T((void **)pp, len, sizeof(T));
}
152 
153 
void allocate_device_T(void **pp, const int len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate gpu memory
// pp = memory pointer
// len = length of the array
//
template <class T>
void allocate_device(T **pp, const int len) {
  allocate_device_T((void **)pp, len, sizeof(T));
}
165 
166 
void deallocate_device_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate gpu memory
// pp = memory pointer (set to NULL by the implementation in CudaUtils.C)
//
template <class T>
void deallocate_device(T **pp) {
  deallocate_device_T((void **)pp);
}
//----------------------------------------------------------------------------------------
178 
bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate device memory
// pp = memory pointer
// curlen = current length of the array (updated on reallocation)
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
//
// returns true if reallocation happened
//
template <class T>
bool reallocate_device(T **pp, int *curlen, const int newlen, const float fac=1.0f) {
  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
}
//----------------------------------------------------------------------------------------
194 //----------------------------------------------------------------------------------------
bool reallocate_host_T(void **pp, int *curlen, const int newlen, const float fac,
                       const unsigned int flag, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate pinned host memory
// pp = memory pointer
// curlen = current length of the array (updated on reallocation)
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
// flag = allocation type:
//        cudaHostAllocDefault = default type, emulates cudaMallocHost
//        cudaHostAllocMapped  = maps allocation into CUDA address space
//
// returns true if reallocation happened
//
template <class T>
bool reallocate_host(T **pp, int *curlen, const int newlen,
                     const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) {
  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
}
215 
void deallocate_host_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate page-locked host memory
// pp = memory pointer (set to NULL by the implementation in CudaUtils.C)
//
template <class T>
void deallocate_host(T **pp) {
  deallocate_host_T((void **)pp);
}
//----------------------------------------------------------------------------------------
227 
// Type-erased transfer primitives; defined in CudaUtils.C.
// _async_ variants enqueue on the given stream; plain variants block.
void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
                 const size_t sizeofT);
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT);

void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device (asynchronous on the given stream)
//
template <class T>
void copy_HtoD(const T *h_array, T *d_array, int array_len, cudaStream_t stream=0) {
  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device using synchronous calls
//
template <class T>
void copy_HtoD_sync(const T *h_array, T *d_array, int array_len) {
  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host (asynchronous on the given stream)
//
template <class T>
void copy_DtoH(const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0) {
  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host using synchronous calls
//
template <class T>
void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len) {
  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device (asynchronous on the given stream)
//
// NOTE: destination renamed h_dst -> d_dst; it is a device pointer.
template <class T>
void copy_DtoD(const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) {
  copy_DtoD_async_T(d_src, d_dst, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device using synchronous calls
//
template <class T>
void copy_DtoD_sync(const T *d_src, T *d_dst, const int array_len) {
  copy_DtoD_T(d_src, d_dst, array_len, sizeof(T));
}
290 
//----------------------------------------------------------------------------------------
//
// Copies memory between two peer devices Device -> Device
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
                           const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
                           const size_t sizeofT);

template <class T>
void copy_PeerDtoD(const int src_dev, const int dst_dev,
                   const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0) {
  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
}
304 
//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Host -> Device
// (x0,y0,z0) = element offsets into the source/destination volumes;
// xsize/ysize = allocated extents; width/height/depth = region to copy.
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_HtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}
333 
//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Host
// (x0,y0,z0) = element offsets into the source/destination volumes;
// xsize/ysize = allocated extents; width/height/depth = region to copy.
//
void copy3D_DtoH_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_DtoH_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}
362 
//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Device
// (x0,y0,z0) = element offsets into the source/destination volumes;
// xsize/ysize = allocated extents; width/height/depth = region to copy.
//
void copy3D_DtoD_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_DtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}
391 
//----------------------------------------------------------------------------------------
//
// Copies 3D memory block between two peer devices Device -> Device
// (x0,y0,z0) = element offsets into the source/destination volumes;
// xsize/ysize = allocated extents; width/height/depth = region to copy.
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
                       void* src_data, void* dst_data,
                       int src_x0, int src_y0, int src_z0,
                       size_t src_xsize, size_t src_ysize,
                       int dst_x0, int dst_y0, int dst_z0,
                       size_t dst_xsize, size_t dst_ysize,
                       size_t width, size_t height, size_t depth,
                       size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
                     T* src_data, T* dst_data,
                     int src_x0, int src_y0, int src_z0,
                     size_t src_xsize, size_t src_ysize,
                     int dst_x0, int dst_y0, int dst_z0,
                     size_t dst_xsize, size_t dst_ysize,
                     size_t width, size_t height, size_t depth,
                     cudaStream_t stream=0) {
  copy3D_PeerDtoD_T(src_dev, dst_dev,
                    src_data, dst_data,
                    src_x0, src_y0, src_z0,
                    src_xsize, src_ysize,
                    dst_x0, dst_y0, dst_z0,
                    dst_xsize, dst_ysize,
                    width, height, depth,
                    sizeof(T), stream);
}
423 
#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)

#endif // CUDAUTILS_H
void copy3D_DtoD(T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
Definition: CudaUtils.h:376
void deallocate_device_T(void **pp)
Definition: CudaUtils.C:84
void copy3D_DtoH(T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
Definition: CudaUtils.h:347
void copy_HtoD(const T *h_array, T *d_array, int array_len, cudaStream_t stream=0)
Definition: CudaUtils.h:245
void deallocate_host(T **pp)
Definition: CudaUtils.h:223
void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT)
Definition: CudaUtils.C:194
void copy3D_HtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:225
void allocate_device(T **pp, const int len)
Definition: CudaUtils.h:162
void deallocate_device(T **pp)
Definition: CudaUtils.h:174
void allocate_device_T(void **pp, const int len, const size_t sizeofT)
Definition: CudaUtils.C:75
void copy_DtoH(const T *d_array, T *h_array, const int array_len, cudaStream_t stream=0)
Definition: CudaUtils.h:263
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT)
Definition: CudaUtils.C:207
void copy3D_HtoD(T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
Definition: CudaUtils.h:318
void allocate_host(T **pp, const int len)
Definition: CudaUtils.h:149
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:215
__thread cudaStream_t stream
void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:189
void copy_DtoD(const T *d_src, T *h_dst, const int array_len, cudaStream_t stream=0)
Definition: CudaUtils.h:279
void copy_DtoD_sync(const T *d_src, T *h_dst, const int array_len)
Definition: CudaUtils.h:287
unsigned int WarpMask
Definition: CudaUtils.h:11
bool reallocate_host_T(void **pp, int *curlen, const int newlen, const float fac, const unsigned int flag, const size_t sizeofT)
Definition: CudaUtils.C:150
void copy3D_PeerDtoD(int src_dev, int dst_dev, T *src_data, T *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, cudaStream_t stream=0)
Definition: CudaUtils.h:406
bool reallocate_host(T **pp, int *curlen, const int newlen, const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault)
Definition: CudaUtils.h:211
void copy3D_DtoD_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:285
void clear_device_array_sync(T *data, const int ndata)
Definition: CudaUtils.h:137
void copy_HtoD_T(const void *h_array, void *d_array, int array_len, const size_t sizeofT)
Definition: CudaUtils.C:180
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
Definition: CudaUtils.C:9
void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:51
void copy_HtoD_sync(const T *h_array, T *d_array, int array_len)
Definition: CudaUtils.h:254
void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len)
Definition: CudaUtils.h:271
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:31
void clear_device_array(T *data, const int ndata, cudaStream_t stream=0)
Definition: CudaUtils.h:132
void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:202
bool reallocate_device(T **pp, int *curlen, const int newlen, const float fac=1.0f)
Definition: CudaUtils.h:191
bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT)
Definition: CudaUtils.C:117
void copy_PeerDtoD(const int src_dev, const int dst_dev, const T *d_src, T *d_dst, const int array_len, cudaStream_t stream=0)
Definition: CudaUtils.h:300
void copy3D_PeerDtoD_T(int src_dev, int dst_dev, void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:315
void copy3D_DtoH_T(void *src_data, void *dst_data, int src_x0, int src_y0, int src_z0, size_t src_xsize, size_t src_ysize, int dst_x0, int dst_y0, int dst_z0, size_t dst_xsize, size_t dst_ysize, size_t width, size_t height, size_t depth, size_t sizeofT, cudaStream_t stream)
Definition: CudaUtils.C:255
void clear_device_array_T(void *data, const int ndata, const size_t sizeofT)
Definition: CudaUtils.C:55
void deallocate_host_T(void **pp)
Definition: CudaUtils.C:98
void allocate_host_T(void **pp, const int len, const size_t sizeofT)
Definition: CudaUtils.C:65
void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream, const size_t sizeofT)
Definition: CudaUtils.C:175