NAMD
CudaUtils.h
#ifndef CUDAUTILS_H
#define CUDAUTILS_H

#ifdef NAMD_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif // NAMD_CUDA

#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif // NAMD_HIP

#include <stdio.h>
#include "HipDefines.h"

#ifdef NAMD_CUDA
#define WARPSIZE 32
#define BOUNDINGBOXSIZE 32
typedef unsigned int WarpMask;
#endif // NAMD_CUDA

#ifdef NAMD_HIP

/* JM: Definition of the warp size: we need to be careful here, since WARPSIZE
   is a compile-time constant, but on HIP we can target both RDNA and CDNA
   devices, which may be either wave32 or wave64.
*/
#ifdef NAMD_NAVI_BUILD
#define WARPSIZE 32
#else
#define WARPSIZE 64
#endif

#define BOUNDINGBOXSIZE 32
typedef unsigned int WarpMask;
#endif // NAMD_HIP

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define FORCE_ENERGY_TABLE_SIZE 4096

#define COPY_CUDATENSOR(S,D) \
  D.xx = S.xx; \
  D.xy = S.xy; \
  D.xz = S.xz; \
  D.yx = S.yx; \
  D.yy = S.yy; \
  D.yz = S.yz; \
  D.zx = S.zx; \
  D.zy = S.zy; \
  D.zz = S.zz

#define COPY_CUDAVECTOR(S,D) \
  D.x = S.x; \
  D.y = S.y; \
  D.z = S.z

#define PRINT_CUDATENSOR(T, SS) \
  SS << T.xx << " " << T.xy << " " << T.xz << " " << T.yx << " " << \
  T.yy << " " << T.yz << " " << T.zx << " " << T.zy << " " << T.zz << \
  std::endl;

#ifdef SHORTREALS
typedef float BigReal;
#else
typedef double BigReal;
#endif

#ifdef NAMD_CUDA
#define ATOMIC_BINS 1
#else
#define ATOMIC_BINS WARPSIZE
#endif

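// Illustrative sketch (not part of the original header): ATOMIC_BINS sizes an
// array of partial accumulators so that atomic traffic can be spread across
// bins. On HIP (ATOMIC_BINS == WARPSIZE) each lane can target its own bin; on
// CUDA (ATOMIC_BINS == 1) this degenerates to a single accumulator. The kernel
// and buffer names below are hypothetical.
//
//   __global__ void accumulateEnergy(const double* in, double* bins, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) atomicAdd(&bins[i % ATOMIC_BINS], in[i]);
//     // the host (or a final kernel) sums the ATOMIC_BINS partial results
//   }
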
struct cudaTensor{
  BigReal xx;
  BigReal xy;
  BigReal xz;
  BigReal yx;
  BigReal yy;
  BigReal yz;
  BigReal zx;
  BigReal zy;
  BigReal zz;

  __forceinline__ __device__
  cudaTensor& operator+=(const cudaTensor& other) {
    this->xx += other.xx;
    this->xy += other.xy;
    this->xz += other.xz;
    this->yx += other.yx;
    this->yy += other.yy;
    this->yz += other.yz;
    this->zx += other.zx;
    this->zy += other.zy;
    this->zz += other.zz;
    return *this;
  }

  __forceinline__ __device__
  friend cudaTensor operator+(const cudaTensor& t1, const cudaTensor& t2) {
    cudaTensor t;
    t.xx = t1.xx + t2.xx;
    t.xy = t1.xy + t2.xy;
    t.xz = t1.xz + t2.xz;
    t.yx = t1.yx + t2.yx;
    t.yy = t1.yy + t2.yy;
    t.yz = t1.yz + t2.yz;
    t.zx = t1.zx + t2.zx;
    t.zy = t1.zy + t2.zy;
    t.zz = t1.zz + t2.zz;
    return t;
  }
};
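// Usage sketch (hypothetical, not from the NAMD sources): cudaTensor supports
// operator+= and operator+ in device code, so per-thread tensors (e.g., virial
// contributions) can be combined directly.
//
//   __global__ void sumTensors(const cudaTensor* in, cudaTensor* out, int n) {
//     if (blockIdx.x == 0 && threadIdx.x == 0) {
//       cudaTensor acc = in[0];
//       for (int i = 1; i < n; i++) acc += in[i];  // uses operator+=
//       *out = acc;
//     }
//   }
//
// COPY_CUDATENSOR(S,D) above performs the same member-wise copy between any
// two tensor-like types exposing xx..zz fields (e.g., host Tensor <-> cudaTensor).
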

struct cudaVector{
  BigReal x;
  BigReal y;
  BigReal z;
};

struct CudaMInfo{
  int destPatchID[3][3][3];
};
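// The 3x3x3 array presumably holds the destination patch IDs for a patch's
// spatial neighborhood. A hedged accessor sketch, assuming offsets dx, dy, dz
// in {-1, 0, 1} (the helper below is hypothetical):
//
//   __host__ __device__ inline int neighborPatch(const CudaMInfo& m,
//                                                int dx, int dy, int dz) {
//     return m.destPatchID[dx + 1][dy + 1][dz + 1];
//   }
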

#define FEP_BONDED_CUDA_DEBUG
#ifdef FEP_BONDED_CUDA_DEBUG
#include <iostream>
#endif

// XXX The macro definitions below need to be refactored.
// At present, we are supporting three options:
// - CUDA versions >= 9
// - CUDA versions < 9
// - HIP
//
// These three options should be explicitly represented in the
// macro definitions below. We should definitely not be expanding
// through multiple chained macros, e.g., NAMD_WARP_SYNC expanding
// to WARP_SYNC. Finally, these macros should be consistently named
// and applied throughout the code to avoid repeated tests against
// CUDA and HIP macros, e.g., MigrationCUDAKernel.cu lines 1567-1571.
//
// Once we fix the macros below and make their use consistent,
// we can purge this comment block.

#define WARP_FULL_MASK 0xffffffff
// TODO:HIP verify
//#define WARP_FULL_MASK 0xffffffffffffffff

#if (__CUDACC_VER_MAJOR__ >= 9)
#define NAMD_USE_COOPERATIVE_GROUPS
#endif

#ifdef NAMD_USE_COOPERATIVE_GROUPS
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor_sync(MASK, VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down_sync(MASK, VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl_sync(MASK, VAR, LANE, SIZE)
  #define WARP_ALL(MASK, P) __all_sync(MASK, P)
  #define WARP_ANY(MASK, P) __any_sync(MASK, P)
  #define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
  #define WARP_SYNC(MASK) __syncwarp(MASK)
  #define BLOCK_SYNC __barrier_sync(0)
#else
  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
    __shfl_xor(VAR, LANE, SIZE)
  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
    __shfl_up(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
    __shfl_down(VAR, DELTA, SIZE)
  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
    __shfl(VAR, LANE, SIZE)
#ifdef NAMD_HIP
  // these collide with a few rocprim definitions, so we need to redefine them as NAMD_WARP_*
  #define NAMD_WARP_ALL(MASK, P) __all(P)
  #define NAMD_WARP_ANY(MASK, P) __any(P)
  #define NAMD_WARP_BALLOT(MASK, P) __ballot(P)
  #define NAMD_WARP_SYNC(MASK)
  #define BLOCK_SYNC __syncthreads()
#else
  #define WARP_ALL(MASK, P) __all(P)
  #define WARP_ANY(MASK, P) __any(P)
  #define WARP_BALLOT(MASK, P) __ballot(P)
  #define WARP_SYNC(MASK)
  #define BLOCK_SYNC __syncthreads()
#endif
#endif

#if !defined(NAMD_HIP)
#define NAMD_WARP_SYNC(MASK) WARP_SYNC(MASK)
#endif

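// A minimal warp-reduction sketch built on the macros above (illustrative
// only; it assumes the full warp is active so that WARP_FULL_MASK is valid):
//
//   __device__ float warpReduceSum(float val) {
//     for (int offset = WARPSIZE / 2; offset > 0; offset /= 2) {
//       val += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val, offset, WARPSIZE);
//     }
//     return val;  // every lane ends up holding the warp-wide sum
//   }
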
/*
// Define float3 + float3 operation
__host__ __device__ inline float3 operator+(const float3 a, const float3 b) {
  float3 c;
  c.x = a.x + b.x;
  c.y = a.y + b.y;
  c.z = a.z + b.z;
  return c;
}
*/

//
// CUDA static assert, copied from Facebook FFT sources. Remove once nvcc supports C++11.
//
template <bool>
struct CudaStaticAssert;

template <>
struct CudaStaticAssert<true> {
};

#define cuda_static_assert(expr) \
  (CudaStaticAssert<(expr) != 0>())

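// Usage: CudaStaticAssert<false> is declared but never defined, so a false
// condition fails to instantiate and breaks the build at compile time, e.g.
// inside any function:
//
//   cuda_static_assert(sizeof(BigReal) == 8);  // holds unless SHORTREALS is set
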
void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
void curandDie(const char *msg, int err=0);

void cudaNAMD_bug(const char *msg);

//
// Error checking wrapper for CUDA
//
#define cudaCheck(stmt) do { \
  cudaError_t err = stmt; \
  if (err != cudaSuccess) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \
    cudaDie(msg, err); \
  } \
} while(0)

#define curandCheck(stmt) do { \
  curandStatus_t err = stmt; \
  if (err != CURAND_STATUS_SUCCESS) { \
    char msg[256]; \
    sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, __FUNCTION__, __LINE__); \
    curandDie(msg, (int)err); \
  } \
} while(0)

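// Typical call sites wrap any runtime call returning cudaError_t; on failure
// the failing statement, file, function, and line are reported via cudaDie.
// (Illustrative usage; d_buf, h_buf, and n are placeholder names.)
//
//   cudaCheck(cudaMemcpy(d_buf, h_buf, n * sizeof(float), cudaMemcpyHostToDevice));
//   cudaCheck(cudaDeviceSynchronize());
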
#ifdef __CUDACC__
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// native implementation available
#else
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
//
// Double-precision atomicAdd, copied from CUDA_C_Programming_Guide.pdf (ver 5.0)
//
static __device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val +
                    __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif
#endif

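// With this fallback in place, double-precision accumulation compiles
// uniformly across architectures: native on sm_60+, CAS loop otherwise.
// A hypothetical kernel using it:
//
//   __global__ void sumArray(const double* in, double* total, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) atomicAdd(total, in[i]);
//   }
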
void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT);
void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT);

template <class T>
void clear_device_array(T *data, const size_t ndata, cudaStream_t stream=0) {
  clear_device_array_async_T(data, ndata, stream, sizeof(T));
}

template <class T>
void clear_device_array_sync(T *data, const size_t ndata) {
  clear_device_array_T(data, ndata, sizeof(T));
}

void allocate_host_T(void **pp, const size_t len, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate page-locked host memory
// pp  = memory pointer
// len = length of the array
//
template <class T>
void allocate_host(T **pp, const size_t len) {
  allocate_host_T((void **)pp, len, sizeof(T));
}

void allocate_device_T(void **pp, const size_t len, const size_t sizeofT);
void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT);
void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream);
//----------------------------------------------------------------------------------------
//
// Allocate GPU memory
// pp  = memory pointer
// len = length of the array
//
template <class T>
void allocate_device(T **pp, const size_t len) {
  allocate_device_T((void **)pp, len, sizeof(T));
}

template <class T>
void allocate_device_managed(T **pp, const size_t len) {
  allocate_device_T_managed((void **)pp, len, sizeof(T));
}

template <class T>
void allocate_device_async(T **pp, const size_t len, cudaStream_t stream) {
  allocate_device_T_async((void **)pp, len, sizeof(T), stream);
}

void deallocate_device_T(void **pp);
void deallocate_device_T_async(void **pp, cudaStream_t stream);
//----------------------------------------------------------------------------------------
//
// Deallocate GPU memory
// pp = memory pointer
//
template <class T>
void deallocate_device(T **pp) {
  deallocate_device_T((void **)pp);
}

template <class T>
void deallocate_device_async(T **pp, cudaStream_t stream) {
  deallocate_device_T_async((void **)pp, stream);
}

//----------------------------------------------------------------------------------------

bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate device memory
// pp     = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac    = extra space allocation factor: in case of re-allocation, the new length will be fac*newlen
//
// Returns true if reallocation happened
//
template <class T>
bool reallocate_device(T **pp, size_t *curlen, const size_t newlen, const float fac=1.0f) {
  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
}
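// Amortized-growth sketch (hypothetical names): with fac = 1.5f, a growing
// array is reallocated geometrically rather than on every size increase. Note
// that this interface does not promise to preserve old device contents across
// a reallocation.
//
//   float* d_arr = NULL;
//   size_t capacity = 0;
//   // ... whenever an iteration needs `needed` elements:
//   if (reallocate_device<float>(&d_arr, &capacity, needed, 1.5f)) {
//     // reallocation happened; refill the buffer as required
//   }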
//----------------------------------------------------------------------------------------
bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen, const float fac,
                       const unsigned int flag, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate pinned host memory
// pp     = memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac    = extra space allocation factor: in case of re-allocation, the new length will be fac*newlen
// flag   = allocation type:
//          cudaHostAllocDefault = default type, emulates cudaMallocHost
//          cudaHostAllocMapped  = maps allocation into CUDA address space
//
// Returns true if reallocation happened
//
template <class T>
bool reallocate_host(T **pp, size_t *curlen, const size_t newlen,
                     const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) {
  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
}

void deallocate_host_T(void **pp);
//----------------------------------------------------------------------------------------
//
// Deallocate page-locked host memory
// pp = memory pointer
//
template <class T>
void deallocate_host(T **pp) {
  deallocate_host_T((void **)pp);
}
//----------------------------------------------------------------------------------------

void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len,
                 const size_t sizeofT);
void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT);

void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
                       const size_t sizeofT);
void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len, const size_t sizeofT);

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device
//
template <class T>
void copy_HtoD(const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0) {
  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device using synchronous calls
//
template <class T>
void copy_HtoD_sync(const T *h_array, T *d_array, size_t array_len) {
  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host
//
template <class T>
void copy_DtoH(const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0) {
  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host using synchronous calls
//
template <class T>
void copy_DtoH_sync(const T *d_array, T *h_array, const size_t array_len) {
  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device
//
template <class T>
void copy_DtoD(const T *d_src, T *d_dst, const size_t array_len, cudaStream_t stream=0) {
  copy_DtoD_async_T(d_src, d_dst, array_len, stream, sizeof(T));
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device using synchronous calls
//
template <class T>
void copy_DtoD_sync(const T *d_src, T *d_dst, const size_t array_len) {
  copy_DtoD_T(d_src, d_dst, array_len, sizeof(T));
}

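// End-to-end sketch tying the allocation and copy helpers together
// (hypothetical buffer names; default stream and a synchronous D->H copy
// are used for brevity):
//
//   float *h_data, *d_data;
//   allocate_host<float>(&h_data, n);      // pinned host memory
//   allocate_device<float>(&d_data, n);
//   copy_HtoD<float>(h_data, d_data, n);   // asynchronous on stream 0
//   // ... launch kernels on stream 0 ...
//   copy_DtoH_sync<float>(d_data, h_data, n);
//   deallocate_device<float>(&d_data);
//   deallocate_host<float>(&h_data);
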
//----------------------------------------------------------------------------------------
//
// Copies memory between two peer devices Device -> Device
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
                           const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
                           const size_t sizeofT);

template <class T>
void copy_PeerDtoD(const int src_dev, const int dst_dev,
                   const T *d_src, T *d_dst, const size_t array_len, cudaStream_t stream=0) {
  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Host -> Device
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_HtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}

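// Sub-block copy sketch (hypothetical volumes and sizes): copy a 64^3 region
// starting at element (8,8,8) of a 128x128-pitched host volume into the origin
// of a 64x64-pitched device volume. Offsets and extents are in elements, and
// the same parameter layout applies to the DtoH/DtoD/PeerDtoD variants below.
//
//   copy3D_HtoD<float>(h_vol, d_vol,
//                      8, 8, 8,      // source origin (x0, y0, z0)
//                      128, 128,     // source x/y allocation extents
//                      0, 0, 0,      // destination origin
//                      64, 64,       // destination x/y allocation extents
//                      64, 64, 64);  // copied width, height, depth
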
//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Host
//
void copy3D_DtoH_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_DtoH_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Device
//
void copy3D_DtoD_T(void* src_data, void* dst_data,
                   int src_x0, int src_y0, int src_z0,
                   size_t src_xsize, size_t src_ysize,
                   int dst_x0, int dst_y0, int dst_z0,
                   size_t dst_xsize, size_t dst_ysize,
                   size_t width, size_t height, size_t depth,
                   size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
                 int src_x0, int src_y0, int src_z0,
                 size_t src_xsize, size_t src_ysize,
                 int dst_x0, int dst_y0, int dst_z0,
                 size_t dst_xsize, size_t dst_ysize,
                 size_t width, size_t height, size_t depth,
                 cudaStream_t stream=0) {
  copy3D_DtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0,
                src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0,
                dst_xsize, dst_ysize,
                width, height, depth,
                sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block between two peer devices Device -> Device
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
                       void* src_data, void* dst_data,
                       int src_x0, int src_y0, int src_z0,
                       size_t src_xsize, size_t src_ysize,
                       int dst_x0, int dst_y0, int dst_z0,
                       size_t dst_xsize, size_t dst_ysize,
                       size_t width, size_t height, size_t depth,
                       size_t sizeofT, cudaStream_t stream);

template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
                     T* src_data, T* dst_data,
                     int src_x0, int src_y0, int src_z0,
                     size_t src_xsize, size_t src_ysize,
                     int dst_x0, int dst_y0, int dst_z0,
                     size_t dst_xsize, size_t dst_ysize,
                     size_t width, size_t height, size_t depth,
                     cudaStream_t stream=0) {
  copy3D_PeerDtoD_T(src_dev, dst_dev,
                    src_data, dst_data,
                    src_x0, src_y0, src_z0,
                    src_xsize, src_ysize,
                    dst_x0, dst_y0, dst_z0,
                    dst_xsize, dst_ysize,
                    width, height, depth,
                    sizeof(T), stream);
}

//----------------------------------------------------------------------------------------
//
// Utility structure for computing nonbonded interactions
//
// DMC: I needed this to be in a file which can be included from both C++ and CUDA files
struct CudaNBConstants {
  float lj_0;         // denom * cutoff2 - 3.0f * switch2 * denom
  float lj_1;         // denom * 2.0f
  float lj_2;         // denom * -12.0f
  float lj_3;         // denom * 12.0f * switch2
  float lj_4;         // cutoff2
  float lj_5;         // switch2
  float e_0;          // roff3Inv
  float e_0_slow;     // roff3Inv * (1 - slowScale)
  float e_1;          // roff2Inv
  float e_2;          // roffInv
  float ewald_0;      // ewaldcof
  float ewald_1;      // pi_ewaldcof
  float ewald_2;      // ewaldcof ^ 2
  float ewald_3_slow; // ewaldcof ^ 3 * slowScale
  float slowScale;    // ratio of full electrostatics to nonbonded frequency
};
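// Host-side fill sketch for the structure above, wiring up the formulas given
// in the field comments. Everything here is illustrative: the switching
// denominator `denom` is assumed to be 1 / (cutoff2 - switch2)^3 (the usual
// van der Waals switching normalization), and pi_ewaldcof is taken as an
// input rather than derived.
//
//   CudaNBConstants makeNBConstants(float cutoff, float switchdist, float roff,
//                                   float ewaldcof, float pi_ewaldcof,
//                                   float slowScale) {
//     CudaNBConstants c;
//     const float cutoff2 = cutoff * cutoff;
//     const float switch2 = switchdist * switchdist;
//     const float d = cutoff2 - switch2;
//     const float denom = 1.0f / (d * d * d);  // assumed normalization
//     const float roffInv = 1.0f / roff;
//     c.lj_0 = denom * cutoff2 - 3.0f * switch2 * denom;
//     c.lj_1 = denom * 2.0f;
//     c.lj_2 = denom * -12.0f;
//     c.lj_3 = denom * 12.0f * switch2;
//     c.lj_4 = cutoff2;
//     c.lj_5 = switch2;
//     c.e_0  = roffInv * roffInv * roffInv;    // roff3Inv
//     c.e_0_slow = c.e_0 * (1.0f - slowScale);
//     c.e_1  = roffInv * roffInv;              // roff2Inv
//     c.e_2  = roffInv;
//     c.ewald_0 = ewaldcof;
//     c.ewald_1 = pi_ewaldcof;
//     c.ewald_2 = ewaldcof * ewaldcof;
//     c.ewald_3_slow = ewaldcof * ewaldcof * ewaldcof * slowScale;
//     c.slowScale = slowScale;
//     return c;
//   }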

#endif // NAMD_CUDA || NAMD_HIP

#endif // CUDAUTILS_H