NAMD
CudaUtils.h
1 #ifndef CUDAUTILS_H
2 #define CUDAUTILS_H
3 
4 #ifdef NAMD_CUDA
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 #endif // NAMD_CUDA
8 
9 #ifdef NAMD_HIP
10 #include <hip/hip_runtime.h>
11 #endif // NAMD_HIP
12 
13 #include <stdio.h>
14 #include "HipDefines.h"
15 
16 #ifdef NAMD_CUDA
17 #define WARPSIZE 32
18 #define BOUNDINGBOXSIZE 32
19 typedef unsigned int WarpMask;
20 
21 // The transition to CCCL 3.0 happened with CUDA 12.9, so NAMD_CCCL_MAJOR_VERSION is set directly here to support building NAMD with CUDA versions prior to the inclusion of CCCL
22 #if (__CUDACC_VER_MAJOR__ < 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ <= 8)
23 #define NAMD_CCCL_MAJOR_VERSION 2
24 #else
25 #include <cuda/std/version>
26 #define NAMD_CCCL_MAJOR_VERSION CCCL_MAJOR_VERSION
27 #endif
28 
29 #endif // NAMD_CUDA
30 
31 #ifdef NAMD_HIP
32 
33 /* JM: Definition of the warp size: we need to be careful here, since WARPSIZE is
34  a compile-time definition, but on HIP we can target both RDNA and CDNA
35  devices, which could be either wave32 or wave64.
36 */
37 #ifdef NAMD_NAVI_BUILD
38 #define WARPSIZE 32
39 #else
40 #define WARPSIZE 64
41 #endif
42 
43 #define BOUNDINGBOXSIZE 32
44 typedef unsigned int WarpMask;
45 #endif // NAMD_HIP
46 
47 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
48 
49 #define FORCE_ENERGY_TABLE_SIZE 4096
50 
51 #define COPY_CUDATENSOR(S,D) \
52  D.xx = S.xx; \
53  D.xy = S.xy; \
54  D.xz = S.xz; \
55  D.yx = S.yx; \
56  D.yy = S.yy; \
57  D.yz = S.yz; \
58  D.zx = S.zx; \
59  D.zy = S.zy; \
60  D.zz = S.zz
61 
62 #define COPY_CUDAVECTOR(S,D) \
63  D.x = S.x; \
64  D.y = S.y; \
65  D.z = S.z
66 
67 #define PRINT_CUDATENSOR(T, SS) \
68  SS << T.xx << " " << T.xy << " " << T.xz << " " << T.yx << " " << \
69  T.yy << " " << T.yz << " " << T.zx << " " << T.zy << " " << T.zz << \
70  std::endl;
71 
72 #ifdef SHORTREALS
73 typedef float BigReal;
74 #else
75 typedef double BigReal;
76 #endif
77 
78 #ifdef NAMD_CUDA
79 #define ATOMIC_BINS 1
80 #else
81 #define ATOMIC_BINS WARPSIZE
82 #endif
83 
84 struct cudaTensor{
85  BigReal xx;
86  BigReal xy;
87  BigReal xz;
88  BigReal yx;
89  BigReal yy;
90  BigReal yz;
91  BigReal zx;
92  BigReal zy;
93  BigReal zz;
94 
95  __forceinline__ __device__
96  cudaTensor& operator+=(const cudaTensor& other) {
97  this->xx += other.xx;
98  this->xy += other.xy;
99  this->xz += other.xz;
100  this->yx += other.yx;
101  this->yy += other.yy;
102  this->yz += other.yz;
103  this->zx += other.zx;
104  this->zy += other.zy;
105  this->zz += other.zz;
106  return *this;
107  }
108 
109  __forceinline__ __device__
110  friend cudaTensor operator+(const cudaTensor& t1, const cudaTensor& t2) {
111  cudaTensor t;
112  t.xx = t1.xx + t2.xx;
113  t.xy = t1.xy + t2.xy;
114  t.xz = t1.xz + t2.xz;
115  t.yx = t1.yx + t2.yx;
116  t.yy = t1.yy + t2.yy;
117  t.yz = t1.yz + t2.yz;
118  t.zx = t1.zx + t2.zx;
119  t.zy = t1.zy + t2.zy;
120  t.zz = t1.zz + t2.zz;
121  return t;
122  }
123 };
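
As a rough illustration (not part of this header), kernel-side code can combine the operator and copy macro defined above; perThreadVirial is a hypothetical per-thread accumulator:

  cudaTensor total = {};                 // zero-initializes all nine components
  total += perThreadVirial;              // cudaTensor::operator+= defined above
  cudaTensor copyOfTotal;
  COPY_CUDATENSOR(total, copyOfTotal);   // expands to the nine member assignments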
124 
125 struct cudaVector{
126  BigReal x;
127  BigReal y;
128  BigReal z;
129 };
130 
131 struct CudaMInfo{
132  int destPatchID[3][3][3];
133 };
134 
135 
136 #define FEP_BONDED_CUDA_DEBUG
137 #ifdef FEP_BONDED_CUDA_DEBUG
138 #include <iostream>
139 #endif
140 
141 
142 // XXX The macro definitions below need to be refactored.
143 // At present, we are supporting three options:
144 // - CUDA versions >= 9
145 // - CUDA versions < 9
146 // - HIP
147 //
148 // These three options should be explicitly represented in the
149 // macro definitions below. We should definitely not be expanding
150 // through multiple chained macros, e.g., NAMD_WARP_SYNC expanding
151 // to WARP_SYNC. Finally, these macros should be consistently named
152 // and applied throughout the code to avoid repeated tests against
153 // CUDA and HIP macros, e.g., MigrationCUDAKernel.cu lines 1567-1571.
154 //
155 // Once we fix the macros below and make their use consistent,
156 // we can purge this comment block.
157 
158 #define WARP_FULL_MASK 0xffffffff
159 //TODO:HIP verify
160 //#define WARP_FULL_MASK 0xffffffffffffffff
161 
162 #if (__CUDACC_VER_MAJOR__ >= 9)
163 #define NAMD_USE_COOPERATIVE_GROUPS
164 #endif
165 
166 #ifdef NAMD_USE_COOPERATIVE_GROUPS
167  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
168  __shfl_xor_sync(MASK, VAR, LANE, SIZE)
169  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
170  __shfl_up_sync(MASK, VAR, DELTA, SIZE)
171  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
172  __shfl_down_sync(MASK, VAR, DELTA, SIZE)
173  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
174  __shfl_sync(MASK, VAR, LANE, SIZE)
175  #define WARP_ALL(MASK, P) __all_sync(MASK, P)
176  #define WARP_ANY(MASK, P) __any_sync(MASK, P)
177  #define WARP_BALLOT(MASK, P) __ballot_sync(MASK, P)
178  #define WARP_SYNC(MASK) __syncwarp(MASK)
179  #define BLOCK_SYNC __barrier_sync(0)
180 #else
181  #define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE) \
182  __shfl_xor(VAR, LANE, SIZE)
183  #define WARP_SHUFFLE_UP(MASK, VAR, DELTA, SIZE) \
184  __shfl_up(VAR, DELTA, SIZE)
185  #define WARP_SHUFFLE_DOWN(MASK, VAR, DELTA, SIZE) \
186  __shfl_down(VAR, DELTA, SIZE)
187  #define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) \
188  __shfl(VAR, LANE, SIZE)
189 #ifdef NAMD_HIP
190  // this collides with a few rocprim definitions, so we need to redefine these as NAMD_WARP_*
191  #define NAMD_WARP_ALL(MASK, P) __all(P)
192  #define NAMD_WARP_ANY(MASK, P) __any(P)
193  #define NAMD_WARP_BALLOT(MASK, P) __ballot(P)
194  #define NAMD_WARP_SYNC(MASK)
195  #define BLOCK_SYNC __syncthreads()
196 #else
197  #define WARP_ALL(MASK, P) __all(P)
198  #define WARP_ANY(MASK, P) __any(P)
199  #define WARP_BALLOT(MASK, P) __ballot(P)
200  #define WARP_SYNC(MASK)
201  #define BLOCK_SYNC __syncthreads()
202 #endif
203 #endif
204 
205 #if !defined(NAMD_HIP)
206 #define NAMD_WARP_SYNC(MASK) WARP_SYNC(MASK)
207 #endif
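
A minimal sketch (not part of this header) of how the shuffle macros above are typically combined into a warp-level reduction; warpReduceSum is a hypothetical helper name:

  template <typename T>
  __device__ __forceinline__ T warpReduceSum(T val) {
    // halve the stride each step; after log2(WARPSIZE) steps lane 0 holds the warp-wide sum
    for (int offset = WARPSIZE/2; offset > 0; offset >>= 1) {
      val += WARP_SHUFFLE_DOWN(WARP_FULL_MASK, val, offset, WARPSIZE);
    }
    return val;
  }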
208 
209 
210 /*
211 // Define float3 + float3 operation
212 __host__ __device__ inline float3 operator+(const float3 a, const float3 b) {
213  float3 c;
214  c.x = a.x + b.x;
215  c.y = a.y + b.y;
216  c.z = a.z + b.z;
217  return c;
218 }
219 */
220 
221 //
222 // Cuda static assert, copied from Facebook FFT sources. Remove once nvcc has c++11
223 //
224 template <bool>
225 struct CudaStaticAssert;
226 
227 template <>
228 struct CudaStaticAssert<true> {
229 };
230 
231 #define cuda_static_assert(expr) \
232  (CudaStaticAssert<(expr) != 0>())
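
For example (a hypothetical check, not in the header), the macro is used as an expression that fails to compile when the condition is false:

  cuda_static_assert(sizeof(float4) == 4 * sizeof(float));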
233 
234 void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
235 void curandDie(const char *msg, int err=0);
236 
237 void cudaNAMD_bug(const char *msg);
238 
239 //
240 // Error checking wrapper for CUDA
241 //
242 #define cudaCheck(stmt) do { \
243  cudaError_t err = stmt; \
244  if (err != cudaSuccess) { \
245  char msg[256]; \
246  sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
247  cudaDie(msg, err); \
248  } \
249 } while(0)
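
A short usage sketch with hypothetical buffer and stream names: each CUDA runtime call is wrapped so a failure reports the statement, file, function, and line before calling cudaDie.

  cudaStream_t stream = 0;                 // default stream for this sketch
  float *d_buf = NULL;
  cudaCheck(cudaMalloc((void **)&d_buf, 1024 * sizeof(float)));
  cudaCheck(cudaMemsetAsync(d_buf, 0, 1024 * sizeof(float), stream));
  cudaCheck(cudaStreamSynchronize(stream));
  cudaCheck(cudaFree(d_buf));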
250 
251 #define curandCheck(stmt) do { \
252  curandStatus_t err = stmt; \
253  if (err != CURAND_STATUS_SUCCESS) { \
254  char msg[256]; \
255  sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt,__FILE__,__FUNCTION__,__LINE__); \
256  curandDie(msg, (int)err); \
257  } \
258  } while(0)
259 
260 #ifdef __CUDACC__
261 #if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
262 // native implementation available
263 #else
264 #if __CUDA_ARCH__ >= 600
265 #error using CAS implementation of double atomicAdd
266 #endif
267 //
268 // Double precision atomicAdd, copied from CUDA_C_Programming_Guide.pdf (ver 5.0)
269 //
270 static __device__ double atomicAdd(double* address, double val) {
271  unsigned long long int* address_as_ull = (unsigned long long int*)address;
272  unsigned long long int old = *address_as_ull, assumed;
273  do {
274  assumed = old;
275  old = atomicCAS(address_as_ull, assumed,
276  __double_as_longlong(val +
277  __longlong_as_double(assumed)));
278  } while (assumed != old);
279  return __longlong_as_double(old);
280 }
281 #endif
282 #endif
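
As a sketch of what this fallback enables (hypothetical kernel, not part of this header), device code can accumulate double-precision values the same way on all architectures:

  __global__ void sumEnergies(const double *perAtom, int n, double *total) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) atomicAdd(total, perAtom[i]);   // native on sm_60+, CAS loop otherwise
  }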
283 
284 void clear_device_array_async_T(void *data, const size_t ndata, cudaStream_t stream, const size_t sizeofT);
285 void clear_device_array_T(void *data, const size_t ndata, const size_t sizeofT);
286 
287 template <class T>
288 void clear_device_array(T *data, const size_t ndata, cudaStream_t stream=0) {
289  clear_device_array_async_T(data, ndata, stream, sizeof(T));
290 }
291 
292 template <class T>
293 void clear_device_array_sync(T *data, const size_t ndata) {
294  clear_device_array_T(data, ndata, sizeof(T));
295 }
296 
297 void allocate_host_T(void **pp, const size_t len, const size_t sizeofT);
298 //----------------------------------------------------------------------------------------
299 //
300 // Allocate page-locked host memory
301 // pp = memory pointer
302 // len = length of the array
303 //
304 template <class T>
305 void allocate_host(T **pp, const size_t len) {
306  allocate_host_T((void **)pp, len, sizeof(T));
307 }
308 
309 
310 void allocate_device_T(void **pp, const size_t len, const size_t sizeofT);
311 void allocate_device_T_managed(void **pp, const size_t len, const size_t sizeofT);
312 void allocate_device_T_async(void **pp, const size_t len, const size_t sizeofT, cudaStream_t stream);
313 //----------------------------------------------------------------------------------------
314 //
315 // Allocate gpu memory
316 // pp = memory pointer
317 // len = length of the array
318 //
319 template <class T>
320 void allocate_device(T **pp, const size_t len) {
321  allocate_device_T((void **)pp, len, sizeof(T));
322 }
323 
324 template <class T>
325 void allocate_device_managed(T **pp, const size_t len) {
326  allocate_device_T_managed((void **)pp, len, sizeof(T));
327 }
328 
329 template <class T>
330 void allocate_device_async(T **pp, const size_t len, cudaStream_t stream) {
331  allocate_device_T_async((void **)pp, len, sizeof(T), stream);
332 }
333 
334 void deallocate_device_T(void **pp);
335 void deallocate_device_T_async(void **pp, cudaStream_t stream);
336 //----------------------------------------------------------------------------------------
337 //
338 // Deallocate gpu memory
339 // pp = memory pointer
340 //
341 template <class T>
342 void deallocate_device(T **pp) {
343  deallocate_device_T((void **)pp);
344 }
345 template <class T>
346 void deallocate_device_async(T **pp, cudaStream_t stream) {
347  deallocate_device_T_async((void **)pp, stream);
348 }
349 
350 //----------------------------------------------------------------------------------------
351 
352 bool reallocate_device_T(void **pp, size_t *curlen, const size_t newlen, const float fac, const size_t sizeofT);
353 //----------------------------------------------------------------------------------------
354 //
355 // Allocate & re-allocate device memory
356 // pp = memory pointer
357 // curlen = current length of the array
358 // newlen = new required length of the array
359 // fac = extra space allocation factor: if re-allocation occurs, the new length will be fac*newlen
360 //
361 // returns true if reallocation happened
362 //
363 template <class T>
364 bool reallocate_device(T **pp, size_t *curlen, const size_t newlen, const float fac=1.0f) {
365  return reallocate_device_T((void **)pp, curlen, newlen, fac, sizeof(T));
366 }
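
A usage sketch with hypothetical names: the caller keeps the current capacity alongside the pointer, and the buffer grows (here with 20% headroom) only when numAtoms exceeds it.

  static float4 *d_forces = NULL;
  static size_t d_forcesCapacity = 0;
  bool grew = reallocate_device<float4>(&d_forces, &d_forcesCapacity, numAtoms, 1.2f);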
367 //----------------------------------------------------------------------------------------
368 bool reallocate_host_T(void **pp, size_t *curlen, const size_t newlen, const float fac,
369  const unsigned int flag, const size_t sizeofT);
370 //----------------------------------------------------------------------------------------
371 //
372 // Allocate & re-allocate pinned host memory
373 // pp = memory pointer
374 // curlen = current length of the array
375 // newlen = new required length of the array
376 // fac = extra space allocation factor: if re-allocation occurs, the new length will be fac*newlen
377 // flag = allocation type:
378 // cudaHostAllocDefault = default type, emulates cudaMallocHost
379 // cudaHostAllocMapped = maps allocation into CUDA address space
380 //
381 // returns true if reallocation happened
382 //
383 template <class T>
384 bool reallocate_host(T **pp, size_t *curlen, const size_t newlen,
385  const float fac=1.0f, const unsigned int flag=cudaHostAllocDefault) {
386  return reallocate_host_T((void **)pp, curlen, newlen, fac, flag, sizeof(T));
387 }
388 
389 void deallocate_host_T(void **pp);
390 //----------------------------------------------------------------------------------------
391 //
392 // Deallocate page-locked host memory
393 // pp = memory pointer
394 //
395 template <class T>
396 void deallocate_host(T **pp) {
397  deallocate_host_T((void **)pp);
398 }
399 //----------------------------------------------------------------------------------------
400 
401 void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream,
402  const size_t sizeofT);
403 void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len,
404  const size_t sizeofT);
405 void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream,
406  const size_t sizeofT);
407 void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT);
408 
409 void copy_DtoD_async_T(const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
410  const size_t sizeofT);
411 void copy_DtoD_T(const void *d_src, void *d_dst, const size_t array_len, const size_t sizeofT);
412 
413 //----------------------------------------------------------------------------------------
414 //
415 // Copies memory Host -> Device
416 //
417 template <class T>
418 void copy_HtoD(const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0) {
419  copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T));
420 }
421 
422 //----------------------------------------------------------------------------------------
423 //
424 // Copies memory Host -> Device using synchronous calls
425 //
426 template <class T>
427 void copy_HtoD_sync(const T *h_array, T *d_array, size_t array_len) {
428  copy_HtoD_T(h_array, d_array, array_len, sizeof(T));
429 }
430 
431 //----------------------------------------------------------------------------------------
432 //
433 // Copies memory Device -> Host
434 //
435 template <class T>
436 void copy_DtoH(const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0) {
437  copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T));
438 }
439 //----------------------------------------------------------------------------------------
440 //
441 // Copies memory Device -> Host using synchronous calls
442 //
443 template <class T>
444 void copy_DtoH_sync(const T *d_array, T *h_array, const size_t array_len) {
445  copy_DtoH_T(d_array, h_array, array_len, sizeof(T));
446 }
447 //----------------------------------------------------------------------------------------
448 //
449 // Copies memory Device -> Device
450 //
451 template <class T>
452 void copy_DtoD(const T *d_src, T *h_dst, const size_t array_len, cudaStream_t stream=0) {
453  copy_DtoD_async_T(d_src, h_dst, array_len, stream, sizeof(T));
454 }
455 //----------------------------------------------------------------------------------------
456 //
457 // Copies memory Device -> Device using synchronous calls
458 //
459 template <class T>
460 void copy_DtoD_sync(const T *d_src, T *h_dst, const size_t array_len) {
461  copy_DtoD_T(d_src, h_dst, array_len, sizeof(T));
462 }
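
Putting the allocation and copy helpers above together; this is a sketch with hypothetical names (numAtoms, h_coords, d_coords), not code from NAMD:

  const size_t numAtoms = 1000;                    // hypothetical array length
  cudaStream_t stream = 0;
  float4 *h_coords = NULL, *d_coords = NULL;
  allocate_host<float4>(&h_coords, numAtoms);      // pinned host memory
  allocate_device<float4>(&d_coords, numAtoms);
  // ... fill h_coords on the host ...
  copy_HtoD<float4>(h_coords, d_coords, numAtoms, stream);   // asynchronous on 'stream'
  // ... launch kernels on 'stream', then bring results back ...
  copy_DtoH_sync<float4>(d_coords, h_coords, numAtoms);
  deallocate_device<float4>(&d_coords);
  deallocate_host<float4>(&h_coords);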
463 
464 //----------------------------------------------------------------------------------------
465 //
466 // Copies memory between two peer devices Device -> Device
467 //
468 void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
469  const void *d_src, void *d_dst, const size_t array_len, cudaStream_t stream,
470  const size_t sizeofT);
471 
472 template <class T>
473 void copy_PeerDtoD(const int src_dev, const int dst_dev,
474  const T *d_src, T *d_dst, const size_t array_len, cudaStream_t stream=0) {
475  copy_PeerDtoD_async_T(src_dev, dst_dev, d_src, d_dst, array_len, stream, sizeof(T));
476 }
477 
478 //----------------------------------------------------------------------------------------
479 //
480 // Copies 3D memory block Host -> Device
481 //
482 void copy3D_HtoD_T(void* src_data, void* dst_data,
483  int src_x0, int src_y0, int src_z0,
484  size_t src_xsize, size_t src_ysize,
485  int dst_x0, int dst_y0, int dst_z0,
486  size_t dst_xsize, size_t dst_ysize,
487  size_t width, size_t height, size_t depth,
488  size_t sizeofT, cudaStream_t stream);
489 
490 template <class T>
491 void copy3D_HtoD(T* src_data, T* dst_data,
492  int src_x0, int src_y0, int src_z0,
493  size_t src_xsize, size_t src_ysize,
494  int dst_x0, int dst_y0, int dst_z0,
495  size_t dst_xsize, size_t dst_ysize,
496  size_t width, size_t height, size_t depth,
497  cudaStream_t stream=0) {
498  copy3D_HtoD_T(src_data, dst_data,
499  src_x0, src_y0, src_z0,
500  src_xsize, src_ysize,
501  dst_x0, dst_y0, dst_z0,
502  dst_xsize, dst_ysize,
503  width, height, depth,
504  sizeof(T), stream);
505 }
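
A sketch of copying a sub-block of a 3D host grid into the origin of a device grid; all names (h_grid, d_grid, offsets, dimensions) are hypothetical:

  copy3D_HtoD<float>(h_grid, d_grid,
    ox, oy, oz,            // offsets of the sub-block within the host grid
    h_xsize, h_ysize,      // x/y dimensions of the host grid (its pitch, in elements)
    0, 0, 0,               // destination offsets within the device grid
    d_xsize, d_ysize,      // x/y dimensions of the device grid
    nx, ny, nz,            // extent of the copied block
    stream);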
506 
507 //----------------------------------------------------------------------------------------
508 //
509 // Copies 3D memory block Device -> Host
510 //
511 void copy3D_DtoH_T(void* src_data, void* dst_data,
512  int src_x0, int src_y0, int src_z0,
513  size_t src_xsize, size_t src_ysize,
514  int dst_x0, int dst_y0, int dst_z0,
515  size_t dst_xsize, size_t dst_ysize,
516  size_t width, size_t height, size_t depth,
517  size_t sizeofT, cudaStream_t stream);
518 
519 template <class T>
520 void copy3D_DtoH(T* src_data, T* dst_data,
521  int src_x0, int src_y0, int src_z0,
522  size_t src_xsize, size_t src_ysize,
523  int dst_x0, int dst_y0, int dst_z0,
524  size_t dst_xsize, size_t dst_ysize,
525  size_t width, size_t height, size_t depth,
526  cudaStream_t stream=0) {
527  copy3D_DtoH_T(src_data, dst_data,
528  src_x0, src_y0, src_z0,
529  src_xsize, src_ysize,
530  dst_x0, dst_y0, dst_z0,
531  dst_xsize, dst_ysize,
532  width, height, depth,
533  sizeof(T), stream);
534 }
535 
536 //----------------------------------------------------------------------------------------
537 //
538 // Copies 3D memory block Device -> Device
539 //
540 void copy3D_DtoD_T(void* src_data, void* dst_data,
541  int src_x0, int src_y0, int src_z0,
542  size_t src_xsize, size_t src_ysize,
543  int dst_x0, int dst_y0, int dst_z0,
544  size_t dst_xsize, size_t dst_ysize,
545  size_t width, size_t height, size_t depth,
546  size_t sizeofT, cudaStream_t stream);
547 
548 template <class T>
549 void copy3D_DtoD(T* src_data, T* dst_data,
550  int src_x0, int src_y0, int src_z0,
551  size_t src_xsize, size_t src_ysize,
552  int dst_x0, int dst_y0, int dst_z0,
553  size_t dst_xsize, size_t dst_ysize,
554  size_t width, size_t height, size_t depth,
555  cudaStream_t stream=0) {
556  copy3D_DtoD_T(src_data, dst_data,
557  src_x0, src_y0, src_z0,
558  src_xsize, src_ysize,
559  dst_x0, dst_y0, dst_z0,
560  dst_xsize, dst_ysize,
561  width, height, depth,
562  sizeof(T), stream);
563 }
564 
565 //----------------------------------------------------------------------------------------
566 //
567 // Copies 3D memory block between two peer devices Device -> Device
568 //
569 void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
570  void* src_data, void* dst_data,
571  int src_x0, int src_y0, int src_z0,
572  size_t src_xsize, size_t src_ysize,
573  int dst_x0, int dst_y0, int dst_z0,
574  size_t dst_xsize, size_t dst_ysize,
575  size_t width, size_t height, size_t depth,
576  size_t sizeofT, cudaStream_t stream);
577 
578 template <class T>
579 void copy3D_PeerDtoD(int src_dev, int dst_dev,
580  T* src_data, T* dst_data,
581  int src_x0, int src_y0, int src_z0,
582  size_t src_xsize, size_t src_ysize,
583  int dst_x0, int dst_y0, int dst_z0,
584  size_t dst_xsize, size_t dst_ysize,
585  size_t width, size_t height, size_t depth,
586  cudaStream_t stream=0) {
587  copy3D_PeerDtoD_T(src_dev, dst_dev,
588  src_data, dst_data,
589  src_x0, src_y0, src_z0,
590  src_xsize, src_ysize,
591  dst_x0, dst_y0, dst_z0,
592  dst_xsize, dst_ysize,
593  width, height, depth,
594  sizeof(T), stream);
595 }
596 
597 //----------------------------------------------------------------------------------------
598 //
599 // Utility structure for computing nonbonded interactions
600 //
601 // DMC: I needed this to be in a file which can be included from both C++ and CUDA files
602 struct CudaNBConstants {
603  float lj_0; // denom * cutoff2 - 3.0f * switch2 * denom
604  float lj_1; // denom * 2.0f
605  float lj_2; // denom * -12.0f
606  float lj_3; // denom * 12.0f * switch2
607  float lj_4; // cutoff2
608  float lj_5; // switch2
609  float e_0; // roff3Inv
610  float e_0_slow; // roff3Inv * (1 - slowScale)
611  float e_1; // roff2Inv
612  float e_2; // roffInv
613  float ewald_0; // ewaldcof
614  float ewald_1; // pi_ewaldcof
615  float ewald_2; // ewaldcof ^ 2
616  float ewald_3_slow; // ewaldcof ^ 3 * slowScale
617  float slowScale; // ratio of full electrostatics to nonbonded frequency
618 };
619 
620 
621 #endif // NAMD_CUDA || NAMD_HIP
622 
623 #endif // CUDAUTILS_H
624 