NAMD
CudaPmeSolverUtilKernel.cu File Reference
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include "CudaUtils.h"
#include "CudaPmeSolverUtilKernel.h"


Classes

struct  RecipVirial_t
 
struct  gather_t< T, order >
 

Functions

__forceinline__ __device__ double expfunc (const double x)
 
__forceinline__ __device__ float expfunc (const float x)
 
template<typename T , typename T2 , bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel (const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const T recip1x, const T recip1y, const T recip1z, const T recip2x, const T recip2y, const T recip2z, const T recip3x, const T recip3y, const T recip3z, const T *prefac1, const T *prefac2, const T *prefac3, const T pi_ewald, const T piv_inv, const int k2_00, const int k3_00, T2 *data, double *__restrict__ energy_recip, double *__restrict__ virial)
 
template<typename T >
__forceinline__ __device__ void write_grid (const float val, const int ind, T *data)
 
template<typename T , int order>
__forceinline__ __device__ void calc_one_theta (const T w, T *theta)
 
template<typename T , typename T3 , int order>
__forceinline__ __device__ void calc_theta_dtheta (T wx, T wy, T wz, T3 *theta, T3 *dtheta)
 
template<typename AT , int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, AT *data)
 
template<typename T >
__forceinline__ __device__ void gather_force_store (const float fx, const float fy, const float fz, const int stride, const int pos, T *force)
 
template<>
__forceinline__ __device__ void gather_force_store< float > (const float fx, const float fy, const float fz, const int stride, const int pos, float *force)
 
template<>
__forceinline__ __device__ void gather_force_store< float3 > (const float fx, const float fy, const float fz, const int stride, const int pos, float3 *force)
 
template<typename CT , typename FT , int order, bool periodicY, bool periodicZ>
__global__ void gather_force (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const float *data, const cudaTextureObject_t gridTexObj, const int stride, FT *force)
 
template<typename T >
__device__ __forceinline__ void transpose_xyz_yzx_device (const int x_in, const int y_in, const int z_in, const int x_out, const int y_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void transpose_xyz_yzx_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void batchTranspose_xyz_yzx_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
 
template<typename T >
__device__ __forceinline__ void transpose_xyz_zxy_device (const int x_in, const int y_in, const int z_in, const int x_out, const int z_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void transpose_xyz_zxy_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
 
template<typename T >
__global__ void batchTranspose_xyz_zxy_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
 
void spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
 
void scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
 
void gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
 
void transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 
void transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 

Variables

const int TILEDIM = 32
 
const int TILEROWS = 8
 

Function Documentation

void batchTranspose_xyz_yzx ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1390 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoYZX().

1393  {
1394 
1395  dim3 numthread(TILEDIM, TILEROWS, 1);
1396  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
1397 
1398  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);
1399 
1400  cudaCheck(cudaGetLastError());
1401 }
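A minimal host-side sketch of driving this batched transpose (not from the file): it assumes TransposeBatch< float2 > exposes the nx, ysize_out, zsize_out, data_in, and data_out members read by batchTranspose_xyz_yzx_kernel(), that the batches array is device-accessible (the kernel dereferences it), and that the d_* pointers and pencil dimensions are hypothetical names set up elsewhere.

// Sketch: two y-z pencils with the same ny, nz but different nx.
TransposeBatch<float2> h_batches[2];
h_batches[0].nx = nx0;  h_batches[0].ysize_out = ny;  h_batches[0].zsize_out = nz;
h_batches[0].data_in = d_in0;  h_batches[0].data_out = d_out0;
h_batches[1].nx = nx1;  h_batches[1].ysize_out = ny;  h_batches[1].zsize_out = nz;
h_batches[1].data_in = d_in1;  h_batches[1].data_out = d_out1;

TransposeBatch<float2>* d_batches;
cudaCheck(cudaMalloc(&d_batches, 2*sizeof(TransposeBatch<float2>)));
cudaCheck(cudaMemcpyAsync(d_batches, h_batches, 2*sizeof(TransposeBatch<float2>),
                          cudaMemcpyHostToDevice, stream));

// max_nx must cover the largest nx in the batch so the x tiling spans every pencil.
int max_nx = (nx0 > nx1) ? nx0 : nx1;
batchTranspose_xyz_yzx(2, d_batches, max_nx, ny, nz, xsize_in, ysize_in, stream);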
template<typename T >
__global__ void batchTranspose_xyz_yzx_kernel ( const TransposeBatch< T > *  batches,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in 
)

Definition at line 883 of file CudaPmeSolverUtilKernel.cu.

References TransposeBatch< T >::nx, and TILEDIM.

886  {
887 
888  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
889  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
890  int z_in = (blockIdx.z % nz) + threadIdx.z;
891 
892  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
893  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
894 
895  int bi = blockIdx.z/nz;
896 
897  TransposeBatch<T> batch = batches[bi];
898  int nx = batch.nx;
899  int ysize_out = batch.ysize_out;
900  int zsize_out = batch.zsize_out;
901  T* data_in = batch.data_in;
902  T* data_out = batch.data_out;
903 
904  transpose_xyz_yzx_device<T>(
905  x_in, y_in, z_in,
906  x_out, y_out,
907  nx, ny, nz,
908  xsize_in, ysize_in,
909  ysize_out, zsize_out,
910  data_in, data_out);
911 
912 }
void batchTranspose_xyz_zxy ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1425 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoZXY().

1429  {
1430 
1431  dim3 numthread(TILEDIM, TILEROWS, 1);
1432  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
1433 
1434  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);
1435 
1436  cudaCheck(cudaGetLastError());
1437 }
template<typename T >
__global__ void batchTranspose_xyz_zxy_kernel ( const TransposeBatch< T > *  batches,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in 
)

Definition at line 1038 of file CudaPmeSolverUtilKernel.cu.

References TransposeBatch< T >::nx, and TILEDIM.

1041  {
1042 
1043  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1044  int y_in = (blockIdx.z % ny) + threadIdx.z;
1045  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1046 
1047  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1048  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1049 
1050  int bi = blockIdx.z/ny;
1051 
1052  TransposeBatch<T> batch = batches[bi];
1053  int nx = batch.nx;
1054  int zsize_out = batch.zsize_out;
1055  int xsize_out = batch.xsize_out;
1056  T* data_in = batch.data_in;
1057  T* data_out = batch.data_out;
1058 
1059  transpose_xyz_zxy_device<T>(
1060  x_in, y_in, z_in, x_out, z_out,
1061  nx, ny, nz,
1062  xsize_in, ysize_in,
1063  zsize_out, xsize_out,
1064  data_in, data_out);
1065 
1066 }
template<typename T , int order>
__forceinline__ __device__ void calc_one_theta ( const T  w,
T *  theta 
)

Definition at line 326 of file CudaPmeSolverUtilKernel.cu.

References order.

326  {
327 
328  theta[order-1] = ((T)0);
329  theta[1] = w;
330  theta[0] = ((T)1) - w;
331 
332 #pragma unroll
333  for (int k=3;k <= order-1;k++) {
334  T div = ((T)1) / (T)(k-1);
335  theta[k-1] = div*w*theta[k-2];
336 #pragma unroll
337  for (int j=1;j <= k-2;j++) {
338  theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
339  }
340  theta[0] = div*(((T)1) - w)*theta[0];
341  }
342 
343  //--- one more recursion
344  T div = ((T)1) / (T)(order-1);
345  theta[order-1] = div*w*theta[order-2];
346 #pragma unroll
347  for (int j=1;j <= order-2;j++) {
348  theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
349  }
350 
351  theta[0] = div*(((T)1) - w)*theta[0];
352 }
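For reference, a host-side transcription of the same cardinal B-spline recursion (a sketch, not part of this file) that can be used to check the device weights; it mirrors the unrolled loops above, with a runtime order instead of a template parameter.

// Host reference for the weights produced by calc_one_theta (sketch only).
// theta[0..order-1] receive the order-point B-spline weights for fractional offset w.
void calc_one_theta_ref(double w, int order, double* theta) {
  theta[0] = 1.0 - w;                        // linear (order-2) start
  theta[1] = w;
  for (int k = 3; k <= order; k++) {         // raise the spline order one step at a time
    double div = 1.0 / (double)(k - 1);
    theta[k-1] = div * w * theta[k-2];
    for (int j = 1; j <= k - 2; j++)
      theta[k-j-1] = div * ((w + j) * theta[k-j-2] + (k - j - w) * theta[k-j-1]);
    theta[0] = div * (1.0 - w) * theta[0];
  }
}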
template<typename T , typename T3 , int order>
__forceinline__ __device__ void calc_theta_dtheta ( T  wx,
T  wy,
T  wz,
T3 *  theta,
T3 *  dtheta 
)

Definition at line 358 of file CudaPmeSolverUtilKernel.cu.

References order, x, y, and z.

358  {
359 
360  theta[order-1].x = ((T)0);
361  theta[order-1].y = ((T)0);
362  theta[order-1].z = ((T)0);
363  theta[1].x = wx;
364  theta[1].y = wy;
365  theta[1].z = wz;
366  theta[0].x = ((T)1) - wx;
367  theta[0].y = ((T)1) - wy;
368  theta[0].z = ((T)1) - wz;
369 
370 #pragma unroll
371  for (int k=3;k <= order-1;k++) {
372  T div = ((T)1) / (T)(k-1);
373  theta[k-1].x = div*wx*theta[k-2].x;
374  theta[k-1].y = div*wy*theta[k-2].y;
375  theta[k-1].z = div*wz*theta[k-2].z;
376 #pragma unroll
377  for (int j=1;j <= k-2;j++) {
378  theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
379  theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
380  theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
381  }
382  theta[0].x = div*(((T)1) - wx)*theta[0].x;
383  theta[0].y = div*(((T)1) - wy)*theta[0].y;
384  theta[0].z = div*(((T)1) - wz)*theta[0].z;
385  }
386 
387  //--- perform standard b-spline differentiation
388  dtheta[0].x = -theta[0].x;
389  dtheta[0].y = -theta[0].y;
390  dtheta[0].z = -theta[0].z;
391 #pragma unroll
392  for (int j=2;j <= order;j++) {
393  dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
394  dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
395  dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
396  }
397 
398  //--- one more recursion
399  T div = ((T)1) / (T)(order-1);
400  theta[order-1].x = div*wx*theta[order-2].x;
401  theta[order-1].y = div*wy*theta[order-2].y;
402  theta[order-1].z = div*wz*theta[order-2].z;
403 #pragma unroll
404  for (int j=1;j <= order-2;j++) {
405  theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
406  theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
407  theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
408  }
409 
410  theta[0].x = div*(((T)1) - wx)*theta[0].x;
411  theta[0].y = div*(((T)1) - wy)*theta[0].y;
412  theta[0].z = div*(((T)1) - wz)*theta[0].z;
413 }
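The derivative block above uses the standard B-spline identity, stated here for reference: the dtheta differences are formed from the order-1 weights before the final recursion raises theta to the full order, because

  dM_n(w)/dw = M_{n-1}(w) - M_{n-1}(w-1).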
__forceinline__ __device__ double expfunc ( const double  x)

Definition at line 50 of file CudaPmeSolverUtilKernel.cu.

Referenced by scalar_sum_kernel().

50  {
51  return exp(x);
52 }
__forceinline__ __device__ float expfunc ( const float  x)

Definition at line 54 of file CudaPmeSolverUtilKernel.cu.

54  {
55  return __expf(x);
56 }
template<typename CT , typename FT , int order, bool periodicY, bool periodicZ>
__global__ void gather_force ( const float4 *  xyzq,
const int  ncoord,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const float *  data,
const cudaTextureObject_t  gridTexObj,
const int  stride,
FT *  force 
)

Definition at line 597 of file CudaPmeSolverUtilKernel.cu.

References __ldg, BLOCK_SYNC, order, WARP_BALLOT, WARP_FULL_MASK, WARP_SHUFFLE, x, y, and z.

Referenced by CudaPmeRealSpaceCompute::gatherForce().

607  {
608 
609  const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
610 
611  // Shared memory
612  __shared__ gather_t<CT, order> shmem[32];
613 
614  const int pos = blockIdx.x*blockDim.x + threadIdx.x;
615  const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
616 
617  // Load atom data into shared memory
618  if (pos < pos_end && threadIdx.y == 0) {
619 
620  float4 xyzqi = xyzq[pos];
621  float x = xyzqi.x;
622  float y = xyzqi.y;
623  float z = xyzqi.z;
624  float q = xyzqi.w;
625 
626  float frx = ((float)nfftx)*x;
627  float fry = ((float)nffty)*y;
628  float frz = ((float)nfftz)*z;
629 
630  int frxi = (int)frx;
631  int fryi = (int)fry;
632  int frzi = (int)frz;
633 
634  float wx = frx - (float)frxi;
635  float wy = fry - (float)fryi;
636  float wz = frz - (float)frzi;
637 
638  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
639  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
640 
641  shmem[threadIdx.x].ix = frxi;
642  shmem[threadIdx.x].iy = fryi - y00;
643  shmem[threadIdx.x].iz = frzi - z00;
644  shmem[threadIdx.x].charge = q;
645 
646  float3 theta_tmp[order];
647  float3 dtheta_tmp[order];
648  calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
649 
650 #pragma unroll
651  for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
652 
653 #pragma unroll
654  for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
655 
656 #pragma unroll
657  for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
658 
659 #pragma unroll
660  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
661 
662 #pragma unroll
663  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
664 
665 #pragma unroll
666  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
667 
668  }
669  BLOCK_SYNC;
670 
671  // We divide the order x order x order cube into 8 sub-cubes.
 672  // Each of these sub-cubes is handled by a single thread
673  // The size of the sub-cubes is:
674  // order=4 : 2x2x2
675  // order=6 : 3x3x3
676  // order=8 : 4x4x4
677  const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
678  // Calculate the starting index on the sub-cube for this thread
679  // tid = 0...63
680  const int t = (tid % 8); // sub-cube index (0...7)
681  // t = (tx0 + ty0*2 + tz0*4)/nsc
682  // (tx0, ty0, tz0) gives the starting index of the 3x3x3 sub-cube
683  const int tz0 = (t / 4)*nsc;
684  const int ty0 = ((t / 2) % 2)*nsc;
685  const int tx0 = (t % 2)*nsc;
686 
687  //
688  // Calculate forces for 32 atoms. We have 32*2 = 64 threads
689  // Loop is iterated 4 times:
690  // (iterations)
691  // Threads 0...7 = atoms 0, 8, 16, 24
692  // Threads 8...15 = atoms 1, 9, 17, 25
 693  // Threads 16...23 = atoms 2, 10, 18, 26
694  // ...
695  // Threads 56...63 = atoms 7, 15, 23, 31
696  //
697 
698  int base = tid/8;
699  const int base_end = pos_end - blockIdx.x*blockDim.x;
700 
701  // Make sure to mask out any threads that are not running the loop.
702  // This will happen if the number of atoms is not a multiple of 32.
703  WarpMask warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
704 
705  while (base < base_end) {
706 
707  float f1 = 0.0f;
708  float f2 = 0.0f;
709  float f3 = 0.0f;
710  int ix0 = shmem[base].ix;
711  int iy0 = shmem[base].iy;
712  int iz0 = shmem[base].iz;
713 
714  // Each thread calculates a nsc x nsc x nsc sub-cube
715 #pragma unroll
716  for (int i=0;i < nsc*nsc*nsc;i++) {
717  int tz = tz0 + (i/(nsc*nsc));
718  int ty = ty0 + ((i/nsc) % nsc);
719  int tx = tx0 + (i % nsc);
720 
721  int ix = ix0 + tx;
722  int iy = iy0 + ty;
723  int iz = iz0 + tz;
724  if (ix >= nfftx) ix -= nfftx;
725 
726  if (periodicY && (iy >= nffty)) iy -= nffty;
727  if (!periodicY && (iy < 0 || iy >= ysize)) continue;
728 
729  if (periodicZ && (iz >= nfftz)) iz -= nfftz;
730  if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
731 
732  int ind = ix + (iy + iz*ysize)*xdim;
733 
734 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
735  float q0 = __ldg(&data[ind]);
736 #else
737  float q0 = tex1Dfetch<float>(gridTexObj, ind);
738 #endif
739  float thx0 = shmem[base].thetax[tx];
740  float thy0 = shmem[base].thetay[ty];
741  float thz0 = shmem[base].thetaz[tz];
742  float dthx0 = shmem[base].dthetax[tx];
743  float dthy0 = shmem[base].dthetay[ty];
744  float dthz0 = shmem[base].dthetaz[tz];
745  f1 += dthx0 * thy0 * thz0 * q0;
746  f2 += thx0 * dthy0 * thz0 * q0;
747  f3 += thx0 * thy0 * dthz0 * q0;
748  }
749 
750  //-------------------------
751 
752  // Reduce
753  const int i = threadIdx.x & 7;
754 
755  f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
756  f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
757  f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
758 
759  f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
760  f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
761  f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
762 
763  f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
764  f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
765  f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
766 
767  if (i == 0) {
768  shmem[base].f1 = f1;
769  shmem[base].f2 = f2;
770  shmem[base].f3 = f3;
771  }
772 
773  base += 8;
774  warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
775  }
776 
777  // Write forces
778  BLOCK_SYNC;
779  if (pos < pos_end && threadIdx.y == 0) {
780  float f1 = shmem[threadIdx.x].f1;
781  float f2 = shmem[threadIdx.x].f2;
782  float f3 = shmem[threadIdx.x].f3;
783  float q = -shmem[threadIdx.x].charge; //*ccelec_float;
784  // float fx = q*recip1*f1*nfftx;
785  // float fy = q*recip2*f2*nffty;
786  // float fz = q*recip3*f3*nfftz;
787  float fx = q*f1*nfftx;
788  float fy = q*f2*nffty;
789  float fz = q*f3*nfftz;
790  gather_force_store<FT>(fx, fy, fz, stride, pos, force);
791  }
792 
793 }
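In scaled (fractional) terms, the per-atom quantities accumulated above are, as a reference statement consistent with the loop body and the commented-out recip factors,

  f1 = sum over the order^3 stencil of dthetax[tx] * thetay[ty] * thetaz[tz] * Q(ix0+tx, iy0+ty, iz0+tz)

(and cyclically for f2, f3), with the stored components fx = -q*nfftx*f1, fy = -q*nffty*f2, fz = -q*nfftz*f3. The conversion to Cartesian forces via the reciprocal lattice vectors appears to be applied outside this kernel, since the recip1/recip2/recip3 multiplications are commented out.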
void gather_force ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
const float *  data,
const int  order,
float3 *  force,
const cudaTextureObject_t  gridTexObj,
cudaStream_t  stream 
)

Definition at line 1225 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, and cudaNAMD_bug().

1235  {
1236 
1237  dim3 nthread(32, 2, 1);
1238  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
1239  // dim3 nblock(npatch, 1, 1);
1240 
1241  switch(order) {
1242  case 4:
1243  if (periodicY && periodicZ)
1244  gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>> (
1245  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1246  // recip11, recip22, recip33,
1247  data,
1248 #ifdef NAMD_CUDA
1249  gridTexObj,
1250 #endif
1251  1, force);
1252  else if (periodicY)
1253  gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>> (
1254  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1255  // recip11, recip22, recip33,
1256  data,
1257 #ifdef NAMD_CUDA
1258  gridTexObj,
1259 #endif
1260  1, force);
1261  else if (periodicZ)
1262  gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>> (
1263  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1264  // recip11, recip22, recip33,
1265  data,
1266 #ifdef NAMD_CUDA
1267  gridTexObj,
1268 #endif
1269  1, force);
1270  else
1271  gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>> (
1272  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1273  // recip11, recip22, recip33,
1274  data,
1275 #ifdef NAMD_CUDA
1276  gridTexObj,
1277 #endif
1278  1, force);
1279  break;
1280 
1281  case 6:
1282  if (periodicY && periodicZ)
1283  gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>> (
1284  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1285  // recip11, recip22, recip33,
1286  data,
1287 #ifdef NAMD_CUDA
1288  gridTexObj,
1289 #endif
1290  1, force);
1291  else if (periodicY)
1292  gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>> (
1293  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1294  // recip11, recip22, recip33,
1295  data,
1296 #ifdef NAMD_CUDA
1297  gridTexObj,
1298 #endif
1299  1, force);
1300  else if (periodicZ)
1301  gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>> (
1302  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1303  // recip11, recip22, recip33,
1304  data,
1305 #ifdef NAMD_CUDA
1306  gridTexObj,
1307 #endif
1308  1, force);
1309  else
1310  gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>> (
1311  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1312  // recip11, recip22, recip33,
1313  data,
1314 #ifdef NAMD_CUDA
1315  gridTexObj,
1316 #endif
1317  1, force);
1318  break;
1319 
1320  case 8:
1321  if (periodicY && periodicZ)
1322  gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>> (
1323  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1324  // recip11, recip22, recip33,
1325  data,
1326 #ifdef NAMD_CUDA
1327  gridTexObj,
1328 #endif
1329  1, force);
1330  else if (periodicY)
1331  gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>> (
1332  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1333  // recip11, recip22, recip33,
1334  data,
1335 #ifdef NAMD_CUDA
1336  gridTexObj,
1337 #endif
1338  1, force);
1339  else if (periodicZ)
1340  gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>> (
1341  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1342  // recip11, recip22, recip33,
1343  data,
1344 #ifdef NAMD_CUDA
1345  gridTexObj,
1346 #endif
1347  1, force);
1348  else
1349  gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>> (
1350  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1351  // recip11, recip22, recip33,
1352  data,
1353 #ifdef NAMD_CUDA
1354  gridTexObj,
1355 #endif
1356  1, force);
1357  break;
1358 
1359  default:
1360  char str[128];
1361  sprintf(str, "gather_force, order %d not implemented",order);
1362  cudaNAMD_bug(str);
1363  }
1364  cudaCheck(cudaGetLastError());
1365 
1366 }
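A minimal call sketch (hypothetical buffer names, not from the file): data holds the back-transformed potential grid, force receives numAtoms float3 entries, and gridTexObj is a texture object bound to the same grid, used only on the pre-sm_35 fetch path of the kernel.

// Gather per-atom forces from the potential grid d_grid (sketch only).
// d_atoms: numAtoms float4 entries (fractional x,y,z and charge in w).
gather_force(d_atoms, numAtoms,
             nfftx, nffty, nfftz,
             xsize, ysize, zsize, xdim,
             y00, z00,
             /*periodicY=*/true, /*periodicZ=*/true,
             d_grid, /*order=*/4,
             d_force, gridTexObj, stream);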
template<typename T >
__forceinline__ __device__ void gather_force_store ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
T *  force 
)

Definition at line 529 of file CudaPmeSolverUtilKernel.cu.

531  {
532 }
template<>
__forceinline__ __device__ void gather_force_store< float > ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
float *  force 
)

Definition at line 536 of file CudaPmeSolverUtilKernel.cu.

538  {
539  // Store into non-strided float XYZ array
540 #ifdef NAMD_HIP
541  // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
542  // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
543  reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
544  reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
545  reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
546 #else
547  force[pos] = fx;
548  force[pos+stride] = fy;
549  force[pos+stride*2] = fz;
550 #endif
551 }
template<>
__forceinline__ __device__ void gather_force_store< float3 > ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
float3 *  force 
)

Definition at line 555 of file CudaPmeSolverUtilKernel.cu.

557  {
558  // Store into non-strided "float3" array
559 #ifdef NAMD_HIP
560  // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
561  // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
562  reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
563  reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
564  reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
565 #else
566  force[pos].x = fx;
567  force[pos].y = fy;
568  force[pos].z = fz;
569 #endif
570 }
void scalar_sum ( const bool  orderXYZ,
const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const double  kappa,
const float  recip1x,
const float  recip1y,
const float  recip1z,
const float  recip2x,
const float  recip2y,
const float  recip2z,
const float  recip3x,
const float  recip3y,
const float  recip3z,
const double  volume,
const float *  prefac1,
const float *  prefac2,
const float *  prefac3,
const int  k2_00,
const int  k3_00,
const bool  doEnergyVirial,
double *  energy,
double *  virial,
float2 *  data,
cudaStream_t  stream 
)

Definition at line 1158 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, M_PI, and WARPSIZE.

Referenced by CudaPmeKSpaceCompute::solve().

1167  {
1168 #ifdef NAMD_CUDA
1169  int nthread = 1024;
1170  int nblock = 64;
1171 #else
1172  int nthread = 256;
1173  int nblock = 256;
1174 #endif
1175 
1176  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1177  if (doEnergyVirial) {
1178  shmem_size = max(shmem_size, (int)((nthread/WARPSIZE)*sizeof(RecipVirial_t)));
1179  }
1180 
1181  float piv_inv = (float)(1.0/(M_PI*volume));
1182  float fac = (float)(M_PI*M_PI/(kappa*kappa));
1183 
1184  if (doEnergyVirial) {
1185  if (orderXYZ) {
1186  scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>> (
1187  nfft1, nfft2, nfft3, size1, size2, size3,
1188  recip1x, recip1y, recip1z,
1189  recip2x, recip2y, recip2z,
1190  recip3x, recip3y, recip3z,
1191  prefac1, prefac2, prefac3,
1192  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1193  } else {
1194  scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>> (
1195  nfft1, nfft2, nfft3, size1, size2, size3,
1196  recip1x, recip1y, recip1z,
1197  recip2x, recip2y, recip2z,
1198  recip3x, recip3y, recip3z,
1199  prefac1, prefac2, prefac3,
1200  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1201  }
1202  } else {
1203  if (orderXYZ) {
1204  scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>> (
1205  nfft1, nfft2, nfft3, size1, size2, size3,
1206  recip1x, recip1y, recip1z,
1207  recip2x, recip2y, recip2z,
1208  recip3x, recip3y, recip3z,
1209  prefac1, prefac2, prefac3,
1210  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1211  } else {
1212  scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>> (
1213  nfft1, nfft2, nfft3, size1, size2, size3,
1214  recip1x, recip1y, recip1z,
1215  recip2x, recip2y, recip2z,
1216  recip3x, recip3y, recip3z,
1217  prefac1, prefac2, prefac3,
1218  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1219  }
1220  }
1221  cudaCheck(cudaGetLastError());
1222 
1223 }
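A minimal call sketch assuming an orthorhombic cell, where the reciprocal vectors reduce to the diagonal entries 1/Lx, 1/Ly, 1/Lz (hypothetical names; d_prefac1..3, d_energy, d_virial and d_grid_fft are device buffers set up elsewhere).

// Apply the reciprocal-space convolution to the transformed charge grid (sketch only).
double kappa  = 0.32;                          // Ewald factor (example value)
double volume = (double)Lx * Ly * Lz;
scalar_sum(/*orderXYZ=*/true,
           nfft1, nfft2, nfft3, size1, size2, size3,
           kappa,
           1.0f/Lx, 0.0f, 0.0f,                // recip1
           0.0f, 1.0f/Ly, 0.0f,                // recip2
           0.0f, 0.0f, 1.0f/Lz,                // recip3
           volume,
           d_prefac1, d_prefac2, d_prefac3,
           k2_00, k3_00,
           /*doEnergyVirial=*/true, d_energy, d_virial,
           d_grid_fft, stream);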
template<typename T , typename T2 , bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel ( const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const T  recip1x,
const T  recip1y,
const T  recip1z,
const T  recip2x,
const T  recip2y,
const T  recip2z,
const T  recip3x,
const T  recip3y,
const T  recip3z,
const T *  prefac1,
const T *  prefac2,
const T *  prefac3,
const T  pi_ewald,
const T  piv_inv,
const int  k2_00,
const int  k3_00,
T2 *  data,
double *__restrict__  energy_recip,
double *__restrict__  virial 
)

Definition at line 64 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, RecipVirial_t::energy, expfunc(), RecipVirial_t::virial, WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

74  {
75  // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
76  #ifdef NAMD_CUDA
77  extern __shared__ T sh_prefac[];
78  #else //NAMD_HIP
79  HIP_DYNAMIC_SHARED( T, sh_prefac)
80  #endif
81 
82  // Create pointers to shared memory
83  T* sh_prefac1 = (T *)&sh_prefac[0];
84  T* sh_prefac2 = (T *)&sh_prefac[nfft1];
85  T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
86 
87  // Calculate start position (k1, k2, k3) for each thread
88  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
89  int k3 = tid/(size1*size2);
90  tid -= k3*size1*size2;
91  int k2 = tid/size1;
92  int k1 = tid - k2*size1;
93 
94  // Starting position in data
95  int pos = k1 + (k2 + k3*size2)*size1;
96 
97  // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
98  k2 += k2_00;
99  k3 += k3_00;
100 
101  // Calculate limits w.r.t. global coordinates
102  const int lim2 = size2 + k2_00;
103  const int lim3 = size3 + k3_00;
104 
105  // Calculate increments (k1_inc, k2_inc, k3_inc)
106  int tot_inc = blockDim.x*gridDim.x;
107  const int k3_inc = tot_inc/(size1*size2);
108  tot_inc -= k3_inc*size1*size2;
109  const int k2_inc = tot_inc/size1;
110  const int k1_inc = tot_inc - k2_inc*size1;
111 
112  // Set data[0] = 0 for the global (0,0,0)
113  if (k1 == 0 && k2 == 0 && k3 == 0) {
114  T2 zero;
115  zero.x = (T)0;
116  zero.y = (T)0;
117  data[0] = zero;
118  // Increment position
119  k1 += k1_inc;
120  pos += k1_inc;
121  if (k1 >= size1) {
122  k1 -= size1;
123  k2++;
124  }
125  k2 += k2_inc;
126  pos += k2_inc*size1;
127  if (k2 >= lim2) {
128  k2 -= size2;
129  k3++;
130  }
131  k3 += k3_inc;
132  pos += k3_inc*size1*size2;
133  }
134 
135  // Load prefac data into shared memory
136  {
137  int t = threadIdx.x;
138  while (t < nfft1) {
139  sh_prefac1[t] = prefac1[t];
140  t += blockDim.x;
141  }
142  t = threadIdx.x;
143  while (t < nfft2) {
144  sh_prefac2[t] = prefac2[t];
145  t += blockDim.x;
146  }
147  t = threadIdx.x;
148  while (t < nfft3) {
149  sh_prefac3[t] = prefac3[t];
150  t += blockDim.x;
151  }
152  }
153  BLOCK_SYNC;
154 
155  double energy = 0.0;
156  double virial0 = 0.0;
157  double virial1 = 0.0;
158  double virial2 = 0.0;
159  double virial3 = 0.0;
160  double virial4 = 0.0;
161  double virial5 = 0.0;
162 
163  // If nfft1 is odd, set nfft1_half to impossible value so that
164  // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
165  // is never satisfied
166  const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
167  const int nfft2_half = nfft2/2;
168  const int nfft3_half = nfft3/2;
169 
170  while (k3 < lim3) {
171 
172  T2 q = data[pos];
173 
174  int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
175  int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
176 
177  T m1, m2, m3;
178  if (doOrtho) {
179  m1 = recip1x*k1;
180  m2 = recip2y*k2s;
181  m3 = recip3z*k3s;
182  } else {
183  m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
184  m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
185  m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
186  }
187 
188  T msq = m1*m1 + m2*m2 + m3*m3;
189  T msq_inv = ((T)1)/msq;
190 
191  T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
192  T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
193  if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
194  T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
195  T C = xp3*msq_inv;
196  T theta = theta3*C;
197 
198  if (calc_energy_virial) {
199  T fac = q2*C;
200  T vir = ((T)2)*(pi_ewald + msq_inv);
201  energy += fac;
202  virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
203  virial1 += (double)(fac*vir*m1*m2);
204  virial2 += (double)(fac*vir*m1*m3);
205  virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
206  virial4 += (double)(fac*vir*m2*m3);
207  virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
208  }
209 
210  q.x *= theta;
211  q.y *= theta;
212  data[pos] = q;
213 
214  // Increment position
215  k1 += k1_inc;
216  pos += k1_inc;
217  if (k1 >= size1) {
218  k1 -= size1;
219  k2++;
220  }
221  k2 += k2_inc;
222  pos += k2_inc*size1;
223  if (k2 >= lim2) {
224  k2 -= size2;
225  k3++;
226  }
227  k3 += k3_inc;
228  pos += k3_inc*size2*size1;
229  }
230 
231  // Reduce energy and virial
232  if (calc_energy_virial) {
233  const int tid = threadIdx.x & (warpSize-1);
234  const int base = (threadIdx.x/warpSize);
235  volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
236  // Reduce within warps
237  for (int d=warpSize/2;d >= 1;d /= 2) {
238  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
239  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
240  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
241  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
242  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
243  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
244  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
245  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
246  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
247  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
248  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
249  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
250  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
251  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
252  }
253  // Reduce between warps
254  // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
255  BLOCK_SYNC;
256  if (tid == 0) {
257  sh_ev[base].energy = energy;
258  sh_ev[base].virial[0] = virial0;
259  sh_ev[base].virial[1] = virial1;
260  sh_ev[base].virial[2] = virial2;
261  sh_ev[base].virial[3] = virial3;
262  sh_ev[base].virial[4] = virial4;
263  sh_ev[base].virial[5] = virial5;
264  }
265  BLOCK_SYNC;
266  if (base == 0) {
267  energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
268  virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
269  virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
270  virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
271  virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
272  virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
273  virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
274  for (int d=warpSize/2;d >= 1;d /= 2) {
275  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
276  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
277  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
278  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
279  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
280  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
281  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
282  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
283  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
284  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
285  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
286  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
287  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
288  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
289  }
290  }
291 
292  if (threadIdx.x == 0) {
293  atomicAdd(energy_recip, energy*0.5);
294  virial0 *= -0.5;
295  virial1 *= -0.5;
296  virial2 *= -0.5;
297  virial3 *= -0.5;
298  virial4 *= -0.5;
299  virial5 *= -0.5;
300  atomicAdd(&virial[0], virial0);
301  atomicAdd(&virial[1], virial1);
302  atomicAdd(&virial[2], virial2);
303  atomicAdd(&virial[3], virial3);
304  atomicAdd(&virial[4], virial4);
305  atomicAdd(&virial[5], virial5);
306  }
307 
308  }
309 
310 }
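For reference, the quantity accumulated into energy_recip is the standard smooth-PME reciprocal-space sum, with pi_ewald = pi^2/kappa^2 and piv_inv = 1/(pi*V) supplied by scalar_sum():

\[
E_{\mathrm{recip}} = \frac{1}{2\pi V} \sum_{\mathbf{m}\neq 0}
\frac{\exp(-\pi^2 |\mathbf{m}|^2/\kappa^2)}{|\mathbf{m}|^2}\,
B_1(m_1)\,B_2(m_2)\,B_3(m_3)\,\bigl|S(\mathbf{m})\bigr|^2
\]

where B_1, B_2, B_3 are the B-spline prefactors (prefac1/2/3) and S(m) are the complex grid values in data. The factor of 2 folded into q2 and the final multiplication by 0.5 account for the half-spectrum storage of the real-to-complex FFT, hence the extra 0.5 on the k1 = 0 and k1 = nfft1/2 planes.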
void spread_charge ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
float *  data,
const int  order,
cudaStream_t  stream 
)

Definition at line 1072 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, cudaNAMD_bug(), and WARPSIZE.

Referenced by CudaPmeRealSpaceCompute::spreadCharge().

1077  {
1078 
1079  dim3 nthread, nblock;
1080  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
1081  switch(order) {
1082  case 4:
1083  nthread.x = WARPSIZE;
1084 #if defined(NAMD_CUDA)
1085  nthread.y = 4;
1086 #else
1087  nthread.y = order3 / WARPSIZE;
1088 #endif
1089  nthread.z = 1;
1090  nblock.x = (numAtoms - 1)/nthread.x + 1;
1091  nblock.y = 1;
1092  nblock.z = 1;
1093  if (periodicY && periodicZ)
1094  spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1095  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1096  else if (periodicY)
1097  spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1098  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1099  else if (periodicZ)
1100  spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1101  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1102  else
1103  spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1104  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1105  break;
1106 
1107  case 6:
1108  nthread.x = WARPSIZE;
1109  nthread.y = order3/WARPSIZE;
1110  nthread.z = 1;
1111  nblock.x = (numAtoms - 1)/nthread.x + 1;
1112  nblock.y = 1;
1113  nblock.z = 1;
1114  if (periodicY && periodicZ)
1115  spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1116  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1117  else if (periodicY)
1118  spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1119  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1120  else if (periodicZ)
1121  spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1122  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1123  else
1124  spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1125  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1126  break;
1127 
1128  case 8:
1129  nthread.x = WARPSIZE;
1130  nthread.y = order3/WARPSIZE;
1131  nthread.z = 1;
1132  nblock.x = (numAtoms - 1)/nthread.x + 1;
1133  nblock.y = 1;
1134  nblock.z = 1;
1135  if (periodicY && periodicZ)
1136  spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1137  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1138  else if (periodicY)
1139  spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1140  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1141  else if (periodicZ)
1142  spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1143  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1144  else
1145  spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1146  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1147  break;
1148 
1149  default:
1150  char str[128];
1151  sprintf(str, "spread_charge, order %d not implemented",order);
1152  cudaNAMD_bug(str);
1153  }
1154  cudaCheck(cudaGetLastError());
1155 
1156 }
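A minimal call sketch (hypothetical buffer names, not from the file). Because write_grid() accumulates with atomicAdd, the grid is assumed to be cleared before spreading.

// Spread charges onto the charge grid before the forward FFTs (sketch only).
// d_atoms: numAtoms float4 entries with fractional coordinates in x,y,z and charge in w.
cudaCheck(cudaMemsetAsync(d_grid, 0, sizeof(float)*xdim*ysize*zsize, stream));
spread_charge(d_atoms, numAtoms,
              nfftx, nffty, nfftz,
              xsize, ysize, zsize, xdim,
              y00, z00,
              /*periodicY=*/true, /*periodicZ=*/true,
              d_grid, /*order=*/4, stream);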
template<typename AT , int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel ( const float4 *  xyzq,
const int  ncoord,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
AT *  data 
)

Definition at line 422 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, order, WARPSIZE, x, y, and z.

426  {
427 
428  // Shared memory use:
429  // order = 4: 1920 bytes
430  // order = 6: 2688 bytes
431  // order = 8: 3456 bytes
432  __shared__ int sh_ix[WARPSIZE];
433  __shared__ int sh_iy[WARPSIZE];
434  __shared__ int sh_iz[WARPSIZE];
435  __shared__ float sh_thetax[order*WARPSIZE];
436  __shared__ float sh_thetay[order*WARPSIZE];
437  __shared__ float sh_thetaz[order*WARPSIZE];
438 
439  // Process atoms pos to pos_end-1 (blockDim.x)
440  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
441  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
442 
443  if (pos < pos_end && threadIdx.y == 0) {
444 
445  float4 xyzqi = xyzq[pos];
446  float x = xyzqi.x;
447  float y = xyzqi.y;
448  float z = xyzqi.z;
449  float q = xyzqi.w;
450 
451  float frx = ((float)nfftx)*x;
452  float fry = ((float)nffty)*y;
453  float frz = ((float)nfftz)*z;
454 
455  int frxi = (int)frx;
456  int fryi = (int)fry;
457  int frzi = (int)frz;
458 
459  float wx = frx - (float)frxi;
460  float wy = fry - (float)fryi;
461  float wz = frz - (float)frzi;
462 
463  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
464  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
465 
466  sh_ix[threadIdx.x] = frxi;
467  sh_iy[threadIdx.x] = fryi - y00;
468  sh_iz[threadIdx.x] = frzi - z00;
469 
470  float theta[order];
471 
472  calc_one_theta<float, order>(wx, theta);
473 #pragma unroll
474  for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
475 
476  calc_one_theta<float, order>(wy, theta);
477 #pragma unroll
478  for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
479 
480  calc_one_theta<float, order>(wz, theta);
481 #pragma unroll
482  for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
483 
484  }
485 
486  BLOCK_SYNC;
487 
488  // Grid point location, values of (ix0, iy0, iz0) are in range 0..order-1
489  // NOTE: Only tid=0...order*order*order-1 do any computation
490  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
491  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
492  const int x0 = tid % order;
493  const int y0 = (tid / order) % order;
494  const int z0 = tid / (order*order);
495 
496  // Loop over atoms pos..pos_end-1
497  int iadd = blockDim.x*blockDim.y/order3;
498  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
499  int iend = pos_end - blockIdx.x*blockDim.x;
500  for (;i < iend;i += iadd) {
501  int x = sh_ix[i] + x0;
502  int y = sh_iy[i] + y0;
503  int z = sh_iz[i] + z0;
504 
505  if (x >= nfftx) x -= nfftx;
506 
507  if (periodicY && (y >= nffty)) y -= nffty;
508  if (!periodicY && (y < 0 || y >= ysize)) continue;
509 
510  if (periodicZ && (z >= nfftz)) z -= nfftz;
511  if (!periodicZ && (z < 0 || z >= zsize)) continue;
512 
513  // Get position on the grid
514  int ind = x + xdim*(y + ysize*(z));
515 
516  // Here we unroll the 6x6x6 loop with 216 threads.
517  // NOTE: We use 7*32=224 threads to do this
518  // Calculate interpolated charge value and store it to global memory
519 
520  if (tid < order*order*order)
521  write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
522  }
523 
524 }
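The x, y, z read from xyzq are treated as fractional coordinates (frx = nfftx*x and so on), so Cartesian positions are assumed to have been wrapped and scaled before the kernel runs. A sketch of that conversion for an orthogonal box, with hypothetical names:

// Convert a Cartesian position to the fractional coordinates the kernel expects
// (orthogonal box; a triclinic cell would use the full reciprocal vectors).
float4 xyzq_frac;
xyzq_frac.x = (xcart - floorf(xcart / Lx) * Lx) / Lx;   // wrapped into [0,1)
xyzq_frac.y = (ycart - floorf(ycart / Ly) * Ly) / Ly;
xyzq_frac.z = (zcart - floorf(zcart / Lz) * Lz) / Lz;
xyzq_frac.w = charge;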
void transpose_xyz_yzx ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1371 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

1375  {
1376 
1377  dim3 numthread(TILEDIM, TILEROWS, 1);
1378  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
1379  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (
1380  nx, ny, nz, xsize_in, ysize_in,
1381  ysize_out, zsize_out,
1382  data_in, data_out);
1383 
1384  cudaCheck(cudaGetLastError());
1385 }
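Element for element, the transpose realised here is equivalent to the naive host loop below (a sketch for verification only); the "yzx" in the name is the fastest-to-slowest ordering of the output layout.

// Naive host equivalent of transpose_xyz_yzx (verification sketch).
for (int z = 0; z < nz; z++)
  for (int y = 0; y < ny; y++)
    for (int x = 0; x < nx; x++)
      data_out[y + (z + x*zsize_out)*ysize_out] =
          data_in[x + (y + z*ysize_in)*xsize_in];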
template<typename T >
__device__ __forceinline__ void transpose_xyz_yzx_device ( const int  x_in,
const int  y_in,
const int  z_in,
const int  x_out,
const int  y_out,
const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 800 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, TILEDIM, and TILEROWS.

806  {
807 
808  // Shared memory
809  __shared__ T tile[TILEDIM][TILEDIM+1];
810 
811  // Read (x,y) data_in into tile (shared memory)
812  for (int j=0;j < TILEDIM;j += TILEROWS)
813  if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
814  tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
815 
816  BLOCK_SYNC;
817 
818  // Write (y,x) tile into data_out
819  const int z_out = z_in;
820  for (int j=0;j < TILEDIM;j += TILEROWS)
821  if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
822  data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
823 }
template<typename T >
__global__ void transpose_xyz_yzx_kernel ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 829 of file CudaPmeSolverUtilKernel.cu.

References TILEDIM.

833  {
834 
835  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
836  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
837  int z_in = blockIdx.z + threadIdx.z;
838 
839  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
840  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
841 
842  transpose_xyz_yzx_device<T>(
843  x_in, y_in, z_in,
844  x_out, y_out,
845  nx, ny, nz,
846  xsize_in, ysize_in,
847  ysize_out, zsize_out,
848  data_in, data_out);
849 
850 /*
851  // Shared memory
852  __shared__ T tile[TILEDIM][TILEDIM+1];
853 
854  int x = blockIdx.x * TILEDIM + threadIdx.x;
855  int y = blockIdx.y * TILEDIM + threadIdx.y;
856  int z = blockIdx.z + threadIdx.z;
857 
858  // Read (x,y) data_in into tile (shared memory)
859  for (int j=0;j < TILEDIM;j += TILEROWS)
860  if ((x < nx) && (y + j < ny) && (z < nz))
861  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
862 
863  BLOCK_SYNC;
864 
865  // Write (y,x) tile into data_out
866  x = blockIdx.x * TILEDIM + threadIdx.y;
867  y = blockIdx.y * TILEDIM + threadIdx.x;
868  for (int j=0;j < TILEDIM;j += TILEROWS)
869  if ((x + j < nx) && (y < ny) && (z < nz))
870  data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
871 */
872 }
void transpose_xyz_zxy ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1406 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

1410  {
1411 
1412  dim3 numthread(TILEDIM, TILEROWS, 1);
1413  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
1414  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (
1415  nx, ny, nz, xsize_in, ysize_in,
1416  zsize_out, xsize_out,
1417  data_in, data_out);
1418 
1419  cudaCheck(cudaGetLastError());
1420 }
template<typename T >
__device__ __forceinline__ void transpose_xyz_zxy_device ( const int  x_in,
const int  y_in,
const int  z_in,
const int  x_out,
const int  z_out,
const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 978 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, TILEDIM, and TILEROWS.

984  {
985 
986  // Shared memory
987  __shared__ T tile[TILEDIM][TILEDIM+1];
988 
989  // Read (x,z) data_in into tile (shared memory)
990  for (int k=0;k < TILEDIM;k += TILEROWS)
991  if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
992  tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
993 
994  BLOCK_SYNC;
995 
996  // Write (z,x) tile into data_out
997  const int y_out = y_in;
998  for (int k=0;k < TILEDIM;k += TILEROWS)
999  if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
1000  data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
1001 }
template<typename T >
__global__ void transpose_xyz_zxy_kernel ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 1007 of file CudaPmeSolverUtilKernel.cu.

References TILEDIM.

1011  {
1012 
1013  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1014  int y_in = blockIdx.z + threadIdx.z;
1015  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1016 
1017  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1018  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1019 
1020  transpose_xyz_zxy_device<T>(
1021  x_in, y_in, z_in, x_out, z_out,
1022  nx, ny, nz,
1023  xsize_in, ysize_in,
1024  zsize_out, xsize_out,
1025  data_in, data_out);
1026 
1027 }
template<typename T >
__forceinline__ __device__ void write_grid ( const float  val,
const int  ind,
T *  data 
)

Definition at line 313 of file CudaPmeSolverUtilKernel.cu.

314  {
315 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR >= 3) && (HIP_VERSION_MINOR > 3))
316  atomicAddNoRet(&data[ind], (T)val);
317 #else
318  atomicAdd(&data[ind], (T)val);
319 #endif
320 }

Variable Documentation

const int TILEDIM = 32
const int TILEROWS = 8