NAMD
|
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include "CudaUtils.h"
#include "CudaPmeSolverUtilKernel.h"
Go to the source code of this file.
Classes | |
struct | RecipVirial_t |
struct | gather_t< T, order > |
Functions | |
__forceinline__ __device__ double | expfunc (const double x) |
__forceinline__ __device__ float | expfunc (const float x) |
template<typename T , typename T2 , bool calc_energy_virial, bool orderXYZ, bool doOrtho> | |
__global__ void | scalar_sum_kernel (const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const T recip1x, const T recip1y, const T recip1z, const T recip2x, const T recip2y, const T recip2z, const T recip3x, const T recip3y, const T recip3z, const T *prefac1, const T *prefac2, const T *prefac3, const T pi_ewald, const T piv_inv, const int k2_00, const int k3_00, T2 *data, double *__restrict__ energy_recip, double *__restrict__ virial) |
template<typename T > | |
__forceinline__ __device__ void | write_grid (const float val, const int ind, T *data) |
template<typename T , int order> | |
__forceinline__ __device__ void | calc_one_theta (const T w, T *theta) |
template<typename T , typename T3 , int order> | |
__forceinline__ __device__ void | calc_theta_dtheta (T wx, T wy, T wz, T3 *theta, T3 *dtheta) |
template<typename AT , int order, bool periodicY, bool periodicZ> | |
__global__ void | spread_charge_kernel (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, AT *data) |
template<typename T > | |
__forceinline__ __device__ void | gather_force_store (const float fx, const float fy, const float fz, const int stride, const int pos, T *force) |
template<> | |
__forceinline__ __device__ void | gather_force_store< float > (const float fx, const float fy, const float fz, const int stride, const int pos, float *force) |
template<> | |
__forceinline__ __device__ void | gather_force_store< float3 > (const float fx, const float fy, const float fz, const int stride, const int pos, float3 *force) |
template<typename CT , typename FT , int order, bool periodicY, bool periodicZ> | |
__global__ void | gather_force (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const float *data, const cudaTextureObject_t gridTexObj, const int stride, FT *force) |
template<typename T > | |
__device__ __forceinline__ void | transpose_xyz_yzx_device (const int x_in, const int y_in, const int z_in, const int x_out, const int y_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out) |
template<typename T > | |
__global__ void | transpose_xyz_yzx_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out) |
template<typename T > | |
__global__ void | batchTranspose_xyz_yzx_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in) |
template<typename T > | |
__device__ __forceinline__ void | transpose_xyz_zxy_device (const int x_in, const int y_in, const int z_in, const int x_out, const int z_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out) |
template<typename T > | |
__global__ void | transpose_xyz_zxy_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out) |
template<typename T > | |
__global__ void | batchTranspose_xyz_zxy_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in) |
void | spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream) |
void | scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream) |
void | gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream) |
void | transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream) |
void | batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream) |
void | transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream) |
void | batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream) |
Variables | |
const int | TILEDIM = 32 |
const int | TILEROWS = 8 |
void batchTranspose_xyz_yzx | ( | const int | numBatches, |
TransposeBatch< float2 > * | batches, | ||
const int | max_nx, | ||
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
cudaStream_t | stream | ||
) |
Definition at line 1390 of file CudaPmeSolverUtilKernel.cu.
References cudaCheck, TILEDIM, and TILEROWS.
Referenced by CudaPmeTranspose::transposeXYZtoYZX().
__global__ void batchTranspose_xyz_yzx_kernel | ( | const TransposeBatch< T > * | batches, |
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in | ||
) |
Definition at line 883 of file CudaPmeSolverUtilKernel.cu.
References TransposeBatch< T >::nx, and TILEDIM.
void batchTranspose_xyz_zxy | ( | const int | numBatches, |
TransposeBatch< float2 > * | batches, | ||
const int | max_nx, | ||
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
cudaStream_t | stream | ||
) |
Definition at line 1425 of file CudaPmeSolverUtilKernel.cu.
References cudaCheck, TILEDIM, and TILEROWS.
Referenced by CudaPmeTranspose::transposeXYZtoZXY().
__global__ void batchTranspose_xyz_zxy_kernel | ( | const TransposeBatch< T > * | batches, |
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in | ||
) |
Definition at line 1038 of file CudaPmeSolverUtilKernel.cu.
References TransposeBatch< T >::nx, and TILEDIM.
__forceinline__ __device__ void calc_one_theta | ( | const T | w, |
T * | theta | ||
) |
__forceinline__ __device__ void calc_theta_dtheta | ( | T | wx, |
T | wy, | ||
T | wz, | ||
T3 * | theta, | ||
T3 * | dtheta | ||
) |
__forceinline__ __device__ double expfunc | ( | const double | x | ) |
Definition at line 50 of file CudaPmeSolverUtilKernel.cu.
Referenced by scalar_sum_kernel().
__forceinline__ __device__ float expfunc | ( | const float | x | ) |
Definition at line 54 of file CudaPmeSolverUtilKernel.cu.
__global__ void gather_force | ( | const float4 * | xyzq, |
const int | ncoord, | ||
const int | nfftx, | ||
const int | nffty, | ||
const int | nfftz, | ||
const int | xsize, | ||
const int | ysize, | ||
const int | zsize, | ||
const int | xdim, | ||
const int | y00, | ||
const int | z00, | ||
const float * | data, | ||
const cudaTextureObject_t | gridTexObj, | ||
const int | stride, | ||
FT * | force | ||
) |
Definition at line 597 of file CudaPmeSolverUtilKernel.cu.
References __ldg, BLOCK_SYNC, for(), order, WARP_BALLOT, WARP_FULL_MASK, WARP_SHUFFLE, x, y, and z.
Referenced by CudaPmeRealSpaceCompute::gatherForce().
void gather_force | ( | const float4 * | atoms, |
const int | numAtoms, | ||
const int | nfftx, | ||
const int | nffty, | ||
const int | nfftz, | ||
const int | xsize, | ||
const int | ysize, | ||
const int | zsize, | ||
const int | xdim, | ||
const int | y00, | ||
const int | z00, | ||
const bool | periodicY, | ||
const bool | periodicZ, | ||
const float * | data, | ||
const int | order, | ||
float3 * | force, | ||
const cudaTextureObject_t | gridTexObj, | ||
cudaStream_t | stream | ||
) |
Definition at line 1225 of file CudaPmeSolverUtilKernel.cu.
References atoms, cudaCheck, and cudaNAMD_bug().
__forceinline__ __device__ void gather_force_store | ( | const float | fx, |
const float | fy, | ||
const float | fz, | ||
const int | stride, | ||
const int | pos, | ||
T * | force | ||
) |
Definition at line 529 of file CudaPmeSolverUtilKernel.cu.
__forceinline__ __device__ void gather_force_store< float > | ( | const float | fx, |
const float | fy, | ||
const float | fz, | ||
const int | stride, | ||
const int | pos, | ||
float * | force | ||
) |
Definition at line 536 of file CudaPmeSolverUtilKernel.cu.
__forceinline__ __device__ void gather_force_store< float3 > | ( | const float | fx, |
const float | fy, | ||
const float | fz, | ||
const int | stride, | ||
const int | pos, | ||
float3 * | force | ||
) |
Definition at line 555 of file CudaPmeSolverUtilKernel.cu.
void scalar_sum | ( | const bool | orderXYZ, |
const int | nfft1, | ||
const int | nfft2, | ||
const int | nfft3, | ||
const int | size1, | ||
const int | size2, | ||
const int | size3, | ||
const double | kappa, | ||
const float | recip1x, | ||
const float | recip1y, | ||
const float | recip1z, | ||
const float | recip2x, | ||
const float | recip2y, | ||
const float | recip2z, | ||
const float | recip3x, | ||
const float | recip3y, | ||
const float | recip3z, | ||
const double | volume, | ||
const float * | prefac1, | ||
const float * | prefac2, | ||
const float * | prefac3, | ||
const int | k2_00, | ||
const int | k3_00, | ||
const bool | doEnergyVirial, | ||
double * | energy, | ||
double * | virial, | ||
float2 * | data, | ||
cudaStream_t | stream | ||
) |
Definition at line 1158 of file CudaPmeSolverUtilKernel.cu.
References cudaCheck, M_PI, and WARPSIZE.
Referenced by CudaPmeKSpaceCompute::solve().
__global__ void scalar_sum_kernel | ( | const int | nfft1, |
const int | nfft2, | ||
const int | nfft3, | ||
const int | size1, | ||
const int | size2, | ||
const int | size3, | ||
const T | recip1x, | ||
const T | recip1y, | ||
const T | recip1z, | ||
const T | recip2x, | ||
const T | recip2y, | ||
const T | recip2z, | ||
const T | recip3x, | ||
const T | recip3y, | ||
const T | recip3z, | ||
const T * | prefac1, | ||
const T * | prefac2, | ||
const T * | prefac3, | ||
const T | pi_ewald, | ||
const T | piv_inv, | ||
const int | k2_00, | ||
const int | k3_00, | ||
T2 * | data, | ||
double *__restrict__ | energy_recip, | ||
double *__restrict__ | virial | ||
) |
Definition at line 64 of file CudaPmeSolverUtilKernel.cu.
References BLOCK_SYNC, RecipVirial_t::energy, expfunc(), RecipVirial_t::virial, WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.
void spread_charge | ( | const float4 * | atoms, |
const int | numAtoms, | ||
const int | nfftx, | ||
const int | nffty, | ||
const int | nfftz, | ||
const int | xsize, | ||
const int | ysize, | ||
const int | zsize, | ||
const int | xdim, | ||
const int | y00, | ||
const int | z00, | ||
const bool | periodicY, | ||
const bool | periodicZ, | ||
float * | data, | ||
const int | order, | ||
cudaStream_t | stream | ||
) |
Definition at line 1072 of file CudaPmeSolverUtilKernel.cu.
References atoms, cudaCheck, cudaNAMD_bug(), if(), and WARPSIZE.
Referenced by CudaPmeRealSpaceCompute::spreadCharge().
__global__ void spread_charge_kernel | ( | const float4 * | xyzq, |
const int | ncoord, | ||
const int | nfftx, | ||
const int | nffty, | ||
const int | nfftz, | ||
const int | xsize, | ||
const int | ysize, | ||
const int | zsize, | ||
const int | xdim, | ||
const int | y00, | ||
const int | z00, | ||
AT * | data | ||
) |
Definition at line 422 of file CudaPmeSolverUtilKernel.cu.
References BLOCK_SYNC, order, WARPSIZE, x, y, and z.
void transpose_xyz_yzx | ( | const int | nx, |
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
const int | ysize_out, | ||
const int | zsize_out, | ||
const float2 * | data_in, | ||
float2 * | data_out, | ||
cudaStream_t | stream | ||
) |
Definition at line 1371 of file CudaPmeSolverUtilKernel.cu.
References cudaCheck, TILEDIM, and TILEROWS.
__device__ __forceinline__ void transpose_xyz_yzx_device | ( | const int | x_in, |
const int | y_in, | ||
const int | z_in, | ||
const int | x_out, | ||
const int | y_out, | ||
const int | nx, | ||
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
const int | ysize_out, | ||
const int | zsize_out, | ||
const T * | data_in, | ||
T * | data_out | ||
) |
Definition at line 800 of file CudaPmeSolverUtilKernel.cu.
References BLOCK_SYNC, TILEDIM, and TILEROWS.
__global__ void transpose_xyz_yzx_kernel | ( | const int | nx, |
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
const int | ysize_out, | ||
const int | zsize_out, | ||
const T * | data_in, | ||
T * | data_out | ||
) |
void transpose_xyz_zxy | ( | const int | nx, |
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
const int | zsize_out, | ||
const int | xsize_out, | ||
const float2 * | data_in, | ||
float2 * | data_out, | ||
cudaStream_t | stream | ||
) |
Definition at line 1406 of file CudaPmeSolverUtilKernel.cu.
References cudaCheck, TILEDIM, and TILEROWS.
__device__ __forceinline__ void transpose_xyz_zxy_device | ( | const int | x_in, |
const int | y_in, | ||
const int | z_in, | ||
const int | x_out, | ||
const int | z_out, | ||
const int | nx, | ||
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
const int | zsize_out, | ||
const int | xsize_out, | ||
const T * | data_in, | ||
T * | data_out | ||
) |
Definition at line 978 of file CudaPmeSolverUtilKernel.cu.
References BLOCK_SYNC, TILEDIM, and TILEROWS.
__global__ void transpose_xyz_zxy_kernel | ( | const int | nx, |
const int | ny, | ||
const int | nz, | ||
const int | xsize_in, | ||
const int | ysize_in, | ||
const int | zsize_out, | ||
const int | xsize_out, | ||
const T * | data_in, | ||
T * | data_out | ||
) |
__forceinline__ __device__ void write_grid | ( | const float | val, |
const int | ind, | ||
T * | data | ||
) |
Definition at line 313 of file CudaPmeSolverUtilKernel.cu.
const int TILEDIM = 32 |
Definition at line 795 of file CudaPmeSolverUtilKernel.cu.
Referenced by batchTranspose_xyz_yzx(), batchTranspose_xyz_yzx_kernel(), batchTranspose_xyz_zxy(), batchTranspose_xyz_zxy_kernel(), transpose_xyz_yzx(), transpose_xyz_yzx_device(), transpose_xyz_yzx_kernel(), transpose_xyz_zxy(), transpose_xyz_zxy_device(), and transpose_xyz_zxy_kernel().
const int TILEROWS = 8 |
Definition at line 796 of file CudaPmeSolverUtilKernel.cu.
Referenced by batchTranspose_xyz_yzx(), batchTranspose_xyz_zxy(), transpose_xyz_yzx(), transpose_xyz_yzx_device(), transpose_xyz_zxy(), and transpose_xyz_zxy_device().