CudaPmeSolverUtilKernel.cu File Reference

#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include "CudaUtils.h"
#include "CudaPmeSolverUtilKernel.h"

Go to the source code of this file.

Classes

struct  RecipVirial_t
struct  gather_t< T, order >

Functions

__forceinline__ __device__ double expfunc (const double x)
__forceinline__ __device__ float expfunc (const float x)
template<typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel (const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const T recip1x, const T recip1y, const T recip1z, const T recip2x, const T recip2y, const T recip2z, const T recip3x, const T recip3y, const T recip3z, const T *prefac1, const T *prefac2, const T *prefac3, const T pi_ewald, const T piv_inv, const int k2_00, const int k3_00, T2 *data, double *__restrict__ energy_recip, double *__restrict__ virial)
template<typename T>
__forceinline__ __device__ void write_grid (const float val, const int ind, T *data)
template<typename T, int order>
__forceinline__ __device__ void calc_one_theta (const T w, T *theta)
template<typename T, typename T3, int order>
__forceinline__ __device__ void calc_theta_dtheta (T wx, T wy, T wz, T3 *theta, T3 *dtheta)
template<typename AT, int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, AT *data)
template<typename T>
__forceinline__ __device__ void gather_force_store (const float fx, const float fy, const float fz, const int stride, const int pos, T *force)
template<>
__forceinline__ __device__ void gather_force_store< float > (const float fx, const float fy, const float fz, const int stride, const int pos, float *force)
template<>
__forceinline__ __device__ void gather_force_store< float3 > (const float fx, const float fy, const float fz, const int stride, const int pos, float3 *force)
template<typename CT, typename FT, int order, bool periodicY, bool periodicZ>
__global__ void gather_force (const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const float *data, const cudaTextureObject_t gridTexObj, const int stride, FT *force)
template<typename T>
__device__ __forceinline__ void transpose_xyz_yzx_device (const int x_in, const int y_in, const int z_in, const int x_out, const int y_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
template<typename T>
__global__ void transpose_xyz_yzx_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
template<typename T>
__global__ void batchTranspose_xyz_yzx_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
template<typename T>
__device__ __forceinline__ void transpose_xyz_zxy_device (const int x_in, const int y_in, const int z_in, const int x_out, const int z_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
template<typename T>
__global__ void transpose_xyz_zxy_kernel (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
template<typename T>
__global__ void batchTranspose_xyz_zxy_kernel (const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
void spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
void scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
void gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
void transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
void batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
void batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)

Variables

const int TILEDIM = 32
const int TILEROWS = 8


Function Documentation

void batchTranspose_xyz_yzx ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1340 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoYZX().

01343                                                                {
01344 
01345   dim3 numthread(TILEDIM, TILEROWS, 1);
01346   dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
01347 
01348   batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
01349   (batches, ny, nz, xsize_in, ysize_in);
01350 
01351   cudaCheck(cudaGetLastError());
01352 }

template<typename T>
__global__ void batchTranspose_xyz_yzx_kernel ( const TransposeBatch< T > *  batches,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in 
)

Definition at line 853 of file CudaPmeSolverUtilKernel.cu.

References TransposeBatch< T >::nx, and TILEDIM.

00856                                           {
00857 
00858   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
00859   int y_in = blockIdx.y * TILEDIM + threadIdx.y;
00860   int z_in = (blockIdx.z % nz)    + threadIdx.z;
00861 
00862   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
00863   int y_out = blockIdx.y * TILEDIM + threadIdx.x;
00864 
00865   int bi = blockIdx.z/nz;
00866 
00867   TransposeBatch<T> batch = batches[bi];
00868   int nx        = batch.nx;
00869   int ysize_out = batch.ysize_out;
00870   int zsize_out = batch.zsize_out;
00871   T* data_in    = batch.data_in;
00872   T* data_out   = batch.data_out;
00873 
00874   transpose_xyz_yzx_device<T>(
00875     x_in, y_in, z_in,
00876     x_out, y_out,
00877     nx, ny, nz,
00878     xsize_in, ysize_in,
00879     ysize_out, zsize_out,
00880     data_in, data_out);
00881 
00882 }

void batchTranspose_xyz_zxy ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1377 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoZXY().

01381                        {
01382 
01383   dim3 numthread(TILEDIM, TILEROWS, 1);
01384   dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
01385 
01386   batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
01387   (batches, ny, nz, xsize_in, ysize_in);
01388 
01389   cudaCheck(cudaGetLastError());
01390 }

template<typename T>
__global__ void batchTranspose_xyz_zxy_kernel ( const TransposeBatch< T > *  batches,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in 
)

Definition at line 1008 of file CudaPmeSolverUtilKernel.cu.

References TransposeBatch< T >::nx, and TILEDIM.

01011                                           {
01012 
01013   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
01014   int y_in = (blockIdx.z % ny)    + threadIdx.z;
01015   int z_in = blockIdx.y * TILEDIM + threadIdx.y;
01016 
01017   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
01018   int z_out = blockIdx.y * TILEDIM + threadIdx.x;
01019 
01020   int bi = blockIdx.z/ny;
01021 
01022   TransposeBatch<T> batch = batches[bi];
01023   int nx        = batch.nx;
01024   int zsize_out = batch.zsize_out;
01025   int xsize_out = batch.xsize_out;
01026   T* data_in    = batch.data_in;
01027   T* data_out   = batch.data_out;
01028 
01029   transpose_xyz_zxy_device<T>(
01030     x_in, y_in, z_in, x_out, z_out,
01031     nx, ny, nz,
01032     xsize_in, ysize_in,
01033     zsize_out, xsize_out,
01034     data_in, data_out);
01035 
01036 }

template<typename T, int order>
__forceinline__ __device__ void calc_one_theta ( const T  w,
T *  theta 
)

Definition at line 314 of file CudaPmeSolverUtilKernel.cu.

References j, and order.

00314                                                                     {
00315 
00316   theta[order-1] = ((T)0);
00317   theta[1] = w;
00318   theta[0] = ((T)1) - w;
00319 
00320 #pragma unroll
00321   for (int k=3;k <= order-1;k++) {
00322     T div = ((T)1) / (T)(k-1);
00323     theta[k-1] = div*w*theta[k-2];
00324 #pragma unroll
00325     for (int j=1;j <= k-2;j++) {
00326       theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
00327     }
00328     theta[0] = div*(((T)1) - w)*theta[0];
00329   }
00330       
00331   //--- one more recursion
00332   T div = ((T)1) / (T)(order-1);
00333   theta[order-1] = div*w*theta[order-2];
00334 #pragma unroll
00335   for (int j=1;j <= order-2;j++) {
00336     theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
00337   }
00338     
00339   theta[0] = div*(((T)1) - w)*theta[0];
00340 }
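
The recursion above builds the order B-spline weights theta[0..order-1] for a fractional grid offset w. As a sanity check, the weights form a partition of unity (they sum to 1 for any w in [0,1)). Below is a minimal host-side sketch of the same recursion (a hypothetical helper, not part of this file) that verifies this for order 4:

#include <cassert>
#include <cmath>
#include <cstdio>

// Host copy of the device recursion above (assumes order >= 4, as used in this file).
template <typename T, int order>
void calc_one_theta_host(const T w, T *theta) {
  theta[order-1] = T(0);
  theta[1] = w;
  theta[0] = T(1) - w;
  for (int k = 3; k <= order-1; k++) {
    T div = T(1) / T(k-1);
    theta[k-1] = div*w*theta[k-2];
    for (int j = 1; j <= k-2; j++)
      theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
    theta[0] = div*(T(1) - w)*theta[0];
  }
  // One more recursion, as in the device code.
  T div = T(1) / T(order-1);
  theta[order-1] = div*w*theta[order-2];
  for (int j = 1; j <= order-2; j++)
    theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
  theta[0] = div*(T(1) - w)*theta[0];
}

int main() {
  double theta[4];
  calc_one_theta_host<double, 4>(0.3, theta);
  double sum = theta[0] + theta[1] + theta[2] + theta[3];
  printf("sum of B-spline weights = %g\n", sum);
  assert(std::fabs(sum - 1.0) < 1e-12);   // partition of unity
  return 0;
}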

template<typename T, typename T3, int order>
__forceinline__ __device__ void calc_theta_dtheta ( T  wx,
T  wy,
T  wz,
T3 *  theta,
T3 *  dtheta 
)

Definition at line 346 of file CudaPmeSolverUtilKernel.cu.

References j, order, x, y, and z.

00346                                                                                            {
00347 
00348   theta[order-1].x = ((T)0);
00349   theta[order-1].y = ((T)0);
00350   theta[order-1].z = ((T)0);
00351   theta[1].x = wx;
00352   theta[1].y = wy;
00353   theta[1].z = wz;
00354   theta[0].x = ((T)1) - wx;
00355   theta[0].y = ((T)1) - wy;
00356   theta[0].z = ((T)1) - wz;
00357 
00358 #pragma unroll
00359   for (int k=3;k <= order-1;k++) {
00360     T div = ((T)1) / (T)(k-1);
00361     theta[k-1].x = div*wx*theta[k-2].x;
00362     theta[k-1].y = div*wy*theta[k-2].y;
00363     theta[k-1].z = div*wz*theta[k-2].z;
00364 #pragma unroll
00365     for (int j=1;j <= k-2;j++) {
00366       theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
00367       theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
00368       theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
00369     }
00370     theta[0].x = div*(((T)1) - wx)*theta[0].x;
00371     theta[0].y = div*(((T)1) - wy)*theta[0].y;
00372     theta[0].z = div*(((T)1) - wz)*theta[0].z;
00373   }
00374 
00375   //--- perform standard b-spline differentiation
00376   dtheta[0].x = -theta[0].x;
00377   dtheta[0].y = -theta[0].y;
00378   dtheta[0].z = -theta[0].z;
00379 #pragma unroll
00380   for (int j=2;j <= order;j++) {
00381     dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
00382     dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
00383     dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
00384   }
00385       
00386   //--- one more recursion
00387   T div = ((T)1) / (T)(order-1);
00388   theta[order-1].x = div*wx*theta[order-2].x;
00389   theta[order-1].y = div*wy*theta[order-2].y;
00390   theta[order-1].z = div*wz*theta[order-2].z;
00391 #pragma unroll
00392   for (int j=1;j <= order-2;j++) {
00393     theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
00394     theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
00395     theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
00396   }
00397     
00398   theta[0].x = div*(((T)1) - wx)*theta[0].x;
00399   theta[0].y = div*(((T)1) - wy)*theta[0].y;
00400   theta[0].z = div*(((T)1) - wz)*theta[0].z;
00401 }

__forceinline__ __device__ float expfunc ( const float  x  ) 

Definition at line 50 of file CudaPmeSolverUtilKernel.cu.

00050                                                         {
00051   return __expf(x);
00052 }

__forceinline__ __device__ double expfunc ( const double  x  ) 

Definition at line 46 of file CudaPmeSolverUtilKernel.cu.

Referenced by scalar_sum_kernel().

00046                                                           {
00047   return exp(x);
00048 }

void gather_force ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
const float *  data,
const int  order,
float3 *  force,
const cudaTextureObject_t  gridTexObj,
cudaStream_t  stream 
)

Definition at line 1200 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and cudaNAMD_bug().

01208                        {
01209 
01210   dim3 nthread(32, 2, 1);
01211   dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
01212   // dim3 nblock(npatch, 1, 1);
01213 
01214   switch(order) {
01215     case 4:
01216     if (periodicY && periodicZ)
01217       gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
01218       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01219         // recip11, recip22, recip33,
01220         data,
01221         gridTexObj,
01222         1, force);
01223     else if (periodicY)
01224       gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
01225       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01226         // recip11, recip22, recip33,
01227         data,
01228         gridTexObj,
01229         1, force);
01230     else if (periodicZ)
01231       gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
01232       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01233         // recip11, recip22, recip33,
01234         data,
01235         gridTexObj,
01236         1, force);
01237     else
01238       gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
01239       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01240         // recip11, recip22, recip33,
01241         data,
01242         gridTexObj,
01243         1, force);
01244     break;
01245 
01246     case 6:
01247     if (periodicY && periodicZ)
01248       gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
01249       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01250         // recip11, recip22, recip33,
01251         data,
01252         gridTexObj,
01253         1, force);
01254     else if (periodicY)
01255       gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
01256       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01257         // recip11, recip22, recip33,
01258         data,
01259         gridTexObj,
01260         1, force);
01261     else if (periodicZ)
01262       gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
01263       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01264         // recip11, recip22, recip33,
01265         data,
01266         gridTexObj,
01267         1, force);
01268     else
01269       gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
01270       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01271         // recip11, recip22, recip33,
01272         data,
01273         gridTexObj,
01274         1, force);
01275     break;
01276  
01277     case 8:
01278     if (periodicY && periodicZ)
01279       gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
01280       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01281         // recip11, recip22, recip33,
01282         data,
01283         gridTexObj,
01284         1, force);
01285     else if (periodicY)
01286       gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
01287       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01288         // recip11, recip22, recip33,
01289         data,
01290         gridTexObj,
01291         1, force);
01292     else if (periodicZ)
01293       gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
01294       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01295         // recip11, recip22, recip33,
01296         data,
01297         gridTexObj,
01298         1, force);
01299     else
01300       gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
01301       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01302         // recip11, recip22, recip33,
01303         data,
01304         gridTexObj,
01305         1, force);
01306     break;
01307 
01308     default:
01309     char str[128];
01310     sprintf(str, "gather_force, order %d not implemented",order);
01311     cudaNAMD_bug(str);
01312   }
01313   cudaCheck(cudaGetLastError());
01314 
01315 }

template<typename CT, typename FT, int order, bool periodicY, bool periodicZ>
__global__ void gather_force ( const float4 *  xyzq,
const int  ncoord,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const float *  data,
const cudaTextureObject_t  gridTexObj,
const int  stride,
FT *  force 
)

Definition at line 569 of file CudaPmeSolverUtilKernel.cu.

References __ldg, BLOCK_SYNC, order, WARP_BALLOT, WARP_FULL_MASK, WARP_SHUFFLE, x, y, and z.

Referenced by CudaPmeRealSpaceCompute::gatherForce().

00574               const float *data,        // data is used for loads when __CUDA_ARCH__ >= 350
00575               const cudaTextureObject_t gridTexObj,
00576               const int stride,
00577               FT *force) {
00578 
00579   const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
00580 
00581   // Shared memory
00582   __shared__ gather_t<CT, order> shmem[32];
00583 
00584   const int pos = blockIdx.x*blockDim.x + threadIdx.x;
00585   const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
00586 
00587   // Load atom data into shared memory
00588   if (pos < pos_end && threadIdx.y == 0) {
00589 
00590     float4 xyzqi = xyzq[pos];
00591     float x = xyzqi.x;
00592     float y = xyzqi.y;
00593     float z = xyzqi.z;
00594     float q = xyzqi.w;
00595 
00596     float frx = ((float)nfftx)*x;
00597     float fry = ((float)nffty)*y;
00598     float frz = ((float)nfftz)*z;
00599 
00600     int frxi = (int)frx;
00601     int fryi = (int)fry;
00602     int frzi = (int)frz;
00603 
00604     float wx = frx - (float)frxi;
00605     float wy = fry - (float)fryi;
00606     float wz = frz - (float)frzi;
00607 
00608     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
00609     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
00610 
00611     shmem[threadIdx.x].ix = frxi;
00612     shmem[threadIdx.x].iy = fryi - y00;
00613     shmem[threadIdx.x].iz = frzi - z00;
00614     shmem[threadIdx.x].charge = q;
00615 
00616     float3 theta_tmp[order];
00617     float3 dtheta_tmp[order];
00618     calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
00619     
00620 #pragma unroll
00621     for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
00622 
00623 #pragma unroll
00624     for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
00625 
00626 #pragma unroll
00627     for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
00628 
00629 #pragma unroll
00630     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
00631 
00632 #pragma unroll
00633     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
00634 
00635 #pragma unroll
00636     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
00637 
00638   }
00639   BLOCK_SYNC;
00640 
00641   // We divide the order x order x order cube into 8 sub-cubes.
00642   // These sub-cubes are taken care by a single thread
00643   // The size of the sub-cubes is:
00644   // order=4 : 2x2x2
00645   // order=6 : 3x3x3
00646   // order=8 : 4x4x4
00647   const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
00648   // Calculate the starting index on the sub-cube for this thread
00649   // tid = 0...63
00650   const int t = (tid % 8);         // sub-cube index (0...7)
00651   // t = (tx0 + ty0*2 + tz0*4)/nsc
00652   // (tx0, ty0, tz0) gives the starting index of the 3x3x3 sub-cube
00653   const int tz0 = (t / 4)*nsc;
00654   const int ty0 = ((t / 2) % 2)*nsc;
00655   const int tx0 = (t % 2)*nsc;
00656 
00657   //
00658   // Calculate forces for 32 atoms. We have 32*2 = 64 threads
00659   // Loop is iterated 4 times:
00660   //                         (iterations)
00661   // Threads 0...7   = atoms 0, 8,  16, 24
00662   // Threads 8...15  = atoms 1, 9,  17, 25
00663   // Threads 16...31 = atoms 2, 10, 18, 26
00664   //                ...
00665   // Threads 56...63 = atoms 7, 15, 23, 31
00666   //
00667 
00668   int base = tid/8;
00669   const int base_end = pos_end - blockIdx.x*blockDim.x;
00670 
00671   // Make sure to mask out any threads that are not running the loop.
00672   // This will happen if the number of atoms is not a multiple of 32.
00673   int warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
00674 
00675   while (base < base_end) {
00676 
00677     float f1 = 0.0f;
00678     float f2 = 0.0f;
00679     float f3 = 0.0f;
00680     int ix0 = shmem[base].ix;
00681     int iy0 = shmem[base].iy;
00682     int iz0 = shmem[base].iz;
00683 
00684     // Each thread calculates a nsc x nsc x nsc sub-cube
00685 #pragma unroll
00686     for (int i=0;i < nsc*nsc*nsc;i++) {
00687       int tz = tz0 + (i/(nsc*nsc));
00688       int ty = ty0 + ((i/nsc) % nsc);
00689       int tx = tx0 + (i % nsc);
00690 
00691       int ix = ix0 + tx;
00692       int iy = iy0 + ty;
00693       int iz = iz0 + tz;
00694       if (ix >= nfftx) ix -= nfftx;
00695 
00696       if (periodicY  && (iy >= nffty)) iy -= nffty;
00697       if (!periodicY && (iy < 0 || iy >= ysize)) continue;
00698 
00699       if (periodicZ  && (iz >= nfftz)) iz -= nfftz;
00700       if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
00701 
00702       int ind = ix + (iy + iz*ysize)*xdim;
00703 
00704 #if __CUDA_ARCH__ >= 350
00705       float q0 = __ldg(&data[ind]);
00706 #else
00707       float q0 = tex1Dfetch<float>(gridTexObj, ind);
00708 #endif
00709       float thx0 = shmem[base].thetax[tx];
00710       float thy0 = shmem[base].thetay[ty];
00711       float thz0 = shmem[base].thetaz[tz];
00712       float dthx0 = shmem[base].dthetax[tx];
00713       float dthy0 = shmem[base].dthetay[ty];
00714       float dthz0 = shmem[base].dthetaz[tz];
00715       f1 += dthx0 * thy0 * thz0 * q0;
00716       f2 += thx0 * dthy0 * thz0 * q0;
00717       f3 += thx0 * thy0 * dthz0 * q0;
00718     }
00719 
00720     //-------------------------
00721 
00722     // Reduce
00723     const int i = threadIdx.x & 7;
00724 
00725     f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
00726     f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
00727     f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
00728 
00729     f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
00730     f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
00731     f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
00732 
00733     f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
00734     f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
00735     f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
00736 
00737     if (i == 0) {
00738       shmem[base].f1 = f1;
00739       shmem[base].f2 = f2;
00740       shmem[base].f3 = f3;
00741     }
00742 
00743     base += 8;
00744     warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
00745   }
00746 
00747   // Write forces
00748   BLOCK_SYNC;
00749   if (pos < pos_end && threadIdx.y == 0) {
00750     float f1 = shmem[threadIdx.x].f1;
00751     float f2 = shmem[threadIdx.x].f2;
00752     float f3 = shmem[threadIdx.x].f3;
00753     float q = -shmem[threadIdx.x].charge; //*ccelec_float;
00754     // float fx = q*recip1*f1*nfftx;
00755     // float fy = q*recip2*f2*nffty;
00756     // float fz = q*recip3*f3*nfftz;
00757     float fx = q*f1*nfftx;
00758     float fy = q*f2*nffty;
00759     float fz = q*f3*nfftz;
00760     gather_force_store<FT>(fx, fy, fz, stride, pos, force);
00761   }
00762 
00763 }

template<typename T>
__forceinline__ __device__ void gather_force_store ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
T *  force 
)

Definition at line 517 of file CudaPmeSolverUtilKernel.cu.

00519                 {
00520 }

template<>
__forceinline__ __device__ void gather_force_store< float > ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
float *  force 
)

template<>
__forceinline__ __device__ void gather_force_store< float3 > ( const float  fx,
const float  fy,
const float  fz,
const int  stride,
const int  pos,
float3 *  force 
)
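
The bodies of these two specializations are not expanded in this page. Judging only from how they are called at the end of gather_force() (a flat float array addressed with stride and pos, or a float3 array indexed by pos), a plausible form is sketched below for illustration; this is an assumption, not the file's actual code.

// Sketch only: specializations consistent with the call sites in gather_force().
template <>
__forceinline__ __device__ void gather_force_store<float>(
    const float fx, const float fy, const float fz,
    const int stride, const int pos, float *force) {
  // Store the three components strided within a flat float array.
  force[pos]            = fx;
  force[pos +   stride] = fy;
  force[pos + 2*stride] = fz;
}

template <>
__forceinline__ __device__ void gather_force_store<float3>(
    const float fx, const float fy, const float fz,
    const int stride, const int pos, float3 *force) {
  // Store directly as a float3; the stride argument is not needed for this layout.
  force[pos].x = fx;
  force[pos].y = fy;
  force[pos].z = fz;
}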

void scalar_sum ( const bool  orderXYZ,
const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const double  kappa,
const float  recip1x,
const float  recip1y,
const float  recip1z,
const float  recip2x,
const float  recip2y,
const float  recip2z,
const float  recip3x,
const float  recip3y,
const float  recip3z,
const double  volume,
const float *  prefac1,
const float *  prefac2,
const float *  prefac3,
const int  k2_00,
const int  k3_00,
const bool  doEnergyVirial,
double *  energy,
double *  virial,
float2 *  data,
cudaStream_t  stream 
)

Definition at line 1136 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and M_PI.

Referenced by CudaPmeKSpaceCompute::solve().

01145                        {
01146 
01147   int nthread = 1024;
01148   int nblock = 64;
01149 
01150   int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
01151   if (doEnergyVirial) {
01152     const int warpSize = 32;      
01153     shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
01154   }
01155 
01156   float piv_inv = (float)(1.0/(M_PI*volume));
01157   float fac = (float)(M_PI*M_PI/(kappa*kappa));
01158 
01159   if (doEnergyVirial) {
01160     if (orderXYZ) {
01161       scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
01162       (nfft1, nfft2, nfft3, size1, size2, size3,
01163         recip1x, recip1y, recip1z,
01164         recip2x, recip2y, recip2z,
01165         recip3x, recip3y, recip3z,
01166         prefac1, prefac2, prefac3,
01167         fac, piv_inv, k2_00, k3_00, data, energy, virial);
01168     } else {
01169       scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
01170       (nfft1, nfft2, nfft3, size1, size2, size3,
01171         recip1x, recip1y, recip1z,
01172         recip2x, recip2y, recip2z,
01173         recip3x, recip3y, recip3z,
01174         prefac1, prefac2, prefac3,
01175         fac, piv_inv, k2_00, k3_00, data, energy, virial);
01176     }
01177   } else {
01178     if (orderXYZ) {
01179       scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
01180       (nfft1, nfft2, nfft3, size1, size2, size3,
01181         recip1x, recip1y, recip1z,
01182         recip2x, recip2y, recip2z,
01183         recip3x, recip3y, recip3z,
01184         prefac1, prefac2, prefac3,
01185         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
01186     } else {
01187       scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
01188       (nfft1, nfft2, nfft3, size1, size2, size3,
01189         recip1x, recip1y, recip1z,
01190         recip2x, recip2y, recip2z,
01191         recip3x, recip3y, recip3z,
01192         prefac1, prefac2, prefac3,
01193         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
01194     }
01195   }
01196   cudaCheck(cudaGetLastError());
01197 
01198 }
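
In terms of the arguments above (kappa is the Ewald parameter, volume the unit-cell volume, prefac1..prefac3 the B-spline prefactors B(m), and data the Fourier-transformed charge grid S(m)), fac = pi^2/kappa^2 and piv_inv = 1/(pi*V), and the launched kernel evaluates what is, in standard smooth-PME notation, the reciprocal-space sum

\[
E_{\mathrm{recip}} \;=\; \frac{1}{2\pi V}\sum_{\mathbf{m}\neq 0}
\frac{\exp\!\left(-\pi^{2}\,|\mathbf{m}|^{2}/\kappa^{2}\right)}{|\mathbf{m}|^{2}}\,
B(\mathbf{m})\,\bigl|S(\mathbf{m})\bigr|^{2},
\qquad
\mathbf{m} = k_{1}\,\mathbf{recip}_{1} + k_{2}\,\mathbf{recip}_{2} + k_{3}\,\mathbf{recip}_{3},
\]

with k2 and k3 wrapped to their signed ranges inside the kernel. The factor of 2 in the kernel's q2 and the halving at k1 == 0 or k1 == nfft1/2 account for the Hermitian half-grid storage of the real-to-complex FFT. The kernel also scales each stored grid value by the same per-mode factor, so the subsequent inverse FFT yields the reciprocal-space potential consumed by gather_force().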

template<typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
__global__ void scalar_sum_kernel ( const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const T  recip1x,
const T  recip1y,
const T  recip1z,
const T  recip2x,
const T  recip2y,
const T  recip2z,
const T  recip3x,
const T  recip3y,
const T  recip3z,
const T *  prefac1,
const T *  prefac2,
const T *  prefac3,
const T  pi_ewald,
const T  piv_inv,
const int  k2_00,
const int  k3_00,
T2 *  data,
double *__restrict__  energy_recip,
double *__restrict__  virial 
)

Definition at line 60 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, RecipVirial_t::energy, expfunc(), RecipVirial_t::virial, WARP_FULL_MASK, WARP_SHUFFLE, and WARPSIZE.

00070                                        {
00071   // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
00072   extern __shared__ T sh_prefac[];
00073 
00074   // Create pointers to shared memory
00075   T* sh_prefac1 = (T *)&sh_prefac[0];
00076   T* sh_prefac2 = (T *)&sh_prefac[nfft1];
00077   T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
00078 
00079   // Calculate start position (k1, k2, k3) for each thread
00080   unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
00081   int k3 = tid/(size1*size2);
00082   tid -= k3*size1*size2;
00083   int k2 = tid/size1;
00084   int k1 = tid - k2*size1;
00085 
00086   // Starting position in data
00087   int pos = k1 + (k2 + k3*size2)*size1;
00088 
00089   // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
00090   k2 += k2_00;
00091   k3 += k3_00;
00092 
00093   // Calculate limits w.r.t. global coordinates
00094   const int lim2 = size2 + k2_00;
00095   const int lim3 = size3 + k3_00;
00096 
00097   // Calculate increments (k1_inc, k2_inc, k3_inc)
00098   int tot_inc = blockDim.x*gridDim.x;
00099   const int k3_inc = tot_inc/(size1*size2);
00100   tot_inc -= k3_inc*size1*size2;
00101   const int k2_inc = tot_inc/size1;
00102   const int k1_inc = tot_inc - k2_inc*size1;
00103 
00104   // Set data[0] = 0 for the global (0,0,0)
00105   if (k1 == 0 && k2 == 0 && k3 == 0) {
00106     T2 zero;
00107     zero.x = (T)0;
00108     zero.y = (T)0;
00109     data[0] = zero;
00110     // Increment position
00111     k1 += k1_inc;
00112     pos += k1_inc;
00113     if (k1 >= size1) {
00114       k1 -= size1;
00115       k2++;
00116     }
00117     k2 += k2_inc;
00118     pos += k2_inc*size1;
00119     if (k2 >= lim2) {
00120       k2 -= size2;
00121       k3++;
00122     }
00123     k3 += k3_inc;
00124     pos += k3_inc*size1*size2;
00125   }
00126 
00127   // Load prefac data into shared memory
00128   {
00129     int t = threadIdx.x;
00130     while (t < nfft1) {
00131       sh_prefac1[t] = prefac1[t];
00132       t += blockDim.x;
00133     }
00134     t = threadIdx.x;
00135     while (t < nfft2) {
00136       sh_prefac2[t] = prefac2[t];
00137       t += blockDim.x;
00138     }
00139     t = threadIdx.x;
00140     while (t < nfft3) {
00141       sh_prefac3[t] = prefac3[t];
00142       t += blockDim.x;
00143     }
00144   }
00145   BLOCK_SYNC;
00146 
00147   double energy = 0.0;
00148   double virial0 = 0.0;
00149   double virial1 = 0.0;
00150   double virial2 = 0.0;
00151   double virial3 = 0.0;
00152   double virial4 = 0.0;
00153   double virial5 = 0.0;
00154 
00155   // If nfft1 is odd, set nfft1_half to impossible value so that
00156   // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )" 
00157   // is never satisfied
00158   const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
00159   const int nfft2_half = nfft2/2;
00160   const int nfft3_half = nfft3/2;
00161 
00162   while (k3 < lim3) {
00163 
00164     T2 q = data[pos];
00165 
00166     int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
00167     int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
00168 
00169     T m1, m2, m3;
00170     if (doOrtho) {
00171       m1 = recip1x*k1;
00172       m2 = recip2y*k2s;
00173       m3 = recip3z*k3s;
00174     } else {
00175       m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
00176       m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
00177       m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
00178     }
00179 
00180     T msq = m1*m1 + m2*m2 + m3*m3;
00181     T msq_inv = ((T)1)/msq;
00182 
00183     T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
00184     T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
00185     if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
00186     T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
00187     T C = xp3*msq_inv;
00188     T theta = theta3*C;
00189 
00190     if (calc_energy_virial) {
00191       T fac = q2*C;
00192       T vir = ((T)2)*(pi_ewald + msq_inv);
00193       energy += fac;
00194       virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
00195       virial1 += (double)(fac*vir*m1*m2);
00196       virial2 += (double)(fac*vir*m1*m3);
00197       virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
00198       virial4 += (double)(fac*vir*m2*m3);
00199       virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
00200     }
00201 
00202     q.x *= theta;
00203     q.y *= theta;
00204     data[pos] = q;
00205     
00206     // Increment position
00207     k1 += k1_inc;
00208     pos += k1_inc;
00209     if (k1 >= size1) {
00210       k1 -= size1;
00211       k2++;
00212     }
00213     k2 += k2_inc;
00214     pos += k2_inc*size1;
00215     if (k2 >= lim2) {
00216       k2 -= size2;
00217       k3++;
00218     }
00219     k3 += k3_inc;
00220     pos += k3_inc*size2*size1;
00221   }
00222 
00223   // Reduce energy and virial
00224   if (calc_energy_virial) {
00225     const int tid = threadIdx.x & (warpSize-1);
00226     const int base = (threadIdx.x/warpSize);
00227     volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
00228     // Reduce within warps
00229     for (int d=warpSize/2;d >= 1;d /= 2) {
00230       energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
00231          WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
00232       virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
00233           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
00234       virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
00235           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
00236       virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
00237           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
00238       virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
00239           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
00240       virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
00241           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
00242       virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
00243           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
00244     }
00245     // Reduce between warps
00246     // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
00247     BLOCK_SYNC;
00248     if (tid == 0) {
00249       sh_ev[base].energy = energy;
00250       sh_ev[base].virial[0] = virial0;
00251       sh_ev[base].virial[1] = virial1;
00252       sh_ev[base].virial[2] = virial2;
00253       sh_ev[base].virial[3] = virial3;
00254       sh_ev[base].virial[4] = virial4;
00255       sh_ev[base].virial[5] = virial5;
00256     }
00257     BLOCK_SYNC;
00258     if (base == 0) {
00259       energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
00260       virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
00261       virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
00262       virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
00263       virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
00264       virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
00265       virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
00266       for (int d=warpSize/2;d >= 1;d /= 2) {
00267   energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
00268            WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
00269   virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
00270             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
00271   virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
00272             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
00273   virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
00274             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
00275   virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
00276             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
00277   virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
00278             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
00279   virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
00280             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
00281       }
00282     }
00283 
00284     if (threadIdx.x == 0) {
00285       atomicAdd(energy_recip, energy*0.5);
00286       virial0 *= -0.5;
00287       virial1 *= -0.5;
00288       virial2 *= -0.5;
00289       virial3 *= -0.5;
00290       virial4 *= -0.5;
00291       virial5 *= -0.5;
00292       atomicAdd(&virial[0], virial0);
00293       atomicAdd(&virial[1], virial1);
00294       atomicAdd(&virial[2], virial2);
00295       atomicAdd(&virial[3], virial3);
00296       atomicAdd(&virial[4], virial4);
00297       atomicAdd(&virial[5], virial5);
00298     }
00299 
00300   }
00301 
00302 }
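
The double-precision warp reduction above moves each 64-bit value between lanes as two 32-bit halves via __double2hiint/__double2loint and __hiloint2double. A minimal standalone sketch of that idiom, written with the plain CUDA shuffle intrinsic rather than the WARP_SHUFFLE macro (assuming the macro wraps a lane-indexed shuffle), is:

__device__ __forceinline__
double shuffle_double(unsigned int mask, double x, int srcLane, int width) {
  // Split the double into its high and low 32-bit words, shuffle each half
  // separately, then reassemble the value on the receiving lane.
  int hi = __shfl_sync(mask, __double2hiint(x), srcLane, width);
  int lo = __shfl_sync(mask, __double2loint(x), srcLane, width);
  return __hiloint2double(hi, lo);
}

On CUDA 9 and later, __shfl_sync also accepts double operands directly, so the hi/lo split is mainly a holdover from the older 32-bit-only shuffle instructions.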

void spread_charge ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
float *  data,
const int  order,
cudaStream_t  stream 
)

Definition at line 1042 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and cudaNAMD_bug().

Referenced by CudaPmeRealSpaceCompute::spreadCharge().

01047                                                      {
01048 
01049   dim3 nthread, nblock;
01050 
01051   switch(order) {
01052   case 4:
01053     nthread.x = 32;
01054     nthread.y = 4;
01055     nthread.z = 1;
01056     nblock.x = (numAtoms - 1)/nthread.x + 1;
01057     nblock.y = 1;
01058     nblock.z = 1;
01059     if (periodicY && periodicZ)
01060       spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
01061         (atoms, numAtoms,
01062          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01063     else if (periodicY)
01064       spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
01065         (atoms, numAtoms,
01066          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01067     else if (periodicZ)
01068       spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
01069         (atoms, numAtoms,
01070          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01071     else
01072       spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
01073         (atoms, numAtoms,
01074          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01075     break;
01076 
01077   case 6:
01078     nthread.x = 32;
01079     nthread.y = 7;
01080     nthread.z = 1;
01081     nblock.x = (numAtoms - 1)/nthread.x + 1;
01082     nblock.y = 1;
01083     nblock.z = 1;
01084     if (periodicY && periodicZ)
01085       spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
01086         (atoms, numAtoms,
01087          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01088     else if (periodicY)
01089       spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
01090         (atoms, numAtoms,
01091          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01092     else if (periodicZ)
01093       spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
01094         (atoms, numAtoms,
01095          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01096     else
01097       spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
01098         (atoms, numAtoms,
01099          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01100     break;
01101 
01102   case 8:
01103     nthread.x = 32;
01104     nthread.y = 16;
01105     nthread.z = 1;
01106     nblock.x = (numAtoms - 1)/nthread.x + 1;
01107     nblock.y = 1;
01108     nblock.z = 1;
01109     if (periodicY && periodicZ)
01110       spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
01111         (atoms, numAtoms,
01112          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01113     else if (periodicY)
01114       spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
01115         (atoms, numAtoms,
01116          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01117     else if (periodicZ)
01118       spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
01119         (atoms, numAtoms,
01120          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01121     else
01122       spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
01123         (atoms, numAtoms,
01124          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01125     break;
01126 
01127   default:
01128     char str[128];
01129     sprintf(str, "spread_charge, order %d not implemented",order);
01130     cudaNAMD_bug(str);
01131   }
01132   cudaCheck(cudaGetLastError());
01133 
01134 }

template<typename AT, int order, bool periodicY, bool periodicZ>
__global__ void spread_charge_kernel ( const float4 *  xyzq,
const int  ncoord,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
AT *  data 
)

Definition at line 410 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, order, x, y, and z.

00414                     {
00415 
00416   // Shared memory use:
00417   // order = 4: 1920 bytes
00418   // order = 6: 2688 bytes
00419   // order = 8: 3456 bytes
00420   __shared__ int sh_ix[32];
00421   __shared__ int sh_iy[32];
00422   __shared__ int sh_iz[32];
00423   __shared__ float sh_thetax[order*32];
00424   __shared__ float sh_thetay[order*32];
00425   __shared__ float sh_thetaz[order*32];
00426 
00427   // Process atoms pos to pos_end-1 (blockDim.x)
00428   const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
00429   const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
00430 
00431   if (pos < pos_end && threadIdx.y == 0) {
00432 
00433     float4 xyzqi = xyzq[pos];
00434     float x = xyzqi.x;
00435     float y = xyzqi.y;
00436     float z = xyzqi.z;
00437     float q = xyzqi.w;
00438 
00439     float frx = ((float)nfftx)*x;
00440     float fry = ((float)nffty)*y;
00441     float frz = ((float)nfftz)*z;
00442 
00443     int frxi = (int)frx;
00444     int fryi = (int)fry;
00445     int frzi = (int)frz;
00446 
00447     float wx = frx - (float)frxi;
00448     float wy = fry - (float)fryi;
00449     float wz = frz - (float)frzi;
00450 
00451     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
00452     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
00453 
00454     sh_ix[threadIdx.x] = frxi;
00455     sh_iy[threadIdx.x] = fryi - y00;
00456     sh_iz[threadIdx.x] = frzi - z00;
00457 
00458     float theta[order];
00459 
00460     calc_one_theta<float, order>(wx, theta);
00461 #pragma unroll
00462     for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
00463 
00464     calc_one_theta<float, order>(wy, theta);
00465 #pragma unroll
00466     for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
00467 
00468     calc_one_theta<float, order>(wz, theta);
00469 #pragma unroll
00470     for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
00471 
00472   }
00473 
00474   BLOCK_SYNC;
00475 
00476   // Grid point location, values of (ix0, iy0, iz0) are in range 0..order-1
00477   // NOTE: Only tid=0...order*order*order-1 do any computation
00478   const int order3 = ((order*order*order-1)/warpSize + 1)*warpSize;
00479   const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3;   // 0...order3-1
00480   const int x0 = tid % order;
00481   const int y0 = (tid / order) % order;
00482   const int z0 = tid / (order*order);
00483 
00484   // Loop over atoms pos..pos_end-1
00485   int iadd = blockDim.x*blockDim.y/order3;
00486   int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
00487   int iend = pos_end - blockIdx.x*blockDim.x;
00488   for (;i < iend;i += iadd) {
00489     int x = sh_ix[i] + x0;
00490     int y = sh_iy[i] + y0;
00491     int z = sh_iz[i] + z0;
00492       
00493     if (x >= nfftx) x -= nfftx;
00494 
00495     if (periodicY  && (y >= nffty)) y -= nffty;
00496     if (!periodicY && (y < 0 || y >= ysize)) continue;
00497 
00498     if (periodicZ  && (z >= nfftz)) z -= nfftz;
00499     if (!periodicZ && (z < 0 || z >= zsize)) continue;
00500       
00501     // Get position on the grid
00502     int ind = x + xdim*(y + ysize*(z));
00503     
00504     // Here we unroll the 6x6x6 loop with 216 threads.
00505     // NOTE: We use 7*32=224 threads to do this
00506     // Calculate interpolated charge value and store it to global memory
00507 
00508     if (tid < order*order*order)
00509       write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
00510   }
00511 
00512 }
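
For concreteness, with the launch configurations used by spread_charge(): for order = 4 the block has 32 x 4 = 128 threads and order3 rounds 4^3 = 64 up to 64, so two atoms are spread concurrently per loop pass (iadd = 2); for order = 6 the block has 32 x 7 = 224 threads and order3 rounds 6^3 = 216 up to 224, so one atom is spread per pass with 8 lanes skipping the final write; for order = 8 the block has 32 x 16 = 512 threads, matching 8^3 = 512 exactly.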

void transpose_xyz_yzx ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1320 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

01324                                                                 {
01325 
01326   dim3 numthread(TILEDIM, TILEROWS, 1);
01327   dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
01328 
01329   transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
01330   (nx, ny, nz, xsize_in, ysize_in,
01331     ysize_out, zsize_out,
01332     data_in, data_out);
01333 
01334   cudaCheck(cudaGetLastError());
01335 }
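
Ignoring the shared-memory tiling, the index mapping performed by this transpose is equivalent to the following host-side reference loop (a sketch for clarity only, not part of the file):

#include <cuda_runtime.h>   // for float2 on the host

// Reference xyz -> yzx transpose: element (x, y, z) of the input
// becomes element (y, z, x) of the output.
void transpose_xyz_yzx_ref(int nx, int ny, int nz,
                           int xsize_in, int ysize_in,
                           int ysize_out, int zsize_out,
                           const float2 *data_in, float2 *data_out) {
  for (int z = 0; z < nz; z++)
    for (int y = 0; y < ny; y++)
      for (int x = 0; x < nx; x++)
        data_out[y + (z + x*zsize_out)*ysize_out] =
            data_in[x + (y + z*ysize_in)*xsize_in];
}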

template<typename T>
__device__ __forceinline__ void transpose_xyz_yzx_device ( const int  x_in,
const int  y_in,
const int  z_in,
const int  x_out,
const int  y_out,
const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 770 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, j, TILEDIM, and TILEROWS.

00776                                  {
00777 
00778   // Shared memory
00779   __shared__ T tile[TILEDIM][TILEDIM+1];
00780 
00781   // Read (x,y) data_in into tile (shared memory)
00782   for (int j=0;j < TILEDIM;j += TILEROWS)
00783     if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
00784       tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
00785 
00786   BLOCK_SYNC;
00787 
00788   // Write (y,x) tile into data_out
00789   const int z_out = z_in;
00790   for (int j=0;j < TILEDIM;j += TILEROWS)
00791     if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
00792       data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
00793 }

template<typename T>
__global__ void transpose_xyz_yzx_kernel ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 799 of file CudaPmeSolverUtilKernel.cu.

References TILEDIM.

00803                                  {
00804 
00805   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
00806   int y_in = blockIdx.y * TILEDIM + threadIdx.y;
00807   int z_in = blockIdx.z           + threadIdx.z;
00808 
00809   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
00810   int y_out = blockIdx.y * TILEDIM + threadIdx.x;
00811 
00812   transpose_xyz_yzx_device<T>(
00813     x_in, y_in, z_in,
00814     x_out, y_out,
00815     nx, ny, nz,
00816     xsize_in, ysize_in,
00817     ysize_out, zsize_out,
00818     data_in, data_out);
00819 
00820 /*
00821   // Shared memory
00822   __shared__ T tile[TILEDIM][TILEDIM+1];
00823 
00824   int x = blockIdx.x * TILEDIM + threadIdx.x;
00825   int y = blockIdx.y * TILEDIM + threadIdx.y;
00826   int z = blockIdx.z           + threadIdx.z;
00827 
00828   // Read (x,y) data_in into tile (shared memory)
00829   for (int j=0;j < TILEDIM;j += TILEROWS)
00830     if ((x < nx) && (y + j < ny) && (z < nz))
00831       tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
00832 
00833   BLOCK_SYNC;
00834 
00835   // Write (y,x) tile into data_out
00836   x = blockIdx.x * TILEDIM + threadIdx.y;
00837   y = blockIdx.y * TILEDIM + threadIdx.x;
00838   for (int j=0;j < TILEDIM;j += TILEROWS)
00839     if ((x + j < nx) && (y < ny) && (z < nz))
00840       data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
00841 */
00842 }

void transpose_xyz_zxy ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1357 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

01361                                                                 {
01362 
01363   dim3 numthread(TILEDIM, TILEROWS, 1);
01364   dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
01365 
01366   transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
01367   (nx, ny, nz, xsize_in, ysize_in,
01368     zsize_out, xsize_out,
01369     data_in, data_out);
01370 
01371   cudaCheck(cudaGetLastError());
01372 }
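
As with the yzx case, the index mapping is equivalent to the following host-side reference loop (a sketch for clarity only, not part of the file):

#include <cuda_runtime.h>   // for float2 on the host

// Reference xyz -> zxy transpose: element (x, y, z) of the input
// becomes element (z, x, y) of the output.
void transpose_xyz_zxy_ref(int nx, int ny, int nz,
                           int xsize_in, int ysize_in,
                           int zsize_out, int xsize_out,
                           const float2 *data_in, float2 *data_out) {
  for (int z = 0; z < nz; z++)
    for (int y = 0; y < ny; y++)
      for (int x = 0; x < nx; x++)
        data_out[z + (x + y*xsize_out)*zsize_out] =
            data_in[x + (y + z*ysize_in)*xsize_in];
}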

template<typename T>
__device__ __forceinline__ void transpose_xyz_zxy_device ( const int  x_in,
const int  y_in,
const int  z_in,
const int  x_out,
const int  z_out,
const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 948 of file CudaPmeSolverUtilKernel.cu.

References BLOCK_SYNC, TILEDIM, and TILEROWS.

00954                                  {
00955 
00956   // Shared memory
00957   __shared__ T tile[TILEDIM][TILEDIM+1];
00958 
00959   // Read (x,z) data_in into tile (shared memory)
00960   for (int k=0;k < TILEDIM;k += TILEROWS)
00961     if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
00962       tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
00963 
00964   BLOCK_SYNC;
00965 
00966   // Write (z,x) tile into data_out
00967   const int y_out = y_in;
00968   for (int k=0;k < TILEDIM;k += TILEROWS)
00969     if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
00970       data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
00971 }

template<typename T>
__global__ void transpose_xyz_zxy_kernel ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const T *  data_in,
T *  data_out 
)

Definition at line 977 of file CudaPmeSolverUtilKernel.cu.

References TILEDIM.

00981                                  {
00982 
00983   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
00984   int y_in = blockIdx.z           + threadIdx.z;
00985   int z_in = blockIdx.y * TILEDIM + threadIdx.y;
00986 
00987   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
00988   int z_out = blockIdx.y * TILEDIM + threadIdx.x;
00989 
00990   transpose_xyz_zxy_device<T>(
00991     x_in, y_in, z_in, x_out, z_out,
00992     nx, ny, nz,
00993     xsize_in, ysize_in,
00994     zsize_out, xsize_out,
00995     data_in, data_out);
00996 
00997 }

template<typename T>
__forceinline__ __device__ void write_grid ( const float  val,
const int  ind,
T *  data 
)

Definition at line 305 of file CudaPmeSolverUtilKernel.cu.

00306                       {
00307   atomicAdd(&data[ind], (T)val);
00308 }


Variable Documentation

const int TILEDIM = 32

Definition at line 765 of file CudaPmeSolverUtilKernel.cu.

Referenced by batchTranspose_xyz_yzx(), batchTranspose_xyz_yzx_kernel(), batchTranspose_xyz_zxy(), batchTranspose_xyz_zxy_kernel(), transpose_xyz_yzx(), transpose_xyz_yzx_device(), transpose_xyz_yzx_kernel(), transpose_xyz_zxy(), transpose_xyz_zxy_device(), and transpose_xyz_zxy_kernel().

const int TILEROWS = 8

Definition at line 766 of file CudaPmeSolverUtilKernel.cu.

Referenced by batchTranspose_xyz_yzx(), batchTranspose_xyz_zxy(), transpose_xyz_yzx(), transpose_xyz_yzx_device(), transpose_xyz_zxy(), and transpose_xyz_zxy_device().
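
Both transpose paths launch TILEDIM x TILEROWS thread blocks and stage a TILEDIM x TILEDIM tile in shared memory, so each thread copies TILEDIM/TILEROWS = 4 rows of the tile. The tile is declared as tile[TILEDIM][TILEDIM+1]; the extra padding column staggers the transposed accesses across shared-memory banks and avoids bank conflicts.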


Generated on Sat Sep 23 01:17:16 2017 for NAMD by  doxygen 1.4.7