CudaPmeSolverUtilKernel.h File Reference

Go to the source code of this file.

Classes

struct  TransposeBatch< T >

Functions

void spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
void scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
void gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
void transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
void batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
void batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)


Function Documentation

void batchTranspose_xyz_yzx ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1340 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoYZX().

01343                                                                {
01344 
01345   dim3 numthread(TILEDIM, TILEROWS, 1);
01346   dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
01347 
01348   batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
01349   (batches, ny, nz, xsize_in, ysize_in);
01350 
01351   cudaCheck(cudaGetLastError());
01352 }

void batchTranspose_xyz_zxy ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1377 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoZXY().

01381                        {
01382 
01383   dim3 numthread(TILEDIM, TILEROWS, 1);
01384   dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
01385 
01386   batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
01387   (batches, ny, nz, xsize_in, ysize_in);
01388 
01389   cudaCheck(cudaGetLastError());
01390 }

void gather_force ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
const float *  data,
const int  order,
float3 *  force,
const cudaTextureObject_t  gridTexObj,
cudaStream_t  stream 
)

Definition at line 1200 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and cudaNAMD_bug().

01208                        {
01209 
01210   dim3 nthread(32, 2, 1);
01211   dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
01212   // dim3 nblock(npatch, 1, 1);
01213 
01214   switch(order) {
01215     case 4:
01216     if (periodicY && periodicZ)
01217       gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
01218       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01219         // recip11, recip22, recip33,
01220         data,
01221         gridTexObj,
01222         1, force);
01223     else if (periodicY)
01224       gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
01225       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01226         // recip11, recip22, recip33,
01227         data,
01228         gridTexObj,
01229         1, force);
01230     else if (periodicZ)
01231       gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
01232       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01233         // recip11, recip22, recip33,
01234         data,
01235         gridTexObj,
01236         1, force);
01237     else
01238       gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
01239       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01240         // recip11, recip22, recip33,
01241         data,
01242         gridTexObj,
01243         1, force);
01244     break;
01245 
01246     case 6:
01247     if (periodicY && periodicZ)
01248       gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
01249       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01250         // recip11, recip22, recip33,
01251         data,
01252         gridTexObj,
01253         1, force);
01254     else if (periodicY)
01255       gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
01256       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01257         // recip11, recip22, recip33,
01258         data,
01259         gridTexObj,
01260         1, force);
01261     else if (periodicZ)
01262       gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
01263       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01264         // recip11, recip22, recip33,
01265         data,
01266         gridTexObj,
01267         1, force);
01268     else
01269       gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
01270       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01271         // recip11, recip22, recip33,
01272         data,
01273         gridTexObj,
01274         1, force);
01275     break;
01276  
01277     case 8:
01278     if (periodicY && periodicZ)
01279       gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
01280       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01281         // recip11, recip22, recip33,
01282         data,
01283         gridTexObj,
01284         1, force);
01285     else if (periodicY)
01286       gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
01287       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01288         // recip11, recip22, recip33,
01289         data,
01290         gridTexObj,
01291         1, force);
01292     else if (periodicZ)
01293       gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
01294       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01295         // recip11, recip22, recip33,
01296         data,
01297         gridTexObj,
01298         1, force);
01299     else
01300       gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
01301       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01302         // recip11, recip22, recip33,
01303         data,
01304         gridTexObj,
01305         1, force);
01306     break;
01307 
01308     default:
01309     char str[128];
01310     sprintf(str, "gather_force, order %d not implemented",order);
01311     cudaNAMD_bug(str);
01312   }
01313   cudaCheck(cudaGetLastError());
01314 
01315 }

void scalar_sum ( const bool  orderXYZ,
const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const double  kappa,
const float  recip1x,
const float  recip1y,
const float  recip1z,
const float  recip2x,
const float  recip2y,
const float  recip2z,
const float  recip3x,
const float  recip3y,
const float  recip3z,
const double  volume,
const float *  prefac1,
const float *  prefac2,
const float *  prefac3,
const int  k2_00,
const int  k3_00,
const bool  doEnergyVirial,
double *  energy,
double *  virial,
float2 *  data,
cudaStream_t  stream 
)

Definition at line 1136 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and M_PI.

Referenced by CudaPmeKSpaceCompute::solve().

01145                        {
01146 
01147   int nthread = 1024;
01148   int nblock = 64;
01149 
01150   int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
01151   if (doEnergyVirial) {
01152     const int warpSize = 32;      
01153     shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
01154   }
01155 
01156   float piv_inv = (float)(1.0/(M_PI*volume));
01157   float fac = (float)(M_PI*M_PI/(kappa*kappa));
01158 
01159   if (doEnergyVirial) {
01160     if (orderXYZ) {
01161       scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
01162       (nfft1, nfft2, nfft3, size1, size2, size3,
01163         recip1x, recip1y, recip1z,
01164         recip2x, recip2y, recip2z,
01165         recip3x, recip3y, recip3z,
01166         prefac1, prefac2, prefac3,
01167         fac, piv_inv, k2_00, k3_00, data, energy, virial);
01168     } else {
01169       scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
01170       (nfft1, nfft2, nfft3, size1, size2, size3,
01171         recip1x, recip1y, recip1z,
01172         recip2x, recip2y, recip2z,
01173         recip3x, recip3y, recip3z,
01174         prefac1, prefac2, prefac3,
01175         fac, piv_inv, k2_00, k3_00, data, energy, virial);
01176     }
01177   } else {
01178     if (orderXYZ) {
01179       scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
01180       (nfft1, nfft2, nfft3, size1, size2, size3,
01181         recip1x, recip1y, recip1z,
01182         recip2x, recip2y, recip2z,
01183         recip3x, recip3y, recip3z,
01184         prefac1, prefac2, prefac3,
01185         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
01186     } else {
01187       scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
01188       (nfft1, nfft2, nfft3, size1, size2, size3,
01189         recip1x, recip1y, recip1z,
01190         recip2x, recip2y, recip2z,
01191         recip3x, recip3y, recip3z,
01192         prefac1, prefac2, prefac3,
01193         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
01194     }
01195   }
01196   cudaCheck(cudaGetLastError());
01197 
01198 }

void spread_charge ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
float *  data,
const int  order,
cudaStream_t  stream 
)

Definition at line 1042 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, and cudaNAMD_bug().

Referenced by CudaPmeRealSpaceCompute::spreadCharge().

01047                                                      {
01048 
01049   dim3 nthread, nblock;
01050 
01051   switch(order) {
01052   case 4:
01053     nthread.x = 32;
01054     nthread.y = 4;
01055     nthread.z = 1;
01056     nblock.x = (numAtoms - 1)/nthread.x + 1;
01057     nblock.y = 1;
01058     nblock.z = 1;
01059     if (periodicY && periodicZ)
01060       spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
01061         (atoms, numAtoms,
01062          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01063     else if (periodicY)
01064       spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
01065         (atoms, numAtoms,
01066          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01067     else if (periodicZ)
01068       spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
01069         (atoms, numAtoms,
01070          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01071     else
01072       spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
01073         (atoms, numAtoms,
01074          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01075     break;
01076 
01077   case 6:
01078     nthread.x = 32;
01079     nthread.y = 7;
01080     nthread.z = 1;
01081     nblock.x = (numAtoms - 1)/nthread.x + 1;
01082     nblock.y = 1;
01083     nblock.z = 1;
01084     if (periodicY && periodicZ)
01085       spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
01086         (atoms, numAtoms,
01087          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01088     else if (periodicY)
01089       spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
01090         (atoms, numAtoms,
01091          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01092     else if (periodicZ)
01093       spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
01094         (atoms, numAtoms,
01095          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01096     else
01097       spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
01098         (atoms, numAtoms,
01099          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01100     break;
01101 
01102   case 8:
01103     nthread.x = 32;
01104     nthread.y = 16;
01105     nthread.z = 1;
01106     nblock.x = (numAtoms - 1)/nthread.x + 1;
01107     nblock.y = 1;
01108     nblock.z = 1;
01109     if (periodicY && periodicZ)
01110       spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
01111         (atoms, numAtoms,
01112          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01113     else if (periodicY)
01114       spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
01115         (atoms, numAtoms,
01116          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01117     else if (periodicZ)
01118       spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
01119         (atoms, numAtoms,
01120          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01121     else
01122       spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
01123         (atoms, numAtoms,
01124          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01125     break;
01126 
01127   default:
01128     char str[128];
01129     sprintf(str, "spread_charge, order %d not implemented",order);
01130     cudaNAMD_bug(str);
01131   }
01132   cudaCheck(cudaGetLastError());
01133 
01134 }

void transpose_xyz_yzx ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1320 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

01324                                                                 {
01325 
01326   dim3 numthread(TILEDIM, TILEROWS, 1);
01327   dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
01328 
01329   transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
01330   (nx, ny, nz, xsize_in, ysize_in,
01331     ysize_out, zsize_out,
01332     data_in, data_out);
01333 
01334   cudaCheck(cudaGetLastError());
01335 }

void transpose_xyz_zxy ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1357 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

01361                                                                 {
01362 
01363   dim3 numthread(TILEDIM, TILEROWS, 1);
01364   dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
01365 
01366   transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
01367   (nx, ny, nz, xsize_in, ysize_in,
01368     zsize_out, xsize_out,
01369     data_in, data_out);
01370 
01371   cudaCheck(cudaGetLastError());
01372 }


Generated on Mon Nov 20 01:17:15 2017 for NAMD by  doxygen 1.4.7