NAMD
CudaPmeSolverUtilKernel.h File Reference
#include "HipDefines.h"


Classes

struct  TransposeBatch< T >
 

Functions

void spread_charge (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
 
void scalar_sum (const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
 
void gather_force (const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
 
void transpose_xyz_yzx (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_yzx (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
 
void transpose_xyz_zxy (const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
 
void batchTranspose_xyz_zxy (const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
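
Detailed Description

Together these launchers cover the GPU reciprocal-space (PME) path: spread_charge deposits the atomic charges onto the real-space grid, scalar_sum applies the reciprocal-space convolution to the Fourier-transformed grid, gather_force interpolates the forces back onto the atoms, and the transpose routines reorder the grid between the 1D FFT stages (see the "Referenced by" notes below: CudaPmeRealSpaceCompute, CudaPmeKSpaceCompute and CudaPmeTranspose). The driver below is only an illustrative sketch of that call order; the FFTs, buffer allocation and grid geometry (sizes, pitches, reciprocal vectors, B-spline prefactors) are assumed to be set up by the caller, as the CudaPme* compute classes do in the real code.

// Illustrative call-order sketch only; every buffer and geometry parameter is
// assumed to be prepared by the caller. Not part of the NAMD API.
#include <cuda_runtime.h>
#include "CudaPmeSolverUtilKernel.h"

void pme_reciprocal_sketch(const float4 *d_atoms, int numAtoms,
                           int nfftx, int nffty, int nfftz,           // FFT grid dimensions
                           int csize1, int csize2, int csize3,        // complex-grid sizes seen by scalar_sum
                           float *d_realGrid, float2 *d_complexGrid,  // real-space and Fourier-space grids
                           const float *prefac1, const float *prefac2, const float *prefac3,
                           const float recip[9],                      // reciprocal box vectors, recip1x..recip3z
                           double kappa, double volume,
                           double *d_energy, double *d_virial,        // device-side accumulators
                           float3 *d_force, cudaTextureObject_t gridTexObj,
                           cudaStream_t stream) {
  const int order = 4;  // B-spline interpolation order; 4, 6 and 8 are supported

  // 1. Spread charges onto the real-space grid (fully periodic, unpadded pitch assumed).
  spread_charge(d_atoms, numAtoms, nfftx, nffty, nfftz,
                nfftx, nffty, nfftz, /*xdim=*/nfftx, /*y00=*/0, /*z00=*/0,
                /*periodicY=*/true, /*periodicZ=*/true, d_realGrid, order, stream);

  // 2. Forward R2C FFT of d_realGrid into d_complexGrid happens elsewhere (cuFFT/hipFFT).

  // 3. Reciprocal-space convolution; with doEnergyVirial the energy and virial are
  //    accumulated into the device buffers.
  scalar_sum(/*orderXYZ=*/true, nfftx, nffty, nfftz, csize1, csize2, csize3, kappa,
             recip[0], recip[1], recip[2],
             recip[3], recip[4], recip[5],
             recip[6], recip[7], recip[8],
             volume, prefac1, prefac2, prefac3,
             /*k2_00=*/0, /*k3_00=*/0, /*doEnergyVirial=*/true,
             d_energy, d_virial, d_complexGrid, stream);

  // 4. Inverse C2R FFT back into d_realGrid happens elsewhere.

  // 5. Interpolate the per-atom forces off the potential grid.
  gather_force(d_atoms, numAtoms, nfftx, nffty, nfftz,
               nfftx, nffty, nfftz, /*xdim=*/nfftx, /*y00=*/0, /*z00=*/0,
               /*periodicY=*/true, /*periodicZ=*/true,
               d_realGrid, order, d_force, gridTexObj, stream);
}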
 

Function Documentation

void batchTranspose_xyz_yzx ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1390 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoYZX().

{

  dim3 numthread(TILEDIM, TILEROWS, 1);
  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);

  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);

  cudaCheck(cudaGetLastError());
}
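
Each TransposeBatch entry describes one independent sub-grid, and the launch packs all of them into a single kernel call: the z dimension of the block grid stacks the nz planes of every batch back to back, while each TILEDIM x TILEROWS thread block handles one TILEDIM x TILEDIM tile of an (x,y) plane. A small host-side illustration of the resulting launch geometry, assuming TILEDIM = 32 and TILEROWS = 8 (illustrative values; the real constants are defined in CudaPmeSolverUtilKernel.cu):

// Worked example of the block-grid arithmetic used above (assumed tile constants).
#include <cstdio>

int main() {
  const int TILEDIM = 32, TILEROWS = 8;
  const int numBatches = 3, max_nx = 100, ny = 96, nz = 80;
  int bx = (max_nx - 1) / TILEDIM + 1;  // 4 tile columns across x
  int by = (ny - 1) / TILEDIM + 1;      // 3 tile rows across y
  int bz = nz * numBatches;             // 240 (x,y) planes, all batches stacked together
  std::printf("numblock = (%d, %d, %d), numthread = (%d, %d, 1)\n",
              bx, by, bz, TILEDIM, TILEROWS);
  return 0;
}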
void batchTranspose_xyz_zxy ( const int  numBatches,
TransposeBatch< float2 > *  batches,
const int  max_nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
cudaStream_t  stream 
)

Definition at line 1425 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

Referenced by CudaPmeTranspose::transposeXYZtoZXY().

{

  dim3 numthread(TILEDIM, TILEROWS, 1);
  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);

  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);

  cudaCheck(cudaGetLastError());
}
void gather_force ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
const float *  data,
const int  order,
float3 *  force,
const cudaTextureObject_t  gridTexObj,
cudaStream_t  stream 
)

Definition at line 1225 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, and cudaNAMD_bug().

{

  dim3 nthread(32, 2, 1);
  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
  // dim3 nblock(npatch, 1, 1);

  switch(order) {
  case 4:
    if (periodicY && periodicZ)
      gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else if (periodicY)
      gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else if (periodicZ)
      gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else
      gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    break;

  case 6:
    if (periodicY && periodicZ)
      gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else if (periodicY)
      gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else if (periodicZ)
      gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else
      gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    break;

  case 8:
    if (periodicY && periodicZ)
      gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else if (periodicY)
      gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else if (periodicZ)
      gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    else
      gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>> (
        atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
        // recip11, recip22, recip33,
        data,
#ifdef NAMD_CUDA
        gridTexObj,
#endif
        1, force);
    break;

  default:
    char str[128];
    sprintf(str, "gather_force, order %d not implemented",order);
    cudaNAMD_bug(str);
  }
  cudaCheck(cudaGetLastError());

}
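
The nested switch/if converts the runtime interpolation order and the two periodicity flags into compile-time template parameters, so every specialization of the gather_force kernel can fully unroll its inner loops; unsupported orders fall through to cudaNAMD_bug. The block shape is fixed at 32 x 2 threads, with one block per 32 atoms. A stripped-down, hypothetical illustration of the same dispatch idiom (the kernel and function names below are not part of NAMD):

// Hypothetical kernel and dispatcher showing the runtime-to-compile-time
// dispatch pattern; none of these names exist in the NAMD sources.
#include <cstdio>
#include <cuda_runtime.h>

template <int ORDER, bool PERIODIC_Y>
__global__ void demo_kernel(int numAtoms) {
  if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0)
    printf("ORDER=%d PERIODIC_Y=%d atoms=%d\n", ORDER, (int)PERIODIC_Y, numAtoms);
}

void launch_demo(int order, bool periodicY, int numAtoms, cudaStream_t stream) {
  dim3 nthread(32, 2, 1);                             // same block shape as gather_force
  dim3 nblock((numAtoms - 1) / nthread.x + 1, 1, 1);  // one block per 32 atoms
  switch (order) {
    case 4:
      if (periodicY) demo_kernel<4, true ><<< nblock, nthread, 0, stream >>>(numAtoms);
      else           demo_kernel<4, false><<< nblock, nthread, 0, stream >>>(numAtoms);
      break;
    case 6:
      if (periodicY) demo_kernel<6, true ><<< nblock, nthread, 0, stream >>>(numAtoms);
      else           demo_kernel<6, false><<< nblock, nthread, 0, stream >>>(numAtoms);
      break;
    default:
      std::fprintf(stderr, "order %d not implemented\n", order);
  }
}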
void scalar_sum ( const bool  orderXYZ,
const int  nfft1,
const int  nfft2,
const int  nfft3,
const int  size1,
const int  size2,
const int  size3,
const double  kappa,
const float  recip1x,
const float  recip1y,
const float  recip1z,
const float  recip2x,
const float  recip2y,
const float  recip2z,
const float  recip3x,
const float  recip3y,
const float  recip3z,
const double  volume,
const float *  prefac1,
const float *  prefac2,
const float *  prefac3,
const int  k2_00,
const int  k3_00,
const bool  doEnergyVirial,
double *  energy,
double *  virial,
float2 *  data,
cudaStream_t  stream 
)

Definition at line 1158 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, M_PI, and WARPSIZE.

Referenced by CudaPmeKSpaceCompute::solve().

{
#ifdef NAMD_CUDA
  int nthread = 1024;
  int nblock = 64;
#else
  int nthread = 256;
  int nblock = 256;
#endif

  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
  if (doEnergyVirial) {
    shmem_size = max(shmem_size, (int)((nthread/WARPSIZE)*sizeof(RecipVirial_t)));
  }

  float piv_inv = (float)(1.0/(M_PI*volume));
  float fac = (float)(M_PI*M_PI/(kappa*kappa));

  if (doEnergyVirial) {
    if (orderXYZ) {
      scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>> (
        nfft1, nfft2, nfft3, size1, size2, size3,
        recip1x, recip1y, recip1z,
        recip2x, recip2y, recip2z,
        recip3x, recip3y, recip3z,
        prefac1, prefac2, prefac3,
        fac, piv_inv, k2_00, k3_00, data, energy, virial);
    } else {
      scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>> (
        nfft1, nfft2, nfft3, size1, size2, size3,
        recip1x, recip1y, recip1z,
        recip2x, recip2y, recip2z,
        recip3x, recip3y, recip3z,
        prefac1, prefac2, prefac3,
        fac, piv_inv, k2_00, k3_00, data, energy, virial);
    }
  } else {
    if (orderXYZ) {
      scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>> (
        nfft1, nfft2, nfft3, size1, size2, size3,
        recip1x, recip1y, recip1z,
        recip2x, recip2y, recip2z,
        recip3x, recip3y, recip3z,
        prefac1, prefac2, prefac3,
        fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
    } else {
      scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>> (
        nfft1, nfft2, nfft3, size1, size2, size3,
        recip1x, recip1y, recip1z,
        recip2x, recip2y, recip2z,
        recip3x, recip3y, recip3z,
        prefac1, prefac2, prefac3,
        fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
    }
  }
  cudaCheck(cudaGetLastError());

}
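
The dynamic shared memory must hold either the three B-spline prefactor arrays (one float per grid line in each dimension) or, when energies and virials are requested, one RecipVirial_t per warp for the block-level reduction, whichever is larger; piv_inv = 1/(pi*V) and fac = pi^2/kappa^2 are the usual Ewald reciprocal-space constants. The arithmetic below reproduces the sizing for a 96^3 grid with the CUDA launch parameters, assuming RecipVirial_t packs one energy plus six virial doubles (an assumption; the actual definition lives in CudaPmeSolverUtilKernel.cu):

// Shared-memory sizing of scalar_sum for a 96^3 grid, 1024 threads, WARPSIZE = 32.
#include <algorithm>
#include <cstdio>

int main() {
  const int nfft1 = 96, nfft2 = 96, nfft3 = 96;
  const int nthread = 1024, WARPSIZE = 32;
  const int sizeofRecipVirial = 7 * sizeof(double);                   // assumed: energy + 6 virial terms
  int shmem = (int)(sizeof(float) * (nfft1 + nfft2 + nfft3));         // 1152 bytes of prefactors
  shmem = std::max(shmem, (nthread / WARPSIZE) * sizeofRecipVirial);  // 32 warps * 56 bytes = 1792 bytes
  std::printf("shmem_size = %d bytes\n", shmem);
  return 0;
}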
void spread_charge ( const float4 *  atoms,
const int  numAtoms,
const int  nfftx,
const int  nffty,
const int  nfftz,
const int  xsize,
const int  ysize,
const int  zsize,
const int  xdim,
const int  y00,
const int  z00,
const bool  periodicY,
const bool  periodicZ,
float *  data,
const int  order,
cudaStream_t  stream 
)

Definition at line 1072 of file CudaPmeSolverUtilKernel.cu.

References atoms, cudaCheck, cudaNAMD_bug(), and WARPSIZE.

Referenced by CudaPmeRealSpaceCompute::spreadCharge().

{

  dim3 nthread, nblock;
  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
  switch(order) {
  case 4:
    nthread.x = WARPSIZE;
#if defined(NAMD_CUDA)
    nthread.y = 4;
#else
    nthread.y = order3 / WARPSIZE;
#endif
    nthread.z = 1;
    nblock.x = (numAtoms - 1)/nthread.x + 1;
    nblock.y = 1;
    nblock.z = 1;
    if (periodicY && periodicZ)
      spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicY)
      spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicZ)
      spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else
      spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    break;

  case 6:
    nthread.x = WARPSIZE;
    nthread.y = order3/WARPSIZE;
    nthread.z = 1;
    nblock.x = (numAtoms - 1)/nthread.x + 1;
    nblock.y = 1;
    nblock.z = 1;
    if (periodicY && periodicZ)
      spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicY)
      spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicZ)
      spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else
      spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    break;

  case 8:
    nthread.x = WARPSIZE;
    nthread.y = order3/WARPSIZE;
    nthread.z = 1;
    nblock.x = (numAtoms - 1)/nthread.x + 1;
    nblock.y = 1;
    nblock.z = 1;
    if (periodicY && periodicZ)
      spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicY)
      spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else if (periodicZ)
      spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    else
      spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
        nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
    break;

  default:
    char str[128];
    sprintf(str, "spread_charge, order %d not implemented",order);
    cudaNAMD_bug(str);
  }
  cudaCheck(cudaGetLastError());

}
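
The only order-dependent piece of the launch is nthread.y: order3 rounds order^3 (the number of grid points each atom spreads onto) up to a whole number of warps, and on the HIP path nthread.y = order3/WARPSIZE, while the CUDA path pins nthread.y to 4 for order 4. The resulting values, assuming WARPSIZE = 32:

// Thread-block sizing of spread_charge for each supported order, with WARPSIZE = 32.
#include <cstdio>

int main() {
  const int WARPSIZE = 32;
  const int orders[] = {4, 6, 8};
  for (int order : orders) {
    int order3 = ((order * order * order - 1) / WARPSIZE + 1) * WARPSIZE;  // round up to full warps
    std::printf("order %d: order^3 = %3d -> order3 = %3d, nthread.y = %2d\n",
                order, order * order * order, order3, order3 / WARPSIZE);
  }
  return 0;
}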
void transpose_xyz_yzx ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  ysize_out,
const int  zsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1371 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

{

  dim3 numthread(TILEDIM, TILEROWS, 1);
  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (
    nx, ny, nz, xsize_in, ysize_in,
    ysize_out, zsize_out,
    data_in, data_out);

  cudaCheck(cudaGetLastError());
}
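
For reference, the index mapping this launcher performs can be written out on the host, assuming the conventional strided layouts (input x-fastest with pitches xsize_in and xsize_in*ysize_in, output y-fastest with pitches ysize_out and ysize_out*zsize_out). This is an illustrative sketch of the intended mapping, not the kernel itself:

// Host reference of the xyz -> yzx mapping, under the layout assumptions stated above.
#include <cstddef>

struct f2 { float x, y; };  // stand-in for float2 on the host

void transpose_xyz_yzx_host(int nx, int ny, int nz,
                            int xsize_in, int ysize_in,
                            int ysize_out, int zsize_out,
                            const f2 *in, f2 *out) {
  for (int z = 0; z < nz; ++z)
    for (int y = 0; y < ny; ++y)
      for (int x = 0; x < nx; ++x)
        out[y + (size_t)ysize_out * (z + (size_t)zsize_out * x)] =
            in[x + (size_t)xsize_in * (y + (size_t)ysize_in * z)];
}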
void transpose_xyz_zxy ( const int  nx,
const int  ny,
const int  nz,
const int  xsize_in,
const int  ysize_in,
const int  zsize_out,
const int  xsize_out,
const float2 *  data_in,
float2 *  data_out,
cudaStream_t  stream 
)

Definition at line 1406 of file CudaPmeSolverUtilKernel.cu.

References cudaCheck, TILEDIM, and TILEROWS.

{

  dim3 numthread(TILEDIM, TILEROWS, 1);
  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (
    nx, ny, nz, xsize_in, ysize_in,
    zsize_out, xsize_out,
    data_in, data_out);

  cudaCheck(cudaGetLastError());
}
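
The zxy variant differs from the yzx one only in the output ordering (z fastest, then x, then y) and, correspondingly, in the block grid, which tiles x against z and stacks the ny planes in the third grid dimension. Under the same layout assumptions as the yzx sketch above, a host reference would read:

// Host reference of the xyz -> zxy mapping, assuming a z-fastest output with
// pitches zsize_out and zsize_out*xsize_out (illustrative, not the kernel).
#include <cstddef>

struct f2c { float x, y; };  // stand-in for float2 on the host

void transpose_xyz_zxy_host(int nx, int ny, int nz,
                            int xsize_in, int ysize_in,
                            int zsize_out, int xsize_out,
                            const f2c *in, f2c *out) {
  for (int z = 0; z < nz; ++z)
    for (int y = 0; y < ny; ++y)
      for (int x = 0; x < nx; ++x)
        out[z + (size_t)zsize_out * (x + (size_t)xsize_out * y)] =
            in[x + (size_t)xsize_in * (y + (size_t)ysize_in * z)];
}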