NAMD
CudaComputeNonbondedKernel Class Reference

#include <CudaComputeNonbondedKernel.h>

Public Member Functions

 CudaComputeNonbondedKernel (int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 
 ~CudaComputeNonbondedKernel ()
 
void updateVdwTypesExcl (const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
 
void nonbondedForce (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
 
void reduceVirialEnergy (CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
 
void getVirialEnergy (VirialEnergy *h_virialEnergy, cudaStream_t stream)
 
void bindExclusions (int numExclusions, unsigned int *exclusion_bits)
 
int * getPatchReadyQueue ()
 
void reallocate_forceSOA (int atomStorageSize)
 

Detailed Description
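
As the member listings below show, this class owns the per-device buffers for the nonbonded computation (vdW types, atom indices, exclusion data, and structure-of-arrays force accumulators) and launches the CUDA kernels that evaluate short-range nonbonded forces, energies, and virials on a single GPU. When constructed with doStreaming enabled, completed patches are additionally published to a host-mapped ready queue (see getPatchReadyQueue) so the host can consume results while the kernel is still running.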

Definition at line 9 of file CudaComputeNonbondedKernel.h.

Constructor & Destructor Documentation

CudaComputeNonbondedKernel::CudaComputeNonbondedKernel ( int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 1155 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

1156  : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
1157 
1158  cudaCheck(cudaSetDevice(deviceID));
1159 
1160  overflowExclusions = NULL;
1161  overflowExclusionsSize = 0;
1162 
1163  exclIndexMaxDiff = NULL;
1164  exclIndexMaxDiffSize = 0;
1165 
1166  atomIndex = NULL;
1167  atomIndexSize = 0;
1168 
1169  vdwTypes = NULL;
1170  vdwTypesSize = 0;
1171 
1172  patchNumCount = NULL;
1173  patchNumCountSize = 0;
1174 
1175  patchReadyQueue = NULL;
1176  patchReadyQueueSize = 0;
1177 
1178  force_x = force_y = force_z = force_w = NULL;
1179  forceSize = 0;
1180  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
1181  forceSlowSize = 0;
1182 }
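
The constructor only records its arguments and zero-initializes the buffer pointers; all device memory is allocated lazily by the member functions below. A minimal construction sketch (hypothetical surrounding code; the CudaNonbondedTables constructor signature is assumed here, see CudaNonbondedTables.h):

    // Sketch: one kernel object per GPU device. The tables object is
    // assumed to be built for the same deviceID.
    int deviceID = 0;
    CudaNonbondedTables tables(deviceID);
    CudaComputeNonbondedKernel kernel(deviceID, tables, /*doStreaming=*/true);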
CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel ( )

Definition at line 1196 of file CudaComputeNonbondedKernel.cu.

References cudaCheck.

1196  {
1197  cudaCheck(cudaSetDevice(deviceID));
1198  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
1199  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
1200  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
1201  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
1202  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
1203  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
1204  if (force_x != NULL) deallocate_device<float>(&force_x);
1205  if (force_y != NULL) deallocate_device<float>(&force_y);
1206  if (force_z != NULL) deallocate_device<float>(&force_z);
1207  if (force_w != NULL) deallocate_device<float>(&force_w);
1208  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
1209  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
1210  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
1211  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
1212 }

Member Function Documentation

void CudaComputeNonbondedKernel::bindExclusions ( int  numExclusions,
unsigned int *  exclusion_bits 
)

Definition at line 1461 of file CudaComputeNonbondedKernel.cu.

References constExclusions, cudaCheck, and MAX_CONST_EXCLUSIONS.

1461  {
1462 #ifdef NAMD_CUDA
1463 // TODO-HIP: cudaMemcpyToSymbol crashes on HIP-hcc with single or multiple GPUs.
1464 // Disable it considering that using constant memory does not improve performance.
1465 // This explains some choices made above.
1466  int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
1467  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
1468 #endif
1469  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
1470  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
1471 }
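
On CUDA builds the first MAX_CONST_EXCLUSIONS words land in the constExclusions constant-memory array, while the full bitmask is always mirrored in the overflowExclusions device buffer that the force kernel falls back to; on HIP the constant-memory copy is skipped entirely (see the TODO-HIP comment above). A hedged usage sketch, assuming the exclusion bitmask has already been packed by NAMD's setup code:

    // Sketch: the packing of exclusion bits happens elsewhere in NAMD;
    // buildExclusionBits() is a hypothetical stand-in for that step.
    std::vector<unsigned int> bits = buildExclusionBits();  // hypothetical
    kernel.bindExclusions((int)bits.size(), bits.data());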
int * CudaComputeNonbondedKernel::getPatchReadyQueue ( )

Definition at line 1226 of file CudaComputeNonbondedKernel.cu.

References NAMD_die().

Referenced by CudaComputeNonbonded::launchWork().

1226  {
1227  if (!doStreaming) {
1228  NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
1229  }
1230  return patchReadyQueue;
1231 }
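
The queue is host memory allocated with cudaHostAllocMapped in nonbondedForce, so the streaming kernel writes indices of completed patches straight into host-visible memory; entries are reset to -1 before launch. A hedged polling sketch (the real consumer is CudaComputeNonbonded::launchWork, and processPatch is hypothetical):

    // Sketch: spin until the GPU publishes each completed patch index.
    int* queue = kernel.getPatchReadyQueue();
    for (int i = 0; i < numPatches; i++) {
      volatile int* slot = (volatile int*)&queue[i];
      while (*slot == -1) { /* busy-wait; NAMD interleaves other work here */ }
      processPatch(*slot);   // hypothetical per-patch force consumer
    }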
void CudaComputeNonbondedKernel::getVirialEnergy ( VirialEnergy *  h_virialEnergy,
cudaStream_t  stream 
)
void CudaComputeNonbondedKernel::nonbondedForce ( CudaTileListKernel &  tlKernel,
const int  atomStorageSize,
const bool  doPairlist,
const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const float3  lata,
const float3  latb,
const float3  latc,
const float4 *  h_xyzq,
const float  cutoff2,
float4 *  d_forces,
float4 *  d_forcesSlow,
float4 *  h_forces,
float4 *  h_forcesSlow,
cudaStream_t  stream 
)

Definition at line 1250 of file CudaComputeNonbondedKernel.cu.

References CALL, CudaTileListKernel::clearTileListStat(), cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getNumPatches(), CudaTileListKernel::getNumTileLists(), CudaTileListKernel::getOutputOrder(), NAMD_die(), NONBONDKERNEL_NUM_WARP, numPatches, CudaTileListKernel::setTileListVirialEnergyLength(), and stream.

1257  {
1258 
1259  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
1260 
1261  // clear_device_array<float4>(d_forces, atomStorageSize, stream);
1262  // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
1263 
1264 
1265  // XXX TODO: Clear all of these
1266  if(1){
1267  // two clears
1268  tlKernel.clearTileListStat(stream);
1269  clear_device_array<float>(force_x, atomStorageSize, stream);
1270  clear_device_array<float>(force_y, atomStorageSize, stream);
1271  clear_device_array<float>(force_z, atomStorageSize, stream);
1272  clear_device_array<float>(force_w, atomStorageSize, stream);
1273  if (doSlow) {
1274  clear_device_array<float>(forceSlow_x, atomStorageSize, stream);
1275  clear_device_array<float>(forceSlow_y, atomStorageSize, stream);
1276  clear_device_array<float>(forceSlow_z, atomStorageSize, stream);
1277  clear_device_array<float>(forceSlow_w, atomStorageSize, stream);
1278  }
1279  }
1280 
1281  // --- streaming ----
1282  float4* m_forces = NULL;
1283  float4* m_forcesSlow = NULL;
1284  int* m_patchReadyQueue = NULL;
1285  int numPatches = 0;
1286  unsigned int* patchNumCountPtr = NULL;
1287  if (doStreaming) {
1288  numPatches = tlKernel.getNumPatches();
1289  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
1290  // If re-allocated, clear array
1291  clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
1292  }
1293  patchNumCountPtr = patchNumCount;
1294  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
1295  if (re) {
1296  // If re-allocated, re-set to "-1"
1297  for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
1298  }
1299  cudaCheck(cudaHostGetDevicePointer((void**)&m_patchReadyQueue, patchReadyQueue, 0));
1300  cudaCheck(cudaHostGetDevicePointer((void**)&m_forces, h_forces, 0));
1301  cudaCheck(cudaHostGetDevicePointer((void**)&m_forcesSlow, h_forcesSlow, 0));
1302  }
1303  // -----------------
1304 
1305  if (doVirial || doEnergy) {
1306  tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
1307  }
1308 
1309  int shMemSize = 0;
1310 
1311  int* outputOrderPtr = tlKernel.getOutputOrder();
1312 
1313  int nwarp = NONBONDKERNEL_NUM_WARP;
1314  int nthread = WARPSIZE*nwarp;
1315  int start = 0;
1316  while (start < tlKernel.getNumTileLists())
1317  {
1318 
1319  int nleft = tlKernel.getNumTileLists() - start;
1320  int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
1321 
1322 #ifdef USE_TABLE_ARRAYS
1323  #define TABLE_PARAMS \
1324  cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
1325 #else
1326  #define TABLE_PARAMS \
1327  cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
1328 #endif
1329 
1330 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
1331  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
1332  <<< nblock, nthread, shMemSize, stream >>> (\
1333  start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
1334  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), cudaNonbondedTables.getVdwCoefTableTex(), \
1335  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
1336  TABLE_PARAMS, \
1337  tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
1338  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
1339  tlKernel.getBoundingBoxes(), \
1340  force_x, force_y, force_z, force_w, \
1341  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
1342  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
1343  outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
1344 
1345  bool called = false;
1346 
1347  if (doStreaming) {
1348  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
1349  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
1350  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
1351  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
1352  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
1353  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
1354  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
1355  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
1356 
1357  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1);
1358  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1);
1359  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1);
1360  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1);
1361  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1);
1362  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1);
1363  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1);
1364  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1);
1365  } else {
1366  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
1367  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
1368  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
1369  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
1370  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
1371  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
1372  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
1373  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
1374 
1375  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0);
1376  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0);
1377  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0);
1378  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0);
1379  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0);
1380  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0);
1381  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0);
1382  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0);
1383  }
1384 
1385  if (!called) {
1386  NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
1387  }
1388 
1389  {
1390  int block = 128;
1391  int grid = (atomStorageSize + block - 1)/block;
1392  if (doSlow)
1393  transposeForcesKernel<1><<<grid, block, 0, stream>>>(
1394  d_forces, d_forcesSlow,
1395  force_x, force_y, force_z, force_w,
1396  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1397  atomStorageSize);
1398  else
1399  transposeForcesKernel<0><<<grid, block, 0, stream>>>(
1400  d_forces, d_forcesSlow,
1401  force_x, force_y, force_z, force_w,
1402  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1403  atomStorageSize);
1404  }
1405 
1406 #undef CALL
1407 #undef TABLE_PARAMS
1408  cudaCheck(cudaGetLastError());
1409 
1410  start += nblock*nwarp;
1411  }
1412 
1413 }
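
The CALL macro folds the five runtime booleans into compile-time template parameters, so each of the 32 possible instantiations of nonbondedForceKernel is fully specialized and the compiler strips untaken branches; exactly one CALL fires per launch, which the called flag verifies. The outer while loop chunks the tile lists so that no single launch exceeds deviceCUDA->getMaxNumBlocks(). A hedged host-side call sketch (buffer allocation and tile-list construction belong to CudaComputeNonbonded and CudaTileListKernel in NAMD itself; names here are illustrative):

    // Illustrative only: h_xyzq, d_forces, lattice vectors, etc. are
    // assumed to have been prepared by the caller.
    kernel.reallocate_forceSOA(atomStorageSize);        // size the SOA scratch
    kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes,
                              h_exclIndexMaxDiff, h_atomIndex, stream);
    kernel.nonbondedForce(tlKernel, atomStorageSize,
                          /*doPairlist=*/true,  /*doEnergy=*/true,
                          /*doVirial=*/true,    /*doSlow=*/true,
                          lata, latb, latc, h_xyzq, cutoff*cutoff,
                          d_forces, d_forcesSlow, h_forces, h_forcesSlow,
                          stream);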
void CudaComputeNonbondedKernel::reallocate_forceSOA ( int  atomStorageSize)

Definition at line 1184 of file CudaComputeNonbondedKernel.cu.

1185 {
1186  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
1187  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
1188  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
1189  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
1190  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
1191  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
1192  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
1193  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
1194 }
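
Forces are accumulated in separate x/y/z/w float arrays (structure of arrays), presumably so the force kernel can issue coalesced accumulations; transposeForcesKernel in nonbondedForce gathers them back into the float4 output buffers. The 1.4f argument is an over-allocation factor that amortizes regrowth as atomStorageSize fluctuates. A simplified sketch of that SOA-to-AoS gather (the real kernel is templated on doSlow and handles both force sets at once):

    // Simplified sketch of the gather performed by transposeForcesKernel.
    __global__ void transposeForcesSketch(float4* out, const float* fx,
                                          const float* fy, const float* fz,
                                          const float* fw, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = make_float4(fx[i], fy[i], fz[i], fw[i]);
    }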
void CudaComputeNonbondedKernel::reduceVirialEnergy ( CudaTileListKernel &  tlKernel,
const int  atomStorageSize,
const bool  doEnergy,
const bool  doVirial,
const bool  doSlow,
const bool  doGBIS,
float4 *  d_forces,
float4 *  d_forcesSlow,
VirialEnergy *  d_virialEnergy,
cudaStream_t  stream 
)

Definition at line 1418 of file CudaComputeNonbondedKernel.cu.

References ATOMIC_BINS, cudaCheck, deviceCUDA, CudaTileListKernel::get_xyzq(), DeviceCUDA::getMaxNumBlocks(), CudaTileListKernel::getTileListVirialEnergy(), CudaTileListKernel::getTileListVirialEnergyGBISLength(), CudaTileListKernel::getTileListVirialEnergyLength(), REDUCEGBISENERGYKERNEL_NUM_WARP, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP, REDUCEVIRIALENERGYKERNEL_NUM_WARP, stream, and WARPSIZE.

Referenced by CudaComputeNonbonded::launchWork().

1421  {
1422 
1423  if (doEnergy || doVirial) {
1424  clear_device_array<VirialEnergy>(d_virialEnergy, ATOMIC_BINS, stream);
1425  }
1426 
1427  if (doVirial)
1428  {
1429  int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
1430  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1431  reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>> (
1432  doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
1433  cudaCheck(cudaGetLastError());
1434  }
1435 
1436  if (doVirial || doEnergy)
1437  {
1438  int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
1439  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
1440  reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>> (
1441  doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1442  cudaCheck(cudaGetLastError());
1443  }
1444 
1445  if (doGBIS && doEnergy)
1446  {
1447  int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
1448  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
1449  reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>> (
1450  tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1451  cudaCheck(cudaGetLastError());
1452  }
1453  if (ATOMIC_BINS > 1)
1454  {
1455  // Reduce d_virialEnergy[ATOMIC_BINS] in-place (results are in d_virialEnergy[0])
1456  reduceNonbondedBinsKernel<<<1, ATOMIC_BINS, 0, stream>>>(doVirial, doEnergy, doSlow, doGBIS, d_virialEnergy);
1457  }
1458 
1459 }
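
All three reductions accumulate into d_virialEnergy with contributions spread over ATOMIC_BINS slots to reduce atomic contention; the final reduceNonbondedBinsKernel folds the bins so the result lands in d_virialEnergy[0]. A hedged sequencing sketch for reading the result back (NAMD itself uses a stream callback rather than a blocking synchronize; the field name in the last line is assumed from the VirialEnergy struct):

    // Illustrative ordering on a single CUDA stream; error paths omitted.
    kernel.reduceVirialEnergy(tlKernel, atomStorageSize, doEnergy, doVirial,
                              doSlow, doGBIS, d_forces, d_forcesSlow,
                              d_virialEnergy, stream);
    kernel.getVirialEnergy(h_virialEnergy, stream);   // async device-to-host copy
    cudaCheck(cudaStreamSynchronize(stream));         // h_virialEnergy valid now
    double evdw = h_virialEnergy->energyVdw;          // assumed field name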
void CudaComputeNonbondedKernel::updateVdwTypesExcl ( const int  atomStorageSize,
const int *  h_vdwTypes,
const int2 *  h_exclIndexMaxDiff,
const int *  h_atomIndex,
cudaStream_t  stream 
)

Definition at line 1214 of file CudaComputeNonbondedKernel.cu.

References OVERALLOC, and stream.

1215  {
1216 
1217  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
1218  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
1219  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
1220 
1221  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
1222  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
1223  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
1224 }
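
The reallocate-then-copy pattern used here recurs throughout the class: device buffers grow geometrically (by the OVERALLOC factor) and are reallocated only when the requested size exceeds the current capacity, so steady-state timesteps reduce to three asynchronous host-to-device copies. A sketch of the assumed grow-only semantics of reallocate_device (the real helper lives in CudaUtils.h and may differ in detail):

    // Sketch of assumed semantics, not the actual CudaUtils.h implementation.
    template <typename T>
    bool reallocate_device_sketch(T** pp, int* capacity, int n, float fac = 1.0f) {
      if (*pp != NULL && *capacity >= n) return false;   // still fits, no-op
      if (*pp != NULL) cudaFree(*pp);
      *capacity = (int)(fac * (float)n);                 // over-allocate
      cudaMalloc((void**)pp, sizeof(T) * (size_t)(*capacity));
      return true;                                       // caller may re-clear
    }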

The documentation for this class was generated from the following files:

CudaComputeNonbondedKernel.h
CudaComputeNonbondedKernel.cu