namd/doxygen/CudaTileListKernel_8h_source.html

 #ifndef CUDATILELISTKERNEL_H
 #define CUDATILELISTKERNEL_H
 #ifdef NAMD_CUDA

 #include <cuda_runtime.h>

 // Exclusion mask for one tile: a bit value of 1 means the atom pair is
 // included, a bit value of 0 means the atom pair is excluded.
 struct TileExcl {
   unsigned excl[32];
 };

 // Descriptor of a single tile list.
 // NOTE(review): field meanings below that are not spelled out by an existing
 // comment are inferred from the names only - confirm against the kernels
 // that read and write these records.
 struct TileList {
   int iatomStart;       // presumably index of the first i-atom of this list - TODO confirm
   int jtileStart;       // presumably first j-tile index of this list - TODO confirm
   int jtileEnd;         // presumably last j-tile index; inclusive/exclusive not visible here - TODO confirm
   float3 offsetXYZ;     // per-list offset; units and usage are not visible in this header
   int2 patchInd;        // Patch indices for this list
   // Anonymous union with a single live member; the commented-out alternative
   // below shows that icompute used to alias patchNumList.
   union {
     int2 patchNumList;    // Number of lists contributing to each patch
     // int icompute;
   };
   // Declared outside the union, so it has its own storage and does not
   // overlap patchNumList.
   int icompute;
 };

 // Four integer sizes describing the i-side and j-side of a patch pair.
 // NOTE(review): "size" / "free size" are presumably atom counts - confirm
 // against the code that fills these records.
 struct PatchPairRecord {
   int iatomSize, iatomFreeSize;   // i-side sizes
   int jatomSize, jatomFreeSize;   // j-side sizes
 };

 //
 // Bounding box structure: a center point followed by one half-width per axis
 //
 struct BoundingBox {
   // Center
   float x;
   float y;
   float z;
   // Half-width
   float wx;
   float wy;
   float wz;
 };

 //
 // Stripped-down CUDA version of compute record
 //
 // Carries the two fields that TileList also declares under the same names
 // (patchInd, offsetXYZ).
 // NOTE(review): presumably these are copied from the compute record into the
 // tile lists built for it - confirm in the tile list build kernels.
 //
 struct CudaComputeRecord {
   int2 patchInd;      // pair of patch indices (two ints); which is i and which is j is not visible here
   float3 offsetXYZ;   // per-compute offset; units and usage are not visible in this header
 };

 //
 // Stripped-down CUDA version of patch record
 //
 // NOTE: the size of this struct depends on whether NODEGROUP_FORCE_REGISTER
 // is defined (an extra int is appended), so every translation unit that
 // exchanges these records must be compiled with the same setting.
 //
 struct CudaPatchRecord {
   int numAtoms;       // presumably the number of atoms in the patch - TODO confirm
   int numFreeAtoms;   // presumably the number of non-fixed atoms - TODO confirm
   int atomStart;      // presumably index of the patch's first atom in the atom storage - TODO confirm
 #ifdef NODEGROUP_FORCE_REGISTER
   // appending an additional field to hold the patchId
   int patchID;
 #endif
 };

 //
 // Tile list status. Used to communicate tile list sizes between GPU and CPU
 //
 // Six integer counters followed by one overflow flag; the declaration order
 // below is the memory order.
 //
 struct TileListStat {
   int numTileLists, numTileListsGBIS, numJtiles, numExcluded;
   int patchReadyQueueCount, outputOrderIndex;
   bool tilesSizeExceeded;
 };

 // Per-tile-list accumulation record: nine single-precision components
 // (sh*, force*, forceSlow*) followed by double-precision energy terms.
 // Declaration order below is the memory order.
 struct TileListVirialEnergy {
   float shx;
   float shy;
   float shz;
   float forcex;
   float forcey;
   float forcez;
   float forceSlowx;
   float forceSlowy;
   float forceSlowz;
   double energyVdw, energyElec, energySlow;

   /* FEP energies */
   double energyVdw_s, energyElec_s, energySlow_s;

   /* TI energies */
   double energyVdw_ti_1, energyVdw_ti_2;
   double energyElec_ti_1, energyElec_ti_2;
   double energySlow_ti_1, energySlow_ti_2;

   double energyGBIS;
 };

 // Double-precision totals: two 9-element virial arrays followed by the same
 // set of energy terms that TileListVirialEnergy declares.
 // Declaration order below is the memory order.
 struct VirialEnergy {
   double virial[9];
   double virialSlow[9];
   double energyVdw, energyElec, energySlow;

   /* FEP energies */
   double energyVdw_s, energyElec_s, energySlow_s;

   /* TI energies */
   double energyVdw_ti_1, energyVdw_ti_2;
   double energyElec_ti_1, energyElec_ti_2;
   double energySlow_ti_1, energySlow_ti_2;

   double energyGBIS;
 };

 //
 // Holds the tile list buffers (each a raw pointer paired with a size field)
 // and the bookkeeping counters that go with them, and declares the
 // operations that build, sort and publish tile lists on a CUDA stream.
 //
 // Only trivial accessors are defined in this header; the constructor,
 // destructor and all stream-taking operations are defined elsewhere.
 //
 // Several buffers exist in two copies (suffix 1 and 2). activeBuffer selects
 // which copy the corresponding getters return: 1 selects copy 1, any other
 // value selects copy 2.
 //
 // Scalar accessors are const-qualified so they can be called through a const
 // reference. Pointer accessors are left non-const on purpose because they
 // hand out mutable buffers.
 //
 // NOTE(review): the class owns raw pointers and declares a destructor but no
 // copy constructor / copy assignment; whether copying is ever attempted, and
 // whether the destructor frees these buffers, is not visible from here.
 //
 class CudaTileListKernel {
 private:

   // Non-owning (pointer, size) pair used to pass buffers to sortTileLists()
   template <typename T>
   struct PtrSize {
     PtrSize(T* ptr, size_t size) : ptr(ptr), size(size) {}
     T* ptr;
     size_t size;
   };

   const int deviceID;

   // Events
   cudaEvent_t tileListStatEvent;
   bool tileListStatEventRecord;

   // Pair list cutoff squared
   float plcutoff2;

   // Number of patches
   int numPatches;

   // Number of computes
   int numComputes;

   // Number of tile lists
   int numTileLists;

   // Number of tile lists for GBIS
   int numTileListsGBIS;

   // Number of tiles
   int numJtiles;

   // Maximum number of tiles per tile list
   int maxTileListLen;

   CudaPatchRecord* cudaPatches;
   size_t cudaPatchesSize;

   CudaComputeRecord* cudaComputes;
   size_t cudaComputesSize;

   // --- For Streaming ---
   const bool doStreaming;
   int* patchNumLists;
   size_t patchNumListsSize;

   int* emptyPatches;
   size_t emptyPatchesSize;
   int* h_emptyPatches;
   size_t h_emptyPatchesSize;
   int numEmptyPatches;

   unsigned int* sortKeySrc;
   size_t sortKeySrcSize;
   unsigned int* sortKeyDst;
   size_t sortKeyDstSize;

   int maxTileListLen_sortKeys;

   unsigned int* sortKeys;
   size_t sortKeysSize;

   int2* minmaxListLen;
   size_t minmaxListLenSize;

   int sortKeys_endbit;
   // ---------------------

   // Single entry pinned host and device buffers for communicating tile list status
   TileListStat* h_tileListStat;
   TileListStat* d_tileListStat;

   // Atom coordinates and charge
   float4* xyzq;
   size_t xyzqSize;
   // Atom coordinate storage size
   int atomStorageSize;

   char *part;
   size_t partSize;

   // Tile lists
   TileList* tileLists1;
   size_t tileLists1Size;
   TileList* tileLists2;
   size_t tileLists2Size;
   TileList* tileListsGBIS;
   size_t tileListsGBISSize;

   // Patch pairs
   PatchPairRecord* patchPairs1;
   size_t patchPairs1Size;
   PatchPairRecord* patchPairs2;
   size_t patchPairs2Size;

   // j-atom start for tiles
   int* tileJatomStart1;
   size_t tileJatomStart1Size;
   int* tileJatomStart2;
   size_t tileJatomStart2Size;
   int* tileJatomStartGBIS;
   size_t tileJatomStartGBISSize;

   // Bounding boxes
   BoundingBox* boundingBoxes;
   size_t boundingBoxesSize;

   // Depth of each tile list
   unsigned int* tileListDepth1;
   size_t tileListDepth1Size;
   unsigned int* tileListDepth2;
   size_t tileListDepth2Size;

   // Tile list order
   int* tileListOrder1;
   size_t tileListOrder1Size;
   int* tileListOrder2;
   size_t tileListOrder2Size;

   // Position of each tile list = ExclusiveSum(tileListDepths)
   int* tileListPos;
   size_t tileListPosSize;

   // jtile occupancy and position
   int* jtiles;
   size_t jtilesSize;

   // Temporary buffers used in buildTileLists
   int* tilePos;
   size_t tilePosSize;

   // Exclusions
   TileExcl* tileExcls1;
   size_t tileExcls1Size;
   TileExcl* tileExcls2;
   size_t tileExcls2Size;

   // Temporary storage for CUB
   char* tempStorage;
   size_t tempStorageSize;

   // Number of exclusions detected
   int numExcluded;

   // Virials and energies for tile lists
   TileListVirialEnergy* tileListVirialEnergy;
   size_t tileListVirialEnergySize;

   int tileListVirialEnergyLength;
   int tileListVirialEnergyGBISLength;

   // Selects which buffer copy the double-buffered getters return (1 or other)
   int activeBuffer;

   void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}

   // Sorts tile lists from the Src buffers into the Dst buffers on the given
   // stream (defined elsewhere).
   void sortTileLists(
     const bool useJtiles,
     const int begin_bit, const bool highDepthBitsSet,
     // Source
     const int numTileListsSrc, const int numJtilesSrc,
     PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
     PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
     PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
     // Destination
     const int numTileListsDst, const int numJtilesDst,
     PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
     PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
     PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
     cudaStream_t stream);

   void writeTileList(const char* filename, const int numTileLists,
     const TileList* d_tileLists, cudaStream_t stream);
   void writeTileJatomStart(const char* filename, const int numJtiles,
     const int* d_tileJatomStart, cudaStream_t stream);
   // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
   //   const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);

   int* outputOrder;
   size_t outputOrderSize;
   bool doOutputOrder;

 public:

   CudaTileListKernel(int deviceID, bool doStreaming);
   ~CudaTileListKernel();

   int getNumEmptyPatches() const {return numEmptyPatches;}
   // Returns the h_emptyPatches buffer (the h_-prefixed copy, not emptyPatches)
   int* getEmptyPatches() {return h_emptyPatches;}

   int getNumExcluded() const {return numExcluded;}

   float get_plcutoff2() const {return plcutoff2;}
   int getNumTileLists() const {return numTileLists;}
   int getNumTileListsGBIS() const {return numTileListsGBIS;}
   int getNumJtiles() const {return numJtiles;}
   BoundingBox* getBoundingBoxes() {return boundingBoxes;}
   int* getJtiles() {return jtiles;}
   float4* get_xyzq() {return xyzq;}
   char* get_part() {return part;}

   TileListStat* getTileListStatDevPtr() {return d_tileListStat;}
   void clearTileListStat(cudaStream_t stream);

   // Double-buffered accessors: return copy 1 when activeBuffer == 1,
   // otherwise copy 2.
   int* getTileJatomStart() {return ((activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2);}
   TileList* getTileLists() {
     return ((activeBuffer == 1) ? tileLists1 : tileLists2);
   }
   unsigned int* getTileListDepth() {return ((activeBuffer == 1) ? tileListDepth1 : tileListDepth2);}
   int* getTileListOrder() {return ((activeBuffer == 1) ? tileListOrder1 : tileListOrder2);}
   TileExcl* getTileExcls() {return ((activeBuffer == 1) ? tileExcls1 : tileExcls2);}
   PatchPairRecord* getPatchPairs() {return ((activeBuffer == 1) ? patchPairs1 : patchPairs2);}

   int* getTileJatomStartGBIS() {return tileJatomStartGBIS;}
   TileList* getTileListsGBIS() {return tileListsGBIS;}

   TileListVirialEnergy* getTileListVirialEnergy() {return tileListVirialEnergy;}

   CudaPatchRecord* getCudaPatches() {return cudaPatches;}
   // cudaPatchesSize is stored as size_t; the int return type is kept for
   // existing callers, so the narrowing conversion is made explicit here.
   int getCudaPatchesSize() const {return static_cast<int>(cudaPatchesSize);}

   void prepareTileList(cudaStream_t stream);
   void finishTileList(cudaStream_t stream);

   void updateComputes(const int numComputesIn,
     const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);

   void prepareBuffers(
     int atomStorageSizeIn, int numPatchesIn,
     const CudaPatchRecord* h_cudaPatches,
     cudaStream_t stream);

   void buildTileLists(const int numTileListsPrev,
     const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
     const float3 lata, const float3 latb, const float3 latc,
     const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
     const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged,
     const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration);

   void reSortTileLists(const bool doGBIS, cudaStream_t stream);
   // void applyOutputOrder(cudaStream_t stream);

   void setTileListVirialEnergyLength(int len);
   void setTileListVirialEnergyGBISLength(int len);
   int getTileListVirialEnergyLength() const {return tileListVirialEnergyLength;}
   int getTileListVirialEnergyGBISLength() const {return tileListVirialEnergyGBISLength;}

   int getNumPatches() const {return numPatches;}

   int getNumComputes() const {return numComputes;}

   // Returns the output order buffer only when both streaming and output
   // ordering are enabled; returns NULL in every other case.
   int* getOutputOrder() {
     return (doStreaming && doOutputOrder) ? outputOrder : NULL;
   }

 };
 #endif // NAMD_CUDA
 #endif // CUDATILELISTKERNEL_H
TileListVirialEnergy::forcez
float forcez
Definition: CudaTileListKernel.h:76

CudaTileListKernel::getEmptyPatches
int * getEmptyPatches()
Definition: CudaTileListKernel.h:310

TileList::patchNumList
int2 patchNumList
Definition: CudaTileListKernel.h:19

TileListStat::tilesSizeExceeded
bool tilesSizeExceeded
Definition: CudaTileListKernel.h:71

VirialEnergy::virial
double virial[9]
Definition: CudaTileListKernel.h:99

CudaPatchRecord::atomStart
int atomStart
Definition: CudaTileListKernel.h:54

CudaTileListKernel::CudaTileListKernel
CudaTileListKernel(int deviceID, bool doStreaming)

CudaTileListKernel::prepareTileList
void prepareTileList(cudaStream_t stream)

CudaTileListKernel::setTileListVirialEnergyLength
void setTileListVirialEnergyLength(int len)

TileList::jtileStart
int jtileStart
Definition: CudaTileListKernel.h:14

PatchPairRecord::jatomFreeSize
int jatomFreeSize
Definition: CudaTileListKernel.h:29

BoundingBox::y
float y
Definition: CudaTileListKernel.h:36

TileListStat::numTileLists
int numTileLists
Definition: CudaTileListKernel.h:65

CudaTileListKernel::getPatchPairs
PatchPairRecord * getPatchPairs()
Definition: CudaTileListKernel.h:333

VirialEnergy::energySlow_s
double energySlow_s
Definition: CudaTileListKernel.h:108

CudaTileListKernel::getNumPatches
int getNumPatches()
Definition: CudaTileListKernel.h:369

CudaTileListKernel::getNumTileLists
int getNumTileLists()
Definition: CudaTileListKernel.h:315

CudaTileListKernel::getOutputOrder
int * getOutputOrder()
Definition: CudaTileListKernel.h:372

TileListStat::outputOrderIndex
int outputOrderIndex
Definition: CudaTileListKernel.h:70

CudaTileListKernel::clearTileListStat
void clearTileListStat(cudaStream_t stream)

CudaTileListKernel::getTileExcls
TileExcl * getTileExcls()
Definition: CudaTileListKernel.h:332

TileListVirialEnergy::forceSlowx
float forceSlowx
Definition: CudaTileListKernel.h:77

VirialEnergy::energyVdw_ti_2
double energyVdw_ti_2
Definition: CudaTileListKernel.h:112

CudaComputeRecord::offsetXYZ
float3 offsetXYZ
Definition: CudaTileListKernel.h:45

CudaTileListKernel::getNumExcluded
int getNumExcluded()
Definition: CudaTileListKernel.h:312

VirialEnergy::energyVdw_ti_1
double energyVdw_ti_1
Definition: CudaTileListKernel.h:111

CudaTileListKernel::setTileListVirialEnergyGBISLength
void setTileListVirialEnergyGBISLength(int len)

CudaTileListKernel
Definition: CudaTileListKernel.h:121

CudaTileListKernel::prepareBuffers
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)

CudaTileListKernel::getTileListVirialEnergyGBISLength
int getTileListVirialEnergyGBISLength()
Definition: CudaTileListKernel.h:367

TileList::jtileEnd
int jtileEnd
Definition: CudaTileListKernel.h:15

CudaTileListKernel::getCudaPatches
CudaPatchRecord * getCudaPatches()
Definition: CudaTileListKernel.h:340

CudaTileListKernel::getCudaPatchesSize
int getCudaPatchesSize()
Definition: CudaTileListKernel.h:341

CudaTileListKernel::~CudaTileListKernel
~CudaTileListKernel()

TileListVirialEnergy::forceSlowy
float forceSlowy
Definition: CudaTileListKernel.h:77

CudaPatchRecord
Definition: CudaTileListKernel.h:51

CudaPatchRecord::numAtoms
int numAtoms
Definition: CudaTileListKernel.h:52

CudaTileListKernel::getTileListDepth
unsigned int * getTileListDepth()
Definition: CudaTileListKernel.h:330

VirialEnergy::energyVdw_s
double energyVdw_s
Definition: CudaTileListKernel.h:106

TileListVirialEnergy::energyVdw_ti_1
double energyVdw_ti_1
Definition: CudaTileListKernel.h:88

CudaComputeRecord
Definition: CudaTileListKernel.h:43

CudaTileListKernel::getBoundingBoxes
BoundingBox * getBoundingBoxes()
Definition: CudaTileListKernel.h:318

CudaTileListKernel::get_part
char * get_part()
Definition: CudaTileListKernel.h:321

CudaTileListKernel::getNumComputes
int getNumComputes()
Definition: CudaTileListKernel.h:371

TileListVirialEnergy::forcey
float forcey
Definition: CudaTileListKernel.h:76

TileListVirialEnergy::energyGBIS
double energyGBIS
Definition: CudaTileListKernel.h:95

VirialEnergy::energyElec
double energyElec
Definition: CudaTileListKernel.h:102

TileListVirialEnergy::energyVdw_s
double energyVdw_s
Definition: CudaTileListKernel.h:83

TileList
Definition: CudaTileListKernel.h:12

TileListVirialEnergy
Definition: CudaTileListKernel.h:74

VirialEnergy
Definition: CudaTileListKernel.h:98

TileExcl::excl
unsigned int excl[32]
Definition: CudaTileListKernel.h:9

CudaTileListKernel::updateComputes
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)

TileListVirialEnergy::energyVdw
double energyVdw
Definition: CudaTileListKernel.h:78

TileList::iatomStart
int iatomStart
Definition: CudaTileListKernel.h:13

CudaTileListKernel::getTileListsGBIS
TileList * getTileListsGBIS()
Definition: CudaTileListKernel.h:336

BoundingBox::wy
float wy
Definition: CudaTileListKernel.h:37

TileListVirialEnergy::energySlow_ti_1
double energySlow_ti_1
Definition: CudaTileListKernel.h:92

TileListStat::numExcluded
int numExcluded
Definition: CudaTileListKernel.h:68

CudaTileListKernel::getTileListStatDevPtr
TileListStat * getTileListStatDevPtr()
Definition: CudaTileListKernel.h:323

VirialEnergy::energyElec_ti_2
double energyElec_ti_2
Definition: CudaTileListKernel.h:114

TileListVirialEnergy::energyElec_s
double energyElec_s
Definition: CudaTileListKernel.h:84

CudaTileListKernel::getTileLists
TileList * getTileLists()
Definition: CudaTileListKernel.h:327

CudaTileListKernel::getNumEmptyPatches
int getNumEmptyPatches()
Definition: CudaTileListKernel.h:309

VirialEnergy::energySlow
double energySlow
Definition: CudaTileListKernel.h:103

PatchPairRecord::iatomSize
int iatomSize
Definition: CudaTileListKernel.h:26

VirialEnergy::energyElec_s
double energyElec_s
Definition: CudaTileListKernel.h:107

CudaPatchRecord::numFreeAtoms
int numFreeAtoms
Definition: CudaTileListKernel.h:53

CudaTileListKernel::getNumTileListsGBIS
int getNumTileListsGBIS()
Definition: CudaTileListKernel.h:316

CudaTileListKernel::finishTileList
void finishTileList(cudaStream_t stream)

CudaTileListKernel::getTileListVirialEnergyLength
int getTileListVirialEnergyLength()
Definition: CudaTileListKernel.h:366

TileListVirialEnergy::energyElec
double energyElec
Definition: CudaTileListKernel.h:79

TileList::icompute
int icompute
Definition: CudaTileListKernel.h:22

CudaTileListKernel::get_xyzq
float4 * get_xyzq()
Definition: CudaTileListKernel.h:320

CudaTileListKernel::getTileJatomStart
int * getTileJatomStart()
Definition: CudaTileListKernel.h:326

PatchPairRecord::iatomFreeSize
int iatomFreeSize
Definition: CudaTileListKernel.h:27

TileListVirialEnergy::shz
float shz
Definition: CudaTileListKernel.h:75

BoundingBox::z
float z
Definition: CudaTileListKernel.h:36

TileListStat
Definition: CudaTileListKernel.h:64

CudaTileListKernel::getTileJatomStartGBIS
int * getTileJatomStartGBIS()
Definition: CudaTileListKernel.h:335

TileList::offsetXYZ
float3 offsetXYZ
Definition: CudaTileListKernel.h:16

TileListVirialEnergy::shy
float shy
Definition: CudaTileListKernel.h:75

CudaTileListKernel::get_plcutoff2
float get_plcutoff2()
Definition: CudaTileListKernel.h:314

TileListVirialEnergy::energyElec_ti_1
double energyElec_ti_1
Definition: CudaTileListKernel.h:90

VirialEnergy::virialSlow
double virialSlow[9]
Definition: CudaTileListKernel.h:100

CudaTileListKernel::getJtiles
int * getJtiles()
Definition: CudaTileListKernel.h:319

TileListStat::patchReadyQueueCount
int patchReadyQueueCount
Definition: CudaTileListKernel.h:69

PatchPairRecord
Definition: CudaTileListKernel.h:25

VirialEnergy::energySlow_ti_2
double energySlow_ti_2
Definition: CudaTileListKernel.h:116

TileListStat::numJtiles
int numJtiles
Definition: CudaTileListKernel.h:67

TileListVirialEnergy::forceSlowz
float forceSlowz
Definition: CudaTileListKernel.h:77

TileListVirialEnergy::energyVdw_ti_2
double energyVdw_ti_2
Definition: CudaTileListKernel.h:89

BoundingBox
Definition: CudaTileListKernel.h:35

TileExcl
Definition: CudaTileListKernel.h:8

CudaTileListKernel::getTileListVirialEnergy
TileListVirialEnergy * getTileListVirialEnergy()
Definition: CudaTileListKernel.h:338

CudaTileListKernel::getTileListOrder
int * getTileListOrder()
Definition: CudaTileListKernel.h:331

VirialEnergy::energyElec_ti_1
double energyElec_ti_1
Definition: CudaTileListKernel.h:113

TileListVirialEnergy::forcex
float forcex
Definition: CudaTileListKernel.h:76

TileListStat::numTileListsGBIS
int numTileListsGBIS
Definition: CudaTileListKernel.h:66

BoundingBox::x
float x
Definition: CudaTileListKernel.h:36

CudaTileListKernel::buildTileLists
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged, const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration)

TileListVirialEnergy::energySlow
double energySlow
Definition: CudaTileListKernel.h:80

VirialEnergy::energyGBIS
double energyGBIS
Definition: CudaTileListKernel.h:118

TileListVirialEnergy::shx
float shx
Definition: CudaTileListKernel.h:75

BoundingBox::wz
float wz
Definition: CudaTileListKernel.h:37

CudaTileListKernel::getNumJtiles
int getNumJtiles()
Definition: CudaTileListKernel.h:317

TileListVirialEnergy::energyElec_ti_2
double energyElec_ti_2
Definition: CudaTileListKernel.h:91

VirialEnergy::energySlow_ti_1
double energySlow_ti_1
Definition: CudaTileListKernel.h:115

CudaComputeRecord::patchInd
int2 patchInd
Definition: CudaTileListKernel.h:44

CudaTileListKernel::reSortTileLists
void reSortTileLists(const bool doGBIS, cudaStream_t stream)

BoundingBox::wx
float wx
Definition: CudaTileListKernel.h:37

TileListVirialEnergy::energySlow_ti_2
double energySlow_ti_2
Definition: CudaTileListKernel.h:93

TileListVirialEnergy::energySlow_s
double energySlow_s
Definition: CudaTileListKernel.h:85

TileList::patchInd
int2 patchInd
Definition: CudaTileListKernel.h:17

VirialEnergy::energyVdw
double energyVdw
Definition: CudaTileListKernel.h:101

PatchPairRecord::jatomSize
int jatomSize
Definition: CudaTileListKernel.h:28