namd/doxygen/CudaTileListKernel_8hip_8h_source.html

 #ifndef CUDATILELISTKERNEL_HIP_H
 #define CUDATILELISTKERNEL_HIP_H
 #if defined(NAMD_HIP)

 // Exclusion mask: a set bit (1) means the atom pair is included,
 // a cleared bit (0) means the atom pair is excluded.
 struct TileExcl {
   // WARPSIZE mask words of type WarpMask.
   // NOTE(review): presumably one mask word per row of a tile - confirm against the kernels.
   WarpMask excl[WARPSIZE];
 };

 // Record describing a single tile list.
 // NOTE(review): field order presumably matters to the code that fills and
 // reads these records - do not reorder without checking the implementation.
 struct TileList {
   int iatomStart;
   // NOTE(review): jtileStart/jtileEnd look like a range of j-tile indices;
   // whether jtileEnd is inclusive is not visible in this header - confirm.
   int jtileStart;
   int jtileEnd;
   float3 offsetXYZ;
   int2 patchInd;        // Patch indices for this list
   // Anonymous union that currently has a single active member; the
   // commented-out alternative below is retained from the original source.
   union {
     int2 patchNumList;    // Number of lists contributing to each patch
     // int icompute;
   };
   int icompute;
 };

 // Atom counts for the two sides (i and j) of a patch pair.
 // NOTE(review): the "Free" counts presumably exclude fixed atoms - confirm
 // against the code that fills this record.
 struct PatchPairRecord {
   int iatomSize, iatomFreeSize;   // i side
   int jatomSize, jatomFreeSize;   // j side
 };

 //
 // Bounding box stored as a center point plus a half-width per axis
 //
 struct BoundingBox {
   // Center
   float x;
   float y;
   float z;
   // Half-width
   float wx;
   float wy;
   float wz;
 };

 //
 // Stripped-down CUDA version of compute record
 //
 // Carries the same two fields (patchInd, offsetXYZ) that also appear in
 // TileList.
 //
 struct CudaComputeRecord {
   // NOTE(review): presumably the indices of the two patches this compute
   // acts on - confirm against the code that fills the record.
   int2 patchInd;
   // NOTE(review): presumably a coordinate offset applied between the two
   // patches - confirm units and sign convention in the implementation.
   float3 offsetXYZ;
 };

 //
 // Stripped-down CUDA version of patch record
 //
 struct CudaPatchRecord {
   int numAtoms, numFreeAtoms, atomStart;
 #ifdef NODEGROUP_FORCE_REGISTER
   // extra trailing field that holds the patchId; only present in
   // builds that define NODEGROUP_FORCE_REGISTER
   int patchID;
 #endif
 };

 //
 // Tile list status record. Used to pass tile list sizes between the GPU
 // and the CPU.
 //
 struct TileListStat {
   int numTileLists, numTileListsGBIS;
   int numJtiles, numExcluded;
   int patchReadyQueueCount, outputOrderIndex;
   // NOTE(review): presumably raised when a tile buffer turned out to be too
   // small - confirm against the kernels that write it.
   bool tilesSizeExceeded;
 };

 // Per-tile-list virial and energy record: nine single-precision
 // components followed by double-precision energy terms.
 struct TileListVirialEnergy {
   float shx;
   float shy;
   float shz;
   float forcex;
   float forcey;
   float forcez;
   float forceSlowx;
   float forceSlowy;
   float forceSlowz;

   double energyVdw, energyElec, energySlow;

   /* FEP energies */
   double energyVdw_s, energyElec_s, energySlow_s;

   /* TI energies */
   double energyVdw_ti_1, energyVdw_ti_2;
   double energyElec_ti_1, energyElec_ti_2;
   double energySlow_ti_1, energySlow_ti_2;

   double energyGBIS;
 };

 // Virial and energy record held entirely in double precision: two
 // nine-element virial arrays followed by the energy terms.
 struct VirialEnergy {
   double virial[9];
   double virialSlow[9];

   double energyVdw, energyElec, energySlow;

   /* FEP energies */
   double energyVdw_s, energyElec_s, energySlow_s;

   /* TI energies */
   double energyVdw_ti_1, energyVdw_ti_2;
   double energyElec_ti_1, energyElec_ti_2;
   double energySlow_ti_1, energySlow_ti_2;

   double energyGBIS;
 };

 //
 // Bookkeeping for tile lists: buffer pointers, their recorded sizes, and
 // counters, together with the operations that build and sort the lists.
 //
 // Several buffers exist in two copies (suffix 1 and suffix 2). The public
 // accessors return copy 1 when activeBuffer == 1 and copy 2 otherwise.
 //
 // Every buffer pointer is paired with a size_t "...Size" member.
 // NOTE(review): the Size members presumably record the allocated capacity
 // of the buffer - confirm in the implementation file.
 //
 // The order of data members is kept exactly as originally declared, since
 // constructor initialization order follows declaration order.
 //
 class CudaTileListKernel {
 private:

   // Pointer together with an element count, used to pass buffers to
   // sortTileLists().
   template <typename T>
   struct PtrSize {
     // The count is taken as size_t so that the size_t buffer sizes kept by
     // this class are not narrowed through int on the way in.
     PtrSize(T* ptr, size_t size) : ptr(ptr), size(size) {}
     T* ptr;
     size_t size;
   };

   const int deviceID;

   // Events
   cudaEvent_t tileListStatEvent;
   bool tileListStatEventRecord;

   // Pair list cutoff squared
   float plcutoff2;

   // Number of patches
   int numPatches;

   // Number of computes
   int numComputes;

   // Number of tile lists
   int numTileLists;

   // Number of tile lists for GBIS
   int numTileListsGBIS;

   // Number of tiles
   int numJtiles;

   // Maximum number of tiles per tile list
   int maxTileListLen;

   CudaPatchRecord* cudaPatches;
   size_t cudaPatchesSize;

   CudaComputeRecord* cudaComputes;
   size_t cudaComputesSize;

   // --- For Streaming ---
   const bool doStreaming;
   int* patchNumLists;
   size_t patchNumListsSize;

   int* emptyPatches;
   size_t emptyPatchesSize;
   int* h_emptyPatches;
   size_t h_emptyPatchesSize;
   int numEmptyPatches;

   unsigned int* sortKeySrc;
   size_t sortKeySrcSize;
   unsigned int* sortKeyDst;
   size_t sortKeyDstSize;

   int maxTileListLen_sortKeys;

   unsigned int* sortKeys;
   size_t sortKeysSize;

   int2* minmaxListLen;
   size_t minmaxListLenSize;

   int sortKeys_endbit;
   // ---------------------

   // Single entry pinned host and device buffers for communicating tile list status
   TileListStat* h_tileListStat;
   TileListStat* d_tileListStat;

   // Atom coordinates and charge
   float4* xyzq;
   size_t xyzqSize;
   // Atom coordinate storage size
   size_t atomStorageSize;

   char* part;
   size_t partSize;

   // Tile lists
   TileList* tileLists1;
   size_t tileLists1Size;
   TileList* tileLists2;
   size_t tileLists2Size;
   TileList* tileListsGBIS;
   size_t tileListsGBISSize;

   // Patch pairs
   PatchPairRecord* patchPairs1;
   size_t patchPairs1Size;
   PatchPairRecord* patchPairs2;
   size_t patchPairs2Size;

   // j-atom start for tiles
   int* tileJatomStart1;
   size_t tileJatomStart1Size;
   int* tileJatomStart2;
   size_t tileJatomStart2Size;
   int* tileJatomStartGBIS;
   size_t tileJatomStartGBISSize;

   // Bounding boxes
   BoundingBox* boundingBoxes;
   size_t boundingBoxesSize;

   // Depth of each tile list
   unsigned int* tileListDepth1;
   size_t tileListDepth1Size;
   unsigned int* tileListDepth2;
   size_t tileListDepth2Size;

   // Tile list order
   int* tileListOrder1;
   size_t tileListOrder1Size;
   int* tileListOrder2;
   size_t tileListOrder2Size;

   // Position of each tile list = ExclusiveSum(tileListDepths)
   int* tileListPos;
   size_t tileListPosSize;

   // jtile occupancy and position
   int* jtiles;
   size_t jtilesSize;

   // Temporary buffers used in buildTileLists
   int* tilePos;
   size_t tilePosSize;

   // Exclusions
   TileExcl* tileExcls1;
   size_t tileExcls1Size;
   TileExcl* tileExcls2;
   size_t tileExcls2Size;

   // Temporary storage for CUB
   char* tempStorage;
   size_t tempStorageSize;

   // Number of exclusions detected
   int numExcluded;

   // Virials and energies for tile lists
   TileListVirialEnergy* tileListVirialEnergy;
   size_t tileListVirialEnergySize;

   int tileListVirialEnergyLength;
   int tileListVirialEnergyGBISLength;

   // Selects which of the paired buffers the accessors return
   // (1 -> the "...1" buffers, anything else -> the "...2" buffers).
   int activeBuffer;

   void setActiveBuffer(int activeBufferIn) {activeBuffer = activeBufferIn;}

   // Sorts tile lists from the Src buffers into the Dst buffers on the
   // given stream. Defined in the implementation file.
   void sortTileLists(
     const bool useJtiles,
     const int begin_bit, const bool highDepthBitsSet,
     // Source
     const int numTileListsSrc, const int numJtilesSrc,
     PtrSize<TileList> tileListsSrc, PtrSize<int> tileJatomStartSrc,
     PtrSize<unsigned int> tileListDepthSrc, PtrSize<int> tileListOrderSrc,
     PtrSize<PatchPairRecord> patchPairsSrc, PtrSize<TileExcl> tileExclsSrc,
     // Destination
     const int numTileListsDst, const int numJtilesDst,
     PtrSize<TileList> tileListsDst, PtrSize<int> tileJatomStartDst,
     PtrSize<unsigned int> tileListDepthDst, PtrSize<int> tileListOrderDst,
     PtrSize<PatchPairRecord> patchPairsDst, PtrSize<TileExcl> tileExclsDst,
     cudaStream_t stream);

   // Writers for tile list data, addressed either by file name or by an
   // already opened FILE handle.
   void writeTileList(const char* filename, const int numTileLists,
     const TileList* d_tileLists, cudaStream_t stream);
   void writeTileList(FILE* handle, const int numTileLists,
     const TileList* d_tileLists, cudaStream_t stream);
   void writeTileJatomStart(const char* filename, const int numJtiles,
     const int* d_tileJatomStart, cudaStream_t stream);
   void writeTileJatomStart(FILE* handle, const int numJtiles,
     const int* d_tileJatomStart, cudaStream_t stream);
   void writeTileExcls(FILE* handle, const int numJtiles,
     const TileExcl* d_tileExcl, cudaStream_t stream);
   // void markJtileOverlap(const int width, const int numTileLists, TileList* d_tileLists,
   //   const int numJtiles, int* d_tileJatomStart, cudaStream_t stream);

   int* outputOrder;
   size_t outputOrderSize;
   bool doOutputOrder;

 public:

   CudaTileListKernel(int deviceID, bool doStreaming);
   ~CudaTileListKernel();

   // Trivial accessors. They only read members, hence the const qualifier;
   // returned pointers are the stored pointers themselves (not copies of
   // the data).
   int getNumEmptyPatches() const {return numEmptyPatches;}
   int* getEmptyPatches() const {return h_emptyPatches;}

   int getNumExcluded() const {return numExcluded;}

   float get_plcutoff2() const {return plcutoff2;}
   int getNumTileLists() const {return numTileLists;}
   int getNumTileListsGBIS() const {return numTileListsGBIS;}
   int getNumJtiles() const {return numJtiles;}
   BoundingBox* getBoundingBoxes() const {return boundingBoxes;}
   int* getJtiles() const {return jtiles;}
   float4* get_xyzq() const {return xyzq;}
   char* get_part() const {return part;}

   TileListStat* getTileListStatDevPtr() const {return d_tileListStat;}
   void clearTileListStat(cudaStream_t stream);

   // Accessors for the paired buffers: buffer 1 when activeBuffer == 1,
   // buffer 2 otherwise.
   int* getTileJatomStart() const {
     return (activeBuffer == 1) ? tileJatomStart1 : tileJatomStart2;
   }
   TileList* getTileLists() const {
     return (activeBuffer == 1) ? tileLists1 : tileLists2;
   }
   unsigned int* getTileListDepth() const {
     return (activeBuffer == 1) ? tileListDepth1 : tileListDepth2;
   }
   int* getTileListOrder() const {
     return (activeBuffer == 1) ? tileListOrder1 : tileListOrder2;
   }
   TileExcl* getTileExcls() const {
     return (activeBuffer == 1) ? tileExcls1 : tileExcls2;
   }
   PatchPairRecord* getPatchPairs() const {
     return (activeBuffer == 1) ? patchPairs1 : patchPairs2;
   }

   int* getTileJatomStartGBIS() const {return tileJatomStartGBIS;}
   TileList* getTileListsGBIS() const {return tileListsGBIS;}

   TileListVirialEnergy* getTileListVirialEnergy() const {return tileListVirialEnergy;}

   CudaPatchRecord* getCudaPatches() const {return cudaPatches;}
   // Returns the stored size_t value converted to int; the return type is
   // kept as int for existing callers, so the conversion is spelled out.
   int getCudaPatchesSize() const {return static_cast<int>(cudaPatchesSize);}

   void prepareTileList(cudaStream_t stream);
   void finishTileList(cudaStream_t stream);

   void updateComputes(const int numComputesIn,
     const CudaComputeRecord* h_cudaComputes, cudaStream_t stream);

   void prepareBuffers(
     int atomStorageSizeIn, int numPatchesIn,
     const CudaPatchRecord* h_cudaPatches,
     cudaStream_t stream);

   void buildTileLists(const int numTileListsPrev,
     const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn,
     const float3 lata, const float3 latb, const float3 latc,
     const CudaPatchRecord* h_cudaPatches, const float4* h_xyzq, const float plcutoff2In,
     const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged,
     const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration);

   void reSortTileLists(const bool doGBIS, const bool CUDASOAIntegratorOn, cudaStream_t stream);
   // void applyOutputOrder(cudaStream_t stream);

   void setTileListVirialEnergyLength(int len);
   void setTileListVirialEnergyGBISLength(int len);
   int getTileListVirialEnergyLength() const {return tileListVirialEnergyLength;}
   int getTileListVirialEnergyGBISLength() const {return tileListVirialEnergyGBISLength;}

   int getNumPatches() const {return numPatches;}

   int getNumComputes() const {return numComputes;}

   // Returns outputOrder only when both doStreaming and doOutputOrder are
   // set; NULL in every other case.
   int* getOutputOrder() const {
     return (doStreaming && doOutputOrder) ? outputOrder : NULL;
   }

 };
 #endif // NAMD_HIP
 #endif // CUDATILELISTKERNEL_HIP_H
TileListVirialEnergy::forcez
float forcez
Definition: CudaTileListKernel.h:74

CudaTileListKernel::getEmptyPatches
int * getEmptyPatches()
Definition: CudaTileListKernel.h:308

TileList::patchNumList
int2 patchNumList
Definition: CudaTileListKernel.h:17

TileListStat::tilesSizeExceeded
bool tilesSizeExceeded
Definition: CudaTileListKernel.h:69

VirialEnergy::virial
double virial[9]
Definition: CudaTileListKernel.h:97

CudaPatchRecord::atomStart
int atomStart
Definition: CudaTileListKernel.h:52

CudaTileListKernel::CudaTileListKernel
CudaTileListKernel(int deviceID, bool doStreaming)

CudaTileListKernel::prepareTileList
void prepareTileList(cudaStream_t stream)

CudaTileListKernel::setTileListVirialEnergyLength
void setTileListVirialEnergyLength(int len)

TileList::jtileStart
int jtileStart
Definition: CudaTileListKernel.h:12

PatchPairRecord::jatomFreeSize
int jatomFreeSize
Definition: CudaTileListKernel.h:27

BoundingBox::y
float y
Definition: CudaTileListKernel.h:34

TileListStat::numTileLists
int numTileLists
Definition: CudaTileListKernel.h:63

CudaTileListKernel::getPatchPairs
PatchPairRecord * getPatchPairs()
Definition: CudaTileListKernel.h:331

VirialEnergy::energySlow_s
double energySlow_s
Definition: CudaTileListKernel.h:106

CudaTileListKernel::getNumPatches
int getNumPatches()
Definition: CudaTileListKernel.h:367

CudaTileListKernel::getNumTileLists
int getNumTileLists()
Definition: CudaTileListKernel.h:313

CudaTileListKernel::getOutputOrder
int * getOutputOrder()
Definition: CudaTileListKernel.h:370

TileListStat::outputOrderIndex
int outputOrderIndex
Definition: CudaTileListKernel.h:68

CudaTileListKernel::clearTileListStat
void clearTileListStat(cudaStream_t stream)

CudaTileListKernel::getTileExcls
TileExcl * getTileExcls()
Definition: CudaTileListKernel.h:330

TileListVirialEnergy::forceSlowx
float forceSlowx
Definition: CudaTileListKernel.h:75

VirialEnergy::energyVdw_ti_2
double energyVdw_ti_2
Definition: CudaTileListKernel.h:110

CudaComputeRecord::offsetXYZ
float3 offsetXYZ
Definition: CudaTileListKernel.h:43

CudaTileListKernel::getNumExcluded
int getNumExcluded()
Definition: CudaTileListKernel.h:310

VirialEnergy::energyVdw_ti_1
double energyVdw_ti_1
Definition: CudaTileListKernel.h:109

CudaTileListKernel::setTileListVirialEnergyGBISLength
void setTileListVirialEnergyGBISLength(int len)

CudaTileListKernel
Definition: CudaTileListKernel.h:119

CudaTileListKernel::prepareBuffers
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)

CudaTileListKernel::getTileListVirialEnergyGBISLength
int getTileListVirialEnergyGBISLength()
Definition: CudaTileListKernel.h:365

TileList::jtileEnd
int jtileEnd
Definition: CudaTileListKernel.h:13

CudaTileListKernel::getCudaPatches
CudaPatchRecord * getCudaPatches()
Definition: CudaTileListKernel.h:338

CudaTileListKernel::getCudaPatchesSize
int getCudaPatchesSize()
Definition: CudaTileListKernel.h:339

CudaTileListKernel::~CudaTileListKernel
~CudaTileListKernel()

TileListVirialEnergy::forceSlowy
float forceSlowy
Definition: CudaTileListKernel.h:75

CudaPatchRecord
Definition: CudaTileListKernel.h:49

CudaPatchRecord::numAtoms
int numAtoms
Definition: CudaTileListKernel.h:50

CudaTileListKernel::getTileListDepth
unsigned int * getTileListDepth()
Definition: CudaTileListKernel.h:328

WARPSIZE
#define WARPSIZE
Definition: CudaUtils.h:17

VirialEnergy::energyVdw_s
double energyVdw_s
Definition: CudaTileListKernel.h:104

TileListVirialEnergy::energyVdw_ti_1
double energyVdw_ti_1
Definition: CudaTileListKernel.h:86

CudaComputeRecord
Definition: CudaTileListKernel.h:41

CudaTileListKernel::getBoundingBoxes
BoundingBox * getBoundingBoxes()
Definition: CudaTileListKernel.h:316

CudaTileListKernel::get_part
char * get_part()
Definition: CudaTileListKernel.h:319

CudaTileListKernel::getNumComputes
int getNumComputes()
Definition: CudaTileListKernel.h:369

TileListVirialEnergy::forcey
float forcey
Definition: CudaTileListKernel.h:74

TileListVirialEnergy::energyGBIS
double energyGBIS
Definition: CudaTileListKernel.h:93

VirialEnergy::energyElec
double energyElec
Definition: CudaTileListKernel.h:100

TileListVirialEnergy::energyVdw_s
double energyVdw_s
Definition: CudaTileListKernel.h:81

TileList
Definition: CudaTileListKernel.h:10

TileListVirialEnergy
Definition: CudaTileListKernel.h:72

VirialEnergy
Definition: CudaTileListKernel.h:96

TileExcl::excl
unsigned int excl[32]
Definition: CudaTileListKernel.h:7

CudaTileListKernel::updateComputes
void updateComputes(const int numComputesIn, const CudaComputeRecord *h_cudaComputes, cudaStream_t stream)

TileListVirialEnergy::energyVdw
double energyVdw
Definition: CudaTileListKernel.h:76

TileList::iatomStart
int iatomStart
Definition: CudaTileListKernel.h:11

CudaTileListKernel::getTileListsGBIS
TileList * getTileListsGBIS()
Definition: CudaTileListKernel.h:334

WarpMask
unsigned int WarpMask
Definition: CudaUtils.h:19

BoundingBox::wy
float wy
Definition: CudaTileListKernel.h:35

TileListVirialEnergy::energySlow_ti_1
double energySlow_ti_1
Definition: CudaTileListKernel.h:90

TileListStat::numExcluded
int numExcluded
Definition: CudaTileListKernel.h:66

CudaTileListKernel::getTileListStatDevPtr
TileListStat * getTileListStatDevPtr()
Definition: CudaTileListKernel.h:321

VirialEnergy::energyElec_ti_2
double energyElec_ti_2
Definition: CudaTileListKernel.h:112

TileListVirialEnergy::energyElec_s
double energyElec_s
Definition: CudaTileListKernel.h:82

CudaTileListKernel::getTileLists
TileList * getTileLists()
Definition: CudaTileListKernel.h:325

CudaTileListKernel::getNumEmptyPatches
int getNumEmptyPatches()
Definition: CudaTileListKernel.h:307

VirialEnergy::energySlow
double energySlow
Definition: CudaTileListKernel.h:101

PatchPairRecord::iatomSize
int iatomSize
Definition: CudaTileListKernel.h:24

VirialEnergy::energyElec_s
double energyElec_s
Definition: CudaTileListKernel.h:105

CudaPatchRecord::numFreeAtoms
int numFreeAtoms
Definition: CudaTileListKernel.h:51

CudaTileListKernel::getNumTileListsGBIS
int getNumTileListsGBIS()
Definition: CudaTileListKernel.h:314

CudaTileListKernel::finishTileList
void finishTileList(cudaStream_t stream)

CudaTileListKernel::getTileListVirialEnergyLength
int getTileListVirialEnergyLength()
Definition: CudaTileListKernel.h:364

TileListVirialEnergy::energyElec
double energyElec
Definition: CudaTileListKernel.h:77

TileList::icompute
int icompute
Definition: CudaTileListKernel.h:20

CudaTileListKernel::get_xyzq
float4 * get_xyzq()
Definition: CudaTileListKernel.h:318

CudaTileListKernel::getTileJatomStart
int * getTileJatomStart()
Definition: CudaTileListKernel.h:324

PatchPairRecord::iatomFreeSize
int iatomFreeSize
Definition: CudaTileListKernel.h:25

TileListVirialEnergy::shz
float shz
Definition: CudaTileListKernel.h:73

BoundingBox::z
float z
Definition: CudaTileListKernel.h:34

TileListStat
Definition: CudaTileListKernel.h:62

CudaTileListKernel::getTileJatomStartGBIS
int * getTileJatomStartGBIS()
Definition: CudaTileListKernel.h:333

TileList::offsetXYZ
float3 offsetXYZ
Definition: CudaTileListKernel.h:14

TileListVirialEnergy::shy
float shy
Definition: CudaTileListKernel.h:73

CudaTileListKernel::get_plcutoff2
float get_plcutoff2()
Definition: CudaTileListKernel.h:312

TileListVirialEnergy::energyElec_ti_1
double energyElec_ti_1
Definition: CudaTileListKernel.h:88

VirialEnergy::virialSlow
double virialSlow[9]
Definition: CudaTileListKernel.h:98

CudaTileListKernel::getJtiles
int * getJtiles()
Definition: CudaTileListKernel.h:317

TileListStat::patchReadyQueueCount
int patchReadyQueueCount
Definition: CudaTileListKernel.h:67

PatchPairRecord
Definition: CudaTileListKernel.h:23

VirialEnergy::energySlow_ti_2
double energySlow_ti_2
Definition: CudaTileListKernel.h:114

TileListStat::numJtiles
int numJtiles
Definition: CudaTileListKernel.h:65

TileListVirialEnergy::forceSlowz
float forceSlowz
Definition: CudaTileListKernel.h:75

TileListVirialEnergy::energyVdw_ti_2
double energyVdw_ti_2
Definition: CudaTileListKernel.h:87

BoundingBox
Definition: CudaTileListKernel.h:33

TileExcl
Definition: CudaTileListKernel.h:6

CudaTileListKernel::getTileListVirialEnergy
TileListVirialEnergy * getTileListVirialEnergy()
Definition: CudaTileListKernel.h:336

CudaTileListKernel::getTileListOrder
int * getTileListOrder()
Definition: CudaTileListKernel.h:329

VirialEnergy::energyElec_ti_1
double energyElec_ti_1
Definition: CudaTileListKernel.h:111

TileListVirialEnergy::forcex
float forcex
Definition: CudaTileListKernel.h:74

TileListStat::numTileListsGBIS
int numTileListsGBIS
Definition: CudaTileListKernel.h:64

BoundingBox::x
float x
Definition: CudaTileListKernel.h:34

CudaTileListKernel::buildTileLists
void buildTileLists(const int numTileListsPrev, const int numPatchesIn, const int atomStorageSizeIn, const int maxTileListLenIn, const float3 lata, const float3 latb, const float3 latc, const CudaPatchRecord *h_cudaPatches, const float4 *h_xyzq, const float plcutoff2In, const size_t maxShmemPerBlock, cudaStream_t stream, const bool atomsChanged, const bool allocatePart, bool CUDASOAintegratorOn, bool deviceMigration)

TileListVirialEnergy::energySlow
double energySlow
Definition: CudaTileListKernel.h:78

VirialEnergy::energyGBIS
double energyGBIS
Definition: CudaTileListKernel.h:116

TileListVirialEnergy::shx
float shx
Definition: CudaTileListKernel.h:73

BoundingBox::wz
float wz
Definition: CudaTileListKernel.h:35

CudaTileListKernel::getNumJtiles
int getNumJtiles()
Definition: CudaTileListKernel.h:315

TileListVirialEnergy::energyElec_ti_2
double energyElec_ti_2
Definition: CudaTileListKernel.h:89

VirialEnergy::energySlow_ti_1
double energySlow_ti_1
Definition: CudaTileListKernel.h:113

CudaComputeRecord::patchInd
int2 patchInd
Definition: CudaTileListKernel.h:42

CudaTileListKernel::reSortTileLists
void reSortTileLists(const bool doGBIS, cudaStream_t stream)

BoundingBox::wx
float wx
Definition: CudaTileListKernel.h:35

TileListVirialEnergy::energySlow_ti_2
double energySlow_ti_2
Definition: CudaTileListKernel.h:91

TileListVirialEnergy::energySlow_s
double energySlow_s
Definition: CudaTileListKernel.h:83

TileList::patchInd
int2 patchInd
Definition: CudaTileListKernel.h:15

VirialEnergy::energyVdw
double energyVdw
Definition: CudaTileListKernel.h:99

PatchPairRecord::jatomSize
int jatomSize
Definition: CudaTileListKernel.h:26