NAMD — CudaPmeSolverUtil.h
This is the Doxygen file-documentation page for CudaPmeSolverUtil.h; the annotated source listing of the file follows below.
#ifndef CUDAPMESOLVERUTIL_H
#define CUDAPMESOLVERUTIL_H

#include <stdio.h>

#ifdef NAMD_CUDA
#include <cuda.h>
#include <cufft.h>
#endif // NAMD_CUDA

#if defined(NAMD_HIP)
#include "HipDefines.h"
#ifndef NAMD_CUDA
#include <rocfft.h>
#endif
#endif

#include "PmeSolverUtil.h"
#include "CudaUtils.h"
17 
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// Debugging helpers: dump arrays to a text file on disk.
// d_data variants copy from device memory (synchronizing on the given
// stream before reading); the h_data variant reads host memory directly.
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream);
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename);
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream);
#ifdef NAMD_CUDA
// Evaluate a cuFFT call and abort via cudaDie() with the failing statement,
// file, and function if it does not return CUFFT_SUCCESS.
// snprintf (not sprintf) so a long __FILE__ path cannot overflow msg.
#define cufftCheck(stmt) do { \
  cufftResult err = stmt; \
  if (err != CUFFT_SUCCESS) { \
  char msg[128]; \
  snprintf(msg, sizeof(msg), "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \
  cudaDie(msg); \
  } \
} while(0)
#else //NAMD_HIP
// rocFFT analogue of cufftCheck: abort via cudaDie() on any status other
// than rocfft_status_success.
#define rocfftCheck(stmt) do { \
  rocfft_status err = stmt; \
  if (err != rocfft_status_success) { \
  char msg[128]; \
  snprintf(msg, sizeof(msg), "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \
  cudaDie(msg); \
  } \
} while(0)

#endif
//
// CUDA implementation of FFTCompute
//
class CudaFFTCompute : public FFTCompute {
private:
#ifdef NAMD_CUDA
  // cuFFT plan handles and transform types for the two directions
  cufftHandle forwardPlan, backwardPlan;
  cufftType_t forwardType, backwardType;
#else
  // rocFFT plans plus per-plan execution info and work buffers
  rocfft_plan forwardPlan, backwardPlan;
  rocfft_execution_info forwardPlanInfo, backwardPlanInfo;
  void * forwardWorkBuffer;
  void * backwardWorkBuffer;
  // Builds forward/backward rocFFT plans of the given dimensionality
  void createPlans(
    rocfft_transform_type forwardTransformType, rocfft_transform_type backwardTransformType,
    size_t dimensions, const size_t* lengths, size_t howmany);
#endif
  int deviceID;
  cudaStream_t stream;
  void setStream();

private:
  float* allocateData(const int dataSizeRequired);
  // Plan creation for 3D, 2D, and the three 1D pencil orientations
  void plan3D(int *n, int flags);
  void plan2D(int *n, int howmany, int flags);
  void plan1DX(int *n, int howmany, int flags);
  void plan1DY(int *n, int howmany, int flags);
  void plan1DZ(int *n, int howmany, int flags);
  // int ncall, plantype;

public:
  CudaFFTCompute(int deviceID, cudaStream_t stream) : deviceID(deviceID), stream(stream) {}
  // NOTE(review): restored — the extraction dropped original line 75 here;
  // presumably the destructor releasing the FFT plans. TODO confirm upstream.
  ~CudaFFTCompute();
  void forward();
  void backward();
};
79 
//
// Cuda implementation of PmeKSpaceCompute class
//
class CudaPmePencilXYZ;
class CudaPmePencilZ;

// NOTE(review): class header line was dropped by extraction (original line 86);
// reconstructed from the member list and the Doxygen tooltip signatures below.
class CudaPmeKSpaceCompute : public PmeKSpaceCompute {
private:
  int deviceID;
  cudaStream_t stream;
  // Device memory versions of (bm1, bm2, bm3)
  float *d_bm1, *d_bm2, *d_bm3;
  //float *prefac_x, *prefac_y, *prefac_z;
  // Energy and 3x3 virial accumulated on the device, copied back per solve
  struct EnergyVirial {
    double energy;
    double virial[9];
  };
  EnergyVirial* d_energyVirial;
  EnergyVirial* h_energyVirial;
  cudaEvent_t copyEnergyVirialEvent;
  bool ortho;
  // Check counter for event polling in energyAndVirialCheck()
  int checkCount;
  static void energyAndVirialCheck(void *arg, double walltime);
  // Pencil to notify when energy/virial copy completes (one of the two is set)
  CudaPmePencilXYZ* pencilXYZPtr;
  CudaPmePencilZ* pencilZPtr;
public:
  CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream);
  ~CudaPmeKSpaceCompute();
  void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data);
  // void waitEnergyAndVirial();
  double getEnergy();
  void getVirial(double *virial);
  // Register the pencil to be called back when energy/virial is ready
  // (XYZ overload confirmed by tooltip; Z overload presumed from pencilZPtr —
  // TODO confirm against upstream)
  void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr);
  void energyAndVirialSetCallback(CudaPmePencilZ *pencilPtr);
};
118 
//
// Cuda implementation of PmeRealSpaceCompute class
//
// Forward declaration needed for devicePtr / gatherForceSetCallback below
class ComputePmeCUDADevice;

// NOTE(review): class header line was dropped by extraction (original line 125);
// reconstructed from the member list and the Doxygen tooltip signatures below.
class CudaPmeRealSpaceCompute : public PmeRealSpaceCompute {
private:
#ifdef NAMD_CUDA
  // Grid data bound through a CUDA texture object
  bool gridTexObjActive;
  cudaTextureObject_t gridTexObj;
  int tex_data_len;
  float* tex_data;
#else
  // HIP path: plain device buffer for the grid data
  int grid_data_len;
  float* grid_data;
#endif
  int deviceID;
  cudaStream_t stream;
  void setupGridData(float* data, int data_len);
  // Device memory for atoms
  int d_atomsCapacity;
  CudaAtom* d_atoms;
  // Device memory for patches
  // int d_patchesCapacity;
  // PatchInfo* d_patches;
  // Device memory for forces
  int d_forceCapacity;
  CudaForce* d_force;
  // // Device memory for self energy
  // double* d_selfEnergy;
  // Events
  cudaEvent_t gatherForceEvent;
  // Check counter for event polling
  int checkCount;
  // Store device pointer for event polling
  ComputePmeCUDADevice* devicePtr;
  static void cuda_gatherforce_check(void *arg, double walltime);
public:
  CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock,
    int deviceID, cudaStream_t stream);
  ~CudaPmeRealSpaceCompute();
  void copyAtoms(const int numAtoms, const CudaAtom* atoms);
  void spreadCharge(Lattice &lattice);
  void gatherForce(Lattice &lattice, CudaForce* force);
  // Register the device object to notify once gatherForceEvent completes
  void gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in);
  void waitGatherForceDone();
};
167 
//
// Cuda implementation of PmeTranspose class
//
// NOTE(review): class header line was dropped by extraction (original line 171);
// reconstructed from the member list and the Doxygen tooltip signatures below.
class CudaPmeTranspose : public PmeTranspose {
private:
  int deviceID;
  cudaStream_t stream;
  float2* d_data;
#ifndef P2P_ENABLE_3D
  // Staging buffer used when direct 3D peer-to-peer copies are unavailable
  float2* d_buffer;
#endif
  // List of device data pointers for transpose destinations on:
  // (a) this device on a different pencil (e.g. in XYZ->YZX transpose, on Y -pencil)
  // (b) different device on a different pencil
  // If NULL, use the local d_data -buffer
  std::vector<float2*> dataPtrsYZX;
  std::vector<float2*> dataPtrsZXY;

  // Batch data
  int max_nx_YZX[3];
  TransposeBatch<float2> *batchesYZX;
  int max_nx_ZXY[3];
  TransposeBatch<float2> *batchesZXY;

  void copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out);
public:
  CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream);
  ~CudaPmeTranspose();
  void setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data);
  void setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data);
  void transposeXYZtoYZX(const float2* data);
  void transposeXYZtoZXY(const float2* data);
  // void waitTransposeDone();
  void waitStreamSynchronize();
  void copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize);
  void copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out);
#ifndef P2P_ENABLE_3D
  void copyDataDeviceToDevice(const int iblock, float2* data_out);
  float2* getBuffer(const int iblock);
#endif
  void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
  void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
};
#endif // NAMD_CUDA
#endif // CUDAPMESOLVERUTIL_H
Doxygen hover-tooltip index (extraction artifact — full signatures of symbols referenced by the listing above, not part of the header source):
void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr)
const int permutation
void setDataPtrsYZX(std::vector< float2 * > &dataPtrsNew, float2 *data)
CudaPmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
static __thread atom * atoms
void spreadCharge(Lattice &lattice)
void copyAtoms(const int numAtoms, const CudaAtom *atoms)
CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
__thread cudaStream_t stream
void copyDataDeviceToDevice(const int iblock, float2 *data_out)
PmeGrid pmeGrid
CudaFFTCompute(int deviceID, cudaStream_t stream)
void copyDataDeviceToHost(const int iblock, float2 *h_data, const int h_dataSize)
const int jblock
void writeHostComplexToDisk(const float2 *h_data, const int size, const char *filename)
void getVirial(double *virial)
const int kblock
CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream)
void writeComplexToDisk(const float2 *d_data, const int size, const char *filename, cudaStream_t stream)
const int permutation
void setDataPtrsZXY(std::vector< float2 * > &dataPtrsNew, float2 *data)
void gatherForce(Lattice &lattice, CudaForce *force)
void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float *data)
void gatherForceSetCallback(ComputePmeCUDADevice *devicePtr_in)
void transposeXYZtoYZX(const float2 *data)
void copyDataHostToDevice(const int iblock, float2 *data_in, float2 *data_out)
void transposeXYZtoZXY(const float2 *data)
void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
float2 * getBuffer(const int iblock)
void writeRealToDisk(const float *d_data, const int size, const char *filename, cudaStream_t stream)