NAMD
CudaPmeSolverUtil.h
#ifndef CUDAPMESOLVERUTIL_H
#define CUDAPMESOLVERUTIL_H

#ifdef NAMD_CUDA
#include <cuda.h>
#include <cufft.h>
#endif // NAMD_CUDA

#if defined(NAMD_HIP)
#ifndef NAMD_CUDA
#include <hipfft/hipfft.h>
#endif
#endif

#include <stdio.h>

#include "PmeSolverUtil.h"
#include "CudaUtils.h"
#include "ReductionMgr.h"
#include "HipDefines.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream);
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename);
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream);

#define cufftCheck(stmt) do { \
  cufftResult err = stmt; \
  if (err != CUFFT_SUCCESS) { \
    char msg[128]; \
    snprintf(msg, sizeof(msg), "%s in file %s, function %s\n", #stmt, __FILE__, __FUNCTION__); \
    cudaDie(msg); \
  } \
} while(0)
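
// Usage sketch (illustrative, not part of the original header): wrap each
// cuFFT call in cufftCheck() so any result other than CUFFT_SUCCESS aborts
// through cudaDie() with the offending statement, file, and function, e.g.:
//
//   cufftHandle plan;
//   cufftCheck(cufftPlan3d(&plan, nz, ny, nx, CUFFT_R2C));
//   cufftCheck(cufftExecR2C(plan, d_in, d_out));
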
//
// CUDA implementation of FFTCompute
//
class CudaFFTCompute : public FFTCompute {
private:
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cufftHandle forwardPlan, backwardPlan;
  cufftType_t forwardType, backwardType;
#endif
  int deviceID;
  cudaStream_t stream;
  void setStream();

private:
  float* allocateData(const int dataSizeRequired);
  void plan3D(int *n, int flags);
  void plan2D(int *n, int howmany, int flags);
  void plan1DX(int *n, int howmany, int flags);
  void plan1DY(int *n, int howmany, int flags);
  void plan1DZ(int *n, int howmany, int flags);
  // int ncall, plantype;

public:
  CudaFFTCompute(int deviceID, cudaStream_t stream);
  ~CudaFFTCompute();
  void forward();
  void backward();
};
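
// Usage sketch (illustrative; planning goes through the FFTCompute base
// class, which dispatches to the private planXD overrides above):
//
//   CudaFFTCompute fft(deviceID, stream);  // all FFT work runs on 'stream'
//   // ... base class allocates the grid data and creates the plans ...
//   fft.forward();    // forward transform of the attached grid data
//   fft.backward();   // inverse transform back to real space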

//
// Cuda implementation of PmeKSpaceCompute class
//
class CudaPmePencilXYZ;
class CudaPmePencilZ;

class CudaPmeKSpaceCompute : public PmeKSpaceCompute {
private:
  int deviceID;
  cudaStream_t stream;
  // Device memory versions of (bm1, bm2, bm3)
  float *d_bm1, *d_bm2, *d_bm3;
  //float *prefac_x, *prefac_y, *prefac_z;
  struct EnergyVirial {
    double energy;
    double virial[9];
  };
  EnergyVirial* d_energyVirial;
  EnergyVirial* h_energyVirial;
  cudaEvent_t copyEnergyVirialEvent;
  bool ortho;
  // Check counter for event polling in energyAndVirialCheck()
  int checkCount;
  static void energyAndVirialCheck(void *arg, double walltime);
  CudaPmePencilXYZ* pencilXYZPtr;
  CudaPmePencilZ* pencilZPtr;
public:
  CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream, unsigned int iGrid = 0);
  ~CudaPmeKSpaceCompute();
  void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data);
  void waitEnergyAndVirial();
  double getEnergy();
  void getVirial(double *virial);
  void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr);
  void energyAndVirialSetCallback(CudaPmePencilZ *pencilPtr);
};
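
// Illustrative call sequence (a sketch; 'lattice' and 'data' come from the
// surrounding PME driver):
//
//   CudaPmeKSpaceCompute kspace(pmeGrid, permutation, jblock, kblock,
//                               kappa, deviceID, stream);
//   kspace.solve(lattice, true /*doEnergy*/, true /*doVirial*/, data);
//   kspace.waitEnergyAndVirial();   // polls copyEnergyVirialEvent
//   double energy = kspace.getEnergy();
//   double virial[9];
//   kspace.getVirial(virial);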

//
// Cuda implementation of PmeRealSpaceCompute class
//
class ComputePmeCUDADevice;

class CudaPmeRealSpaceCompute : public PmeRealSpaceCompute {
private:
#ifdef NAMD_CUDA
  bool gridTexObjActive;
  cudaTextureObject_t gridTexObj;
  int tex_data_len;
  float* tex_data;
#else
  int grid_data_len;
  float* grid_data;
#endif
  int deviceID;
  cudaStream_t stream;
  void setupGridData(float* data, int data_len);
  // Device memory for atoms
  size_t d_atomsCapacity;
  CudaAtom* d_atoms;
  // Device memory for patches
  // int d_patchesCapacity;
  // PatchInfo* d_patches;
  // Device memory for forces
  size_t d_forceCapacity;
  CudaForce* d_force;
  // // Device memory for self energy
  // double* d_selfEnergy;
  // Events
  cudaEvent_t gatherForceEvent;
  // Check counter for event polling
  int checkCount;
  // Store device pointer for event polling
  ComputePmeCUDADevice* devicePtr;
  static void cuda_gatherforce_check(void *arg, double walltime);
public:
  CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock,
    int deviceID, cudaStream_t stream);
  ~CudaPmeRealSpaceCompute();
  void copyAtoms(const int numAtoms, const CudaAtom* atoms);
  void spreadCharge(Lattice &lattice);
  void gatherForce(Lattice &lattice, CudaForce* force);
  void gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in);
  void waitGatherForceDone();
};
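
// Illustrative call sequence (a sketch): upload atoms, spread charges onto
// the grid, run the FFT/K-space solve elsewhere, then gather per-atom forces:
//
//   CudaPmeRealSpaceCompute rs(pmeGrid, jblock, kblock, deviceID, stream);
//   rs.copyAtoms(numAtoms, atoms);     // grows d_atoms if capacity is short
//   rs.spreadCharge(lattice);          // interpolate charges to grid points
//   // ... forward FFT, K-space solve, inverse FFT ...
//   rs.gatherForce(lattice, force);    // interpolate grid forces back to atoms
//   rs.waitGatherForceDone();          // polls gatherForceEvent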

//
// Cuda implementation of PmeTranspose class
//
class CudaPmeTranspose : public PmeTranspose {
private:
  int deviceID;
  cudaStream_t stream;
  float2* d_data;
#ifndef P2P_ENABLE_3D
  float2* d_buffer;
#endif
  // List of device data pointers for transpose destinations on:
  // (a) this device on a different pencil (e.g. in XYZ->YZX transpose, on Y-pencil)
  // (b) a different device on a different pencil
  // If NULL, use the local d_data buffer
  std::vector<float2*> dataPtrsYZX;
  std::vector<float2*> dataPtrsZXY;

  // Batch data
  int max_nx_YZX[3];
  TransposeBatch<float2> *batchesYZX;
  int max_nx_ZXY[3];
  TransposeBatch<float2> *batchesZXY;

  void copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out);
public:
  CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream);
  ~CudaPmeTranspose();
  void setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data);
  void setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data);
  void transposeXYZtoYZX(const float2* data);
  void transposeXYZtoZXY(const float2* data);
  // void waitTransposeDone();
  void waitStreamSynchronize();
  void copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize);
  void copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out);
#ifndef P2P_ENABLE_3D
  void copyDataDeviceToDevice(const int iblock, float2* data_out);
  float2* getBuffer(const int iblock);
#endif
  void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
  void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
};
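
// Illustrative sketch: register destination pointers once, then transpose
// each block and push it to its peer device (NULL entries fall back to the
// local d_data buffer):
//
//   CudaPmeTranspose tr(pmeGrid, permutation, jblock, kblock, deviceID, stream);
//   tr.setDataPtrsYZX(peerDataPtrs, data);
//   tr.transposeXYZtoYZX(data);
//   tr.copyDataToPeerDeviceYZX(iblock, deviceID_out, permutation_out, data_out);
//   tr.waitStreamSynchronize();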


//
// Cuda implementation of a single-device PME solver
//
class CudaPmeOneDevice {
  public:
    PmeGrid pmeGrid;
    int deviceID;
    int deviceIndex;
    cudaStream_t stream;

    int natoms;

    float4* d_atoms;
    float3* d_forces;
    float* d_scaling_factors; // alchemical scaling factors
#ifndef USE_TABLE_ARRAYS
    cudaTextureObject_t* gridTexObjArrays;
#endif

    float* d_grids;
    float2* d_trans;
    size_t gridsize;
    size_t transize;

#if defined(NAMD_CUDA) || defined(NAMD_HIP) // to enable when hipfft full support is ready
    cufftHandle* forwardPlans;
    cufftHandle* backwardPlans;
#endif

    float *d_bm1;
    float *d_bm2;
    float *d_bm3;

    double kappa;

    struct EnergyVirial {
      double energy;
      double virial[6];
    };
    EnergyVirial* d_energyVirials;
    EnergyVirial* h_energyVirials;

    bool self_energy_alch_first_time; // true until the alchemical self energy has been computed once
    bool force_scaling_alch_first_time; // true until the force scaling factors have been computed once
    double *d_selfEnergy; // on device
    double selfEnergy; // remains constant for a given set of charges
    int m_step;

    CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_);
    ~CudaPmeOneDevice();

    void compute(
        const Lattice &lattice,
#if 0
        const CudaAtom *d_atoms,
        CudaForce *d_force,
        int natoms,
#endif
        int doEnergyVirial,
        int step
        );

    void finishReduction(bool doEnergyVirial);

    PatchLevelPmeData patchLevelPmeData;

    /*
     * @brief Computes the grid point based on the given unscaled position
     *
     * This does not take periodic boundary conditions into account, so the returned
     * grid point can be negative or beyond the grid.
     */
    int getShiftedGrid(const double x, const int grid);

    /*
     * @brief Computes the amount of shared memory used by the patch-level charge spreading kernel
     */
    int computeSharedMemoryPatchLevelSpreadCharge(const int numThreads, const int3 patchGridDim, const int order);

    /*
     * @brief Computes the amount of shared memory used by the patch-level force gathering kernel
     */
    int computeSharedMemoryPatchLevelGatherForce(const int numThreads, const int3 patchGridDim, const int order);

    /*
     * @brief Checks the patch-level kernel compatibility with the simulation parameters
     *
     * Currently the patch-level kernels only support periodic systems with 8th-order interpolation.
     *
     * The results will be stored in the PatchLevelPmeData object.
     *
     * TODO: the naming of this function suggests it should return a value, which can be
     * confusing. Maybe this logic should be refactored to improve readability.
     */
    void checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ);

    /*
     * @brief Checks the patch-level kernel compatibility with the current device
     *
     * The patch-level kernels use a good amount of shared memory, so this checks
     * whether the current device has enough.
     *
     * The results will be stored in the PatchLevelPmeData object.
     */
    void checkPatchLevelDeviceCompatibility();

    /*
     * @brief Checks the patch-level kernel compatibility with the current lattice/patch width
     *
     * The patch-level kernels need to keep a sub-grid in shared memory, and the size of the
     * sub-grid depends on the patch width, which can change with the lattice.
     *
     * The results will be stored in the PatchLevelPmeData object.
     */
    void checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice &lattice,
        const int numPatches, const CudaLocalRecord* localRecords,
        double3* patchMin, double3* patchMax, double3* awayDists);

private:
    void calcSelfEnergyAlch(int step);
    void scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_F, double (&virial)[9]);
    void scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]);
    void scaleAndMergeForce(int step);
};
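
// Illustrative single-device driver loop (a sketch; 'lattice' and 'step' come
// from the integrator, and the call order is an assumption based on the
// declarations above):
//
//   CudaPmeOneDevice pme(pmeGrid, deviceID, deviceIndex);
//   pme.checkPatchLevelSimParamCompatibility(8, true /*periodicY*/, true /*periodicZ*/);
//   pme.checkPatchLevelDeviceCompatibility();
//   pme.compute(lattice, 1 /*doEnergyVirial*/, step);  // spread, FFT, solve, gather
//   pme.finishReduction(true /*doEnergyVirial*/);      // submit energy/virial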


#endif // NAMD_CUDA || NAMD_HIP
#endif // CUDAPMESOLVERUTIL_H