NAMD — CudaPmeSolverUtil.h
This is the Doxygen file-documentation page for CudaPmeSolverUtil.h; the annotated source listing of the file follows below.
#ifndef CUDAPMESOLVERUTIL_H
#define CUDAPMESOLVERUTIL_H

#include <stdio.h>

#ifdef NAMD_CUDA
#include <cuda.h>
#include <cufft.h>
#endif // NAMD_CUDA

#if defined(NAMD_HIP)
#include "HipDefines.h"
#ifndef NAMD_CUDA
#include <rocfft.h>
#endif
#endif

#include "PmeSolverUtil.h"
#include "CudaUtils.h"
17 
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// Debugging helpers: dump arrays to a text file on disk.
// d_data variants copy from device memory (synchronizing on the given
// stream before reading); the h_data variant reads host memory directly.
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream);
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename);
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream);
#ifdef NAMD_CUDA
// Evaluate a cuFFT call and abort via cudaDie() with the failing statement,
// file, and function if it does not return CUFFT_SUCCESS.
// snprintf (not sprintf) so a long __FILE__ path cannot overflow msg.
#define cufftCheck(stmt) do { \
  cufftResult err = stmt; \
  if (err != CUFFT_SUCCESS) { \
  char msg[128]; \
  snprintf(msg, sizeof(msg), "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \
  cudaDie(msg); \
  } \
} while(0)
#else //NAMD_HIP
// rocFFT analogue of cufftCheck: abort via cudaDie() on any status other
// than rocfft_status_success.
#define rocfftCheck(stmt) do { \
  rocfft_status err = stmt; \
  if (err != rocfft_status_success) { \
  char msg[128]; \
  snprintf(msg, sizeof(msg), "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \
  cudaDie(msg); \
  } \
} while(0)

#endif
//
// CUDA implementation of FFTCompute
//
class CudaFFTCompute : public FFTCompute {
private:
#ifdef NAMD_CUDA
  // cuFFT plan handles and transform types for the two directions
  cufftHandle forwardPlan, backwardPlan;
  cufftType_t forwardType, backwardType;
#else
  // rocFFT plans plus per-plan execution info and work buffers
  rocfft_plan forwardPlan, backwardPlan;
  rocfft_execution_info forwardPlanInfo, backwardPlanInfo;
  void * forwardWorkBuffer;
  void * backwardWorkBuffer;
  // Builds forward/backward rocFFT plans of the given dimensionality
  void createPlans(
    rocfft_transform_type forwardTransformType, rocfft_transform_type backwardTransformType,
    size_t dimensions, const size_t* lengths, size_t howmany);
#endif
  int deviceID;
  cudaStream_t stream;
  void setStream();

private:
  float* allocateData(const int dataSizeRequired);
  // Plan creation for 3D, 2D, and the three 1D pencil orientations
  void plan3D(int *n, int flags);
  void plan2D(int *n, int howmany, int flags);
  void plan1DX(int *n, int howmany, int flags);
  void plan1DY(int *n, int howmany, int flags);
  void plan1DZ(int *n, int howmany, int flags);
  // int ncall, plantype;

public:
  CudaFFTCompute(int deviceID, cudaStream_t stream) : deviceID(deviceID), stream(stream) {}
  // NOTE(review): restored — the extraction dropped original line 75 here;
  // presumably the destructor releasing the FFT plans. TODO confirm upstream.
  ~CudaFFTCompute();
  void forward();
  void backward();
};
79 
//
// Cuda implementation of PmeKSpaceCompute class
//
class CudaPmePencilXYZ;
class CudaPmePencilZ;

// NOTE(review): class header line was dropped by extraction (original line 86);
// reconstructed from the member list and the Doxygen tooltip signatures below.
class CudaPmeKSpaceCompute : public PmeKSpaceCompute {
private:
  int deviceID;
  cudaStream_t stream;
  // Device memory versions of (bm1, bm2, bm3)
  float *d_bm1, *d_bm2, *d_bm3;
  //float *prefac_x, *prefac_y, *prefac_z;
  // Energy and 3x3 virial accumulated on the device, copied back per solve
  struct EnergyVirial {
    double energy;
    double virial[9];
  };
  EnergyVirial* d_energyVirial;
  EnergyVirial* h_energyVirial;
  cudaEvent_t copyEnergyVirialEvent;
  bool ortho;
  // Check counter for event polling in energyAndVirialCheck()
  int checkCount;
  static void energyAndVirialCheck(void *arg, double walltime);
  // Pencil to notify when energy/virial copy completes (one of the two is set)
  CudaPmePencilXYZ* pencilXYZPtr;
  CudaPmePencilZ* pencilZPtr;
public:
  CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream);
  ~CudaPmeKSpaceCompute();
  void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data);
  // void waitEnergyAndVirial();
  double getEnergy();
  void getVirial(double *virial);
  // Register the pencil to be called back when energy/virial is ready
  // (XYZ overload confirmed by tooltip; Z overload presumed from pencilZPtr —
  // TODO confirm against upstream)
  void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr);
  void energyAndVirialSetCallback(CudaPmePencilZ *pencilPtr);
};
118 
//
// Cuda implementation of PmeRealSpaceCompute class
//
// Forward declaration needed for devicePtr / gatherForceSetCallback below
class ComputePmeCUDADevice;

// NOTE(review): class header line was dropped by extraction (original line 125);
// reconstructed from the member list and the Doxygen tooltip signatures below.
class CudaPmeRealSpaceCompute : public PmeRealSpaceCompute {
private:
#ifdef NAMD_CUDA
  // Grid data bound through a CUDA texture object
  bool gridTexObjActive;
  cudaTextureObject_t gridTexObj;
  int tex_data_len;
  float* tex_data;
#else
  // HIP path: plain device buffer for the grid data
  int grid_data_len;
  float* grid_data;
#endif
  int deviceID;
  cudaStream_t stream;
  void setupGridData(float* data, int data_len);
  // Device memory for atoms
  int d_atomsCapacity;
  CudaAtom* d_atoms;
  // Device memory for patches
  // int d_patchesCapacity;
  // PatchInfo* d_patches;
  // Device memory for forces
  int d_forceCapacity;
  CudaForce* d_force;
  // // Device memory for self energy
  // double* d_selfEnergy;
  // Events
  cudaEvent_t gatherForceEvent;
  // Check counter for event polling
  int checkCount;
  // Store device pointer for event polling
  ComputePmeCUDADevice* devicePtr;
  static void cuda_gatherforce_check(void *arg, double walltime);
public:
  CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock,
    int deviceID, cudaStream_t stream);
  ~CudaPmeRealSpaceCompute();
  void copyAtoms(const int numAtoms, const CudaAtom* atoms);
  void spreadCharge(Lattice &lattice);
  void gatherForce(Lattice &lattice, CudaForce* force);
  // Register the device object to notify once gatherForceEvent completes
  void gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in);
  void waitGatherForceDone();
};
167 
//
// Cuda implementation of PmeTranspose class
//
// NOTE(review): class header line was dropped by extraction (original line 171);
// reconstructed from the member list and the Doxygen tooltip signatures below.
class CudaPmeTranspose : public PmeTranspose {
private:
  int deviceID;
  cudaStream_t stream;
  float2* d_data;
#ifndef P2P_ENABLE_3D
  // Staging buffer used when direct 3D peer-to-peer copies are unavailable
  float2* d_buffer;
#endif
  // List of device data pointers for transpose destinations on:
  // (a) this device on a different pencil (e.g. in XYZ->YZX transpose, on Y -pencil)
  // (b) different device on a different pencil
  // If NULL, use the local d_data -buffer
  std::vector<float2*> dataPtrsYZX;
  std::vector<float2*> dataPtrsZXY;

  // Batch data
  int max_nx_YZX[3];
  TransposeBatch<float2> *batchesYZX;
  int max_nx_ZXY[3];
  TransposeBatch<float2> *batchesZXY;

  void copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out);
public:
  CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream);
  ~CudaPmeTranspose();
  void setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data);
  void setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data);
  void transposeXYZtoYZX(const float2* data);
  void transposeXYZtoZXY(const float2* data);
  // void waitTransposeDone();
  void waitStreamSynchronize();
  void copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize);
  void copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out);
#ifndef P2P_ENABLE_3D
  void copyDataDeviceToDevice(const int iblock, float2* data_out);
  float2* getBuffer(const int iblock);
#endif
  void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
  void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
};
#endif // NAMD_CUDA
#endif // CUDAPMESOLVERUTIL_H
Doxygen hover-tooltip index (extraction artifact — full signatures of symbols referenced by the listing above, not part of the header source):
void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr)
const int permutation
void setDataPtrsYZX(std::vector< float2 * > &dataPtrsNew, float2 *data)
CudaPmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
static __thread atom * atoms
void spreadCharge(Lattice &lattice)
void copyAtoms(const int numAtoms, const CudaAtom *atoms)
CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
__thread cudaStream_t stream
void copyDataDeviceToDevice(const int iblock, float2 *data_out)
PmeGrid pmeGrid
CudaFFTCompute(int deviceID, cudaStream_t stream)
void copyDataDeviceToHost(const int iblock, float2 *h_data, const int h_dataSize)
const int jblock
void writeHostComplexToDisk(const float2 *h_data, const int size, const char *filename)
void getVirial(double *virial)
const int kblock
CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream)
void writeComplexToDisk(const float2 *d_data, const int size, const char *filename, cudaStream_t stream)
const int permutation
void setDataPtrsZXY(std::vector< float2 * > &dataPtrsNew, float2 *data)
void gatherForce(Lattice &lattice, CudaForce *force)
void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float *data)
void gatherForceSetCallback(ComputePmeCUDADevice *devicePtr_in)
void transposeXYZtoYZX(const float2 *data)
void copyDataHostToDevice(const int iblock, float2 *data_in, float2 *data_out)
void transposeXYZtoZXY(const float2 *data)
void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
float2 * getBuffer(const int iblock)
void writeRealToDisk(const float *d_data, const int size, const char *filename, cudaStream_t stream)