NAMD
CudaComputeNonbonded.h
#ifndef CUDACOMPUTENONBONDED_H
#define CUDACOMPUTENONBONDED_H

#ifdef NAMD_CUDA
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif

#include <vector>
#include "Compute.h"
#include "Box.h"
#include "PatchTypes.h"
#include "CudaUtils.h"
#include "ComputeNonbondedUtil.h"
#include "CudaNonbondedTables.h"
#include "CudaTileListKernel.h"
#include "CudaTileListKernel.hip.h"
#include "CudaComputeNonbondedKernel.h"
#include "CudaComputeNonbondedKernel.hip.h"
#include "CudaComputeGBISKernel.h"
#include "ComputeMgr.h"
#include "HipDefines.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
// 2^11 ints * 2^5 bits = 2^16 bits = range of unsigned short excl_index
// 2^27 ints * 2^5 bits = 2^32 bits = range of unsigned int excl_index
#define MAX_EXCLUSIONS (1<<27)
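// Editorial note: a compile-time restatement of the arithmetic above, added
// here for illustration (it is not part of the original header):
// (1<<27) ints * 32 bits/int = 2^32 bits, i.e. exactly the range addressable
// by a 32-bit unsigned excl_index.
static_assert((1ULL << 27) * 32ULL == (1ULL << 32),
              "MAX_EXCLUSIONS bit pool must span the unsigned excl_index range");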

class CudaComputeNonbonded : public Compute {
public:
  struct ComputeRecord {
    ComputeID cid;
    PatchID pid[2];
    // Index to patches[] -array
    int patchInd[2];
    Vector offset;
  };

  struct PatchRecord {
    PatchRecord(PatchID patchID) : patchID(patchID) {
      patch = NULL;
      compAtom = NULL;
      results = NULL;
      positionBox = NULL;
      forceBox = NULL;
      intRadBox = NULL;
      psiSumBox = NULL;
      bornRadBox = NULL;
      dEdaSumBox = NULL;
      dHdrPrefixBox = NULL;
    }
    PatchID patchID;
    Patch *patch;
    int numAtoms;
    int numFreeAtoms;
    int atomStart;
    // Pe where the patch was registered
    int pe;
    // For priority sorting
    int reversePriorityRankInPe;
    bool isSameNode;
    // Storage for open positionBox
    CompAtom* compAtom;
    // Storage for open forceBox
    Results* results;
    // Boxes
    Box<Patch,CompAtom> *positionBox;
    Box<Patch,Results> *forceBox;
    Box<Patch,Real> *intRadBox; //5 GBIS Boxes
    Box<Patch,GBReal> *psiSumBox;
    Box<Patch,Real> *bornRadBox;
    Box<Patch,GBReal> *dEdaSumBox;
    Box<Patch,Real> *dHdrPrefixBox;
    Real *intRad; //5 GBIS arrays
    GBReal *psiSum;
    Real *bornRad;
    GBReal *dEdaSum;
    Real *dHdrPrefix;
    bool operator < (const PatchRecord& pr) const {
      return (patchID < pr.patchID);
    }
    bool operator == (const PatchRecord& pr) const {
      return (patchID == pr.patchID);
    }
  };
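  // Usage sketch (not part of the original header): the two comparison
  // operators let a device-wide patch list be sorted and deduplicated by
  // patchID, e.g.
  //   std::sort(patches.begin(), patches.end());
  //   patches.erase(std::unique(patches.begin(), patches.end()), patches.end());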

private:
  SimParameters *params; // convenience
  // This variable is set in atomUpdate() by any Pe
  bool atomsChangedIn;
  // This variable is set in doWork() by masterPe
  bool atomsChanged;
  int npairlists;

  bool computesChanged;

  const int deviceID;
  size_t maxShmemPerBlock;
  cudaStream_t stream;

  // PME and VdW CUDA kernels
  CudaComputeNonbondedKernel nonbondedKernel;

  // GBIS kernel
  CudaComputeGBISKernel GBISKernel;

  // Tile list CUDA kernels
  CudaTileListKernel tileListKernel;

  // Exclusions
  int2 *exclusionsByAtom;

  // VdW-types
  // Pinned host memory
  int* vdwTypes;
  size_t vdwTypesSize;

  // Maximum number of tiles per tile list
  int maxTileListLen;

  // Pinned host memory
  int2* exclIndexMaxDiff;
  size_t exclIndexMaxDiffSize;

  // Pinned host memory
  int* atomIndex;
  size_t atomIndexSize;

  // Required (xyzq, vdwTypes) storage
  int atomStorageSize;

  // Atom and charge storage
  // Pinned host memory
  CudaAtom* atoms;
  size_t atomsSize;

  char *part;
  size_t partSize;

  // Drude/NbThole
  int *isDrude;
  size_t isDrudeSize;

  std::vector<int> atomIndexToNBindex;

  float* drudeAtomAlpha;
  size_t drudeAtomAlphaSize;

  // Force storage
  float4* h_forces;
  size_t h_forcesSize;
  float4* h_forcesSlow;
  size_t h_forcesSlowSize;

  float4* d_forces;
  size_t d_forcesSize;
  float4* d_forcesSlow;
  size_t d_forcesSlowSize;

  // Virial and energy storage
  VirialEnergy* h_virialEnergy;
  VirialEnergy* d_virialEnergy;

  // GBIS storage
  //--------------
  // Pinned host memory
  float* intRad0H;
  size_t intRad0HSize;
  // Pinned host memory
  float* intRadSH;
  size_t intRadSHSize;
  // Mapped host memory
  GBReal* psiSumH;
  size_t psiSumHSize;
  // Pinned host memory
  float* bornRadH;
  size_t bornRadHSize;
  // Mapped host memory
  GBReal* dEdaSumH;
  size_t dEdaSumHSize;
  // Pinned host memory
  float* dHdrPrefixH;
  size_t dHdrPrefixHSize;

  // Event and sanity check flag for making sure event was actually recorded
  cudaEvent_t forceDoneEvent;
  bool forceDoneEventRecord;
  // Check counter for event polling
  int checkCount;
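  // Polling sketch (an assumption about how these fields cooperate, not the
  // verbatim implementation): forceDoneCheck() matches Charm++'s CcdVoidFn
  // signature, so it can test the event without blocking and re-arm itself
  // while the GPU is still working, e.g.
  //   if (cudaEventQuery(forceDoneEvent) == cudaSuccess) { forceDone(); return; }
  //   checkCount++;                                // sanity limit on retries
  //   CcdCallFnAfter(forceDoneCheck, arg, 0.1);    // poll again in 0.1 ms
  // forceDoneEventRecord guards against polling an event that was never
  // recorded with cudaEventRecord().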

  // Node lock
  CmiNodeLock lock;
  // List of local PEs that have patches
  std::vector<int> pes;
  // List of patch indices on each rank
  std::vector< std::vector<int> > rankPatches;
  // Master Pe = Pe where this Compute and reduction lives
  int masterPe;

  // Are we in skip?
  bool doSkip;

  // Device-wide patch and compute records, and the list of patches
  std::vector<ComputeRecord> computes;
  std::vector<PatchRecord> patches;

  // CUDA versions of patches
  // Pinned host memory
  CudaPatchRecord* cudaPatches;

  // Maintain two reduction objects for different simulation modes
  SubmitReduction *reductionGpuOffload = nullptr;
  SubmitReduction *reductionGpuResident = nullptr;

  // Pair lists
  int pairlistsValid;
  float pairlistTolerance;
  int usePairlists;
  int savePairlists;
  float plcutoff2;
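  // Reuse sketch (an assumption based on the field names): pairlists built
  // with the squared pairlist cutoff plcutoff2 stay valid while accumulated
  // atom movement remains below pairlistTolerance; getMaxMovementTolerance()
  // below would supply the movement estimate for that check, and squaring the
  // cutoff avoids a sqrt in every pair-distance test.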

  bool reSortDone;

  // Flags
  bool doSlow;
  bool doEnergy;
  bool doVirial;
  bool doAlch;
  bool doNbThole;
  bool doMinimize;

  AlchData alchFlags;
  bool lambdaWindowUpdated;
  // Walltime for force compute start
  double beforeForceCompute;

  static inline void updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param);
  void updateVdwTypesExclSubset(int first, int last);

  static inline void copyAtomsLoop(int first, int last, void *result, int paraNum, void *param);
  void copyAtomsSubset(int first, int last);

  void addPatch(PatchID pid);
  void addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset);
  void updatePatches();
  int calcNumTileLists();
  void getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance);
  void updateVdwTypesExcl();
  void buildNeighborlist();
  void skip();
  void doGBISphase1();
  void doGBISphase2();
  void doGBISphase3();
  void doForce();
  void finishSetOfPatchesOnPe(std::vector<int>& patchSet);
  void finishGBISPhase(int i);
  void finishTimers();
  void forceDone();
  static void forceDoneCheck(void *arg, double walltime);
  void forceDoneSetCallback();
  void updateComputes();
  void buildExclusions();
  void skipPatch(int i);
  void openBox(int i);
  void reallocateArrays();
#ifdef NODEGROUP_FORCE_REGISTER
  void updatePatchRecord();
#endif
  void copyGBISphase(int i);
  void updatePatch(int i);
  int findPid(PatchID pid);
  void assignPatch(int i);
  ComputeMgr* computeMgr;
  int patchesCounter;

  const bool doStreaming;
  int* patchReadyQueue;
  int patchReadyQueueNext, patchReadyQueueLen;

  void finishPatch(int i);
  void unregisterBox(int i);

  // void writeId(const char* filename);
  // void writeXYZ(const char* filename);

public:
  CudaComputeNonbonded(ComputeID c, int deviceID, CudaNonbondedTables& cudaNonbondedTables, bool doStreaming);
  ~CudaComputeNonbonded();
  void registerComputeSelf(ComputeID cid, PatchID pid);
  void registerComputePair(ComputeID cid, PatchID* pid, int* trans);
  void assignPatches(ComputeMgr* computeMgrIn);
  virtual void initialize();
  virtual void atomUpdate();
  virtual int noWork();
  virtual void doWork();
  void launchWork();
  void finishReductions();
  void unregisterBoxesOnPe();
  void assignPatchesOnPe();
  void openBoxesOnPe();
  void skipPatchesOnPe();
  void finishPatchesOnPe();
  void finishPatchOnPe(int i);
  void finishPatches();
  void messageEnqueueWork();
  virtual void patchReady(PatchID, int doneMigration, int seq);
  virtual void gbisP2PatchReady(PatchID, int seq);
  virtual void gbisP3PatchReady(PatchID, int seq);
  void reSortTileLists();

  void updatePatchOrder(const std::vector<CudaLocalRecord>& data);
  std::vector<PatchRecord>& getPatches() { return patches; }

  // Utility function to compute nonbonded parameters, used by ComputeBondedCUDAKernel as well
  static CudaNBConstants getNonbondedCoef(SimParameters* params);
  // Utility function to determine if force table will be used, used by ComputeBondedCUDAKernel as well
  static bool getDoTable(SimParameters* params, const bool doSlow, const bool doVirial);

  // Returns the current reduction object, depending on whether the simulation runs in GPU-resident or GPU-offload mode
  SubmitReduction* getCurrentReduction();
};
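// Dispatch sketch (an assumption; the original body is not shown here): the
// getter presumably selects between the two SubmitReduction members declared
// above, along the lines of
//   SubmitReduction* CudaComputeNonbonded::getCurrentReduction() {
//     return gpuResident ? reductionGpuResident : reductionGpuOffload;
//   }
// with gpuResident standing in for whatever mode flag NAMD actually consults.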

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)
#endif // CUDACOMPUTENONBONDED_H