NAMD
ComputeBondedCUDA.h
#ifndef COMPUTEBONDEDCUDA_H
#define COMPUTEBONDEDCUDA_H
#include "Compute.h"
#include "ComputeAniso.h"
#include "ComputeMap.h"
#include "ComputeThole.h"
#include "CudaNonbondedTables.h"
#include "ComputeHomeTuples.h"
#include "TupleTypesCUDA.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#ifdef BONDED_CUDA

#include <vector>
#include <array>
#include <list>    // std::list used for tupleList below
#include <atomic>  // std::atomic used under NODEGROUP_FORCE_REGISTER

class ComputeBondedCUDA : public Compute {

public:

  static const int CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES];
  static const int CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES];
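  // (Per-tuple-type element sizes in bytes; presumably used to size and
  //  offset the packed tuple buffers and their staging counterparts.
  //  Hedged inference from the names, not stated in this header.)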

private:
  bool initializeCalled;
  SimParameters *params;
  // Device ID and stream
  const int deviceID;
  cudaStream_t stream;
#ifdef NODEGROUP_FORCE_REGISTER
  std::atomic<int> tupleWorkIndex;
#endif

  // Master PE for this compute
  const int masterPe;

  // List of all patch IDs on this object
  std::vector<int> allPatchIDs;

  // List of tuple patches for the entire compute (i.e. across all PEs)
  TuplePatchList tuplePatchList;

  // For every PE, list of patches that it has registered
  std::vector< std::vector<int> > patchIDsPerRank;

  // List of PEs involved in the computation
  std::vector<int> pes;

  // Self compute
  struct SelfCompute {
    int type;
    std::vector<int> patchIDs;
    Tuples* tuples;
    SelfCompute(int type=-1) : type(type), tuples(NULL) {}
    int operator==(const SelfCompute &elem) const {
      return (elem.type == type);
    }
  };
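  // (operator== above compares only the tuple type, presumably so that an
  //  existing SelfCompute of a given type can be located with std::find().)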

  // Home compute, each PE has one
  struct HomeCompute {
    std::vector<char> isBasePatch;
    std::vector<int> patchIDs;
    // Multiple tuples per PE, each of a different kind
    std::vector< Tuples* > tuples;
  };

  // Computes for each PE
  struct ComputeRecord {
    HomeCompute homeCompute;
    // Self computes, organized by type
    std::vector< SelfCompute > selfComputes;
  };

  // Collection of all computes for each PE
  std::vector< ComputeRecord > computes;

  // For every tuple type, list of tuples
  // NOTE: These are pointers to the data recorded in "computes" and
  // are here to make it easier to traverse across all tuples of a certain kind
  std::array< std::list<Tuples*>, Tuples::NUM_TUPLE_TYPES > tupleList;

  int numTuplesPerType[Tuples::NUM_TUPLE_TYPES];

  AtomMap atomMap;
  std::vector< AtomMapper* > atomMappers;
  /*struct PatchRecord {
    int atomStart;
    int numAtoms;
  };*/
  std::vector<PatchRecord> patches;

  // Patch "patchID" is found in patches[patchIndex[patchID]]
  std::vector<int> patchIndex;

  // Maps multiplicity indices
  std::vector<int> dihedralMultMap;
  std::vector<int> improperMultMap;

  // Number of exclusions per rank, separated into modified and non-modified
  struct NumExcl {
    int numModifiedExclusions;
    int numExclusions;
  };
  std::vector<NumExcl> numExclPerRank;

  // Flags that indicate whether this GPU has exclusions and modified exclusions
  bool hasExclusions;
  bool hasModifiedExclusions;

  // All tuple data
  char* tupleData;
  size_t tupleDataSize;

  std::vector<CudaBondStage> bondTupleData;
  std::vector<CudaAngleStage> angleTupleData;
  std::vector<CudaDihedralStage> dihedralTupleData;
  std::vector<CudaDihedralStage> improperTupleData;
  std::vector<CudaExclusionStage> modifiedExclusionTupleData;
  std::vector<CudaExclusionStage> exclusionTupleData;
  std::vector<CudaCrosstermStage> crosstermTupleData;
  std::vector<CudaTholeStage> tholeTupleData;
  std::vector<CudaAnisoStage> anisoTupleData;
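  // (Host-side staging vectors, one per tuple type; presumably filled when
  //  tuples are (re)built and then copied to the device buffers in bulk.
  //  Hedged inference from the "Stage" naming.)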

  // Bonded CUDA kernel
  ComputeBondedCUDAKernel bondedKernel;
#ifdef NODEGROUP_FORCE_REGISTER
  MigrationBondedCUDAKernel migrationKernel;
#endif // NODEGROUP_FORCE_REGISTER

  // Pointer to the ComputeMgr that created this object
  ComputeMgr* computeMgr;

  // Node-wide counter for patches.
  int patchesCounter;

  // Tuple migration data structures
  double3* h_patchMapCenter;
  double3* d_patchMapCenter;

  PatchRecord* d_patchRecord;
  PatchRecord* h_patchRecord;

  // "Force done event" for event polling
  cudaEvent_t forceDoneEvent;

  // Check counter for event polling
  int checkCount;

  // Node locks
  CmiNodeLock lock;
  CmiNodeLock printLock;

  // This variable is set in atomUpdate() by any PE
  bool atomsChangedIn;
  // This variable is set in doWork() by masterPe
  bool atomsChanged;

  // Maintain two reduction objects for different simulation modes
  SubmitReduction *reductionGpuOffload = nullptr;
  SubmitReduction *reductionGpuResident = nullptr;

  // Required atom storage size
  int atomStorageSize;

  // Flags pointer
  Flags* flags;

  // Lattice, plus per-step flags (energy, virial, slow forces, MOLLY)
  Lattice lattice;
  bool doEnergy;
  bool doVirial;
  bool doSlow;
  bool doMolly;

  // Current step, for alchemical route
  int step;

  // Walltime for force compute start
  double beforeForceCompute;

  bool accelMDdoDihe;

  // Atom storage in pinned host memory
  CudaAtom* atoms;
  size_t atomsSize;

  // Force storage in pinned host memory
  FORCE_TYPE* forces;
  size_t forcesSize;
  int forcesSizeDP;

  double* energies_virials;

  CudaAlchFlags hostAlchFlags;
  CudaAlchParameters hostAlchParameters;
  CudaAlchLambdas hostAlchLambdas;
  int pswitchTable[3*3];
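  // (pswitchTable appears to be a 3x3 lookup keyed by the alchemical
  //  partition of each of the two interacting atoms; hedged inference,
  //  not documented in this header.)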

  void mapAtoms();
  void unmapAtoms();

  void updatePatches();

  static void forceDoneCheck(void *arg, double walltime);
  void forceDoneSetCallback();
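  // (Event-polling sketch, hedged: forceDoneSetCallback() presumably arms a
  //  scheduler callback to forceDoneCheck(), which polls forceDoneEvent and,
  //  once the bonded kernel has completed, lets the patches be finished;
  //  checkCount guards against polling indefinitely.)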

  // ------------ For copyTupleData -------------------
  struct TupleCopyWork {
    int tupletype;
    int ntuples;
    void* tupleElemList;
    int64_t tupleDataPos;
  };

  std::vector<TupleCopyWork> tupleCopyWorkList;
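  // (Each work item presumably describes one batch of tuples of a single
  //  type: the source element list, the tuple count, and the byte offset
  //  tupleDataPos at which the converted CUDA tuples are written into
  //  tupleData by tupleCopyWorker(). Hedged inference from the fields.)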

  int64_t exclusionStartPos;
  int64_t exclusionStartPos2;
  std::vector<CudaBondStage> hostCudaBondStage;

#ifdef NODEGROUP_FORCE_REGISTER
  template <typename T>
  void sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets);
  void sortAndCopyToDevice();
  void migrateTuples(bool startup);

  template <typename T, typename P, typename D>
  void copyTupleToStage(const T& src, const P* __restrict__ p_array, D& dstval);

  template <typename T, typename P, typename D>
  void copyToStage(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, std::vector<D>& dst);

  void copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
    std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2);
#endif

  void copyBondData(const int ntuples, const BondElem* __restrict__ src,
    const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst);

  void copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
    const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst);

  void copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
    const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst);

  template <bool doDihedral, typename T, typename P>
  void copyDihedralData(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, CudaDihedral* __restrict__ dst);

  template <bool doDihedral, typename T, typename P>
  void copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, CudaDihedral* __restrict__ dst);

  void copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
    CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int64_t& pos, int64_t& pos2);
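  // (dst1/dst2 and pos/pos2 presumably correspond to the modified and
  //  non-modified exclusion lists, which are kept as separate tuple streams;
  //  see numExclPerRank above. Hedged inference.)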

  void copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
    const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst);

  void copyTholeData(const int ntuples, const TholeElem* __restrict__ src,
    const TholeValue* __restrict__ thole_array, CudaThole* __restrict__ dst);

  void copyAnisoData(const int ntuples, const AnisoElem* __restrict__ src,
    const AnisoValue* __restrict__ aniso_array, CudaAniso* __restrict__ dst);

  static void tupleCopyWorker(int first, int last, void *result, int paraNum, void *param);
  void tupleCopyWorker(int first, int last);
  static void tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param);
  void tupleCopyWorkerExcl(int first, int last);

#ifdef NODEGROUP_FORCE_REGISTER
  void tupleCopyWorkerType(int tupletype);
#endif
  // --------------------------------------------------

  // Returns the current reduction object, depending on whether the simulation
  // runs in GPU-resident or GPU-offload mode
  SubmitReduction* getCurrentReduction();

public:

  ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID, CudaNonbondedTables& cudaNonbondedTables);
  ~ComputeBondedCUDA();
  void registerCompute(int pe, int type, PatchIDList& pids);
  void registerSelfCompute(int pe, int type, int pid);
  void unregisterBoxesOnPe();
  void assignPatchesOnPe();
  virtual void patchReady(PatchID, int doneMigration, int seq);
  virtual void initialize();
  virtual void atomUpdate();
  virtual int noWork();
  virtual void doWork();
  void messageEnqueueWork();
  // void updatePatches();
  void openBoxesOnPe(int startup = 1);
  void loadTuplesOnPe(const int startup = 1);
  void copyTupleData();
  void copyTupleDataSN();
  void launchWork();
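  // (Roughly, a force step is expected to proceed as: doWork() on the master
  //  PE enqueues openBoxesOnPe()/loadTuplesOnPe() on the participating PEs,
  //  copyTupleData() packs tuples into device buffers, launchWork() launches
  //  the bonded kernel, and after forceDoneEvent completes,
  //  finishPatchesOnPe()/finishReductions() close out the step.
  //  Hedged description of the flow, not stated in this header.)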
  void updateCudaAlchParameters();

  void updateHostCudaAlchFlags();
  void updateKernelCudaAlchFlags();
  void updateHostCudaAlchParameters();
  void updateKernelCudaAlchParameters();
  void updateHostCudaAlchLambdas();
  void updateKernelCudaAlchLambdas();

#ifdef NODEGROUP_FORCE_REGISTER
  void updatePatchRecords();
  void updateMaxTupleCounts(TupleCounts counts);
  TupleCounts getMaxTupleCounts();
  void registerPointersToHost();
  void copyHostRegisterToDevice();
  void copyPatchData();
  void copyTupleDataGPU(const int startup);
  void updatePatchOrder(const std::vector<CudaLocalRecord>& data);
#endif // NODEGROUP_FORCE_REGISTER

  void finishPatchesOnPe();
  void finishPatches();
  void finishReductions();

  std::vector<int>& getBondedPes(void) { return pes; }

  std::vector<PatchRecord>& getPatches() { return patches; }
};

#endif // BONDED_CUDA
#endif // NAMD_CUDA
#endif // COMPUTEBONDEDCUDA_H