NAMD
ComputeBondedCUDA.h
#ifndef COMPUTEBONDEDCUDA_H
#define COMPUTEBONDEDCUDA_H
#include "Compute.h"
#include "ComputeAniso.h"
#include "ComputeMap.h"
#include "ComputeThole.h"
#include "CudaNonbondedTables.h"
#include "ComputeBondedCUDAKernel.h"    // complete type for the bondedKernel member below
#include "MigrationBondedCUDAKernel.h"  // complete type for the migrationKernel member below
#include "ComputeHomeTuples.h"
#include "TupleTypesCUDA.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#ifdef BONDED_CUDA

#include <vector>
#include <array>
#include <list>    // std::list used in tupleList below
#include <atomic>  // std::atomic used for tupleWorkIndex below
class ComputeBondedCUDA : public Compute {

public:

  static const int CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES];
  static const int CudaTupleTypeSizeStage[Tuples::NUM_TUPLE_TYPES];

private:
  bool initializeCalled;
  SimParameters *params;
  // Device ID and stream
  const int deviceID;
  cudaStream_t stream;
#ifdef NODEGROUP_FORCE_REGISTER
  std::atomic<int> tupleWorkIndex;
#endif

  // Master PE for this compute
  const int masterPe;

  // List of all patch IDs on this object
  std::vector<int> allPatchIDs;

  // List of tuple patches for the entire compute (i.e. across all PEs)
  TuplePatchList tuplePatchList;

  // For every PE, list of patches that it has registered
  std::vector< std::vector<int> > patchIDsPerRank;

  // List of PEs involved in the computation
  std::vector<int> pes;

  // Self compute
  struct SelfCompute {
    int type;
    std::vector<int> patchIDs;
    Tuples* tuples;
    SelfCompute(int type=-1) : type(type), tuples(NULL) {}
    int operator==(const SelfCompute &elem) const {
      return (elem.type == type);
    }
  };
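  // Illustrative note (not in the original header): operator== compares only
  // the tuple type, so a std::find over selfComputes locates the record for a
  // given type regardless of which patches it covers.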

  // Home compute, each PE has one
  struct HomeCompute {
    std::vector<char> isBasePatch;
    std::vector<int> patchIDs;
    // Multiple tuples per PE, each of a different kind
    std::vector< Tuples* > tuples;
  };

  // Computes for each PE
  struct ComputeRecord {
    HomeCompute homeCompute;
    // Self computes, organized by type
    std::vector< SelfCompute > selfComputes;
  };

  // Collection of all computes for each PE
  std::vector< ComputeRecord > computes;

  // For every tuple type, list of tuples
  // NOTE: These are pointers to the data recorded in "computes" and
  // are here to make it easier to traverse across all tuples of a certain kind
  std::array< std::list<Tuples*>, Tuples::NUM_TUPLE_TYPES > tupleList;
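  // Illustrative example (not in the original header): with this layout all
  // tuples of one kind can be visited without walking "computes", e.g.
  //   for (Tuples* t : tupleList[tupletype]) { /* ... */ }
  // where tupletype is one of the Tuples::NUM_TUPLE_TYPES indices.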

  int numTuplesPerType[Tuples::NUM_TUPLE_TYPES];

  AtomMap atomMap;
  std::vector< AtomMapper* > atomMappers;

  /*struct PatchRecord {
    int atomStart;
    int numAtoms;
  };*/
  std::vector<PatchRecord> patches;

  // Patch "patchID" is found in patches[patchIndex[patchID]]
  std::vector<int> patchIndex;
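  // Illustrative lookup (not in the original header):
  //   PatchRecord& pr = patches[patchIndex[patchID]];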

  // Maps multiplicity indices
  std::vector<int> dihedralMultMap;
  std::vector<int> improperMultMap;

  // Number of exclusions per rank, separated into modified and non-modified
  struct NumExcl {
    int numModifiedExclusions;
    int numExclusions;
  };
  std::vector<NumExcl> numExclPerRank;

  // Flags that indicate whether this GPU has exclusions and modified exclusions
  bool hasExclusions;
  bool hasModifiedExclusions;

  // All tuple data
  char* tupleData;
  size_t tupleDataSize;

  std::vector<CudaBondStage> bondTupleData;
  std::vector<CudaAngleStage> angleTupleData;
  std::vector<CudaDihedralStage> dihedralTupleData;
  std::vector<CudaDihedralStage> improperTupleData;
  std::vector<CudaExclusionStage> modifiedExclusionTupleData;
  std::vector<CudaExclusionStage> exclusionTupleData;
  std::vector<CudaCrosstermStage> crosstermTupleData;
  std::vector<CudaTholeStage> tholeTupleData;
  std::vector<CudaAnisoStage> anisoTupleData;
  std::vector<CudaOneFourNbTholeStage> oneFourNbTholeTupleData;

  // Bonded CUDA kernel
  ComputeBondedCUDAKernel bondedKernel;
#ifdef NODEGROUP_FORCE_REGISTER
  MigrationBondedCUDAKernel migrationKernel;
#endif // NODEGROUP_FORCE_REGISTER

  // Pointer to computeMgr that created this object
  ComputeMgr* computeMgr;

  // Node-wide counter for patches
  int patchesCounter;

  // Tuple migration data structures
  double3* h_patchMapCenter;
  double3* d_patchMapCenter;

  PatchRecord* d_patchRecord;
  PatchRecord* h_patchRecord;

  // "Force done event" for event polling
  cudaEvent_t forceDoneEvent;

  // Check counter for event polling
  int checkCount;

  // Node lock
  CmiNodeLock lock;
  CmiNodeLock printLock;

  // This variable is set in atomUpdate() by any PE
  bool atomsChangedIn;
  // This variable is set in doWork() by masterPe
  bool atomsChanged;

  // Maintain two reduction objects for different simulation modes
  SubmitReduction *reductionGpuOffload = nullptr;
  SubmitReduction *reductionGpuResident = nullptr;

  // Required storage
  int atomStorageSize;

  // Flags pointer
  Flags* flags;

  // Lattice and energy/virial flags
  Lattice lattice;
  bool doEnergy;
  bool doVirial;
  bool doSlow;
  bool doMolly;

  // Current step, for alchemical route
  int step;

  // Walltime for force compute start
  double beforeForceCompute;

  bool accelMDdoDihe;

  // Atom storage in pinned host memory
  CudaAtom* atoms;
  size_t atomsSize;

  // Force storage in pinned host memory
  FORCE_TYPE* forces;
  size_t forcesSize;
  int forcesSizeDP;

  double* energies_virials;

  CudaAlchFlags hostAlchFlags;
  CudaAlchParameters hostAlchParameters;
  CudaAlchLambdas hostAlchLambdas;
  int pswitchTable[3*3];

  void mapAtoms();
  void unmapAtoms();

  void updatePatches();

  static void forceDoneCheck(void *arg, double walltime);
  void forceDoneSetCallback();

  // ------------ For copyTupleData -------------------
  struct TupleCopyWork {
    int tupletype;
    int ntuples;
    void* tupleElemList;
    int64_t tupleDataPos;
  };
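  // Illustrative note (not in the original header): each TupleCopyWork entry
  // describes one chunk of tuples of a single type (its element list, count,
  // and destination offset within tupleData); the entries collected in
  // tupleCopyWorkList below are consumed by tupleCopyWorker().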

  std::vector<TupleCopyWork> tupleCopyWorkList;

  int64_t exclusionStartPos;
  int64_t exclusionStartPos2;
  std::vector<CudaBondStage> hostCudaBondStage;

#ifdef NODEGROUP_FORCE_REGISTER
  template <typename T>
  void sortTupleList(std::vector<T>& tuples, std::vector<int>& tupleCounts, std::vector<int>& tupleOffsets);
  void sortAndCopyToDevice();
  void migrateTuples(bool startup);

  template <typename T, typename P, typename D>
  void copyTupleToStage(const T& src, const P* __restrict__ p_array, D& dstval);

  template <typename T, typename P, typename D>
  void copyToStage(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, std::vector<D>& dst);

  void copyExclusionDataStage(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
    std::vector<CudaExclusionStage>& dst1, std::vector<CudaExclusionStage>& dst2, int64_t& pos, int64_t& pos2);
#endif

  void copyBondData(const int ntuples, const BondElem* __restrict__ src,
    const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst);

  void copyBondDatafp32(const int ntuples, const BondElem* __restrict__ src,
    const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst);

  void copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
    const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst);

  template <bool doDihedral, typename T, typename P>
  void copyDihedralData(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, CudaDihedral* __restrict__ dst);

  template <bool doDihedral, typename T, typename P>
  void copyDihedralDatafp32(const int ntuples, const T* __restrict__ src,
    const P* __restrict__ p_array, CudaDihedral* __restrict__ dst);

  void copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
    CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int64_t& pos, int64_t& pos2);

  void copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
    const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst);

  void copyTholeData(const int ntuples, const TholeElem* __restrict__ src,
    const TholeValue* __restrict__ thole_array, CudaThole* __restrict__ dst);

  void copyAnisoData(const int ntuples, const AnisoElem* __restrict__ src,
    const AnisoValue* __restrict__ aniso_array, CudaAniso* __restrict__ dst);

  void copyOneFourNbTholeData(const int ntuples, const OneFourNbTholeElem* __restrict__ src,
    const OneFourNbTholeValue* __restrict__ one_four_nbthole_array,
    CudaOneFourNbThole* __restrict__ dst);

  static void tupleCopyWorker(int first, int last, void *result, int paraNum, void *param);
  void tupleCopyWorker(int first, int last);
  static void tupleCopyWorkerExcl(int first, int last, void *result, int paraNum, void *param);
  void tupleCopyWorkerExcl(int first, int last);

#ifdef NODEGROUP_FORCE_REGISTER
  void tupleCopyWorkerType(int tupletype);
#endif
  // --------------------------------------------------

  // Returns the current reduction object, depending on whether the simulation
  // is running in GPU-resident or GPU-offload mode
  SubmitReduction* getCurrentReduction();
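  // Illustrative expectation (not in the original header): in GPU-resident
  // mode this would hand back reductionGpuResident, otherwise
  // reductionGpuOffload, matching the two reduction members above.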

public:

  ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID, CudaNonbondedTables& cudaNonbondedTables);
  ~ComputeBondedCUDA();
  void registerCompute(int pe, int type, PatchIDList& pids);
  void registerSelfCompute(int pe, int type, int pid);
  void unregisterBoxesOnPe();
  void assignPatchesOnPe();
  virtual void patchReady(PatchID, int doneMigration, int seq);
  virtual void initialize();
  virtual void atomUpdate();
  virtual int noWork();
  virtual void doWork();
  void messageEnqueueWork();
  // void updatePatches();
  void openBoxesOnPe(int startup = 1);
  void loadTuplesOnPe(const int startup = 1);
  void copyTupleData();
  void copyTupleDataSN();
  void launchWork();
  void updateCudaAlchParameters();

  void updateHostCudaAlchFlags();
  void updateKernelCudaAlchFlags();
  void updateHostCudaAlchParameters();
  void updateKernelCudaAlchParameters();
  void updateHostCudaAlchLambdas();
  void updateKernelCudaAlchLambdas();

#ifdef NODEGROUP_FORCE_REGISTER
  void updatePatchRecords();
  void updateMaxTupleCounts(TupleCounts counts);
  TupleCounts getMaxTupleCounts();
  void registerPointersToHost();
  void copyHostRegisterToDevice();
  void copyPatchData();
  void copyTupleDataGPU(const int startup);
  void updatePatchOrder(const std::vector<CudaLocalRecord>& data);
#endif // NODEGROUP_FORCE_REGISTER

  void finishPatchesOnPe();
  void finishPatches();
  void finishReductions();

  std::vector<int>& getBondedPes(void) { return pes; }

  std::vector<PatchRecord>& getPatches() { return patches; }
};

#endif // BONDED_CUDA
#endif // NAMD_CUDA || NAMD_HIP
#endif // COMPUTEBONDEDCUDA_H