NAMD
SequencerCUDA.h
Go to the documentation of this file.
1 #ifndef SEQUENCERCUDA_H
2 #define SEQUENCERCUDA_H
3 
4 #ifdef NAMD_CUDA
5 #include <curand.h>
6 //**< Field to be used for external force, energy, and virial */
7 #endif
8 
9 #ifdef NAMD_HIP
10 #include <hiprand/hiprand.h>
11 #endif
12 
13 #include "NamdTypes.h"
14 #include "HomePatch.h"
15 #include "PatchTypes.h"
16 #include "ProcessorPrivate.h"
17 #include "ReductionMgr.h"
18 #include "SimParameters.h"
19 #include "SequencerCUDAKernel.h"
20 #include "MShakeKernel.h"
21 #include "ComputeRestraintsCUDA.h"
22 #include "ComputeSMDCUDA.h"
24 #include "ComputeGridForceCUDA.h"
25 #include "ComputeLonepairsCUDA.h"
26 #include "ComputeConsForceCUDA.h"
27 #include "PatchData.h"
28 #include "Lattice.h"
29 #include "MigrationCUDAKernel.h"
30 #include "HipDefines.h"
31 
32 #ifdef NODEGROUP_FORCE_REGISTER
33 
// Tags identifying the sources of external forces handled by the GPU
// sequencer (see SequencerCUDA::calculateExternalForces). The numeric
// values index per-source slots, so the ordering is part of the layout;
// EXT_FORCE_TOTAL is the slot count, not a real source.
enum ExtForceTag {
  EXT_CONSTRAINTS      = 0,  // positional constraints/restraints
  EXT_ELEC_FIELD       = 1,  // external electric field
  EXT_SMD              = 2,  // steered molecular dynamics (SMD)
  EXT_GROUP_RESTRAINTS = 3,  // group restraints
  EXT_GRIDFORCE        = 4,  // grid-based forces
  EXT_CONSFORCE        = 5,  // constant forces
  EXT_GLOBALMTS        = 6,  // global multiple-time-step forces
  EXT_FORCE_TOTAL      = 7   // number of external-force slots (sentinel)
};
44 
/**
 * GPU-resident integration sequencer (one instance per PE / device rank).
 *
 * Owns the device-side SOA arrays for positions, velocities and forces of
 * the patches assigned to this device, and drives the launch_part* kernel
 * pipeline: velocity-Verlet integration, pairlist/margin checks, GPU atom
 * migration, rigid-bond constraints (SETTLE/RATTLE), Monte Carlo barostat
 * trials, external forces, and reduction submission.
 *
 * NOTE(review): this listing was recovered from generated documentation;
 * a few declaration lines appear to have been dropped by the extraction
 * (see finish_part1 below, and some launch_part* parameter lists) —
 * verify exact signatures against the original header.
 */
class SequencerCUDA{
  friend class HomePatch;
  friend class Sequencer;
public:
  // Charm++ per-PE singleton access.
  static SequencerCUDA *InstanceInit(const int, SimParameters*);
  inline static SequencerCUDA *Object() { return CkpvAccess(SequencerCUDA_instance); }
  inline static SequencerCUDA *ObjectOnPe(int pe) { return CkpvAccessOther(SequencerCUDA_instance, CmiRankOf(pe)); }

  // Patch check-in bookkeeping and the CthThreads suspended on this device.
  int numPatchesCheckedIn;
  int numPatchesReady;
  std::vector<CthThread> waitingThreads;
  CthThread masterThread;
  bool masterThreadSleeping = false;
  bool breakSuspends = false;

  SequencerCUDA(const int, SimParameters*);
  ~SequencerCUDA();
  void initialize();
  void zeroScalars();

  // (Re)allocation of the per-atom SOA buffers; returns whether buffers
  // were actually reallocated — TODO confirm return semantics.
  bool reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy);
  void reallocateMigrationDestination();
  void deallocateArrays();
  void deallocateStaticArrays();

  // Host <-> device transfers (AoS and SOA paths).
  void copyAoSDataToHost();
  void copyPatchDataToHost();
  void copyAtomDataToDeviceAoS();

  void copyAtomDataToDevice(bool copyForces, int maxForceNumber);

  bool copyPatchData(const bool copyIn, const bool startup);
  void copyDataToPeers(const bool copyIn);

  // GPU atom-migration phases.
  void migrationLocalInit();
  void migrationPerform();
  void migrationLocalPost(int startup);
  void migrationUpdateAdvancedFeatures(const int startup);
  void migrationUpdateAtomCounts();
  void migrationUpdateAtomOffsets();
  void migrationUpdateRemoteOffsets();
  void migrationUpdateProxyDestination();
  void migrationUpdateDestination();
  void migrationSortAtomsNonbonded();
  void sync();

  void copyMigrationInfo(HomePatch *p, int patchIndex);

  void assembleOrderedPatchList();

  // Monte Carlo barostat: trial scaling (part1), re-evaluation (part2),
  // then either reject or accept.
  void monteCarloPressure_part1(Tensor &factor, Vector &origin, Lattice &oldLattice);
  void monteCarloPressure_part2(NodeReduction *reduction, int step, int maxForceNumber,
      const bool doEnergy, const bool doGloblal, const bool doVirial); // (sic: "doGloblal" typo preserved from source)

  void monteCarloPressure_reject(Lattice &lattice);
  void monteCarloPressure_accept(NodeReduction *reduction, const int doMigration);

  // Symmetrize t in place by copying the lower-triangle elements
  // (yx, zx, zy) over the upper triangle (xy, xz, yz).
  inline static void tensor_enforce_symmetry(Tensor& t) {
    t.xy = t.yx;
    t.xz = t.zx;
    t.yz = t.zy;
  }

  // First half-kick + drift portion of the integration step.
  void launch_part1(
      int step,
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      double velrescaling,
      const double maxvel2,
      Tensor &factor,
      Vector &origin,
      Lattice &lattice,
      int reassignVelocitiesStep,
      int langevinPistonStep,
      int berendsenPressureStep,
      int maxForceNumber,
      const int copyIn,
      const int savePairlists,
      const int usePairlists,
      const bool doEnergy);

  void launch_part11(
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      double velrescaling,
      const double maxvel2,
      Tensor &factor,
      Vector &origin,
      Lattice &lattice,
      int langevinPistonStep,
      int maxForceNumber,
      const int copyIn,
      const int savePairlists,
      const int usePairlists,
      const bool doEnergy);

  void launch_set_compute_positions();

  // Force accumulation + second half-kick portion of the step.
  void launch_part2(
      const int doMCPressure,
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector &origin,
      int step,
      int maxForceNumber,
      const int langevinPistonStep,
      const int copyIn,
      const int copyOut,
      const int doGlobal,
      const bool doEnergy);

  void launch_part3(
      const int doMCPressure,
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector &origin,
      int step,
      int maxForceNumber,
      const bool requestGlobalForces,
      const int doGlobalStaleForces,
      const bool forceRequestedGPU,
      const int copyIn,
      const int copyOut,
      const bool doEnergy,
      const bool requestForcesOutput);

  void copyGlobalForcesToDevice();

  void copySettleParameter();
  void finish_part1(const int copyIn,
      const int savePairlists,
      const int usePairlists,
      // NOTE(review): the remainder of this declaration (final parameter(s)
      // and the closing ");") was lost in extraction — restore from the
      // original header.

  void update_patch_flags();
  void finish_patch_flags(int isMigration);
  void updatePairlistFlags(const int doMigration);
  void updateDeviceKernels();

  // For total forces on CUDA global master
  void allocateGPUSavedForces();

  cudaStream_t stream, stream2;
  PatchData *patchData;
  friend class CudaGlobalMasterServer;
private:
  const int deviceID;
  bool mGpuOn; /*!< True if nDevices > 1 */
  int nDevices; /*!< Total number of GPUs in the simulation -- number of MasterPEs */


  int deviceIndex;
  cudaEvent_t stream2CopyDone, stream2CopyAfter;
  SimParameters *const simParams;

  std::vector<AtomMap*> atomMapList;

  // Migration Structures
  FullAtom * d_atomdata_AoS_in;
  FullAtom * d_atomdata_AoS;

  int *d_sortIndex;
  int *d_sortSoluteIndex;
  int4 *d_migrationDestination;

  int *d_migrationGroupSize;
  int *d_migrationGroupIndex;

  int *d_idMig;
  int *d_vdwType;

  int *idMig;
  int *vdwType;

  // Device arrays
#if 1
  double *d_recipMass;
#else
  float *d_recipMass; // mass should be float
#endif
  double *d_f_raw; // raw buffer for all arrays for faster setting -> broadcasting
  double *d_f_normal_x, *d_f_normal_y, *d_f_normal_z;
  double *d_f_nbond_x, *d_f_nbond_y, *d_f_nbond_z;
  double *d_f_slow_x, *d_f_slow_y, *d_f_slow_z;
  double *d_vel_x, *d_vel_y, *d_vel_z;
  double *d_pos_raw;
  double *d_pos_x, *d_pos_y, *d_pos_z;
  // for fixed atoms
  double *d_fixedPosition_x, *d_fixedPosition_y, *d_fixedPosition_z;

  double *d_f_saved_nbond_x, *d_f_saved_nbond_y, *d_f_saved_nbond_z;
  double *d_f_saved_slow_x, *d_f_saved_slow_y, *d_f_saved_slow_z;

  double *d_posNew_raw;
  double *d_posNew_x, *d_posNew_y, *d_posNew_z;

  double *d_f_global_x, *d_f_global_y, *d_f_global_z;

  double *d_rcm_x, *d_rcm_y, *d_rcm_z;
  double *d_vcm_x, *d_vcm_y, *d_vcm_z;

  // backup forces for monte carlo barostat
  double *d_f_rawMC; // raw buffer for all backup force array in MC barostat
  double *d_pos_rawMC; // raw buffer for all backup position array in MC barostat
  double *d_f_normalMC_x, *d_f_normalMC_y, *d_f_normalMC_z;
  double *d_f_nbondMC_x, *d_f_nbondMC_y, *d_f_nbondMC_z;
  double *d_f_slowMC_x, *d_f_slowMC_y, *d_f_slowMC_z;
  // backup positions for monte carlo barostat
  double *d_posMC_x, *d_posMC_y, *d_posMC_z;

  int *d_id; /*!< global atom index */
  int *d_idOrder; /*!< Maps global atom index to local */
  int *d_moleculeStartIndex; /*!< starting index of each molecule */
  int *d_moleculeAtom; /*!< Atom index sorted for all molecules */

  // XXX TODO: Maybe we can get away with not allocating these arrays
  double *d_velNew_x, *d_velNew_y, *d_velNew_z;
  double *d_posSave_x, *d_posSave_y, *d_posSave_z;
  char3 *d_transform;

  int *d_patchOffsetTemp;

  float *d_rigidBondLength;
  int *d_atomFixed;
  int *d_groupFixed;
  float *d_mass;
  float *d_charge;
  int *d_partition;
  float *d_langevinParam;
  float *d_langScalVelBBK2;
  float *d_langScalRandBBK2;
  float *d_gaussrand_x, *d_gaussrand_y, *d_gaussrand_z;
  int *d_hydrogenGroupSize;
  int *d_consFailure;
  size_t d_consFailureSize;
  int *settleList; /*!< Indexes of HGs to be calculated through SETTLE */
  size_t settleListSize;
  CudaRattleElem* rattleList; /*!< Indexes of HGs with rigid bonds to be calculated through RATTLE */
  size_t rattleListSize;
  int* d_globalToLocalID; /*!< maps PatchID to localID inside SOA structures */
  int* d_patchToDeviceMap; /*!< Maps a patch to a device */
  double3* d_patchCenter; /*!< patch centers on SOA datastructure */
  double3* d_awayDists; /*!< 'away' field from marginCheck */
  double3* d_patchMin; /*!< 'min' field from HomePatch */
  double3* d_patchMax; /*!< 'max' field from HomePatch */
  double* d_patchMaxAtomMovement; /*!< maxMovement of atoms in the patch */
  double* d_patchNewTolerance; /*!< pairlist tolerance per patch */
  unsigned int* d_tbcatomic; // presumably thread-block-count atomics for device-wide sync — TODO confirm

  PatchDataSOA* d_HostPatchDataSOA;

  // Host arrays (mirrors of the device SOA buffers)
  double *recipMass;
  double *f_global_x, *f_global_y, *f_global_z;
  double *f_normal_x, *f_normal_y, *f_normal_z;
  double *f_nbond_x, *f_nbond_y, *f_nbond_z;
  double *f_slow_x, *f_slow_y, *f_slow_z;
  double *vel_x, *vel_y, *vel_z;
  double *pos_x, *pos_y, *pos_z;
  char3 *transform;

  float *mass;
  float *charge;
  int *partition;
  float *langevinParam;
  float *langScalVelBBK2;
  float *langScalRandBBK2;
  //float *gaussrand_x, *gaussrand_y, *gaussrand_z;
  int *hydrogenGroupSize; /*!< hydrogen-group size for each heavy atom. Hydrogens have 0 */
  float *rigidBondLength;
  // for fixed atoms
  int* atomFixed;
  int* groupFixed; // fixed atom + langevin piston
  double* fixedPosition_x;
  double* fixedPosition_y;
  double* fixedPosition_z;
  int* globalToLocalID; /*!< maps PatchID to localID inside SOA structure */
  int* patchToDeviceMap; /*!< maps a patch to a device */
  int* id; /*!< Global atom index */
  double3* patchCenter;
  double3* awayDists;
  double3* patchMin;
  double3* patchMax;
  double* patchMaxAtomMovement;
  double* patchNewTolerance;
  int* computeNbondPosition;
  int* sortOrder;
  int* unsortOrder;
  Lattice* pairlist_lattices;
  double pairlist_newTolerance;
  Lattice myLattice;
  Lattice myLatticeOld;

  CudaMInfo *mInfo;

  // Host-Mapped scalars (written by the GPU, read by the host)
  int* killme;
  BigReal* kineticEnergy_half;
  BigReal* intKineticEnergy_half;
  BigReal* kineticEnergy;
  BigReal* intKineticEnergy;
  BigReal* momentum_x;
  BigReal* momentum_y;
  BigReal* momentum_z;
  BigReal* angularMomentum_x;
  BigReal* angularMomentum_y;
  BigReal* angularMomentum_z;
  cudaTensor *virial;
  cudaTensor *virial_half;
  cudaTensor *intVirialNormal;
  cudaTensor *intVirialNormal_half;
  cudaTensor *rigidVirial;
  cudaTensor *intVirialNbond;
  cudaTensor *intVirialSlow;
  cudaTensor *lpVirialNormal; // lone-pair virial contributions
  cudaTensor *lpVirialNbond;
  cudaTensor *lpVirialSlow;

  // Per-source external force/energy/virial (indexed by ExtForceTag — TODO confirm)
  double3 *extForce;
  double *extEnergy;
  cudaTensor *extVirial;

  unsigned int *h_marginViolations;
  unsigned int *h_periodicCellSmall;

  unsigned int totalMarginViolations;

  // Device scalars
  bool buildRigidLists;
  int *d_killme;
  BigReal *d_kineticEnergy;
  BigReal *d_intKineticEnergy;
  cudaTensor *d_virial;
  cudaTensor *d_intVirialNormal;
  cudaTensor *d_intVirialNbond;
  cudaTensor *d_intVirialSlow;
  cudaTensor *d_rigidVirial;
  cudaTensor *d_lpVirialNormal;
  cudaTensor *d_lpVirialNbond;
  cudaTensor *d_lpVirialSlow;
  // cudaTensor *d_extTensor;
  BigReal *d_momentum_x;
  BigReal *d_momentum_y;
  BigReal *d_momentum_z;
  BigReal *d_angularMomentum_x;
  BigReal *d_angularMomentum_y;
  BigReal *d_angularMomentum_z;
  Lattice *d_lattices;
  Lattice *d_pairlist_lattices;

  double3 *d_extForce;
  double *d_extEnergy;
  cudaTensor *d_extVirial;
  CudaMInfo *d_mInfo;

  // Haochuan: for fixed atoms
  cudaTensor *d_fixVirialNormal;
  cudaTensor *d_fixVirialNbond;
  cudaTensor *d_fixVirialSlow;
  double3 *d_fixForceNormal;
  double3 *d_fixForceNbond;
  double3 *d_fixForceSlow;

  int *d_sortOrder;
  int *d_unsortOrder;

  // Atom/patch counts for this device.
  int numAtomsHome;
  int numAtomsHomePrev;
  int numAtomsHomeAllocated;
  int numAtomsHomeAndProxy;
  int numAtomsHomeAndProxyAllocated;

  int numPatchesGlobal;
  int numPatchesHomeAndProxy;
  int numPatchesHome;

  int marginViolations;
  bool rescalePairlistTolerance;
  int nSettle, nRattle;
  BigReal maxvel2;
  SettleParameters *sp;
  CudaAtom** cudaAtomLists;

  CmiNodeLock printlock;

  cudaEvent_t eventStart, eventStop; // events for kernel timing
  float t_total;

  // Launch_pt1 timers
  float t_vverlet;
  float t_pairlistCheck;
  float t_setComputePositions;

  // Launch_pt2 timers
  float t_accumulateForceKick;
  float t_rattle;
  float t_submitHalf;
  float t_submitReductions1;
  float t_submitReductions2;

  // True if system is periodic in all directions
  bool isPeriodic;

  std::vector<HomePatch*> patchList;

#if 1
  // TODO remove once GPU migration is merged
  std::vector<HomePatch*> patchListHomeAndProxy;
#endif

  int* consFailure;
  unsigned long long int d_ullmaxtol;

  // Kernel/compute helper objects owned by this sequencer.
  SequencerCUDAKernel *CUDASequencerKernel;
  MigrationCUDAKernel *CUDAMigrationKernel;
  ComputeRestraintsCUDA *restraintsKernel;
  ComputeSMDCUDA *SMDKernel;
  ComputeGroupRestraintsCUDA *groupRestraintsKernel;
  ComputeGridForceCUDA *gridForceKernel;
  ComputeLonepairsCUDA *lonepairsKernel;
  curandGenerator_t curandGen; // cuRAND generator for Langevin gaussian noise — TODO confirm use
  ComputeConsForceCUDA *consForceKernel;
  // alchemical used PME grids
  size_t num_used_grids;
  std::vector<int> used_grids;

#if 1
  unsigned int* deviceQueue; // pointer to work queue this device holds

  // if we have more than one device running this, we keep a record of all devices' data here
  double** d_peer_pos_x;
  double** d_peer_pos_y;
  double** d_peer_pos_z;
  float** d_peer_charge;
  int** d_peer_partition;
  double** d_peer_vel_x;
  double** d_peer_vel_y;
  double** d_peer_vel_z;
  double** d_peer_fb_x; // peer bonded ("normal") force buffers — TODO confirm naming
  double** d_peer_fb_y;
  double** d_peer_fb_z;
  double** d_peer_fn_x; // peer nonbonded force buffers — TODO confirm naming
  double** d_peer_fn_y;
  double** d_peer_fn_z;
  double** d_peer_fs_x; // peer slow (long-range) force buffers — TODO confirm naming
  double** d_peer_fs_y;
  double** d_peer_fs_z;
  bool** h_patchRecordHasForces;
  bool** d_patchRecordHasForces;
  char* d_barrierFlag;

  // GPU Migration peer buffers

  int4** d_peer_migrationDestination;
  int** d_peer_sortSoluteIndex;

  int** d_peer_id;
  int** d_peer_vdwType;
  int** d_peer_sortOrder;
  int** d_peer_unsortOrder;
  double3** d_peer_patchCenter;

  FullAtom** d_peer_atomdata;
  CudaLocalRecord** d_peer_record;

#endif

  // Abort the run if any atom velocity squared exceeds maxvel2.
  void maximumMove(
      const double maxvel2,
      const int numAtoms);
  // Submit half-step kinetic energy / virial terms.
  void submitHalf(
      NodeReduction *reduction,
      int numAtoms, int part,
      const bool doEnergy);
  void submitReductions(
      NodeReduction *reduction,
      BigReal origin_x,
      BigReal origin_y,
      BigReal origin_z,
      int marginViolations,
      int doEnergy, // doEnergy
      int doMomentum,
      int numAtoms,
      int maxForceNumber);

  void submitReductionValues();
  void copyPositionsAndVelocitiesToHost(bool copyOut, const int doGlobal);
  void copyPositionsToHost();

  // First-timestep startup, split into three phases.
  void startRun1(int maxForceNumber, const Lattice& lat);
  void startRun2(
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector origin,
      NodeReduction *reduction,
      int doGlobal,
      int maxForceNumber);
  void startRun3(
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector origin,
      NodeReduction *reduction,
      const bool requestGlobalForces,
      int doGlobalMasterStateForces,
      const bool requestForcesOutput,
      const bool requestGlobalForcesGPU,
      int maxForceNumber);

  // TIP4 water model
  void redistributeTip4pForces(
      NodeReduction *reduction,
      const int maxForceNumber,
      const int doVirial);

  // Debug printing helpers.
  void printSOAForces(char *);
  void printSOAPositionsAndVelocities();
  void registerSOAPointersToHost();
  void copySOAHostRegisterToDevice();

  // Evaluate all enabled external force terms (see ExtForceTag).
  void calculateExternalForces(
      const int step,
      NodeReduction *reduction,
      const int maxForceNumber,
      const int doEnergy,
      const int doVirial);

  void atomUpdatePme();

  void updateHostPatchDataSOA();
  void saveForceCUDASOA_direct(
      const bool doGlobal,
      const bool doForcesOutput,
      const int maxForceNumber);
  void copyPositionsToHost_direct();

  // Simple accessors for per-device patch geometry data.
  int getNumPatchesHome() { return numPatchesHome; }

  double3* getHostPatchMin() { return patchMin; }
  double3* getHostPatchMax() { return patchMax; }
  double3* getHostAwayDists() { return awayDists; }
};
598 
599 #endif // NAMD_CUDA
600 #endif // SEQUENCERCUDA_H
BigReal zy
Definition: Tensor.h:19
friend class SequencerCUDA
Definition: Sequencer.h:47
SubmitReduction * reduction
Definition: Sequencer.h:322
BigReal xz
Definition: Tensor.h:17
static void partition(int *order, const FullAtom *atoms, int begin, int end)
Definition: SortAtoms.C:45
Definition: Vector.h:72
bool masterThread
Definition: Sequencer.h:329
BigReal yz
Definition: Tensor.h:18
BigReal yx
Definition: Tensor.h:18
#define simParams
Definition: Output.C:129
Definition: Tensor.h:15
BigReal xy
Definition: Tensor.h:17
BigReal zx
Definition: Tensor.h:19
double BigReal
Definition: common.h:123
A class for copying atom information from SequencerCUDA to CudaGlobalMasterClient.