// NAMD: SequencerCUDA.h
#ifndef SEQUENCERCUDA_H
#define SEQUENCERCUDA_H

#ifdef NAMD_CUDA
#include <curand.h>
// Fields to be used for external force, energy, and virial
#endif

#ifdef NAMD_HIP
#include <hiprand/hiprand.h>
#endif

#include "NamdTypes.h"
#include "HomePatch.h"
#include "PatchTypes.h"
#include "ProcessorPrivate.h"
#include "ReductionMgr.h"
#include "SimParameters.h"
#include "SequencerCUDAKernel.h"
#include "MShakeKernel.h"
#include "ComputeRestraintsCUDA.h"
#include "ComputeSMDCUDA.h"
#include "ComputeGroupRestraintsCUDA.h"
#include "ComputeGridForceCUDA.h"
#include "ComputeLonepairsCUDA.h"
#include "ComputeConsForceCUDA.h"
#include "PatchData.h"
#include "Lattice.h"
#include "MigrationCUDAKernel.h"
#include "HipDefines.h"

#ifdef NODEGROUP_FORCE_REGISTER

enum ExtForceTag {
  EXT_CONSTRAINTS = 0,
  EXT_ELEC_FIELD,
  EXT_SMD,
  EXT_GROUP_RESTRAINTS,
  EXT_GRIDFORCE,
  EXT_CONSFORCE,
  EXT_GLOBALMTS,
  EXT_FORCE_TOTAL
};
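
// EXT_FORCE_TOTAL follows the usual "last enumerator equals the count" idiom,
// so it can size per-source accumulation buffers. A minimal sketch (an
// assumption; the extForce/extEnergy/extVirial members declared in the class
// below are plausibly indexed by these tags):
//   double     extEnergy[EXT_FORCE_TOTAL] = {};
//   extEnergy[EXT_SMD] += smdEnergy;  // each source accumulates into its slot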

class SequencerCUDA{
  friend class HomePatch;
  friend class Sequencer;
public:
  static SequencerCUDA *InstanceInit(const int, SimParameters*);
  inline static SequencerCUDA *Object() { return CkpvAccess(SequencerCUDA_instance); }
  inline static SequencerCUDA *ObjectOnPe(int pe) { return CkpvAccessOther(SequencerCUDA_instance, CmiRankOf(pe)); }
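
  // Per-PE singleton managed through Charm++ process-private variables
  // (CkpvAccess). A usage sketch (assumed call order, not verified against
  // the callers):
  //   SequencerCUDA::InstanceInit(deviceID, simParams);  // once per PE
  //   SequencerCUDA *seq = SequencerCUDA::Object();      // subsequent lookups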

  int numPatchesCheckedIn;
  int numPatchesReady;
  std::vector<CthThread> waitingThreads;
  CthThread masterThread;
  bool masterThreadSleeping = false;
  bool breakSuspends = false;
  SequencerCUDA(const int, SimParameters*);
  ~SequencerCUDA();
  void initialize();
  void zeroScalars();
  bool reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy);
  void reallocateMigrationDestination();
  void deallocateArrays();
  void deallocateStaticArrays();
  void copyAoSDataToHost();
  void copyPatchDataToHost();
  void copyAtomDataToDeviceAoS();

  void copyAtomDataToDevice(bool copyForces, int maxForceNumber);

  bool copyPatchData(const bool copyIn, const bool startup);
  void copyDataToPeers(const bool copyIn);
  void migrationLocalInit();
  void migrationPerform();
  void migrationLocalPost(int startup);
  void migrationUpdateAdvancedFeatures(const int startup);
  void migrationUpdateAtomCounts();
  void migrationUpdateAtomOffsets();
  void migrationUpdateRemoteOffsets();
  void migrationUpdateProxyDestination();
  void migrationUpdateDestination();
  void migrationSortAtomsNonbonded();
  void sync();

  void copyMigrationInfo(HomePatch *p, int patchIndex);

  void assembleOrderedPatchList();
  void monteCarloPressure_part1(Tensor &factor, Vector &origin, Lattice &oldLattice);
  void monteCarloPressure_part2(int step, int maxForceNumber,
      const bool doEnergy, const bool doGlobal, const bool doVirial);

  void monteCarloPressure_reject(Lattice &lattice);
  void monteCarloPressure_accept(const int doMigration);

  inline static void tensor_enforce_symmetry(Tensor& t) {
    t.xy = t.yx;
    t.xz = t.zx;
    t.yz = t.zy;
  }
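
  // Presumably used to scrub the small asymmetry that finite-precision force
  // reductions leave in the pressure/virial tensor, by mirroring the lower
  // triangle onto the upper one.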

  void launch_part1(
      int step,
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      double velrescaling,
      const double maxvel2,
      Tensor &factor,
      Vector &origin,
      Lattice &lattice,
      int reassignVelocitiesStep,
      int langevinPistonStep,
      int berendsenPressureStep,
      int maxForceNumber,
      const int copyIn,
      const int savePairlists,
      const int usePairlists,
      const bool doEnergy);

  void launch_part11(
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      double velrescaling,
      const double maxvel2,
      Tensor &factor,
      Vector &origin,
      Lattice &lattice,
      int langevinPistonStep,
      int maxForceNumber,
      const int copyIn,
      const int savePairlists,
      const int usePairlists,
      const bool doEnergy);

  void launch_set_compute_positions();

  void launch_part2(
      const int doMCPressure,
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector &origin,
      int step,
      int maxForceNumber,
      const int langevinPistonStep,
      const int copyIn,
      const int copyOut,
      const int doGlobal,
      const bool doEnergy);

  void launch_part3(
      const int doMCPressure,
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector &origin,
      int step,
      int maxForceNumber,
      const bool requestGlobalForces,
      const int doGlobalStaleForces,
      const bool forceRequestedGPU,
      const int copyIn,
      const int copyOut,
      const bool doEnergy,
      const bool requestForcesOutput);
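
  // A timestep's GPU work appears to be split across these launch_part*
  // methods so the host can interleave pairlist checks, reductions, and
  // multi-GPU synchronization between phases (an inference from the parameter
  // lists; the kernels themselves are declared in SequencerCUDAKernel.h).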

  void copyGlobalForcesToDevice();

  void copySettleParameter();
  void finish_part1(const int copyIn,
      const int savePairlists,
      const int usePairlists);

  void update_patch_flags();
  void finish_patch_flags(int isMigration);
  void updatePairlistFlags(const int doMigration);
  void updateDeviceKernels();
  void setRescalePairlistTolerance(const bool val);

  // For total forces on CUDA global master
  void allocateGPUSavedForces();
  cudaStream_t stream, stream2;
  PatchData *patchData;
  friend class CudaGlobalMasterServer;
private:
  const int deviceID;
  bool mGpuOn;  /*!< True if nDevices > 1 */
  int nDevices; /*!< Total number of GPUs in the simulation -- number of MasterPEs */


  int deviceIndex;
  cudaEvent_t stream2CopyDone, stream2CopyAfter;
  SimParameters *const simParams;

  SubmitReduction *reduction;

  std::vector<AtomMap*> atomMapList;

  // Migration structures
  FullAtom *d_atomdata_AoS_in;
  FullAtom *d_atomdata_AoS;

  int *d_sortIndex;
  int *d_sortSoluteIndex;
  int4 *d_migrationDestination;

  int *d_migrationGroupSize;
  int *d_migrationGroupIndex;

  int *d_idMig;
  int *d_vdwType;

  int *idMig;
  int *vdwType;

  // Device arrays
#if 1
  double *d_recipMass;
#else
  float *d_recipMass; // mass should be float
#endif
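  // Storing reciprocal masses lets the integrator multiply rather than divide
  // per atom in the half-kick. A minimal sketch of the update this feeds
  // (assumed form; the actual kernel is in SequencerCUDAKernel):
  //   vel_x[i] += 0.5 * dt * recipMass[i] * f_normal_x[i];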
  double *d_f_raw; // raw buffer backing all force arrays, for faster setting -> broadcasting
  double *d_f_normal_x, *d_f_normal_y, *d_f_normal_z;
  double *d_f_nbond_x, *d_f_nbond_y, *d_f_nbond_z;
  double *d_f_slow_x, *d_f_slow_y, *d_f_slow_z;
  double *d_vel_x, *d_vel_y, *d_vel_z;
  double *d_pos_raw;
  double *d_pos_x, *d_pos_y, *d_pos_z;
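  // The *_raw pointers suggest one allocation carved into component arrays so
  // a single memset or copy covers every buffer at once. A sketch of that
  // idiom (an assumption; the offsets are illustrative only):
  //   d_f_normal_x = d_f_raw + 0 * stride;
  //   d_f_normal_y = d_f_raw + 1 * stride;
  //   d_f_normal_z = d_f_raw + 2 * stride;  // ...and so on for nbond/slow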
  // for fixed atoms
  double *d_fixedPosition_x, *d_fixedPosition_y, *d_fixedPosition_z;

  double *d_f_saved_nbond_x, *d_f_saved_nbond_y, *d_f_saved_nbond_z;
  double *d_f_saved_slow_x, *d_f_saved_slow_y, *d_f_saved_slow_z;

  double *d_posNew_raw;
  double *d_posNew_x, *d_posNew_y, *d_posNew_z;

  double *d_f_global_x, *d_f_global_y, *d_f_global_z;

  double *d_rcm_x, *d_rcm_y, *d_rcm_z;
  double *d_vcm_x, *d_vcm_y, *d_vcm_z;

  // backup forces for Monte Carlo barostat
  double *d_f_rawMC;   // raw buffer for all backup force arrays in the MC barostat
  double *d_pos_rawMC; // raw buffer for all backup position arrays in the MC barostat
  double *d_f_normalMC_x, *d_f_normalMC_y, *d_f_normalMC_z;
  double *d_f_nbondMC_x, *d_f_nbondMC_y, *d_f_nbondMC_z;
  double *d_f_slowMC_x, *d_f_slowMC_y, *d_f_slowMC_z;
  // backup positions for Monte Carlo barostat
  double *d_posMC_x, *d_posMC_y, *d_posMC_z;

  int *d_id;                 /*!< global atom index */
  int *d_idOrder;            /*!< maps global atom index to local */
  int *d_moleculeStartIndex; /*!< starting index of each molecule */
  int *d_moleculeAtom;       /*!< atom indices sorted by molecule */

  // XXX TODO: Maybe we can get away with not allocating these arrays
  double *d_velNew_x, *d_velNew_y, *d_velNew_z;
  double *d_posSave_x, *d_posSave_y, *d_posSave_z;
  char3 *d_transform;

  int *d_patchOffsetTemp;

  float *d_rigidBondLength;
  int *d_atomFixed;
  int *d_groupFixed;
  float *d_mass;
  float *d_charge;
  int *d_partition;
  float *d_langevinParam;
  float *d_langScalVelBBK2;
  float *d_langScalRandBBK2;
  float *d_gaussrand_x, *d_gaussrand_y, *d_gaussrand_z;
  int *d_hydrogenGroupSize;
  int *d_consFailure;
  size_t d_consFailureSize;
  int *settleList;            /*!< indices of hydrogen groups to be handled by SETTLE */
  size_t settleListSize;
  CudaRattleElem* rattleList; /*!< indices of hydrogen groups with rigid bonds to be handled by RATTLE */
  size_t rattleListSize;
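
  // Constraint work is split by hydrogen group: water-like groups can use the
  // analytic SETTLE solver, while the rest presumably go through iterative
  // RATTLE (kernels declared in MShakeKernel.h).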
  int* d_globalToLocalID;  /*!< maps PatchID to local ID inside the SOA structures */
  int* d_patchToDeviceMap; /*!< maps a patch to a device */
  double3* d_patchCenter;  /*!< patch centers in the SOA data structure */
  double3* d_awayDists;    /*!< 'away' field from marginCheck */
  double3* d_patchMin;     /*!< 'min' field from HomePatch */
  double3* d_patchMax;     /*!< 'max' field from HomePatch */
  double* d_patchMaxAtomMovement; /*!< max movement of the atoms in each patch */
  double* d_patchNewTolerance;    /*!< updated pairlist tolerance for each patch */
  unsigned int* d_tbcatomic;

  PatchDataSOA* d_HostPatchDataSOA;

  // Host arrays
  double *recipMass;
  double *f_global_x, *f_global_y, *f_global_z;
  double *f_normal_x, *f_normal_y, *f_normal_z;
  double *f_nbond_x, *f_nbond_y, *f_nbond_z;
  double *f_slow_x, *f_slow_y, *f_slow_z;
  double *vel_x, *vel_y, *vel_z;
  double *pos_x, *pos_y, *pos_z;
  char3 *transform;

  float *mass;
  float *charge;
  int *partition;
  float *langevinParam;
  float *langScalVelBBK2;
  float *langScalRandBBK2;
  //float *gaussrand_x, *gaussrand_y, *gaussrand_z;
  int *hydrogenGroupSize; /*!< hydrogen group size for each heavy atom; hydrogens have 0 */
  float *rigidBondLength;
  // for fixed atoms
  int* atomFixed;
  int* groupFixed; // fixed atoms + Langevin piston
  double* fixedPosition_x;
  double* fixedPosition_y;
  double* fixedPosition_z;
  int* globalToLocalID;  /*!< maps PatchID to local ID inside the SOA structure */
  int* patchToDeviceMap; /*!< maps a patch to a device */
  int* id;               /*!< global atom index */
  double3* patchCenter;
  double3* awayDists;
  double3* patchMin;
  double3* patchMax;
  double* patchMaxAtomMovement;
  double* patchNewTolerance;
  int* computeNbondPosition;
  int* sortOrder;
  int* unsortOrder;
  Lattice* pairlist_lattices;
  double pairlist_newTolerance;
  Lattice myLattice;
  Lattice myLatticeOld;

  CudaMInfo *mInfo;

  // Host-mapped scalars
  int* killme;
  BigReal* kineticEnergy_half;
  BigReal* intKineticEnergy_half;
  BigReal* kineticEnergy;
  BigReal* intKineticEnergy;
  BigReal* momentum_x;
  BigReal* momentum_y;
  BigReal* momentum_z;
  BigReal* angularMomentum_x;
  BigReal* angularMomentum_y;
  BigReal* angularMomentum_z;
  cudaTensor *virial;
  cudaTensor *virial_half;
  cudaTensor *intVirialNormal;
  cudaTensor *intVirialNormal_half;
  cudaTensor *rigidVirial;
  cudaTensor *intVirialNbond;
  cudaTensor *intVirialSlow;
  cudaTensor *lpVirialNormal;
  cudaTensor *lpVirialNbond;
  cudaTensor *lpVirialSlow;
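
  // "Host-mapped" presumably means pinned, device-visible (zero-copy) memory,
  // letting kernels publish reduction results that the host reads directly
  // without a separate cudaMemcpy each step.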

  double3 *extForce;
  double *extEnergy;
  cudaTensor *extVirial;

  unsigned int *h_marginViolations;
  unsigned int *h_periodicCellSmall;

  unsigned int totalMarginViolations;

  // Device scalars
  bool buildRigidLists;
  int *d_killme;
  BigReal *d_kineticEnergy;
  BigReal *d_intKineticEnergy;
  cudaTensor *d_virial;
  cudaTensor *d_intVirialNormal;
  cudaTensor *d_intVirialNbond;
  cudaTensor *d_intVirialSlow;
  cudaTensor *d_rigidVirial;
  cudaTensor *d_lpVirialNormal;
  cudaTensor *d_lpVirialNbond;
  cudaTensor *d_lpVirialSlow;
  // cudaTensor *d_extTensor;
  BigReal *d_momentum_x;
  BigReal *d_momentum_y;
  BigReal *d_momentum_z;
  BigReal *d_angularMomentum_x;
  BigReal *d_angularMomentum_y;
  BigReal *d_angularMomentum_z;
  Lattice *d_lattices;
  Lattice *d_pairlist_lattices;

  double3 *d_extForce;
  double *d_extEnergy;
  cudaTensor *d_extVirial;
  CudaMInfo *d_mInfo;

  // Haochuan: for fixed atoms
  cudaTensor *d_fixVirialNormal;
  cudaTensor *d_fixVirialNbond;
  cudaTensor *d_fixVirialSlow;
  double3 *d_fixForceNormal;
  double3 *d_fixForceNbond;
  double3 *d_fixForceSlow;

  int *d_sortOrder;
  int *d_unsortOrder;

  int numAtomsHome;
  int numAtomsHomePrev;
  int numAtomsHomeAllocated;
  int numAtomsHomeAndProxy;
  int numAtomsHomeAndProxyAllocated;

  int numPatchesGlobal;
  int numPatchesHomeAndProxy;
  int numPatchesHome;

  int marginViolations;
  bool rescalePairlistTolerance;
  int nSettle, nRattle;
  BigReal maxvel2;
  SettleParameters *sp;
  CudaAtom** cudaAtomLists;

  CmiNodeLock printlock;

  cudaEvent_t eventStart, eventStop; // events for kernel timing
  float t_total;

  // launch_part1 timers
  float t_vverlet;
  float t_pairlistCheck;
  float t_setComputePositions;

  // launch_part2 timers
  float t_accumulateForceKick;
  float t_rattle;
  float t_submitHalf;
  float t_submitReductions1;
  float t_submitReductions2;

  // True if the system is periodic in all directions
  bool isPeriodic;

  std::vector<HomePatch*> patchList;

#if 1
  // TODO remove once GPU migration is merged
  std::vector<HomePatch*> patchListHomeAndProxy;
#endif

  int* consFailure;
  unsigned long long int d_ullmaxtol;
  SequencerCUDAKernel *CUDASequencerKernel;
  MigrationCUDAKernel *CUDAMigrationKernel;
  ComputeRestraintsCUDA *restraintsKernel;
  ComputeSMDCUDA *SMDKernel;
  ComputeGroupRestraintsCUDA *groupRestraintsKernel;
  ComputeGridForceCUDA *gridForceKernel;
  ComputeLonepairsCUDA *lonepairsKernel;
  curandGenerator_t curandGen;
  ComputeConsForceCUDA *consForceKernel;
  // PME grids used for alchemical transformations
  size_t num_used_grids;
  std::vector<int> used_grids;

#if 1
  unsigned int* deviceQueue; // pointer to the work queue this device holds

  // If more than one device is running this, we keep a record of all devices'
  // data here.
  double** d_peer_pos_x;
  double** d_peer_pos_y;
  double** d_peer_pos_z;
  float** d_peer_charge;
  int** d_peer_partition;
  double** d_peer_vel_x;
  double** d_peer_vel_y;
  double** d_peer_vel_z;
  double** d_peer_fb_x;
  double** d_peer_fb_y;
  double** d_peer_fb_z;
  double** d_peer_fn_x;
  double** d_peer_fn_y;
  double** d_peer_fn_z;
  double** d_peer_fs_x;
  double** d_peer_fs_y;
  double** d_peer_fs_z;
  bool** h_patchRecordHasForces;
  bool** d_patchRecordHasForces;
  char* d_barrierFlag;

  // GPU migration peer buffers

  int4** d_peer_migrationDestination;
  int** d_peer_sortSoluteIndex;

  int** d_peer_id;
  int** d_peer_vdwType;
  int** d_peer_sortOrder;
  int** d_peer_unsortOrder;
  double3** d_peer_patchCenter;

  FullAtom** d_peer_atomdata;
  CudaLocalRecord** d_peer_record;

#endif
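
  // Each d_peer_X[i] presumably points at device i's copy of array X, so a
  // kernel running here can read or accumulate into any peer's buffers over
  // CUDA peer-to-peer access once the pointer tables are exchanged
  // (copyDataToPeers above hints at that exchange).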

  void maximumMove(
      const double maxvel2,
      const int numAtoms);
  void submitHalf(
      int numAtoms, int part,
      const bool doEnergy);
  void submitReductions(
      BigReal origin_x,
      BigReal origin_y,
      BigReal origin_z,
      int marginViolations,
      int doEnergy,
      int doMomentum,
      int numAtoms,
      int maxForceNumber);

  void submitReductionValues();
  void copyPositionsAndVelocitiesToHost(bool copyOut, const int doGlobal);
  void copyPositionsToHost();
  void startRun1(int maxForceNumber, const Lattice& lat);
  void startRun2(
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector origin,
      int doGlobal,
      int maxForceNumber);
  void startRun3(
      double dt_normal,
      double dt_nbond,
      double dt_slow,
      Vector origin,
      const bool requestGlobalForces,
      int doGlobalMasterStateForces,
      const bool requestForcesOutput,
      const bool requestGlobalForcesGPU,
      int maxForceNumber);

  // TIP4P water model
  void redistributeTip4pForces(
      const int maxForceNumber,
      const int doVirial);

  void printSOAForces(char *);
  void printSOAPositionsAndVelocities();
  void registerSOAPointersToHost();
  void copySOAHostRegisterToDevice();

  void calculateExternalForces(
      const int step,
      const int maxForceNumber,
      const int doEnergy,
      const int doVirial);

  void atomUpdatePme();

  void updateHostPatchDataSOA();
  void saveForceCUDASOA_direct(
      const bool doGlobal,
      const bool doForcesOutput,
      const int maxForceNumber);
  void copyPositionsToHost_direct();

  int getNumPatchesHome() { return numPatchesHome; }

  double3* getHostPatchMin() { return patchMin; }
  double3* getHostPatchMax() { return patchMax; }
  double3* getHostAwayDists() { return awayDists; }
};

#endif // NODEGROUP_FORCE_REGISTER
#endif // SEQUENCERCUDA_H