CudaGlobalMasterServer.h
#ifndef CUDAGLOBALMASTERSERVER_H
#define CUDAGLOBALMASTERSERVER_H

#include <memory>
#include <new>     // std::bad_alloc
#include <utility> // std::forward
#include <unordered_map>
#include <vector>
#include "CudaRecord.h"
#ifdef NAMD_CUDA
#include "cuda_runtime.h"
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif
#include "common.h"

namespace CudaGlobalMaster {
inline namespace CUDAGM_NS {
class CudaGlobalMasterClient;
}
} // namespace CudaGlobalMaster

class Lattice;
class AtomMap;
class SubmitReduction;
/// A class for copying atom information from SequencerCUDA to
/// CudaGlobalMasterClient
class CudaGlobalMasterServer {
public:
#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)

  /// Type used for the force components
  using tf_type = double;
  /// Describes how a single atom travels from a source device buffer
  /// to a client buffer
  struct CopyListTuple {
    /// Index of the source device
    int m_src_dev_index;
    /// SOA index of the atom on the source device
    int m_soa_index;
    /// Index of the destination client
    size_t m_client_index;
    /// Position of the atom in the client's buffer
    size_t m_client_atom_pos;
  };
  /// Device-side buffers of a single client
  struct ClientBuffer {
    double *d_data;    ///< destination for the copied quantity (positions or forces)
    float *d_mass;     ///< atom masses
    float *d_charge;   ///< atom charges
    char *d_transform; ///< periodic-image transforms
    double *d_vel;     ///< atom velocities
    size_t sz;         ///< number of atoms in the buffer
  };
  /// Per-device pointers to the SOA atom data on each GPU
  struct PeerAtomData {
    double **d_pos_x;
    double **d_pos_y;
    double **d_pos_z;
    double **d_vel_x;
    double **d_vel_y;
    double **d_vel_z;
    float **d_mass;
    float **d_charge;
    char3 **d_transform;
  };
  /// Per-device pointers to the total-force arrays on each GPU
  struct PeerTFArray {
    tf_type **d_f_normal_x;
    tf_type **d_f_normal_y;
    tf_type **d_f_normal_z;
    tf_type **d_f_saved_nbond_x;
    tf_type **d_f_saved_nbond_y;
    tf_type **d_f_saved_nbond_z;
    tf_type **d_f_saved_slow_x;
    tf_type **d_f_saved_slow_y;
    tf_type **d_f_saved_slow_z;
    int **d_atomFixed;
  };
  /// Per-device pointers to the applied-force arrays on each GPU
  struct PeerAFArray {
    tf_type **d_f_applied_x;
    tf_type **d_f_applied_y;
    tf_type **d_f_applied_z;
    int **d_atomFixed;
  };
  CudaGlobalMasterServer(int deviceID, int printProfilingFreq = -1);
  /// Register a client
  void addClient(std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient> client);
  /// Unregister a client
  void removeClient(std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient> client);
  /// Copy the requested atom data and the current lattice to all clients
  void communicateToClients(const Lattice *lat);
  /// Let every registered client run its calculation
  void calculate();
  /// Communicate the client results (forces, energy, virial) back to the simulation
  void communicateToMD(bool doEnergy, bool doVirial);
  /// Update the cached atom maps
  void updateAtomMaps();
  /// True if any client requests the total forces
  bool requestedTotalForces() const;
  /// True if any client will apply global forces
  bool willAddGlobalForces() const;
  /// Set the current simulation step
  void setStep(int64_t step);
  const std::vector<std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient>> &
  getClients() const {
    return m_clients;
  }
  cudaStream_t getStream() { return m_stream; }
  /// Finish the energy and virial reductions
  void finishReductions();
  /// A minimal allocator that backs STL containers with pinned
  /// (page-locked), mapped host memory via cudaHostAlloc
  template <typename T>
  class CudaHostAllocator {
  public:
    using value_type = T;

    CudaHostAllocator() = default;

    template <typename U>
    constexpr CudaHostAllocator(const CudaHostAllocator<U> &) noexcept {}

    friend bool operator==(const CudaHostAllocator &, const CudaHostAllocator &) { return true; }
    friend bool operator!=(const CudaHostAllocator &, const CudaHostAllocator &) { return false; }

    T *allocate(size_t n) {
      T *ptr;
      if (cudaHostAlloc(&ptr, n * sizeof(T), cudaHostAllocMapped) != cudaSuccess) {
        throw std::bad_alloc();
      }
      return ptr;
    }
    void deallocate(T *ptr, size_t /* n */) noexcept { cudaFreeHost(ptr); }
    template <typename U, typename... Args>
    void construct(U *p, Args &&... args) {
      new (p) U(std::forward<Args>(args)...);
    }
    template <typename U>
    void destroy(U *p) noexcept {
      p->~U();
    }
  };
private:
  /// Copy the selected atom attributes to the client buffers
  void copyAtomsToClients(bool copyPositions, bool copyMasses, bool copyCharges,
                          bool copyTransforms, bool copyVelocities);
  /// Copy the total forces to the clients that request them
  void copyTotalForcesToClients();
  /// Add the client forces to the simulation's force arrays
  void addGlobalForces();
  /// Build the copy list for atom positions
  void buildAtomsCopyList();
  /// Build the copy list for total forces
  void buildAtomsTotalForcesCopyList();
  /// Build the copy list for the atoms the clients apply forces to
  void buildForcedAtomsCopyList();
  /// Allocate the peer pointer arrays
  void allocatePeerArrays();
  /// Copy the peer pointer arrays to the device
  void copyPeerArraysToDevice();
  SubmitReduction *getCurrentReduction() {
    // only supports GPU-resident mode
    return reductionGpuResident;
  }
private:
  int m_device_id;
  int64_t m_step;
  cudaStream_t m_stream;
  int m_num_devices;
  int m_clients_changed;
  int m_atom_maps_changed;
  int m_print_profiling_freq;
  std::vector<std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient>> m_clients;
  static constexpr int numCopyLists = 3;
  // Data structures for copying atomic positions to multiple clients
  std::vector<CopyListTuple, CudaHostAllocator<CopyListTuple>> m_atom_pos_copy_list;
  CopyListTuple *m_d_atom_pos_copy_list;
  ClientBuffer *m_d_atom_pos_client_buffers;
  // Data structures for copying total forces to multiple clients
  std::vector<CopyListTuple, CudaHostAllocator<CopyListTuple>> m_atom_total_force_copy_list;
  CopyListTuple *m_d_atom_total_force_copy_list;
  ClientBuffer *m_atom_total_force_client_buffers;
  // Data structures for copying the client forces back to the atoms
  std::vector<CopyListTuple, CudaHostAllocator<CopyListTuple>> m_forced_atom_copy_list;
  bool m_unique_forced_atoms;
  CopyListTuple *m_d_forced_atom_copy_list;
  ClientBuffer *m_d_forced_atom_client_buffers;
  // Data structures for mapping global atom ids to SOA ids
  std::vector<std::vector<AtomMap *>> m_atom_map_lists;
  std::vector<int> m_src_devs;
  std::vector<std::vector<CudaLocalRecord>> m_local_records;
  std::vector<int *> m_global_to_local_id;
  std::unordered_map<int, int> m_device_id_to_index;
  // Pointers to buffers of device arrays (for multiple GPUs)
  PeerAtomData m_h_peer_atom_data;
  PeerTFArray m_h_peer_tf_array;
  PeerAFArray m_h_peer_af_array;
  PeerAtomData m_d_peer_atom_data;
  PeerTFArray m_d_peer_tf_array;
  PeerAFArray m_d_peer_af_array;
  // CudaGlobalMasterServer only supports GPU-resident mode
  SubmitReduction *reductionGpuResident;
  // Lattice
  std::vector<double, CudaHostAllocator<double>> m_h_lattice;
#else
  CudaGlobalMasterServer(int deviceID, int printProfilingFreq = -1);
#endif // (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)
};

#endif // CUDAGLOBALMASTERSERVER_H
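To make the declarations above concrete, a usage sketch of the nested CudaHostAllocator may help. It backs ordinary STL containers with pinned, mapped host memory, the same pattern the server uses for m_h_lattice and its copy lists. The example below is a minimal sketch, assuming the header is compiled with NAMD_CUDA and NODEGROUP_FORCE_REGISTER defined and NAMD's include paths available:

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include "CudaGlobalMasterServer.h"

    int main() {
      // A vector backed by pinned (page-locked), mapped host memory.
      // allocate() throws std::bad_alloc if cudaHostAlloc fails.
      std::vector<double, CudaGlobalMasterServer::CudaHostAllocator<double>> h_buf;
      h_buf.resize(6, 0.0);

      // Because the memory is allocated with cudaHostAllocMapped, a device
      // view of the same buffer can be obtained for use in kernels.
      double *d_view = nullptr;
      if (cudaHostGetDevicePointer(reinterpret_cast<void **>(&d_view),
                                   h_buf.data(), 0) == cudaSuccess) {
        std::printf("host %p is visible on the device as %p\n",
                    static_cast<void *>(h_buf.data()),
                    static_cast<void *>(d_view));
      }
      return 0;
    }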
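The copy-list machinery also suggests a flat gather: each CopyListTuple pairs a (source device, SOA slot) with a (client, client slot), so one kernel can serve every client at once. The kernel below is only a hypothetical illustration of that pattern, not NAMD's actual kernel; the packed x,y,z layout of d_data and the name gatherPositions are assumptions.

    #include "CudaGlobalMasterServer.h"

    // Hypothetical gather kernel: one thread per CopyListTuple entry.
    __global__ void gatherPositions(
        const CudaGlobalMasterServer::CopyListTuple *copyList, size_t numEntries,
        CudaGlobalMasterServer::PeerAtomData peers,
        CudaGlobalMasterServer::ClientBuffer *clients) {
      const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= numEntries) return;
      const CudaGlobalMasterServer::CopyListTuple &t = copyList[i];
      const int dev = t.m_src_dev_index;
      const int soa = t.m_soa_index;
      CudaGlobalMasterServer::ClientBuffer buf = clients[t.m_client_index];
      // Assumed layout: positions packed x,y,z per atom in the client buffer.
      buf.d_data[3 * t.m_client_atom_pos + 0] = peers.d_pos_x[dev][soa];
      buf.d_data[3 * t.m_client_atom_pos + 1] = peers.d_pos_y[dev][soa];
      buf.d_data[3 * t.m_client_atom_pos + 2] = peers.d_pos_z[dev][soa];
    }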
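Finally, the public interface suggests a natural per-step call order. The driver below is a guess assembled purely from the method names; the real sequencing is owned by NAMD's GPU-resident code path, and runStep, lat, doEnergy, and doVirial are hypothetical names.

    #include <cstdint>
    #include "CudaGlobalMasterServer.h"

    // Hypothetical per-step driver sketching how the public methods compose.
    void runStep(CudaGlobalMasterServer &server, const Lattice *lat,
                 int64_t step, bool doEnergy, bool doVirial) {
      server.setStep(step);              // tell clients which step this is
      server.communicateToClients(lat);  // copy atom data and lattice to clients
      server.calculate();                // run every registered client
      if (server.willAddGlobalForces()) {
        // apply client forces (and their energy/virial contributions)
        server.communicateToMD(doEnergy, doVirial);
      }
      server.finishReductions();         // submit energies to the reduction
    }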