NAMD
CudaGlobalMasterServer.C
1 #include "AtomMap.h"
5 #include "CudaUtils.h"
6 #include "DeviceCUDA.h"
7 #include "InfoStream.h"
8 #include "Lattice.h"
9 #include "NamdEventsProfiling.h"
10 #include "NamdTypes.h"
11 #include "ProcessorPrivate.h"
12 #include "SequencerCUDA.h"
13 #include "SimParameters.h"
14 // #include <functional>
15 #include <string>
16 
17 #ifdef DEBUGM
18 #include "Debug.h"
19 #define MIN_DEBUG_LEVEL 3
20 
23 std::string ptr_to_str(const void *ptr) {
24  std::ostringstream oss;
25  oss << ptr;
26  return oss.str();
27 }
28 #endif
29 
34 struct localIDResult {
35  bool found;
36  // int pe;
37  int soaIndex;
38 };
39 
40 #if defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)
41 
42 using CopyListTupleT = CudaGlobalMasterServer::CopyListTuple;
43 using ClientBufferT = CudaGlobalMasterServer::ClientBuffer;
44 
45 extern __thread DeviceCUDA *deviceCUDA;
46 
57 localIDResult queryLocalID(const AtomID globalID,
58  const std::vector<AtomMap *> &atomMapsList,
59  const std::vector<CudaLocalRecord> &localRecords,
60  const int *h_globalToLocalID) {
61  // The logic here is similar to ComputeRestraintsCUDA::updateRestrainedAtoms
62  localIDResult result{false, -1};
63  for (int i = 0; i < atomMapsList.size(); ++i) {
64  // Lookup a LocalID from the global atom ID
65  const LocalID lid = atomMapsList[i]->localID(globalID);
66  if (lid.pid != notUsed) {
67  // Atom found
68  result.found = true;
69  // Mapping from global patch ID to local patch ID
70  const int soaPid = h_globalToLocalID[lid.pid];
71  // Memory location of the atom = patch start offset + atom local index in the patch
72  result.soaIndex = localRecords[soaPid].bufferOffset + lid.index;
73 #ifdef DEBUGM
74  DebugM(1, "Atom " + std::to_string(globalID) + " found in local patch " +
75  std::to_string(lid.pid) + " with local index " +
76  std::to_string(lid.index) + ", SOA patch ID " +
77  std::to_string(soaPid) + ", SOA global index " +
78  std::to_string(result.soaIndex) + "\n");
79 #endif
80  break;
81  }
82  }
83 #ifdef DEBUGM
84  if (result.found == false) {
85  DebugM(3, "Atom " + std::to_string(globalID) + " not found.\n");
86  }
87 #endif
88  return result;
89 }
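// Editorial sketch (not part of the original file): the lookup above reduces to
// two table indirections plus an offset, i.e.
//   soaIndex = localRecords[h_globalToLocalID[lid.pid]].bufferOffset + lid.index
// The standalone analogue below uses hypothetical names to make that explicit.
#if 0
struct ToyPatchRecord { int bufferOffset; };

// Flatten (globalPatchID, indexWithinPatch) into an SOA array index, assuming
// globalToLocalPatch[] maps global patch IDs to per-device local patch IDs.
static int toySoaIndex(int globalPatchID, int indexWithinPatch,
                       const std::vector<ToyPatchRecord> &records,
                       const int *globalToLocalPatch) {
  const int localPatch = globalToLocalPatch[globalPatchID];
  return records[localPatch].bufferOffset + indexWithinPatch;
}
#endif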
90 
109 template <typename F1, typename F2>
110 void buildCopyList(
111  F1 getAtomID, F2 getPosOrForceBuffer,
112  std::vector<std::shared_ptr<CudaGlobalMasterClient>> &clients,
113  const std::vector<std::vector<AtomMap *>> &atomMapsLists,
114  const std::vector<std::vector<CudaLocalRecord>> &localRecords,
115  const std::vector<int *> h_globalToLocalID,
116  const std::vector<int> &sourceDevicesList,
117  const std::unordered_map<int, int> &deviceToIndex,
118  ClientBufferT *&d_clientBuffers, std::vector<CopyListTupleT> &hostCopyList,
119  CopyListTupleT *&d_copyList, cudaStream_t stream, bool* checkUniqueList = nullptr) {
120  // Number of devices == number of master PEs == number of
121  // "std::vector<AtomMap*>"s
122  std::vector<ClientBufferT> clientBuffers;
123  hostCopyList.clear();
124  const size_t numClients = clients.size();
125  const int numDevices = atomMapsLists.size();
126  for (size_t i = 0; i < numClients; ++i) {
127  const auto &client = clients[i];
128  // getPosOrForceBuffer should return the device memory address of the client
129  // for copying to/from
130  clientBuffers.push_back(ClientBufferT{
131  ((*client).*getPosOrForceBuffer)(), client->getMasses(),
132  client->getCharges(), client->getTransforms(), client->getVelocities(),
133  ((*client).*getAtomID)().size()});
134  const auto &requested_atoms = ((*client).*getAtomID)();
135  // iout << iINFO << "PE: " << CkMyPe() << ", client atoms address " << &requested_atoms << "\n" << endi;
136  // For each client, we try to map the global IDs of its
137  // requested atoms to local SOA array indices.
138  // Also, to handle the multi-GPU case, we need to record
139  // the device index of the peer arrays.
140  for (size_t j = 0; j < requested_atoms.size(); ++j) {
141  // The global ID of an atom
142  const AtomID gid = requested_atoms[j];
143  localIDResult result{false, -1};
144  // We search for the global ID in the atomMaps associated with every master PE
145  for (int k = 0; k < numDevices; ++k) {
146  result = queryLocalID(gid, atomMapsLists[k], localRecords[k],
147  h_globalToLocalID[k]);
148  if (result.found) {
149  hostCopyList.push_back(CopyListTupleT{
150  deviceToIndex.at(sourceDevicesList[k]), result.soaIndex, i, j});
151  break;
152  }
153  }
154  // The global ID must be found in one of the maps; otherwise there is a bug.
155  if (!result.found) {
156  const std::string error =
157  "Cannot find the local ID in SOA arrays of atom " +
158  std::to_string(gid) + " requested by client[" + std::to_string(i) +
159  "] (" + client->name() + ")\n";
160  NAMD_bug(error.c_str());
161  }
162  }
163  }
164  // Sorting the copy list by SOA index might improve performance
165  std::sort(hostCopyList.begin(), hostCopyList.end(),
166  [](const CopyListTupleT &a, const CopyListTupleT &b) {
167  return a.m_soa_index < b.m_soa_index;
168  });
169  if (checkUniqueList) {
170  // Check if the list is unique
171  auto tmp_list = hostCopyList;
172  // std::sort(tmp_list.begin(), tmp_list.end(),
173  // [](const CopyListTupleT &a, const CopyListTupleT &b) {
174  // return a.m_soa_index < b.m_soa_index;
175  // });
176  auto last = std::unique(tmp_list.begin(), tmp_list.end(),
177  [](const CopyListTupleT &a, const CopyListTupleT &b){
178  return a.m_soa_index == b.m_soa_index;
179  });
180  if (last == tmp_list.end()) {
181  *checkUniqueList = true;
182  } else {
183  *checkUniqueList = false;
184  }
185  }
186  // Copy the copy list structure to the device memory
187  size_t copySize = sizeof(CopyListTupleT) * hostCopyList.size();
188 #ifdef DEBUGM
189  DebugM(3, "Will copy " + std::to_string(hostCopyList.size()) + " items.\n");
190 #endif
191  // std::cout << "Trying to assign a copyList with size of " <<
192  // std::to_string(copySize) << std::endl;
193  if (d_copyList != nullptr) {
194  cudaCheck(cudaFree(d_copyList));
195  }
196  cudaCheck(cudaMalloc(&d_copyList, copySize));
197  cudaCheck(cudaMemcpyAsync(d_copyList, hostCopyList.data(), copySize,
198  cudaMemcpyHostToDevice, stream));
199  copySize = sizeof(ClientBufferT) * clientBuffers.size();
200  if (d_clientBuffers != nullptr) {
201  cudaCheck(cudaFree(d_clientBuffers));
202  }
203  cudaCheck(cudaMalloc(&d_clientBuffers, copySize));
204  cudaCheck(cudaMemcpyAsync(d_clientBuffers, clientBuffers.data(), copySize,
205  cudaMemcpyHostToDevice, stream));
206 }
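// Editorial sketch (not part of the original file): every CopyListTuple built
// above pairs a source location (peer device index + SOA index) with a
// destination (client index + slot in that client's requested-atom array).
// A host-side analogue of the gather the device kernels are assumed to perform:
#if 0
static void toyGatherPositionsX(const std::vector<CopyListTupleT> &copyList,
                                const std::vector<const double *> &soaPosX, // one SOA array per device
                                std::vector<std::vector<double>> &clientPosX) { // one buffer per client
  for (const auto &t : copyList) {
    clientPosX[t.m_client_index][t.m_client_atom_pos] =
        soaPosX[t.m_src_dev_index][t.m_soa_index];
  }
}
#endif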
207 
214 void copyLatticeToClient(const Lattice* lat, int deviceID,
215  std::shared_ptr<CudaGlobalMasterClient> client, cudaStream_t stream) {
216  const std::vector<double> h_lattice{
217  lat->a().x, lat->a().y,
218  lat->a().z, lat->b().x,
219  lat->b().y, lat->b().z,
220  lat->c().x, lat->c().y,
221  lat->c().z, lat->origin().x,
222  lat->origin().y, lat->origin().z // Do I need this for wrapping?
223  };
224  const size_t copySize = sizeof(double) * h_lattice.size();
225  int savedDevice;
226  cudaCheck(cudaGetDevice(&savedDevice));
227  cudaCheck(cudaSetDevice(deviceID));
228  double *d_lattice = client->getLattice();
229  if (d_lattice) {
230  cudaCheck(cudaMemcpyAsync(d_lattice, h_lattice.data(), copySize,
231  cudaMemcpyHostToDevice, stream));
232  } else {
233  const std::string error = "Failed to copy lattice to client " + client->name() +
234  " (the lattice was requested but the client provided a "
235  "null device buffer pointer)";
236  NAMD_die(error.c_str());
237  }
238  cudaCheck(cudaSetDevice(savedDevice));
239 }
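// Editorial sketch (not part of the original file): the lattice buffer written
// above is a flat array of 12 doubles laid out as
//   { a.x,a.y,a.z, b.x,b.y,b.z, c.x,c.y,c.z, origin.x,origin.y,origin.z }.
// A hypothetical client could unpack a host copy of it like this:
#if 0
struct ToyLattice { double a[3], b[3], c[3], origin[3]; };

static ToyLattice toyUnpackLattice(const double buf[12]) {
  ToyLattice L;
  for (int i = 0; i < 3; ++i) {
    L.a[i]      = buf[0 + i];
    L.b[i]      = buf[3 + i];
    L.c[i]      = buf[6 + i];
    L.origin[i] = buf[9 + i];
  }
  return L;
}
#endif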
240 
241 #ifdef DEBUGM
242 void debugCopyList(const std::string &name,
243  const std::vector<CopyListTupleT> &L) {
244  std::cout << "CudaGlobalMasterServer: the copylist is " << name << ", with "
245  << L.size() << " items.\n";
246  for (size_t i = 0; i < L.size(); ++i) {
247  fprintf(stdout,
248  "i = %lu, deviceIndex = %d, soaIndex = %d, clientIndex = %lu, "
249  "clientArrayIndex = %lu\n",
250  i, L[i].m_src_dev_index, L[i].m_soa_index, L[i].m_client_index,
251  L[i].m_client_atom_pos);
252  }
253 }
254 #endif
255 
256 #if 0
257 void debugClientBuffer(const std::string &name, ClientBufferT *B,
258  size_t numClients) {
259  std::vector<ClientBufferT> hB(numClients);
260  cudaPointerAttributes attributes;
261  cudaCheck(cudaPointerGetAttributes(&attributes, B));
262  std::cout << "CudaGlobalMasterServer: the clientBuffer is " << name
263  << ", with " << numClients << " items.\n";
264  std::cout << "deviceBuffer pointer = " << static_cast<void *>(B) << '\n';
265  std::cout << "deviceBuffer attributes:\n";
266  std::cout << "memory type: " << attributes.type << std::endl;
267  cudaCheck(cudaMemcpy(hB.data(), B, sizeof(ClientBufferT) * numClients,
268  cudaMemcpyDeviceToHost));
269  for (size_t i = 0; i < numClients; ++i) {
270  fprintf(stdout,
271  "i = %lu, d_data = %p, d_mass = %p, d_charge = %p, size = %lu\n", i,
272  hB[i].d_data, hB[i].d_mass, hB[i].d_charge, hB[i].sz);
273  }
274  std::cout << std::endl;
275 }
276 #endif
277 
278 CudaGlobalMasterServer::CudaGlobalMasterServer(int deviceID, int printProfilingFreq)
279  : m_device_id(deviceID), m_step(0),
280  m_num_devices(deviceCUDA->getNumDevice()), m_clients_changed(0),
281  m_atom_maps_changed(0), m_print_profiling_freq(printProfilingFreq),
282  m_t_build_copy_lists(0), m_t_copy_atoms(0),
283  m_t_copy_total_forces(0), m_t_add_global_forces(0), m_t_calc(0),
284  m_t_reductions(0) {
285  iout << iINFO << "CudaGlobalMasterServer: initialized on PE " << CkMyPe()
286  << " and GPU device " << m_device_id << "\n"
287  << endi;
288  int savedDevice;
289  cudaCheck(cudaGetDevice(&savedDevice));
290  cudaCheck(cudaSetDevice(m_device_id));
291  cudaCheck(cudaStreamCreate(&m_stream));
292 #ifdef NAMD_NVTX_ENABLED
293  nvtxNameCuStreamA(m_stream, "CudaGlobalMaster stream");
294 #endif
295  m_atom_map_lists.resize(m_num_devices);
296  m_src_devs.resize(m_num_devices);
297  m_local_records.resize(m_num_devices);
298  m_global_to_local_id.resize(m_num_devices);
299  const int *allDevices = deviceCUDA->allDevices();
300  for (int i = 0; i < m_num_devices; ++i) {
301  const int currentDeviceID = allDevices[i];
302  m_device_id_to_index[currentDeviceID] = i;
303  m_src_devs[i] = currentDeviceID;
304  }
305  if (m_num_devices > 1) {
306  allocatePeerArrays();
307  }
308  // Copy lists
309  m_d_atom_pos_copy_list = nullptr;
310  m_d_atom_total_force_copy_list = nullptr;
311  m_d_forced_atom_copy_list = nullptr;
312  // Client buffers
313  m_d_atom_pos_client_buffers = nullptr;
314  m_atom_total_force_client_buffers = nullptr;
315  m_d_forced_atom_client_buffers = nullptr;
316  m_unique_forced_atoms = false;
317  cudaCheck(cudaSetDevice(savedDevice));
318 }
319 
320 CudaGlobalMasterServer::~CudaGlobalMasterServer() {
321  printProfiling();
322  if (m_d_atom_pos_copy_list != nullptr) {
323  cudaCheck(cudaFree(m_d_atom_pos_copy_list));
324  m_d_atom_pos_copy_list = nullptr;
325  }
326  if (m_d_atom_pos_client_buffers != nullptr) {
327  cudaCheck(cudaFree(m_d_atom_pos_client_buffers));
328  m_d_atom_pos_client_buffers = nullptr;
329  }
330  if (m_d_atom_total_force_copy_list != nullptr) {
331  cudaCheck(cudaFree(m_d_atom_total_force_copy_list));
332  m_d_atom_total_force_copy_list = nullptr;
333  }
334  if (m_atom_total_force_client_buffers != nullptr) {
335  cudaCheck(cudaFree(m_atom_total_force_client_buffers));
336  m_atom_total_force_client_buffers = nullptr;
337  }
338  if (m_d_forced_atom_copy_list != nullptr) {
339  cudaCheck(cudaFree(m_d_forced_atom_copy_list));
340  m_d_forced_atom_copy_list = nullptr;
341  }
342  if (m_d_forced_atom_client_buffers != nullptr) {
343  cudaCheck(cudaFree(m_d_forced_atom_client_buffers));
344  m_d_forced_atom_client_buffers = nullptr;
345  }
346  iout << iINFO << "CudaGlobalMasterServer: destructed on PE " << CkMyPe()
347  << "\n"
348  << endi;
349 }
350 
351 void CudaGlobalMasterServer::printProfiling() const {
352  CkPrintf("====================================================\n");
353  CkPrintf("========= CudaGlobalMasterServer Profiling =========\n");
354  CkPrintf("========== (Time is displayed in seconds) ==========\n");
355  CkPrintf("====================================================\n");
356  CkPrintf(" Build copy lists: %.2f\n", m_t_build_copy_lists.count());
357  CkPrintf(" Copy atoms: %.2f\n", m_t_copy_atoms.count());
358  CkPrintf(" Copy total forces: %.2f\n", m_t_copy_total_forces.count());
359  CkPrintf(" Add forces from clients: %.2f\n", m_t_add_global_forces.count());
360  CkPrintf(" Clients\' calculate(): %.2f\n", m_t_calc.count());
361  CkPrintf(" Clients\' finishReductions(): %.2f\n", m_t_reductions.count());
362  CkPrintf("====================================================\n");
363 }
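// Editorial sketch (not part of the original file): the m_t_* members printed
// above are assumed to be std::chrono::duration<double> accumulators (seconds),
// updated throughout this file with the start/stop pattern sketched here:
#if 0
static std::chrono::duration<double> toyTimer{};

static void toyTimedWork() {
  const auto start = std::chrono::high_resolution_clock::now();
  // ... work being measured ...
  const auto end = std::chrono::high_resolution_clock::now();
  toyTimer += end - start;   // implicit conversion to a floating-point duration
  // toyTimer.count() then reports the accumulated time in seconds.
}
#endif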
364 
365 void CudaGlobalMasterServer::addClient(
366  std::shared_ptr<CudaGlobalMasterClient> client) {
367  auto it = std::find(m_clients.begin(), m_clients.end(), client);
368  if (it == m_clients.end()) {
369  iout << iINFO << "CudaGlobalMasterServer: adding client \""
370  << client->name() << "\"\n"
371  << endi;
372  m_clients.push_back(client);
373  m_clients_changed = CudaGlobalMasterServer::numCopyLists;
374  } else {
375  const std::string error =
376  "The client \"" + client->name() + "\" is being added twice.\n";
377  NAMD_die(error.c_str());
378  }
379 }
380 
381 void CudaGlobalMasterServer::removeClient(
382  std::shared_ptr<CudaGlobalMasterClient> client) {
383  auto it = std::find(m_clients.begin(), m_clients.end(), client);
384  if (it == m_clients.end()) {
385  iout << iWARN << "CudaGlobalMasterServer: the client \"" << client->name()
386  << "\" is not registered with CudaGlobalMasterServer\n"
387  << endi;
388  }
389  while (it != m_clients.end()) {
390  iout << iINFO << "CudaGlobalMasterServer: removing client \""
391  << client->name() << "\"\n"
392  << endi;
393  m_clients.erase(it);
394  m_clients_changed = CudaGlobalMasterServer::numCopyLists;
395  it = std::find(m_clients.begin(), m_clients.end(), client);
396  }
397 }
398 
399 void CudaGlobalMasterServer::updateAtomMaps() {
400  const int numPes = CkNumPes();
401  // Iterate over all PEs
402 #ifdef DEBUGM
403  DebugM(3, "updateAtomMaps: number of PEs = " + std::to_string(numPes) + "\n");
404 #endif
405  for (int i = 0; i < numPes; ++i) {
406  // Find the device ID of the i-th PE
407  const int peDevice = deviceCUDA->getDeviceIDforPe(i);
408  const int j = m_device_id_to_index.at(peDevice);
409  // Get the atom maps
410  AtomMap *amap = AtomMap::ObjectOnPe(i);
411 #ifdef DEBUGM
412  DebugM(3, "updateAtomMaps: PE " + std::to_string(i) + " atomMap " +
413  ptr_to_str(amap) + " on device " + std::to_string(peDevice) +
414  "\n");
415 #endif
416  m_atom_map_lists[j].push_back(amap);
417  }
418  const bool multi_gpu = m_num_devices > 1;
419  // Iterate over all devices to get the map of global patch ID to local patch
420  // ID
421  // TODO: I assume all devices are used. Is this correct?
422  for (int i = 0; i < m_num_devices; ++i) {
423  const int deviceID = m_src_devs[i];
424  // Get the master PE
425  const int masterPe = deviceCUDA->getMasterPeForDeviceID(deviceID);
426  // Get the corresponding SequencerCUDA instance
427  const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
428  m_global_to_local_id[i] = sequencer->globalToLocalID;
429  m_local_records[i] = sequencer->patchData->devData[deviceID].h_localPatches;
430  if (multi_gpu) {
431  m_h_peer_atom_data.d_pos_x[i] = sequencer->d_pos_x;
432  m_h_peer_atom_data.d_pos_y[i] = sequencer->d_pos_y;
433  m_h_peer_atom_data.d_pos_z[i] = sequencer->d_pos_z;
434  m_h_peer_atom_data.d_vel_x[i] = sequencer->d_vel_x;
435  m_h_peer_atom_data.d_vel_y[i] = sequencer->d_vel_y;
436  m_h_peer_atom_data.d_vel_z[i] = sequencer->d_vel_z;
437  m_h_peer_atom_data.d_mass[i] = sequencer->d_mass;
438  m_h_peer_atom_data.d_charge[i] = sequencer->d_charge;
439  m_h_peer_atom_data.d_transform[i] = sequencer->d_transform;
440 
441  m_h_peer_tf_array.d_f_normal_x[i] = sequencer->d_f_normal_x;
442  m_h_peer_tf_array.d_f_normal_y[i] = sequencer->d_f_normal_y;
443  m_h_peer_tf_array.d_f_normal_z[i] = sequencer->d_f_normal_z;
444  m_h_peer_tf_array.d_f_saved_nbond_x[i] = sequencer->d_f_saved_nbond_x;
445  m_h_peer_tf_array.d_f_saved_nbond_y[i] = sequencer->d_f_saved_nbond_y;
446  m_h_peer_tf_array.d_f_saved_nbond_z[i] = sequencer->d_f_saved_nbond_z;
447  m_h_peer_tf_array.d_f_saved_slow_x[i] = sequencer->d_f_saved_slow_x;
448  m_h_peer_tf_array.d_f_saved_slow_y[i] = sequencer->d_f_saved_slow_y;
449  m_h_peer_tf_array.d_f_saved_slow_z[i] = sequencer->d_f_saved_slow_z;
450  m_h_peer_tf_array.d_atomFixed[i] = sequencer->d_atomFixed;
451 
452  m_h_peer_af_array.d_f_applied_x[i] = sequencer->d_f_global_x;
453  m_h_peer_af_array.d_f_applied_y[i] = sequencer->d_f_global_y;
454  m_h_peer_af_array.d_f_applied_z[i] = sequencer->d_f_global_z;
455  m_h_peer_af_array.d_atomFixed[i] = sequencer->d_atomFixed;
456  }
457 #ifdef DEBUGM
458  DebugM(3, "updateAtomMaps: device " + std::to_string(deviceID) +
459  ", sequencer " + ptr_to_str(sequencer) + "\n");
460 #endif
461  }
462  if (multi_gpu) {
463  copyPeerArraysToDevice();
464  }
465  // Setup the flag to rebuild all copy lists
466  m_atom_maps_changed = CudaGlobalMasterServer::numCopyLists;
467 }
468 
469 void CudaGlobalMasterServer::setStep(int64_t step) {
470  m_step = step;
471  // Update the step number
472  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
473  (*it)->setStep(step);
474  }
475  if (m_print_profiling_freq > 0) {
476  if (step % m_print_profiling_freq == 0) {
477  printProfiling();
478  }
479  }
480 }
481 
482 void CudaGlobalMasterServer::communicateToClients(const Lattice* lat) {
483  // iout << iINFO << "PE: " << CkMyPe() << ", communicateToClients this address " << this << "\n" << endi;
484  bool b_buildAtomsPositionCopyList = false;
485  bool b_buildAtomsTotalForcesCopyList = false;
486  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
487  // Check all clients if their requested atoms are changed
488  if ((*it)->requestedAtomsChanged()) {
489  b_buildAtomsPositionCopyList = true;
490  }
491  // Check all clients if their requested total force atoms are changed
492  if ((*it)->requestedTotalForcesAtomsChanged()) {
493  b_buildAtomsTotalForcesCopyList = true;
494  }
495  // Copy lattice to client buffers if necessary
496  if ((*it)->requestUpdateLattice()) {
497  copyLatticeToClient(lat, m_device_id, *it, m_stream);
498  }
499  }
500  // Rebuild the list of atoms to copy if necessary
501  if (b_buildAtomsPositionCopyList || m_atom_maps_changed > 0 ||
502  m_clients_changed > 0) {
503  buildAtomsCopyList();
504  if (m_atom_maps_changed > 0)
505  m_atom_maps_changed--;
506  if (m_clients_changed > 0)
507  m_clients_changed--;
508  }
509  if (b_buildAtomsTotalForcesCopyList || m_atom_maps_changed > 0 ||
510  m_clients_changed > 0) {
511  buildAtomsTotalForcesCopyList();
512  if (m_atom_maps_changed > 0)
513  m_atom_maps_changed--;
514  if (m_clients_changed > 0)
515  m_clients_changed--;
516  }
517  // Check all clients if they request to update the atomic positions
518  bool b_copyPositions = false;
519  bool b_copyTotalForces = false;
520  bool b_copyMasses = false;
521  bool b_copyCharges = false;
522  bool b_copyTransforms = false;
523  bool b_copyVelocities = false;
524  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
525  b_copyPositions |= (*it)->requestUpdateAtomPositions();
526  b_copyTotalForces |= (*it)->requestUpdateAtomTotalForces();
527  b_copyMasses |= (*it)->requestUpdateMasses();
528  b_copyCharges |= (*it)->requestUpdateCharges();
529  b_copyTransforms |= (*it)->requestUpdateTransforms();
530  b_copyVelocities |= (*it)->requestUpdateVelocities();
531  }
532  // Update the atomic positions if necessary
533  if (b_buildAtomsPositionCopyList || b_copyPositions || b_copyMasses ||
534  b_copyCharges || b_copyTransforms || b_copyVelocities) {
535  copyAtomsToClients(b_copyPositions, b_copyMasses, b_copyCharges,
536  b_copyTransforms, b_copyVelocities);
537  }
538  if (b_copyTotalForces || b_buildAtomsTotalForcesCopyList) {
539  copyTotalForcesToClients();
540  }
541  // Call the calculate function of every client
542  NAMD_EVENT_START(1, NamdProfileEvent::CUDAGM_CALCULATECLIENTS);
543  const auto startTime = std::chrono::high_resolution_clock::now();
544  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
545  (*it)->calculate();
546  }
547  const auto endTime = std::chrono::high_resolution_clock::now();
548  m_t_calc += endTime - startTime;
549  NAMD_EVENT_STOP(1, NamdProfileEvent::CUDAGM_CALCULATECLIENTS);
550 }
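// Editorial sketch (not part of the original file): m_atom_maps_changed and
// m_clients_changed behave as countdown flags. They are set to numCopyLists
// whenever the atom maps or the client set change, and each copy-list rebuild
// site (positions, total forces, applied forces) decrements them once, so all
// three lists get rebuilt exactly once per change. In isolation:
#if 0
constexpr int kNumToyLists = 3;       // stands in for CudaGlobalMasterServer::numCopyLists
static int toyRebuildCountdown = 0;   // set to kNumToyLists when something changes

static bool toyNeedRebuild(bool listSpecificReason) {
  if (listSpecificReason || toyRebuildCountdown > 0) {
    if (toyRebuildCountdown > 0) --toyRebuildCountdown;
    return true;                      // caller must rebuild its copy list now
  }
  return false;
}
#endif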
551 
552 bool CudaGlobalMasterServer::requestedTotalForces() const {
553  bool result = false;
554  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
555  result |= (!((*it)->getRequestedForcedAtoms().empty()) &&
556  (*it)->requestUpdateAtomTotalForces());
557  }
558  return result;
559 }
560 
561 void CudaGlobalMasterServer::buildAtomsCopyList() {
562 #ifdef DEBUGM
563  DebugM(3, "buildAtomsCopyList is called\n");
564 #endif
565  const auto startTime = std::chrono::high_resolution_clock::now();
566  // Save the current device ID
567  int savedDevice;
568  cudaCheck(cudaGetDevice(&savedDevice));
569  cudaCheck(cudaSetDevice(m_device_id));
570  buildCopyList(&CudaGlobalMasterClient::getRequestedAtoms,
571  &CudaGlobalMasterClient::getPositions, m_clients,
572  m_atom_map_lists, m_local_records, m_global_to_local_id,
573  m_src_devs, m_device_id_to_index, m_d_atom_pos_client_buffers,
574  m_atom_pos_copy_list, m_d_atom_pos_copy_list, m_stream);
575  // cudaCheck(cudaStreamSynchronize(m_stream));
576  cudaCheck(cudaSetDevice(savedDevice));
577  const auto endTime = std::chrono::high_resolution_clock::now();
578  m_t_build_copy_lists += endTime - startTime;
579 }
580 
581 void CudaGlobalMasterServer::copyAtomsToClients(bool copyPositions,
582  bool copyMasses,
583  bool copyCharges,
584  bool copyTransforms,
585  bool copyVelocities) {
586 #ifdef DEBUGM
587  DebugM(1, "copyAtomsToClients is called\n");
588 #endif
589  NAMD_EVENT_START(1, NamdProfileEvent::CUDAGM_ATOMTOCLIENTS);
590  const auto startTime = std::chrono::high_resolution_clock::now();
591  // Save the current device ID
592  int savedDevice;
593  cudaCheck(cudaGetDevice(&savedDevice));
594  cudaCheck(cudaSetDevice(m_device_id));
595  if (m_num_devices == 1) {
596 #ifdef DEBUGM
597 #ifdef MIN_DEBUG_LEVEL
598 #if MIN_DEBUG_LEVEL <= 1
599  debugCopyList("CudaGlobalMasterServer::copyAtomsToClients",
600  m_atom_pos_copy_list);
601 #endif
602 #endif
603 #endif
604  const int masterPe = deviceCUDA->getMasterPe();
605  const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
606  copyAtomsToClientsCUDA(
607  copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
608  sequencer->d_pos_x, sequencer->d_pos_y, sequencer->d_pos_z,
609  sequencer->d_vel_x, sequencer->d_vel_y, sequencer->d_vel_z,
610  sequencer->d_transform, sequencer->d_mass, sequencer->d_charge,
611  sequencer->myLattice, this->m_d_atom_pos_copy_list,
612  this->m_atom_pos_copy_list.size(), this->m_d_atom_pos_client_buffers,
613  m_clients.size(), m_stream);
614  } else {
615  const int masterPe = deviceCUDA->getMasterPe();
616  const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
617  copyAtomsToClientsCUDAMGPU(
618  copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
619  (const double **)m_h_peer_atom_data.d_pos_x,
620  (const double **)m_h_peer_atom_data.d_pos_y,
621  (const double **)m_h_peer_atom_data.d_pos_z,
622  (const double **)m_h_peer_atom_data.d_vel_x,
623  (const double **)m_h_peer_atom_data.d_vel_y,
624  (const double **)m_h_peer_atom_data.d_vel_z,
625  (const char3 **)m_h_peer_atom_data.d_transform,
626  (const float **)m_h_peer_atom_data.d_mass,
627  (const float **)m_h_peer_atom_data.d_charge, sequencer->myLattice,
628  this->m_d_atom_pos_copy_list, this->m_atom_pos_copy_list.size(),
629  this->m_d_atom_pos_client_buffers, m_clients.size(), m_stream);
630  }
631  // cudaCheck(cudaStreamSynchronize(m_stream));
632  cudaCheck(cudaSetDevice(savedDevice));
633  const auto endTime = std::chrono::high_resolution_clock::now();
634  m_t_copy_atoms += endTime - startTime;
635  NAMD_EVENT_STOP(1, NamdProfileEvent::CUDAGM_ATOMTOCLIENTS);
636 }
637 
638 void CudaGlobalMasterServer::copyTotalForcesToClients() {
639 #ifdef DEBUGM
640  DebugM(1, "copyTotalForcesToClients is called\n");
641 #endif
643  NAMD_EVENT_START(1, NamdProfileEvent::CUDAGM_TOTALFORCETOCLIENTS);
644  const auto startTime = std::chrono::high_resolution_clock::now();
645  // Save the current device ID
646  int savedDevice;
647  cudaCheck(cudaGetDevice(&savedDevice));
648  cudaCheck(cudaSetDevice(m_device_id));
649  if (m_num_devices == 1) {
650 #ifdef DEBUGM
651 #ifdef MIN_DEBUG_LEVEL
652 #if MIN_DEBUG_LEVEL <= 1
653  debugCopyList("CudaGlobalMasterServer::copyTotalForcesToClients",
654  m_atom_total_force_copy_list);
655 #endif
656 #endif
657 #endif
658  const int masterPe = deviceCUDA->getMasterPe();
659  const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
660  copyTotalForcesToClientsCUDA(
661  simParams->fixedAtomsOn, sequencer->d_f_normal_x,
662  sequencer->d_f_normal_y, sequencer->d_f_normal_z,
663  sequencer->d_f_saved_nbond_x, sequencer->d_f_saved_nbond_y,
664  sequencer->d_f_saved_nbond_z, sequencer->d_f_saved_slow_x,
665  sequencer->d_f_saved_slow_y, sequencer->d_f_saved_slow_z,
666  sequencer->d_atomFixed, this->m_d_atom_total_force_copy_list,
667  this->m_atom_total_force_copy_list.size(),
668  this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
669  } else {
670  copyTotalForcesToClientsCUDAMGPU(
671  simParams->fixedAtomsOn,
672  (const double **)m_d_peer_tf_array.d_f_normal_x,
673  (const double **)m_d_peer_tf_array.d_f_normal_y,
674  (const double **)m_d_peer_tf_array.d_f_normal_z,
675  (const double **)m_d_peer_tf_array.d_f_saved_nbond_x,
676  (const double **)m_d_peer_tf_array.d_f_saved_nbond_y,
677  (const double **)m_d_peer_tf_array.d_f_saved_nbond_z,
678  (const double **)m_d_peer_tf_array.d_f_saved_slow_x,
679  (const double **)m_d_peer_tf_array.d_f_saved_slow_y,
680  (const double **)m_d_peer_tf_array.d_f_saved_slow_z,
681  (const int **)m_d_peer_tf_array.d_atomFixed,
682  this->m_d_atom_total_force_copy_list,
683  this->m_atom_total_force_copy_list.size(),
684  this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
685  }
686  // cudaCheck(cudaStreamSynchronize(m_stream));
687  cudaCheck(cudaSetDevice(savedDevice));
688  const auto endTime = std::chrono::high_resolution_clock::now();
689  m_t_copy_total_forces += endTime - startTime;
690  NAMD_EVENT_STOP(1, NamdProfileEvent::CUDAGM_TOTALFORCETOCLIENTS);
691 }
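// Editorial sketch (not part of the original file): judging from the arguments
// passed above, the per-atom "total force" delivered to clients is assumed to
// be the sum of the normal, saved nonbonded, and saved slow components, with
// fixed atoms skipped when fixedAtomsOn is set; per component roughly:
#if 0
static double toyTotalForceX(size_t soa, const double *fNormalX,
                             const double *fSavedNbondX, const double *fSavedSlowX,
                             const int *atomFixed, bool fixedAtomsOn) {
  if (fixedAtomsOn && atomFixed[soa]) return 0.0;
  return fNormalX[soa] + fSavedNbondX[soa] + fSavedSlowX[soa];
}
#endif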
692 
693 void CudaGlobalMasterServer::buildAtomsTotalForcesCopyList() {
694 #ifdef DEBUGM
695  DebugM(3, "buildAtomsTotalForcesCopyList is called\n");
696 #endif
697  const auto startTime = std::chrono::high_resolution_clock::now();
698  // Save the current device ID
699  int savedDevice;
700  cudaCheck(cudaGetDevice(&savedDevice));
701  cudaCheck(cudaSetDevice(m_device_id));
702  buildCopyList(&CudaGlobalMasterClient::getRequestedTotalForcesAtoms,
703  &CudaGlobalMasterClient::getTotalForces, m_clients,
704  m_atom_map_lists, m_local_records, m_global_to_local_id,
705  m_src_devs, m_device_id_to_index,
706  m_atom_total_force_client_buffers, m_atom_total_force_copy_list,
707  m_d_atom_total_force_copy_list, m_stream);
708  cudaCheck(cudaSetDevice(savedDevice));
709  const auto endTime = std::chrono::high_resolution_clock::now();
710  m_t_build_copy_lists += endTime - startTime;
711 }
712 
713 void CudaGlobalMasterServer::buildForcedAtomsCopyList() {
714 #ifdef DEBUGM
715  DebugM(3, "buildForcedAtomsCopyList is called\n");
716 #endif
717  const auto startTime = std::chrono::high_resolution_clock::now();
718  // Save the current device ID
719  int savedDevice;
720  cudaCheck(cudaGetDevice(&savedDevice));
721  cudaCheck(cudaSetDevice(m_device_id));
722  buildCopyList(&CudaGlobalMasterClient::getRequestedForcedAtoms,
723  &CudaGlobalMasterClient::getAppliedForces, m_clients,
724  m_atom_map_lists, m_local_records, m_global_to_local_id,
725  m_src_devs, m_device_id_to_index,
726  m_d_forced_atom_client_buffers, m_forced_atom_copy_list,
727  m_d_forced_atom_copy_list, m_stream, &m_unique_forced_atoms);
728  cudaCheck(cudaSetDevice(savedDevice));
729  const auto endTime = std::chrono::high_resolution_clock::now();
730  m_t_build_copy_lists += endTime - startTime;
731 }
732 
733 void CudaGlobalMasterServer::communicateToMD() {
734  // iout << iINFO << "PE: " << CkMyPe() << ", communicateToMD this address " << this << "\n" << endi;
735 #ifdef DEBUGM
736  DebugM(1, "Calling communicateToMD at step " + std::to_string(m_step));
737 #endif
738  bool b_buildForcedAtomsCopyList = false;
739  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
740  if ((*it)->requestedForcedAtomsChanged()) {
741  b_buildForcedAtomsCopyList = true;
742  }
743  }
744  if (b_buildForcedAtomsCopyList || m_atom_maps_changed > 0 ||
745  m_clients_changed > 0) {
746  buildForcedAtomsCopyList();
747  if (m_atom_maps_changed > 0)
748  m_atom_maps_changed--;
749  if (m_clients_changed > 0)
750  m_clients_changed--;
751  }
752  bool b_copyForcedAtoms = false;
753  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
754  if ((*it)->requestUpdateForcedAtoms()) {
755  b_copyForcedAtoms = true;
756  }
757  }
758  NAMD_EVENT_START(1, NamdProfileEvent::CUDAGM_ADDGLOBALFORCES);
759  const auto startTime = std::chrono::high_resolution_clock::now();
760  if (b_copyForcedAtoms) {
761  addGlobalForces();
762  }
763  const auto endTime = std::chrono::high_resolution_clock::now();
764  m_t_add_global_forces += endTime - startTime;
765  NAMD_EVENT_STOP(1, NamdProfileEvent::CUDAGM_ADDGLOBALFORCES);
766 }
767 
768 bool CudaGlobalMasterServer::willAddGlobalForces() const {
769  bool result = false;
770  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
771  result |= (!((*it)->getRequestedForcedAtoms().empty()) &&
772  (*it)->requestUpdateForcedAtoms());
773  }
774  return result;
775 }
776 
777 void CudaGlobalMasterServer::addGlobalForces() {
778 #ifdef DEBUGM
779  DebugM(1, "Calling addGlobalForces at step " + std::to_string(m_step));
780 #endif
782  // Save the current device ID
783  int savedDevice;
784  cudaCheck(cudaGetDevice(&savedDevice));
785  cudaCheck(cudaSetDevice(m_device_id));
786  if (m_num_devices == 1) {
787  const int masterPe = deviceCUDA->getMasterPe();
788  const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
789  addGlobalForcesFromClients(
790  simParams->fixedAtomsOn, m_unique_forced_atoms, sequencer->d_f_global_x,
791  sequencer->d_f_global_y, sequencer->d_f_global_z,
792  sequencer->d_atomFixed, this->m_d_forced_atom_copy_list,
793  this->m_forced_atom_copy_list.size(),
794  this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
795  } else {
796  addGlobalForcesFromClientsMGPU(
797  simParams->fixedAtomsOn, m_unique_forced_atoms, m_d_peer_af_array.d_f_applied_x,
798  m_d_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_z,
799  (const int **)m_d_peer_af_array.d_atomFixed,
800  this->m_d_forced_atom_copy_list, this->m_forced_atom_copy_list.size(),
801  this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
802  }
803  cudaCheck(cudaStreamSynchronize(m_stream));
804  cudaCheck(cudaSetDevice(savedDevice));
805 }
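// Editorial sketch (not part of the original file): m_unique_forced_atoms
// (computed in buildCopyList) is assumed to tell the force-addition kernels
// whether two list entries can target the same SOA index; when duplicates are
// possible the accumulation has to be atomic to avoid a data race (the double
// atomicAdd below assumes sm_60+ hardware):
#if 0
__global__ void toyAddForcesX(const CopyListTupleT *list, size_t n,
                              const double *const *clientFx, double *fx,
                              bool uniqueList) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  const double f = clientFx[list[i].m_client_index][list[i].m_client_atom_pos];
  if (uniqueList) fx[list[i].m_soa_index] += f;     // no two entries share a SOA index
  else atomicAdd(&fx[list[i].m_soa_index], f);      // duplicates possible: add atomically
}
#endif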
806 
807 void CudaGlobalMasterServer::allocatePeerArrays() {
808 #ifdef DEBUGM
809  DebugM(3, "CudaGlobalMasterServer::allocatePeerArrays");
810 #endif
811  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_x), m_num_devices);
812  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_y), m_num_devices);
813  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_z), m_num_devices);
814  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_x), m_num_devices);
815  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_y), m_num_devices);
816  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_z), m_num_devices);
817  allocate_host<float *>(&(m_h_peer_atom_data.d_mass), m_num_devices);
818  allocate_host<float *>(&(m_h_peer_atom_data.d_charge), m_num_devices);
819  allocate_host<char3 *>(&(m_h_peer_atom_data.d_transform), m_num_devices);
820 
821  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_x), m_num_devices);
822  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_y), m_num_devices);
823  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_z), m_num_devices);
824  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_x),
825  m_num_devices);
826  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_y),
827  m_num_devices);
828  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_z),
829  m_num_devices);
830  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_x),
831  m_num_devices);
832  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_y),
833  m_num_devices);
834  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_z),
835  m_num_devices);
836  allocate_host<int *>(&(m_h_peer_tf_array.d_atomFixed), m_num_devices);
837 
838  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_x), m_num_devices);
839  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_y), m_num_devices);
840  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_z), m_num_devices);
841  allocate_host<int *>(&(m_h_peer_af_array.d_atomFixed), m_num_devices);
842 
843  // Save the current device ID
844  int savedDevice;
845  cudaCheck(cudaGetDevice(&savedDevice));
846  cudaCheck(cudaSetDevice(m_device_id));
847  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_x), m_num_devices);
848  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_y), m_num_devices);
849  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_z), m_num_devices);
850  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_x), m_num_devices);
851  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_y), m_num_devices);
852  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_z), m_num_devices);
853  allocate_device<float *>(&(m_d_peer_atom_data.d_mass), m_num_devices);
854  allocate_device<float *>(&(m_d_peer_atom_data.d_charge), m_num_devices);
855  allocate_device<char3 *>(&(m_d_peer_atom_data.d_transform), m_num_devices);
856 
857  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_x), m_num_devices);
858  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_y), m_num_devices);
859  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_z), m_num_devices);
860  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_x),
861  m_num_devices);
862  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_y),
863  m_num_devices);
864  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_z),
865  m_num_devices);
866  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_x),
867  m_num_devices);
868  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_y),
869  m_num_devices);
870  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_z),
871  m_num_devices);
872  allocate_device<int *>(&(m_d_peer_tf_array.d_atomFixed), m_num_devices);
873 
874  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_x), m_num_devices);
875  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_y), m_num_devices);
876  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_z), m_num_devices);
877  allocate_device<int *>(&(m_d_peer_af_array.d_atomFixed), m_num_devices);
878  cudaCheck(cudaSetDevice(savedDevice));
879 }
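// Editorial sketch (not part of the original file): for every per-atom field,
// the multi-GPU path keeps a small table of per-device pointers. The table is
// filled on the host (updateAtomMaps) and mirrored to this server's device
// (copyPeerArraysToDevice) so kernels can reach peer buffers by device index.
// A minimal analogue for one field, using plain CUDA runtime calls:
#if 0
static void toyMirrorPointerTable(double *const *h_ptrs, // numDevices host-side entries
                                  double ***d_table_out, // receives the device copy
                                  int numDevices, cudaStream_t stream) {
  double **d_table = nullptr;
  cudaCheck(cudaMalloc(&d_table, numDevices * sizeof(double *)));
  cudaCheck(cudaMemcpyAsync(d_table, h_ptrs, numDevices * sizeof(double *),
                            cudaMemcpyHostToDevice, stream));
  *d_table_out = d_table;  // kernels can then read d_table[deviceIndex][soaIndex]
}
#endif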
880 
881 void CudaGlobalMasterServer::copyPeerArraysToDevice() {
882 #ifdef DEBUGM
883  DebugM(3, "CudaGlobalMasterServer::copyPeerArraysToDevice");
884 #endif
885  // Save the current device ID
886  int savedDevice;
887  cudaCheck(cudaGetDevice(&savedDevice));
888  cudaCheck(cudaSetDevice(m_device_id));
889  copy_HtoD(m_h_peer_atom_data.d_pos_x, m_d_peer_atom_data.d_pos_x,
890  m_num_devices, m_stream);
891  copy_HtoD(m_h_peer_atom_data.d_pos_y, m_d_peer_atom_data.d_pos_y,
892  m_num_devices, m_stream);
893  copy_HtoD(m_h_peer_atom_data.d_pos_z, m_d_peer_atom_data.d_pos_z,
894  m_num_devices, m_stream);
895  copy_HtoD(m_h_peer_atom_data.d_vel_x, m_d_peer_atom_data.d_vel_x,
896  m_num_devices, m_stream);
897  copy_HtoD(m_h_peer_atom_data.d_vel_y, m_d_peer_atom_data.d_vel_y,
898  m_num_devices, m_stream);
899  copy_HtoD(m_h_peer_atom_data.d_vel_z, m_d_peer_atom_data.d_vel_z,
900  m_num_devices, m_stream);
901  copy_HtoD(m_h_peer_atom_data.d_mass, m_d_peer_atom_data.d_mass, m_num_devices,
902  m_stream);
903  copy_HtoD(m_h_peer_atom_data.d_charge, m_d_peer_atom_data.d_charge,
904  m_num_devices, m_stream);
905  copy_HtoD(m_h_peer_atom_data.d_transform, m_d_peer_atom_data.d_transform,
906  m_num_devices, m_stream);
907 
908  copy_HtoD(m_h_peer_tf_array.d_f_normal_x, m_d_peer_tf_array.d_f_normal_x,
909  m_num_devices, m_stream);
910  copy_HtoD(m_h_peer_tf_array.d_f_normal_y, m_d_peer_tf_array.d_f_normal_y,
911  m_num_devices, m_stream);
912  copy_HtoD(m_h_peer_tf_array.d_f_normal_z, m_d_peer_tf_array.d_f_normal_z,
913  m_num_devices, m_stream);
914  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_x,
915  m_d_peer_tf_array.d_f_saved_nbond_x, m_num_devices, m_stream);
916  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_y,
917  m_d_peer_tf_array.d_f_saved_nbond_y, m_num_devices, m_stream);
918  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_z,
919  m_d_peer_tf_array.d_f_saved_nbond_z, m_num_devices, m_stream);
920  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_x,
921  m_d_peer_tf_array.d_f_saved_slow_x, m_num_devices, m_stream);
922  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_y,
923  m_d_peer_tf_array.d_f_saved_slow_y, m_num_devices, m_stream);
924  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_z,
925  m_d_peer_tf_array.d_f_saved_slow_z, m_num_devices, m_stream);
926  copy_HtoD(m_h_peer_tf_array.d_atomFixed, m_d_peer_tf_array.d_atomFixed,
927  m_num_devices, m_stream);
928 
929  copy_HtoD(m_h_peer_af_array.d_f_applied_x, m_d_peer_af_array.d_f_applied_x,
930  m_num_devices, m_stream);
931  copy_HtoD(m_h_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_y,
932  m_num_devices, m_stream);
933  copy_HtoD(m_h_peer_af_array.d_f_applied_z, m_d_peer_af_array.d_f_applied_z,
934  m_num_devices, m_stream);
935  copy_HtoD(m_h_peer_af_array.d_atomFixed, m_d_peer_af_array.d_atomFixed,
936  m_num_devices, m_stream);
937 
938  // cudaCheck(cudaStreamSynchronize(m_stream));
939  cudaCheck(cudaSetDevice(savedDevice));
940 }
941 
942 #ifdef NODEGROUP_FORCE_REGISTER
943 
944 void CudaGlobalMasterServer::finishReductions(bool doEnergy, bool doVirial,
945  NodeReduction *reduction) {
946 #ifdef DEBUGM
947  DebugM(1, "Calling finishReductions at step " + std::to_string(m_step));
948 #endif
949  NAMD_EVENT_START(1, NamdProfileEvent::CUDAGM_FINISHREDUCTIONS);
950  const auto startTime = std::chrono::high_resolution_clock::now();
951  // CkPrintf("Calling CudaGlobalMasterServer::finishReductions\n");
952  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
953  (*it)->finishReductions(doEnergy, doVirial, reduction);
954  }
955  const auto endTime = std::chrono::high_resolution_clock::now();
956  m_t_reductions += endTime - startTime;
957  NAMD_EVENT_STOP(1, NamdProfileEvent::CUDAGM_FINISHREDUCTIONS);
958 }
959 
960 #endif // NODEGROUP_FORCE_REGISTER
961 
962 #else
963 
964 CudaGlobalMasterServer::CudaGlobalMasterServer(
965  int deviceID, int printProfilingFreq /* = -1 */
966 ) {
967  NAMD_die(
968  "CudaGlobalMasterServer requires NAMD to be built with CUDA support.\n");
969 }
970 #endif // defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)