#include <type_traits>
// ...

#define MIN_DEBUG_LEVEL 3
// ...

std::string ptr_to_str(const void *ptr) {
  std::ostringstream oss;
  // ...
#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)

using CopyListTupleT = CudaGlobalMasterServer::CopyListTuple;
using ClientBufferT = CudaGlobalMasterServer::ClientBuffer;
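
// queryLocalID (defined below; its opening signature lines are elided in this
// listing): given a global atom ID, walk one device's list of per-PE AtomMaps,
// translate the (patch ID, index-in-patch) pair into an index into that
// device's SOA arrays via h_globalToLocalID and the CudaLocalRecord buffer
// offsets, and record whether the atom was found. The core lookup is roughly:
//
//   const LocalID lid = atomMapsList[i]->localID(globalID);
//   const int soaPid  = h_globalToLocalID[lid.pid];
//   result.soaIndex   = localRecords[soaPid].bufferOffset + lid.index;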
// ...
                        const std::vector<AtomMap *> &atomMapsList,
                        const std::vector<CudaLocalRecord> &localRecords,
                        const int *h_globalToLocalID) {
  // ...
  for (int i = 0; i < atomMapsList.size(); ++i) {
    const LocalID lid = atomMapsList[i]->localID(globalID);
    // ...
    const int soaPid = h_globalToLocalID[lid.pid];
    // ...
    result.soaIndex = localRecords[soaPid].bufferOffset + lid.index;
    // ...
    DebugM(1, "Atom " + std::to_string(globalID) + " found in local patch " +
               std::to_string(lid.pid) + " with local index " +
               std::to_string(lid.index) + ", SOA patch ID " +
               std::to_string(soaPid) + ", SOA global index " +
               std::to_string(result.soaIndex) + "\n");
    // ...
  }
  if (result.found == false) {
    DebugM(3, "Atom " + std::to_string(globalID) + " not found.\n");
  }
  // ...
template <typename F1, typename F2>
void buildCopyList(  // function name inferred from the call sites below; return type assumed void
    F1 getAtomID, F2 getPosOrForceBuffer,
    std::vector<std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient>> &clients,
    const std::vector<std::vector<AtomMap *>> &atomMapsLists,
    const std::vector<std::vector<CudaLocalRecord>> &localRecords,
    const std::vector<int *> h_globalToLocalID,
    const std::vector<int> &sourceDevicesList,
    const std::unordered_map<int, int> &deviceToIndex,
    ClientBufferT *&d_clientBuffers,
    std::vector<CopyListTupleT, CudaGlobalMasterServer::CudaHostAllocator<CopyListTupleT>> &hostCopyList,
    CopyListTupleT *&d_copyList, cudaStream_t stream,
    bool *checkUniqueList = nullptr) {
  // ...
  std::vector<ClientBufferT> clientBuffers;
  hostCopyList.clear();
  const size_t numClients = clients.size();
  const int numDevices = atomMapsLists.size();
  for (size_t i = 0; i < numClients; ++i) {
    const auto &client = clients[i];
    // ...
    clientBuffers.push_back(ClientBufferT{
        ((*client).*getPosOrForceBuffer)(), client->getMasses(),
        client->getCharges(), client->getTransforms(), client->getVelocities(),
        ((*client).*getAtomID)().size()});
    const auto &requested_atoms = ((*client).*getAtomID)();
    // ...
    for (size_t j = 0; j < requested_atoms.size(); ++j) {
      // ...
      const AtomID gid = requested_atoms[j];
      // ...
      for (int k = 0; k < numDevices; ++k) {
        result = queryLocalID(gid, atomMapsLists[k], localRecords[k],
                              h_globalToLocalID[k]);
        // ...
        hostCopyList.push_back(CopyListTupleT{
            deviceToIndex.at(sourceDevicesList[k]), result.soaIndex, i, j});
        // ...
      }
      // ...
      const std::string error =
          "Cannot find the local ID in SOA arrays of atom " +
          std::to_string(gid) + " requested by client[" + std::to_string(i) +
          "] (" + client->name() + ")\n";
      // ...
    }
  }
  std::sort(hostCopyList.begin(), hostCopyList.end(),
            [](const CopyListTupleT &a, const CopyListTupleT &b) {
              return a.m_soa_index < b.m_soa_index;
            });
  if (checkUniqueList) {
    // ...
    auto tmp_list = hostCopyList;
    // ...
    auto last = std::unique(tmp_list.begin(), tmp_list.end(),
                            [](const CopyListTupleT &a, const CopyListTupleT &b) {
                              return a.m_soa_index == b.m_soa_index;
                            });
    if (last == tmp_list.end()) {
      *checkUniqueList = true;
    } else {
      *checkUniqueList = false;
    }
  }
  // ...
  size_t copySize = sizeof(CopyListTupleT) * hostCopyList.size();
  // ...
  DebugM(3, "Will copy " + std::to_string(hostCopyList.size()) + " items.\n");
  // ...
  if (d_copyList != nullptr) {
    cudaCheck(cudaFree(d_copyList));  // free of the stale list inferred; the original lines are elided
  }
  cudaCheck(cudaMalloc(&d_copyList, copySize));
  cudaCheck(cudaMemcpyAsync(d_copyList, hostCopyList.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  copySize = sizeof(ClientBufferT) * clientBuffers.size();
  if (d_clientBuffers != nullptr) {
    cudaCheck(cudaFree(d_clientBuffers));  // inferred, see above
  }
  cudaCheck(cudaMalloc(&d_clientBuffers, copySize));
  cudaCheck(cudaMemcpyAsync(d_clientBuffers, clientBuffers.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
}
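
// copyLatticeToClient(): pushes the host-side lattice buffer (12 doubles,
// packed in communicateToClients() as a.x a.y a.z | b.x b.y b.z | c.x c.y c.z |
// origin.x origin.y origin.z) to the device buffer returned by
// client->getLattice(). If the client requested lattice updates but returns a
// null device pointer, an error message is prepared (the error-handling lines
// are elided in this listing).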
void copyLatticeToClient(
    const std::vector<double, CudaGlobalMasterServer::CudaHostAllocator<double>> &lat,
    int deviceID,
    std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient> client,
    cudaStream_t stream) {
  const size_t copySize = sizeof(double) * lat.size();
  // ...
  double *d_lattice = client->getLattice();
  // ...
  cudaCheck(cudaMemcpyAsync(d_lattice, lat.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  // ...
  const std::string error =
      "Failed to copy lattice to client " + client->name() +
      " (lattice requested but the client provides a "
      "nullptr to the device buffer)";
  // ...
}
template <typename T>
void debugCopyList(const std::string &name,
                   const T &L) {  // parameter inferred from the body below
  std::cout << "CudaGlobalMasterServer: the copylist is " << name << ", with "
            << L.size() << " items.\n";
  for (size_t i = 0; i < L.size(); ++i) {
    printf(  // call inferred from the printf-style format string; the original line is elided
        "i = %lu, deviceIndex = %d, soaIndex = %d, clientIndex = %lu, "
        "clientArrayIndex = %lu\n",
        i, L[i].m_src_dev_index, L[i].m_soa_index, L[i].m_client_index,
        L[i].m_client_atom_pos);
  }
}
void debugClientBuffer(const std::string &name, ClientBufferT *B,
                       size_t numClients) {  // parameter inferred from the body below
  std::vector<ClientBufferT> hB(numClients);
  cudaPointerAttributes attributes;
  cudaCheck(cudaPointerGetAttributes(&attributes, B));
  std::cout << "CudaGlobalMasterServer: the clientBuffer is " << name
            << ", with " << numClients << " items.\n";
  std::cout << "deviceBuffer pointer = " << static_cast<void *>(B) << '\n';
  std::cout << "deviceBuffer attributes:\n";
  std::cout << "memory type: " << attributes.type << std::endl;
  cudaCheck(cudaMemcpy(hB.data(), B, sizeof(ClientBufferT) * numClients,
                       cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < numClients; ++i) {
    printf(  // call inferred from the printf-style format string; the original line is elided
        "i = %lu, d_data = %p, d_mass = %p, d_charge = %p, size = %lu\n", i,
        hB[i].d_data, hB[i].d_mass, hB[i].d_charge, hB[i].sz);
  }
  std::cout << std::endl;
}
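
// Constructor: records the server's device ID and the number of devices in the
// run, sizes the per-device bookkeeping vectors, builds the deviceID -> index
// map from deviceCUDA->allDevices(), allocates peer pointer tables for
// multi-GPU runs, zero-initializes the device-side copy lists and client
// buffer arrays, and sizes the pinned host lattice buffer (three lattice
// vectors plus the origin, i.e. 12 doubles). GPU-resident mode
// (CUDASOAintegrateMode) is required.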
CudaGlobalMasterServer::CudaGlobalMasterServer(int deviceID,
                                               int printProfilingFreq)
    : m_device_id(deviceID), m_step(0),
      m_num_devices(deviceCUDA->getNumDevice()), m_clients_changed(0),
      m_atom_maps_changed(0), m_print_profiling_freq(printProfilingFreq),
      reductionGpuResident(nullptr) {
  iout << iINFO << "CudaGlobalMasterServer: initialized on PE " << CkMyPe()
       << " and GPU device " << m_device_id << "\n"
       << endi;
  // ... (stream creation and related setup elided in this listing)
#ifdef NAMD_NVTX_ENABLED
  nvtxNameCuStreamA(m_stream, "CudaGlobalMaster stream");
#endif
  m_atom_map_lists.resize(m_num_devices);
  m_src_devs.resize(m_num_devices);
  m_local_records.resize(m_num_devices);
  m_global_to_local_id.resize(m_num_devices);
  const int *allDevices = deviceCUDA->allDevices();  // definition inferred; the original line is elided
  for (int i = 0; i < m_num_devices; ++i) {
    const int currentDeviceID = allDevices[i];
    m_device_id_to_index[currentDeviceID] = i;
    m_src_devs[i] = currentDeviceID;
  }
  if (m_num_devices > 1) {
    allocatePeerArrays();
  }
  // ...
  m_d_atom_pos_copy_list = nullptr;
  m_d_atom_total_force_copy_list = nullptr;
  m_d_forced_atom_copy_list = nullptr;
  // ...
  m_d_atom_pos_client_buffers = nullptr;
  m_atom_total_force_client_buffers = nullptr;
  m_d_forced_atom_client_buffers = nullptr;
  m_unique_forced_atoms = false;
  // ...
  constexpr int m_h_lattice_size =
      (sizeof(std::invoke_result_t<decltype(&Lattice::a), Lattice>) +
       /* ... the b(), c(), and origin() terms are elided in this listing ... */) /
      // division inferred: total bytes of the lattice vectors over one component
      sizeof(std::invoke_result_t<decltype(&Vector::operator[]), Vector, int>);
  m_h_lattice.resize(m_h_lattice_size);
  // ...
  if (!params->CUDASOAintegrateMode) {
    NAMD_bug("CudaGlobalMasterServer only supports GPU-resident mode");
  }
  // ...
}
CudaGlobalMasterServer::~CudaGlobalMasterServer() {
  if (m_d_atom_pos_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_atom_pos_copy_list));
    m_d_atom_pos_copy_list = nullptr;
  }
  if (m_d_atom_pos_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_d_atom_pos_client_buffers));
    m_d_atom_pos_client_buffers = nullptr;
  }
  if (m_d_atom_total_force_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_atom_total_force_copy_list));
    m_d_atom_total_force_copy_list = nullptr;
  }
  if (m_atom_total_force_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_atom_total_force_client_buffers));
    m_atom_total_force_client_buffers = nullptr;
  }
  if (m_d_forced_atom_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_forced_atom_copy_list));
    m_d_forced_atom_copy_list = nullptr;
  }
  if (m_d_forced_atom_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_d_forced_atom_client_buffers));
    m_d_forced_atom_client_buffers = nullptr;
  }
  if (reductionGpuResident != nullptr) {
    delete reductionGpuResident;
  }
  iout << iINFO << "CudaGlobalMasterServer: destructed on PE " << CkMyPe()
       /* ... remainder of the message elided ... */ << endi;
}
void CudaGlobalMasterServer::addClient(
    std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient> client) {
  auto it = std::find(m_clients.begin(), m_clients.end(), client);
  if (it == m_clients.end()) {
    iout << iINFO << "CudaGlobalMasterServer: adding client \""
         << client->name() << "\"\n" << endi;
    // ...
    m_clients.push_back(client);
    m_clients_changed = CudaGlobalMasterServer::numCopyLists;
  } else {
    const std::string error =
        "The client \"" + client->name() + "\" is being added twice.\n";
    // ...
  }
}
void CudaGlobalMasterServer::removeClient(
    std::shared_ptr<CudaGlobalMaster::CudaGlobalMasterClient> client) {
  auto it = std::find(m_clients.begin(), m_clients.end(), client);
  if (it == m_clients.end()) {
    iout << iWARN << "CudaGlobalMasterServer: the client \"" << client->name()
         << "\" is not registered with CudaGlobalMasterServer\n" << endi;
    // ...
  }
  while (it != m_clients.end()) {
    iout << iINFO << "CudaGlobalMasterServer: removing client \""
         << client->name() << "\"\n" << endi;
    // ... (erase the matching entry; elided in this listing)
    m_clients_changed = CudaGlobalMasterServer::numCopyLists;
    it = std::find(m_clients.begin(), m_clients.end(), client);
  }
}
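
// updateAtomMaps(): rebuilds the per-device lookup state after atom migration.
// It groups every PE's AtomMap by the device that PE drives, then caches, from
// each device's master-PE SequencerCUDA, the SOA pointers for positions,
// velocities, masses, charges, transforms, and the normal/saved force arrays.
// In multi-GPU runs the collected peer pointer tables are pushed to the device
// via copyPeerArraysToDevice(). Finally m_atom_maps_changed is reset to
// numCopyLists so each copy list is rebuilt on its next use.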
void CudaGlobalMasterServer::updateAtomMaps() {
  const int numPes = CkNumPes();
  // ...
  DebugM(3, "updateAtomMaps: number of PEs = " + std::to_string(numPes) + "\n");
  // ...
  for (int i = 0; i < numPes; ++i) {
    // ... (look up this PE's device and AtomMap; elided in this listing)
    const int j = m_device_id_to_index.at(peDevice);
    // ...
    DebugM(3, "updateAtomMaps: PE " + std::to_string(i) + " atomMap " +
               ptr_to_str(amap) + " on device " + std::to_string(peDevice) +
               "\n");
    // ...
    m_atom_map_lists[j].push_back(amap);
  }
  const bool multi_gpu = m_num_devices > 1;
  // ...
  for (int i = 0; i < m_num_devices; ++i) {
    const int deviceID = m_src_devs[i];
    // ... (find the master PE of this device; elided in this listing)
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    m_global_to_local_id[i] = sequencer->globalToLocalID;
    m_local_records[i] = sequencer->patchData->devData[deviceID].h_localPatches;
    // ...
    m_h_peer_atom_data.d_pos_x[i] = sequencer->d_pos_x;
    m_h_peer_atom_data.d_pos_y[i] = sequencer->d_pos_y;
    m_h_peer_atom_data.d_pos_z[i] = sequencer->d_pos_z;
    m_h_peer_atom_data.d_vel_x[i] = sequencer->d_vel_x;
    m_h_peer_atom_data.d_vel_y[i] = sequencer->d_vel_y;
    m_h_peer_atom_data.d_vel_z[i] = sequencer->d_vel_z;
    m_h_peer_atom_data.d_mass[i] = sequencer->d_mass;
    m_h_peer_atom_data.d_charge[i] = sequencer->d_charge;
    m_h_peer_atom_data.d_transform[i] = sequencer->d_transform;
    // ...
    m_h_peer_tf_array.d_f_normal_x[i] = sequencer->d_f_normal_x;
    m_h_peer_tf_array.d_f_normal_y[i] = sequencer->d_f_normal_y;
    m_h_peer_tf_array.d_f_normal_z[i] = sequencer->d_f_normal_z;
    m_h_peer_tf_array.d_f_saved_nbond_x[i] = sequencer->d_f_saved_nbond_x;
    m_h_peer_tf_array.d_f_saved_nbond_y[i] = sequencer->d_f_saved_nbond_y;
    m_h_peer_tf_array.d_f_saved_nbond_z[i] = sequencer->d_f_saved_nbond_z;
    m_h_peer_tf_array.d_f_saved_slow_x[i] = sequencer->d_f_saved_slow_x;
    m_h_peer_tf_array.d_f_saved_slow_y[i] = sequencer->d_f_saved_slow_y;
    m_h_peer_tf_array.d_f_saved_slow_z[i] = sequencer->d_f_saved_slow_z;
    m_h_peer_tf_array.d_atomFixed[i] = sequencer->d_atomFixed;
    // ...
    m_h_peer_af_array.d_f_applied_x[i] = sequencer->d_f_global_x;
    m_h_peer_af_array.d_f_applied_y[i] = sequencer->d_f_global_y;
    m_h_peer_af_array.d_f_applied_z[i] = sequencer->d_f_global_z;
    m_h_peer_af_array.d_atomFixed[i] = sequencer->d_atomFixed;
    // ...
    DebugM(3, "updateAtomMaps: device " + std::to_string(deviceID) +
               ", sequencer " + ptr_to_str(sequencer) + "\n");
  }
  // ...
  if (multi_gpu) {  // guard inferred: peer arrays exist only in multi-GPU runs
    copyPeerArraysToDevice();
  }
  // ...
  m_atom_maps_changed = CudaGlobalMasterServer::numCopyLists;
}
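
// setStep() forwards the current MD step to every client.
// communicateToClients() is the "MD -> clients" half of a step: it packs the
// current lattice into m_h_lattice as 12 doubles,
//
//   [ a.x a.y a.z  b.x b.y b.z  c.x c.y c.z  origin.x origin.y origin.z ]
//
// pushes it to clients that asked for lattice updates, rebuilds the position
// and total-force copy lists when clients or atom maps changed, runs the
// requested position/mass/charge/transform/velocity and total-force copies,
// and finally notifies every client via onBuffersUpdated().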
void CudaGlobalMasterServer::setStep(int64_t step) {
  m_step = step;
  // ...
  for (auto &client : m_clients) {
    client->setStep(step);
  }
}
void CudaGlobalMasterServer::communicateToClients(const Lattice *lat) {
  // ...
  // Lattice unit-cell vectors and origin (definitions inferred from the
  // accesses below; the original lines are elided in this listing).
  const Vector lat_a = lat->a();
  const Vector lat_b = lat->b();
  const Vector lat_c = lat->c();
  const Vector lat_o = lat->origin();
  m_h_lattice[0] = lat_a.x;
  m_h_lattice[1] = lat_a.y;
  m_h_lattice[2] = lat_a.z;
  m_h_lattice[3] = lat_b.x;
  m_h_lattice[4] = lat_b.y;
  m_h_lattice[5] = lat_b.z;
  m_h_lattice[6] = lat_c.x;
  m_h_lattice[7] = lat_c.y;
  m_h_lattice[8] = lat_c.z;
  m_h_lattice[9] = lat_o.x;
  m_h_lattice[10] = lat_o.y;
  m_h_lattice[11] = lat_o.z;
  bool b_buildAtomsPositionCopyList = false;
  bool b_buildAtomsTotalForcesCopyList = false;
  for (auto &&client : m_clients) {
    // ...
    if (client->requestedAtomsChanged()) {
      b_buildAtomsPositionCopyList = true;
    }
    if (client->requestedTotalForcesAtomsChanged()) {
      b_buildAtomsTotalForcesCopyList = true;
    }
    if (client->requestUpdateLattice()) {
      copyLatticeToClient(m_h_lattice, m_device_id, client, m_stream);
    }
  }
  if (b_buildAtomsPositionCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildAtomsCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  if (b_buildAtomsTotalForcesCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildAtomsTotalForcesCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  bool b_copyPositions = false;
  bool b_copyTotalForces = false;
  bool b_copyMasses = false;
  bool b_copyCharges = false;
  bool b_copyTransforms = false;
  bool b_copyVelocities = false;
  for (auto &&client : m_clients) {
    b_copyPositions |= client->requestUpdateAtomPositions();
    b_copyTotalForces |= client->requestUpdateAtomTotalForces();
    b_copyMasses |= client->requestUpdateMasses();
    b_copyCharges |= client->requestUpdateCharges();
    b_copyTransforms |= client->requestUpdateTransforms();
    b_copyVelocities |= client->requestUpdateVelocities();
  }
  if (b_buildAtomsPositionCopyList || b_copyPositions || b_copyMasses ||
      b_copyCharges || b_copyTransforms || b_copyVelocities) {
    copyAtomsToClients(b_copyPositions, b_copyMasses, b_copyCharges,
                       b_copyTransforms, b_copyVelocities);
  }
  if (b_copyTotalForces || b_buildAtomsTotalForcesCopyList) {
    copyTotalForcesToClients();
  }
  for (auto &&client : m_clients) {
    client->onBuffersUpdated();
  }
}
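
// requestedTotalForces() reports whether any client both names forced atoms
// and wants total forces back. buildAtomsCopyList() instantiates
// buildCopyList() with the getRequestedAtoms/getPositions member-function
// pointers to refresh the position copy list and its device mirror.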
bool CudaGlobalMasterServer::requestedTotalForces() const {
  bool result = false;  // accumulator inferred from the loop below
  for (auto &&client : m_clients) {
    result |= (!(client->getRequestedForcedAtoms().empty()) &&
               client->requestUpdateAtomTotalForces());
  }
  return result;
}
void CudaGlobalMasterServer::buildAtomsCopyList() {
  // ...
  DebugM(3, "buildAtomsCopyList is called\n");
  // ...
  buildCopyList(&CudaGlobalMaster::CudaGlobalMasterClient::getRequestedAtoms,
                &CudaGlobalMaster::CudaGlobalMasterClient::getPositions, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index, m_d_atom_pos_client_buffers,
                m_atom_pos_copy_list, m_d_atom_pos_copy_list, m_stream);
  // ...
}
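
// copyAtomsToClients(): launches the copy kernels that scatter the requested
// per-atom data into each client's device buffers. With a single device it
// reads the SOA arrays directly from that device's SequencerCUDA; with several
// devices it goes through the peer pointer tables gathered in updateAtomMaps()
// and the multi-GPU kernel variant.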
void CudaGlobalMasterServer::copyAtomsToClients(bool copyPositions,
                                                bool copyMasses,   // middle parameters inferred
                                                bool copyCharges,  // from the call sites and the
                                                bool copyTransforms,  // kernel arguments below
                                                bool copyVelocities) {
  // ...
  DebugM(1, "copyAtomsToClients is called\n");
  // ...
  if (m_num_devices == 1) {
#ifdef MIN_DEBUG_LEVEL
#if MIN_DEBUG_LEVEL <= 1
    debugCopyList("CudaGlobalMasterServer::copyAtomsToClients",
                  m_atom_pos_copy_list);
#endif
#endif
    // ... (masterPe for this device is obtained in code elided from this listing)
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyAtomsToClientsCUDA(
        copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
        sequencer->d_pos_x, sequencer->d_pos_y, sequencer->d_pos_z,
        sequencer->d_vel_x, sequencer->d_vel_y, sequencer->d_vel_z,
        sequencer->d_transform, sequencer->d_mass, sequencer->d_charge,
        sequencer->myLattice, this->m_d_atom_pos_copy_list,
        this->m_atom_pos_copy_list.size(), this->m_d_atom_pos_client_buffers,
        m_clients.size(), m_stream);
  } else {
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyAtomsToClientsCUDAMGPU(
        copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
        (const double **)m_h_peer_atom_data.d_pos_x,
        (const double **)m_h_peer_atom_data.d_pos_y,
        (const double **)m_h_peer_atom_data.d_pos_z,
        (const double **)m_h_peer_atom_data.d_vel_x,
        (const double **)m_h_peer_atom_data.d_vel_y,
        (const double **)m_h_peer_atom_data.d_vel_z,
        (const char3 **)m_h_peer_atom_data.d_transform,
        (const float **)m_h_peer_atom_data.d_mass,
        (const float **)m_h_peer_atom_data.d_charge, sequencer->myLattice,
        this->m_d_atom_pos_copy_list, this->m_atom_pos_copy_list.size(),
        this->m_d_atom_pos_client_buffers, m_clients.size(), m_stream);
  }
  // ...
}
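
// copyTotalForcesToClients(): same single- versus multi-GPU split as above,
// but for total forces. The kernels combine the normal force arrays with the
// saved nonbonded and slow (long-range) contributions and take the
// fixed-atom flags into account when fixedAtomsOn is set; the multi-GPU path
// reads the device-resident peer tables (m_d_peer_tf_array).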
void CudaGlobalMasterServer::copyTotalForcesToClients() {
  // ...
  DebugM(1, "copyTotalForcesToClients is called\n");
  // ...
  if (m_num_devices == 1) {
#ifdef MIN_DEBUG_LEVEL
#if MIN_DEBUG_LEVEL <= 1
    debugCopyList("CudaGlobalMasterServer::copyTotalForcesToClients",
                  m_atom_total_force_copy_list);
#endif
#endif
    // ... (masterPe for this device is obtained in code elided from this listing)
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyTotalForcesToClientsCUDA(
        simParams->fixedAtomsOn, sequencer->d_f_normal_x,
        sequencer->d_f_normal_y, sequencer->d_f_normal_z,
        sequencer->d_f_saved_nbond_x, sequencer->d_f_saved_nbond_y,
        sequencer->d_f_saved_nbond_z, sequencer->d_f_saved_slow_x,
        sequencer->d_f_saved_slow_y, sequencer->d_f_saved_slow_z,
        sequencer->d_atomFixed, this->m_d_atom_total_force_copy_list,
        this->m_atom_total_force_copy_list.size(),
        this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
  } else {
    copyTotalForcesToClientsCUDAMGPU(
        /* ... leading argument(s) elided in this listing ... */
        (const double **)m_d_peer_tf_array.d_f_normal_x,
        (const double **)m_d_peer_tf_array.d_f_normal_y,
        (const double **)m_d_peer_tf_array.d_f_normal_z,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_x,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_y,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_z,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_x,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_y,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_z,
        (const int **)m_d_peer_tf_array.d_atomFixed,
        this->m_d_atom_total_force_copy_list,
        this->m_atom_total_force_copy_list.size(),
        this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
  }
  // ...
}
void CudaGlobalMasterServer::buildAtomsTotalForcesCopyList() {
  // ...
  DebugM(3, "buildAtomsTotalForcesCopyList is called\n");
  // ...
  buildCopyList(&CudaGlobalMaster::CudaGlobalMasterClient::getRequestedTotalForcesAtoms,
                &CudaGlobalMaster::CudaGlobalMasterClient::getTotalForces, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index,
                m_atom_total_force_client_buffers, m_atom_total_force_copy_list,
                m_d_atom_total_force_copy_list, m_stream);
  // ...
}
void CudaGlobalMasterServer::buildForcedAtomsCopyList() {
  // ...
  DebugM(3, "buildForcedAtomsCopyList is called\n");
  // ...
  buildCopyList(&CudaGlobalMaster::CudaGlobalMasterClient::getRequestedForcedAtoms,
                &CudaGlobalMaster::CudaGlobalMasterClient::getAppliedForces, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index,
                m_d_forced_atom_client_buffers, m_forced_atom_copy_list,
                m_d_forced_atom_copy_list, m_stream, &m_unique_forced_atoms);
  // ...
}
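
// calculate() lets every client run its own per-step work.
// communicateToMD() is the "clients -> MD" half: it rebuilds the forced-atom
// copy list when needed (passing &m_unique_forced_atoms, which
// addGlobalForces() later forwards to the force-accumulation kernels), checks
// whether any client wants its forces applied, hands the current reduction
// object to each client's finishReductions(), and synchronizes the
// CudaGlobalMaster stream.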
void CudaGlobalMasterServer::calculate() {
  // ...
  for (auto &&client : m_clients) {
    // ... (per-client calculation; elided in this listing)
  }
  // ...
}
void CudaGlobalMasterServer::communicateToMD(bool doEnergy, bool doVirial) {
  // ...
  DebugM(1, "Calling communicateToMD at step " + std::to_string(m_step));
  // ...
  bool b_buildForcedAtomsCopyList = false;
  for (auto &&client : m_clients) {
    if (client->requestedForcedAtomsChanged()) {
      b_buildForcedAtomsCopyList = true;
    }
  }
  if (b_buildForcedAtomsCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildForcedAtomsCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  bool b_copyForcedAtoms = false;
  for (auto &&client : m_clients) {
    if (client->requestUpdateForcedAtoms()) {
      b_copyForcedAtoms = true;
    }
  }
  if (b_copyForcedAtoms) {
    // ... (elided in this listing)
  }
  auto *currentReduction = getCurrentReduction();
  for (auto &client : m_clients) {
    client->finishReductions(doEnergy, doVirial, currentReduction);
  }
  cudaCheck(cudaStreamSynchronize(m_stream));
  // ...
}
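
// willAddGlobalForces() reports whether any client both names forced atoms and
// wants its forces applied this step. addGlobalForces() then accumulates the
// client-supplied forces into the global force arrays (d_f_global_* on a
// single device, the peer d_f_applied_* tables across devices), honoring
// fixed atoms and the uniqueness flag computed by buildForcedAtomsCopyList().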
bool CudaGlobalMasterServer::willAddGlobalForces() const {
  bool result = false;  // accumulator inferred from the loop below
  for (auto &&client : m_clients) {
    result |= (!(client->getRequestedForcedAtoms().empty()) &&
               client->requestUpdateForcedAtoms());
  }
  return result;
}
void CudaGlobalMasterServer::addGlobalForces() {
  // ...
  DebugM(1, "Calling addGlobalForces at step " + std::to_string(m_step));
  // ...
  if (m_num_devices == 1) {
    // ... (masterPe for this device is obtained in code elided from this listing)
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    addGlobalForcesFromClients(
        simParams->fixedAtomsOn, m_unique_forced_atoms, sequencer->d_f_global_x,
        sequencer->d_f_global_y, sequencer->d_f_global_z,
        sequencer->d_atomFixed, this->m_d_forced_atom_copy_list,
        this->m_forced_atom_copy_list.size(),
        this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
  } else {
    addGlobalForcesFromClientsMGPU(
        simParams->fixedAtomsOn, m_unique_forced_atoms, m_d_peer_af_array.d_f_applied_x,
        m_d_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_z,
        (const int **)m_d_peer_af_array.d_atomFixed,
        this->m_d_forced_atom_copy_list, this->m_forced_atom_copy_list.size(),
        this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
  }
  // ...
}
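
// allocatePeerArrays(): for multi-GPU runs, allocates one pointer slot per
// device in pinned host memory (m_h_peer_*) and in device memory (m_d_peer_*)
// for every SOA array the server needs to reach on peer GPUs: positions,
// velocities, masses, charges, transforms, the total-force components, and the
// applied-force arrays. updateAtomMaps() fills the host tables and
// copyPeerArraysToDevice() mirrors them to the device.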
void CudaGlobalMasterServer::allocatePeerArrays() {
  // ...
  DebugM(3, "CudaGlobalMasterServer::allocatePeerArrays");
  // ...
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_x), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_y), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_z), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_x), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_y), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_z), m_num_devices);
  allocate_host<float *>(&(m_h_peer_atom_data.d_mass), m_num_devices);
  allocate_host<float *>(&(m_h_peer_atom_data.d_charge), m_num_devices);
  allocate_host<char3 *>(&(m_h_peer_atom_data.d_transform), m_num_devices);

  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_z), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_z), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_z), m_num_devices);
  allocate_host<int *>(&(m_h_peer_tf_array.d_atomFixed), m_num_devices);

  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_z), m_num_devices);
  allocate_host<int *>(&(m_h_peer_af_array.d_atomFixed), m_num_devices);
  // ...
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_x), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_y), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_z), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_x), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_y), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_z), m_num_devices);
  allocate_device<float *>(&(m_d_peer_atom_data.d_mass), m_num_devices);
  allocate_device<float *>(&(m_d_peer_atom_data.d_charge), m_num_devices);
  allocate_device<char3 *>(&(m_d_peer_atom_data.d_transform), m_num_devices);

  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_z), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_z), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_z), m_num_devices);
  allocate_device<int *>(&(m_d_peer_tf_array.d_atomFixed), m_num_devices);

  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_z), m_num_devices);
  allocate_device<int *>(&(m_d_peer_af_array.d_atomFixed), m_num_devices);
  // ...
}
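
// copyPeerArraysToDevice(): mirrors every host-side peer pointer table filled
// in updateAtomMaps() to its device-side counterpart with asynchronous HtoD
// copies on the CudaGlobalMaster stream (one pointer per device per table).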
void CudaGlobalMasterServer::copyPeerArraysToDevice() {
  // ...
  DebugM(3, "CudaGlobalMasterServer::copyPeerArraysToDevice");
  // ...
  copy_HtoD(m_h_peer_atom_data.d_pos_x, m_d_peer_atom_data.d_pos_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_pos_y, m_d_peer_atom_data.d_pos_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_pos_z, m_d_peer_atom_data.d_pos_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_x, m_d_peer_atom_data.d_vel_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_y, m_d_peer_atom_data.d_vel_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_z, m_d_peer_atom_data.d_vel_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_mass, m_d_peer_atom_data.d_mass,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_charge, m_d_peer_atom_data.d_charge,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_transform, m_d_peer_atom_data.d_transform,
            m_num_devices, m_stream);

  copy_HtoD(m_h_peer_tf_array.d_f_normal_x, m_d_peer_tf_array.d_f_normal_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_normal_y, m_d_peer_tf_array.d_f_normal_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_normal_z, m_d_peer_tf_array.d_f_normal_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_x,
            m_d_peer_tf_array.d_f_saved_nbond_x, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_y,
            m_d_peer_tf_array.d_f_saved_nbond_y, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_z,
            m_d_peer_tf_array.d_f_saved_nbond_z, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_x,
            m_d_peer_tf_array.d_f_saved_slow_x, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_y,
            m_d_peer_tf_array.d_f_saved_slow_y, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_z,
            m_d_peer_tf_array.d_f_saved_slow_z, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_atomFixed, m_d_peer_tf_array.d_atomFixed,
            m_num_devices, m_stream);

  copy_HtoD(m_h_peer_af_array.d_f_applied_x, m_d_peer_af_array.d_f_applied_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_f_applied_z, m_d_peer_af_array.d_f_applied_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_atomFixed, m_d_peer_af_array.d_atomFixed,
            m_num_devices, m_stream);
  // ...
}
void CudaGlobalMasterServer::finishReductions() {
  // ...
  DebugM(1, "Calling finishReductions at step " + std::to_string(m_step));
  // ...
  auto *currentReduction = getCurrentReduction();
  currentReduction->submit();
  // ...
}

#else  // fallback stub; the surrounding lines are largely elided in this listing

CudaGlobalMasterServer::CudaGlobalMasterServer(  // signature reconstructed from the visible fragments
    int deviceID, int printProfilingFreq /* ... */) {
  NAMD_die(  // call inferred from the visible message argument
      "CudaGlobalMasterServer requires NAMD to be built with CUDA support.\n");
}
// ...

#endif // (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)