#define MIN_DEBUG_LEVEL 3

std::string ptr_to_str(const void *ptr) {
  std::ostringstream oss;
  // ...
}

#if defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)

using CopyListTupleT = CudaGlobalMasterServer::CopyListTuple;
using ClientBufferT = CudaGlobalMasterServer::ClientBuffer;
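// Helper used below: look up a global atom ID in one device's AtomMap list and
// translate the resulting (patch ID, in-patch index) pair into an index into
// that device's SOA arrays (the local patch bufferOffset plus the in-patch
// index). The returned helper struct records whether the atom was found and,
// if so, its SOA index.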
// ...
    const std::vector<AtomMap *> &atomMapsList,
    const std::vector<CudaLocalRecord> &localRecords,
    const int *h_globalToLocalID) {
  // ...
  for (int i = 0; i < atomMapsList.size(); ++i) {
    const LocalID lid = atomMapsList[i]->localID(globalID);
    // ...
    const int soaPid = h_globalToLocalID[lid.pid];
    // ...
    result.soaIndex = localRecords[soaPid].bufferOffset + lid.index;
    // ...
    DebugM(1, "Atom " + std::to_string(globalID) + " found in local patch " +
                  std::to_string(lid.pid) + " with local index " +
                  std::to_string(lid.index) + ", SOA patch ID " +
                  std::to_string(soaPid) + ", SOA global index " +
                  std::to_string(result.soaIndex) + "\n");
    // ...
  }
  if (result.found == false) {
    DebugM(3, "Atom " + std::to_string(globalID) + " not found.\n");
  }
  // ...
}
template <typename F1, typename F2>
void buildCopyList(  // name as used at the call sites below
    F1 getAtomID, F2 getPosOrForceBuffer,
    std::vector<std::shared_ptr<CudaGlobalMasterClient>> &clients,
    const std::vector<std::vector<AtomMap *>> &atomMapsLists,
    const std::vector<std::vector<CudaLocalRecord>> &localRecords,
    const std::vector<int *> h_globalToLocalID,
    const std::vector<int> &sourceDevicesList,
    const std::unordered_map<int, int> &deviceToIndex,
    ClientBufferT *&d_clientBuffers, std::vector<CopyListTupleT> &hostCopyList,
    CopyListTupleT *&d_copyList, cudaStream_t stream,
    bool *checkUniqueList = nullptr) {
  // ...
  std::vector<ClientBufferT> clientBuffers;
  hostCopyList.clear();
  const size_t numClients = clients.size();
  const int numDevices = atomMapsLists.size();
  for (size_t i = 0; i < numClients; ++i) {
    const auto &client = clients[i];
    // ...
    clientBuffers.push_back(ClientBufferT{
        ((*client).*getPosOrForceBuffer)(), client->getMasses(),
        client->getCharges(), client->getTransforms(), client->getVelocities(),
        ((*client).*getAtomID)().size()});
    const auto &requested_atoms = ((*client).*getAtomID)();
    // ...
    for (size_t j = 0; j < requested_atoms.size(); ++j) {
      // ...
      const AtomID gid = requested_atoms[j];
      // ...
      for (int k = 0; k < numDevices; ++k) {
        result = queryLocalID(gid, atomMapsLists[k], localRecords[k],
                              h_globalToLocalID[k]);
        // ...
        hostCopyList.push_back(CopyListTupleT{
            deviceToIndex.at(sourceDevicesList[k]), result.soaIndex, i, j});
        // ...
      }
      // ...
      const std::string error =
          "Cannot find the local ID in SOA arrays of atom " +
          std::to_string(gid) + " requested by client[" + std::to_string(i) +
          "] (" + client->name() + ")\n";
      // ...
    }
  }
  // ...
  std::sort(hostCopyList.begin(), hostCopyList.end(),
            [](const CopyListTupleT &a, const CopyListTupleT &b) {
              return a.m_soa_index < b.m_soa_index;
            });
  // ...
  if (checkUniqueList) {
    // ...
    auto tmp_list = hostCopyList;
    // ...
    auto last = std::unique(tmp_list.begin(), tmp_list.end(),
                            [](const CopyListTupleT &a, const CopyListTupleT &b) {
                              return a.m_soa_index == b.m_soa_index;
                            });
    // ...
    if (last == tmp_list.end()) {
      *checkUniqueList = true;
    } else {
      *checkUniqueList = false;
    }
  }
  // ...
  size_t copySize = sizeof(CopyListTupleT) * hostCopyList.size();
  // ...
  DebugM(3, "Will copy " + std::to_string(hostCopyList.size()) + " items.\n");
  // ...
  if (d_copyList != nullptr) {
    // ...
  }
  cudaCheck(cudaMalloc(&d_copyList, copySize));
  cudaCheck(cudaMemcpyAsync(d_copyList, hostCopyList.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  copySize = sizeof(ClientBufferT) * clientBuffers.size();
  if (d_clientBuffers != nullptr) {
    // ...
  }
  cudaCheck(cudaMalloc(&d_clientBuffers, copySize));
  cudaCheck(cudaMemcpyAsync(d_clientBuffers, clientBuffers.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  // ...
}
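// copyLatticeToClient: pack the current lattice vectors (a, b, c and,
// presumably, the origin) into a small host array and asynchronously copy it
// into the device buffer returned by client->getLattice(); if the client does
// not provide a buffer, an error is reported instead.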
void copyLatticeToClient(const Lattice *lat, int deviceID,
                         std::shared_ptr<CudaGlobalMasterClient> client,
                         cudaStream_t stream) {
  const std::vector<double> h_lattice{
      lat->a().x, lat->a().y,
      lat->a().z, lat->b().x,
      lat->b().y, lat->b().z,
      lat->c().x, lat->c().y,
      // ...
  };
  // ...
  const size_t copySize = sizeof(double) * h_lattice.size();
  // ...
  double *d_lattice = client->getLattice();
  // ...
  cudaCheck(cudaMemcpyAsync(d_lattice, h_lattice.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  // ...
  const std::string error =
      "Failed to copy lattice to client " + client->name() +
      " (lattice requested but the client provides a "
      "nullptr to the device buffer)";
  // ...
}
void debugCopyList(const std::string &name,
                   const std::vector<CopyListTupleT> &L) {
  std::cout << "CudaGlobalMasterServer: the copylist is " << name << ", with "
            << L.size() << " items.\n";
  for (size_t i = 0; i < L.size(); ++i) {
    // ...
        "i = %lu, deviceIndex = %d, soaIndex = %d, clientIndex = %lu, "
        "clientArrayIndex = %lu\n",
        i, L[i].m_src_dev_index, L[i].m_soa_index, L[i].m_client_index,
        L[i].m_client_atom_pos);
  }
  // ...
}
void debugClientBuffer(const std::string &name, ClientBufferT *B,
                       size_t numClients) {
  std::vector<ClientBufferT> hB(numClients);
  cudaPointerAttributes attributes;
  cudaCheck(cudaPointerGetAttributes(&attributes, B));
  std::cout << "CudaGlobalMasterServer: the clientBuffer is " << name
            << ", with " << numClients << " items.\n";
  std::cout << "deviceBuffer pointer = " << static_cast<void *>(B) << '\n';
  std::cout << "deviceBuffer attributes:\n";
  std::cout << "memory type: " << attributes.type << std::endl;
  cudaCheck(cudaMemcpy(hB.data(), B, sizeof(ClientBufferT) * numClients,
                       cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < numClients; ++i) {
    // ...
        "i = %lu, d_data = %p, d_mass = %p, d_charge = %p, size = %lu\n", i,
        hB[i].d_data, hB[i].d_mass, hB[i].d_charge, hB[i].sz);
  }
  std::cout << std::endl;
}
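// Constructor: record the device ID and the number of devices reported by
// deviceCUDA, size the per-device bookkeeping vectors, map device IDs to
// indices, allocate the peer-pointer arrays when more than one GPU is used,
// and start with all device-side copy lists and client-buffer tables
// unallocated.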
CudaGlobalMasterServer::CudaGlobalMasterServer(int deviceID,
                                               int printProfilingFreq)
    : m_device_id(deviceID), m_step(0),
      m_num_devices(deviceCUDA->getNumDevice()), m_clients_changed(0),
      m_atom_maps_changed(0), m_print_profiling_freq(printProfilingFreq),
      m_t_build_copy_lists(0), m_t_copy_atoms(0),
      m_t_copy_total_forces(0), m_t_add_global_forces(0), m_t_calc(0),
      // ...
      {
  iout << iINFO << "CudaGlobalMasterServer: initialized on PE " << CkMyPe()
       << " and GPU device " << m_device_id << "\n"
  // ...
#ifdef NAMD_NVTX_ENABLED
  nvtxNameCuStreamA(m_stream, "CudaGlobalMaster stream");
#endif
  // ...
  m_atom_map_lists.resize(m_num_devices);
  m_src_devs.resize(m_num_devices);
  m_local_records.resize(m_num_devices);
  m_global_to_local_id.resize(m_num_devices);
  // ...
  for (int i = 0; i < m_num_devices; ++i) {
    const int currentDeviceID = allDevices[i];
    m_device_id_to_index[currentDeviceID] = i;
    m_src_devs[i] = currentDeviceID;
  }
  // ...
  if (m_num_devices > 1) {
    allocatePeerArrays();
  }
  // ...
  m_d_atom_pos_copy_list = nullptr;
  m_d_atom_total_force_copy_list = nullptr;
  m_d_forced_atom_copy_list = nullptr;
  // ...
  m_d_atom_pos_client_buffers = nullptr;
  m_atom_total_force_client_buffers = nullptr;
  m_d_forced_atom_client_buffers = nullptr;
  m_unique_forced_atoms = false;
  // ...
}
CudaGlobalMasterServer::~CudaGlobalMasterServer() {
  // ...
  if (m_d_atom_pos_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_atom_pos_copy_list));
    m_d_atom_pos_copy_list = nullptr;
  }
  if (m_d_atom_pos_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_d_atom_pos_client_buffers));
    m_d_atom_pos_client_buffers = nullptr;
  }
  if (m_d_atom_total_force_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_atom_total_force_copy_list));
    m_d_atom_total_force_copy_list = nullptr;
  }
  if (m_atom_total_force_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_atom_total_force_client_buffers));
    m_atom_total_force_client_buffers = nullptr;
  }
  if (m_d_forced_atom_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_forced_atom_copy_list));
    m_d_forced_atom_copy_list = nullptr;
  }
  if (m_d_forced_atom_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_d_forced_atom_client_buffers));
    m_d_forced_atom_client_buffers = nullptr;
  }
  iout << iINFO << "CudaGlobalMasterServer: destructed on PE " << CkMyPe()
  // ...
}
void CudaGlobalMasterServer::printProfiling() const {
  CkPrintf("====================================================\n");
  CkPrintf("========= CudaGlobalMasterServer Profiling =========\n");
  CkPrintf("========== (Time is displayed in seconds) ==========\n");
  CkPrintf("====================================================\n");
  CkPrintf(" Build copy lists: %.2f\n", m_t_build_copy_lists.count());
  CkPrintf(" Copy atoms: %.2f\n", m_t_copy_atoms.count());
  CkPrintf(" Copy total forces: %.2f\n", m_t_copy_total_forces.count());
  CkPrintf(" Add forces from clients: %.2f\n", m_t_add_global_forces.count());
  CkPrintf(" Clients' calculate(): %.2f\n", m_t_calc.count());
  CkPrintf(" Clients' finishReductions(): %.2f\n", m_t_reductions.count());
  CkPrintf("====================================================\n");
}
void CudaGlobalMasterServer::addClient(
    std::shared_ptr<CudaGlobalMasterClient> client) {
  auto it = std::find(m_clients.begin(), m_clients.end(), client);
  if (it == m_clients.end()) {
    iout << iINFO << "CudaGlobalMasterServer: adding client \""
         << client->name() << "\"\n"
    // ...
    m_clients.push_back(client);
    m_clients_changed = CudaGlobalMasterServer::numCopyLists;
  } else {
    const std::string error =
        "The client \"" + client->name() + "\" is being added twice.\n";
    // ...
  }
}
void CudaGlobalMasterServer::removeClient(
    std::shared_ptr<CudaGlobalMasterClient> client) {
  auto it = std::find(m_clients.begin(), m_clients.end(), client);
  if (it == m_clients.end()) {
    iout << iWARN << "CudaGlobalMasterServer: the client \"" << client->name()
         << "\" is not registered with CudaGlobalMasterServer\n"
    // ...
  }
  while (it != m_clients.end()) {
    iout << iINFO << "CudaGlobalMasterServer: removing client \""
         << client->name() << "\"\n"
    // ...
    m_clients_changed = CudaGlobalMasterServer::numCopyLists;
    it = std::find(m_clients.begin(), m_clients.end(), client);
  }
}
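// updateAtomMaps: gather, per device, the AtomMap of every PE together with
// the globalToLocalID table and local patch records of that device's
// SequencerCUDA, and (in multi-GPU runs) refresh the peer pointer tables to
// the SOA position/velocity/force arrays before mirroring them to the device.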
void CudaGlobalMasterServer::updateAtomMaps() {
  const int numPes = CkNumPes();
  // ...
  DebugM(3, "updateAtomMaps: number of PEs = " + std::to_string(numPes) + "\n");
  // ...
  for (int i = 0; i < numPes; ++i) {
    // ...
    const int j = m_device_id_to_index.at(peDevice);
    // ...
    DebugM(3, "updateAtomMaps: PE " + std::to_string(i) + " atomMap " +
                  ptr_to_str(amap) + " on device " + std::to_string(peDevice) +
                  // ...
    m_atom_map_lists[j].push_back(amap);
  }
  // ...
  const bool multi_gpu = m_num_devices > 1;
  // ...
  for (int i = 0; i < m_num_devices; ++i) {
    const int deviceID = m_src_devs[i];
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    m_global_to_local_id[i] = sequencer->globalToLocalID;
    m_local_records[i] = sequencer->patchData->devData[deviceID].h_localPatches;
    // ...
    m_h_peer_atom_data.d_pos_x[i] = sequencer->d_pos_x;
    m_h_peer_atom_data.d_pos_y[i] = sequencer->d_pos_y;
    m_h_peer_atom_data.d_pos_z[i] = sequencer->d_pos_z;
    m_h_peer_atom_data.d_vel_x[i] = sequencer->d_vel_x;
    m_h_peer_atom_data.d_vel_y[i] = sequencer->d_vel_y;
    m_h_peer_atom_data.d_vel_z[i] = sequencer->d_vel_z;
    m_h_peer_atom_data.d_mass[i] = sequencer->d_mass;
    m_h_peer_atom_data.d_charge[i] = sequencer->d_charge;
    m_h_peer_atom_data.d_transform[i] = sequencer->d_transform;
    // ...
    m_h_peer_tf_array.d_f_normal_x[i] = sequencer->d_f_normal_x;
    m_h_peer_tf_array.d_f_normal_y[i] = sequencer->d_f_normal_y;
    m_h_peer_tf_array.d_f_normal_z[i] = sequencer->d_f_normal_z;
    m_h_peer_tf_array.d_f_saved_nbond_x[i] = sequencer->d_f_saved_nbond_x;
    m_h_peer_tf_array.d_f_saved_nbond_y[i] = sequencer->d_f_saved_nbond_y;
    m_h_peer_tf_array.d_f_saved_nbond_z[i] = sequencer->d_f_saved_nbond_z;
    m_h_peer_tf_array.d_f_saved_slow_x[i] = sequencer->d_f_saved_slow_x;
    m_h_peer_tf_array.d_f_saved_slow_y[i] = sequencer->d_f_saved_slow_y;
    m_h_peer_tf_array.d_f_saved_slow_z[i] = sequencer->d_f_saved_slow_z;
    m_h_peer_tf_array.d_atomFixed[i] = sequencer->d_atomFixed;
    // ...
    m_h_peer_af_array.d_f_applied_x[i] = sequencer->d_f_global_x;
    m_h_peer_af_array.d_f_applied_y[i] = sequencer->d_f_global_y;
    m_h_peer_af_array.d_f_applied_z[i] = sequencer->d_f_global_z;
    m_h_peer_af_array.d_atomFixed[i] = sequencer->d_atomFixed;
    // ...
    DebugM(3, "updateAtomMaps: device " + std::to_string(deviceID) +
                  ", sequencer " + ptr_to_str(sequencer) + "\n");
  }
  // ...
  copyPeerArraysToDevice();
  // ...
  m_atom_maps_changed = CudaGlobalMasterServer::numCopyLists;
}
void CudaGlobalMasterServer::setStep(int64_t step) {
  // ...
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    (*it)->setStep(step);
  }
  if (m_print_profiling_freq > 0) {
    if (step % m_print_profiling_freq == 0) {
      // ...
    }
  }
}
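// communicateToClients: the server-to-clients half of a step. Rebuild the
// position and total-force copy lists when a client's atom requests or the
// atom maps have changed, push the lattice to clients that asked for it, copy
// the requested per-atom data (positions, masses, charges, transforms,
// velocities) and total forces into the client device buffers, and finally
// time the clients' per-step calculation, accumulating it in m_t_calc.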
void CudaGlobalMasterServer::communicateToClients(const Lattice *lat) {
  // ...
  bool b_buildAtomsPositionCopyList = false;
  bool b_buildAtomsTotalForcesCopyList = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    // ...
    if ((*it)->requestedAtomsChanged()) {
      b_buildAtomsPositionCopyList = true;
    }
    if ((*it)->requestedTotalForcesAtomsChanged()) {
      b_buildAtomsTotalForcesCopyList = true;
    }
    if ((*it)->requestUpdateLattice()) {
      copyLatticeToClient(lat, m_device_id, *it, m_stream);
    }
  }
  if (b_buildAtomsPositionCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildAtomsCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  if (b_buildAtomsTotalForcesCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildAtomsTotalForcesCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  // ...
  bool b_copyPositions = false;
  bool b_copyTotalForces = false;
  bool b_copyMasses = false;
  bool b_copyCharges = false;
  bool b_copyTransforms = false;
  bool b_copyVelocities = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    b_copyPositions |= (*it)->requestUpdateAtomPositions();
    b_copyTotalForces |= (*it)->requestUpdateAtomTotalForces();
    b_copyMasses |= (*it)->requestUpdateMasses();
    b_copyCharges |= (*it)->requestUpdateCharges();
    b_copyTransforms |= (*it)->requestUpdateTransforms();
    b_copyVelocities |= (*it)->requestUpdateVelocities();
  }
  // ...
  if (b_buildAtomsPositionCopyList || b_copyPositions || b_copyMasses ||
      b_copyCharges || b_copyTransforms || b_copyVelocities) {
    copyAtomsToClients(b_copyPositions, b_copyMasses, b_copyCharges,
                       b_copyTransforms, b_copyVelocities);
  }
  if (b_copyTotalForces || b_buildAtomsTotalForcesCopyList) {
    copyTotalForcesToClients();
  }
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    // ...
  }
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_calc += endTime - startTime;
}
bool CudaGlobalMasterServer::requestedTotalForces() const {
  bool result = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    result |= (!((*it)->getRequestedForcedAtoms().empty()) &&
               (*it)->requestUpdateAtomTotalForces());
  }
  return result;
}
void CudaGlobalMasterServer::buildAtomsCopyList() {
  // ...
  DebugM(3, "buildAtomsCopyList is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  buildCopyList(&CudaGlobalMasterClient::getRequestedAtoms,
                &CudaGlobalMasterClient::getPositions, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index, m_d_atom_pos_client_buffers,
                m_atom_pos_copy_list, m_d_atom_pos_copy_list, m_stream);
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_build_copy_lists += endTime - startTime;
}
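// copyAtomsToClients: copy the requested per-atom data into each client's
// device buffers through the position copy list. With a single device the SOA
// arrays of the local SequencerCUDA are read directly; with multiple devices
// the MGPU variant walks the peer-array pointer tables instead.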
void CudaGlobalMasterServer::copyAtomsToClients(bool copyPositions,
                                                bool copyMasses,
                                                bool copyCharges,
                                                bool copyTransforms,
                                                bool copyVelocities) {
  // ...
  DebugM(1, "copyAtomsToClients is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  if (m_num_devices == 1) {
    // ...
#ifdef MIN_DEBUG_LEVEL
#if MIN_DEBUG_LEVEL <= 1
    debugCopyList("CudaGlobalMasterServer::copyAtomsToClients",
                  m_atom_pos_copy_list);
#endif
#endif
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyAtomsToClientsCUDA(
        copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
        sequencer->d_pos_x, sequencer->d_pos_y, sequencer->d_pos_z,
        sequencer->d_vel_x, sequencer->d_vel_y, sequencer->d_vel_z,
        sequencer->d_transform, sequencer->d_mass, sequencer->d_charge,
        sequencer->myLattice, this->m_d_atom_pos_copy_list,
        this->m_atom_pos_copy_list.size(), this->m_d_atom_pos_client_buffers,
        m_clients.size(), m_stream);
  } else {
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyAtomsToClientsCUDAMGPU(
        copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
        (const double **)m_h_peer_atom_data.d_pos_x,
        (const double **)m_h_peer_atom_data.d_pos_y,
        (const double **)m_h_peer_atom_data.d_pos_z,
        (const double **)m_h_peer_atom_data.d_vel_x,
        (const double **)m_h_peer_atom_data.d_vel_y,
        (const double **)m_h_peer_atom_data.d_vel_z,
        (const char3 **)m_h_peer_atom_data.d_transform,
        (const float **)m_h_peer_atom_data.d_mass,
        (const float **)m_h_peer_atom_data.d_charge, sequencer->myLattice,
        this->m_d_atom_pos_copy_list, this->m_atom_pos_copy_list.size(),
        this->m_d_atom_pos_client_buffers, m_clients.size(), m_stream);
  }
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_copy_atoms += endTime - startTime;
}
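// copyTotalForcesToClients: the same pattern for the total-force request; the
// normal, saved nonbonded and saved slow force components are gathered
// through the total-force copy list into each client's buffers, together with
// simParams->fixedAtomsOn and the fixed-atom flags.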
void CudaGlobalMasterServer::copyTotalForcesToClients() {
  // ...
  DebugM(1, "copyTotalForcesToClients is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  if (m_num_devices == 1) {
    // ...
#ifdef MIN_DEBUG_LEVEL
#if MIN_DEBUG_LEVEL <= 1
    debugCopyList("CudaGlobalMasterServer::copyTotalForcesToClients",
                  m_atom_total_force_copy_list);
#endif
#endif
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyTotalForcesToClientsCUDA(
        simParams->fixedAtomsOn, sequencer->d_f_normal_x,
        sequencer->d_f_normal_y, sequencer->d_f_normal_z,
        sequencer->d_f_saved_nbond_x, sequencer->d_f_saved_nbond_y,
        sequencer->d_f_saved_nbond_z, sequencer->d_f_saved_slow_x,
        sequencer->d_f_saved_slow_y, sequencer->d_f_saved_slow_z,
        sequencer->d_atomFixed, this->m_d_atom_total_force_copy_list,
        this->m_atom_total_force_copy_list.size(),
        this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
  } else {
    copyTotalForcesToClientsCUDAMGPU(
        // ...
        (const double **)m_d_peer_tf_array.d_f_normal_x,
        (const double **)m_d_peer_tf_array.d_f_normal_y,
        (const double **)m_d_peer_tf_array.d_f_normal_z,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_x,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_y,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_z,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_x,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_y,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_z,
        (const int **)m_d_peer_tf_array.d_atomFixed,
        this->m_d_atom_total_force_copy_list,
        this->m_atom_total_force_copy_list.size(),
        this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
  }
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_copy_total_forces += endTime - startTime;
}
void CudaGlobalMasterServer::buildAtomsTotalForcesCopyList() {
  // ...
  DebugM(3, "buildAtomsTotalForcesCopyList is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  buildCopyList(&CudaGlobalMasterClient::getRequestedTotalForcesAtoms,
                &CudaGlobalMasterClient::getTotalForces, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index,
                m_atom_total_force_client_buffers, m_atom_total_force_copy_list,
                m_d_atom_total_force_copy_list, m_stream);
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_build_copy_lists += endTime - startTime;
}
void CudaGlobalMasterServer::buildForcedAtomsCopyList() {
  // ...
  DebugM(3, "buildForcedAtomsCopyList is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  buildCopyList(&CudaGlobalMasterClient::getRequestedForcedAtoms,
                &CudaGlobalMasterClient::getAppliedForces, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index,
                m_d_forced_atom_client_buffers, m_forced_atom_copy_list,
                m_d_forced_atom_copy_list, m_stream, &m_unique_forced_atoms);
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_build_copy_lists += endTime - startTime;
}
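// communicateToMD: the clients-to-server half of a step. Rebuild the
// forced-atom copy list when requests or atom maps have changed, then, if any
// client wants to apply forces this step, add the client forces to the
// simulation's global force arrays and account the time in
// m_t_add_global_forces.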
void CudaGlobalMasterServer::communicateToMD() {
  // ...
  DebugM(1, "Calling communicateToMD at step " + std::to_string(m_step));
  // ...
  bool b_buildForcedAtomsCopyList = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    if ((*it)->requestedForcedAtomsChanged()) {
      b_buildForcedAtomsCopyList = true;
    }
  }
  if (b_buildForcedAtomsCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildForcedAtomsCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  // ...
  bool b_copyForcedAtoms = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    if ((*it)->requestUpdateForcedAtoms()) {
      b_copyForcedAtoms = true;
    }
  }
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  if (b_copyForcedAtoms) {
    addGlobalForces();
  }
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_add_global_forces += endTime - startTime;
}
bool CudaGlobalMasterServer::willAddGlobalForces() const {
  bool result = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    result |= (!((*it)->getRequestedForcedAtoms().empty()) &&
               (*it)->requestUpdateForcedAtoms());
  }
  return result;
}
void CudaGlobalMasterServer::addGlobalForces() {
  // ...
  DebugM(1, "Calling addGlobalForces at step " + std::to_string(m_step));
  // ...
  if (m_num_devices == 1) {
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    addGlobalForcesFromClients(
        simParams->fixedAtomsOn, m_unique_forced_atoms, sequencer->d_f_global_x,
        sequencer->d_f_global_y, sequencer->d_f_global_z,
        sequencer->d_atomFixed, this->m_d_forced_atom_copy_list,
        this->m_forced_atom_copy_list.size(),
        this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
  } else {
    addGlobalForcesFromClientsMGPU(
        simParams->fixedAtomsOn, m_unique_forced_atoms,
        m_d_peer_af_array.d_f_applied_x,
        m_d_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_z,
        (const int **)m_d_peer_af_array.d_atomFixed,
        this->m_d_forced_atom_copy_list, this->m_forced_atom_copy_list.size(),
        this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
  }
  // ...
  cudaCheck(cudaStreamSynchronize(m_stream));
}
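// allocatePeerArrays / copyPeerArraysToDevice: in multi-GPU runs the server
// keeps, for every device, host-side tables of the peer SequencerCUDA array
// pointers (positions, velocities, masses, charges, transforms, total-force
// components and applied-force buffers). The tables are filled in
// updateAtomMaps() and mirrored to the device so the MGPU copy kernels can
// dereference the per-device pointers.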
void CudaGlobalMasterServer::allocatePeerArrays() {
  // ...
  DebugM(3, "CudaGlobalMasterServer::allocatePeerArrays");
  // ...
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_x), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_y), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_z), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_x), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_y), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_z), m_num_devices);
  allocate_host<float *>(&(m_h_peer_atom_data.d_mass), m_num_devices);
  allocate_host<float *>(&(m_h_peer_atom_data.d_charge), m_num_devices);
  allocate_host<char3 *>(&(m_h_peer_atom_data.d_transform), m_num_devices);

  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_z), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_z), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_z), m_num_devices);
  allocate_host<int *>(&(m_h_peer_tf_array.d_atomFixed), m_num_devices);

  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_z), m_num_devices);
  allocate_host<int *>(&(m_h_peer_af_array.d_atomFixed), m_num_devices);
  // ...
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_x), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_y), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_z), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_x), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_y), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_z), m_num_devices);
  allocate_device<float *>(&(m_d_peer_atom_data.d_mass), m_num_devices);
  allocate_device<float *>(&(m_d_peer_atom_data.d_charge), m_num_devices);
  allocate_device<char3 *>(&(m_d_peer_atom_data.d_transform), m_num_devices);

  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_z), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_z), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_z), m_num_devices);
  allocate_device<int *>(&(m_d_peer_tf_array.d_atomFixed), m_num_devices);

  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_z), m_num_devices);
  allocate_device<int *>(&(m_d_peer_af_array.d_atomFixed), m_num_devices);
}
void CudaGlobalMasterServer::copyPeerArraysToDevice() {
  // ...
  DebugM(3, "CudaGlobalMasterServer::copyPeerArraysToDevice");
  // ...
  copy_HtoD(m_h_peer_atom_data.d_pos_x, m_d_peer_atom_data.d_pos_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_pos_y, m_d_peer_atom_data.d_pos_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_pos_z, m_d_peer_atom_data.d_pos_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_x, m_d_peer_atom_data.d_vel_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_y, m_d_peer_atom_data.d_vel_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_z, m_d_peer_atom_data.d_vel_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_mass, m_d_peer_atom_data.d_mass,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_charge, m_d_peer_atom_data.d_charge,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_transform, m_d_peer_atom_data.d_transform,
            m_num_devices, m_stream);

  copy_HtoD(m_h_peer_tf_array.d_f_normal_x, m_d_peer_tf_array.d_f_normal_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_normal_y, m_d_peer_tf_array.d_f_normal_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_normal_z, m_d_peer_tf_array.d_f_normal_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_x,
            m_d_peer_tf_array.d_f_saved_nbond_x, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_y,
            m_d_peer_tf_array.d_f_saved_nbond_y, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_z,
            m_d_peer_tf_array.d_f_saved_nbond_z, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_x,
            m_d_peer_tf_array.d_f_saved_slow_x, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_y,
            m_d_peer_tf_array.d_f_saved_slow_y, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_z,
            m_d_peer_tf_array.d_f_saved_slow_z, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_atomFixed, m_d_peer_tf_array.d_atomFixed,
            m_num_devices, m_stream);

  copy_HtoD(m_h_peer_af_array.d_f_applied_x, m_d_peer_af_array.d_f_applied_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_f_applied_z, m_d_peer_af_array.d_f_applied_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_atomFixed, m_d_peer_af_array.d_atomFixed,
            m_num_devices, m_stream);
}
#ifdef NODEGROUP_FORCE_REGISTER
void CudaGlobalMasterServer::finishReductions(bool doEnergy, bool doVirial,
                                              // ...
  DebugM(1, "Calling finishReductions at step " + std::to_string(m_step));
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    (*it)->finishReductions(doEnergy, doVirial, reduction);
  }
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_reductions += endTime - startTime;
  // ...
}
#endif // NODEGROUP_FORCE_REGISTER

#else

// Stub used when CUDA / NODEGROUP_FORCE_REGISTER support is not compiled in.
CudaGlobalMasterServer::CudaGlobalMasterServer(
    int deviceID, int printProfilingFreq
    // ...
  NAMD_die(
      "CudaGlobalMasterServer requires NAMD to be built with CUDA support.\n");
// ...

#endif // defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)