#define MIN_DEBUG_LEVEL 3

std::string ptr_to_str(const void *ptr) {
  std::ostringstream oss;
  // ...
}

#if defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)

using CopyListTupleT = CudaGlobalMasterServer::CopyListTuple;
using ClientBufferT = CudaGlobalMasterServer::ClientBuffer;
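// Helper used below: look up a global atom ID in one device's AtomMap list and
// translate the resulting (patch ID, in-patch index) pair into an index into
// that device's SOA arrays (the local patch bufferOffset plus the in-patch
// index). The returned helper struct records whether the atom was found and,
// if so, its SOA index.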
// ...
    const std::vector<AtomMap *> &atomMapsList,
    const std::vector<CudaLocalRecord> &localRecords,
    const int *h_globalToLocalID) {
  // ...
  for (int i = 0; i < atomMapsList.size(); ++i) {
    const LocalID lid = atomMapsList[i]->localID(globalID);
    // ...
    const int soaPid = h_globalToLocalID[lid.pid];
    // ...
    result.soaIndex = localRecords[soaPid].bufferOffset + lid.index;
    // ...
    DebugM(1, "Atom " + std::to_string(globalID) + " found in local patch " +
                  std::to_string(lid.pid) + " with local index " +
                  std::to_string(lid.index) + ", SOA patch ID " +
                  std::to_string(soaPid) + ", SOA global index " +
                  std::to_string(result.soaIndex) + "\n");
    // ...
  }
  if (result.found == false) {
    DebugM(3, "Atom " + std::to_string(globalID) + " not found.\n");
  }
  // ...
}
template <typename F1, typename F2>
void buildCopyList(  // name as used at the call sites below
    F1 getAtomID, F2 getPosOrForceBuffer,
    std::vector<std::shared_ptr<CudaGlobalMasterClient>> &clients,
    const std::vector<std::vector<AtomMap *>> &atomMapsLists,
    const std::vector<std::vector<CudaLocalRecord>> &localRecords,
    const std::vector<int *> h_globalToLocalID,
    const std::vector<int> &sourceDevicesList,
    const std::unordered_map<int, int> &deviceToIndex,
    ClientBufferT *&d_clientBuffers, std::vector<CopyListTupleT> &hostCopyList,
    CopyListTupleT *&d_copyList, cudaStream_t stream,
    bool *checkUniqueList = nullptr) {
  // ...
  std::vector<ClientBufferT> clientBuffers;
  hostCopyList.clear();
  const size_t numClients = clients.size();
  const int numDevices = atomMapsLists.size();
  for (size_t i = 0; i < numClients; ++i) {
    const auto &client = clients[i];
    // ...
    clientBuffers.push_back(ClientBufferT{
        ((*client).*getPosOrForceBuffer)(), client->getMasses(),
        client->getCharges(), client->getTransforms(), client->getVelocities(),
        ((*client).*getAtomID)().size()});
    const auto &requested_atoms = ((*client).*getAtomID)();
    // ...
    for (size_t j = 0; j < requested_atoms.size(); ++j) {
      // ...
      const AtomID gid = requested_atoms[j];
      // ...
      for (int k = 0; k < numDevices; ++k) {
        result = queryLocalID(gid, atomMapsLists[k], localRecords[k],
                              h_globalToLocalID[k]);
        // ...
        hostCopyList.push_back(CopyListTupleT{
            deviceToIndex.at(sourceDevicesList[k]), result.soaIndex, i, j});
        // ...
      }
      // ...
      const std::string error =
          "Cannot find the local ID in SOA arrays of atom " +
          std::to_string(gid) + " requested by client[" + std::to_string(i) +
          "] (" + client->name() + ")\n";
      // ...
    }
  }
  // ...
  std::sort(hostCopyList.begin(), hostCopyList.end(),
            [](const CopyListTupleT &a, const CopyListTupleT &b) {
              return a.m_soa_index < b.m_soa_index;
            });
  // ...
  if (checkUniqueList) {
    // ...
    auto tmp_list = hostCopyList;
    // ...
    auto last = std::unique(tmp_list.begin(), tmp_list.end(),
                            [](const CopyListTupleT &a, const CopyListTupleT &b) {
                              return a.m_soa_index == b.m_soa_index;
                            });
    // ...
    if (last == tmp_list.end()) {
      *checkUniqueList = true;
    } else {
      *checkUniqueList = false;
    }
  }
  // ...
  size_t copySize = sizeof(CopyListTupleT) * hostCopyList.size();
  // ...
  DebugM(3, "Will copy " + std::to_string(hostCopyList.size()) + " items.\n");
  // ...
  if (d_copyList != nullptr) {
    // ...
  }
  cudaCheck(cudaMalloc(&d_copyList, copySize));
  cudaCheck(cudaMemcpyAsync(d_copyList, hostCopyList.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  copySize = sizeof(ClientBufferT) * clientBuffers.size();
  if (d_clientBuffers != nullptr) {
    // ...
  }
  cudaCheck(cudaMalloc(&d_clientBuffers, copySize));
  cudaCheck(cudaMemcpyAsync(d_clientBuffers, clientBuffers.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  // ...
}
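// copyLatticeToClient: pack the current lattice vectors (a, b, c and,
// presumably, the origin) into a small host array and asynchronously copy it
// into the device buffer returned by client->getLattice(); if the client does
// not provide a buffer, an error is reported instead.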
void copyLatticeToClient(const Lattice *lat, int deviceID,
                         std::shared_ptr<CudaGlobalMasterClient> client,
                         cudaStream_t stream) {
  const std::vector<double> h_lattice{
      lat->a().x, lat->a().y,
      lat->a().z, lat->b().x,
      lat->b().y, lat->b().z,
      lat->c().x, lat->c().y,
      // ...
  };
  // ...
  const size_t copySize = sizeof(double) * h_lattice.size();
  // ...
  double *d_lattice = client->getLattice();
  // ...
  cudaCheck(cudaMemcpyAsync(d_lattice, h_lattice.data(), copySize,
                            cudaMemcpyHostToDevice, stream));
  // ...
  const std::string error =
      "Failed to copy lattice to client " + client->name() +
      " (lattice requested but the client provides a "
      "nullptr to the device buffer)";
  // ...
}
void debugCopyList(const std::string &name,
                   const std::vector<CopyListTupleT> &L) {
  std::cout << "CudaGlobalMasterServer: the copylist is " << name << ", with "
            << L.size() << " items.\n";
  for (size_t i = 0; i < L.size(); ++i) {
    // ...
        "i = %lu, deviceIndex = %d, soaIndex = %d, clientIndex = %lu, "
        "clientArrayIndex = %lu\n",
        i, L[i].m_src_dev_index, L[i].m_soa_index, L[i].m_client_index,
        L[i].m_client_atom_pos);
  }
  // ...
}
void debugClientBuffer(const std::string &name, ClientBufferT *B,
                       size_t numClients) {
  std::vector<ClientBufferT> hB(numClients);
  cudaPointerAttributes attributes;
  cudaCheck(cudaPointerGetAttributes(&attributes, B));
  std::cout << "CudaGlobalMasterServer: the clientBuffer is " << name
            << ", with " << numClients << " items.\n";
  std::cout << "deviceBuffer pointer = " << static_cast<void *>(B) << '\n';
  std::cout << "deviceBuffer attributes:\n";
  std::cout << "memory type: " << attributes.type << std::endl;
  cudaCheck(cudaMemcpy(hB.data(), B, sizeof(ClientBufferT) * numClients,
                       cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < numClients; ++i) {
    // ...
        "i = %lu, d_data = %p, d_mass = %p, d_charge = %p, size = %lu\n", i,
        hB[i].d_data, hB[i].d_mass, hB[i].d_charge, hB[i].sz);
  }
  std::cout << std::endl;
}
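// Constructor: record the device ID and the number of devices reported by
// deviceCUDA, size the per-device bookkeeping vectors, map device IDs to
// indices, allocate the peer-pointer arrays when more than one GPU is used,
// and start with all device-side copy lists and client-buffer tables
// unallocated.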
CudaGlobalMasterServer::CudaGlobalMasterServer(int deviceID,
                                               int printProfilingFreq)
    : m_device_id(deviceID), m_step(0),
      m_num_devices(deviceCUDA->getNumDevice()), m_clients_changed(0),
      m_atom_maps_changed(0), m_print_profiling_freq(printProfilingFreq),
      m_t_build_copy_lists(0), m_t_copy_atoms(0),
      m_t_copy_total_forces(0), m_t_add_global_forces(0), m_t_calc(0),
      // ...
      {
  iout << iINFO << "CudaGlobalMasterServer: initialized on PE " << CkMyPe()
       << " and GPU device " << m_device_id << "\n"
  // ...
#ifdef NAMD_NVTX_ENABLED
  nvtxNameCuStreamA(m_stream, "CudaGlobalMaster stream");
#endif
  // ...
  m_atom_map_lists.resize(m_num_devices);
  m_src_devs.resize(m_num_devices);
  m_local_records.resize(m_num_devices);
  m_global_to_local_id.resize(m_num_devices);
  // ...
  for (int i = 0; i < m_num_devices; ++i) {
    const int currentDeviceID = allDevices[i];
    m_device_id_to_index[currentDeviceID] = i;
    m_src_devs[i] = currentDeviceID;
  }
  // ...
  if (m_num_devices > 1) {
    allocatePeerArrays();
  }
  // ...
  m_d_atom_pos_copy_list = nullptr;
  m_d_atom_total_force_copy_list = nullptr;
  m_d_forced_atom_copy_list = nullptr;
  // ...
  m_d_atom_pos_client_buffers = nullptr;
  m_atom_total_force_client_buffers = nullptr;
  m_d_forced_atom_client_buffers = nullptr;
  m_unique_forced_atoms = false;
  // ...
}
CudaGlobalMasterServer::~CudaGlobalMasterServer() {
  // ...
  if (m_d_atom_pos_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_atom_pos_copy_list));
    m_d_atom_pos_copy_list = nullptr;
  }
  if (m_d_atom_pos_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_d_atom_pos_client_buffers));
    m_d_atom_pos_client_buffers = nullptr;
  }
  if (m_d_atom_total_force_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_atom_total_force_copy_list));
    m_d_atom_total_force_copy_list = nullptr;
  }
  if (m_atom_total_force_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_atom_total_force_client_buffers));
    m_atom_total_force_client_buffers = nullptr;
  }
  if (m_d_forced_atom_copy_list != nullptr) {
    cudaCheck(cudaFree(m_d_forced_atom_copy_list));
    m_d_forced_atom_copy_list = nullptr;
  }
  if (m_d_forced_atom_client_buffers != nullptr) {
    cudaCheck(cudaFree(m_d_forced_atom_client_buffers));
    m_d_forced_atom_client_buffers = nullptr;
  }
  iout << iINFO << "CudaGlobalMasterServer: destructed on PE " << CkMyPe()
  // ...
}
void CudaGlobalMasterServer::printProfiling() const {
  CkPrintf("====================================================\n");
  CkPrintf("========= CudaGlobalMasterServer Profiling =========\n");
  CkPrintf("========== (Time is displayed in seconds) ==========\n");
  CkPrintf("====================================================\n");
  CkPrintf(" Build copy lists: %.2f\n", m_t_build_copy_lists.count());
  CkPrintf(" Copy atoms: %.2f\n", m_t_copy_atoms.count());
  CkPrintf(" Copy total forces: %.2f\n", m_t_copy_total_forces.count());
  CkPrintf(" Add forces from clients: %.2f\n", m_t_add_global_forces.count());
  CkPrintf(" Clients' calculate(): %.2f\n", m_t_calc.count());
  CkPrintf(" Clients' finishReductions(): %.2f\n", m_t_reductions.count());
  CkPrintf("====================================================\n");
}
void CudaGlobalMasterServer::addClient(
    std::shared_ptr<CudaGlobalMasterClient> client) {
  auto it = std::find(m_clients.begin(), m_clients.end(), client);
  if (it == m_clients.end()) {
    iout << iINFO << "CudaGlobalMasterServer: adding client \""
         << client->name() << "\"\n"
    // ...
    m_clients.push_back(client);
    m_clients_changed = CudaGlobalMasterServer::numCopyLists;
  } else {
    const std::string error =
        "The client \"" + client->name() + "\" is being added twice.\n";
    // ...
  }
}
void CudaGlobalMasterServer::removeClient(
    std::shared_ptr<CudaGlobalMasterClient> client) {
  auto it = std::find(m_clients.begin(), m_clients.end(), client);
  if (it == m_clients.end()) {
    iout << iWARN << "CudaGlobalMasterServer: the client \"" << client->name()
         << "\" is not registered with CudaGlobalMasterServer\n"
    // ...
  }
  while (it != m_clients.end()) {
    iout << iINFO << "CudaGlobalMasterServer: removing client \""
         << client->name() << "\"\n"
    // ...
    m_clients_changed = CudaGlobalMasterServer::numCopyLists;
    it = std::find(m_clients.begin(), m_clients.end(), client);
  }
}
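// updateAtomMaps: gather, per device, the AtomMap of every PE together with
// the globalToLocalID table and local patch records of that device's
// SequencerCUDA, and (in multi-GPU runs) refresh the peer pointer tables to
// the SOA position/velocity/force arrays before mirroring them to the device.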
void CudaGlobalMasterServer::updateAtomMaps() {
  const int numPes = CkNumPes();
  // ...
  DebugM(3, "updateAtomMaps: number of PEs = " + std::to_string(numPes) + "\n");
  // ...
  for (int i = 0; i < numPes; ++i) {
    // ...
    const int j = m_device_id_to_index.at(peDevice);
    // ...
    DebugM(3, "updateAtomMaps: PE " + std::to_string(i) + " atomMap " +
                  ptr_to_str(amap) + " on device " + std::to_string(peDevice) +
                  // ...
    m_atom_map_lists[j].push_back(amap);
  }
  // ...
  const bool multi_gpu = m_num_devices > 1;
  // ...
  for (int i = 0; i < m_num_devices; ++i) {
    const int deviceID = m_src_devs[i];
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    m_global_to_local_id[i] = sequencer->globalToLocalID;
    m_local_records[i] = sequencer->patchData->devData[deviceID].h_localPatches;
    // ...
    m_h_peer_atom_data.d_pos_x[i] = sequencer->d_pos_x;
    m_h_peer_atom_data.d_pos_y[i] = sequencer->d_pos_y;
    m_h_peer_atom_data.d_pos_z[i] = sequencer->d_pos_z;
    m_h_peer_atom_data.d_vel_x[i] = sequencer->d_vel_x;
    m_h_peer_atom_data.d_vel_y[i] = sequencer->d_vel_y;
    m_h_peer_atom_data.d_vel_z[i] = sequencer->d_vel_z;
    m_h_peer_atom_data.d_mass[i] = sequencer->d_mass;
    m_h_peer_atom_data.d_charge[i] = sequencer->d_charge;
    m_h_peer_atom_data.d_transform[i] = sequencer->d_transform;
    // ...
    m_h_peer_tf_array.d_f_normal_x[i] = sequencer->d_f_normal_x;
    m_h_peer_tf_array.d_f_normal_y[i] = sequencer->d_f_normal_y;
    m_h_peer_tf_array.d_f_normal_z[i] = sequencer->d_f_normal_z;
    m_h_peer_tf_array.d_f_saved_nbond_x[i] = sequencer->d_f_saved_nbond_x;
    m_h_peer_tf_array.d_f_saved_nbond_y[i] = sequencer->d_f_saved_nbond_y;
    m_h_peer_tf_array.d_f_saved_nbond_z[i] = sequencer->d_f_saved_nbond_z;
    m_h_peer_tf_array.d_f_saved_slow_x[i] = sequencer->d_f_saved_slow_x;
    m_h_peer_tf_array.d_f_saved_slow_y[i] = sequencer->d_f_saved_slow_y;
    m_h_peer_tf_array.d_f_saved_slow_z[i] = sequencer->d_f_saved_slow_z;
    m_h_peer_tf_array.d_atomFixed[i] = sequencer->d_atomFixed;
    // ...
    m_h_peer_af_array.d_f_applied_x[i] = sequencer->d_f_global_x;
    m_h_peer_af_array.d_f_applied_y[i] = sequencer->d_f_global_y;
    m_h_peer_af_array.d_f_applied_z[i] = sequencer->d_f_global_z;
    m_h_peer_af_array.d_atomFixed[i] = sequencer->d_atomFixed;
    // ...
    DebugM(3, "updateAtomMaps: device " + std::to_string(deviceID) +
                  ", sequencer " + ptr_to_str(sequencer) + "\n");
  }
  // ...
  copyPeerArraysToDevice();
  // ...
  m_atom_maps_changed = CudaGlobalMasterServer::numCopyLists;
}
void CudaGlobalMasterServer::setStep(int64_t step) {
  // ...
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    (*it)->setStep(step);
  }
  if (m_print_profiling_freq > 0) {
    if (step % m_print_profiling_freq == 0) {
      // ...
    }
  }
}
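// communicateToClients: the server-to-clients half of a step. Rebuild the
// position and total-force copy lists when a client's atom requests or the
// atom maps have changed, push the lattice to clients that asked for it, copy
// the requested per-atom data (positions, masses, charges, transforms,
// velocities) and total forces into the client device buffers, and finally
// time the clients' per-step calculation, accumulating it in m_t_calc.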
void CudaGlobalMasterServer::communicateToClients(const Lattice *lat) {
  // ...
  bool b_buildAtomsPositionCopyList = false;
  bool b_buildAtomsTotalForcesCopyList = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    // ...
    if ((*it)->requestedAtomsChanged()) {
      b_buildAtomsPositionCopyList = true;
    }
    if ((*it)->requestedTotalForcesAtomsChanged()) {
      b_buildAtomsTotalForcesCopyList = true;
    }
    if ((*it)->requestUpdateLattice()) {
      copyLatticeToClient(lat, m_device_id, *it, m_stream);
    }
  }
  if (b_buildAtomsPositionCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildAtomsCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  if (b_buildAtomsTotalForcesCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildAtomsTotalForcesCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  // ...
  bool b_copyPositions = false;
  bool b_copyTotalForces = false;
  bool b_copyMasses = false;
  bool b_copyCharges = false;
  bool b_copyTransforms = false;
  bool b_copyVelocities = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    b_copyPositions |= (*it)->requestUpdateAtomPositions();
    b_copyTotalForces |= (*it)->requestUpdateAtomTotalForces();
    b_copyMasses |= (*it)->requestUpdateMasses();
    b_copyCharges |= (*it)->requestUpdateCharges();
    b_copyTransforms |= (*it)->requestUpdateTransforms();
    b_copyVelocities |= (*it)->requestUpdateVelocities();
  }
  // ...
  if (b_buildAtomsPositionCopyList || b_copyPositions || b_copyMasses ||
      b_copyCharges || b_copyTransforms || b_copyVelocities) {
    copyAtomsToClients(b_copyPositions, b_copyMasses, b_copyCharges,
                       b_copyTransforms, b_copyVelocities);
  }
  if (b_copyTotalForces || b_buildAtomsTotalForcesCopyList) {
    copyTotalForcesToClients();
  }
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    // ...
  }
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_calc += endTime - startTime;
}
bool CudaGlobalMasterServer::requestedTotalForces() const {
  bool result = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    result |= (!((*it)->getRequestedForcedAtoms().empty()) &&
               (*it)->requestUpdateAtomTotalForces());
  }
  return result;
}
void CudaGlobalMasterServer::buildAtomsCopyList() {
  // ...
  DebugM(3, "buildAtomsCopyList is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  buildCopyList(&CudaGlobalMasterClient::getRequestedAtoms,
                &CudaGlobalMasterClient::getPositions, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index, m_d_atom_pos_client_buffers,
                m_atom_pos_copy_list, m_d_atom_pos_copy_list, m_stream);
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_build_copy_lists += endTime - startTime;
}
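// copyAtomsToClients: copy the requested per-atom data into each client's
// device buffers through the position copy list. With a single device the SOA
// arrays of the local SequencerCUDA are read directly; with multiple devices
// the MGPU variant walks the peer-array pointer tables instead.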
void CudaGlobalMasterServer::copyAtomsToClients(bool copyPositions,
                                                bool copyMasses,
                                                bool copyCharges,
                                                bool copyTransforms,
                                                bool copyVelocities) {
  // ...
  DebugM(1, "copyAtomsToClients is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  if (m_num_devices == 1) {
    // ...
#ifdef MIN_DEBUG_LEVEL
#if MIN_DEBUG_LEVEL <= 1
    debugCopyList("CudaGlobalMasterServer::copyAtomsToClients",
                  m_atom_pos_copy_list);
#endif
#endif
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyAtomsToClientsCUDA(
        copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
        sequencer->d_pos_x, sequencer->d_pos_y, sequencer->d_pos_z,
        sequencer->d_vel_x, sequencer->d_vel_y, sequencer->d_vel_z,
        sequencer->d_transform, sequencer->d_mass, sequencer->d_charge,
        sequencer->myLattice, this->m_d_atom_pos_copy_list,
        this->m_atom_pos_copy_list.size(), this->m_d_atom_pos_client_buffers,
        m_clients.size(), m_stream);
  } else {
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyAtomsToClientsCUDAMGPU(
        copyPositions, copyMasses, copyCharges, copyTransforms, copyVelocities,
        (const double **)m_h_peer_atom_data.d_pos_x,
        (const double **)m_h_peer_atom_data.d_pos_y,
        (const double **)m_h_peer_atom_data.d_pos_z,
        (const double **)m_h_peer_atom_data.d_vel_x,
        (const double **)m_h_peer_atom_data.d_vel_y,
        (const double **)m_h_peer_atom_data.d_vel_z,
        (const char3 **)m_h_peer_atom_data.d_transform,
        (const float **)m_h_peer_atom_data.d_mass,
        (const float **)m_h_peer_atom_data.d_charge, sequencer->myLattice,
        this->m_d_atom_pos_copy_list, this->m_atom_pos_copy_list.size(),
        this->m_d_atom_pos_client_buffers, m_clients.size(), m_stream);
  }
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_copy_atoms += endTime - startTime;
}
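// copyTotalForcesToClients: the same pattern for the total-force request; the
// normal, saved nonbonded and saved slow force components are gathered
// through the total-force copy list into each client's buffers, together with
// simParams->fixedAtomsOn and the fixed-atom flags.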
void CudaGlobalMasterServer::copyTotalForcesToClients() {
  // ...
  DebugM(1, "copyTotalForcesToClients is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  if (m_num_devices == 1) {
    // ...
#ifdef MIN_DEBUG_LEVEL
#if MIN_DEBUG_LEVEL <= 1
    debugCopyList("CudaGlobalMasterServer::copyTotalForcesToClients",
                  m_atom_total_force_copy_list);
#endif
#endif
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    copyTotalForcesToClientsCUDA(
        simParams->fixedAtomsOn, sequencer->d_f_normal_x,
        sequencer->d_f_normal_y, sequencer->d_f_normal_z,
        sequencer->d_f_saved_nbond_x, sequencer->d_f_saved_nbond_y,
        sequencer->d_f_saved_nbond_z, sequencer->d_f_saved_slow_x,
        sequencer->d_f_saved_slow_y, sequencer->d_f_saved_slow_z,
        sequencer->d_atomFixed, this->m_d_atom_total_force_copy_list,
        this->m_atom_total_force_copy_list.size(),
        this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
  } else {
    copyTotalForcesToClientsCUDAMGPU(
        // ...
        (const double **)m_d_peer_tf_array.d_f_normal_x,
        (const double **)m_d_peer_tf_array.d_f_normal_y,
        (const double **)m_d_peer_tf_array.d_f_normal_z,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_x,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_y,
        (const double **)m_d_peer_tf_array.d_f_saved_nbond_z,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_x,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_y,
        (const double **)m_d_peer_tf_array.d_f_saved_slow_z,
        (const int **)m_d_peer_tf_array.d_atomFixed,
        this->m_d_atom_total_force_copy_list,
        this->m_atom_total_force_copy_list.size(),
        this->m_atom_total_force_client_buffers, m_clients.size(), m_stream);
  }
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_copy_total_forces += endTime - startTime;
}
void CudaGlobalMasterServer::buildAtomsTotalForcesCopyList() {
  // ...
  DebugM(3, "buildAtomsTotalForcesCopyList is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  buildCopyList(&CudaGlobalMasterClient::getRequestedTotalForcesAtoms,
                &CudaGlobalMasterClient::getTotalForces, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index,
                m_atom_total_force_client_buffers, m_atom_total_force_copy_list,
                m_d_atom_total_force_copy_list, m_stream);
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_build_copy_lists += endTime - startTime;
}
void CudaGlobalMasterServer::buildForcedAtomsCopyList() {
  // ...
  DebugM(3, "buildForcedAtomsCopyList is called\n");
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  buildCopyList(&CudaGlobalMasterClient::getRequestedForcedAtoms,
                &CudaGlobalMasterClient::getAppliedForces, m_clients,
                m_atom_map_lists, m_local_records, m_global_to_local_id,
                m_src_devs, m_device_id_to_index,
                m_d_forced_atom_client_buffers, m_forced_atom_copy_list,
                m_d_forced_atom_copy_list, m_stream, &m_unique_forced_atoms);
  // ...
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_build_copy_lists += endTime - startTime;
}
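// communicateToMD: the clients-to-server half of a step. Rebuild the
// forced-atom copy list when requests or atom maps have changed, then, if any
// client wants to apply forces this step, add the client forces to the
// simulation's global force arrays and account the time in
// m_t_add_global_forces.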
void CudaGlobalMasterServer::communicateToMD() {
  // ...
  DebugM(1, "Calling communicateToMD at step " + std::to_string(m_step));
  // ...
  bool b_buildForcedAtomsCopyList = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    if ((*it)->requestedForcedAtomsChanged()) {
      b_buildForcedAtomsCopyList = true;
    }
  }
  if (b_buildForcedAtomsCopyList || m_atom_maps_changed > 0 ||
      m_clients_changed > 0) {
    buildForcedAtomsCopyList();
    if (m_atom_maps_changed > 0)
      m_atom_maps_changed--;
    if (m_clients_changed > 0)
      m_clients_changed--;
  }
  // ...
  bool b_copyForcedAtoms = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    if ((*it)->requestUpdateForcedAtoms()) {
      b_copyForcedAtoms = true;
    }
  }
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  if (b_copyForcedAtoms) {
    addGlobalForces();
  }
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_add_global_forces += endTime - startTime;
}
bool CudaGlobalMasterServer::willAddGlobalForces() const {
  bool result = false;
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    result |= (!((*it)->getRequestedForcedAtoms().empty()) &&
               (*it)->requestUpdateForcedAtoms());
  }
  return result;
}
void CudaGlobalMasterServer::addGlobalForces() {
  // ...
  DebugM(1, "Calling addGlobalForces at step " + std::to_string(m_step));
  // ...
  if (m_num_devices == 1) {
    // ...
    const SequencerCUDA *sequencer = SequencerCUDA::ObjectOnPe(masterPe);
    addGlobalForcesFromClients(
        simParams->fixedAtomsOn, m_unique_forced_atoms, sequencer->d_f_global_x,
        sequencer->d_f_global_y, sequencer->d_f_global_z,
        sequencer->d_atomFixed, this->m_d_forced_atom_copy_list,
        this->m_forced_atom_copy_list.size(),
        this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
  } else {
    addGlobalForcesFromClientsMGPU(
        simParams->fixedAtomsOn, m_unique_forced_atoms,
        m_d_peer_af_array.d_f_applied_x,
        m_d_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_z,
        (const int **)m_d_peer_af_array.d_atomFixed,
        this->m_d_forced_atom_copy_list, this->m_forced_atom_copy_list.size(),
        this->m_d_forced_atom_client_buffers, m_clients.size(), m_stream);
  }
  // ...
  cudaCheck(cudaStreamSynchronize(m_stream));
}
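// allocatePeerArrays / copyPeerArraysToDevice: in multi-GPU runs the server
// keeps, for every device, host-side tables of the peer SequencerCUDA array
// pointers (positions, velocities, masses, charges, transforms, total-force
// components and applied-force buffers). The tables are filled in
// updateAtomMaps() and mirrored to the device so the MGPU copy kernels can
// dereference the per-device pointers.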
void CudaGlobalMasterServer::allocatePeerArrays() {
  // ...
  DebugM(3, "CudaGlobalMasterServer::allocatePeerArrays");
  // ...
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_x), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_y), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_pos_z), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_x), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_y), m_num_devices);
  allocate_host<double *>(&(m_h_peer_atom_data.d_vel_z), m_num_devices);
  allocate_host<float *>(&(m_h_peer_atom_data.d_mass), m_num_devices);
  allocate_host<float *>(&(m_h_peer_atom_data.d_charge), m_num_devices);
  allocate_host<char3 *>(&(m_h_peer_atom_data.d_transform), m_num_devices);

  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_normal_z), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_nbond_z), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_tf_array.d_f_saved_slow_z), m_num_devices);
  allocate_host<int *>(&(m_h_peer_tf_array.d_atomFixed), m_num_devices);

  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_x), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_y), m_num_devices);
  allocate_host<tf_type *>(&(m_h_peer_af_array.d_f_applied_z), m_num_devices);
  allocate_host<int *>(&(m_h_peer_af_array.d_atomFixed), m_num_devices);
  // ...
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_x), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_y), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_pos_z), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_x), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_y), m_num_devices);
  allocate_device<double *>(&(m_d_peer_atom_data.d_vel_z), m_num_devices);
  allocate_device<float *>(&(m_d_peer_atom_data.d_mass), m_num_devices);
  allocate_device<float *>(&(m_d_peer_atom_data.d_charge), m_num_devices);
  allocate_device<char3 *>(&(m_d_peer_atom_data.d_transform), m_num_devices);

  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_normal_z), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_nbond_z), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_tf_array.d_f_saved_slow_z), m_num_devices);
  allocate_device<int *>(&(m_d_peer_tf_array.d_atomFixed), m_num_devices);

  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_x), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_y), m_num_devices);
  allocate_device<tf_type *>(&(m_d_peer_af_array.d_f_applied_z), m_num_devices);
  allocate_device<int *>(&(m_d_peer_af_array.d_atomFixed), m_num_devices);
}
void CudaGlobalMasterServer::copyPeerArraysToDevice() {
  // ...
  DebugM(3, "CudaGlobalMasterServer::copyPeerArraysToDevice");
  // ...
  copy_HtoD(m_h_peer_atom_data.d_pos_x, m_d_peer_atom_data.d_pos_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_pos_y, m_d_peer_atom_data.d_pos_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_pos_z, m_d_peer_atom_data.d_pos_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_x, m_d_peer_atom_data.d_vel_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_y, m_d_peer_atom_data.d_vel_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_vel_z, m_d_peer_atom_data.d_vel_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_mass, m_d_peer_atom_data.d_mass,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_charge, m_d_peer_atom_data.d_charge,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_atom_data.d_transform, m_d_peer_atom_data.d_transform,
            m_num_devices, m_stream);

  copy_HtoD(m_h_peer_tf_array.d_f_normal_x, m_d_peer_tf_array.d_f_normal_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_normal_y, m_d_peer_tf_array.d_f_normal_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_normal_z, m_d_peer_tf_array.d_f_normal_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_x,
            m_d_peer_tf_array.d_f_saved_nbond_x, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_y,
            m_d_peer_tf_array.d_f_saved_nbond_y, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_nbond_z,
            m_d_peer_tf_array.d_f_saved_nbond_z, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_x,
            m_d_peer_tf_array.d_f_saved_slow_x, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_y,
            m_d_peer_tf_array.d_f_saved_slow_y, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_f_saved_slow_z,
            m_d_peer_tf_array.d_f_saved_slow_z, m_num_devices, m_stream);
  copy_HtoD(m_h_peer_tf_array.d_atomFixed, m_d_peer_tf_array.d_atomFixed,
            m_num_devices, m_stream);

  copy_HtoD(m_h_peer_af_array.d_f_applied_x, m_d_peer_af_array.d_f_applied_x,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_f_applied_y, m_d_peer_af_array.d_f_applied_y,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_f_applied_z, m_d_peer_af_array.d_f_applied_z,
            m_num_devices, m_stream);
  copy_HtoD(m_h_peer_af_array.d_atomFixed, m_d_peer_af_array.d_atomFixed,
            m_num_devices, m_stream);
}
#ifdef NODEGROUP_FORCE_REGISTER
void CudaGlobalMasterServer::finishReductions(bool doEnergy, bool doVirial,
                                              // ...
  DebugM(1, "Calling finishReductions at step " + std::to_string(m_step));
  // ...
  const auto startTime = std::chrono::high_resolution_clock::now();
  // ...
  for (auto it = m_clients.begin(); it != m_clients.end(); ++it) {
    (*it)->finishReductions(doEnergy, doVirial, reduction);
  }
  const auto endTime = std::chrono::high_resolution_clock::now();
  m_t_reductions += endTime - startTime;
  // ...
}
#endif // NODEGROUP_FORCE_REGISTER

#else

// Stub used when CUDA / NODEGROUP_FORCE_REGISTER support is not compiled in.
CudaGlobalMasterServer::CudaGlobalMasterServer(
    int deviceID, int printProfilingFreq
    // ...
  NAMD_die(
      "CudaGlobalMasterServer requires NAMD to be built with CUDA support.\n");
// ...

#endif // defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)