NAMD
SynchronousCollectives.h
#ifndef SYNCHRONOUS_COLLECTIVES_H
#define SYNCHRONOUS_COLLECTIVES_H

#include "charm++.h"

#include "main.h"
#include "NamdTypes.h"
#include "ProcessorPrivate.h"
#include "CudaRecord.h"
#include "CudaUtils.h"

#include "SynchronousCollectives.decl.h"

#include <vector>
#include <any>
#include <map>

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

/*
 * Defines the scopes used by collectives
 */
enum class SynchronousCollectiveScope: unsigned int {
  all,
  master,
  single
};

class SynchronousCollectivesMulticastMsg :
  public CkMcastBaseMsg, public CMessage_SynchronousCollectivesMulticastMsg
{
public:
};

/*
 * A collection of bulk synchronous collective functions used in the GPU-resident code
 *
 * The general strategy for the collectives is for the calling thread to invoke the
 * entry methods and then enter a loop yielding control to the Converse scheduler
 * and checking whether the expected messages have arrived. The entry methods store the
 * data they receive in a temporary buffer and then indicate that they have received
 * that message.
 *
 * This approach doesn't have strict ordering guarantees for entry functions.
 * For example:
 *
 * Suppose we have three PEs (0, 1, 2) which each call some entry function A. If PE 0
 * receives messages from PEs 1 and 2 and executes the corresponding A entry functions,
 * it will continue with the main thread execution. However, there is no guarantee PE 1
 * has received the message from PE 2 at this point. PE 0 could then invoke another
 * entry function, B. PE 1 could execute B based on PE 0's message before it executes A
 * based on PE 2's message.
 *
 * The entry methods store the data they receive in a std::map; the key to the map is
 * an unsigned integer that gets incremented when the collective is called and then
 * passed to the entry function.
 *
 */
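/*
 * A minimal sketch (not the actual implementation) of the yield-and-check pattern
 * described above, assuming a per-collective counter incremented by the entry methods
 * and the Converse polling call CsdSchedulePoll():
 *
 *   while (receivedCount < expectedCount) {
 *     CsdSchedulePoll();  // let the scheduler deliver pending entry-method messages
 *   }
 */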
class SynchronousCollectives : public CBase_SynchronousCollectives
{
public:
  static SynchronousCollectives* Object() {
    return CkpvAccess(SynchronousCollectives_instance);
  }
  static SynchronousCollectives* ObjectOnPe(const int pe) {
    return CkpvAccessOther(SynchronousCollectives_instance, CmiRankOf(pe));
  }
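  /*
   * Usage sketch (hypothetical caller): code running on a PE typically reaches the
   * group through the static accessor, e.g.
   *
   *   SynchronousCollectives* sync = SynchronousCollectives::Object();
   *   sync->barrier(SynchronousCollectiveScope::all);
   */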

  /*
   * @brief Initializes the all-scoped collectives
   *
   * This will initialize the list of PEs and the barrier used for all-scoped collectives
   */
  void initAllScope();

  /*
   * @brief Initializes the master-PE-scoped collectives
   *
   * This will initialize the Charm++ sections used for master PE communication as well
   * as the barrier for master-PE-scoped collectives. This is separate from the init
   * for all-scoped collectives because we need information from the GlobalGPUMgr, but
   * the GlobalGPUMgr needs to use all-scoped collectives to initialize.
   *
   * This needs to be called by all PEs.
   *
   * @param[in] isMasterPe whether or not this PE is a master PE
   * @param[in] isMasterDevice whether or not this PE is assigned to the master device
   * @param[in] numDevices number of devices used by NAMD across all nodes
   * @param[in] deviceIndex this device's index among GPUs on all nodes
   * @param[in] masterPeList list of master PEs on all nodes used by master-PE-scoped
   * collectives
   */
  void initMasterScope(const int isMasterPe, const int isMasterDevice,
      const int numDevices, const int deviceIndex, const std::vector<int>& masterPeList);
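  /*
   * Usage sketch (hypothetical caller; the argument names are placeholders for values
   * obtained from the device manager during startup):
   *
   *   SynchronousCollectives::Object()->initMasterScope(
   *       isMasterPe, isMasterDevice, numDevices, deviceIndex, masterPeList);
   */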

  /*
   * @brief Computes the element-wise reduction between PEs on a vector of elements
   *
   * This uses Charm++'s reduction functionality to compute the given reduction over
   * a std::vector. It will involve either all PEs or master PEs depending on the
   * given scope. This function can be called with a master PE scope by non-master PEs;
   * in this case, they will return their input data without blocking.
   *
   * @param[in] data Reference to std::vector of data on which the reduction will happen
   * @param[in] type The type of reduction to perform
   * @param[in] scope The scope of the collective operation
   *
   * @return A vector of data containing the result of the allreduce. This will be the
   * same length as the input data
   */
  template<typename T>
  std::vector<T> allReduce(std::vector<T>& data, CkReduction::reducerType type,
      const SynchronousCollectiveScope scope);
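  /*
   * Usage sketch (hypothetical caller, assuming the built-in Charm++ reducer
   * CkReduction::sum_int and the all-PE scope):
   *
   *   std::vector<int> counts = {localCount};  // localCount is a placeholder
   *   std::vector<int> totals = SynchronousCollectives::Object()->allReduce(
   *       counts, CkReduction::sum_int, SynchronousCollectiveScope::all);
   */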

  /*
   * @brief Performs an all-gather between PEs
   *
   * This performs an all-gather between either all PEs or master PEs depending on the
   * given scope. Each participating PE provides data that will be distributed to all
   * other participating PEs; the gathered entries are returned in a std::vector.
   *
   * @param[in] data Reference to the data being sent by this PE
   * @param[in] scope The scope of the collective operation
   *
   * @return A vector containing data entries from all participating PEs
   */
  template<typename T>
  std::vector<T> allGather(const T& data, const SynchronousCollectiveScope scope);
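  /*
   * Usage sketch (hypothetical caller): each master PE contributes one value and
   * receives a vector with every participant's contribution:
   *
   *   int localValue = 0;  // placeholder per-PE value
   *   std::vector<int> gathered = SynchronousCollectives::Object()->allGather(
   *       localValue, SynchronousCollectiveScope::master);
   */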

  /*
   * @brief Performs an alltoallv between PEs
   *
   * This performs an alltoallv between all PEs or master PEs depending on the given scope.
   * Each PE provides a vector of data where each element is sent to a different remote PE.
   * This operation works on std::vectors, allowing different amounts of data to be sent to
   * different PEs.
   *
   * @param[in] data Reference to the data being sent by this PE
   * @param[in] scope The scope of the collective operation
   *
   * @return A vector containing the entries sent to this PE by all participating PEs
   */
  template<typename T>
  std::vector<T> alltoallv(const std::vector<T>& data, const SynchronousCollectiveScope scope);
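  /*
   * Usage sketch (hypothetical caller, assuming N participating PEs): each PE passes an
   * N-element vector whose i-th entry is delivered to participant i; using a vector-valued
   * element type allows differently sized payloads per destination:
   *
   *   std::vector<std::vector<int>> toSend(numParticipants);  // placeholder payloads
   *   std::vector<std::vector<int>> received = SynchronousCollectives::Object()->alltoallv(
   *       toSend, SynchronousCollectiveScope::master);
   */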

  /*
   * @brief Performs a broadcast
   *
   * This performs a broadcast from one PE to either all PEs or master PEs depending on the
   * given scope. The PE with isRoot set to true will broadcast data, and isRoot should only
   * be true on one PE.
   *
   * @param[in] data Reference to the data being sent to other PEs
   * @param[in] isRoot Boolean set to true for the broadcasting PE
   * @param[in] scope The scope of the broadcast
   *
   * @return Data from the root PE
   */
  template<typename T>
  T broadcast(const T& data, const bool isRoot, const SynchronousCollectiveScope scope);
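  /*
   * Usage sketch (hypothetical caller): exactly one PE passes isRoot = true; every
   * participant receives the root's value as the return value:
   *
   *   double value = 0.0;  // placeholder, meaningful only on the root PE
   *   value = SynchronousCollectives::Object()->broadcast(
   *       value, isRoot, SynchronousCollectiveScope::master);
   */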

  /*
   * @brief Waits for all Charm++ operations to complete and awakens the current thread
   *
   * This will suspend all threads and use quiescence detection to resume execution.
   * This allows all outstanding Charm++ operations to complete. This is necessary
   * in the GPU-resident code path in order to use the existing broadcast, reduction, and
   * output code, which needs to wait for Charm++ entry functions to complete.
   */
  void waitAndAwaken();

  /*
   * @brief Barrier function
   *
   * This will block until all participating PEs have reached the barrier. It can operate
   * with all PEs or just master PEs.
   *
   * @param[in] scope The scope of the barrier
   */
  void barrier(const SynchronousCollectiveScope scope);
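  /*
   * Usage sketch (hypothetical caller): block until every master PE has reached this
   * point before continuing:
   *
   *   SynchronousCollectives::Object()->barrier(SynchronousCollectiveScope::master);
   */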

  /*
   * @brief Barrier-all function
   *
   * This function will always use a single barrier between all PEs, whereas the
   * barrier function may use a barrier between master PEs plus a CmiNodeBarrier.
   * This function is sometimes needed during startup.
   */
  void forceBarrierAll();

  /*
   * Charm++ Entry Functions
   */

  /*
   * Sends a message to all master PEs to initialize the multicast cookie for reductions
   */
  void setupMulticastSection(SynchronousCollectivesMulticastMsg *msg);

  /*
   * This will be used as the callback function given to Charm++'s reduction
   * function. It will be invoked on one PE and will broadcast the results
   * of the reduction to all other PEs
   */
  void handleReductionAll(CkReductionMsg *msg);

  /*
   * This will be used as the callback function given to Charm++'s reduction
   * function. It will be invoked on one PE and will broadcast the results
   * of the reduction to all other master PEs
   */
  void handleReductionMaster(CkReductionMsg *msg);

  /*
   * This function will be called by one PE and executed on all other PEs. It sets
   * the temp value on all PEs to the reduction result and then increments the count
   */
  void broadcastReductionResult(int n, char* data);

  /*
   * Entry function for all-gather and all-to-all; receives data and increments the count
   */
  template<typename T>
  void recvIndexData(const int index, const T& data, const SynchronousCollectiveScope scope,
      const unsigned int key);

  /*
   * Entry function that receives data and increments the count
   */
  template<typename T>
  void recvBroadcast(const T& data, const unsigned int key);

  /*
   * @brief Helper function used by waitAndAwaken
   *
   * This will use quiescence detection to allow all outstanding Charm++ operations
   * to finish
   */
  void wait();

  /*
   * Entry function used in the all-scoped barrier; this will just increment the counter
   */
  void recvBarrierAll(const int PE);

  /*
   * Entry function used in the master-PE-scoped barrier; this will just increment the counter
   */
  void recvBarrierMasterPe(const int deviceIndex);

private:
  /*
   * Helper function to call entry functions for all-gather
   */
  template<typename T>
  void sendAllGather(const T& data, const SynchronousCollectiveScope scope, const unsigned int key);

  /*
   * Helper function to call entry functions for all-to-all
   */
  template<typename T>
  void sendAlltoallv(const std::vector<T>& data, const SynchronousCollectiveScope scope, const unsigned int key);

  /*
   * Helper function to call entry functions for broadcast
   */
  template<typename T>
  void sendBroadcast(const T& data, const SynchronousCollectiveScope scope, const unsigned int key);

  /*
   * @brief Helper function for counting received messages
   *
   * This function is called by entry functions in order to record that a message
   * has been received.
   */
  void incrementCount(const SynchronousCollectiveScope scope, const int index);

  /*
   * @brief Helper function for waiting for all expected messages
   *
   * This function will enter a while loop yielding control to Converse and checking
   * whether all the expected messages (based on scope) have been received.
   */
  void suspendAndCheck(const SynchronousCollectiveScope scope);
  void setThread(CthThread thread) { self_awaken_thread_ = thread; }

  /*
   * Helper function that grabs data from temp storage
   */
  template<typename T>
  T retrieveTemp(const unsigned int key);

  /*
   * Helper function that returns the key for the given scope
   */
  unsigned int getKey(const SynchronousCollectiveScope scope) {
    return (scope == SynchronousCollectiveScope::all) ? tempDataAllKey_++ : tempDataMasterKey_++;
  }

  /*
   * Helper function that returns the barrier based on the given scope
   */
  std::vector<int>& getBarrier(const SynchronousCollectiveScope scope) {
    if (scope == SynchronousCollectiveScope::all) {
      return currentBarrierAll_;
    } else if (scope == SynchronousCollectiveScope::master) {
      return currentBarrierMasterPe_;
    } else {
      return currentBarrierSingle_;
    }
  }

  std::vector<int> masterPeList_;
  CProxySection_SynchronousCollectives masterPes_;
  CProxySection_SynchronousCollectives masterPesMulticast_;
  CkSectionInfo reductionCookie_;
  CProxy_SynchronousCollectives allPes_;

  // Used to store temporary variables
  unsigned int tempDataMasterKey_ = 0;
  unsigned int tempDataAllKey_ = 0;
  std::map<unsigned int, std::any> tempData_;
  std::any reductionTemp_;
  void* reductionPtr_;

  // Barrier counters
  std::vector<int> currentBarrierAll_;
  std::vector<int> currentBarrierMasterPe_;
  std::vector<int> currentBarrierSingle_;

  // Used for QD
  int waitPhase_ = 0;
  CthThread self_awaken_thread_;

  // Store information for MasterPe-scoped communication
  int isMasterPe_ = 0;
  int numDevices_ = 0;
  int deviceIndex_ = -1;
};

#if !(defined(__NVCC__) || defined(__HIPCC__))
#include <pup.h>
PUPbytes(cudaIpcMemHandle_t);
#endif

#endif /* NAMD_CUDA || NAMD_HIP */

#define CK_TEMPLATES_ONLY
#include "SynchronousCollectives.def.h"
#undef CK_TEMPLATES_ONLY

#endif /* SYNCHRONOUS_COLLECTIVES_H */