NAMD
CollectiveDeviceBuffer< T > Class Template Reference

#include <CollectiveDeviceBuffer.h>

Public Member Functions

void allocate (CollectiveBufferType type_in, const size_t numElemsIn, SynchronousCollectiveScope scope=SynchronousCollectiveScope::all)
 Allocates a symmetric buffer on all devices. More...
 
void allocate_no_check (CollectiveBufferType type_in, const size_t numElemsIn)
 Allocates a symmetric buffer on all devices without reducing the buffer sizes. More...
 
void reallocate (CollectiveBufferType type_in, const size_t newNumElems, const double factor, SynchronousCollectiveScope scope=SynchronousCollectiveScope::all)
 Reallocates a symmetric device buffer on all devices if needed. More...
 
void reallocate_no_check (CollectiveBufferType type_in, const size_t newNumElems, const double factor)
 
void deallocate ()
 
T *const getDevicePtr () const
 Returns the pointer to the device buffer. More...
 
T **const getDevicePeerPtr () const
 Returns the device pointer to the array of peers' device buffer pointers. More...
 
std::vector< T * > const & getHostPeer () const
 Returns a host-side vector containing the peers' device buffer pointers. More...
 
CollectiveDeviceBuffer< T > & operator= (CollectiveDeviceBuffer< T > &&other)
 
CollectiveDeviceBuffer< T > & operator= (const CollectiveDeviceBuffer< T > &other)
 
 CollectiveDeviceBuffer (const CollectiveDeviceBuffer &other)
 
 CollectiveDeviceBuffer ()
 

Detailed Description

template<typename T>
class CollectiveDeviceBuffer< T >

Definition at line 25 of file CollectiveDeviceBuffer.h.

Constructor & Destructor Documentation

◆ CollectiveDeviceBuffer() [1/2]

template<typename T>
CollectiveDeviceBuffer< T >::CollectiveDeviceBuffer ( const CollectiveDeviceBuffer< T > &  other)
inline

Definition at line 117 of file CollectiveDeviceBuffer.h.

{
  type = other.type;
  numElemsAlloc = other.numElemsAlloc;
  buffer = other.buffer;
  d_peerBuffers = other.d_peerBuffers;
  h_peerBuffers = other.h_peerBuffers;
}

◆ CollectiveDeviceBuffer() [2/2]

template<typename T>
CollectiveDeviceBuffer< T >::CollectiveDeviceBuffer ( )
inline

Definition at line 126 of file CollectiveDeviceBuffer.h.

{}

Member Function Documentation

◆ allocate()

template<typename T >
void CollectiveDeviceBuffer< T >::allocate ( CollectiveBufferType  type_in,
const size_t  numElemsIn,
SynchronousCollectiveScope  scope = SynchronousCollectiveScope::all 
)

Allocates a symmetric buffer on all devices.

This will allocate a buffer on all devices using the allocation method of the given CollectiveBufferType. The buffer has the same size on every device; this function automatically computes the maximum of the requested element counts across all master PEs and allocates that many elements everywhere.

This function can be called either by all PEs or by just the master PEs, depending on the given scope.
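A minimal usage sketch (the variable names and element count below are illustrative, not taken from the NAMD sources), assuming CollectiveDeviceBuffer.h is included and every PE reaches this call:

  // Every PE calls allocate(); counts from non-master PEs are replaced by 0
  // before the reduction, so only master-PE counts determine the buffer size.
  CollectiveDeviceBuffer<float> forceBuf;
  forceBuf.allocate(CollectiveBufferType::IPC, localNumElems,
                    SynchronousCollectiveScope::all);
  float*  d_buf   = forceBuf.getDevicePtr();      // this device's symmetric buffer
  float** d_peers = forceBuf.getDevicePeerPtr();  // device-resident array of peer pointers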

Definition at line 10 of file CollectiveDeviceBuffer.C.

References SynchronousCollectives::allReduce(), GlobalGPUMgr::getIsMasterPe(), GlobalGPUMgr::Object(), and SynchronousCollectives::Object().

{
  SynchronousCollectives* syncColl = SynchronousCollectives::Object();
  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();

  size_t numElemsTemp;
  if (isMasterPe) {
    numElemsTemp = numElemsIn;
  } else {
    numElemsTemp = 0;
  }

  // Compute size of buffer
  std::vector<size_t> numElemsVec = {numElemsTemp};
  const size_t maxNumElems = syncColl->allReduce<size_t>(numElemsVec, CkReduction::max_ulong_long,
                                                         scope)[0];

  // Call the no check version of allocate
  allocate_no_check(type_in, maxNumElems);
}

◆ allocate_no_check()

template<typename T >
void CollectiveDeviceBuffer< T >::allocate_no_check ( CollectiveBufferType  type_in,
const size_t  numElemsIn 
)

Allocates a symmetric buffer on all devices without reducing the buffer sizes.

This function will allocate a buffer on all devices, but it assumes that the number of elements given to it is already identical across all master PEs.
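A brief sketch of this precondition (buf and numElemsAgreed are hypothetical names), assuming the caller has already made the count identical on every master PE:

  // No reduction is performed here, so numElemsAgreed must already match on
  // all master PEs or the resulting buffers will not be symmetric.
  buf.allocate_no_check(CollectiveBufferType::SingleProcess, numElemsAgreed);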

Definition at line 33 of file CollectiveDeviceBuffer.C.

References SynchronousCollectives::allGather(), cudaCheck, GlobalGPUMgr::getDeviceIndex(), GlobalGPUMgr::getIsMasterPe(), GlobalGPUMgr::getNumDevices(), IPC, master, NAMD_die(), GlobalGPUMgr::Object(), SynchronousCollectives::Object(), and SingleProcess.

{
  SynchronousCollectives* syncColl = SynchronousCollectives::Object();
  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();
  const int numDevices = globalGPUMgr->getNumDevices();
  const int deviceIndex = globalGPUMgr->getDeviceIndex();

  type = type_in;

  numElemsAlloc = numElemsIn;

  if (CollectiveBufferType::SingleProcess == type) {
    if (isMasterPe) {
      // Allocate Buffer locally
      allocate_device<T>(&(buffer), numElemsAlloc);

      // Communicate buffer to peers
      auto temp_peerBuffers = syncColl->allGather<unsigned long long>(
        (unsigned long long) buffer, SynchronousCollectiveScope::master);

      // Copy peer buffers to vector
      h_peerBuffers.resize(numDevices);
      for (int i = 0; i < numDevices; i++) {
        h_peerBuffers[i] = (T*) temp_peerBuffers[i];
      }

      // Copy P2P buffer to device
      allocate_device<T*>(&(d_peerBuffers), numDevices);
      copy_HtoD<T*>(h_peerBuffers.data(), d_peerBuffers, numDevices, nullptr);
      cudaStreamSynchronize(nullptr);
    }
#if ! defined(NAMD_HIP)
  } else if (CollectiveBufferType::IPC == type) {
    if (isMasterPe) {
      cudaIpcMemHandle_t handle;
      // Allocate Buffer locally
      allocate_device<T>(&(buffer), numElemsAlloc);
      cudaCheck(cudaIpcGetMemHandle(&handle, (void*) buffer));

      // Communicate handles to peers
      auto temp_peerHandles = syncColl->allGather<cudaIpcMemHandle_t>(handle,
        SynchronousCollectiveScope::master);

      // Open CUDA IPC mem handles and store in vector
      h_peerBuffers.resize(numDevices);
      for (int i = 0; i < numDevices; i++) {
        if (i != deviceIndex) {
          cudaCheck(cudaIpcOpenMemHandle((void**) &(h_peerBuffers[i]),
                                         temp_peerHandles[i],
                                         cudaIpcMemLazyEnablePeerAccess));
        } else {
          h_peerBuffers[i] = buffer;
        }
      }

      // Copy P2P buffer to device
      allocate_device<T*>(&(d_peerBuffers), numDevices);
      copy_HtoD<T*>(h_peerBuffers.data(), d_peerBuffers, numDevices, nullptr);
      cudaCheck(cudaStreamSynchronize(nullptr));
    }
#endif
  } else {
    NAMD_die("CollectiveBufferType not currently implemented");
  }
}

◆ deallocate()

template<typename T >
void CollectiveDeviceBuffer< T >::deallocate ( )

Definition at line 145 of file CollectiveDeviceBuffer.C.

References GlobalGPUMgr::getIsMasterPe(), and GlobalGPUMgr::Object().

{
  // This is needed because different PEs can call functions on objects created by other PEs.
  // When this happens, both PEs could try to call deallocate leading to a double free and
  // a seg fault. This seems like a hacky way to fix it.
  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();

  numElemsAlloc = 0;

  if (!globalGPUMgr->getIsMasterPe()) return;

  if (buffer) {
    deallocate_device<T>(&buffer);
    buffer = nullptr;
  }
  if (d_peerBuffers) {
    deallocate_device<T*>(&d_peerBuffers);
    d_peerBuffers = nullptr;
  }
}

◆ getDevicePeerPtr()

template<typename T>
T** const CollectiveDeviceBuffer< T >::getDevicePeerPtr ( ) const
inline

Returns the device pointer to the array of peers' device buffer pointers.
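A hedged device-side sketch (the kernel and its arguments are hypothetical, not part of NAMD) of how the peer-pointer array returned here is typically indexed from a kernel:

  // Hypothetical kernel: each thread accumulates element idx from every peer's buffer.
  template <typename T>
  __global__ void sumPeersKernel(T** d_peerBuffers, int numDevices, T* out, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    T acc = 0;
    for (int dev = 0; dev < numDevices; dev++) {
      acc += d_peerBuffers[dev][idx];  // remote reads go through P2P / CUDA IPC mappings
    }
    out[idx] = acc;
  }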

Definition at line 76 of file CollectiveDeviceBuffer.h.

{
  return d_peerBuffers;
}

◆ getDevicePtr()

template<typename T>
T* const CollectiveDeviceBuffer< T >::getDevicePtr ( ) const
inline

Returns the pointer to the device buffer.

Definition at line 69 of file CollectiveDeviceBuffer.h.

{
  return buffer;
}

◆ getHostPeer()

template<typename T>
std::vector<T*> const& CollectiveDeviceBuffer< T >::getHostPeer ( ) const
inline

Returns a host-side vector containing the peers' device buffer pointers.
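An illustrative host-side sketch (variable names are hypothetical): the returned vector mirrors the device-side peer array and is useful for passing individual peer pointers to kernels or for diagnostics:

  const std::vector<float*>& peers = forceBuf.getHostPeer();
  for (size_t i = 0; i < peers.size(); i++) {
    // peers[i] is device memory owned by device i; it can be passed to kernels
    // or CUDA API calls but must not be dereferenced from host code.
    printf("device %zu buffer at %p\n", i, (void*) peers[i]);
  }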

Definition at line 83 of file CollectiveDeviceBuffer.h.

{
  return h_peerBuffers;
}

◆ operator=() [1/2]

template<typename T>
CollectiveDeviceBuffer<T>& CollectiveDeviceBuffer< T >::operator= ( CollectiveDeviceBuffer< T > &&  other)
inline

Definition at line 88 of file CollectiveDeviceBuffer.h.

{
  type = other.type;

  numElemsAlloc = other.numElemsAlloc;
  buffer = other.buffer;
  d_peerBuffers = other.d_peerBuffers;

  // This object owns the underlying data now, so we modify other
  other.numElemsAlloc = 0;
  other.buffer = nullptr;
  other.d_peerBuffers = nullptr;

  h_peerBuffers = std::move(other.h_peerBuffers);

  return *this;
}

◆ operator=() [2/2]

template<typename T>
CollectiveDeviceBuffer<T>& CollectiveDeviceBuffer< T >::operator= ( const CollectiveDeviceBuffer< T > &  other)
inline

Definition at line 106 of file CollectiveDeviceBuffer.h.

{
  type = other.type;
  numElemsAlloc = other.numElemsAlloc;
  buffer = other.buffer;
  d_peerBuffers = other.d_peerBuffers;
  h_peerBuffers = other.h_peerBuffers;

  return *this;
}

◆ reallocate()

template<typename T >
void CollectiveDeviceBuffer< T >::reallocate ( CollectiveBufferType  type_in,
const size_t  newNumElems,
const double  factor,
SynchronousCollectiveScope  scope = SynchronousCollectiveScope::all 
)

Reallocates a symmetric device buffer on all devices if needed.
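An illustrative call (the names and growth factor are hypothetical): reallocation only happens when the collective maximum of the requested counts exceeds the current allocation, and the new allocation is then padded by the given factor to reduce future reallocations.

  // Grow only if some master PE now needs more than the currently allocated
  // number of elements; on growth, allocate 1.5x the requested maximum.
  forceBuf.reallocate(CollectiveBufferType::IPC, newLocalNumElems, 1.5,
                      SynchronousCollectiveScope::all);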

Definition at line 100 of file CollectiveDeviceBuffer.C.

References SynchronousCollectives::allReduce(), GlobalGPUMgr::getIsMasterPe(), GlobalGPUMgr::Object(), and SynchronousCollectives::Object().

{
  SynchronousCollectives* syncColl = SynchronousCollectives::Object();
  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();

  size_t numElemRequested;
  if (isMasterPe) {
    numElemRequested = newNumElems;
  } else {
    numElemRequested = 0;
  }

  std::vector<size_t> numElemsVec = {numElemRequested};
  const size_t maxNumElemsRequested = syncColl->allReduce<size_t>(numElemsVec, CkReduction::max_ulong_long,
                                                                  scope)[0];

  reallocate_no_check(type_in, maxNumElemsRequested, factor);
}

◆ reallocate_no_check()

template<typename T >
void CollectiveDeviceBuffer< T >::reallocate_no_check ( CollectiveBufferType  type_in,
const size_t  newNumElems,
const double  factor 
)

Definition at line 121 of file CollectiveDeviceBuffer.C.

References Empty, GlobalGPUMgr::getIsMasterPe(), NAMD_die(), and GlobalGPUMgr::Object().

{
  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();

  if (isMasterPe) {
    if (type == CollectiveBufferType::Empty) {
      type = type_in;
    } else if (type != type_in) {
      NAMD_die("Reallocating buffer with different type");
    }

    const size_t maxNumElemsRequested = newNumElems;

    if (maxNumElemsRequested > numElemsAlloc) {
      const size_t newNumElemsAlloc = (size_t) ((double)maxNumElemsRequested * factor);
      deallocate();
      allocate_no_check(type_in, newNumElemsAlloc);
    }
  }
}

The documentation for this class was generated from the following files:

CollectiveDeviceBuffer.h
CollectiveDeviceBuffer.C