NAMD
CollectiveDeviceBuffer.C
#include "GlobalGPUMgr.h"
#include "CollectiveDeviceBuffer.h"
#include "CudaUtils.h"
#include "NamdEventsProfiling.h"
#include "TupleTypesCUDA.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

template<typename T>
void CollectiveDeviceBuffer<T>::allocate(CollectiveBufferType type_in, const size_t numElemsIn,
  const SynchronousCollectiveScope scope) {

  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();

  // Only master PEs request a non-zero size; all other PEs contribute zero
  size_t numElemsTemp;
  if (isMasterPe) {
    numElemsTemp = numElemsIn;
  } else {
    numElemsTemp = 0;
  }

  // Compute the buffer size as the maximum requested across the collective
  std::vector<size_t> numElemsVec = {numElemsTemp};
  const size_t maxNumElems = syncColl->allReduce<size_t>(numElemsVec, CkReduction::max_ulong_long,
    scope)[0];

  // Call the no-check version of allocate with the agreed-upon size
  allocate_no_check(type_in, maxNumElems);
}

template<typename T>
void CollectiveDeviceBuffer<T>::allocate_no_check(CollectiveBufferType type_in, const size_t numElemsIn) {

  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();
  const int numDevices = globalGPUMgr->getNumDevices();
  const int deviceIndex = globalGPUMgr->getDeviceIndex();

  type = type_in;

  numElemsAlloc = numElemsIn;

  // NOTE: the buffer-type value tested here was not recoverable from this listing;
  // a direct peer-pointer ("P2P") type is assumed.
  if (CollectiveBufferType::P2P == type) {
    if (isMasterPe) {
      // Allocate buffer locally
      allocate_device<T>(&(buffer), numElemsAlloc);

      // Communicate the raw device pointer to the peer master PEs
      auto temp_peerBuffers = syncColl->allGather<unsigned long long>(
        (unsigned long long) buffer, SynchronousCollectiveScope::master);

      // Copy peer buffer pointers to a host-side vector
      h_peerBuffers.resize(numDevices);
      for (int i = 0; i < numDevices; i++) {
        h_peerBuffers[i] = (T*) temp_peerBuffers[i];
      }

      // Copy P2P pointer table to device
      allocate_device<T*>(&(d_peerBuffers), numDevices);
      copy_HtoD<T*>(h_peerBuffers.data(), d_peerBuffers, numDevices, nullptr);
      cudaCheck(cudaStreamSynchronize(nullptr));
    }
#if ! defined(NAMD_HIP)
  } else if (CollectiveBufferType::IPC == type) {
    if (isMasterPe) {
      cudaIpcMemHandle_t handle;
      // Allocate buffer locally and export an IPC handle for it
      allocate_device<T>(&(buffer), numElemsAlloc);
      cudaCheck(cudaIpcGetMemHandle(&handle, (void*) buffer));

      // Communicate handles to the peer master PEs
      auto temp_peerHandles = syncColl->allGather<cudaIpcMemHandle_t>(handle,
        SynchronousCollectiveScope::master);

      // Open CUDA IPC mem handles and store in vector
      h_peerBuffers.resize(numDevices);
      for (int i = 0; i < numDevices; i++) {
        if (i != deviceIndex) {
          cudaCheck(cudaIpcOpenMemHandle((void**) &(h_peerBuffers[i]),
            temp_peerHandles[i],
            cudaIpcMemLazyEnablePeerAccess));
        } else {
          h_peerBuffers[i] = buffer;
        }
      }

      // Copy P2P pointer table to device
      allocate_device<T*>(&(d_peerBuffers), numDevices);
      copy_HtoD<T*>(h_peerBuffers.data(), d_peerBuffers, numDevices, nullptr);
      cudaCheck(cudaStreamSynchronize(nullptr));
    }
#endif
  } else {
    NAMD_die("CollectiveBufferType not currently implemented");
  }
}

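/*
 * Illustrative sketch only, not part of the original file: one way a device kernel
 * could consume the d_peerBuffers table built by allocate_no_check(), reading the
 * same element from every peer device's symmetric buffer. The kernel name and the
 * reduction it performs are assumptions, so the sketch is kept under "#if 0".
 */
#if 0
template<typename T>
__global__ void peerGatherSumKernel(T** peerBuffers, const int numDevices,
    const int numElems, T* out) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numElems) {
    // peerBuffers[d] points at device d's buffer; with P2P access or an opened IPC
    // mapping these loads read directly from the peer device's memory.
    T acc = peerBuffers[0][idx];
    for (int d = 1; d < numDevices; d++) {
      acc += peerBuffers[d][idx];
    }
    out[idx] = acc;
  }
}
#endif
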
template<typename T>
void CollectiveDeviceBuffer<T>::reallocate(CollectiveBufferType type_in, const size_t newNumElems, const double factor,
  const SynchronousCollectiveScope scope) {

  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();

  // Only master PEs request a non-zero size; all other PEs contribute zero
  size_t numElemRequested;
  if (isMasterPe) {
    numElemRequested = newNumElems;
  } else {
    numElemRequested = 0;
  }

  // Agree on the maximum requested size across the collective
  std::vector<size_t> numElemsVec = {numElemRequested};
  const size_t maxNumElemsRequested = syncColl->allReduce<size_t>(numElemsVec, CkReduction::max_ulong_long,
    scope)[0];

  reallocate_no_check(type_in, maxNumElemsRequested, factor);
}

template<typename T>
void CollectiveDeviceBuffer<T>::reallocate_no_check(CollectiveBufferType type_in, const size_t newNumElems,
  const double factor) {

  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();
  const bool isMasterPe = globalGPUMgr->getIsMasterPe();

  if (isMasterPe) {
    if (type == CollectiveBufferType::Empty) {
      type = type_in;
    } else if (type != type_in) {
      NAMD_die("Reallocating buffer with different type");
    }

    const size_t maxNumElemsRequested = newNumElems;

    // Reallocate only when the request exceeds the current allocation, padding the
    // new size by the growth factor to amortize future reallocations
    if (maxNumElemsRequested > numElemsAlloc) {
      const size_t newNumElemsAlloc = (size_t) ((double)maxNumElemsRequested * factor);
      deallocate();
      allocate_no_check(type_in, newNumElemsAlloc);
    }
  }
}

template<typename T>
void CollectiveDeviceBuffer<T>::deallocate() {
  // numElemsAlloc is reset on every PE because different PEs can call functions on
  // objects created by other PEs. If that happened here, both PEs could try to free
  // the device memory, leading to a double free and a segfault, so only the master PE
  // releases the allocations. This is admittedly a somewhat hacky fix.
  GlobalGPUMgr* globalGPUMgr = GlobalGPUMgr::Object();

  numElemsAlloc = 0;

  if (!globalGPUMgr->getIsMasterPe()) return;

  if (buffer) {
    deallocate_device<T>(&buffer);
    buffer = nullptr;
  }
  if (d_peerBuffers) {
    deallocate_device<T*>(&d_peerBuffers);
    d_peerBuffers = nullptr;
  }
}

/*
 * Explicit instantiation
 */
template class CollectiveDeviceBuffer<char>;
template class CollectiveDeviceBuffer<uint64_t>;
template class CollectiveDeviceBuffer<int64_t>;
template class CollectiveDeviceBuffer<int>;
template class CollectiveDeviceBuffer<int4>;
template class CollectiveDeviceBuffer<float>;
template class CollectiveDeviceBuffer<float2>;
template class CollectiveDeviceBuffer<float4>;
template class CollectiveDeviceBuffer<double>;
template class CollectiveDeviceBuffer<double3>;
template class CollectiveDeviceBuffer<FullAtom>;
template class CollectiveDeviceBuffer<CudaForce>;

#endif /* NAMD_CUDA || NAMD_HIP */
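A minimal usage sketch, assuming a hypothetical caller that runs on every PE taking part in the collective (the function name and numLocalAtoms are illustrative; allocate, reallocate, and deallocate are the entry points defined above, and CollectiveBufferType::IPC and the CudaForce instantiation appear in this file):

void exampleForceBufferLifecycle(CollectiveDeviceBuffer<CudaForce>& forceBuf,
    const size_t numLocalAtoms) {
  // Collective call: every PE participates; non-master PEs effectively request zero
  // elements, and the master PEs agree on the maximum size before allocating.
  forceBuf.allocate(CollectiveBufferType::IPC, numLocalAtoms);

  // Growing later is a no-op when the buffer is already large enough; otherwise the
  // buffer is reallocated to the new maximum request padded by the growth factor.
  forceBuf.reallocate(CollectiveBufferType::IPC, 2 * numLocalAtoms, 1.5);

  // Releases device memory on the master PE and resets the size everywhere.
  forceBuf.deallocate();
}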