#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// Excerpt from allocate(): reduce the requested size across all devices so that
// every device allocates the same (symmetric) number of elements.
  numElemsTemp = numElemsIn;
  // ...
  std::vector<size_t> numElemsVec = {numElemsTemp};
  const size_t maxNumElems = syncColl->allReduce<size_t>(
      numElemsVec, CkReduction::max_ulong_long, scope)[0];
  // ...
  allocate_no_check(type_in, maxNumElems);

// Excerpt from allocate_no_check(): allocate the local buffer, gather the raw device
// pointer values from every peer, and stage the peer-pointer table on the device.
  numElemsAlloc = numElemsIn;
  // ...
  allocate_device<T>(&(buffer), numElemsAlloc);
  // ...
  auto temp_peerBuffers = syncColl->allGather<unsigned long long>(
      (unsigned long long) buffer, scope);
  // ...
  h_peerBuffers.resize(numDevices);
  for (int i = 0; i < numDevices; i++) {
    h_peerBuffers[i] = (T*) temp_peerBuffers[i];
  }
  // ...
  allocate_device<T*>(&(d_peerBuffers), numDevices);
  copy_HtoD<T*>(h_peerBuffers.data(), d_peerBuffers, numDevices, nullptr);
  cudaStreamSynchronize(nullptr);
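
// The raw-pointer path above relies on the peer buffers being addressable from other
// devices, e.g. within a single process under CUDA unified addressing once peer access
// is enabled. The standalone sketch below (not NAMD code; it assumes at least two
// peer-capable GPUs) illustrates that property with plain CUDA runtime calls.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void readPeer(const int* peerData, int* out) {
  *out = *peerData;  // the load is serviced by the peer GPU over NVLink/PCIe
}

int main() {
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  int canAccess = 0;
  if (deviceCount >= 2) cudaDeviceCanAccessPeer(&canAccess, 0, 1);
  if (!canAccess) { printf("no peer access between GPU 0 and GPU 1\n"); return 0; }

  int* buf1 = nullptr;               // allocation owned by GPU 1
  cudaSetDevice(1);
  cudaMalloc(&buf1, sizeof(int));
  const int value = 42;
  cudaMemcpy(buf1, &value, sizeof(int), cudaMemcpyHostToDevice);

  cudaSetDevice(0);
  cudaDeviceEnablePeerAccess(1, 0);  // GPU 0 may now dereference GPU 1 pointers
  int* out0 = nullptr;
  cudaMalloc(&out0, sizeof(int));
  readPeer<<<1, 1>>>(buf1, out0);    // kernel on GPU 0 reads GPU 1 memory

  int result = 0;
  cudaMemcpy(&result, out0, sizeof(int), cudaMemcpyDeviceToHost);
  printf("GPU 0 read %d from GPU 1's buffer\n", result);
  return 0;
}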
#if ! defined(NAMD_HIP)
// Another branch of allocate_no_check(): share the buffer across ranks through
// CUDA IPC handles rather than raw pointer values.
  // ...
  cudaIpcMemHandle_t handle;
  // ...
  allocate_device<T>(&(buffer), numElemsAlloc);
  cudaCheck(cudaIpcGetMemHandle(&handle, (void*) buffer));
  // ...
  auto temp_peerHandles = syncColl->allGather<cudaIpcMemHandle_t>(handle, scope);
  // ...
  h_peerBuffers.resize(numDevices);
  for (int i = 0; i < numDevices; i++) {
    if (i != deviceIndex) {
      cudaCheck(cudaIpcOpenMemHandle((void**) &(h_peerBuffers[i]),
          temp_peerHandles[i], cudaIpcMemLazyEnablePeerAccess));
    } else {
      h_peerBuffers[i] = buffer;
    }
  }
  // ...
  allocate_device<T*>(&(d_peerBuffers), numDevices);
  copy_HtoD<T*>(h_peerBuffers.data(), d_peerBuffers, numDevices, nullptr);
  cudaCheck(cudaStreamSynchronize(nullptr));
  // ...
  NAMD_die("CollectiveBufferType not currently implemented");
// Excerpt from reallocate(): take the maximum requested size over all devices, then
// let reallocate_no_check() decide whether the buffers actually need to grow.
  size_t numElemRequested;
  // ...
    numElemRequested = newNumElems;
  // ...
    numElemRequested = 0;
  // ...
  std::vector<size_t> numElemsVec = {numElemRequested};
  const size_t maxNumElemsRequested = syncColl->allReduce<size_t>(
      numElemsVec, CkReduction::max_ulong_long, scope)[0];
  // ...
  reallocate_no_check(type_in, maxNumElemsRequested, factor);
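
// Host-only sketch (not NAMD/Charm++ code) of what the max-reduction above achieves:
// devices may request different element counts (a device that needs nothing requests 0),
// but every device must allocate the same size for the buffers to stay symmetric, so
// the largest request wins everywhere.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // hypothetical per-device requests gathered from four GPUs
  const std::vector<size_t> requested = {1024, 0, 4096, 2048};
  const size_t maxNumElems = *std::max_element(requested.begin(), requested.end());
  for (size_t i = 0; i < requested.size(); i++)
    printf("device %zu allocates %zu elements\n", i, maxNumElems);
  return 0;
}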
// Excerpt from reallocate_no_check(): the buffer type may not change, and the buffers
// only grow when the request exceeds the current allocation, over-allocated by `factor`.
void reallocate_no_check(CollectiveBufferType type_in, const size_t newNumElems,
    const double factor) {
  // ...
  } else if (type != type_in) {
    NAMD_die("Reallocating buffer with different type");
  }
  // ...
  const size_t maxNumElemsRequested = newNumElems;
  // ...
  if (maxNumElemsRequested > numElemsAlloc) {
    const size_t newNumElemsAlloc = (size_t) ((double) maxNumElemsRequested * factor);
    // ...
    allocate_no_check(type_in, newNumElemsAlloc);
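
// Minimal host-side sketch (an assumption, not the NAMD implementation) of the growth
// policy above: grow only when the request exceeds the current capacity, and then
// over-allocate by `factor` so that a slowly growing request does not force a
// reallocation (and a fresh pointer/handle exchange) on every call.
#include <cuda_runtime.h>
#include <cstddef>

struct GrowableDeviceBuffer {          // hypothetical name, single device only
  float* buffer = nullptr;
  size_t numElemsAlloc = 0;

  void reallocate(const size_t newNumElems, const double factor = 1.5) {
    if (newNumElems <= numElemsAlloc) return;          // still fits: keep the buffer
    const size_t newNumElemsAlloc = (size_t) ((double) newNumElems * factor);
    cudaFree(buffer);                                  // this sketch discards old contents
    cudaMalloc(&buffer, newNumElemsAlloc * sizeof(float));
    numElemsAlloc = newNumElemsAlloc;
  }
};

int main() {
  GrowableDeviceBuffer v;
  v.reallocate(1000);   // grows to 1500 elements
  v.reallocate(1200);   // still fits, no reallocation
  v.reallocate(2000);   // grows again, to 3000 elements
  cudaFree(v.buffer);
  return 0;
}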
// Cleanup: release the local buffer and the device-side table of peer pointers.
  deallocate_device<T>(&buffer);
  // ...
  deallocate_device<T*>(&d_peerBuffers);
  d_peerBuffers = nullptr;
Referenced declarations:
  std::vector<T> allGather(const T &data, const SynchronousCollectiveScope scope)
  std::vector<T> allReduce(std::vector<T> &data, CkReduction::reducerType type, const SynchronousCollectiveScope scope)
  void reallocate(CollectiveBufferType type_in, const size_t newNumElems, const double factor, SynchronousCollectiveScope scope = SynchronousCollectiveScope::all)
      Reallocates a symmetric device buffer on all devices if needed.
  void allocate(CollectiveBufferType type_in, const size_t numElemsIn, SynchronousCollectiveScope scope = SynchronousCollectiveScope::all)
      Allocates a symmetric buffer on all devices.
  void reallocate_no_check(CollectiveBufferType type_in, const size_t newNumElems, const double factor)
  void NAMD_die(const char *err_msg)
  SynchronousCollectiveScope
  void allocate_no_check(CollectiveBufferType type_in, const size_t numElemsIn)
      Allocates a symmetric buffer on all devices without reducing the buffer sizes.
  static GlobalGPUMgr *Object()
  static SynchronousCollectives *Object()