8 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
11 #include <cuda_runtime.h>
15 #include <hip/hip_runtime.h>
18 #define __thread __declspec(thread)
// NOTE(review): non-contiguous extract -- the leading integers on each line
// are original-source line numbers, and lines between them are missing.
// This piece queries the device count and opens a loop over all devices
// (body not visible); the trailing string appears to belong to a usage/help
// message for the +devices argument elsewhere in the original.
32 cudaGetDeviceCount(&ndevs);
33 for (
int dev=0; dev < ndevs; ++dev ) {
58 "comma-delimited list of CUDA device numbers such as 0,2,1,2");
// Hard per-node table limits; exceeding either is fatal (see the
// "Maximum number of ranks (2048) per node exceeded" and
// "Maximum number of CUDA devices (256) per node exceeded" NAMD_die
// messages later in this file).
71 #define MAX_NUM_RANKS 2048
74 #define MAX_NUM_DEVICES 256
// NOTE(review): fragment of device-initialization code (appears to be
// DeviceCUDA::initialize -- confirm against full source). Lines were
// extracted non-contiguously; leading integers are original line numbers,
// and some closing braces fall in the gaps.
// Register Projections trace events exactly once, from PE 0.
99 if (CkMyPe() == 0) register_user_events();
100 #if defined(CUDA_VERSION)
// Report the CUDA toolkit version the binary was compiled against.
101 if (CkMyPe() == 0) CkPrintf(
"Info: Built with CUDA version %d\n", CUDA_VERSION);
// Identify this host and ask Charm++ which PEs share the physical node.
104 gethostname(host, 128); host[127] = 0;
106 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
107 int myRankInPhysicalNode;
108 int numPesOnPhysicalNode;
109 int *pesOnPhysicalNode;
110 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
111 &pesOnPhysicalNode,&numPesOnPhysicalNode);
// Scan for this PE's position in the physical-node PE list. The list must
// be strictly increasing: a non-increasing entry forces i to
// numPesOnPhysicalNode, which marks the scan as failed.
115 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
116 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
117 i = numPesOnPhysicalNode;
120 if ( pesOnPhysicalNode[i] == CkMyPe() )
break;
// Cross-check against CmiPhysicalRank(); on any mismatch, dump the bad
// list and fall back to treating this PE as alone on its physical node.
122 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
123 CkPrintf(
"Bad result from CmiGetPesOnPhysicalNode!\n");
124 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
125 CkPrintf(
"pe %d physnode rank %d of %d is %d\n", CkMyPe(),
126 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
128 myRankInPhysicalNode = 0;
129 numPesOnPhysicalNode = 1;
130 pesOnPhysicalNode =
new int[1];
131 pesOnPhysicalNode[0] = CkMyPe();
// Else-branch (enclosing braces not visible here): the scan index i is
// this PE's rank within the physical node.
133 myRankInPhysicalNode = i;
// NOTE(review): fragment (non-contiguous extract; leading integers are
// original line numbers). Enumerates CUDA devices and builds the `devices`
// list either from a user-supplied +devices list or from all usable devices.
139 cudaCheck(cudaGetDeviceCount(&deviceCount));
140 if ( deviceCount <= 0 ) {
141 cudaDie(
"No CUDA devices found.");
// Cache every visible device's properties for later accessor queries.
145 deviceProps =
new cudaDeviceProp[deviceCount];
146 for (
int i=0; i<deviceCount; ++i ) {
147 cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
// Parse the comma-delimited +devices list. strlen(devicelist) ints is a
// safe (over-)allocation: each parsed number consumes >= 1 character.
152 if ( usedevicelist ) {
153 devices =
new int[strlen(devicelist)];
155 while ( devicelist[i] ) {
156 ndevices += sscanf(devicelist+i,
"%d",devices+ndevices);
157 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
158 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
// No +devices argument: accept every device that is not prohibited, has
// compute capability >= 3.0, can map host memory, and is not tiny
// (> 2 multiprocessors), with a single-node fallback when the list is
// still empty.
162 CkPrintf(
"Did not find +devices i,j,k,... argument, using all\n");
164 devices =
new int[deviceCount];
165 for (
int i=0; i<deviceCount; ++i ) {
166 int dev = i % deviceCount;
167 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
168 cudaDeviceProp deviceProp;
169 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
170 if ( deviceProp.computeMode != cudaComputeModeProhibited
171 && (deviceProp.major >= 3)
172 && deviceProp.canMapHostMemory
173 && ( (deviceProp.multiProcessorCount > 2) ||
174 ((ndevices==0)&&(CkNumNodes()==1)) )
176 devices[ndevices++] = dev;
// NOTE(review): exclusive-compute-mode handling -- the lines between the
// check and the append are not visible in this extract.
178 if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
182 devices[ndevices++] = dev;
// Reached when the filter above accepted no device at all.
188 cudaDie(
"all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
// NOTE(review): fragment. When a per-replica device count was requested
// (devicesperreplica > 0), carve a partition-specific slice out of the
// device list so each Charm++ partition rotates over the devices.
191 if ( devicesperreplica > 0 ) {
192 if ( devicesperreplica > ndevices ) {
193 NAMD_die(
"More devices per partition requested than devices are available");
195 int *olddevices = devices;
196 devices =
new int[devicesperreplica];
197 for (
int i=0; i<devicesperreplica; ++i ) {
198 int mypart = CmiMyPartition();
// Offset by partition index, wrapping modulo the full device count.
199 devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
201 ndevices = devicesperreplica;
202 delete [] olddevices;
// Device-to-PE mapping spans the physical node unless +ignoresharing
// restricts it to this process (logical node).
205 int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
206 int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
// Format a diagnostic when devices do not divide evenly among the
// processes sharing the node (numPesForDevice / CkMyNodeSize() processes);
// the disposition of msg is not visible in this extract.
209 if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
211 sprintf(msg,
"Number of devices (%d) is not a multiple of number of processes (%d). "
212 "Sharing devices between processes is inefficient. "
213 "Specify +ignoresharing (each process uses all visible devices) if "
214 "not all devices are visible to each process, otherwise "
215 "adjust number of processes to evenly divide number of devices, "
216 "specify subset of devices with +devices argument (e.g., +devices 0,2), "
217 "or multiply list shared devices (e.g., +devices 0,1,2,0).",
218 ndevices, numPesForDevice / CkMyNodeSize() );
// NOTE(review): fragment. Build nodedevices -- the devices bound by the
// PEs of this process, one entry per distinct device rank -- then reject
// any device bound twice by the same process.
224 nodedevices =
new int[ndevices];
226 int pe = CkNodeFirst(CkMyNode());
228 for (
int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
// Map each of this node's PEs to a device rank; record the device when a
// new device rank is encountered (the update of dr is not visible here).
229 int rank = ignoresharing ? i : CmiPhysicalRank(pe);
230 int peDeviceRank = rank * ndevices / numPesForDevice;
231 if ( peDeviceRank != dr ) {
233 nodedevices[nnodedevices++] = devices[dr];
// Pairwise duplicate scan; a repeat is a configuration error (message
// formatted below; how msg is reported is not visible in this extract).
240 for (
int i=0; i<nnodedevices; ++i ) {
241 for (
int j=i+1; j<nnodedevices; ++j ) {
242 if ( nodedevices[i] == nodedevices[j] ) {
244 sprintf(msg,
"Device %d bound twice by same process.", nodedevices[i]);
// NOTE(review): fragment. Choose this PE's device, detect GPU sharing,
// and elect a master PE per device.
253 int firstPeSharingGpu = CkMyPe();
254 nextPeSharingGpu = CkMyPe();
// Multiple PEs per device domain: block-assign devices by rank.
258 if ( numPesForDevice > 1 ) {
259 int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
260 dev = devices[myDeviceRank];
263 pesSharingDevice =
new int[numPesForDevice];
265 numPesSharingDevice = 0;
// Collect every PE whose rank maps to the same device rank; the first PE
// encountered (while masterPe < 1) becomes masterPe.
266 for (
int i = 0; i < numPesForDevice; ++i ) {
267 if ( i * ndevices / numPesForDevice == myDeviceRank ) {
268 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
269 pesSharingDevice[numPesSharingDevice++] = thisPe;
270 if ( masterPe < 1 ) masterPe = thisPe;
// The same physical device may appear under several device ranks (e.g.
// +devices 0,1,2,0); flag that case as sharedGpu.
274 for (
int j = 0; j < ndevices; ++j ) {
275 if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
// Log only from the first two physical nodes (presumably to cap log
// volume on large runs -- pattern repeats below).
278 if ( sharedGpu && masterPe == CkMyPe() ) {
279 if ( CmiPhysicalNodeID(masterPe) < 2 )
280 CkPrintf(
"Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
// Single PE per device domain: simple round-robin by PE number.
283 dev = devices[CkMyPe() % ndevices];
285 pesSharingDevice =
new int[1];
286 pesSharingDevice[0] = CkMyPe();
287 numPesSharingDevice = 1;
// Guards for the fixed-size per-node tables (MAX_NUM_RANKS /
// MAX_NUM_DEVICES); the guarding conditions are not visible here.
294 NAMD_die(
"Maximum number of ranks (2048) per node exceeded");
297 if ( masterPe != CkMyPe() ) {
298 if ( CmiPhysicalNodeID(masterPe) < 2 )
299 CkPrintf(
"Pe %d physical rank %d will use CUDA device of pe %d\n",
300 CkMyPe(), myRankInPhysicalNode, masterPe);
308 NAMD_die(
"Maximum number of CUDA devices (256) per node exceeded");
313 firstPeSharingGpu = CkMyPe();
314 nextPeSharingGpu = CkMyPe();
// This PE owns its GPU only if it heads the sharing chain.
316 gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
// Defensive: a +devices entry may name a device index that doesn't exist.
318 if ( dev >= deviceCount ) {
320 sprintf(buf,
"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
321 CkMyPe(), dev, host, deviceCount);
// NOTE(review): fragment. Bind to the chosen device, announce the binding,
// enable host-memory mapping, and verify the device is actually usable.
325 cudaDeviceProp deviceProp;
326 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
// Announce the binding only from the first two physical nodes.
327 if ( CmiPhysicalNodeID(masterPe) < 2 )
328 CkPrintf(
"Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
329 CkMyPe(), myRankInPhysicalNode, dev, host,
331 (
unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
332 deviceProp.major, deviceProp.minor,
333 deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
// cudaDeviceMapHost enables zero-copy host mappings. A return of
// cudaErrorSetOnActiveProcess (context already active) is tolerated --
// the handling inside that branch is not visible here -- while any other
// error is fatal via cudaCheck.
341 cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
342 if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
345 cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
// Re-query properties and fail fast on devices NAMD cannot use.
351 cudaDeviceProp deviceProp;
352 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
353 if ( deviceProp.computeMode == cudaComputeModeProhibited )
354 cudaDie(
"device in prohibited mode");
355 if ( deviceProp.major < 3 )
356 cudaDie(
"device not of compute capability 3.0 or higher");
357 if ( ! deviceProp.canMapHostMemory )
358 cudaDie(
"device cannot map host memory");
// NOTE(review): fragment of cleanup code (presumably the DeviceCUDA
// destructor -- confirm against full source). Guarded deletes of the
// tables allocated during initialization; pesSharingDevice is deleted
// unconditionally here.
370 if (deviceProps != NULL)
delete [] deviceProps;
371 if (devices != NULL)
delete [] devices;
372 delete [] pesSharingDevice;
// NOTE(review): fragment -- the membership-test loop of a predicate over
// pesSharingDevice (matches the `bool device_shared_with_pe(int pe)`
// declaration listed at the end of this extract): returns true as soon as
// pe is found among the PEs sharing this device.
393 for (
int i=0; i<numPesSharingDevice; ++i ) {
394 if ( pesSharingDevice[i] == pe )
return true;
// NOTE(review): fragment -- body of a predicate (matches the
// `bool one_device_per_node()` declaration at the end of this extract):
// true only when every PE of this logical node shares this one device.
403 if ( numPesSharingDevice != CkMyNodeSize() )
return false;
404 int numPesOnNodeSharingDevice = 0;
// Count only sharers belonging to this logical node; PEs from other
// processes on the same physical host do not count.
405 for (
int i=0; i<numPesSharingDevice; ++i ) {
406 if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
407 ++numPesOnNodeSharingDevice;
410 return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
// NOTE(review): two one-line accessor bodies (their signatures are not
// visible in this extract); both read the deviceProps table cached during
// initialization. First: per-block thread limit for device `dev`.
416 return deviceProps[dev].maxThreadsPerBlock;
// Second: maximum grid dimension in x for device `dev`.
422 return deviceProps[dev].maxGridSize[0];
// Registers Projections trace user events for CUDA activity. The macro
// below creates a paired "remote"/"local" event for device DEV at IDs
// CUDA_EVENT_ID_BASE + 2*DEV and CUDA_EVENT_ID_BASE + 2*DEV + 1.
// NOTE(review): the function body between the header (orig. line 432) and
// the macro (orig. line 447) is not visible in this extract.
432 void DeviceCUDA::register_user_events() {
447 #define REGISTER_DEVICE_EVENTS(DEV) \
448 traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
449 traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
// NOTE(review): the remainder of this extract is a scrambled index of
// symbols from elsewhere in the file (event macros, globals, and
// prototypes, mostly without terminating semicolons). It is extraction
// residue, not compilable in this order; kept verbatim below -- restore
// from the full original source rather than editing here.
#define CUDA_GBIS2_KERNEL_EVENT
#define REGISTER_DEVICE_EVENTS(DEV)
#define CUDA_BONDED_KERNEL_EVENT
void cuda_getargs(char **)
#define CUDA_PME_SPREADCHARGE_EVENT
int masterPeList[MAX_NUM_DEVICES]
#define CUDA_EVENT_ID_POLL_REMOTE
static __thread cuda_args_t cuda_args
int deviceIDList[MAX_NUM_RANKS]
int getMasterPeForDeviceID(int deviceID)
#define CUDA_GBIS3_KERNEL_EVENT
bool device_shared_with_pe(int pe)
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
void NAMD_die(const char *err_msg)
#define CUDA_NONBONDED_KERNEL_EVENT
__thread DeviceCUDA * deviceCUDA
#define CUDA_GBIS1_KERNEL_EVENT
bool one_device_per_node()
int getDeviceIDforPe(int pe)
#define CUDA_EVENT_ID_POLL_LOCAL
#define CUDA_PME_GATHERFORCE_EVENT