// Fragment: runtime headers, platform guards, and the start of a device-count query that
// runs only when CkMyRank() == 0. The enclosing function header is not visible here.
2 #include <cuda_runtime.h> 6 #include <hip/hip_runtime.h> 17 #if defined(NAMD_CUDA) || defined(NAMD_HIP) 19 #define __thread __declspec(thread) 30 if (CkMyRank() == 0) {
// NOTE(review): the HIP part of this guard needs MAJOR < 4 and MINOR < 5 at the same time,
// so a version such as 3.7 does not match; confirm whether "older than 4.5" was intended.
// NOTE(review): the result of cudaGetDeviceCount is not passed through cudaCheck here,
// unlike the later device-count query in this file.
37 #if defined(NAMD_CUDA) || ((NAMD_HIP) && ((HIP_VERSION_MAJOR < 4) && (HIP_VERSION_MINOR < 5))) 39 cudaGetDeviceCount(&ndevs);
// Visit every device index reported by the count query.
40 for (
int dev=0; dev < ndevs; ++dev ) {
// Help text describing the device-list argument format.
72 "comma-delimited list of CUDA device numbers such as 0,2,1,2");
// Per-node limits (2048 ranks, 256 devices) and the reset of the global-device flag.
81 #ifdef NODEGROUP_FORCE_REGISTER 91 #define MAX_NUM_RANKS 2048 94 #define MAX_NUM_DEVICES 256 118 #ifdef NODEGROUP_FORCE_REGISTER 121 isGlobalDevice =
false;
// Only PE 0 calls register_user_events() and prints the CUDA version the binary was built with.
124 if (CkMyPe() == 0) register_user_events();
126 #if defined(CUDA_VERSION) 127 if (CkMyPe() == 0) CkPrintf(
"Info: Built with CUDA version %d\n", CUDA_VERSION);
// Record the host name: 128 is passed as the length and a terminator is forced at index 127.
131 gethostname(host, 128); host[127] = 0;
// Ask the runtime for the list of PEs that share this PE's physical node.
133 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
134 int myRankInPhysicalNode;
135 int numPesOnPhysicalNode;
136 int *pesOnPhysicalNode;
137 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
138 &pesOnPhysicalNode,&numPesOnPhysicalNode);
// Locate this PE in the list. An entry that is not greater than its predecessor sets i to
// the list length, which the check after the loop treats as a bad result.
142 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
143 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
144 i = numPesOnPhysicalNode;
147 if ( pesOnPhysicalNode[i] == CkMyPe() )
break;
// Bad result: this PE was not found, or its index disagrees with CmiPhysicalRank().
// The list is printed and replaced by a one-entry list holding only this PE.
149 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
150 CkPrintf(
"Bad result from CmiGetPesOnPhysicalNode!\n");
151 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
152 CkPrintf(
"pe %d physnode rank %d of %d is %d\n", CkMyPe(),
153 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
155 myRankInPhysicalNode = 0;
156 numPesOnPhysicalNode = 1;
// NOTE(review): no release of this fallback allocation is visible in this fragment.
157 pesOnPhysicalNode =
new int[1];
158 pesOnPhysicalNode[0] = CkMyPe();
// Rank taken from the index found by the scan above (the branch structure around this
// line is not visible here; presumably this is the non-error path).
160 myRankInPhysicalNode = i;
// Query the number of devices; a count of zero or less is fatal.
166 cudaCheck(cudaGetDeviceCount(&deviceCount));
167 if ( deviceCount <= 0 ) {
168 cudaDie(
"No CUDA devices found.");
// Cache the properties of every device, indexed by device number.
172 deviceProps =
new cudaDeviceProp[deviceCount];
173 for (
int i=0; i<deviceCount; ++i ) {
174 cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
// Explicit device list: parse every integer in the string into devices[].
// The array is sized by strlen(devicelist); each parsed integer needs at least one
// character, so the string length bounds the number of entries.
179 if ( usedevicelist ) {
180 devices =
new int[strlen(devicelist)];
// Each pass scans one integer at offset i, then skips its digits and the following
// run of non-digit characters.
182 while ( devicelist[i] ) {
183 ndevices += sscanf(devicelist+i,
"%d",devices+ndevices);
184 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
185 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
// No explicit list: examine every device and keep the usable ones.
189 CkPrintf(
"Did not find +devices i,j,k,... argument, using all\n");
191 devices =
new int[deviceCount];
192 for (
int i=0; i<deviceCount; ++i ) {
// i is always below deviceCount in this loop, so dev equals i.
193 int dev = i % deviceCount;
194 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP) 195 cudaDeviceProp deviceProp;
196 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
// Accept a device that is not in prohibited mode, has compute capability major >= 3,
// can map host memory, and has more than 2 multiprocessors (the last requirement is
// waived for the first candidate when CkNumNodes() == 1).
197 if ( deviceProp.computeMode != cudaComputeModeProhibited
198 && (deviceProp.major >= 3)
199 && deviceProp.canMapHostMemory
200 && ( (deviceProp.multiProcessorCount > 2) ||
201 ((ndevices==0)&&(CkNumNodes()==1)) )
203 devices[ndevices++] = dev;
// Exclusive-mode devices get their own branch; its body is not visible in this fragment.
205 if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
// Presumably the path used when the property-based filter above is compiled out — TODO confirm.
209 devices[ndevices++] = dev;
// Fatal error raised when device selection fails (the guarding condition is not visible here).
215 cudaDie(
"all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
// Optional per-replica subset of the device list. Requesting more devices per
// partition than are available is fatal.
218 if ( devicesperreplica > 0 ) {
219 if ( devicesperreplica > ndevices ) {
220 NAMD_die(
"More devices per partition requested than devices are available");
// Replace devices[] with devicesperreplica entries taken from the old list, starting at
// devicesperreplica * CmiMyPartition() and wrapping modulo ndevices.
222 int *olddevices = devices;
223 devices =
new int[devicesperreplica];
224 for (
int i=0; i<devicesperreplica; ++i ) {
225 int mypart = CmiMyPartition();
226 devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
// The shortened list becomes the active one and the old array is released.
228 ndevices = devicesperreplica;
229 delete [] olddevices;
// Rank and PE count used for device assignment: per-process values (CkMyRank /
// CkMyNodeSize) when ignoresharing is set, per-physical-node values otherwise.
232 int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
233 int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
237 #ifdef NODEGROUP_FORCE_REGISTER 240 const int pmePes = -1;
// Catch a device count that is not a multiple of the number of processes.
// The process count is numPesForDevice / CkMyNodeSize(). It is tested for zero first so
// the modulo can never divide by zero: numPesOnPhysicalNode is forced to 1 by the
// single-PE fallback, which makes the quotient 0 whenever CkMyNodeSize() > 1.
244 if ( ( numPesForDevice / CkMyNodeSize() ) && ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
246 sprintf(msg,
"Number of devices (%d) is not a multiple of number of processes (%d). " 247 "Sharing devices between processes is inefficient. " 248 "Specify +ignoresharing (each process uses all visible devices) if " 249 "not all devices are visible to each process, otherwise " 250 "adjust number of processes to evenly divide number of devices, " 251 "specify subset of devices with +devices argument (e.g., +devices 0,2), " 252 "or multiply list shared devices (e.g., +devices 0,1,2,0).",
253 ndevices, numPesForDevice / CkMyNodeSize() );
// Build nodedevices[]: the devices used by the PEs of this process.
259 nodedevices =
new int[ndevices];
// Walk the PEs of this node starting at its first PE.
261 int pe = CkNodeFirst(CkMyNode());
263 for (
int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
// Map the PE's rank to a device rank; append a device whenever that rank differs from
// dr (dr is presumably updated on a line not visible here — TODO confirm).
264 int rank = ignoresharing ? i : CmiPhysicalRank(pe);
265 int peDeviceRank = rank * ndevices / numPesForDevice;
266 if ( peDeviceRank != dr ) {
268 nodedevices[nnodedevices++] = devices[dr];
// Pairwise scan of nodedevices[] for a device that appears twice in the same process.
275 for (
int i=0; i<nnodedevices; ++i ) {
276 for (
int j=i+1; j<nnodedevices; ++j ) {
277 if ( nodedevices[i] == nodedevices[j] ) {
279 sprintf(msg,
"Device %d bound twice by same process.", nodedevices[i]);
// Start with this PE as both the first and the next PE sharing its device.
288 int firstPeSharingGpu = CkMyPe();
289 nextPeSharingGpu = CkMyPe();
// Device rank selection when some PEs are set aside for PME. Ranks below pmePes take one
// path (its body is not visible here); the formula below spreads the remaining ranks
// over device ranks 1 .. ndevices-1.
295 if (myRankForDevice < pmePes) {
298 myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
301 dev = devices[myDeviceRank];
// Non-PME ranks: collect every PE whose rank maps to the same device rank.
303 if (myRankForDevice >= pmePes) {
304 pesSharingDevice =
new int[numPesForDevice];
306 numPesSharingDevice = 0;
307 for (
int i = pmePes; i < numPesForDevice; ++i ) {
308 if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
309 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
310 pesSharingDevice[numPesSharingDevice++] = thisPe;
// NOTE(review): the test is "< 1", so a masterPe of 0 is also replaced by the next
// matching PE — confirm this is intended.
311 if ( masterPe < 1 ) masterPe = thisPe;
// The device counts as shared when it also appears at another index of devices[].
315 for (
int j = 0; j < ndevices; ++j ) {
316 if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
// PME ranks: the sharing list holds pmePes entries when NODEGROUP_FORCE_REGISTER is
// defined; the NULL assignment presumably belongs to the alternative preprocessor
// branch (not visible here).
319 #ifdef NODEGROUP_FORCE_REGISTER 320 pesSharingDevice =
new int[pmePes];
322 pesSharingDevice = NULL;
325 numPesSharingDevice = 0;
326 for (
int i = 0; i < pmePes; ++i) {
327 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
328 pesSharingDevice[numPesSharingDevice++] = thisPe;
329 if ( masterPe < 1 ) masterPe = thisPe;
// Only the master PE reports sharing, and only for physical node IDs 0 and 1.
333 if ( sharedGpu && masterPe == CkMyPe() ) {
334 if ( CmiPhysicalNodeID(masterPe) < 2 )
335 CkPrintf(
"Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
337 }
else if ( numPesForDevice > 1 ) {
// Several PEs per device list: ranks are spread evenly over the device ranks.
338 int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
339 dev = devices[myDeviceRank];
// Collect every PE whose rank maps to the same device rank as this PE.
342 pesSharingDevice =
new int[numPesForDevice];
344 numPesSharingDevice = 0;
345 for (
int i = 0; i < numPesForDevice; ++i ) {
346 if ( i * ndevices / numPesForDevice == myDeviceRank ) {
347 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
348 pesSharingDevice[numPesSharingDevice++] = thisPe;
// NOTE(review): "< 1" also overwrites a masterPe of 0 — confirm this is intended.
349 if ( masterPe < 1 ) masterPe = thisPe;
// The device counts as shared when it also appears at another index of devices[].
353 for (
int j = 0; j < ndevices; ++j ) {
354 if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
// Only the master PE reports sharing, and only for physical node IDs 0 and 1.
357 if ( sharedGpu && masterPe == CkMyPe() ) {
358 if ( CmiPhysicalNodeID(masterPe) < 2 )
359 CkPrintf(
"Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
// Remaining case: pick the device by PE number modulo the list length; this PE is the
// sole entry of its sharing list.
362 dev = devices[CkMyPe() % ndevices];
364 pesSharingDevice =
new int[1];
365 pesSharingDevice[0] = CkMyPe();
366 numPesSharingDevice = 1;
// Search devices[] for the PME device and for this PE's own device.
373 bool contained =
false;
375 for(
int i = 0; i < ndevices; i++){
// Record whether entry i is the PME device and, if so, its index (-1 otherwise).
// Any guard around these two statements is not visible in this fragment.
377 contained = devices[i] == pmeDevice;
378 pmeDeviceIndex = (contained) ? i : -1;
// Remember the index of this PE's device within devices[].
380 if(deviceID == devices[i]) deviceIndex = i;
// The first list entry is the master device.
383 masterDevice = devices[0];
384 isMasterDevice = deviceID == masterDevice;
// Two alternatives for the PME flag follow; the condition selecting between them is
// not visible here.
388 pmeDeviceIndex = nnodedevices;
389 isPmeDevice = isMasterDevice;
392 isPmeDevice = pmeDevice == deviceID;
// Check that the device selected for global forces appears in devices[].
399 for (
int i = 0; i < ndevices; ++i) {
401 contained = devices[i] == globalDevice;
// Fatal error for a global-forces device that is missing from the list. The message
// states that failure ("is not in"), matching the parallel PME-device check in this
// file; the guarding condition itself is not visible in this fragment.
405 NAMD_die(
"The selected GPU device for global forces is not in the available devices list.\n");
// This PE handles global forces only if its own device is the selected one.
407 isGlobalDevice = globalDevice == deviceID;
// Fatal error for exceeding the per-node rank limit; the 2048 in the message matches
// the MAX_NUM_RANKS definition (the guarding condition is not visible here).
411 NAMD_die(
"Maximum number of ranks (2048) per node exceeded");
// Non-master PEs report which PE's device they will use (physical node IDs 0 and 1 only).
414 if ( masterPe != CkMyPe() ) {
415 if ( CmiPhysicalNodeID(masterPe) < 2 )
416 CkPrintf(
"Pe %d physical rank %d will use CUDA device of pe %d\n",
417 CkMyPe(), myRankInPhysicalNode, masterPe);
// Fatal error for exceeding the per-node device limit; 256 matches MAX_NUM_DEVICES.
425 NAMD_die(
"Maximum number of CUDA devices (256) per node exceeded");
// This PE is first and next in its own sharing ring, so gpuIsMine evaluates to true
// when no statement on the lines not shown here changes firstPeSharingGpu.
430 firstPeSharingGpu = CkMyPe();
431 nextPeSharingGpu = CkMyPe();
433 gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
// A device number at or beyond the detected device count cannot be bound; build the
// error text naming the PE, device, host and device count.
435 if ( dev >= deviceCount ) {
437 sprintf(buf,
"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
438 CkMyPe(), dev, host, deviceCount);
// Report the binding for physical node IDs 0 and 1: device name, memory in MB
// (totalGlobalMem divided by 1024*1024), compute capability and PCI identifiers.
442 cudaDeviceProp deviceProp;
443 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
444 if ( CmiPhysicalNodeID(masterPe) < 2 )
445 CkPrintf(
"Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
446 CkMyPe(), myRankInPhysicalNode, dev, host,
448 (
unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
449 deviceProp.major, deviceProp.minor,
450 deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
// Request host-memory mapping on the device. cudaErrorSetOnActiveProcess is handled
// separately (its handling is not visible here); the result is also passed to cudaCheck.
458 cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
459 if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
462 cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
// Re-read the device properties and abort if the device is in prohibited mode, has
// compute capability major below 3, or cannot map host memory.
468 cudaDeviceProp deviceProp;
469 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
470 if ( deviceProp.computeMode == cudaComputeModeProhibited )
471 cudaDie(
"device in prohibited mode");
472 if ( deviceProp.major < 3 )
473 cudaDie(
"device not of compute capability 3.0 or higher");
474 if ( ! deviceProp.canMapHostMemory )
475 cudaDie(
"device cannot map host memory");
// Second PME-device check. The guard uses #ifdef, like every other
// NODEGROUP_FORCE_REGISTER test in this file, so it also compiles when the macro is
// defined without a value.
482 #ifdef NODEGROUP_FORCE_REGISTER 488 bool contained =
false;
// Search devices[] for the PME device and for this PE's own device.
490 for(
int i = 0; i < ndevices; i++){
// Record whether entry i is the PME device and, if so, its index (-1 otherwise).
// Any guard around these two statements is not visible in this fragment.
492 contained = devices[i] == pmeDevice;
493 pmeDeviceIndex = (contained) ? i : -1;
495 if(deviceID == devices[i]) deviceIndex = i;
// Only PE 0 raises the fatal error for a PME device missing from the device list.
498 if(!contained && CkMyPe() == 0)
cudaDie(
"device specified for PME is not contained in +devices!");
// PME and master flags for this PE's device; the first list entry is the master device.
501 isPmeDevice = pmeDevice == deviceID;
502 masterDevice = devices[0];
503 isMasterDevice = deviceID == masterDevice;
// When pmePes is set (not -1) the PME device has to be the first list entry.
505 if (pmeDeviceIndex != 0 && pmePes != -1) {
506 NAMD_die(
"PME device must be index 0 if pmePes is set");
// Release the arrays allocated during initialisation. delete [] on a null pointer has
// no effect, so pesSharingDevice is released without a check.
517 if (deviceProps != NULL)
delete [] deviceProps;
518 if (devices != NULL)
delete [] devices;
519 delete [] pesSharingDevice;
// Linear search of pesSharingDevice[] for pe; returns true on a match.
540 for (
int i=0; i<numPesSharingDevice; ++i ) {
541 if ( pesSharingDevice[i] == pe )
return true;
// The sharing list must have exactly as many entries as this node has PEs ...
550 if ( numPesSharingDevice != CkMyNodeSize() )
return false;
// ... and every one of those entries must belong to this node.
551 int numPesOnNodeSharingDevice = 0;
552 for (
int i=0; i<numPesSharingDevice; ++i ) {
553 if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
554 ++numPesOnNodeSharingDevice;
557 return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
// Limits read from the cached properties of device dev: maximum threads per block,
// and maximum grid size in the first dimension.
563 return deviceProps[dev].maxThreadsPerBlock;
569 return deviceProps[dev].maxGridSize[0];
// Set up peer access from this PE's device to every other device in devices[].
572 #ifdef NODEGROUP_FORCE_REGISTER 575 for(
int i = 0; i < ndevices; i++){
// Skip this PE's own device.
576 if (devices[i] == deviceID)
continue;
// Query whether deviceID can access devices[i]; the failure text below names the pair.
579 cudaCheck(cudaDeviceCanAccessPeer( &canAccessPeer, deviceID, devices[i]));
// NOTE(review): no release of this heap buffer is visible in this fragment.
581 char *err =
new char[128];
582 sprintf(err,
"Failed setting up device peer access - Devices %d and %d are not paired.\n",
583 deviceID, devices[i]);
// Enable peer access; if it was already enabled, disable it and enable it again.
587 cudaError_t error = cudaDeviceEnablePeerAccess(devices[i], 0);
588 if(error == cudaErrorPeerAccessAlreadyEnabled ) {
589 cudaCheck(cudaDeviceDisablePeerAccess(devices[i]));
590 cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0));
// Query native P2P atomic support between the two devices; a value other than 1 leads
// to the fatal error below (any further conditions are not visible here).
// NOTE(review): no check of the value assigned to error is visible in this fragment.
596 error = cudaDeviceGetP2PAttribute(&p2pAtomics, cudaDevP2PAttrNativeAtomicSupported, deviceID, devices[i]);
598 if (p2pAtomics != 1) {
603 NAMD_die(
"Device Migration is not supported on systems without P2P atomics.");
607 cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0));
// Same capability query and enable step for the PME device.
614 cudaCheck(cudaDeviceCanAccessPeer( &canAccessPeer, deviceID, pmeDevice));
616 char *err =
new char[128];
617 sprintf(err,
"Failed setting up device peer access - Devices %d and %d are not paired.\n",
618 deviceID, pmeDevice);
621 cudaCheck(cudaDeviceEnablePeerAccess(pmeDevice, 0));
// NCCL setup, compiled only when NAMD_NCCL_ALLREDUCE is defined.
// setupNcclUniqueId stores a freshly generated NCCL unique id in ncclId.
// NOTE(review): the return values of both NCCL calls below are not checked in the
// visible lines.
626 #ifdef NAMD_NCCL_ALLREDUCE 628 void DeviceCUDA::setupNcclUniqueId(){
629 ncclGetUniqueId( &ncclId);
// setupNcclComm initialises ncclComm with CkNumPes() ranks, using CkMyPe() as this rank.
632 void DeviceCUDA::setupNcclComm(){
634 ncclCommInitRank(&ncclComm, CkNumPes(), ncclId, CkMyPe());
// True only on the master PE of the master device.
643 return (CkMyPe() == masterPe) && isMasterDevice;
// register_user_events: the macro below registers two trace user events per device,
// "remote" at CUDA_EVENT_ID_BASE + 2*DEV and "local" at CUDA_EVENT_ID_BASE + 2*DEV + 1.
653 void DeviceCUDA::register_user_events() {
668 #define REGISTER_DEVICE_EVENTS(DEV) \ 669 traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \ 670 traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
#define CUDA_GBIS2_KERNEL_EVENT
#define REGISTER_DEVICE_EVENTS(DEV)
#define CUDA_BONDED_KERNEL_EVENT
SimParameters * simParameters
#define CUDA_PME_SPREADCHARGE_EVENT
void cudaDie(const char *msg, cudaError_t err)
int masterPeList[MAX_NUM_DEVICES]
#define CUDA_EVENT_ID_POLL_REMOTE
void setupDevicePeerAccess()
static __thread cuda_args_t cuda_args
int deviceIDList[MAX_NUM_RANKS]
int getMasterPeForDeviceID(int deviceID)
#define CUDA_GBIS3_KERNEL_EVENT
bool device_shared_with_pe(int pe)
void NAMD_die(const char *err_msg)
#define CUDA_NONBONDED_KERNEL_EVENT
__thread DeviceCUDA * deviceCUDA
#define CUDA_GBIS1_KERNEL_EVENT
bool one_device_per_node()
int getDeviceIDforPe(int pe)
#define CUDA_EVENT_ID_POLL_LOCAL
#define CUDA_PME_GATHERFORCE_EVENT
void cuda_getargs(char **argv)