#ifdef NODEGROUP_FORCE_REGISTER
  isGlobalDevice = false;
#endif
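// In GPU-resident builds (NODEGROUP_FORCE_REGISTER) the global-forces device
// flag starts out cleared here; it is recomputed once devices have been
// assigned below (see isGlobalDevice further down).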
  if (CkMyPe() == 0) register_user_events();
#if defined(CUDA_VERSION)
  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
#endif
  char host[128];
  gethostname(host, 128);  host[127] = 0;
  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode, &numPesOnPhysicalNode);
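  // Sanity-check the physical-node PE list returned above: it should be
  // sorted in increasing order and should contain this PE at the position
  // reported by CmiPhysicalRank().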
  int i;
  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
    if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
      i = numPesOnPhysicalNode;
      break;
    }
    if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
  }
  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
    CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
               i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
    }
    // Fall back to treating this PE as its own physical node.
    myRankInPhysicalNode = 0;
    numPesOnPhysicalNode = 1;
    pesOnPhysicalNode = new int[1];
    pesOnPhysicalNode[0] = CkMyPe();
  } else {
    myRankInPhysicalNode = i;
  }
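  // Enumerate the CUDA devices visible to this process; everything below
  // selects among them and maps PEs onto them.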
  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }
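  // Build the candidate device list: honor an explicit +devices i,j,k,...
  // argument if one was given, otherwise auto-select every usable device.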
  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i, "%d", devices+ndevices);
      // Skip past the digits just consumed and any separators that follow.
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0)&&(CkNumNodes()==1)) )  // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }
  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }
  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    // Give each replica partition its own rotated slice of the device list.
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }
  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
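  // With +ignoresharing, PEs are ranked within the logical (SMP) node
  // rather than the physical node, for launches where each process sees
  // only its own devices (cf. the error message below).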
#ifdef NODEGROUP_FORCE_REGISTER
  const int pmePes = /* number of PEs dedicated to PME; elided in this excerpt */;
#else
  const int pmePes = -1;
#endif
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg, "Number of devices (%d) is not a multiple of number of processes (%d). "
        "Sharing devices between processes is inefficient. "
        "Specify +ignoresharing (each process uses all visible devices) if "
        "not all devices are visible to each process, otherwise "
        "adjust number of processes to evenly divide number of devices, "
        "specify subset of devices with +devices argument (e.g., +devices 0,2), "
        "or multiply list shared devices (e.g., +devices 0,1,2,0).",
        ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }
  {
    // Record, in order, each distinct device used by a PE of this node.
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }
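  // Illustration of the block mapping above (not from the original source):
  // with numPesForDevice = 6 and ndevices = 2, peDeviceRank = rank * 2 / 6
  // assigns ranks 0-2 to devices[0] and ranks 3-5 to devices[1], so
  // consecutive PEs share a device and nodedevices lists each device once.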
  // A device may be shared between processes, but never bound twice within
  // the same process.
  for ( int i=0; i<nnodedevices; ++i ) {
    for ( int j=i+1; j<nnodedevices; ++j ) {
      if ( nodedevices[i] == nodedevices[j] ) {
        char msg[1024];
        sprintf(msg, "Device %d bound twice by same process.", nodedevices[i]);
        NAMD_die(msg);
      }
    }
  }
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();
  int dev;
  if ( pmePes > 0 ) {
    // PME PEs (ranks 0 .. pmePes-1) all map to devices[0]; the remaining
    // PEs are block-distributed over devices[1 .. ndevices-1].
    int myDeviceRank;
    if (myRankForDevice < pmePes) {
      myDeviceRank = 0;
    } else {
      myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
    }
    dev = devices[myDeviceRank];
    if (myRankForDevice >= pmePes) {
      pesSharingDevice = new int[numPesForDevice];
      masterPe = -1;
      numPesSharingDevice = 0;
      for ( int i = pmePes; i < numPesForDevice; ++i ) {
        if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
          int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
          pesSharingDevice[numPesSharingDevice++] = thisPe;
          if ( masterPe < 1 ) masterPe = thisPe;
        }
      }
      for ( int j = 0; j < ndevices; ++j ) {
        if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
      }
    } else {
#ifdef NODEGROUP_FORCE_REGISTER
      pesSharingDevice = new int[pmePes];
#else
      pesSharingDevice = NULL;
#endif
      masterPe = -1;
      numPesSharingDevice = 0;
      for ( int i = 0; i < pmePes; ++i) {
        int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
        pesSharingDevice[numPesSharingDevice++] = thisPe;
        if ( masterPe < 1 ) masterPe = thisPe;
      }
    }
    if ( sharedGpu && masterPe == CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
    }
  } else if ( numPesForDevice > 1 ) {
    // No dedicated PME PEs: block-distribute all PEs over all devices.
    int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
    dev = devices[myDeviceRank];
    pesSharingDevice = new int[numPesForDevice];
    masterPe = -1;
    numPesSharingDevice = 0;
    for ( int i = 0; i < numPesForDevice; ++i ) {
      if ( i * ndevices / numPesForDevice == myDeviceRank ) {
        int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
        pesSharingDevice[numPesSharingDevice++] = thisPe;
        if ( masterPe < 1 ) masterPe = thisPe;
      }
    }
    for ( int j = 0; j < ndevices; ++j ) {
      if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
    }
    if ( sharedGpu && masterPe == CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
    }
  } else {  // in case the physical node query is wrong
    dev = devices[CkMyPe() % ndevices];
    masterPe = CkMyPe();
    pesSharingDevice = new int[1];
    pesSharingDevice[0] = CkMyPe();
    numPesSharingDevice = 1;
  }
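  // Illustration of the PME-aware mapping above (not from the original
  // source): with pmePes = 2, numPesForDevice = 8, and ndevices = 4, ranks
  // 0-1 (the PME PEs) map to devices[0], and ranks 2-7 are spread over
  // devices[1..3] by 1 + (rank-2)*3/6: ranks 2-3 -> devices[1],
  // ranks 4-5 -> devices[2], ranks 6-7 -> devices[3].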
#ifdef NODEGROUP_FORCE_REGISTER
  // Locate the PME device and this PE's device in the devices list.
  bool contained = false;
  for( int i = 0; i < ndevices; i++){
    if (!contained) {
      contained = devices[i] == pmeDevice;
      pmeDeviceIndex = (contained) ? i : -1;
    }
    if(deviceID == devices[i]) deviceIndex = i;
  }
  masterDevice = devices[0];
  isMasterDevice = deviceID == masterDevice;
  if (!contained) {
    // pmeDevice is not among the selected devices.
    pmeDeviceIndex = nnodedevices;
    isPmeDevice = isMasterDevice;
  } else {
    isPmeDevice = pmeDevice == deviceID;
  }

  // Verify that the device requested for global forces is usable.
  contained = false;
  for ( int i = 0; i < ndevices; ++i) {
    if (!contained) contained = devices[i] == globalDevice;
  }
  if (!contained) {
    NAMD_die("The selected GPU device for global forces is not in the available devices list.\n");
  }
  isGlobalDevice = globalDevice == deviceID;
#endif
  // Store this rank's device ID in the node-wide list.
  if ( CkMyRank() >= MAX_NUM_RANKS )
    NAMD_die("Maximum number of ranks (2048) per node exceeded");
  deviceIDList[CkMyRank()] = dev;
  if ( masterPe != CkMyPe() ) {
    // Non-master PEs still bind the device (needed for PME), then return.
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
               CkMyPe(), myRankInPhysicalNode, masterPe);
    cudaCheck(cudaSetDevice(dev));
    return;
  }
  // Record the master PE for this device in the node-wide list.
  if ( dev >= MAX_NUM_DEVICES )
    NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
  masterPeList[dev] = masterPe + 1;  // array is pre-initialized to zeros
  firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
  if ( dev >= deviceCount ) {
    char buf[256];
    sprintf(buf, "Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
            CkMyPe(), dev, host, deviceCount);
    NAMD_die(buf);
  }
  cudaDeviceProp deviceProp;
  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
  if ( CmiPhysicalNodeID(masterPe) < 2 )
    CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
             CkMyPe(), myRankInPhysicalNode, dev, host,
             deviceProp.name,
             (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
             deviceProp.major, deviceProp.minor,
             deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
  cudaCheck(cudaSetDevice(dev));

  // cudaSetDeviceFlags fails with cudaErrorSetOnActiveProcess if the device
  // has already been initialized; in that case the error is simply cleared.
  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
    cudaGetLastError();
  } else {
    cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
  }
  {
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");
  }
#if NODEGROUP_FORCE_REGISTER
  {
    // Re-derive the PME-device bookkeeping for the device actually bound.
    bool contained = false;
    for( int i = 0; i < ndevices; i++){
      if (!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1;
      }
      if(deviceID == devices[i]) deviceIndex = i;
    }
    if(!contained && CkMyPe() == 0)
      cudaDie("device specified for PME is not contained in +devices!");
    isPmeDevice = pmeDevice == deviceID;
    masterDevice = devices[0];
    isMasterDevice = deviceID == masterDevice;

    if (pmeDeviceIndex != 0 && pmePes != -1) {
      NAMD_die("PME device must be index 0 if pmePes is set");
    }
  }
#endif
// Referenced declarations (defined elsewhere in the source):
//   void cudaDie(const char *msg, cudaError_t err);
//   void NAMD_die(const char *err_msg);
//   int masterPeList[MAX_NUM_DEVICES];
//   int deviceIDList[MAX_NUM_RANKS];
//   static __thread cuda_args_t cuda_args;