DeviceCUDA Class Reference

#include <DeviceCUDA.h>

List of all members.

Public Member Functions

 DeviceCUDA ()
 ~DeviceCUDA ()
void initialize ()
int getDeviceCount ()
int getNumDevice ()
bool device_shared_with_pe (int pe)
bool one_device_per_node ()
int getNoStreaming ()
int getNoMergeGrids ()
int getMergeGrids ()
void setMergeGrids (const int val)
bool getSharedGpu ()
int getNextPeSharingGpu ()
int getMasterPe ()
int getNumPesSharingDevice ()
int getPesSharingDevice (const int i)
int getGpuIsMine ()
void setGpuIsMine (const int val)
int getDeviceID ()
int getDeviceIDbyRank (int rank)
int getDeviceIDforPe (int pe)
int getMasterPeForDeviceID (int deviceID)
int getMaxNumThreads ()
int getMaxNumBlocks ()


Detailed Description

Definition at line 33 of file DeviceCUDA.h.


Constructor & Destructor Documentation

DeviceCUDA::DeviceCUDA (  ) 

Definition at line 80 of file DeviceCUDA.C.

00080 : deviceProps(NULL), devices(NULL) {}

DeviceCUDA::~DeviceCUDA (  ) 

Definition at line 363 of file DeviceCUDA.C.

00363                         {
00364   if (deviceProps != NULL) delete [] deviceProps;
00365   if (devices != NULL) delete [] devices;
00366         delete [] pesSharingDevice;
00367 }


Member Function Documentation

bool DeviceCUDA::device_shared_with_pe ( int  pe  ) 

Definition at line 386 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

00386                                              {
00387   for ( int i=0; i<numPesSharingDevice; ++i ) {
00388     if ( pesSharingDevice[i] == pe ) return true;
00389   }
00390   return false;
00391 }

int DeviceCUDA::getDeviceCount (  )  [inline]

Definition at line 87 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

00087 {return deviceCount;}

int DeviceCUDA::getDeviceID (  )  [inline]

Definition at line 107 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeCUDAMgr::createCudaComputeNonbonded(), ComputePme::doWork(), ComputeCUDAMgr::getCudaComputeNonbonded(), ComputePmeMgr::initialize(), ComputePmeMgr::initialize_computes(), and ComputePmeMgr::ungridCalc().

00107 {return deviceID;}

int DeviceCUDA::getDeviceIDbyRank ( int  rank  )  [inline]

Definition at line 108 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize(), ComputePmeCUDAMgr::initialize_pencils(), and ComputePmeCUDAMgr::isPmeDevice().

00108 {return nodedevices[rank];}

int DeviceCUDA::getDeviceIDforPe ( int  pe  ) 

Definition at line 372 of file DeviceCUDA.C.

References deviceIDList.

00372                                        {
00373   return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
00374 }

int DeviceCUDA::getGpuIsMine (  )  [inline]

Definition at line 104 of file DeviceCUDA.h.

00104 {return gpuIsMine;}

int DeviceCUDA::getMasterPe (  )  [inline]

Definition at line 100 of file DeviceCUDA.h.

Referenced by build_cuda_exclusions(), build_cuda_force_table(), ComputePmeMgr::chargeGridSubmitted(), ComputeMgr::createComputes(), ComputePmeMgr::initialize_computes(), and ComputePmeMgr::initialize_pencils().

00100 {return masterPe;}

int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID  ) 

Definition at line 379 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

00379                                                    {
00380   return masterPeList[deviceID % deviceCount] - 1;
00381 }

int DeviceCUDA::getMaxNumBlocks (  ) 

Definition at line 413 of file DeviceCUDA.C.

References cudaCheck.

Referenced by ComputeBondedCUDAKernel::bondedForce(), CudaTileListKernel::buildTileLists(), CudaComputeNonbondedKernel::nonbondedForce(), and CudaComputeNonbondedKernel::reduceVirialEnergy().

00413                                 {
00414   int dev;
00415   cudaCheck(cudaGetDevice(&dev));
00416   return deviceProps[dev].maxGridSize[0];
00417 }

int DeviceCUDA::getMaxNumThreads (  ) 

Definition at line 407 of file DeviceCUDA.C.

References cudaCheck.

00407                                  {
00408   int dev;
00409   cudaCheck(cudaGetDevice(&dev));
00410   return deviceProps[dev].maxThreadsPerBlock;
00411 }

int DeviceCUDA::getMergeGrids (  )  [inline]

Definition at line 95 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_remote_progress(), ComputeNonbondedCUDA::recvYieldDevice(), and ComputeNonbondedCUDA::requirePatch().

00095 {return mergegrids;}

int DeviceCUDA::getNextPeSharingGpu (  )  [inline]

Definition at line 99 of file DeviceCUDA.h.

Referenced by cuda_check_local_calc(), and cuda_check_remote_calc().

00099 {return nextPeSharingGpu;}

int DeviceCUDA::getNoMergeGrids (  )  [inline]

Definition at line 94 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00094 {return nomergegrids;}

int DeviceCUDA::getNoStreaming (  )  [inline]

Definition at line 93 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeCUDAMgr::createCudaComputeNonbonded(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

00093 {return nostreaming;}

int DeviceCUDA::getNumDevice (  )  [inline]

Definition at line 88 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize(), ComputePmeCUDAMgr::initialize_pencils(), ComputePmeCUDAMgr::isPmeDevice(), and ComputePmeCUDAMgr::setupPencils().

00088 {return nnodedevices;}

int DeviceCUDA::getNumPesSharingDevice (  )  [inline]

Definition at line 101 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeNonbondedCUDA::assignPatches().

00101 {return numPesSharingDevice;}

int DeviceCUDA::getPesSharingDevice ( const int  i  )  [inline]

Definition at line 102 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeNonbondedCUDA::assignPatches().

00102 {return pesSharingDevice[i];}

bool DeviceCUDA::getSharedGpu (  )  [inline]

Definition at line 98 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

00098 {return sharedGpu;}

void DeviceCUDA::initialize (  ) 

Definition at line 85 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, cuda_args_t::ignoresharing, j, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

00085                             {
00086         // Copy command-line arguments into class
00087         this->devicelist = cuda_args.devicelist;
00088         this->usedevicelist = cuda_args.usedevicelist;
00089   this->devicesperreplica = cuda_args.devicesperreplica;
00090         this->ignoresharing = cuda_args.ignoresharing;
00091         this->mergegrids = cuda_args.mergegrids;
00092         this->nomergegrids = cuda_args.nomergegrids;
00093         this->nostreaming = cuda_args.nostreaming;
00094 
00095   if (CkMyPe() == 0) register_user_events();
00096 
00097   if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
00098 
00099   char host[128];
00100 #ifdef NOHOSTNAME
00101   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00102 #else
00103   gethostname(host, 128);  host[127] = 0;
00104 #endif
00105 
00106   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00107   int myRankInPhysicalNode;
00108   int numPesOnPhysicalNode;
00109   int *pesOnPhysicalNode;
00110   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00111                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00112 
00113   {
00114     int i;
00115     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00116       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00117         i = numPesOnPhysicalNode;
00118         break;
00119       }
00120       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00121     }
00122     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00123       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00124       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00125         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00126           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00127       }
00128       myRankInPhysicalNode = 0;
00129       numPesOnPhysicalNode = 1;
00130       pesOnPhysicalNode = new int[1];
00131       pesOnPhysicalNode[0] = CkMyPe();
00132     } else {
00133       myRankInPhysicalNode = i;
00134     }
00135   }
00136   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00137 
00138   deviceCount = 0;
00139   cudaCheck(cudaGetDeviceCount(&deviceCount));
00140   if ( deviceCount <= 0 ) {
00141     cudaDie("No CUDA devices found.");
00142   }
00143 
00144   // Store all device props
00145   deviceProps = new cudaDeviceProp[deviceCount];
00146   for ( int i=0; i<deviceCount; ++i ) {
00147     cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
00148   }
00149 
00150   ndevices = 0;
00151   int nexclusive = 0;
00152   if ( usedevicelist ) {
00153     devices = new int[strlen(devicelist)];
00154     int i = 0;
00155     while ( devicelist[i] ) {
00156       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00157       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00158       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00159     }
00160   } else {
00161     if ( ! CkMyPe() ) {
00162       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00163     }
00164     devices = new int[deviceCount];
00165     for ( int i=0; i<deviceCount; ++i ) {
00166       int dev = i % deviceCount;
00167 #if CUDA_VERSION >= 2020
00168       cudaDeviceProp deviceProp;
00169       cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00170       if ( deviceProp.computeMode != cudaComputeModeProhibited
00171            && (deviceProp.major >= 3)
00172            && deviceProp.canMapHostMemory
00173            && ( (deviceProp.multiProcessorCount > 2) ||
00174                 ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
00175          ) {
00176         devices[ndevices++] = dev;
00177       }
00178       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00179         ++nexclusive;
00180       }
00181 #else
00182       devices[ndevices++] = dev;
00183 #endif
00184     }
00185   }
00186 
00187   if ( ! ndevices ) {
00188     cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
00189   }
00190 
00191   if ( devicesperreplica > 0 ) {
00192     if ( devicesperreplica > ndevices ) {
00193       NAMD_die("More devices per partition requested than devices are available");
00194     }
00195     int *olddevices = devices;
00196     devices = new int[devicesperreplica];
00197     for ( int i=0; i<devicesperreplica; ++i ) {
00198       int mypart = CmiMyPartition();
00199       devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
00200     }
00201     ndevices = devicesperreplica;
00202     delete [] olddevices;
00203   }
00204 
00205   int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
00206   int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
00207 
00208   // catch multiple processes per device
00209   if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
00210     char msg[1024];
00211     sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d).  "
00212             "Sharing devices between processes is inefficient.  "
00213             "Specify +ignoresharing (each process uses all visible devices) if "
00214             "not all devices are visible to each process, otherwise "
00215             "adjust number of processes to evenly divide number of devices, "
00216             "specify subset of devices with +devices argument (e.g., +devices 0,2), "
00217             "or multiply list shared devices (e.g., +devices 0,1,2,0).",
00218             ndevices, numPesForDevice / CkMyNodeSize() );
00219     NAMD_die(msg);
00220   }
00221 
00222   {
00223     // build list of devices actually used by this node
00224     nodedevices = new int[ndevices];
00225     nnodedevices = 0;
00226     int pe = CkNodeFirst(CkMyNode());
00227     int dr = -1;
00228     for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
00229       int rank = ignoresharing ? i : CmiPhysicalRank(pe);
00230       int peDeviceRank = rank * ndevices / numPesForDevice;
00231       if ( peDeviceRank != dr ) {
00232         dr = peDeviceRank;
00233         nodedevices[nnodedevices++] = devices[dr];
00234       }
00235     }
00236   }
00237 
00238   {
00239     // check for devices used twice by this node
00240     for ( int i=0; i<nnodedevices; ++i ) {
00241       for ( int j=i+1; j<nnodedevices; ++j ) {
00242         if ( nodedevices[i] == nodedevices[j] ) { 
00243           char msg[1024];
00244           sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
00245           NAMD_die(msg);
00246         }
00247       }
00248     }
00249   }
00250 
00251   sharedGpu = 0;
00252   gpuIsMine = 1;
00253   int firstPeSharingGpu = CkMyPe();
00254   nextPeSharingGpu = CkMyPe();
00255 
00256  {
00257     int dev;
00258     if ( numPesForDevice > 1 ) {
00259       int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
00260       dev = devices[myDeviceRank];
00261       masterPe = CkMyPe();
00262       {
00263         pesSharingDevice = new int[numPesForDevice];
00264         masterPe = -1;
00265         numPesSharingDevice = 0;
00266         for ( int i = 0; i < numPesForDevice; ++i ) {
00267           if ( i * ndevices / numPesForDevice == myDeviceRank ) {
00268             int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
00269             pesSharingDevice[numPesSharingDevice++] = thisPe;
00270             if ( masterPe < 1 ) masterPe = thisPe;
00271             if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
00272           }
00273         }
00274         for ( int j = 0; j < ndevices; ++j ) {
00275           if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
00276         }
00277       }
00278       if ( sharedGpu && masterPe == CkMyPe() ) {
00279         if ( CmiPhysicalNodeID(masterPe) < 2 )
00280         CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00281       }
00282     } else {  // in case phys node code is lying
00283       dev = devices[CkMyPe() % ndevices];
00284       masterPe = CkMyPe();
00285       pesSharingDevice = new int[1];
00286       pesSharingDevice[0] = CkMyPe();
00287       numPesSharingDevice = 1;
00288     }
00289 
00290     deviceID = dev;
00291 
00292     // Store device IDs to node-wide list
00293     if (CkMyRank() >= MAX_NUM_RANKS)
00294       NAMD_die("Maximum number of ranks (2048) per node exceeded");
00295     deviceIDList[CkMyRank()] = deviceID;
00296 
00297     if ( masterPe != CkMyPe() ) {
00298       if ( CmiPhysicalNodeID(masterPe) < 2 )
00299       CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00300                CkMyPe(), myRankInPhysicalNode, masterPe);
00301       // for PME only
00302       cudaCheck(cudaSetDevice(dev));
00303       return;
00304     }
00305 
00306     // Store master PEs for every device ID to node-wide list
00307     if (deviceID >= MAX_NUM_DEVICES)
00308       NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
00309     masterPeList[deviceID] = masterPe + 1;  // array is pre-initialized to zeros
00310 
00311     // disable token-passing but don't submit local until remote finished
00312     // if shared_gpu is true, otherwise submit all work immediately
00313     firstPeSharingGpu = CkMyPe();
00314     nextPeSharingGpu = CkMyPe();
00315 
00316     gpuIsMine = ( firstPeSharingGpu == CkMyPe() ); 
00317 
00318     if ( dev >= deviceCount ) {
00319       char buf[256];
00320       sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00321                 CkMyPe(), dev, host, deviceCount);
00322       NAMD_die(buf);
00323     }
00324 
00325     cudaDeviceProp deviceProp;
00326     cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00327     if ( CmiPhysicalNodeID(masterPe) < 2 )
00328         CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d  PCI: %x:%x:%x\n",
00329                CkMyPe(), myRankInPhysicalNode, dev, host,
00330                deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00331                deviceProp.major, deviceProp.minor,
00332                deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00333 
00334     cudaCheck(cudaSetDevice(dev));
00335 
00336   }  // just let CUDA pick a device for us
00337 
00338   {
00339     // if only one device then already initialized in cuda_affinity_initialize()
00340     if ( deviceCount > 1 ) cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost));
00341 
00342     int dev;
00343     cudaCheck(cudaGetDevice(&dev));
00344     deviceID = dev;
00345     cudaDeviceProp deviceProp;
00346     cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00347     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00348       cudaDie("device in prohibited mode");
00349     if ( deviceProp.major < 3 )
00350       cudaDie("device not of compute capability 3.0 or higher");
00351     if ( ! deviceProp.canMapHostMemory )
00352       cudaDie("device cannot map host memory");
00353 
00354     // initialize the device on this thread
00355     int *dummy;
00356     cudaCheck(cudaMalloc(&dummy, 4));
00357   }
00358 }

bool DeviceCUDA::one_device_per_node (  ) 

Definition at line 396 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

00396                                      {
00397   if ( numPesSharingDevice != CkMyNodeSize() ) return false;
00398   int numPesOnNodeSharingDevice = 0;
00399   for ( int i=0; i<numPesSharingDevice; ++i ) {
00400     if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
00401       ++numPesOnNodeSharingDevice;
00402     }
00403   }
00404   return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
00405 }

void DeviceCUDA::setGpuIsMine ( const int  val  )  [inline]

Definition at line 105 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

00105 {gpuIsMine = val;}

void DeviceCUDA::setMergeGrids ( const int  val  )  [inline]

Definition at line 96 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00096 {mergegrids = val;}


The documentation for this class was generated from the following files:
Generated on Sun Apr 22 01:17:20 2018 for NAMD by  doxygen 1.4.7