DeviceCUDA Class Reference

#include <DeviceCUDA.h>

List of all members.

Public Member Functions

 DeviceCUDA ()
 ~DeviceCUDA ()
void initialize ()
int getDeviceCount ()
int getNumDevice ()
bool device_shared_with_pe (int pe)
bool one_device_per_node ()
int getNoStreaming ()
int getNoMergeGrids ()
int getMergeGrids ()
void setMergeGrids (const int val)
bool getSharedGpu ()
int getNextPeSharingGpu ()
int getMasterPe ()
int getNumPesSharingDevice ()
int getPesSharingDevice (const int i)
int getGpuIsMine ()
void setGpuIsMine (const int val)
int getDeviceID ()
int getDeviceIDbyRank (int rank)
int getDeviceIDforPe (int pe)
int getMasterPeForDeviceID (int deviceID)
int getMaxNumThreads ()
int getMaxNumBlocks ()


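Detailed Description

Manages the assignment of CUDA devices to PEs. initialize() copies the cuda_args command-line settings into the object, queries the number of CUDA devices and their properties, selects the device used by the calling PE together with the master PE and the list of PEs sharing that device, and binds the PE to the device with cudaSetDevice(). The remaining member functions are accessors for the values computed by initialize().

Definition at line 24 of file DeviceCUDA.h.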


Constructor & Destructor Documentation

DeviceCUDA::DeviceCUDA (  ) 

Definition at line 80 of file DeviceCUDA.C.

00080 : deviceProps(NULL), devices(NULL) {}

DeviceCUDA::~DeviceCUDA (  ) 

Definition at line 332 of file DeviceCUDA.C.

00332                         {
00333   if (deviceProps != NULL) delete [] deviceProps;
00334   if (devices != NULL) delete [] devices;
00335         delete [] pesSharingDevice;
00336 }


Member Function Documentation

bool DeviceCUDA::device_shared_with_pe ( int  pe  ) 

Definition at line 355 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

00355                                              {
00356   for ( int i=0; i<numPesSharingDevice; ++i ) {
00357     if ( pesSharingDevice[i] == pe ) return true;
00358   }
00359   return false;
00360 }

int DeviceCUDA::getDeviceCount (  )  [inline]

Definition at line 72 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

00072 {return deviceCount;}

int DeviceCUDA::getDeviceID (  )  [inline]

Definition at line 92 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeCUDAMgr::createCudaComputeNonbonded(), ComputePme::doWork(), ComputeCUDAMgr::getCudaComputeNonbonded(), ComputePmeMgr::initialize(), ComputePmeMgr::initialize_computes(), and ComputePmeMgr::ungridCalc().

00092 {return deviceID;}

int DeviceCUDA::getDeviceIDbyRank ( int  rank  )  [inline]

Definition at line 93 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize(), ComputePmeCUDAMgr::initialize_pencils(), and ComputePmeCUDAMgr::isPmeDevice().

00093 {return devices[rank];}

int DeviceCUDA::getDeviceIDforPe ( int  pe  ) 

Definition at line 341 of file DeviceCUDA.C.

References deviceIDList.

00341                                        {
00342   return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
00343 }

int DeviceCUDA::getGpuIsMine (  )  [inline]

Definition at line 89 of file DeviceCUDA.h.

00089 {return gpuIsMine;}

int DeviceCUDA::getMasterPe (  )  [inline]

Definition at line 85 of file DeviceCUDA.h.

Referenced by build_cuda_exclusions(), build_cuda_force_table(), ComputePmeMgr::chargeGridSubmitted(), ComputeMgr::createComputes(), ComputePmeMgr::initialize_computes(), and ComputePmeMgr::initialize_pencils().

00085 {return masterPe;}

int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID  ) 

Definition at line 348 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

00348                                                    {
00349   return masterPeList[deviceID % deviceCount];
00350 }

int DeviceCUDA::getMaxNumBlocks (  ) 

Definition at line 382 of file DeviceCUDA.C.

References cudaCheck.

Referenced by ComputeBondedCUDAKernel::bondedForce(), CudaTileListKernel::buildTileLists(), CudaComputeNonbondedKernel::nonbondedForce(), and CudaComputeNonbondedKernel::reduceVirialEnergy().

00382                                 {
00383   int dev;
00384   cudaCheck(cudaGetDevice(&dev));
00385   return deviceProps[dev].maxGridSize[0];
00386 }

int DeviceCUDA::getMaxNumThreads (  ) 

Definition at line 376 of file DeviceCUDA.C.

References cudaCheck.

00376                                  {
00377   int dev;
00378   cudaCheck(cudaGetDevice(&dev));
00379   return deviceProps[dev].maxThreadsPerBlock;
00380 }

int DeviceCUDA::getMergeGrids (  )  [inline]

Definition at line 80 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_remote_progress(), ComputeNonbondedCUDA::recvYieldDevice(), and ComputeNonbondedCUDA::requirePatch().

00080 {return mergegrids;}

int DeviceCUDA::getNextPeSharingGpu (  )  [inline]

Definition at line 84 of file DeviceCUDA.h.

Referenced by cuda_check_local_calc(), and cuda_check_remote_calc().

00084 {return nextPeSharingGpu;}

int DeviceCUDA::getNoMergeGrids (  )  [inline]

Definition at line 79 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00079 {return nomergegrids;}

int DeviceCUDA::getNoStreaming (  )  [inline]

Definition at line 78 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeCUDAMgr::createCudaComputeNonbonded(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

00078 {return nostreaming;}

int DeviceCUDA::getNumDevice (  )  [inline]

Definition at line 73 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize(), ComputePmeCUDAMgr::initialize_pencils(), ComputePmeCUDAMgr::isPmeDevice(), and ComputePmeCUDAMgr::setupPencils().

00073 {return ndevices;}

int DeviceCUDA::getNumPesSharingDevice (  )  [inline]

Definition at line 86 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeNonbondedCUDA::assignPatches().

00086 {return numPesSharingDevice;}

int DeviceCUDA::getPesSharingDevice ( const int  i  )  [inline]

Definition at line 87 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeNonbondedCUDA::assignPatches().

00087 {return pesSharingDevice[i];}

bool DeviceCUDA::getSharedGpu (  )  [inline]

Definition at line 83 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

00083 {return sharedGpu;}

void DeviceCUDA::initialize (  ) 

Definition at line 85 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, cuda_args_t::ignoresharing, j, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

00085                             {
00086         // Copy command-line arguments into class
00087         this->devicelist = cuda_args.devicelist;
00088         this->usedevicelist = cuda_args.usedevicelist;
00089   this->devicesperreplica = cuda_args.devicesperreplica;
00090         this->ignoresharing = cuda_args.ignoresharing;
00091         this->mergegrids = cuda_args.mergegrids;
00092         this->nomergegrids = cuda_args.nomergegrids;
00093         this->nostreaming = cuda_args.nostreaming;
00094 
00095   if (CkMyPe() == 0) register_user_events();
00096 
00097   if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
00098 
00099   char host[128];
00100 #ifdef NOHOSTNAME
00101   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00102 #else
00103   gethostname(host, 128);  host[127] = 0;
00104 #endif
00105 
00106   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00107   int myRankInPhysicalNode;
00108   int numPesOnPhysicalNode;
00109   int *pesOnPhysicalNode;
00110   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00111                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00112 
00113   {
00114     int i;
00115     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00116       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00117         i = numPesOnPhysicalNode;
00118         break;
00119       }
00120       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00121     }
00122     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00123       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00124       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00125         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00126           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00127       }
00128       myRankInPhysicalNode = 0;
00129       numPesOnPhysicalNode = 1;
00130       pesOnPhysicalNode = new int[1];
00131       pesOnPhysicalNode[0] = CkMyPe();
00132     } else {
00133       myRankInPhysicalNode = i;
00134     }
00135   }
00136   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00137 
00138   deviceCount = 0;
00139   cudaCheck(cudaGetDeviceCount(&deviceCount));
00140   if ( deviceCount <= 0 ) {
00141     cudaDie("No CUDA devices found.");
00142   }
00143 
00144   // Store all device props
00145   deviceProps = new cudaDeviceProp[deviceCount];
00146   for ( int i=0; i<deviceCount; ++i ) {
00147     cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
00148   }
00149 
00150   ndevices = 0;
00151   int nexclusive = 0;
00152   if ( usedevicelist ) {
00153     devices = new int[strlen(devicelist)];
00154     int i = 0;
00155     while ( devicelist[i] ) {
00156       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00157       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00158       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00159     }
00160   } else {
00161     if ( ! CkMyPe() ) {
00162       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00163     }
00164     devices = new int[deviceCount];
00165     for ( int i=0; i<deviceCount; ++i ) {
00166       int dev = i % deviceCount;
00167 #if CUDA_VERSION >= 2020
00168       cudaDeviceProp deviceProp;
00169       cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00170       if ( deviceProp.computeMode != cudaComputeModeProhibited
00171            && (deviceProp.major >= 3)
00172            && deviceProp.canMapHostMemory
00173            && ( (deviceProp.multiProcessorCount > 2) ||
00174                 ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
00175          ) {
00176         devices[ndevices++] = dev;
00177       }
00178       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00179         ++nexclusive;
00180       }
00181 #else
00182       devices[ndevices++] = dev;
00183 #endif
00184     }
00185   }
00186 
00187   if ( ! ndevices ) {
00188     cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
00189   }
00190 
00191   if ( devicesperreplica > 0 ) {
00192     if ( devicesperreplica > ndevices ) {
00193       NAMD_die("More devices per partition requested than devices are available");
00194     }
00195     int *olddevices = devices;
00196     devices = new int[devicesperreplica];
00197     for ( int i=0; i<devicesperreplica; ++i ) {
00198       int mypart = CmiMyPartition();
00199       devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
00200     }
00201     ndevices = devicesperreplica;
00202     delete [] olddevices;
00203   }
00204 
00205   sharedGpu = 0;
00206   gpuIsMine = 1;
00207   int firstPeSharingGpu = CkMyPe();
00208   nextPeSharingGpu = CkMyPe();
00209 
00210  {
00211 
00212     int dev;
00213     if ( numPesOnPhysicalNode > 1 ) {
00214       int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00215       dev = devices[myDeviceRank];
00216       masterPe = CkMyPe();
00217       if ( ignoresharing ) {
00218         pesSharingDevice = new int[1];
00219         pesSharingDevice[0] = CkMyPe();
00220         numPesSharingDevice = 1;
00221       } else {
00222         pesSharingDevice = new int[numPesOnPhysicalNode];
00223         masterPe = -1;
00224         numPesSharingDevice = 0;
00225         for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00226           if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00227             int thisPe = pesOnPhysicalNode[i];
00228             pesSharingDevice[numPesSharingDevice++] = thisPe;
00229             if ( masterPe < 1 ) masterPe = thisPe;
00230             if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
00231           }
00232         }
00233         for ( int j = 0; j < ndevices; ++j ) {
00234           if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
00235         }
00236       }
00237       if ( sharedGpu && masterPe == CkMyPe() ) {
00238         if ( CmiPhysicalNodeID(masterPe) < 2 )
00239         CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00240       }
00241     } else {  // in case phys node code is lying
00242       dev = devices[CkMyPe() % ndevices];
00243       masterPe = CkMyPe();
00244       pesSharingDevice = new int[1];
00245       pesSharingDevice[0] = CkMyPe();
00246       numPesSharingDevice = 1;
00247     }
00248 
00249     deviceID = dev;
00250 
00251     // Store device IDs to node-wide list
00252     if (CkMyRank() >= MAX_NUM_RANKS)
00253       NAMD_die("Maximum number of ranks (2048) per node exceeded");
00254     deviceIDList[CkMyRank()] = deviceID;
00255 
00256     if ( masterPe != CkMyPe() ) {
00257       if ( CmiPhysicalNodeID(masterPe) < 2 )
00258       CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00259                CkMyPe(), myRankInPhysicalNode, masterPe);
00260       // for PME only
00261       cudaCheck(cudaSetDevice(dev));
00262       return;
00263     }
00264 
00265     // Store master PEs for every device ID to node-wide list
00266     if (CkMyRank() >= MAX_NUM_DEVICES)
00267       NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
00268     masterPeList[deviceID] = masterPe;
00269     // Set masterPe values to -1 for devices that do not exist.
00270     // Only master Pe with deviceID == devices[0] does the writing
00271     if (deviceID == devices[0]) {
00272       // For device IDs 0...deviceCount-1, check if it is in the devices[0...deviceCount-1]
00273       for (int i=0;i < deviceCount;i++) {
00274         bool deviceOK = false;
00275         for (int j=0;j < deviceCount;j++) {
00276           if (devices[j] == i) deviceOK = true;
00277         }
00278         if (!deviceOK) masterPeList[i] = -1;
00279       }
00280       // Device IDs deviceCount ... MAX_NUM_DEVICES are not possible, just set them to -1
00281       for (int i=deviceCount;i < MAX_NUM_DEVICES;i++) {
00282         masterPeList[i] = -1;
00283       }
00284     }
00285 
00286     // disable token-passing but don't submit local until remote finished
00287     // if shared_gpu is true, otherwise submit all work immediately
00288     firstPeSharingGpu = CkMyPe();
00289     nextPeSharingGpu = CkMyPe();
00290 
00291     gpuIsMine = ( firstPeSharingGpu == CkMyPe() ); 
00292 
00293     if ( dev >= deviceCount ) {
00294       char buf[256];
00295       sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00296                 CkMyPe(), dev, host, deviceCount);
00297       NAMD_die(buf);
00298     }
00299 
00300     cudaDeviceProp deviceProp;
00301     cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00302     if ( CmiPhysicalNodeID(masterPe) < 2 )
00303         CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00304                CkMyPe(), myRankInPhysicalNode, dev, host,
00305                deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00306                deviceProp.major, deviceProp.minor);
00307 
00308     cudaCheck(cudaSetDevice(dev));
00309 
00310   }  // just let CUDA pick a device for us
00311 
00312   {
00313     cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost));
00314 
00315     int dev;
00316     cudaCheck(cudaGetDevice(&dev));
00317     deviceID = dev;
00318     cudaDeviceProp deviceProp;
00319     cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
00320     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00321       cudaDie("device in prohibited mode");
00322     if ( deviceProp.major < 3 )
00323       cudaDie("device not of compute capability 3.0 or higher");
00324     if ( ! deviceProp.canMapHostMemory )
00325       cudaDie("device cannot map host memory");
00326   }
00327 }

bool DeviceCUDA::one_device_per_node (  ) 

Definition at line 365 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

00365                                      {
00366   if ( numPesSharingDevice != CkMyNodeSize() ) return false;
00367   int numPesOnNodeSharingDevice = 0;
00368   for ( int i=0; i<numPesSharingDevice; ++i ) {
00369     if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
00370       ++numPesOnNodeSharingDevice;
00371     }
00372   }
00373   return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
00374 }

void DeviceCUDA::setGpuIsMine ( const int  val  )  [inline]

Definition at line 90 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

00090 {gpuIsMine = val;}

void DeviceCUDA::setMergeGrids ( const int  val  )  [inline]

Definition at line 81 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00081 {mergegrids = val;}


The documentation for this class was generated from the following files:
 DeviceCUDA.h
 DeviceCUDA.C
Generated on Thu Nov 23 01:17:18 2017 for NAMD by  doxygen 1.4.7