DeviceCUDA.C

Go to the documentation of this file.
00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 #include "DeviceCUDA.h"
00005 #include "WorkDistrib.h"
00006 #include "CudaUtils.h"
00007 
00008 #ifdef NAMD_CUDA
00009 
00010 #include <cuda_runtime.h>
00011 #include <cuda.h>
00012 
00013 #ifdef WIN32
00014 #define __thread __declspec(thread)
00015 #endif
00016 
// Global storage for CUDA devices.
// One DeviceCUDA instance per thread (Charm++ SMP rank); allocated in
// cuda_initialize() below and used throughout the CUDA code paths.
__thread DeviceCUDA *deviceCUDA;

// Allocate and initialize this rank's DeviceCUDA instance.
// The global is assigned before initialize() runs, so code reached during
// initialization already sees a non-NULL deviceCUDA.
void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
  deviceCUDA->initialize();
}
00024 
00025 // kill all service threads
00026 void cuda_finalize() {
00027     int ndevs = 0;
00028     cudaGetDeviceCount(&ndevs);
00029     for ( int dev=0; dev < ndevs; ++dev ) {
00030         cudaSetDevice(dev);
00031         cudaDeviceReset();
00032     }
00033 }
00034 
00035 // -------------------------------------------------------------------------------------------------
00036 // Called from BackEnd.C by all processes to read command line arguments
00037 // These argument settings are used by DeviceCUDA -class
00038 // -------------------------------------------------------------------------------------------------
// Parsed CUDA-related command-line settings.  Filled in by cuda_getargs()
// and copied into the DeviceCUDA object by DeviceCUDA::initialize().
struct cuda_args_t {
  char *devicelist;       // raw string from "+devices" (0 if flag absent)
  int usedevicelist;      // nonzero if "+devices" was given
  int devicesperreplica;  // "+devicesperreplica" value (0 = not specified)
  int ignoresharing;      // "+ignoresharing" flag present
  int mergegrids;         // "+mergegrids" flag present
  int nomergegrids;       // "+nomergegrids" flag present
  int nostreaming;        // "+nostreaming" flag present
};

// One copy per thread (SMP rank), matching the __thread deviceCUDA global.
static __thread cuda_args_t cuda_args;
00050 
// Parse the CUDA-related command-line flags out of argv into the
// thread-local cuda_args struct; the values are consumed later by
// DeviceCUDA::initialize().  Dies on contradictory or invalid settings.
void cuda_getargs(char **argv) {
  cuda_args.devicelist = 0;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
                "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  // NOTE(review): only negative values are rejected; 0 means "not
  // specified", despite the "must be positive" wording of the message.
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be positive\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
}
00064 // -------------------------------------------------------------------------------------------------
00065 
// Node-wide list of device IDs for every rank; indexed by CkMyRank(),
// written in DeviceCUDA::initialize(), read by getDeviceIDforPe().
#define MAX_NUM_RANKS 2048
int deviceIDList[MAX_NUM_RANKS];
// Node-wide list of master PEs for every device ID; indexed by device ID
// (-1 marks devices that are absent or excluded), written in
// DeviceCUDA::initialize(), read by getMasterPeForDeviceID().
#define MAX_NUM_DEVICES 256
int masterPeList[MAX_NUM_DEVICES];
00072 
00073 // -------------------------------------------------------------------------------------------------
00074 // -------------------------------------------------------------------------------------------------
00075 // -------------------------------------------------------------------------------------------------
00076 
00077 //
00078 // Class creator
00079 //
00080 DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}
00081 
//
// Initialize this PE's CUDA device: determine the set of usable devices,
// decide which PEs on the physical node share each device, elect a master
// PE per device, and bind this PE's CUDA context to its device.
// Called once per PE from cuda_initialize() after cuda_getargs() has
// filled the thread-local cuda_args.  Non-master PEs return early after
// cudaSetDevice (used for PME only).
//
void DeviceCUDA::initialize() {
        // Copy command-line arguments into class
        this->devicelist = cuda_args.devicelist;
        this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
        this->ignoresharing = cuda_args.ignoresharing;
        this->mergegrids = cuda_args.mergegrids;
        this->nomergegrids = cuda_args.nomergegrids;
        this->nostreaming = cuda_args.nostreaming;

  if (CkMyPe() == 0) register_user_events();

  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);

  // Host name is only used for diagnostic messages below.
  char host[128];
#ifdef NOHOSTNAME
  sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
#else
  gethostname(host, 128);  host[127] = 0;
#endif

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                           &pesOnPhysicalNode,&numPesOnPhysicalNode);

  {
    // Sanity-check the PE list: it must be strictly increasing and must
    // contain this PE at the index CmiPhysicalRank() reports.  Otherwise
    // fall back to treating this PE as alone on its physical node.
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;  // not strictly increasing -> bad data
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
          i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      // Fallback: this PE is its own one-entry "node" list.
      // NOTE(review): the new array is never freed (small, once per PE).
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

  // Build devices[0..ndevices-1], the list of usable device IDs: either
  // parsed from the "+devices" list, or all present devices filtered by
  // compute mode / capability.
  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    // strlen(devicelist) is a safe upper bound on the number of
    // comma-separated entries in the list.
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      // Accept only devices that are accessible, compute capability >= 3.0,
      // can map host memory, and are not "weak" cards (unless it is the
      // only candidate on a single-node run).
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }

  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

  // When running multiple replicas (Charm++ partitions), give each
  // partition its own rotated slice of the device list.
  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

 {

    // Assign a device to this PE and compute the set of PEs that share it.
    int dev;
    if ( numPesOnPhysicalNode > 1 ) {
      // Spread the node's PEs evenly across the usable devices.
      int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      if ( ignoresharing ) {
        // Pretend this PE has the device to itself.
        pesSharingDevice = new int[1];
        pesSharingDevice[0] = CkMyPe();
        numPesSharingDevice = 1;
      } else {
        pesSharingDevice = new int[numPesOnPhysicalNode];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
          // PEs that map to the same device rank share this device.
          if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
            int thisPe = pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            // NOTE(review): "< 1" also overwrites a legitimately-chosen
            // PE 0 on the next iteration; the sortop line below then
            // re-elects, but confirm "< 0" was not intended.
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        // If the same device ID appears at another device rank, it is shared.
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        // Print only from the first two physical nodes to limit output.
        if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else {  // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }

    deviceID = dev;

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
               CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    // NOTE(review): this guard checks CkMyRank(), but the write below
    // indexes masterPeList by deviceID -- presumably deviceID should be
    // bounded against MAX_NUM_DEVICES here; TODO confirm.
    if (CkMyRank() >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe;
    // Set masterPe values to -1 for devices that do not exist.
    // Only master Pe with deviceID == devices[0] does the writing
    if (deviceID == devices[0]) {
      // For device IDs 0...deviceCount-1, check if it is in the devices[0...deviceCount-1]
      for (int i=0;i < deviceCount;i++) {
        bool deviceOK = false;
        for (int j=0;j < deviceCount;j++) {
          if (devices[j] == i) deviceOK = true;
        }
        if (!deviceOK) masterPeList[i] = -1;
      }
      // Device IDs deviceCount ... MAX_NUM_DEVICES are not possible, just set them to -1
      for (int i=deviceCount;i < MAX_NUM_DEVICES;i++) {
        masterPeList[i] = -1;
      }
    }

    // disable token-passing but don't submit local until remote finished
    // if shared_gpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
                CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
               CkMyPe(), myRankInPhysicalNode, dev, host,
               deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
               deviceProp.major, deviceProp.minor);

    cudaCheck(cudaSetDevice(dev));

  }  // just let CUDA pick a device for us

  {
    // Final validation of the device the context is now bound to.
    cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost));

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");
  }
}
00328 
00329 //
00330 // Class destructor
00331 //
00332 DeviceCUDA::~DeviceCUDA() {
00333   if (deviceProps != NULL) delete [] deviceProps;
00334   if (devices != NULL) delete [] devices;
00335         delete [] pesSharingDevice;
00336 }
00337 
//
// Return device ID for pe. Assumes all nodes are the same
//
// Looks up the node-wide deviceIDList filled in initialize(); the modulo
// folds any PE's rank onto this node's rank range (hence the homogeneity
// assumption above).
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}
00344 
//
// Returns master PE for the device ID, or -1 if device not found
// (-1 entries are written into masterPeList by initialize()).
//
// NOTE(review): an out-of-range deviceID is wrapped by the modulo rather
// than rejected, so a bogus ID silently maps onto an existing slot.
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount];
}
00351 
00352 //
00353 // Returns true if process "pe" shares this device
00354 //
00355 bool DeviceCUDA::device_shared_with_pe(int pe) {
00356   for ( int i=0; i<numPesSharingDevice; ++i ) {
00357     if ( pesSharingDevice[i] == pe ) return true;
00358   }
00359   return false;
00360 }
00361 
00362 //
00363 // Returns true if there is single device per node
00364 //
00365 bool DeviceCUDA::one_device_per_node() {
00366   if ( numPesSharingDevice != CkMyNodeSize() ) return false;
00367   int numPesOnNodeSharingDevice = 0;
00368   for ( int i=0; i<numPesSharingDevice; ++i ) {
00369     if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
00370       ++numPesOnNodeSharingDevice;
00371     }
00372   }
00373   return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
00374 }
00375 
00376 int DeviceCUDA::getMaxNumThreads() {
00377   int dev;
00378   cudaCheck(cudaGetDevice(&dev));
00379   return deviceProps[dev].maxThreadsPerBlock;
00380 }
00381 
00382 int DeviceCUDA::getMaxNumBlocks() {
00383   int dev;
00384   cudaCheck(cudaGetDevice(&dev));
00385   return deviceProps[dev].maxGridSize[0];
00386 }
00387 
/*
BASE
2 types (remote & local)
16 pes per node
3 phases (1, 2, 3)
*/

// Register user-trace event names for CUDA polling and for per-device
// remote/local work.  Two event IDs per device, starting at
// CUDA_EVENT_ID_BASE: remote at BASE + 2*DEV, local at BASE + 2*DEV + 1.
// Called only on PE 0 (see initialize()).
void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

// The #DEV stringization puts the device number into the event label.
#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

  // Devices 0-15 only; devices beyond 15 get no named trace events.
  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}
00422 
00423 #endif  // NAMD_CUDA
00424 

Generated on Fri Sep 22 01:17:12 2017 for NAMD by  doxygen 1.4.7