DeviceCUDA.C

#include "common.h"
#include "charm++.h"
#include "DeviceCUDA.h"
#include "WorkDistrib.h"
#include "CudaUtils.h"

#ifdef NAMD_CUDA

#include <cuda_runtime.h>
#include <cuda.h>

#ifdef WIN32
#define __thread __declspec(thread)
#endif

// Global storage for CUDA devices
__thread DeviceCUDA *deviceCUDA;

void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
  deviceCUDA->initialize();
}

// kill all service threads
void cuda_finalize() {
  int ndevs = 0;
  cudaGetDeviceCount(&ndevs);
  for ( int dev=0; dev < ndevs; ++dev ) {
    cudaSetDevice(dev);
    cudaDeviceReset();
  }
}

// -------------------------------------------------------------------------------------------------
// Called from BackEnd.C by all processes to read command line arguments
// These argument settings are used by the DeviceCUDA class
// -------------------------------------------------------------------------------------------------
struct cuda_args_t {
  char *devicelist;
  int usedevicelist;
  int devicesperreplica;
  int ignoresharing;
  int mergegrids;
  int nomergegrids;
  int nostreaming;
};

static __thread cuda_args_t cuda_args;

void cuda_getargs(char **argv) {
  cuda_args.devicelist = 0;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
    "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be non-negative\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
}
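
// Example invocation exercising these flags (illustrative only; launcher,
// binary, and config file names depend on the build and the run):
//
//   charmrun +p8 namd2 +devices 0,1,2,3 +devicesperreplica 2 +nostreaming sim.conf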
// -------------------------------------------------------------------------------------------------

// Node-wide list of device IDs for every rank
#define MAX_NUM_RANKS 2048
int deviceIDList[MAX_NUM_RANKS];
// Node-wide list of master PEs for every device ID
#define MAX_NUM_DEVICES 256
int masterPeList[MAX_NUM_DEVICES];
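// masterPeList stores masterPe+1 so that a zero entry (static initialization)
// means "no master PE recorded for this device"; see getMasterPeForDeviceID().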

// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------

//
// Class creator
//
DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}

//
// Initialize device
//
void DeviceCUDA::initialize() {
  // Copy command-line arguments into class
  this->devicelist = cuda_args.devicelist;
  this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
  this->ignoresharing = cuda_args.ignoresharing;
  this->mergegrids = cuda_args.mergegrids;
  this->nomergegrids = cuda_args.nomergegrids;
  this->nostreaming = cuda_args.nostreaming;

  if (CkMyPe() == 0) register_user_events();

  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);

  char host[128];
#ifdef NOHOSTNAME
  sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
#else
  gethostname(host, 128);  host[127] = 0;
#endif

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode,&numPesOnPhysicalNode);

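  // Sanity-check the PE list returned by CmiGetPesOnPhysicalNode: it should be
  // sorted and contain this PE at index CmiPhysicalRank(CkMyPe()); otherwise
  // fall back to treating this PE as a physical node of its own.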
  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
          i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

  ndevices = 0;
  int nexclusive = 0;
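  // Either parse an explicit +devices list (duplicates allowed, so a device
  // can be deliberately shared) or enumerate all devices and filter out
  // unusable ones.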
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020
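      // Skip devices that are prohibited, below compute capability 3.0, unable
      // to map host memory, or too small to be useful (a weak card is accepted
      // only if no device has been found yet on a single-node run).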
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }

  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

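  // With +devicesperreplica, each replica (Charm++ partition) takes its own
  // slice of the device list, wrapping around when there are more slices than
  // devices.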
  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }

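  // With +ignoresharing, device assignment is done per process (Charm++ node);
  // otherwise PEs are grouped across the whole physical node.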
  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;

  // catch multiple processes per device
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d).  "
            "Sharing devices between processes is inefficient.  "
            "Specify +ignoresharing (each process uses all visible devices) if "
            "not all devices are visible to each process, otherwise "
            "adjust number of processes to evenly divide number of devices, "
            "specify subset of devices with +devices argument (e.g., +devices 0,2), "
            "or list shared devices multiple times (e.g., +devices 0,1,2,0).",
            ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }

  {
    // build list of devices actually used by this node
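    // (each PE maps to device slot rank*ndevices/numPesForDevice, so
    // consecutive PEs share a device when there are more PEs than devices)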
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }

  {
    // check for devices used twice by this node
    for ( int i=0; i<nnodedevices; ++i ) {
      for ( int j=i+1; j<nnodedevices; ++j ) {
        if ( nodedevices[i] == nodedevices[j] ) {
          char msg[1024];
          sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
          NAMD_die(msg);
        }
      }
    }
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

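  // Pick this PE's device and find the device's master PE: the first of the
  // PEs sharing the device under WorkDistrib::pe_sortop_diffuse ordering.
  // sharedGpu is set when another device slot maps to the same physical device.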
  {
    int dev;
    if ( numPesForDevice > 1 ) {
      int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesForDevice; ++i ) {
          if ( i * ndevices / numPesForDevice == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else {  // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }

    deviceID = dev;

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
                 CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    if (deviceID >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe + 1;  // array is pre-initialized to zeros

    // disable token-passing but don't submit local until remote finished
    // if shared_gpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
              CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d  PCI: %x:%x:%x\n",
               CkMyPe(), myRankInPhysicalNode, dev, host,
               deviceProp.name, (int) (deviceProp.totalGlobalMem / (1024*1024)),
               deviceProp.major, deviceProp.minor,
               deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    cudaCheck(cudaSetDevice(dev));

  }  // just let CUDA pick a device for us

  {
    // if only one device then already initialized in cuda_affinity_initialize()
    cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
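    // cudaErrorSetOnActiveProcess means the flags could not be changed because
    // a context already exists on this device; clear the sticky error and
    // continue, while any other error is fatal.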
    if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
      cudaGetLastError();
    } else {
      cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
    }

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");

    // initialize the device on this thread
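    // (a small dummy allocation forces the CUDA context to be created now
    // rather than at the first kernel launch)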
    int *dummy;
    cudaCheck(cudaMalloc(&dummy, 4));
  }
}

//
// Class destructor
//
DeviceCUDA::~DeviceCUDA() {
  if (deviceProps != NULL) delete [] deviceProps;
  if (devices != NULL) delete [] devices;
  delete [] pesSharingDevice;
}

//
// Return device ID for pe. Assumes all nodes are the same
//
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}

//
// Returns master PE for the device ID, or -1 if device not found
//
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount] - 1;
}

//
// Returns true if PE "pe" shares this device
//
bool DeviceCUDA::device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

//
// Returns true if there is a single device per node
//
bool DeviceCUDA::one_device_per_node() {
  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
  int numPesOnNodeSharingDevice = 0;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
      ++numPesOnNodeSharingDevice;
    }
  }
  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
}

int DeviceCUDA::getMaxNumThreads() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxThreadsPerBlock;
}

int DeviceCUDA::getMaxNumBlocks() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxGridSize[0];
}

/*
BASE
2 types (remote & local)
16 pes per node
3 phases (1, 2, 3)
*/

void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA PME spreadCharge", CUDA_PME_SPREADCHARGE_EVENT);
  traceRegisterUserEvent("CUDA PME gatherForce", CUDA_PME_GATHERFORCE_EVENT);

  traceRegisterUserEvent("CUDA bonded", CUDA_BONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA debug", CUDA_DEBUG_EVENT);
  traceRegisterUserEvent("CUDA nonbonded", CUDA_NONBONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL_EVENT);

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}

#endif  // NAMD_CUDA
