
CUDAAccel.C

/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAAccel.C,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.70 $       $Date: 2022/02/13 05:34:21 $
 *
 ***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>     // memset(), strcpy(), strcat(), strcmp()
#include "config.h"     // rebuild on config changes
#include "Inform.h"
#include "ResizeArray.h"
#include "CUDAAccel.h"
#include "CUDAKernels.h"
#include "WKFThreads.h"
#include "ProfileHooks.h"
CUDAAccel::CUDAAccel(void) {
  cudaavail = 0;
  numdevices = 0;
  numphysdevices = 0;

  nvmlh=NULL;
  cudapool=NULL;

  if (getenv("VMDNOCUDA") != NULL) {
    msgInfo << "VMDNOCUDA environment variable is set, CUDA support disabled."
            << sendmsg;
    return;
  }

#if defined(VMDCUDA)
  PROFILE_PUSH_RANGE("CUDAAccel::CUDAAccel()", 0);

  unsigned int gpumask = 0xffffffff;
  const char *gpumaskstr = getenv("VMDCUDADEVICEMASK");
  if (gpumaskstr != NULL) {
    unsigned int tmp;
    if (sscanf(gpumaskstr, "%x", &tmp) == 1) {
      gpumask = tmp;
      msgInfo << "Using GPU device mask '"
              << gpumaskstr << "'" << sendmsg;
    } else {
      msgInfo << "Failed to parse CUDA GPU device mask string '"
              << gpumaskstr << "'" << sendmsg;
    }
  }

  // This is the very first CUDA API call during VMD startup.
  // There's a >= 2.0 second startup lag associated with it on the DGX-2,
  // likely due to CUDA runtime library internal initialization overheads
  // across the 16 GPUs.  The first internal call checks the CUDA runtime
  // and driver version compatibility.
  int usabledevices = 0;
  int rc = 0;
  if ((rc=vmd_cuda_num_devices(&numphysdevices)) != VMDCUDA_ERR_NONE) {
    numdevices = 0;
    numphysdevices = 0;

    // Only emit error messages when there are CUDA GPUs on the machine
    // but they can't be used for some reason.
    // XXX turning this off for the time being, as some people have
    //     NVIDIA drivers installed on machines with no NVIDIA GPU, as can
    //     happen with some distros that package the drivers by default.
    switch (rc) {
      case VMDCUDA_ERR_NODEVICES:
      case VMDCUDA_ERR_SOMEDEVICES:
//        msgInfo << "No CUDA accelerator devices available." << sendmsg;
        break;

#if 0
      case VMDCUDA_ERR_SOMEDEVICES:
        msgWarn << "One or more CUDA accelerators may exist but are not usable." << sendmsg;
        msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg;
        break;
#endif

      case VMDCUDA_ERR_DRVMISMATCH:
        msgWarn << "Detected a mismatch between CUDA runtime and GPU driver" << sendmsg;
        msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg;
//        msgInfo << "No CUDA accelerator devices available." << sendmsg;
        break;
    }

    PROFILE_POP_RANGE();
    return;
  }

  //
  // Runtime load of the NVML shared library (packaged with the CUDA driver)
  // to manually obtain function pointers for querying low-level host
  // platform and GPU hardware details, such as the best CPU affinity mask
  // associated with each GPU, taking into account the NUMA node, PCIe
  // topology, and NVLink topology that exist on the system.
  nvmlh = wrap_nvml_create();


  // The following loop queries the individual GPU hardware and API
  // compatibility properties and records their results for subsequent use.
  // This phase of startup costs about 0.05 seconds on a DGX-2 with 16 GPUs.
  if (numphysdevices > 0) {
    cudaavail = 1;

    int i;
    for (i=0; i<numphysdevices; i++) {
      cudadevprops dp;
      memset(&dp, 0, sizeof(dp));
      if (!vmd_cuda_device_props(i, dp.name, sizeof(dp.name),
                                 &dp.major, &dp.minor,
                                 &dp.membytes, &dp.clockratekhz,
                                 &dp.smcount, &dp.integratedgpu,
                                 &dp.asyncenginecount,
                                 &dp.kernelexectimeoutenabled,
                                 &dp.canmaphostmem, &dp.computemode,
                                 &dp.spdpfpperfratio,
                                 &dp.pageablememaccess,
                                 &dp.pageablememaccessuseshostpagetables)) {
        dp.deviceid=i; // save the device index

        // Check that each GPU device has not been excluded by virtue of
        // being used for display, by a GPU device mask, or by the CUDA
        // device mode being set to a "prohibited" status.
        if (!(dp.kernelexectimeoutenabled && getenv("VMDCUDANODISPLAYGPUS")) &&
            (gpumask & (1 << i)) &&
            (dp.computemode != computeModeProhibited)) {
          devprops.append(dp);
          usabledevices++;
        }
      } else {
        msgWarn << "  Failed to retrieve properties for CUDA accelerator " << i << sendmsg;
      }
    }
  }

  // assign the final usable device count as the number of available
  // CUDA devices (physical device count is maintained separately)
  numdevices=usabledevices;

  // This code creates a pool of CPU worker threads (one per GPU) that
  // are hereafter responsible for managing each device.  To ensure that
  // the GPUs are all actually usable, each worker thread allocates a
  // few bytes of memory and executes a trivial kernel on it.
  // On a DGX-2, this phase of startup costs about 7.63 seconds on 16 GPUs.
  devpool_init();

  PROFILE_POP_RANGE();
#endif
}
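
#if 0
// Illustrative sketch only (not part of VMD): the same enumerate-and-filter
// pass as the constructor above, but written directly against the CUDA
// runtime API rather than the vmd_cuda_*() wrappers from CUDAKernels.h.
// Assumes <cuda_runtime.h> is available; the _example name is hypothetical.
static int enumerate_usable_gpus_example(unsigned int gpumask) {
  int devcount = 0;
  if (cudaGetDeviceCount(&devcount) != cudaSuccess)
    return 0;

  int usable = 0;
  for (int i=0; i<devcount; i++) {
    cudaDeviceProp p;
    if (cudaGetDeviceProperties(&p, i) != cudaSuccess)
      continue;

    // apply the same exclusion rules used above: skip display-attached
    // GPUs when VMDCUDANODISPLAYGPUS is set, honor the hex device mask,
    // and skip devices whose compute mode is "prohibited"
    if ((p.kernelExecTimeoutEnabled && getenv("VMDCUDANODISPLAYGPUS")) ||
        !(gpumask & (1U << i)) ||
        (p.computeMode == cudaComputeModeProhibited))
      continue;

    usable++;
  }
  return usable;
}
#endif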


// destructor
CUDAAccel::~CUDAAccel(void) {
  devpool_fini();

#if defined(VMDCUDA)
  // destroy the live connection to the NVML library
  if (nvmlh != NULL) {
    wrap_nvml_destroy(nvmlh);
  }
#endif
}
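
#if 0
// Illustrative sketch only (not part of VMD): how the hex
// VMDCUDADEVICEMASK string maps onto per-device enable bits.  With a mask
// of 0x5, devices 0 and 2 stay enabled and device 1 is masked out.  The
// hard-coded string stands in for getenv("VMDCUDADEVICEMASK"); the
// _example name is hypothetical.
static void devicemask_example(void) {
  unsigned int gpumask = 0xffffffff; // default: all devices enabled
  const char *maskstr = "0x5";       // hypothetical user-supplied mask
  unsigned int tmp;
  if (maskstr && sscanf(maskstr, "%x", &tmp) == 1)
    gpumask = tmp;

  int dev;
  for (dev=0; dev<4; dev++) {
    printf("device %d: %s\n", dev,
           (gpumask & (1U << dev)) ? "enabled" : "masked out");
  }
}
#endif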


void CUDAAccel::devpool_init(void) {
  cudapool=NULL;

#if defined(VMDCUDA)
  PROFILE_PUSH_RANGE("CUDAAccel::devpool_init()", 0);

  // don't proceed any further if there are no devices or CUDA usage
  // has been disabled by the user
  if (!cudaavail || numdevices == 0 || getenv("VMDNOCUDA") != NULL) {
    PROFILE_POP_RANGE(); // balance the profiling push before the early return
    return;
  }

  // only use as many GPUs as there are CPU cores we're allowed to use
  int workercount=numdevices;
  if (workercount > wkf_thread_numprocessors())
    workercount=wkf_thread_numprocessors();

  int *devlist = new int[workercount];
  int i;
  for (i=0; i<workercount; i++) {
    devlist[i]=device_index(i);
  }

  msgInfo << "Creating CUDA device pool and initializing hardware..." << sendmsg;
  cudapool=wkf_threadpool_create(workercount, devlist);
  delete [] devlist;

  // associate each worker thread with a specific GPU
  if (getenv("VMDCUDAVERBOSE") != NULL)
    wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, (void*)"VMD CUDA Dev Init", 1);
  else
    wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, NULL, 1);

  // clear all available device memory on each of the GPUs
  wkf_threadpool_launch(cudapool, vmd_cuda_devpool_clear_device_mem, NULL, 1);

  // XXX enable fully-connected NVLink peer-to-peer GPU memory access
  //     when requested (not fully generalized yet).  This is done only
  //     once per VMD process, per GPU, and never again.
  if (getenv("VMDCUDAP2PENABLE") != NULL) {
    msgInfo << "Enabling DGX-2 fully-connected NVLink GPU P2P..." << sendmsg;
    wkf_threadpool_launch(cudapool, vmd_cuda_devpool_enable_P2P, NULL, 1);
  }

  PROFILE_POP_RANGE();
#endif
}
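
#if 0
// Illustrative sketch only (not part of VMD): the shape of a worker
// function handed to devpool_launch().  Each pool thread has already been
// bound to its own GPU by vmd_cuda_devpool_setdevice() above, so a worker
// may issue CUDA calls for "its" device.  The parameter protocol shown
// here (a shared int) and the _example names are hypothetical.
static void *example_worker(void *parms) {
  int *shared_flag = (int *) parms; // hypothetical shared parameter block
  // ... per-GPU CUDA work would go here ...
  (void) shared_flag;
  return NULL;
}

static void devpool_launch_example(CUDAAccel &acc) {
  int flag = 0;
  // blocking==1: devpool_launch() returns only after all workers have run
  acc.devpool_launch(example_worker, &flag, 1);
}
#endif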

void CUDAAccel::devpool_fini(void) {
  if (!cudapool)
    return;

#if defined(VMDCUDA)
  devpool_wait();
  wkf_threadpool_destroy(cudapool);
#endif
  cudapool=NULL;
}

int CUDAAccel::devpool_launch(void *fctn(void *), void *parms, int blocking) {
  if (!cudapool)
    return -1;

  return wkf_threadpool_launch(cudapool, fctn, parms, blocking);
}

int CUDAAccel::devpool_wait(void) {
  if (!cudapool)
    return -1;

  return wkf_threadpool_wait(cudapool);
}
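
#if 0
// Illustrative sketch only (not part of VMD): overlapping host-side work
// with a non-blocking pool launch, then synchronizing with devpool_wait().
// Reuses the hypothetical example_worker from the sketch above; as coded
// in this file, both calls return -1 when no device pool exists.
static void devpool_overlap_example(CUDAAccel &acc) {
  if (acc.devpool_launch(example_worker, NULL, 0) != -1) { // non-blocking
    // ... unrelated host-side work can proceed here while the GPUs run ...
    acc.devpool_wait(); // block until all worker threads are idle again
  }
}
#endif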

void CUDAAccel::print_cuda_devices(void) {
  if (getenv("VMDCUDANODISPLAYGPUS")) {
    msgInfo << "Ignoring CUDA-capable GPUs used for display" << sendmsg;
  }

  if (!cudaavail || numdevices == 0) {
    msgInfo << "No CUDA accelerator devices available." << sendmsg;
    return;
  }

  if (nvmlh == NULL) {
    msgInfo << "Unable to load NVML library, GPU-CPU affinity unavailable." << sendmsg;
  }

  // XXX GPU P2P hardware features need to be abstracted by CUDAAccel in the
  //     same way that usable CUDA devices are, so that VMDCUDADEVICEMASK
  //     affects the record keeping and reporting of P2P connectivity etc.
  //     If the user selects a subset of GPUs, we should exclude from
  //     consideration any P2P topology that connects GPUs that were masked
  //     out, account for the mask's impact on the number of P2P islands,
  //     and neither count nor report links to masked-out GPUs.
  //     Since the low-level peer matrix helper function knows nothing about
  //     GPU device masks or other control environment variables, CUDAAccel
  //     should filter the output, copying only the P2P connectivity matrix
  //     elements that correspond to links between enabled GPUs.  The final
  //     filtered and abstracted P2P matrix can then be used by the rest of
  //     VMD through accessor functions that take the potentially sparse
  //     GPU mapping into account.
  int p2plinkcount=0, p2pislands=0;
#if defined(VMDCUDA)
  int numdev=0;
  int *p2pmat=NULL;
  int *p2psupp=NULL;
  int *p2patomics=NULL;
  int *p2parrays=NULL;
  int *perfmat=NULL;

  if (vmd_cuda_peer_matrix(&numdev, &p2pmat, &p2psupp, &p2patomics, &p2parrays,
                           &perfmat, &p2plinkcount, &p2pislands) != VMDCUDA_ERR_NONE) {
    msgWarn << "Unable to ascertain GPU peer-to-peer connectivity" << sendmsg;
  }

  if (p2pmat)
    free(p2pmat);
  if (p2psupp)
    free(p2psupp);
  if (p2patomics)
    free(p2patomics);
  if (p2parrays)
    free(p2parrays);
  if (perfmat)
    free(perfmat);
#endif

  // Report detected GPU hardware and PCIe/NVLink P2P topology
  msgInfo << "Detected " << numdevices << " available CUDA "
          << ((numdevices > 1) ? "accelerators" : "accelerator");

  // XXX update to account for device masks...
  if (p2plinkcount > 0) {
    msgInfo << ", "
            << p2plinkcount << ((p2plinkcount > 1) ? " P2P links, " : " P2P link, ")
            << p2pislands << ((p2pislands > 1) ? " islands" : " island");
  }

  msgInfo << ":" << sendmsg;


  char oldstr[1024], outstr[1024], gpustr[1024], idxprefix[1024];
  int idxrangecount=0, firstidx=-1, lastidx=-1;
  const char *idxfmtstring10gpus  = "[%d]";
  const char *idxfmtspaces10gpus  = "   ";
  const char *idxfmtstring100gpus = "[%2d]";
  const char *idxfmtspaces100gpus = "    ";
  const char *gpuidxfmtstring, *gpuidxfmtspaces;

#if 0
  int outputlineperdevice = 1;
#else
  int outputlineperdevice = (getenv("VMDCUDAOUTPUTLINEPERDEVICE") != NULL);
#endif

  // when enumerating large DGX-2 class hardware, we ensure that columns
  // line up by choosing format strings that fit the range of device IDs
  // present (two-digit IDs begin at device index 10)
  if (device_index(numdevices-1) >= 10) {
    gpuidxfmtstring = idxfmtstring100gpus;
    gpuidxfmtspaces = idxfmtspaces100gpus;
  } else {
    gpuidxfmtstring = idxfmtstring10gpus;
    gpuidxfmtspaces = idxfmtspaces10gpus;
  }

  memset(oldstr, 0, sizeof(oldstr));
  memset(gpustr, 0, sizeof(gpustr));
  memset(idxprefix, 0, sizeof(idxprefix));

  int i;
  int shiftgpuidx=0;
  for (i=0; i<numdevices; i++) {
    memset(outstr, 0, sizeof(outstr));

    // list primary GPU device attributes
    const char *devname = device_name(i);
    sprintf(gpustr, " %-20s %2d SM_%d.%d %.1f GHz",
            (devname) ? devname : "NULL Device Name!",
            (device_sm_count(i) > 0) ? device_sm_count(i) : 0,
            device_version_major(i), device_version_minor(i),
            device_clock_ghz(i));
    strcpy(outstr, gpustr);

    // list memory capacity
    int gpumemmb = (device_membytes(i) / (1024 * 1024));
    if (gpumemmb < 1000) {
      sprintf(gpustr, ", %4dMB RAM", gpumemmb);
    } else if (gpumemmb < 10240) {
      sprintf(gpustr, ", %.1fGB RAM", gpumemmb / 1024.0);
    } else {
      // round up to nearest GB
      sprintf(gpustr, ", %dGB RAM", (gpumemmb + 512) / 1024);
    }
    strcat(outstr, gpustr);

    // list optional hardware features and configuration attributes here...
    if (device_computemode(i) == computeModeProhibited) {
      strcat(outstr, ", Compute Mode: Prohibited");
    } else {
      // single-precision vs. double-precision FP performance ratio
      int sfpr = device_spdpfpperfratio(i);
      if (sfpr > 2) {
        sprintf(gpustr, " SP%d", sfpr);
        strcat(outstr, gpustr);
      }

      // integrated GPU
      if (device_integratedgpu(i)) {
        strcat(outstr, " IGPU");
      }

      // kernel execution timeout enabled, e.g. GPU also drives a display
      if (device_kerneltimeoutenabled(i)) {
        strcat(outstr, " KT");
      }

      // number of async copy engines
      if (device_asyncenginecount(i)) {
        sprintf(gpustr, " AE%d", device_asyncenginecount(i));
        strcat(outstr, gpustr);
      }

      // GPU can map ("zero-copy") host memory
      if (device_canmaphostmem(i))
        strcat(outstr, " ZC");

      // GPU can access pageable host memory, optionally by directly
      // using the host's own page tables
      if (device_pageablememaccess(i)) {
        if (device_pageablememaccessuseshostpagetables(i))
          strcat(outstr, " PMT");
        else
          strcat(outstr, " PM");
      }
    }

    if (outputlineperdevice) {
      // emit a status line per-device despite any redundancy
      sprintf(idxprefix, gpuidxfmtstring, device_index(i));
      msgInfo << idxprefix << outstr << sendmsg;
    } else {
      // if the current GPU description is the same as the last one,
      // we don't bother duplicating its listing, and instead we
      // list the GPU index range(s) with matching descriptive strings
      int newidx = device_index(i);
      if (!strcmp(oldstr, outstr)) {
        // if we have a gap in GPU IDs, we emit the partial index range
        // and continue from the first index after the gap
        if ((newidx - lastidx) > 1) {
          if (lastidx > firstidx) {
            sprintf(idxprefix, "%d-%d", firstidx, lastidx);
            shiftgpuidx=1;
          } else {
            sprintf(idxprefix, "%s%d", (shiftgpuidx) ? "  " : "", firstidx);
          }

          msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
          idxrangecount++;
          firstidx = newidx;
          lastidx = newidx;
        }
        lastidx=newidx;
      } else {
        if (firstidx < 0) {
          firstidx = newidx;
          lastidx = newidx;
          strcpy(oldstr, outstr);
          continue;
        }

        if (lastidx > firstidx) {
          sprintf(idxprefix, "%d-%d", firstidx, lastidx);
          shiftgpuidx=1;
        } else {
          sprintf(idxprefix, "%s%d", (shiftgpuidx) ? "  " : "", firstidx);
        }
        msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
        msgInfo << "]" << oldstr << sendmsg;

        idxrangecount = 0;
        firstidx = newidx;
        lastidx = newidx;
        strcpy(oldstr, outstr);
        memset(outstr, 0, sizeof(outstr));
      }
    }
  } // end of loop over devices

  if (!outputlineperdevice) {
    if (lastidx > firstidx) {
      sprintf(idxprefix, "%d-%d", firstidx, lastidx);
    } else {
      sprintf(idxprefix, "%s%d", (shiftgpuidx) ? "  " : "", firstidx);
    }
    msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
    msgInfo << "]";
    if (idxrangecount > 2) {
      msgInfo << ":" << sendmsg;
      msgInfo << gpuidxfmtspaces; // shift right to line up with the column
    }
    msgInfo << oldstr << sendmsg;
  }
}
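
#if 0
// Illustrative sketch only (not part of VMD): the index-range compression
// used by the loop above, reduced to its core.  A sorted list of device
// IDs prints as comma-separated ranges, e.g. {0,1,2,5,7,8} prints as
// "[0-2,5,7-8]", which is how GPUs with identical descriptions get
// grouped.  The _example name is hypothetical.
static void print_index_ranges_example(const int *ids, int n) {
  int i = 0, printed = 0;
  printf("[");
  while (i < n) {
    int first = ids[i];
    int last = first;
    while (i + 1 < n && ids[i + 1] == last + 1) // extend a consecutive run
      last = ids[++i];
    printf("%s%d", printed++ ? "," : "", first);
    if (last > first)
      printf("-%d", last);
    i++;
  }
  printf("]\n");
}
#endif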

int CUDAAccel::num_devices(void) {
  return numdevices;
}

int CUDAAccel::device_index(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1; // match the range checking done by the other accessors
  return devprops[dev].deviceid;
}

const char *CUDAAccel::device_name(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return NULL;
  return devprops[dev].name;
}

int CUDAAccel::device_version_major(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return 0;
  return devprops[dev].major;
}

int CUDAAccel::device_version_minor(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return 0;
  return devprops[dev].minor;
}

unsigned long CUDAAccel::device_membytes(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return 0;
  return devprops[dev].membytes;
}

float CUDAAccel::device_clock_ghz(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return 0;
  // clockratekhz is stored in kHz; divide by 1e6 to obtain GHz
  return (float) (devprops[dev].clockratekhz / 1000000.0);
}

int CUDAAccel::device_sm_count(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].smcount;
}

int CUDAAccel::device_integratedgpu(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].integratedgpu;
}

int CUDAAccel::device_asyncenginecount(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].asyncenginecount;
}

int CUDAAccel::device_kerneltimeoutenabled(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].kernelexectimeoutenabled;
}

int CUDAAccel::device_canmaphostmem(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].canmaphostmem;
}

int CUDAAccel::device_computemode(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].computemode;
}

int CUDAAccel::device_spdpfpperfratio(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].spdpfpperfratio;
}

int CUDAAccel::device_pageablememaccess(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].pageablememaccess;
}

int CUDAAccel::device_pageablememaccessuseshostpagetables(int dev) {
  if (!cudaavail || dev < 0 || dev >= numdevices)
    return -1;
  return devprops[dev].pageablememaccessuseshostpagetables;
}
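
#if 0
// Illustrative sketch only (not part of VMD): typical use of the accessors
// above from calling code.  Out-of-range device indices yield NULL/0/-1,
// so iterating over [0, num_devices()) is safe.  The _example name is
// hypothetical.
static void list_devices_example(CUDAAccel &acc) {
  int i;
  for (i=0; i<acc.num_devices(); i++) {
    printf("GPU[%d] %s: SM %d.%d, %d SMs, %.2f GHz, %lu MB\n",
           acc.device_index(i),
           acc.device_name(i) ? acc.device_name(i) : "(unknown)",
           acc.device_version_major(i), acc.device_version_minor(i),
           acc.device_sm_count(i), acc.device_clock_ghz(i),
           acc.device_membytes(i) / (1024UL * 1024UL));
  }
}
#endif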
