
CUDAUtil.cu

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAUtil.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.56 $        $Date: 2020/02/26 19:26:47 $
00014  *
00015  ***************************************************************************/
00021 #include <string.h>
00022 #include <stdio.h>
00023 #include "CUDAKernels.h"
00024 #include "WKFThreads.h"
00025 #include "intstack.h"
00026 #include "ProfileHooks.h"
00027 
00028 // to eliminate a bunch of old ifdef tests, we'll just check
00029 // once here for a reasonably recent rev of CUDA.
00030 #if CUDART_VERSION < 6000
00031 #error VMD requires CUDA 6.0 or later
00032 #endif
00033 
00034 #if defined(__cplusplus)
00035 extern "C" {
00036 #endif
00037 
00038 // returns VMDCUDA_ERR_NONE if the CUDA driver is compatible with the runtime
00039 static int vmd_cuda_drv_runtime_compatible() {
00040 #if CUDART_VERSION >= 2020
00041   int cuda_driver_version=-1;   // CUDA versions are encoded as 1000*major + 10*minor, e.g. 10020 == CUDA 10.2
00042   int cuda_runtime_version=0;
00043 
00044   cudaDriverGetVersion(&cuda_driver_version);
00045   cudaRuntimeGetVersion(&cuda_runtime_version);
00046 
00047 #if 0
00048   printf("CUDA driver version: %d\n", cuda_driver_version);
00049   printf("CUDA runtime version: %d\n", cuda_runtime_version);
00050 #endif
00051 
00052   if (cuda_driver_version == 0) 
00053     return VMDCUDA_ERR_NODEVICES;
00054 
00055   if (cuda_driver_version < cuda_runtime_version) {
00056 #if defined(ARCH_LINUXARM64)
00057     // XXX workaround for early native ARM64 toolchains where the CUDA 10.2
00058     //     runtime has a newer rev than the CUDA 10.1 driver reports
00059     if (cuda_driver_version == 10010 && cuda_runtime_version == 10020)
00060       return VMDCUDA_ERR_NONE;
00061 #endif
00062 #if defined(ARCH_LINUXCARMA)
00063     // XXX workaround for the first native CUDA compiler toolchain (5.5)
00064     //     having a newer rev than the driver (310.32, CUDA 5.0) reports
00065     if (cuda_driver_version == 5000 && cuda_runtime_version == 5050)
00066       return VMDCUDA_ERR_NONE;
00067 #endif
00068     return VMDCUDA_ERR_DRVMISMATCH;
00069   }
00070 #endif  
00071 
00072   return VMDCUDA_ERR_NONE;
00073 }
00074 
00075 
00076 int vmd_cuda_device_props(int dev, char *name, int namelen,
00077                           int *devmajor, int *devminor, 
00078                           unsigned long *memb, int *clockratekhz,
00079                           int *smcount, int *integratedgpu,
00080                           int *asyncenginecount, int *kerneltimeout,
00081                           int *canmaphostmem, int *computemode,
00082                           int *spdpfpperfratio, 
00083                           int *pageablememaccess, 
00084                           int *pageablememaccessuseshostpagetables) {
00085   cudaError_t rc;
00086   cudaDeviceProp deviceProp;
00087 
00088   // this extra paranoia check costs about 0.04 seconds on a DGX-2 w/ 16 GPUs
00089   int vercheck;
00090   if ((vercheck = vmd_cuda_drv_runtime_compatible()) != VMDCUDA_ERR_NONE) {
00091     return vercheck;
00092   }
00093 
00094   memset(&deviceProp, 0, sizeof(cudaDeviceProp));
00095   if ((rc=cudaGetDeviceProperties(&deviceProp, dev)) != cudaSuccess) {
00096     // printf("error: %s\n", cudaGetErrorString(rc));
00097     if (rc == cudaErrorNotYetImplemented)
00098       return VMDCUDA_ERR_EMUDEVICE;
00099     return VMDCUDA_ERR_GENERAL;
00100   }
00101 
00102   if (name)
00103     strncpy(name, deviceProp.name, namelen);
00104   if (devmajor)
00105     *devmajor = deviceProp.major;
00106   if (devminor)
00107     *devminor = deviceProp.minor;
00108   if (memb)
00109     *memb = deviceProp.totalGlobalMem;
00110   if (clockratekhz)
00111     *clockratekhz = deviceProp.clockRate;
00112   if (smcount)
00113     *smcount = deviceProp.multiProcessorCount;
00114   if (asyncenginecount)
00115     *asyncenginecount = deviceProp.asyncEngineCount;
00116   if (kerneltimeout)
00117     *kerneltimeout = (deviceProp.kernelExecTimeoutEnabled != 0);
00118   if (integratedgpu)
00119     *integratedgpu = (deviceProp.integrated != 0);
00120   if (canmaphostmem)
00121     *canmaphostmem = (deviceProp.canMapHostMemory != 0);
00122   if (computemode)
00123     *computemode = deviceProp.computeMode;
00124   if (spdpfpperfratio)
00125     *spdpfpperfratio = deviceProp.singleToDoublePrecisionPerfRatio;
00126   if (pageablememaccess)
00127     *pageablememaccess = deviceProp.pageableMemoryAccess;
00128   if (pageablememaccessuseshostpagetables)
00129 #if CUDA_VERSION >= 10000
00130     *pageablememaccessuseshostpagetables = deviceProp.pageableMemoryAccessUsesHostPageTables;
00131 #else
00132     *pageablememaccessuseshostpagetables = 0;
00133 #endif
00134 
00135   return VMDCUDA_ERR_NONE;
00136 }
00137 
00138 
00139 int vmd_cuda_num_devices(int *numdev) {
00140   int devcount=0;
00141   *numdev = 0;
00142 
00143   // When this is the very first CUDA API call during VMD startup, 
00144   // there's a 2.0 second startup lag associated with it on the DGX-2, 
00145   // likely due to CUDA runtime library internal initialization overheads
00146   // across the 16 GPUs.
00147   int vercheck;
00148   if ((vercheck = vmd_cuda_drv_runtime_compatible()) != VMDCUDA_ERR_NONE) {
00149     return vercheck;
00150   }
00151 
00152   if (cudaGetDeviceCount(&devcount) != cudaSuccess) {
00153     return VMDCUDA_ERR_NODEVICES;
00154   }
00155 
00156   // Do a sanity check in case we get complete gibberish back,
00157   // but no error. This can occur if we have either a driver or 
00158   // CUDA runtime that's badly mismatched.
00159   if (devcount > 100 || devcount < 0)
00160     return VMDCUDA_ERR_DRVMISMATCH;
00161 
00162 #if 0
00163   // XXX we can eliminate this code now since device emulation
00164   //     went away back in the very early days of CUDA.  This extra
00165   //     pass over all of the device info just slows down our initialization
00166   //     on machines like a DGX-2 that have a huge number of GPUs...
00167 
00168   // disregard emulation mode as unusable for our purposes
00169   int usabledevs=0;
00170   int i;
00171   for (i=0; i<devcount; i++) {
00172     int devmajor, devminor, rc;
00173 
00174     rc = vmd_cuda_device_props(i, NULL, 0, &devmajor, &devminor, NULL, 
00175                                NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
00176                                NULL, NULL, NULL);
00177 
00178     if (rc == VMDCUDA_ERR_NONE) {
00179       // Check for emulation mode devices, and ignore if found
00180       if (((devmajor >= 1) && (devminor >= 0)) &&
00181           ((devmajor != 9999) && (devminor != 9999))) {
00182         usabledevs++;
00183       }
00184     } else if (rc != VMDCUDA_ERR_EMUDEVICE) {
00185       return VMDCUDA_ERR_SOMEDEVICES;
00186     }
00187   } 
00188   *numdev = usabledevs;
00189 #else
00190   *numdev = devcount;
00191 #endif
00192 
00193   return VMDCUDA_ERR_NONE;
00194 }
00195 
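// A minimal sketch (illustrative only, not part of VMD) of how a caller
// might combine vmd_cuda_num_devices() and vmd_cuda_device_props() above;
// the example_list_gpus() name is hypothetical, and unneeded outputs are
// passed as NULL exactly as the device query code above allows.
#if 0
static void example_list_gpus(void) {
  int numdev = 0;
  if (vmd_cuda_num_devices(&numdev) != VMDCUDA_ERR_NONE)
    return;

  int i;
  for (i=0; i<numdev; i++) {
    char name[256];
    int devmajor=0, devminor=0, smcount=0;
    unsigned long memb=0;
    if (vmd_cuda_device_props(i, name, (int) sizeof(name), &devmajor, &devminor,
                              &memb, NULL, &smcount, NULL, NULL, NULL, NULL,
                              NULL, NULL, NULL, NULL) == VMDCUDA_ERR_NONE) {
      printf("GPU[%d]: %s, compute %d.%d, %d SMs, %lu bytes of global memory\n",
             i, name, devmajor, devminor, smcount, memb);
    }
  }
}
#endif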
00196 
00197 int vmd_cuda_peer_matrix(int *numdev, 
00198                          int **p2pmat, 
00199                          int **p2psupp,
00200                          int **p2patomics,
00201                          int **p2parrays, 
00202                          int **perfmat,
00203                          int *p2plinkcount,
00204                          int *islandcount) {
00205   *p2plinkcount = 0;
00206   *numdev = 0;
00207   *p2pmat = NULL;
00208   *p2psupp = NULL;
00209   *p2patomics = NULL;
00210   *p2parrays = NULL;
00211   *perfmat = NULL;
00212   *islandcount = 0;
00213 
00214   int rc = vmd_cuda_num_devices(numdev);
00215   if (rc == VMDCUDA_ERR_NONE) {
00216     // check for bogus GPU count values...
00217     int N = (*numdev);
00218     if (N < 1 || N > 1024)
00219       return VMDCUDA_ERR_GENERAL; 
00220 
00221     int i, j;
00222     int sz = N*N;
00223 
00224     // allocate and compute GPU P2P connectivity/perf/etc matrices
00225     *p2pmat = (int*) calloc(1, sz * sizeof(int));
00226     *p2psupp = (int*) calloc(1, sz * sizeof(int));
00227     *p2patomics = (int*) calloc(1, sz * sizeof(int));
00228     *p2parrays = (int*) calloc(1, sz * sizeof(int));
00229     *perfmat = (int*) calloc(1, sz * sizeof(int));
00230     for (j=0; j<N; j++) {
00231       for (i=j+1; i<N; i++) {  // symmetric matrix...
00232         cudaError_t err = cudaSuccess;
00233         int canAccessPeer = 0;
00234         int supported = 0;
00235         int atomics = 0;
00236         int arrays = 0;
00237         int perfrank = -1;
00238 
00239 #if 0
00240 printf("Checking[%d][%d]", j, i);
00241 #endif
00242         err = cudaDeviceCanAccessPeer(&canAccessPeer, i, j);
00243         err = cudaDeviceGetP2PAttribute(&supported, cudaDevP2PAttrAccessSupported, i, j);
00244         err = cudaDeviceGetP2PAttribute(&atomics, cudaDevP2PAttrNativeAtomicSupported, i, j);
00245 #if CUDA_VERSION > 9000
00246         err = cudaDeviceGetP2PAttribute(&arrays, cudaDevP2PAttrCudaArrayAccessSupported, i, j);
00247 #endif
00248         err = cudaDeviceGetP2PAttribute(&perfrank, cudaDevP2PAttrPerformanceRank, i, j);
00249         if (err != cudaSuccess) {
00250           free(*p2pmat);
00251           free(*p2psupp);
00252           free(*p2patomics);
00253           free(*p2parrays); 
00254           free(*perfmat);
00255           return VMDCUDA_ERR_GENERAL; 
00256         }
00257 
00258         // symmetric matrices...
00259         int idx = j*N + i;
00260         int symidx = i*N + j;
00261     
00262         (*p2pmat)[idx] = canAccessPeer;
00263         (*p2pmat)[symidx] = canAccessPeer; 
00264 
00265         (*p2psupp)[idx] = supported;
00266         (*p2psupp)[symidx] = supported;
00267 
00268         (*p2patomics)[idx] = atomics;
00269         (*p2patomics)[symidx] = atomics;
00270 
00271         (*p2parrays)[idx] = arrays;
00272         (*p2parrays)[symidx] = arrays;
00273 
00274         (*perfmat)[idx] = perfrank;
00275         (*perfmat)[symidx] = perfrank;
00276 
00277         // count up the total number of P2P links
00278         if (canAccessPeer && supported)
00279           (*p2plinkcount)++;
00280       }
00281     }
00282 
00283 #if 0
00284     for (j=0; j<N; j++) {
00285       for (i=0; i<N; i++) {  // symmetric matrix...
00286         int idx = j*N + i;
00287         printf("P2P[%2d][%2d]: %d, sup %d, atomics %d, arrays %d, perf %d\n", 
00288                j, i, (*p2pmat)[idx], (*p2psupp)[idx], (*p2patomics)[idx], 
00289                (*p2parrays)[idx], (*perfmat)[idx]);
00290       }
00291     }
00292 #endif
00293 
00294     // Once we have the GPU P2P connectivity matrix we
00295     // can compute the number of GPU P2P "islands" via DFS
00296     int *flags = (int*) calloc(1, N * sizeof(int));
00297     IntStackHandle s;
00298     s = intstack_create(N);
00299     int islands, l;
00300     for (islands=0,l=0; l<N; l++) {
00301       // visit GPUs that haven't yet been accounted for
00302       if (!flags[l]) { 
00303         intstack_push(s, l);
00304 
00305         // Loop over all GPUs we have P2P links to in the same island
00306         while (!intstack_pop(s, &j)) {
00307           flags[j] = 1; // mark as visited 
00308 
00309           // check connectivity with peers
00310           for (i=0; i<N; i++) {
00311             // find GPUs connected to GPU j
00312             int idx = j*N + i;
00313             if ((*p2pmat)[idx]==1 && (*p2psupp)[idx]==1) {
00314               // push unvisited GPUs onto the stack
00315               if (flags[i] == 0) 
00316                 intstack_push(s, i);
00317             }
00318           }
00319         }
00320 
00321         islands++;
00322       } 
00323     } 
00324     intstack_destroy(s);
00325     free(flags);
00326     *islandcount = islands;
00327   }
00328   
00329   return rc;
00330 }
00331 
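// The "islands" computed above are just the connected components of the
// GPU P2P adjacency matrix.  The self-contained sketch below (illustrative
// only; count_p2p_islands() is a hypothetical name, and a plain array is
// used in place of the IntStack above) shows the same depth-first traversal
// over an N x N 0/1 connectivity matrix.
#if 0
static int count_p2p_islands(const int *adj, int N) {
  int *visited = (int *) calloc(1, N * sizeof(int));
  int *stack = (int *) malloc(N * sizeof(int));
  int islands = 0;
  int i, j, l, top;
  for (l=0; l<N; l++) {
    if (visited[l])
      continue;              // GPU already assigned to an island
    top = 0;
    stack[top++] = l;        // seed the DFS with an unvisited GPU
    visited[l] = 1;
    while (top > 0) {
      j = stack[--top];
      for (i=0; i<N; i++) {
        if (adj[j*N + i] && !visited[i]) {
          visited[i] = 1;    // mark at push time so each GPU is pushed once
          stack[top++] = i;
        }
      }
    }
    islands++;               // one connected component fully traversed
  }
  free(stack);
  free(visited);
  return islands;            // e.g. 1 when all GPUs are mutually reachable
}
#endif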
00332 
00333 void * vmd_cuda_devpool_setdeviceonly(void * voidparms) {
00334   int count, id, dev;
00335   char *mesg;
00336 
00337   wkf_threadpool_worker_getid(voidparms, &id, &count);
00338   wkf_threadpool_worker_getdata(voidparms, (void **) &mesg);
00339   wkf_threadpool_worker_getdevid(voidparms, &dev);
00340 
00341   // mark CPU-GPU management threads for display in profiling tools
00342   char threadname[1024];
00343   sprintf(threadname, "VMD GPU threadpool[%d]", id);
00344   PROFILE_NAME_THREAD(threadname);
00345 
00346   /* set active device */
00347   cudaSetDevice(dev);
00348 
00349   if (mesg != NULL)
00350     printf("devpool thread[%d / %d], device %d message: '%s'\n", id, count, dev, mesg);
00351 
00352   return NULL;
00353 }
00354 
00355 
00356 // Permanently bind worker threads to CPUs to achieve 
00357 // peak performance and reduce run-to-run jitter for
00358 // graphics-related algorithms.  This code simply performs
00359 // a one-to-one mapping of threads to CPUs at present, and
00360 // doesn't yet handle other cases with any more sophisticated logic.
00361 void * vmd_cuda_affinitize_threads(void *voidparms) {
00362   int tid, numthreads;
00363   wkf_threadpool_worker_getid(voidparms, &tid, &numthreads);
00364   int physcpus = wkf_thread_numphysprocessors();
00365   int setthreadaffinity=1; // enable affinity assignment by default
00366 
00367   int devcount;
00368   cudaError_t crc = cudaGetDeviceCount(&devcount);
00369 
00370   int devid = 0;
00371   wkf_threadpool_worker_getdevid(voidparms, &devid);
00372 
00373   int cpuspergpu = (physcpus/2) / devcount; // XXX HT/SMT hack
00374   int workerthreadspergpu = numthreads / devcount;
00375   
00376   int verbose = (getenv("VMDCUDACPUAFFINITYVERBOSE") != NULL);
00377 
00378   // XXX need to add some kind of check to skip trying to set affinity
00379   //     if the OS, batch system, or other external CPU set management
00380   //     system has already restricted the set of CPUs we can use
00381 
00382   if ((physcpus > 0) && setthreadaffinity) {
00383     int taffinity=0;
00384 
00385 #if 1
00386     // On Intel/AMD hardware, physically distinct CPU cores are numbered 
00387     // consecutively, and additional SMT "cores" appear as additional multiples
00388     // of the physical CPU core count.  It is therefore sufficient to 
00389     // use consecutive CPU indices to spread worker threads fully across CPUs
00390     // and SMT hardware threads
00391     taffinity=(devid * cpuspergpu) + (tid % workerthreadspergpu); // set affinity of tid to same CPU id if numthreads == cpus
00392 #endif
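    // Hypothetical worked example of the mapping above (numbers are
    // illustrative, not from a real machine): with 64 logical CPUs
    // (32 physical after the HT/SMT halving), 4 GPUs, and 8 worker
    // threads, cpuspergpu == 8 and workerthreadspergpu == 2, so the
    // GPU[2] worker with tid 5 is pinned to CPU (2*8) + (5%2) == 17.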
00393 
00394     if (verbose)
00395       printf("Affinitizing GPU[%d] worker thread[%d] to CPU[%d]...\n", 
00396              devid, tid, taffinity);
00397 
00398     wkf_thread_set_self_cpuaffinity(taffinity);
00399   }
00400 
00401   // mark CPU threads for display in profiling tools
00402   char threadname[1024];
00403   sprintf(threadname, "VMD GPU threadpool[%d]", tid);
00404   PROFILE_NAME_THREAD(threadname);
00405 
00406   return NULL;
00407 }
00408 
00409 
00410 
00411 /* XXX hard-coded device indexing scheme, needs to use abstractions */
00412 void * vmd_cuda_devpool_enable_P2P(void * voidparms) {
00413   int threadid;
00414   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00415 
00416   int i;
00417   if (threadid == 0)
00418     printf("Enabling all-to-all GPU Peer Access for DGX-2\n");
00419 
00420   int devcount;
00421   cudaError_t crc = cudaGetDeviceCount(&devcount);
00422 
00423   int devid = 0;
00424   wkf_threadpool_worker_getdevid(voidparms, &devid);
00425 
00426   for (i=0; i<devcount; i++) {
00427     if (i != devid) {
00428       cudaError_t cuerr = cudaDeviceEnablePeerAccess(i, 0);
00429       if (cuerr != cudaSuccess) {
00430         printf("Thr[%2d] Error enabling peer access to GPU %d\n", threadid, i);
00431       }
00432     }
00433   }
00434 
00435   return NULL;
00436 }
00437 
00438 
00439 void * vmd_cuda_devpool_setdevice(void * voidparms) {
00440   int count, id, dev;
00441   cudaDeviceProp deviceProp;
00442   char *mesg;
00443   char *d_membuf;
00444   cudaError_t err;
00445 
00446   wkf_threadpool_worker_getid(voidparms, &id, &count);
00447   wkf_threadpool_worker_getdata(voidparms, (void **) &mesg);
00448   wkf_threadpool_worker_getdevid(voidparms, &dev);
00449 
00450   // XXX This point within the CPU thread GPU device association 
00451   //     process is the right point at which to enforce CPU thread
00452   //     affinity or affinity masks that are based on the host platform's
00453   //     NUMA, PCIe, and NVLink interconnect topology.  We should assign
00454   //     a CPU thread affinity or affinity mask based on a combination of:
00455   //
00456   //     A) Incoming thread affinity masks set by the job scheduler and
00457   //        subsequently enforced by the OS (e.g. IBM 'jsrun' on ORNL Summit)
00458   //
00459   //     B) The CPU affinity mask associated with each GPU device as
00460   //        reported by the NVML API.
00461   //
00462   //     C) An indexing scheme for the target CPU's logical cores
00463   //        that distributes GPU host threads among CPU cores, while
00464   //        avoiding co-scheduling multiple GPU management host threads
00465   //        on the same physical CPU cores.  This is an extra portability
00466   //        issue even among different CPU architectures on Linux, as the CPU 
00467   //        SMT logical core indexing schemes used by IBM and by Intel are
00468   //        completely incompatible with each other:  
00469   //        C1) Intel lists CPU physical cores consecutively, followed 
00470   //            by their hyperthread logical cores.  So, to use only physical
00471   //            CPU cores, where N is the physical core count, one would use
00472   //            logical core indices 0 to N-1.
00473   //        C2) IBM lists POWER CPU physical cores followed consecutively by
00474   //            their associated logical SMT cores, then followed by the
00475   //            next physical cores, etc.  So, to use only physical
00476   //            CPU cores, where N is the physical core count, and S is the
00477   //            max SMT depth of each CPU, one would use logical core indices
00478   //            0, (S), (2*S), (3*S), ... ((N-1)*S).
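  //        As a hypothetical illustration with N=4 physical cores:
  //        with Intel-style 2-way hyperthreading the physical cores are
  //        logical indices 0,1,2,3 (their SMT siblings being 4,5,6,7),
  //        whereas with IBM-style 4-way SMT (S=4) the physical cores are
  //        logical indices 0,4,8,12 (the SMT siblings of core 0 being
  //        0,1,2,3).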
00479 
00480   // mark CPU-GPU management threads for display in profiling tools
00481   char threadname[1024];
00482   sprintf(threadname, "VMD GPU threadpool[%d]", id);
00483   PROFILE_NAME_THREAD(threadname);
00484 
00485   /* set active device */
00486   cudaSetDevice(dev);
00487 
00488   /* Query SM count and clock rate, and compute a speed scaling value */
00489   /* the current code uses a GeForce GTX 280 / Tesla C1060 as the     */
00490   /* "1.0" reference value, with 30 SMs, and a 1.3 GHz clock rate     */ 
00491   memset(&deviceProp, 0, sizeof(cudaDeviceProp));
00492   if (cudaGetDeviceProperties(&deviceProp, dev) == cudaSuccess) {
00493     float smscale = ((float) deviceProp.multiProcessorCount) / 30.0f;
00494     double clockscale = ((double) deviceProp.clockRate) / 1295000.0;
00495     float speedscale = smscale * ((float) clockscale);
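    // For example, a hypothetical GPU with 80 SMs and a 1.5 GHz clock
    // (clockRate == 1500000 kHz) would get
    // speedscale ~= (80/30) * (1500000/1295000) ~= 3.1x the reference GPU.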
00496 
00497 #if 0
00498     printf("clock rate: %lf\n", (double) deviceProp.clockRate);
00499     printf("scale: %.4f smscale: %.4f clockscale: %.4f\n", 
00500            speedscale, smscale, clockscale);  
00501 #endif
00502 
00503     if (deviceProp.canMapHostMemory != 0) {
00504 #if 0
00505       printf("Enabled mapped host memory on device[%d]\n", dev);
00506 #endif
00507 
00508       /* 
00509        * set blocking/yielding API behavior and enable mapped host memory
00510        * If this fails, then either we've got a problematic device, or 
00511        * we have already set the device flags within this thread (shouldn't
00512        * ever happen), or the device we're accessing doesn't actually support
00513        * mapped host memory (shouldn't ever happen since we check for that).
00514        */
00515 
00516 #if defined(VMDLIBOPTIX)
00517       // when compiled with OptiX enabled, we tell the CUDA runtime to 
00518       // maintain the peak local memory size that occurred at runtime
00519       // to avoid thrashing with difficult scenes
00520       err = cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost | cudaDeviceLmemResizeToMax);
00521 #else
00522       err = cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost);
00523 #endif
00524       if (err != cudaSuccess) {
00525         printf("Warning) thread[%d] can't set GPU[%d] device flags\n", id, dev);
00526         printf("Warning) CUDA error: %s\n", cudaGetErrorString(err)); 
00527       }
00528     }
00529 
00530     wkf_threadpool_worker_setdevspeed(voidparms, speedscale);
00531 
00532     /* 
00533      * Do a small 1MB device memory allocation to ensure that our context
00534      * has actually been initialized by the time we return.
00535      * If this tiny allocation fails, then something is seriously wrong
00536      * and we should mark this device as unusable for the rest of 
00537      * this VMD session.
00538      */
00539     if ((err = cudaMalloc((void **) &d_membuf, 1*1024*1024)) == cudaSuccess) {
00540       cudaFree(d_membuf); 
00541     } else {
00542       printf("Warning) thread[%d] can't init GPU[%d] found by device query\n", id, dev); 
00543       printf("Warning) CUDA error: %s\n", cudaGetErrorString(err));
00544       /* 
00545        * XXX we should mark the device unusable here so that no other code
00546        *     touches it, but we have no mechanism for doing that yet...
00547        */
00548     }
00549   }
00550 
00551   if (mesg != NULL)
00552     printf("devpool thread[%d / %d], device %d message: '%s'\n", id, count, dev, mesg);
00553 
00554   return NULL;
00555 }
00556 
00557 
00558 
00559 #if defined(__cplusplus)
00560 }
00561 #endif
00562 
00563 
