00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include "config.h"
00035 #include "Inform.h"
00036 #include "ResizeArray.h"
00037 #include "CUDAAccel.h"
00038 #include "CUDAKernels.h"
00039 #include "WKFThreads.h"
00040 #include "ProfileHooks.h"
00041
// Constructor: probe the machine for CUDA accelerators and cache their
// properties for later queries.  Honors several environment variables:
//   VMDNOCUDA            - disable CUDA entirely
//   VMDCUDADEVICEMASK    - hex bitmask selecting which physical GPUs to use
//   VMDCUDANODISPLAYGPUS - exclude GPUs driving displays (kernel timeout set)
// On success, devprops holds one entry per *usable* device, numdevices is
// the usable count, numphysdevices the raw physical count, and the CUDA
// worker thread pool is spun up via devpool_init().
CUDAAccel::CUDAAccel(void) {
  // start from a fully-disabled state; only flip cudaavail on once a
  // successful device enumeration has completed below
  cudaavail = 0;
  numdevices = 0;
  numphysdevices = 0;

  nvmlh=NULL;
  cudapool=NULL;

  // user-requested opt-out: leave the object in the disabled state
  if (getenv("VMDNOCUDA") != NULL) {
    msgInfo << "VMDNOCUDA environment variable is set, CUDA support disabled."
            << sendmsg;
    return;
  }

#if defined(VMDCUDA)
  PROFILE_PUSH_RANGE("CUDAAccel::CUDAAccel()", 0);

  // parse the optional hex device-inclusion bitmask; bit i enables
  // physical device i.  Default is all devices enabled.
  unsigned int gpumask = 0xffffffff;
  const char *gpumaskstr = getenv("VMDCUDADEVICEMASK");
  if (gpumaskstr != NULL) {
    unsigned int tmp;
    if (sscanf(gpumaskstr, "%x", &tmp) == 1) {
      gpumask = tmp;
      msgInfo << "Using GPU device mask '"
              << gpumaskstr << "'" << sendmsg;
    } else {
      // unparseable mask: warn and fall back to all-devices default
      msgInfo << "Failed to parse CUDA GPU device mask string '"
              << gpumaskstr << "'" << sendmsg;
    }
  }

  // enumerate physical devices; on any failure, reset counts and bail out
  int usabledevices = 0;
  int rc = 0;
  if ((rc=vmd_cuda_num_devices(&numphysdevices)) != VMDCUDA_ERR_NONE) {
    numdevices = 0;
    numphysdevices = 0;

    // only driver/runtime mismatches merit a user-visible warning;
    // "no devices" is a normal condition and stays silent
    switch (rc) {
      case VMDCUDA_ERR_NODEVICES:
      case VMDCUDA_ERR_SOMEDEVICES:

        break;

#if 0
      case VMDCUDA_ERR_SOMEDEVICES:
        msgWarn << "One or more CUDA accelerators may exist but are not usable." << sendmsg;
        msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg;
        break;
#endif

      case VMDCUDA_ERR_DRVMISMATCH:
        msgWarn << "Detected a mismatch between CUDA runtime and GPU driver" << sendmsg;
        msgWarn << "Check to make sure that GPU drivers are up to date." << sendmsg;

        break;
    }

    PROFILE_POP_RANGE();
    return;
  }

  // load the NVML wrapper (may be NULL if the library isn't available);
  // used later for GPU-CPU affinity queries
  nvmlh = wrap_nvml_create();

  if (numphysdevices > 0) {
    cudaavail = 1;

    // query per-device properties, keeping only devices that pass the
    // usability filters below
    int i;
    for (i=0; i<numphysdevices; i++) {
      cudadevprops dp;
      memset(&dp, 0, sizeof(dp));
      if (!vmd_cuda_device_props(i, dp.name, sizeof(dp.name),
                                 &dp.major, &dp.minor,
                                 &dp.membytes, &dp.clockratekhz,
                                 &dp.smcount, &dp.integratedgpu,
                                 &dp.asyncenginecount,
                                 &dp.kernelexectimeoutenabled,
                                 &dp.canmaphostmem, &dp.computemode,
                                 &dp.spdpfpperfratio,
                                 &dp.pageablememaccess,
                                 &dp.pageablememaccessuseshostpagetables)) {
        dp.deviceid=i;  // remember the physical device index

        // a device is usable unless: it drives a display (kernel timeout)
        // while VMDCUDANODISPLAYGPUS is set, it is masked out by the
        // device bitmask, or its compute mode prohibits kernel launches
        if (!(dp.kernelexectimeoutenabled && getenv("VMDCUDANODISPLAYGPUS")) &&
            (gpumask & (1 << i)) &&
            (dp.computemode != computeModeProhibited)) {
          devprops.append(dp);
          usabledevices++;
        }
      } else {
        msgWarn << "  Failed to retrieve properties for CUDA accelerator " << i << sendmsg;
      }
    }
  }

  // numdevices reflects only usable devices; indices into devprops run
  // 0..numdevices-1 and map to physical ids via devprops[].deviceid
  numdevices=usabledevices;

  // spin up the per-GPU worker thread pool now that devices are known
  devpool_init();

  PROFILE_POP_RANGE();
#endif
}
00173
00174
00175
00176 CUDAAccel::~CUDAAccel(void) {
00177 devpool_fini();
00178
00179 #if defined(VMDCUDA)
00180
00181 if (nvmlh != NULL) {
00182 wrap_nvml_destroy(nvmlh);
00183 }
00184 #endif
00185 }
00186
00187
00188 void CUDAAccel::devpool_init(void) {
00189 cudapool=NULL;
00190
00191 #if defined(VMDCUDA)
00192 PROFILE_PUSH_RANGE("CUDAAccel::devpool_init()", 0);
00193
00194
00195
00196 if (!cudaavail || numdevices == 0 || getenv("VMDNOCUDA") != NULL)
00197 return;
00198
00199
00200 int workercount=numdevices;
00201 if (workercount > wkf_thread_numprocessors())
00202 workercount=wkf_thread_numprocessors();
00203
00204 int *devlist = new int[workercount];
00205 int i;
00206 for (i=0; i<workercount; i++) {
00207 devlist[i]=device_index(i);
00208 }
00209
00210 msgInfo << "Creating CUDA device pool and initializing hardware..." << sendmsg;
00211 cudapool=wkf_threadpool_create(workercount, devlist);
00212 delete [] devlist;
00213
00214
00215 if (getenv("VMDCUDAVERBOSE") != NULL)
00216 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, (void*)"VMD CUDA Dev Init", 1);
00217 else
00218 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_setdevice, NULL, 1);
00219
00220
00221 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_clear_device_mem, NULL, 1);
00222
00223
00224
00225
00226 if (getenv("VMDCUDAP2PENABLE") != NULL) {
00227 msgInfo << "Enabling DGX-2 fully-connected NVLink GPU P2P..." << sendmsg;
00228 wkf_threadpool_launch(cudapool, vmd_cuda_devpool_enable_P2P, NULL, 1);
00229 }
00230
00231 PROFILE_POP_RANGE();
00232 #endif
00233 }
00234
00235 void CUDAAccel::devpool_fini(void) {
00236 if (!cudapool)
00237 return;
00238
00239 #if defined(VMDCUDA)
00240 devpool_wait();
00241 wkf_threadpool_destroy(cudapool);
00242 #endif
00243 cudapool=NULL;
00244 }
00245
00246 int CUDAAccel::devpool_launch(void *fctn(void *), void *parms, int blocking) {
00247 if (!cudapool)
00248 return -1;
00249
00250 return wkf_threadpool_launch(cudapool, fctn, parms, blocking);
00251 }
00252
00253 int CUDAAccel::devpool_wait(void) {
00254 if (!cudapool)
00255 return -1;
00256
00257 return wkf_threadpool_wait(cudapool);
00258 }
00259
00260 void CUDAAccel::print_cuda_devices(void) {
00261 if (getenv("VMDCUDANODISPLAYGPUS")) {
00262 msgInfo << "Ignoring CUDA-capable GPUs used for display" << sendmsg;
00263 }
00264
00265 if (!cudaavail || numdevices == 0) {
00266 msgInfo << "No CUDA accelerator devices available." << sendmsg;
00267 return;
00268 }
00269
00270 if (nvmlh == NULL) {
00271 msgInfo << "Unable to load NVML library, GPU-CPU affinity unavailable." << sendmsg;
00272 }
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287 int p2plinkcount=0, p2pislands=0;
00288 #if defined(VMDCUDA)
00289 int numdev=0;
00290 int *p2pmat=NULL;
00291 int *p2psupp=NULL;
00292 int *p2patomics=NULL;
00293 int *p2parrays=NULL;
00294 int *perfmat=NULL;
00295
00296 if (vmd_cuda_peer_matrix(&numdev, &p2pmat, &p2psupp, &p2patomics, &p2parrays,
00297 &perfmat, &p2plinkcount, &p2pislands) != VMDCUDA_ERR_NONE) {
00298 msgWarn << "Unable to ascertain GPU peer-to-peer connectivity" << sendmsg;
00299 }
00300
00301 if (p2pmat)
00302 free(p2pmat);
00303 if (p2psupp)
00304 free(p2psupp);
00305 if (p2patomics)
00306 free(p2patomics);
00307 if (p2parrays)
00308 free(p2parrays);
00309 if (perfmat)
00310 free(perfmat);
00311 #endif
00312
00313
00314 msgInfo << "Detected " << numdevices << " available CUDA "
00315 << ((numdevices > 1) ? "accelerators" : "accelerator:");
00316
00317
00318 if (p2plinkcount > 0) {
00319 msgInfo << ", "
00320 << p2plinkcount << ((p2plinkcount > 1) ? " P2P links, " : " P2P link, ")
00321 << p2pislands << ((p2pislands > 1) ? " islands" : " island");
00322 }
00323
00324 msgInfo << ":" << sendmsg;
00325
00326
00327 char oldstr[1024], outstr[1024], gpustr[1024], idxprefix[1024];
00328 int idxrangecount=0,firstidx=-1, lastidx=-1;
00329 const char *idxfmtstring10gpus = "[%d]";
00330 const char *idxfmtspaces10gpus = " ";
00331 const char *idxfmtstring100gpus = "[%2d]";
00332 const char *idxfmtspaces100gpus = " ";
00333 const char *gpuidxfmtstring, *gpuidxfmtspaces;
00334
00335 #if 0
00336 int outputlineperdevice = 1;
00337 #else
00338 int outputlineperdevice = (getenv("VMDCUDAOUTPUTLINEPERDEVICE") != NULL);
00339 #endif
00340
00341
00342
00343 if (device_index(numdevices-1) > 10) {
00344 gpuidxfmtstring = idxfmtstring100gpus;
00345 gpuidxfmtspaces = idxfmtspaces100gpus;
00346 } else {
00347 gpuidxfmtstring = idxfmtstring10gpus;
00348 gpuidxfmtspaces = idxfmtspaces10gpus;
00349 }
00350
00351 memset(oldstr, 0, sizeof(oldstr));
00352 memset(gpustr, 0, sizeof(gpustr));
00353 memset(idxprefix, 0, sizeof(idxprefix));
00354
00355 int i;
00356 int shiftgpuidx=0;
00357 for (i=0; i<numdevices; i++) {
00358 memset(outstr, 0, sizeof(outstr));
00359
00360
00361 const char *devname = device_name(i);
00362 sprintf(gpustr, " %-20s %2d SM_%d.%d %.1f GHz",
00363 (devname) ? devname : "NULL Device Name!",
00364 (device_sm_count(i) > 0) ? device_sm_count(i) : 0,
00365 device_version_major(i), device_version_minor(i),
00366 device_clock_ghz(i));
00367 strcpy(outstr, gpustr);
00368
00369
00370 int gpumemmb = (device_membytes(i) / (1024 * 1024));
00371 if (gpumemmb < 1000) {
00372 sprintf(gpustr, ", %4dMB RAM", gpumemmb);
00373 } else if (gpumemmb < 10240) {
00374 sprintf(gpustr, ", %.1fGB RAM", gpumemmb / 1024.0);
00375 } else {
00376
00377 sprintf(gpustr, ", %dGB RAM", (gpumemmb + 512) / 1024);
00378 }
00379 strcat(outstr, gpustr);
00380
00381
00382 if (device_computemode(i) == computeModeProhibited) {
00383 strcat(outstr, ", Compute Mode: Prohibited");
00384 } else {
00385 int sfpr = device_spdpfpperfratio(i);
00386 if (sfpr > 2) {
00387 sprintf(gpustr, " SP%d", sfpr);
00388 strcat(outstr, gpustr);
00389 }
00390
00392 if (device_integratedgpu(i)) {
00393 strcat(outstr, " IGPU");
00394 }
00395
00398 if (device_kerneltimeoutenabled(i)) {
00399 strcat(outstr, " KT");
00400 }
00401
00403 if (device_asyncenginecount(i)) {
00404 sprintf(gpustr, " AE%d", device_asyncenginecount(i));
00405 strcat(outstr, gpustr);
00406 }
00407
00409 if (device_canmaphostmem(i))
00410 strcat(outstr, " ZC");
00411
00414 if (device_pageablememaccess(i)) {
00417 if (device_pageablememaccessuseshostpagetables(i))
00418 strcat(outstr, " PMT");
00419 else
00420 strcat(outstr, " PM");
00421 }
00422 }
00423
00424 if (outputlineperdevice) {
00425
00426 sprintf(idxprefix, gpuidxfmtstring, device_index(i));
00427 msgInfo << idxprefix << outstr << sendmsg;
00428 } else {
00429
00430
00431
00432 int newidx = device_index(i);
00433 if (!strcmp(oldstr, outstr)) {
00434
00435
00436 if ((newidx - lastidx) > 1) {
00437 if (lastidx > firstidx) {
00438 sprintf(idxprefix, "%d-%d", firstidx, lastidx);
00439 shiftgpuidx=1;
00440 } else {
00441 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx);
00442 }
00443
00444 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
00445 idxrangecount++;
00446 firstidx = newidx;
00447 lastidx = newidx;
00448 }
00449 lastidx=newidx;
00450 } else {
00451 if (firstidx < 0) {
00452 firstidx = newidx;
00453 lastidx = newidx;
00454 strcpy(oldstr, outstr);
00455 continue;
00456 }
00457
00458 if (lastidx > firstidx) {
00459 sprintf(idxprefix, "%d-%d", firstidx, lastidx);
00460 shiftgpuidx=1;
00461 } else {
00462 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx);
00463 }
00464 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
00465 msgInfo << "]" << oldstr << sendmsg;
00466
00467 idxrangecount = 0;
00468 firstidx = newidx;
00469 lastidx = newidx;
00470 strcpy(oldstr, outstr);
00471 memset(outstr, 0, sizeof(outstr));
00472 }
00473 }
00474 }
00475
00476 if (!outputlineperdevice) {
00477 if (lastidx > firstidx) {
00478 sprintf(idxprefix, "%d-%d", firstidx, lastidx);
00479 } else {
00480 sprintf(idxprefix, "%s%d", (shiftgpuidx) ? " " : "", firstidx);
00481 }
00482 msgInfo << ((idxrangecount == 0) ? "[" : ",") << idxprefix;
00483 msgInfo << "]";
00484 if (idxrangecount > 2) {
00485 msgInfo << ":" << sendmsg;
00486 msgInfo << gpuidxfmtspaces;
00487 }
00488 msgInfo << oldstr << sendmsg;
00489 }
00490 }
00491
// Return the number of usable CUDA devices (after mask/display/compute
// mode filtering); 0 when CUDA is unavailable or disabled.
int CUDAAccel::num_devices(void) {
  return numdevices;
}
00495
00496 int CUDAAccel::device_index(int dev) {
00497 return devprops[dev].deviceid;
00498 }
00499
00500 const char *CUDAAccel::device_name(int dev) {
00501 if (!cudaavail || dev < 0 || dev >= numdevices)
00502 return NULL;
00503 return devprops[dev].name;
00504 }
00505
00506 int CUDAAccel::device_version_major(int dev) {
00507 if (!cudaavail || dev < 0 || dev >= numdevices)
00508 return 0;
00509 return devprops[dev].major;
00510 }
00511
00512 int CUDAAccel::device_version_minor(int dev) {
00513 if (!cudaavail || dev < 0 || dev >= numdevices)
00514 return 0;
00515 return devprops[dev].minor;
00516 }
00517
00518 unsigned long CUDAAccel::device_membytes(int dev) {
00519 if (!cudaavail || dev < 0 || dev >= numdevices)
00520 return 0;
00521 return devprops[dev].membytes;
00522 }
00523
00524 float CUDAAccel::device_clock_ghz(int dev) {
00525 if (!cudaavail || dev < 0 || dev >= numdevices)
00526 return 0;
00527 return (float) (devprops[dev].clockratekhz / 1000000.0);
00528 }
00529
00530 int CUDAAccel::device_sm_count(int dev) {
00531 if (!cudaavail || dev < 0 || dev >= numdevices)
00532 return -1;
00533 return devprops[dev].smcount;
00534 }
00535
00536 int CUDAAccel::device_integratedgpu(int dev) {
00537 if (!cudaavail || dev < 0 || dev >= numdevices)
00538 return -1;
00539 return devprops[dev].integratedgpu;
00540 }
00541
00542 int CUDAAccel::device_asyncenginecount(int dev) {
00543 if (!cudaavail || dev < 0 || dev >= numdevices)
00544 return -1;
00545 return devprops[dev].asyncenginecount;
00546 }
00547
00548 int CUDAAccel::device_kerneltimeoutenabled(int dev) {
00549 if (!cudaavail || dev < 0 || dev >= numdevices)
00550 return -1;
00551 return devprops[dev].kernelexectimeoutenabled;
00552 }
00553
00554 int CUDAAccel::device_canmaphostmem(int dev) {
00555 if (!cudaavail || dev < 0 || dev >= numdevices)
00556 return -1;
00557 return devprops[dev].canmaphostmem;
00558 }
00559
00560 int CUDAAccel::device_computemode(int dev) {
00561 if (!cudaavail || dev < 0 || dev >= numdevices)
00562 return -1;
00563 return devprops[dev].computemode;
00564 }
00565
00566 int CUDAAccel::device_spdpfpperfratio(int dev) {
00567 if (!cudaavail || dev < 0 || dev >= numdevices)
00568 return -1;
00569 return devprops[dev].spdpfpperfratio;
00570 }
00571
00572 int CUDAAccel::device_pageablememaccess(int dev) {
00573 if (!cudaavail || dev < 0 || dev >= numdevices)
00574 return -1;
00575 return devprops[dev].pageablememaccess;
00576 }
00577
00578 int CUDAAccel::device_pageablememaccessuseshostpagetables(int dev) {
00579 if (!cudaavail || dev < 0 || dev >= numdevices)
00580 return -1;
00581 return devprops[dev].pageablememaccessuseshostpagetables;
00582 }
00583