00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00021 #include <string.h>
00022 #include <stdio.h>
00023 #include "CUDAKernels.h"
00024 #include "WKFThreads.h"
00025 #include "intstack.h"
00026 #include "ProfileHooks.h"
00027
00028
00029
00030 #if CUDART_VERSION <= 6000
00031 #error VMD requires at least CUDA 6.x or later
00032 #endif
00033
00034 #if defined(__cplusplus)
00035 extern "C" {
00036 #endif
00037
00038
// Check that the installed CUDA kernel driver is new enough to support
// the CUDA runtime library VMD was compiled against.  Returns
// VMDCUDA_ERR_NONE when the pair is usable, VMDCUDA_ERR_NODEVICES when
// no driver is present, or VMDCUDA_ERR_DRVMISMATCH on a version conflict.
static int vmd_cuda_drv_runtime_compatible() {
#if CUDART_VERSION >= 2020
  int cuda_driver_version=-1;
  int cuda_runtime_version=0;

  cudaDriverGetVersion(&cuda_driver_version);
  cudaRuntimeGetVersion(&cuda_runtime_version);

#if 0
  printf("CUDA driver version: %d\n", cuda_driver_version);
  printf("CUDA runtime version: %d\n", cuda_runtime_version);
#endif

  // a driver version of zero indicates that no CUDA driver is installed
  if (cuda_driver_version == 0)
    return VMDCUDA_ERR_NODEVICES;

  // normally the driver must be at least as new as the runtime
  if (cuda_driver_version < cuda_runtime_version) {
#if defined(ARCH_LINUXARM64)
    // whitelisted driver/runtime pairing known to work on this platform
    // despite the version skew
    if (cuda_driver_version == 10010 && cuda_runtime_version == 10020)
      return VMDCUDA_ERR_NONE;
#endif
#if defined(ARCH_LINUXCARMA)
    // whitelisted driver/runtime pairing known to work on CARMA boards
    if (cuda_driver_version == 5000 && cuda_runtime_version == 5050)
      return VMDCUDA_ERR_NONE;
#endif
    return VMDCUDA_ERR_DRVMISMATCH;
  }
#endif

  return VMDCUDA_ERR_NONE;
}
00074
00075
// Query the properties of CUDA device 'dev' and copy the requested fields
// into the caller-provided output pointers.  Any output pointer may be NULL
// to skip that field.  'name' receives at most 'namelen' bytes (always
// NUL-terminated).  Returns VMDCUDA_ERR_NONE on success, or one of the
// VMDCUDA_ERR_* codes on failure.
int vmd_cuda_device_props(int dev, char *name, int namelen,
                          int *devmajor, int *devminor,
                          unsigned long *memb, int *clockratekhz,
                          int *smcount, int *integratedgpu,
                          int *asyncenginecount, int *kerneltimeout,
                          int *canmaphostmem, int *computemode,
                          int *spdpfpperfratio,
                          int *pageablememaccess,
                          int *pageablememaccessuseshostpagetables) {
  cudaError_t rc;
  cudaDeviceProp deviceProp;

  // refuse to make runtime API calls if the driver/runtime pair is unusable
  int vercheck;
  if ((vercheck = vmd_cuda_drv_runtime_compatible()) != VMDCUDA_ERR_NONE) {
    return vercheck;
  }

  memset(&deviceProp, 0, sizeof(cudaDeviceProp));
  if ((rc=cudaGetDeviceProperties(&deviceProp, dev)) != cudaSuccess) {
    // an emulation-only device is reported distinctly so callers can skip it
    if (rc == cudaErrorNotYetImplemented)
      return VMDCUDA_ERR_EMUDEVICE;
    return VMDCUDA_ERR_GENERAL;
  }

  if (name && namelen > 0) {
    strncpy(name, deviceProp.name, namelen);
    // strncpy() does not NUL-terminate when the source fills the buffer,
    // so terminate explicitly to guarantee a valid C string
    name[namelen - 1] = '\0';
  }
  if (devmajor)
    *devmajor = deviceProp.major;
  if (devminor)
    *devminor = deviceProp.minor;
  if (memb)
    *memb = deviceProp.totalGlobalMem;
  if (clockratekhz)
    *clockratekhz = deviceProp.clockRate;
  if (smcount)
    *smcount = deviceProp.multiProcessorCount;
  if (asyncenginecount)
    *asyncenginecount = deviceProp.asyncEngineCount;
  if (kerneltimeout)
    *kerneltimeout = (deviceProp.kernelExecTimeoutEnabled != 0);
  if (integratedgpu)
    *integratedgpu = (deviceProp.integrated != 0);
  if (canmaphostmem)
    *canmaphostmem = (deviceProp.canMapHostMemory != 0);
  if (computemode)
    *computemode = deviceProp.computeMode;
  if (spdpfpperfratio)
    *spdpfpperfratio = deviceProp.singleToDoublePrecisionPerfRatio;
  if (pageablememaccess)
    *pageablememaccess = deviceProp.pageableMemoryAccess;
  if (pageablememaccessuseshostpagetables)
    // this property only exists in CUDA 10.0 and later runtimes
#if CUDA_VERSION >= 10000
    *pageablememaccessuseshostpagetables = deviceProp.pageableMemoryAccessUsesHostPageTables;
#else
    *pageablememaccessuseshostpagetables = 0;
#endif

  return VMDCUDA_ERR_NONE;
}
00137
00138
// Report the number of CUDA devices visible to the runtime in '*numdev'.
// Returns VMDCUDA_ERR_NONE on success, or a VMDCUDA_ERR_* code when the
// driver is missing, mismatched, or the device query fails.
int vmd_cuda_num_devices(int *numdev) {
  int devcount = 0;
  *numdev = 0;

  // bail out immediately if the driver/runtime pairing is unusable,
  // propagating the specific compatibility error to the caller
  int compat = vmd_cuda_drv_runtime_compatible();
  if (compat != VMDCUDA_ERR_NONE)
    return compat;

  if (cudaGetDeviceCount(&devcount) != cudaSuccess)
    return VMDCUDA_ERR_NODEVICES;

  // sanity check: an absurd device count is a symptom of a broken or
  // mismatched driver rather than real hardware
  if (devcount < 0 || devcount > 100)
    return VMDCUDA_ERR_DRVMISMATCH;

  *numdev = devcount;

  return VMDCUDA_ERR_NONE;
}
00195
00196
// Build NxN symmetric peer-to-peer capability matrices for all CUDA devices:
// p2pmat (peer access possible), p2psupp (P2P supported), p2patomics (native
// atomics over P2P), p2parrays (CUDA array access over P2P, CUDA > 9.0 only),
// and perfmat (relative link performance rank).  Also reports the number of
// usable P2P links and the number of connected "islands" of devices.
// All five matrices are calloc()ed here and owned by the caller on success;
// on failure they are freed and reset to NULL.  Returns VMDCUDA_ERR_*.
int vmd_cuda_peer_matrix(int *numdev,
                         int **p2pmat,
                         int **p2psupp,
                         int **p2patomics,
                         int **p2parrays,
                         int **perfmat,
                         int *p2plinkcount,
                         int *islandcount) {
  *p2plinkcount = 0;
  *numdev = 0;
  *p2pmat = NULL;
  *p2psupp = NULL;
  *p2patomics = NULL;
  *p2parrays = NULL;
  *perfmat = NULL;
  *islandcount = 0;

  int rc = vmd_cuda_num_devices(numdev);
  if (rc != VMDCUDA_ERR_NONE)
    return rc;

  int N = (*numdev);
  if (N < 1 || N > 1024)
    return VMDCUDA_ERR_GENERAL;

  int i, j;
  int sz = N*N;

  // allocate all five NxN matrices, zero-initialized so the diagonal and
  // any untouched entries are well-defined
  *p2pmat     = (int*) calloc(1, sz * sizeof(int));
  *p2psupp    = (int*) calloc(1, sz * sizeof(int));
  *p2patomics = (int*) calloc(1, sz * sizeof(int));
  *p2parrays  = (int*) calloc(1, sz * sizeof(int));
  *perfmat    = (int*) calloc(1, sz * sizeof(int));
  if (!*p2pmat || !*p2psupp || !*p2patomics || !*p2parrays || !*perfmat)
    goto fail;   // allocation failure: release and NULL everything

  // query the upper triangle only and mirror results, since P2P
  // attributes are symmetric for our purposes
  for (j=0; j<N; j++) {
    for (i=j+1; i<N; i++) {
      cudaError_t err = cudaSuccess;
      int canAccessPeer = 0;
      int supported = 0;
      int atomics = 0;
      int arrays = 0;
      int perfrank = -1;

      // check each call individually so an early failure is not masked
      // by a later call overwriting 'err'
      err = cudaDeviceCanAccessPeer(&canAccessPeer, i, j);
      if (err == cudaSuccess)
        err = cudaDeviceGetP2PAttribute(&supported, cudaDevP2PAttrAccessSupported, i, j);
      if (err == cudaSuccess)
        err = cudaDeviceGetP2PAttribute(&atomics, cudaDevP2PAttrNativeAtomicSupported, i, j);
#if CUDA_VERSION > 9000
      if (err == cudaSuccess)
        err = cudaDeviceGetP2PAttribute(&arrays, cudaDevP2PAttrCudaArrayAccessSupported, i, j);
#endif
      if (err == cudaSuccess)
        err = cudaDeviceGetP2PAttribute(&perfrank, cudaDevP2PAttrPerformanceRank, i, j);
      if (err != cudaSuccess)
        goto fail;

      // store both (j,i) and the symmetric (i,j) entry
      int idx = j*N + i;
      int symidx = i*N + j;

      (*p2pmat)[idx] = canAccessPeer;
      (*p2pmat)[symidx] = canAccessPeer;

      (*p2psupp)[idx] = supported;
      (*p2psupp)[symidx] = supported;

      (*p2patomics)[idx] = atomics;
      (*p2patomics)[symidx] = atomics;

      (*p2parrays)[idx] = arrays;
      (*p2parrays)[symidx] = arrays;

      (*perfmat)[idx] = perfrank;
      (*perfmat)[symidx] = perfrank;

      // count each usable bidirectional link once
      if (canAccessPeer && supported)
        (*p2plinkcount)++;
    }
  }

  // count connected components ("islands") of P2P-linked devices using a
  // stack-based flood fill over the adjacency matrix
  {
    int *flags = (int*) calloc(1, N * sizeof(int));  // visited markers
    if (!flags)
      goto fail;
    IntStackHandle s;
    s = intstack_create(N);
    int islands, l;
    for (islands=0,l=0; l<N; l++) {
      // start a new island from each not-yet-visited device
      if (!flags[l]) {
        intstack_push(s, l);

        // flood fill: pop a device, mark it, push unvisited P2P neighbors
        while (!intstack_pop(s, &j)) {
          flags[j] = 1;

          for (i=0; i<N; i++) {
            int idx = j*N + i;
            if ((*p2pmat)[idx]==1 && (*p2psupp)[idx]==1) {
              if (flags[i] == 0)
                intstack_push(s, i);
            }
          }
        }

        islands++;
      }
    }
    intstack_destroy(s);
    free(flags);
    *islandcount = islands;
  }

  return rc;

fail:
  // release partial results and reset caller-visible pointers so the
  // caller cannot free or dereference dangling memory
  free(*p2pmat);     *p2pmat = NULL;
  free(*p2psupp);    *p2psupp = NULL;
  free(*p2patomics); *p2patomics = NULL;
  free(*p2parrays);  *p2parrays = NULL;
  free(*perfmat);    *perfmat = NULL;
  return VMDCUDA_ERR_GENERAL;
}
00331
00332
// Threadpool worker hook: bind this worker thread to its assigned CUDA
// device (no further initialization), naming the thread for profilers and
// optionally printing a diagnostic message passed via the worker data slot.
void * vmd_cuda_devpool_setdeviceonly(void * voidparms) {
  int count, id, dev;
  char *mesg;

  // fetch this worker's identity, optional message, and target GPU
  wkf_threadpool_worker_getid(voidparms, &id, &count);
  wkf_threadpool_worker_getdata(voidparms, (void **) &mesg);
  wkf_threadpool_worker_getdevid(voidparms, &dev);

  // label the thread so profiling tools show a meaningful name
  char threadname[1024];
  snprintf(threadname, sizeof(threadname), "VMD GPU threadpool[%d]", id);
  PROFILE_NAME_THREAD(threadname);

  // associate this host thread with its GPU
  cudaSetDevice(dev);

  if (mesg != NULL)
    printf("devpool thread[%d / %d], device %d message: '%s'\n", id, count, dev, mesg);

  return NULL;
}
00354
00355
00356
00357
00358
00359
00360
// Threadpool worker hook: pin each GPU worker thread to a CPU core chosen
// so that the worker threads for a given GPU land on a contiguous range of
// cores near that GPU.  Affinity is skipped when CPU or GPU topology cannot
// be determined.  Also names the thread for profiling tools.
void * vmd_cuda_affinitize_threads(void *voidparms) {
  int tid, numthreads;
  wkf_threadpool_worker_getid(voidparms, &tid, &numthreads);
  int physcpus = wkf_thread_numphysprocessors();
  int setthreadaffinity=1;

  int devcount = 0;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  // if the device query failed or reported no GPUs, we cannot compute a
  // per-GPU core mapping -- skip affinitization instead of dividing by zero
  if (crc != cudaSuccess || devcount < 1)
    setthreadaffinity = 0;

  int devid = 0;
  wkf_threadpool_worker_getdevid(voidparms, &devid);

  int verbose = (getenv("VMDCUDACPUAFFINITYVERBOSE") != NULL);

  if ((physcpus > 0) && setthreadaffinity) {
    // assume SMT: use only the first half of logical CPUs, split evenly
    // across GPUs -- TODO confirm this matches the target platforms
    int cpuspergpu = (physcpus/2) / devcount;
    int workerthreadspergpu = numthreads / devcount;
    if (workerthreadspergpu < 1)
      workerthreadspergpu = 1;  // guard mod-by-zero when threads < GPUs

    // map this worker onto its GPU's core range
    int taffinity = (devid * cpuspergpu) + (tid % workerthreadspergpu);

    if (verbose)
      printf("Affinitizing GPU[%d] worker thread[%d] to CPU[%d]...\n",
             devid, tid, taffinity);

    wkf_thread_set_self_cpuaffinity(taffinity);
  }

  // label the thread so profiling tools show a meaningful name
  char threadname[1024];
  sprintf(threadname, "VMD GPU threadpool[%d]", tid);
  PROFILE_NAME_THREAD(threadname);

  return NULL;
}
00408
00409
00410
00411
// Threadpool worker hook: enable CUDA peer access from this worker's GPU
// to every other GPU (all-to-all P2P, e.g. for NVLink/NVSwitch systems
// such as DGX-2).  Assumes cudaSetDevice() has already bound this thread
// to its GPU.
void * vmd_cuda_devpool_enable_P2P(void * voidparms) {
  int threadid;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);

  int i;
  if (threadid == 0)
    printf("Enabling all-to-all GPU Peer Access for DGX-2\n");

  int devcount = 0;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  if (crc != cudaSuccess)
    return NULL;   // no usable devices; nothing to enable

  int devid = 0;
  wkf_threadpool_worker_getdevid(voidparms, &devid);

  for (i=0; i<devcount; i++) {
    if (i != devid) {
      cudaError_t cuerr = cudaDeviceEnablePeerAccess(i, 0);
      if (cuerr == cudaErrorPeerAccessAlreadyEnabled) {
        // benign: access was enabled previously; clear the sticky error
        // so later CUDA calls don't observe it
        cudaGetLastError();
      } else if (cuerr != cudaSuccess) {
        printf("Thr[%2d] Error enabling peer access to GPU %d\n", threadid, i);
      }
    }
  }

  return NULL;
}
00437
00438
// Threadpool worker hook: fully initialize this worker's CUDA device --
// bind the thread to the GPU, set device flags (mapped host memory, etc.),
// record a relative device speed estimate for load balancing, and verify
// the GPU is actually usable by performing a trial allocation.
void * vmd_cuda_devpool_setdevice(void * voidparms) {
  int count, id, dev;
  cudaDeviceProp deviceProp;
  char *mesg;
  char *d_membuf;
  cudaError_t err;

  // fetch this worker's identity, optional message, and target GPU
  wkf_threadpool_worker_getid(voidparms, &id, &count);
  wkf_threadpool_worker_getdata(voidparms, (void **) &mesg);
  wkf_threadpool_worker_getdevid(voidparms, &dev);

  // label the thread so profiling tools show a meaningful name
  char threadname[1024];
  sprintf(threadname, "VMD GPU threadpool[%d]", id);
  PROFILE_NAME_THREAD(threadname);

  // associate this host thread with its GPU; must precede flag setting
  cudaSetDevice(dev);

  memset(&deviceProp, 0, sizeof(cudaDeviceProp));
  if (cudaGetDeviceProperties(&deviceProp, dev) == cudaSuccess) {
    // crude relative speed estimate: SM count and clock rate normalized
    // against a reference device (30 SMs at 1.295 GHz)
    float smscale = ((float) deviceProp.multiProcessorCount) / 30.0f;
    double clockscale = ((double) deviceProp.clockRate) / 1295000.0;
    float speedscale = smscale * ((float) clockscale);

#if 0
    printf("clock rate: %lf\n", (double) deviceProp.clockRate);
    printf("scale: %.4f smscale: %.4f clockscale: %.4f\n",
           speedscale, smscale, clockscale);
#endif

    if (deviceProp.canMapHostMemory != 0) {
#if 0
      printf("Enabled mapped host memory on device[%d]\n", dev);
#endif

      // enable zero-copy mapped host memory; flags must be set before the
      // CUDA context is created by any allocation/kernel call on the device
#if defined(VMDLIBOPTIX)
      // OptiX builds also pre-size local memory to avoid costly kernel
      // launch-time resizing
      err = cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost | cudaDeviceLmemResizeToMax);
#else
      err = cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost);
#endif
      if (err != cudaSuccess) {
        printf("Warning) thread[%d] can't set GPU[%d] device flags\n", id, dev);
        printf("Warning) CUDA error: %s\n", cudaGetErrorString(err));
      }
    }

    // publish the speed estimate so the pool can balance work across GPUs
    wkf_threadpool_worker_setdevspeed(voidparms, speedscale);

    // trial 1MB allocation: forces context creation and verifies that the
    // device enumerated by the query is actually usable from this process
    if ((err = cudaMalloc((void **) &d_membuf, 1*1024*1024)) == cudaSuccess) {
      cudaFree(d_membuf);
    } else {
      printf("Warning) thread[%d] can't init GPU[%d] found by device query\n", id, dev);
      printf("Warning) CUDA error: %s\n", cudaGetErrorString(err));
    }
  }

  if (mesg != NULL)
    printf("devpool thread[%d / %d], device %d message: '%s'\n", id, count, dev, mesg);

  return NULL;
}
00556
00557
00558
00559 #if defined(__cplusplus)
00560 }
00561 #endif
00562
00563