
CUDAClearDevice.cu

Go to the documentation of this file.
/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAClearDevice.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.16 $      $Date: 2020/02/26 04:22:39 $
 *
 ***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include "utilities.h"
#include "CUDAKernels.h"

#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  return NULL; }}

// a full-sized 64-kB constant memory buffer to use to clear
// any existing device state
__constant__ static float constbuf[16384];

// maximum number of allocations to use to soak up all available RAM
#define MAXLOOPS 16

void * vmd_cuda_devpool_clear_device_mem(void * voidparms) {
  int id, count, dev;
  char *bufs[MAXLOOPS];
  size_t bufszs[MAXLOOPS];
  float zerobuf[16 * 1024];
  memset(zerobuf, 0, sizeof(zerobuf));
  memset(bufs, 0, MAXLOOPS * sizeof(char *));
  memset(bufszs, 0, MAXLOOPS * sizeof(size_t));

  wkf_threadpool_worker_getid(voidparms, &id, &count);
  wkf_threadpool_worker_getdevid(voidparms, &dev);

  // clear constant memory
  cudaMemcpyToSymbol(constbuf, zerobuf, sizeof(zerobuf), 0);
  CUERR

#if 0
  //
  // Allocate, clear, and deallocate all global memory we can touch
  //
  // XXX on platforms where the GPU shares DRAM with the CPU, such as
  //     Tegra K1, the old memory-clearing approach is problematic.
  //     The CPU might implement VM paging, and it will just end up
  //     paging itself to death if we try to grab all GPU memory.
  //     Given that modern GPU drivers are better about clearing data between
  //     apps, it might be best to skip this step for now and either hope
  //     that old data isn't lying around in global GPU memory, or else
  //     take a very different approach that is more compatible with
  //     systems like Tegra K1 that have a single memory system for both
  //     the CPU and the GPU.
  //
  // XXX In MPI-enabled builds, we skip the global memory clearing step
  //     since multiple VMD processes may end up being mapped to the
  //     same node, sharing the same set of GPUs.  A better way of handling
  //     this would be either to perform the memory clear on only one
  //     MPI rank per physical node, or to distribute GPUs among
  //     VMD processes so no sharing occurs.
  //
#if !defined(VMDMPI)
  int verbose=0;
  if (getenv("VMDCUDAVERBOSE") != NULL)
    verbose=1;

  size_t sz(1024 * 1024 * 1024); /* start with a 1GB buffer size */
  int i, bufcnt=0;
  size_t totalsz=0;
  for (i=0; i<MAXLOOPS; i++) {
    // Allocate the largest buffer we can get.  If we fail, we halve the
    // request size and try again, until we reach the minimum request
    // size threshold.
    cudaError_t rc = cudaErrorMemoryAllocation; // initialized to quiet
                                                // compiler warnings; the loop
                                                // below sets rc before it is read
    while ((sz > (16 * 1024 * 1024)) &&
           ((rc=cudaMalloc((void **) &bufs[i], sz)) != cudaSuccess)) {
      cudaGetLastError(); // reset error state
      sz >>= 1;
    }

    if (rc == cudaSuccess) {
      bufszs[i] = sz;
      totalsz += sz;
      bufcnt++;
      if (verbose)
        printf("devpool thread[%d / %d], dev %d buf[%d] size: %zu\n", id, count, dev, i, sz);
    } else {
      bufs[i] = NULL;
      bufszs[i] = 0;
      if (verbose)
        printf("devpool thread[%d / %d], dev %d buf[%d] failed min allocation size: %zu\n", id, count, dev, i, sz);

      // terminate allocation loop early
      break;
    }
  }

  if (verbose)
    printf("devpool thread[%d / %d], dev %d allocated %d buffers\n", id, count, dev, bufcnt);

  for (i=0; i<bufcnt; i++) {
    if ((bufs[i] != NULL) && (bufszs[i] > 0)) {
      cudaMemset(bufs[i], 0, bufszs[i]);
      cudaFree(bufs[i]);
      bufs[i] = NULL;
      bufszs[i] = 0;
    }
  }
  CUERR

  if (verbose)
    printf("  Device %d cleared %zu MB of GPU memory\n", dev, totalsz / (1024 * 1024));

#endif
#endif

  return NULL;
}
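The XXX comment in the disabled block above notes that grabbing all global memory is hazardous on platforms where the GPU shares DRAM with the host CPU, such as Tegra K1. The sketch below shows one way such devices could be detected at runtime; the helper name is hypothetical and not part of the VMD sources, and the only CUDA call used is the standard cudaGetDeviceProperties(), which reports integrated (shared-memory) GPUs via the cudaDeviceProp::integrated field.

// Hypothetical helper (not part of VMD): returns nonzero if CUDA device
// 'dev' shares physical DRAM with the host CPU, in which case an
// exhaustive cudaMalloc() sweep like the one above should be skipped.
static int device_shares_host_memory(int dev) {
  cudaDeviceProp prop;
  memset(&prop, 0, sizeof(prop));
  if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
    return 1; // on error, err on the side of skipping the memory sweep
  return prop.integrated; // nonzero for integrated (CPU/GPU shared-memory) devices
}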

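As an alternative to probing for the largest allocatable block by repeatedly halving the cudaMalloc() request, the amount of currently free device memory can be queried directly with the standard cudaMemGetInfo() call. This is an editorial sketch of that approach, not what the code above does; allocating exactly the reported free size can still fail due to fragmentation, so a real implementation would keep a fallback such as the halving loop.

// Sketch only: query free/total device memory instead of probing with
// repeated cudaMalloc() attempts.
static void report_free_device_memory(int dev) {
  size_t freemem = 0, totalmem = 0;
  cudaSetDevice(dev);
  if (cudaMemGetInfo(&freemem, &totalmem) == cudaSuccess) {
    printf("dev %d: %zu MB free of %zu MB total\n",
           dev, freemem / (1024 * 1024), totalmem / (1024 * 1024));
  }
}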
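For context, this worker is not called directly; it is handed to the per-GPU device thread pool, which runs it once per worker thread (one thread per GPU) so that wkf_threadpool_worker_getid() and wkf_threadpool_worker_getdevid() return the right identifiers. The sketch below shows roughly how such a launch looks; the wkf_threadpool_launch() signature and the WKFThreads.h header name are assumptions based on the WKFThreads API, and the wrapper function is hypothetical, so check WKFThreads.h and CUDAAccel.C in the VMD sources for the exact call.

#include "WKFThreads.h"   // assumed header for the wkf_threadpool_* API
#include "CUDAKernels.h"  // declares vmd_cuda_devpool_clear_device_mem()

// Hypothetical wrapper: run the clearing worker on every device-pool
// thread and block until all of them have finished.
void clear_all_gpu_devices(wkf_threadpool_t *devpool) {
  wkf_threadpool_launch(devpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);
}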