Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDABench.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2007-2009 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: CUDABench.cu,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.32 $      $Date: 2011/06/22 14:48:19 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   Short benchmark kernels to measure GPU performance
00019  *
00020  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025 
00026 #include "WKFThreads.h"
00027 #include "WKFUtils.h"
00028 #include "CUDAKernels.h" 
00029 
00030 #define CUERR { cudaError_t err; \
00031   if ((err = cudaGetLastError()) != cudaSuccess) { \
00032   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00033   return -1; }}
00034 
00035 
00036 //
00037 // Benchmark peak Multiply-Add instruction performance, in GFLOPS
00038 //
00039 
00040 // FMADD16 macro contains a sequence of operations that the compiler
00041 // won't optimize out, and will translate into a densely packed block
00042 // of multiply-add instructions with no intervening register copies/moves
00043 // or other instructions. 
00044 #define FMADD16 \
00045     tmp0  = tmp0*tmp4+tmp7;     \
00046     tmp1  = tmp1*tmp5+tmp0;     \
00047     tmp2  = tmp2*tmp6+tmp1;     \
00048     tmp3  = tmp3*tmp7+tmp2;     \
00049     tmp4  = tmp4*tmp0+tmp3;     \
00050     tmp5  = tmp5*tmp1+tmp4;     \
00051     tmp6  = tmp6*tmp2+tmp5;     \
00052     tmp7  = tmp7*tmp3+tmp6;     \
00053     tmp8  = tmp8*tmp12+tmp15;   \
00054     tmp9  = tmp9*tmp13+tmp8;    \
00055     tmp10 = tmp10*tmp14+tmp9;   \
00056     tmp11 = tmp11*tmp15+tmp10;  \
00057     tmp12 = tmp12*tmp8+tmp11;   \
00058     tmp13 = tmp13*tmp9+tmp12;   \
00059     tmp14 = tmp14*tmp10+tmp13;  \
00060     tmp15 = tmp15*tmp11+tmp14;
00061 
00062 // CUDA grid, thread block, loop, and MADD operation counts
00063 #define GRIDSIZEX   6144
00064 #define BLOCKSIZEX  64
00065 #define GLOOPS      500
00066 #define MADDCOUNT   64
00067 
00068 // FLOP counting
00069 #define FLOPSPERLOOP (MADDCOUNT * 16)
00070 
00071 __global__ static void madd_kernel(float *doutput) {
00072   int tid = blockIdx.x * blockDim.x + threadIdx.x;
00073   float tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
00074   float tmp8,tmp9,tmp10,tmp11,tmp12,tmp13,tmp14,tmp15;
00075   tmp0=tmp1=tmp2=tmp3=tmp4=tmp5=tmp6=tmp7=0.0f;
00076   tmp8=tmp9=tmp10=tmp11=tmp12=tmp13=tmp14=tmp15 = 0.0f;
00077 
00078   tmp15=tmp7 = blockIdx.x * 0.001f; // prevent compiler from optimizing out
00079   tmp1 = blockIdx.y * 0.001f;       // the body of the loop...
00080 
00081   int loop;
00082   for(loop=0; loop<GLOOPS; loop++){
00083     FMADD16
00084     FMADD16
00085     FMADD16
00086     FMADD16
00087     FMADD16
00088     FMADD16
00089     FMADD16
00090     FMADD16
00091     FMADD16
00092     FMADD16
00093     FMADD16
00094     FMADD16
00095     FMADD16
00096     FMADD16
00097     FMADD16
00098     FMADD16
00099     FMADD16
00100     FMADD16
00101     FMADD16
00102     FMADD16
00103     FMADD16
00104     FMADD16
00105     FMADD16
00106     FMADD16
00107     FMADD16
00108     FMADD16
00109     FMADD16
00110     FMADD16
00111     FMADD16
00112     FMADD16
00113     FMADD16
00114     FMADD16
00115   }
00116 
00117   doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
00118                  +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
00119 }
00120 
00121 
00122 static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
00123   float *doutput = NULL;
00124   dim3 Bsz, Gsz;
00125   wkf_timerhandle timer;
00126   int i;
00127 
00128   cudaError_t rc;
00129   rc = cudaSetDevice(cudadev);
00130   if (rc != cudaSuccess) {
00131 #if CUDART_VERSION >= 2010
00132     rc = cudaGetLastError(); // query last error and reset error state
00133     if (rc != cudaErrorSetOnActiveProcess)
00134       return -1; // abort and return an error
00135 #else
00136     cudaGetLastError(); // just ignore and reset error state, since older CUDA
00137                         // revs don't have a cudaErrorSetOnActiveProcess enum
00138 #endif
00139   }
00140 
00141 
00142   // setup CUDA grid and block sizes
00143   Bsz.x = BLOCKSIZEX;
00144   Bsz.y = 1;
00145   Bsz.z = 1;
00146   Gsz.x = GRIDSIZEX;
00147   Gsz.y = 1;
00148   Gsz.z = 1;
00149 
00150   // allocate output array
00151   cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
00152   CUERR // check and clear any existing errors
00153 
00154   timer=wkf_timer_create();
00155   wkf_timer_start(timer);
00156   for (i=0; i<testloops; i++) { 
00157     madd_kernel<<<Gsz, Bsz>>>(doutput);
00158     cudaThreadSynchronize(); // wait for kernel to finish
00159   }
00160   CUERR // check and clear any existing errors
00161   wkf_timer_stop(timer);
00162 
00163   double runtime = wkf_timer_time(timer);
00164   double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
00165                   ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;
00166   
00167   *gflops = gflop / runtime;
00168 
00169   cudaFree(doutput);
00170   CUERR // check and clear any existing errors
00171 
00172   wkf_timer_destroy(timer);
00173 
00174   return 0;
00175 }
00176 
00177 typedef struct {
00178   int deviceid;
00179   int testloops;
00180   double gflops;
00181 } maddthrparms;
00182 
00183 static void * cudamaddthread(void *voidparms) {
00184   maddthrparms *parms = (maddthrparms *) voidparms;
00185   cudamaddgflops(parms->deviceid, &parms->gflops, parms->testloops);
00186   return NULL;
00187 }
00188 
00189 int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
00190                          int testloops) {
00191   maddthrparms *parms;
00192   wkf_thread_t * threads;
00193   int i;
00194 
00195   /* allocate array of threads */
00196   threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);
00197 
00198   /* allocate and initialize array of thread parameters */
00199   parms = (maddthrparms *) malloc(numdevs * sizeof(maddthrparms));
00200   for (i=0; i<numdevs; i++) {
00201     if (devlist != NULL)
00202       parms[i].deviceid = devlist[i];
00203     else
00204       parms[i].deviceid = i;
00205 
00206     parms[i].testloops = testloops;
00207     parms[i].gflops = 0.0;
00208   }
00209 
00210 #if defined(VMDTHREADS)
00211   /* spawn child threads to do the work */
00212   /* thread 0 must also be processed this way otherwise    */
00213   /* we'll permanently bind the main thread to some device */
00214   for (i=0; i<numdevs; i++) {
00215     wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
00216   }
00217 
00218   /* join the threads after work is done */
00219   for (i=0; i<numdevs; i++) {
00220     wkf_thread_join(threads[i], NULL);
00221   }
00222 #else
00223   /* single thread does all of the work */
00224   cudamaddthread((void *) &parms[0]);
00225 #endif
00226 
00227   for (i=0; i<numdevs; i++) {
00228     gflops[i] = parms[i].gflops; 
00229   }
00230 
00231   /* free thread parms */
00232   free(parms);
00233   free(threads);
00234 
00235   return 0;
00236 }
00237 
00238 
00239 
00240 
00241 
00242 
00243 //
00244 // Host-GPU memcpy I/O bandwidth benchmark
00245 //
00246 
00247 #define BWITER 500
00248 #define LATENCYITER 50000
00249 
00250 static int cudabusbw(int cudadev, 
00251                      double *hdmbsec, double *hdlatusec, 
00252                      double *phdmbsec, double *phdlatusec, 
00253                      double *dhmbsec, double *dhlatusec,
00254                      double *pdhmbsec, double *pdhlatusec) {
00255   float *hdata = NULL;   // non-pinned DMA buffer
00256   float *phdata = NULL;  // pinned DMA buffer
00257   float *ddata = NULL;
00258   int i;
00259   double runtime;
00260   wkf_timerhandle timer;
00261   int memsz = 1024 * 1024 * sizeof(float);
00262 
00263   *hdmbsec = 0.0;
00264   *hdlatusec = 0.0;
00265   *dhmbsec = 0.0;
00266   *dhlatusec = 0.0;
00267   *phdmbsec = 0.0;
00268   *phdlatusec = 0.0;
00269   *pdhmbsec = 0.0;
00270   *pdhlatusec = 0.0;
00271 
00272   // attach to the selected device
00273   cudaError_t rc;
00274   rc = cudaSetDevice(cudadev);
00275   if (rc != cudaSuccess) {
00276 #if CUDART_VERSION >= 2010
00277     rc = cudaGetLastError(); // query last error and reset error state
00278     if (rc != cudaErrorSetOnActiveProcess)
00279       return -1; // abort and return an error
00280 #else
00281     cudaGetLastError(); // just ignore and reset error state, since older CUDA
00282                         // revs don't have a cudaErrorSetOnActiveProcess enum
00283 #endif
00284   }
00285 
00286   // allocate non-pinned output array
00287   hdata = (float *) malloc(memsz); 
00288 
00289   // allocate pinned output array
00290   cudaMallocHost((void**) &phdata, memsz);
00291   CUERR // check and clear any existing errors
00292 
00293   // allocate device memory
00294   cudaMalloc((void**) &ddata, memsz);
00295   CUERR // check and clear any existing errors
00296 
00297   // create timer
00298   timer=wkf_timer_create();
00299 
00300   //
00301   // Host to device timings
00302   //
00303 
00304   // non-pinned bandwidth
00305   wkf_timer_start(timer);
00306   for (i=0; i<BWITER; i++) {
00307     cudaMemcpy(ddata, hdata, memsz,  cudaMemcpyHostToDevice);
00308   }
00309   wkf_timer_stop(timer);
00310   CUERR // check and clear any existing errors
00311   runtime = wkf_timer_time(timer);
00312   *hdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00313 
00314   // non-pinned latency
00315   wkf_timer_start(timer);
00316   for (i=0; i<LATENCYITER; i++) {
00317     cudaMemcpy(ddata, hdata, 1,  cudaMemcpyHostToDevice);
00318   }
00319   wkf_timer_stop(timer);
00320   CUERR // check and clear any existing errors
00321   runtime = wkf_timer_time(timer);
00322   *hdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00323 
00324 
00325   // pinned bandwidth
00326   wkf_timer_start(timer);
00327   for (i=0; i<BWITER; i++) {
00328     cudaMemcpy(ddata, phdata, memsz,  cudaMemcpyHostToDevice);
00329   }
00330   wkf_timer_stop(timer);
00331   CUERR // check and clear any existing errors
00332   runtime = wkf_timer_time(timer);
00333   *phdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00334 
00335   // pinned latency
00336   wkf_timer_start(timer);
00337   for (i=0; i<LATENCYITER; i++) {
00338     cudaMemcpy(ddata, phdata, 1,  cudaMemcpyHostToDevice);
00339   }
00340   wkf_timer_stop(timer);
00341   CUERR // check and clear any existing errors
00342   runtime = wkf_timer_time(timer);
00343   *phdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00344 
00345  
00346   //
00347   // Device to host timings
00348   //
00349 
00350   // non-pinned bandwidth
00351   wkf_timer_start(timer);
00352   for (i=0; i<BWITER; i++) {
00353     cudaMemcpy(hdata, ddata, memsz,  cudaMemcpyDeviceToHost);
00354   }
00355   wkf_timer_stop(timer);
00356   CUERR // check and clear any existing errors
00357   runtime = wkf_timer_time(timer);
00358   *dhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00359 
00360   // non-pinned latency
00361   wkf_timer_start(timer);
00362   for (i=0; i<LATENCYITER; i++) {
00363     cudaMemcpy(hdata, ddata, 1,  cudaMemcpyDeviceToHost);
00364   }
00365   wkf_timer_stop(timer);
00366   CUERR // check and clear any existing errors
00367   runtime = wkf_timer_time(timer);
00368   *dhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00369 
00370 
00371   // pinned bandwidth
00372   wkf_timer_start(timer);
00373   for (i=0; i<BWITER; i++) {
00374     cudaMemcpy(phdata, ddata, memsz,  cudaMemcpyDeviceToHost);
00375   }
00376   wkf_timer_stop(timer);
00377   CUERR // check and clear any existing errors
00378   runtime = wkf_timer_time(timer);
00379   *pdhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00380 
00381   // pinned latency
00382   wkf_timer_start(timer);
00383   for (i=0; i<LATENCYITER; i++) {
00384     cudaMemcpy(phdata, ddata, 1,  cudaMemcpyDeviceToHost);
00385   }
00386   wkf_timer_stop(timer);
00387   CUERR // check and clear any existing errors
00388   runtime = wkf_timer_time(timer);
00389   *pdhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00390  
00391  
00392   cudaFree(ddata);
00393   CUERR // check and clear any existing errors
00394   cudaFreeHost(phdata);
00395   CUERR // check and clear any existing errors
00396   free(hdata);
00397 
00398   wkf_timer_destroy(timer);
00399 
00400   return 0;
00401 }
00402 
00403 typedef struct {
00404   int deviceid;
00405   double hdmbsec;
00406   double hdlatusec;
00407   double phdmbsec;
00408   double phdlatusec;
00409   double dhmbsec;
00410   double dhlatusec;
00411   double pdhmbsec;
00412   double pdhlatusec;
00413 } busbwthrparms;
00414 
00415 static void * cudabusbwthread(void *voidparms) {
00416   busbwthrparms *parms = (busbwthrparms *) voidparms;
00417   cudabusbw(parms->deviceid, 
00418             &parms->hdmbsec, &parms->hdlatusec,
00419             &parms->phdmbsec, &parms->phdlatusec,
00420             &parms->dhmbsec, &parms->dhlatusec,
00421             &parms->pdhmbsec, &parms->pdhlatusec);
00422   return NULL;
00423 }
00424 
00425 int vmd_cuda_bus_bw(int numdevs, int *devlist, 
00426                     double *hdmbsec, double *hdlatusec,
00427                     double *phdmbsec,double *phdlatusec,
00428                     double *dhmbsec, double *dhlatusec,
00429                     double *pdhmbsec, double *pdhlatusec) {
00430   busbwthrparms *parms;
00431   wkf_thread_t * threads;
00432   int i;
00433 
00434   /* allocate array of threads */
00435   threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);
00436 
00437   /* allocate and initialize array of thread parameters */
00438   parms = (busbwthrparms *) malloc(numdevs * sizeof(busbwthrparms));
00439   for (i=0; i<numdevs; i++) {
00440     if (devlist != NULL)
00441       parms[i].deviceid = devlist[i];
00442     else
00443       parms[i].deviceid = i;
00444     parms[i].hdmbsec = 0.0;
00445     parms[i].hdlatusec = 0.0;
00446     parms[i].phdmbsec = 0.0;
00447     parms[i].phdlatusec = 0.0;
00448     parms[i].dhmbsec = 0.0;
00449     parms[i].dhlatusec = 0.0;
00450     parms[i].pdhmbsec = 0.0;
00451     parms[i].pdhlatusec = 0.0;
00452   }
00453 
00454 #if defined(VMDTHREADS)
00455   /* spawn child threads to do the work */
00456   /* thread 0 must also be processed this way otherwise    */
00457   /* we'll permanently bind the main thread to some device */
00458   for (i=0; i<numdevs; i++) {
00459     wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
00460   }
00461 
00462   /* join the threads after work is done */
00463   for (i=0; i<numdevs; i++) {
00464     wkf_thread_join(threads[i], NULL);
00465   }
00466 #else
00467   /* single thread does all of the work */
00468   cudabusbwthread((void *) &parms[0]);
00469 #endif
00470 
00471   for (i=0; i<numdevs; i++) {
00472     hdmbsec[i] = parms[i].hdmbsec; 
00473     hdlatusec[i] = parms[i].hdlatusec; 
00474     phdmbsec[i] = parms[i].phdmbsec; 
00475     phdlatusec[i] = parms[i].phdlatusec; 
00476     dhmbsec[i] = parms[i].dhmbsec; 
00477     dhlatusec[i] = parms[i].dhlatusec; 
00478     pdhmbsec[i] = parms[i].pdhmbsec; 
00479     pdhlatusec[i] = parms[i].pdhlatusec; 
00480   }
00481 
00482   /* free thread parms */
00483   free(parms);
00484   free(threads);
00485 
00486   return 0;
00487 }
00488 
00489 
00490 
00491 //
00492 // GPU device global memory bandwidth benchmark
00493 //
00494 template <class T>
00495 __global__ void gpuglobmemcpybw(T *dest, const T *src) {
00496   const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
00497   dest[idx] = src[idx];
00498 }
00499 
00500 template <class T>
00501 __global__ void gpuglobmemsetbw(T *dest, const T val) {
00502   int idx = threadIdx.x + blockIdx.x * blockDim.x;
00503   dest[idx] = val;
00504 }
00505 
00506 typedef float4 datatype;
00507 
00508 static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
00509   int i;
00510   int len = 1 << 22; // one thread per data element
00511   int loops = 500;
00512   datatype *src, *dest;
00513   datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
00514 
00515   // initialize to zero for starters
00516   float memsettime = 0.0f;
00517   float memcpytime = 0.0f;
00518   *gpumemsetgbsec = 0.0;
00519   *gpumemcpygbsec = 0.0;
00520 
00521   // attach to the selected device
00522   cudaError_t rc;
00523   rc = cudaSetDevice(cudadev);
00524   if (rc != cudaSuccess) {
00525 #if CUDART_VERSION >= 2010
00526     rc = cudaGetLastError(); // query last error and reset error state
00527     if (rc != cudaErrorSetOnActiveProcess)
00528       return -1; // abort and return an error
00529 #else
00530     cudaGetLastError(); // just ignore and reset error state, since older CUDA
00531                         // revs don't have a cudaErrorSetOnActiveProcess enum
00532 #endif
00533   }
00534 
00535   cudaMalloc((void **) &src, sizeof(datatype)*len);
00536   CUERR
00537   cudaMalloc((void **) &dest, sizeof(datatype)*len);
00538   CUERR
00539 
00540   dim3 BSz(256, 1, 1);
00541   dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1); 
00542 
00543   // do a warm-up pass
00544   gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
00545   CUERR
00546   gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
00547   CUERR
00548   gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
00549   CUERR
00550 
00551   cudaEvent_t start, end;
00552   cudaEventCreate(&start);
00553   cudaEventCreate(&end);
00554 
00555   // execute the memset kernel
00556   cudaEventRecord(start, 0);
00557   for (i=0; i<loops; i++) {
00558     gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
00559   }
00560   CUERR
00561   cudaEventRecord(end, 0);
00562   CUERR
00563   cudaEventSynchronize(start);
00564   CUERR
00565   cudaEventSynchronize(end);
00566   CUERR
00567   cudaEventElapsedTime(&memsettime, start, end);
00568   CUERR
00569 
00570   // execute the memcpy kernel
00571   cudaEventRecord(start, 0);
00572   for (i=0; i<loops; i++) {
00573     gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
00574   }
00575   cudaEventRecord(end, 0);
00576   CUERR
00577   cudaEventSynchronize(start);
00578   CUERR
00579   cudaEventSynchronize(end);
00580   CUERR
00581   cudaEventElapsedTime(&memcpytime, start, end);
00582   CUERR
00583 
00584   cudaEventDestroy(start);
00585   CUERR
00586   cudaEventDestroy(end);
00587   CUERR
00588 
00589   *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
00590   *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
00591   cudaFree(dest);
00592   cudaFree(src);
00593   CUERR
00594 
00595   return 0;
00596 }
00597 
00598 typedef struct {
00599   int deviceid;
00600   double memsetgbsec;
00601   double memcpygbsec;
00602 } globmembwthrparms;
00603 
00604 static void * cudaglobmembwthread(void *voidparms) {
00605   globmembwthrparms *parms = (globmembwthrparms *) voidparms;
00606   cudaglobmembw(parms->deviceid, &parms->memsetgbsec, &parms->memcpygbsec);
00607   return NULL;
00608 }
00609 
00610 int vmd_cuda_globmem_bw(int numdevs, int *devlist, 
00611                         double *memsetgbsec, double *memcpygbsec) {
00612   globmembwthrparms *parms;
00613   wkf_thread_t * threads;
00614   int i;
00615 
00616   /* allocate array of threads */
00617   threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);
00618 
00619   /* allocate and initialize array of thread parameters */
00620   parms = (globmembwthrparms *) malloc(numdevs * sizeof(globmembwthrparms));
00621   for (i=0; i<numdevs; i++) {
00622     if (devlist != NULL)
00623       parms[i].deviceid = devlist[i];
00624     else
00625       parms[i].deviceid = i;
00626     parms[i].memsetgbsec = 0.0;
00627     parms[i].memcpygbsec = 0.0;
00628   }
00629 
00630 #if defined(VMDTHREADS)
00631   /* spawn child threads to do the work */
00632   /* thread 0 must also be processed this way otherwise    */
00633   /* we'll permanently bind the main thread to some device */
00634   for (i=0; i<numdevs; i++) {
00635     wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
00636   }
00637 
00638   /* join the threads after work is done */
00639   for (i=0; i<numdevs; i++) {
00640     wkf_thread_join(threads[i], NULL);
00641   }
00642 #else
00643   /* single thread does all of the work */
00644   cudaglobmembwthread((void *) &parms[0]);
00645 #endif
00646 
00647   for (i=0; i<numdevs; i++) {
00648     memsetgbsec[i] = parms[i].memsetgbsec;
00649     memcpygbsec[i] = parms[i].memcpygbsec;
00650   }
00651 
00652   /* free thread parms */
00653   free(parms);
00654   free(threads);
00655 
00656   return 0;
00657 }
00658 
00659 
00660 //
00661 // Benchmark latency for complete threadpool barrier wakeup/run/sleep cycle
00662 //
00663 static void * vmddevpoollatencythread(void *voidparms) {
00664   return NULL;
00665 }
00666 
00667 static void * vmddevpooltilelatencythread(void *voidparms) {
00668   int threadid=-1;
00669   int tilesize=1;
00670   void *parms=NULL;
00671   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00672   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00673 
00674   // grind through task tiles until none are left
00675   wkf_tasktile_t tile;
00676   while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00677     // do nothing but eat work units...
00678   }
00679 
00680   return NULL;
00681 }
00682 
00683 
00684 // no-op kernel for timing kernel launches
00685 __global__ static void nopkernel(float * ddata) {
00686   unsigned int xindex  = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
00687   unsigned int yindex  = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00688   unsigned int outaddr = __umul24(gridDim.x, blockDim.x) * yindex + xindex;
00689 
00690   if (ddata != NULL)
00691     ddata[outaddr] = outaddr;
00692 }
00693 
00694 // empty kernel for timing kernel launches
00695 __global__ static void voidkernel(void) {
00696   return;
00697 }
00698 
00699 static void * vmddevpoolcudatilelatencythread(void *voidparms) {
00700   int threadid=-1;
00701   int tilesize=1;
00702   float *parms=NULL;
00703   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00704 
00705   // XXX Note that we expect parms to be set to NULL or a valid CUDA
00706   //     global memory pointer for correct operation of the NOP kernel below
00707   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00708 
00709 #if 0
00710   // scale tile size by device performance
00711   tilesize=4; // GTX 280, Tesla C1060 starting point tile size
00712   wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
00713 #endif
00714 
00715   // grind through task tiles until none are left
00716   wkf_tasktile_t tile;
00717   dim3 Gsz(1,1,0);
00718   dim3 Bsz(8,8,1);
00719   while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00720     // launch a no-op CUDA kernel
00721     nopkernel<<<Gsz, Bsz, 0>>>(parms);
00722   }
00723 
00724   // wait for all GPU kernels to complete
00725   cudaThreadSynchronize();
00726 
00727   return NULL;
00728 }
00729 
00730 
00731 int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
00732                              double *kernlaunchlatency,
00733                              double *barlatency,
00734                              double *cyclelatency, 
00735                              double *tilelatency,
00736                              double *kernellatency) {
00737   int i;
00738   wkf_tasktile_t tile;
00739   wkf_timerhandle timer;
00740   int loopcount;
00741 
00742   timer=wkf_timer_create();
00743 
00744   // execute just a CUDA kernel launch and measure latency on whatever
00745   // GPU we get.
00746   loopcount = 15000;
00747   dim3 VGsz(1,1,0);
00748   dim3 VBsz(8,8,1);
00749   wkf_timer_start(timer);
00750   for (i=0; i<loopcount; i++) {
00751     voidkernel<<<VGsz, VBsz, 0>>>();
00752   }
00753   // wait for GPU kernels to complete
00754   cudaThreadSynchronize();
00755   wkf_timer_stop(timer);
00756   *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);
00757 
00758   // execute just a raw barrier sync and measure latency
00759   loopcount = 15000;
00760   wkf_timer_start(timer);
00761   for (i=0; i<loopcount; i++) {
00762     wkf_threadpool_wait(devpool);
00763   }
00764   wkf_timer_stop(timer);
00765   *barlatency = wkf_timer_time(timer) / ((double) loopcount);
00766 
00767   // time wake-up, launch, and sleep/join of device pool doing a no-op
00768   loopcount = 5000;
00769   wkf_timer_start(timer);
00770   for (i=0; i<loopcount; i++) {
00771     tile.start=0;
00772     tile.end=0;
00773     wkf_threadpool_sched_dynamic(devpool, &tile);
00774     wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
00775   }
00776   wkf_timer_stop(timer);
00777   *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);
00778 
00779   // time wake-up, launch, and sleep/join of device pool eating tiles
00780   loopcount = 5000;
00781   wkf_timer_start(timer);
00782   for (i=0; i<loopcount; i++) {
00783     tile.start=0;
00784     tile.end=tilesize;
00785     wkf_threadpool_sched_dynamic(devpool, &tile);
00786     wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
00787   }
00788   wkf_timer_stop(timer);
00789   *tilelatency = wkf_timer_time(timer) / ((double) loopcount);
00790 
00791   // time wake-up, launch, and sleep/join of device pool eating tiles
00792   loopcount = 2000;
00793   wkf_timer_start(timer);
00794   for (i=0; i<loopcount; i++) {
00795     tile.start=0;
00796     tile.end=tilesize;
00797     wkf_threadpool_sched_dynamic(devpool, &tile);
00798     wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
00799   }
00800   wkf_timer_stop(timer);
00801   *kernellatency = wkf_timer_time(timer) / ((double) loopcount);
00802 
00803   wkf_timer_destroy(timer);
00804 
00805 #if 1
00806   vmd_cuda_measure_latencies(devpool);
00807 #endif
00808 
00809   return 0;
00810 }
00811 
00812 
00813 //
00814 // Benchmark CUDA kernel launch and memory copy latencies in isolation
00815 //
00816 typedef struct {
00817   int deviceid;
00818   int testloops;
00819   double kernlatency;
00820   double bcopylatency;
00821   double kbseqlatency;
00822 } latthrparms;
00823 
00824 static void * vmddevpoolcudalatencythread(void *voidparms) {
00825   int threadid=-1;
00826   latthrparms *parms=NULL;
00827 
00828   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00829   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00830   if (parms->deviceid == threadid) { 
00831     wkf_timerhandle timer;
00832     timer=wkf_timer_create();
00833     printf("Thread/device %d running...\n", threadid);
00834     cudaStream_t devstream;
00835     cudaStreamCreate(&devstream);
00836 
00837     char *hostbuf = (char *) calloc(1, 65536 * sizeof(char));
00838     char  *gpubuf = NULL;
00839     cudaMalloc((void**)&gpubuf, 65536 * sizeof(char));
00840 
00841     dim3 Gsz(1,1,0);
00842     dim3 Bsz(8,8,1);
00843 
00844     // measure back-to-back NULL kernel launches
00845     wkf_timer_start(timer);
00846     int i;
00847     for (i=0; i<parms->testloops; i++) {
00848       // launch a no-op CUDA kernel
00849       nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
00850     }
00851     // wait for all GPU kernels to complete
00852     cudaStreamSynchronize(devstream);
00853     wkf_timer_stop(timer);
00854     parms->kernlatency =  1000000 * wkf_timer_time(timer) / ((double) parms->testloops);
00855 
00856     // measure back-to-back round-trip 1-byte memcpy latencies
00857     wkf_timer_start(timer);
00858     for (i=0; i<parms->testloops; i++) {
00859       cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
00860       cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
00861     }
00862     // wait for all GPU kernels to complete
00863     cudaStreamSynchronize(devstream);
00864     wkf_timer_stop(timer);
00865     parms->kernlatency =  1000000 * wkf_timer_time(timer) / ((double) parms->testloops);
00866 
00867     printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
00868 
00869     cudaStreamDestroy(devstream);
00870     cudaFree(gpubuf);
00871     free(hostbuf);
00872     wkf_timer_destroy(timer);
00873   }
00874 
00875   return NULL;
00876 }
00877 
00878 
00879 int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
00880   latthrparms thrparms;
00881   int workers = wkf_threadpool_get_workercount(devpool);
00882   int i;
00883 printf("vmd_cuda_measure_latencies()...\n");
00884   for (i=0; i<workers; i++) {
00885     memset(&thrparms, 0, sizeof(thrparms));
00886     thrparms.deviceid = i;
00887     thrparms.testloops = 2500;
00888     wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &thrparms, 1);
00889   }
00890 
00891   return 0;
00892 }
00893 
00894 
00895 
00896 

Generated on Wed May 16 01:49:08 2012 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002