00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025
00026 #include "WKFThreads.h"
00027 #include "WKFUtils.h"
00028 #include "CUDAKernels.h"
00029
00030 #define CUERR { cudaError_t err; \
00031 if ((err = cudaGetLastError()) != cudaSuccess) { \
00032 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00033 return -1; }}
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044 #define FMADD16 \
00045 tmp0 = tmp0*tmp4+tmp7; \
00046 tmp1 = tmp1*tmp5+tmp0; \
00047 tmp2 = tmp2*tmp6+tmp1; \
00048 tmp3 = tmp3*tmp7+tmp2; \
00049 tmp4 = tmp4*tmp0+tmp3; \
00050 tmp5 = tmp5*tmp1+tmp4; \
00051 tmp6 = tmp6*tmp2+tmp5; \
00052 tmp7 = tmp7*tmp3+tmp6; \
00053 tmp8 = tmp8*tmp12+tmp15; \
00054 tmp9 = tmp9*tmp13+tmp8; \
00055 tmp10 = tmp10*tmp14+tmp9; \
00056 tmp11 = tmp11*tmp15+tmp10; \
00057 tmp12 = tmp12*tmp8+tmp11; \
00058 tmp13 = tmp13*tmp9+tmp12; \
00059 tmp14 = tmp14*tmp10+tmp13; \
00060 tmp15 = tmp15*tmp11+tmp14;
00061
00062
00063 #define GRIDSIZEX 6144
00064 #define BLOCKSIZEX 64
00065 #define GLOOPS 500
00066 #define MADDCOUNT 64
00067
00068
00069 #define FLOPSPERLOOP (MADDCOUNT * 16)
00070
00071 __global__ static void madd_kernel(float *doutput) {
00072 int tid = blockIdx.x * blockDim.x + threadIdx.x;
00073 float tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
00074 float tmp8,tmp9,tmp10,tmp11,tmp12,tmp13,tmp14,tmp15;
00075 tmp0=tmp1=tmp2=tmp3=tmp4=tmp5=tmp6=tmp7=0.0f;
00076 tmp8=tmp9=tmp10=tmp11=tmp12=tmp13=tmp14=tmp15 = 0.0f;
00077
00078 tmp15=tmp7 = blockIdx.x * 0.001f;
00079 tmp1 = blockIdx.y * 0.001f;
00080
00081 int loop;
00082 for(loop=0; loop<GLOOPS; loop++){
00083 FMADD16
00084 FMADD16
00085 FMADD16
00086 FMADD16
00087 FMADD16
00088 FMADD16
00089 FMADD16
00090 FMADD16
00091 FMADD16
00092 FMADD16
00093 FMADD16
00094 FMADD16
00095 FMADD16
00096 FMADD16
00097 FMADD16
00098 FMADD16
00099 FMADD16
00100 FMADD16
00101 FMADD16
00102 FMADD16
00103 FMADD16
00104 FMADD16
00105 FMADD16
00106 FMADD16
00107 FMADD16
00108 FMADD16
00109 FMADD16
00110 FMADD16
00111 FMADD16
00112 FMADD16
00113 FMADD16
00114 FMADD16
00115 }
00116
00117 doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
00118 +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
00119 }
00120
00121
00122 static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
00123 float *doutput = NULL;
00124 dim3 Bsz, Gsz;
00125 wkf_timerhandle timer;
00126 int i;
00127
00128 cudaError_t rc;
00129 rc = cudaSetDevice(cudadev);
00130 if (rc != cudaSuccess) {
00131 #if CUDART_VERSION >= 2010
00132 rc = cudaGetLastError();
00133 if (rc != cudaErrorSetOnActiveProcess)
00134 return -1;
00135 #else
00136 cudaGetLastError();
00137
00138 #endif
00139 }
00140
00141
00142
00143 Bsz.x = BLOCKSIZEX;
00144 Bsz.y = 1;
00145 Bsz.z = 1;
00146 Gsz.x = GRIDSIZEX;
00147 Gsz.y = 1;
00148 Gsz.z = 1;
00149
00150
00151 cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
00152 CUERR
00153
00154 timer=wkf_timer_create();
00155 wkf_timer_start(timer);
00156 for (i=0; i<testloops; i++) {
00157 madd_kernel<<<Gsz, Bsz>>>(doutput);
00158 cudaThreadSynchronize();
00159 }
00160 CUERR
00161 wkf_timer_stop(timer);
00162
00163 double runtime = wkf_timer_time(timer);
00164 double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
00165 ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;
00166
00167 *gflops = gflop / runtime;
00168
00169 cudaFree(doutput);
00170 CUERR
00171
00172 wkf_timer_destroy(timer);
00173
00174 return 0;
00175 }
00176
00177 typedef struct {
00178 int deviceid;
00179 int testloops;
00180 double gflops;
00181 } maddthrparms;
00182
00183 static void * cudamaddthread(void *voidparms) {
00184 maddthrparms *parms = (maddthrparms *) voidparms;
00185 cudamaddgflops(parms->deviceid, &parms->gflops, parms->testloops);
00186 return NULL;
00187 }
00188
00189 int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
00190 int testloops) {
00191 maddthrparms *parms;
00192 wkf_thread_t * threads;
00193 int i;
00194
00195
00196 threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);
00197
00198
00199 parms = (maddthrparms *) malloc(numdevs * sizeof(maddthrparms));
00200 for (i=0; i<numdevs; i++) {
00201 if (devlist != NULL)
00202 parms[i].deviceid = devlist[i];
00203 else
00204 parms[i].deviceid = i;
00205
00206 parms[i].testloops = testloops;
00207 parms[i].gflops = 0.0;
00208 }
00209
00210 #if defined(VMDTHREADS)
00211
00212
00213
00214 for (i=0; i<numdevs; i++) {
00215 wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
00216 }
00217
00218
00219 for (i=0; i<numdevs; i++) {
00220 wkf_thread_join(threads[i], NULL);
00221 }
00222 #else
00223
00224 cudamaddthread((void *) &parms[0]);
00225 #endif
00226
00227 for (i=0; i<numdevs; i++) {
00228 gflops[i] = parms[i].gflops;
00229 }
00230
00231
00232 free(parms);
00233 free(threads);
00234
00235 return 0;
00236 }
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247 #define BWITER 500
00248 #define LATENCYITER 50000
00249
00250 static int cudabusbw(int cudadev,
00251 double *hdmbsec, double *hdlatusec,
00252 double *phdmbsec, double *phdlatusec,
00253 double *dhmbsec, double *dhlatusec,
00254 double *pdhmbsec, double *pdhlatusec) {
00255 float *hdata = NULL;
00256 float *phdata = NULL;
00257 float *ddata = NULL;
00258 int i;
00259 double runtime;
00260 wkf_timerhandle timer;
00261 int memsz = 1024 * 1024 * sizeof(float);
00262
00263 *hdmbsec = 0.0;
00264 *hdlatusec = 0.0;
00265 *dhmbsec = 0.0;
00266 *dhlatusec = 0.0;
00267 *phdmbsec = 0.0;
00268 *phdlatusec = 0.0;
00269 *pdhmbsec = 0.0;
00270 *pdhlatusec = 0.0;
00271
00272
00273 cudaError_t rc;
00274 rc = cudaSetDevice(cudadev);
00275 if (rc != cudaSuccess) {
00276 #if CUDART_VERSION >= 2010
00277 rc = cudaGetLastError();
00278 if (rc != cudaErrorSetOnActiveProcess)
00279 return -1;
00280 #else
00281 cudaGetLastError();
00282
00283 #endif
00284 }
00285
00286
00287 hdata = (float *) malloc(memsz);
00288
00289
00290 cudaMallocHost((void**) &phdata, memsz);
00291 CUERR
00292
00293
00294 cudaMalloc((void**) &ddata, memsz);
00295 CUERR
00296
00297
00298 timer=wkf_timer_create();
00299
00300
00301
00302
00303
00304
00305 wkf_timer_start(timer);
00306 for (i=0; i<BWITER; i++) {
00307 cudaMemcpy(ddata, hdata, memsz, cudaMemcpyHostToDevice);
00308 }
00309 wkf_timer_stop(timer);
00310 CUERR
00311 runtime = wkf_timer_time(timer);
00312 *hdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00313
00314
00315 wkf_timer_start(timer);
00316 for (i=0; i<LATENCYITER; i++) {
00317 cudaMemcpy(ddata, hdata, 1, cudaMemcpyHostToDevice);
00318 }
00319 wkf_timer_stop(timer);
00320 CUERR
00321 runtime = wkf_timer_time(timer);
00322 *hdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00323
00324
00325
00326 wkf_timer_start(timer);
00327 for (i=0; i<BWITER; i++) {
00328 cudaMemcpy(ddata, phdata, memsz, cudaMemcpyHostToDevice);
00329 }
00330 wkf_timer_stop(timer);
00331 CUERR
00332 runtime = wkf_timer_time(timer);
00333 *phdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00334
00335
00336 wkf_timer_start(timer);
00337 for (i=0; i<LATENCYITER; i++) {
00338 cudaMemcpy(ddata, phdata, 1, cudaMemcpyHostToDevice);
00339 }
00340 wkf_timer_stop(timer);
00341 CUERR
00342 runtime = wkf_timer_time(timer);
00343 *phdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00344
00345
00346
00347
00348
00349
00350
00351 wkf_timer_start(timer);
00352 for (i=0; i<BWITER; i++) {
00353 cudaMemcpy(hdata, ddata, memsz, cudaMemcpyDeviceToHost);
00354 }
00355 wkf_timer_stop(timer);
00356 CUERR
00357 runtime = wkf_timer_time(timer);
00358 *dhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00359
00360
00361 wkf_timer_start(timer);
00362 for (i=0; i<LATENCYITER; i++) {
00363 cudaMemcpy(hdata, ddata, 1, cudaMemcpyDeviceToHost);
00364 }
00365 wkf_timer_stop(timer);
00366 CUERR
00367 runtime = wkf_timer_time(timer);
00368 *dhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00369
00370
00371
00372 wkf_timer_start(timer);
00373 for (i=0; i<BWITER; i++) {
00374 cudaMemcpy(phdata, ddata, memsz, cudaMemcpyDeviceToHost);
00375 }
00376 wkf_timer_stop(timer);
00377 CUERR
00378 runtime = wkf_timer_time(timer);
00379 *pdhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);
00380
00381
00382 wkf_timer_start(timer);
00383 for (i=0; i<LATENCYITER; i++) {
00384 cudaMemcpy(phdata, ddata, 1, cudaMemcpyDeviceToHost);
00385 }
00386 wkf_timer_stop(timer);
00387 CUERR
00388 runtime = wkf_timer_time(timer);
00389 *pdhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);
00390
00391
00392 cudaFree(ddata);
00393 CUERR
00394 cudaFreeHost(phdata);
00395 CUERR
00396 free(hdata);
00397
00398 wkf_timer_destroy(timer);
00399
00400 return 0;
00401 }
00402
00403 typedef struct {
00404 int deviceid;
00405 double hdmbsec;
00406 double hdlatusec;
00407 double phdmbsec;
00408 double phdlatusec;
00409 double dhmbsec;
00410 double dhlatusec;
00411 double pdhmbsec;
00412 double pdhlatusec;
00413 } busbwthrparms;
00414
00415 static void * cudabusbwthread(void *voidparms) {
00416 busbwthrparms *parms = (busbwthrparms *) voidparms;
00417 cudabusbw(parms->deviceid,
00418 &parms->hdmbsec, &parms->hdlatusec,
00419 &parms->phdmbsec, &parms->phdlatusec,
00420 &parms->dhmbsec, &parms->dhlatusec,
00421 &parms->pdhmbsec, &parms->pdhlatusec);
00422 return NULL;
00423 }
00424
00425 int vmd_cuda_bus_bw(int numdevs, int *devlist,
00426 double *hdmbsec, double *hdlatusec,
00427 double *phdmbsec,double *phdlatusec,
00428 double *dhmbsec, double *dhlatusec,
00429 double *pdhmbsec, double *pdhlatusec) {
00430 busbwthrparms *parms;
00431 wkf_thread_t * threads;
00432 int i;
00433
00434
00435 threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);
00436
00437
00438 parms = (busbwthrparms *) malloc(numdevs * sizeof(busbwthrparms));
00439 for (i=0; i<numdevs; i++) {
00440 if (devlist != NULL)
00441 parms[i].deviceid = devlist[i];
00442 else
00443 parms[i].deviceid = i;
00444 parms[i].hdmbsec = 0.0;
00445 parms[i].hdlatusec = 0.0;
00446 parms[i].phdmbsec = 0.0;
00447 parms[i].phdlatusec = 0.0;
00448 parms[i].dhmbsec = 0.0;
00449 parms[i].dhlatusec = 0.0;
00450 parms[i].pdhmbsec = 0.0;
00451 parms[i].pdhlatusec = 0.0;
00452 }
00453
00454 #if defined(VMDTHREADS)
00455
00456
00457
00458 for (i=0; i<numdevs; i++) {
00459 wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
00460 }
00461
00462
00463 for (i=0; i<numdevs; i++) {
00464 wkf_thread_join(threads[i], NULL);
00465 }
00466 #else
00467
00468 cudabusbwthread((void *) &parms[0]);
00469 #endif
00470
00471 for (i=0; i<numdevs; i++) {
00472 hdmbsec[i] = parms[i].hdmbsec;
00473 hdlatusec[i] = parms[i].hdlatusec;
00474 phdmbsec[i] = parms[i].phdmbsec;
00475 phdlatusec[i] = parms[i].phdlatusec;
00476 dhmbsec[i] = parms[i].dhmbsec;
00477 dhlatusec[i] = parms[i].dhlatusec;
00478 pdhmbsec[i] = parms[i].pdhmbsec;
00479 pdhlatusec[i] = parms[i].pdhlatusec;
00480 }
00481
00482
00483 free(parms);
00484 free(threads);
00485
00486 return 0;
00487 }
00488
00489
00490
00491
00492
00493
00494 template <class T>
00495 __global__ void gpuglobmemcpybw(T *dest, const T *src) {
00496 const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
00497 dest[idx] = src[idx];
00498 }
00499
00500 template <class T>
00501 __global__ void gpuglobmemsetbw(T *dest, const T val) {
00502 int idx = threadIdx.x + blockIdx.x * blockDim.x;
00503 dest[idx] = val;
00504 }
00505
00506 typedef float4 datatype;
00507
00508 static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
00509 int i;
00510 int len = 1 << 22;
00511 int loops = 500;
00512 datatype *src, *dest;
00513 datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
00514
00515
00516 float memsettime = 0.0f;
00517 float memcpytime = 0.0f;
00518 *gpumemsetgbsec = 0.0;
00519 *gpumemcpygbsec = 0.0;
00520
00521
00522 cudaError_t rc;
00523 rc = cudaSetDevice(cudadev);
00524 if (rc != cudaSuccess) {
00525 #if CUDART_VERSION >= 2010
00526 rc = cudaGetLastError();
00527 if (rc != cudaErrorSetOnActiveProcess)
00528 return -1;
00529 #else
00530 cudaGetLastError();
00531
00532 #endif
00533 }
00534
00535 cudaMalloc((void **) &src, sizeof(datatype)*len);
00536 CUERR
00537 cudaMalloc((void **) &dest, sizeof(datatype)*len);
00538 CUERR
00539
00540 dim3 BSz(256, 1, 1);
00541 dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);
00542
00543
00544 gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
00545 CUERR
00546 gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
00547 CUERR
00548 gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
00549 CUERR
00550
00551 cudaEvent_t start, end;
00552 cudaEventCreate(&start);
00553 cudaEventCreate(&end);
00554
00555
00556 cudaEventRecord(start, 0);
00557 for (i=0; i<loops; i++) {
00558 gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
00559 }
00560 CUERR
00561 cudaEventRecord(end, 0);
00562 CUERR
00563 cudaEventSynchronize(start);
00564 CUERR
00565 cudaEventSynchronize(end);
00566 CUERR
00567 cudaEventElapsedTime(&memsettime, start, end);
00568 CUERR
00569
00570
00571 cudaEventRecord(start, 0);
00572 for (i=0; i<loops; i++) {
00573 gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
00574 }
00575 cudaEventRecord(end, 0);
00576 CUERR
00577 cudaEventSynchronize(start);
00578 CUERR
00579 cudaEventSynchronize(end);
00580 CUERR
00581 cudaEventElapsedTime(&memcpytime, start, end);
00582 CUERR
00583
00584 cudaEventDestroy(start);
00585 CUERR
00586 cudaEventDestroy(end);
00587 CUERR
00588
00589 *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
00590 *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
00591 cudaFree(dest);
00592 cudaFree(src);
00593 CUERR
00594
00595 return 0;
00596 }
00597
00598 typedef struct {
00599 int deviceid;
00600 double memsetgbsec;
00601 double memcpygbsec;
00602 } globmembwthrparms;
00603
00604 static void * cudaglobmembwthread(void *voidparms) {
00605 globmembwthrparms *parms = (globmembwthrparms *) voidparms;
00606 cudaglobmembw(parms->deviceid, &parms->memsetgbsec, &parms->memcpygbsec);
00607 return NULL;
00608 }
00609
00610 int vmd_cuda_globmem_bw(int numdevs, int *devlist,
00611 double *memsetgbsec, double *memcpygbsec) {
00612 globmembwthrparms *parms;
00613 wkf_thread_t * threads;
00614 int i;
00615
00616
00617 threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);
00618
00619
00620 parms = (globmembwthrparms *) malloc(numdevs * sizeof(globmembwthrparms));
00621 for (i=0; i<numdevs; i++) {
00622 if (devlist != NULL)
00623 parms[i].deviceid = devlist[i];
00624 else
00625 parms[i].deviceid = i;
00626 parms[i].memsetgbsec = 0.0;
00627 parms[i].memcpygbsec = 0.0;
00628 }
00629
00630 #if defined(VMDTHREADS)
00631
00632
00633
00634 for (i=0; i<numdevs; i++) {
00635 wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
00636 }
00637
00638
00639 for (i=0; i<numdevs; i++) {
00640 wkf_thread_join(threads[i], NULL);
00641 }
00642 #else
00643
00644 cudaglobmembwthread((void *) &parms[0]);
00645 #endif
00646
00647 for (i=0; i<numdevs; i++) {
00648 memsetgbsec[i] = parms[i].memsetgbsec;
00649 memcpygbsec[i] = parms[i].memcpygbsec;
00650 }
00651
00652
00653 free(parms);
00654 free(threads);
00655
00656 return 0;
00657 }
00658
00659
00660
00661
00662
00663 static void * vmddevpoollatencythread(void *voidparms) {
00664 return NULL;
00665 }
00666
00667 static void * vmddevpooltilelatencythread(void *voidparms) {
00668 int threadid=-1;
00669 int tilesize=1;
00670 void *parms=NULL;
00671 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00672 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00673
00674
00675 wkf_tasktile_t tile;
00676 while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00677
00678 }
00679
00680 return NULL;
00681 }
00682
00683
00684
00685 __global__ static void nopkernel(float * ddata) {
00686 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
00687 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00688 unsigned int outaddr = __umul24(gridDim.x, blockDim.x) * yindex + xindex;
00689
00690 if (ddata != NULL)
00691 ddata[outaddr] = outaddr;
00692 }
00693
00694
00695 __global__ static void voidkernel(void) {
00696 return;
00697 }
00698
00699 static void * vmddevpoolcudatilelatencythread(void *voidparms) {
00700 int threadid=-1;
00701 int tilesize=1;
00702 float *parms=NULL;
00703 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00704
00705
00706
00707 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00708
00709 #if 0
00710
00711 tilesize=4;
00712 wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
00713 #endif
00714
00715
00716 wkf_tasktile_t tile;
00717 dim3 Gsz(1,1,0);
00718 dim3 Bsz(8,8,1);
00719 while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00720
00721 nopkernel<<<Gsz, Bsz, 0>>>(parms);
00722 }
00723
00724
00725 cudaThreadSynchronize();
00726
00727 return NULL;
00728 }
00729
00730
00731 int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
00732 double *kernlaunchlatency,
00733 double *barlatency,
00734 double *cyclelatency,
00735 double *tilelatency,
00736 double *kernellatency) {
00737 int i;
00738 wkf_tasktile_t tile;
00739 wkf_timerhandle timer;
00740 int loopcount;
00741
00742 timer=wkf_timer_create();
00743
00744
00745
00746 loopcount = 15000;
00747 dim3 VGsz(1,1,0);
00748 dim3 VBsz(8,8,1);
00749 wkf_timer_start(timer);
00750 for (i=0; i<loopcount; i++) {
00751 voidkernel<<<VGsz, VBsz, 0>>>();
00752 }
00753
00754 cudaThreadSynchronize();
00755 wkf_timer_stop(timer);
00756 *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);
00757
00758
00759 loopcount = 15000;
00760 wkf_timer_start(timer);
00761 for (i=0; i<loopcount; i++) {
00762 wkf_threadpool_wait(devpool);
00763 }
00764 wkf_timer_stop(timer);
00765 *barlatency = wkf_timer_time(timer) / ((double) loopcount);
00766
00767
00768 loopcount = 5000;
00769 wkf_timer_start(timer);
00770 for (i=0; i<loopcount; i++) {
00771 tile.start=0;
00772 tile.end=0;
00773 wkf_threadpool_sched_dynamic(devpool, &tile);
00774 wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
00775 }
00776 wkf_timer_stop(timer);
00777 *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);
00778
00779
00780 loopcount = 5000;
00781 wkf_timer_start(timer);
00782 for (i=0; i<loopcount; i++) {
00783 tile.start=0;
00784 tile.end=tilesize;
00785 wkf_threadpool_sched_dynamic(devpool, &tile);
00786 wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
00787 }
00788 wkf_timer_stop(timer);
00789 *tilelatency = wkf_timer_time(timer) / ((double) loopcount);
00790
00791
00792 loopcount = 2000;
00793 wkf_timer_start(timer);
00794 for (i=0; i<loopcount; i++) {
00795 tile.start=0;
00796 tile.end=tilesize;
00797 wkf_threadpool_sched_dynamic(devpool, &tile);
00798 wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
00799 }
00800 wkf_timer_stop(timer);
00801 *kernellatency = wkf_timer_time(timer) / ((double) loopcount);
00802
00803 wkf_timer_destroy(timer);
00804
00805 #if 1
00806 vmd_cuda_measure_latencies(devpool);
00807 #endif
00808
00809 return 0;
00810 }
00811
00812
00813
00814
00815
00816 typedef struct {
00817 int deviceid;
00818 int testloops;
00819 double kernlatency;
00820 double bcopylatency;
00821 double kbseqlatency;
00822 } latthrparms;
00823
00824 static void * vmddevpoolcudalatencythread(void *voidparms) {
00825 int threadid=-1;
00826 latthrparms *parms=NULL;
00827
00828 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00829 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00830 if (parms->deviceid == threadid) {
00831 wkf_timerhandle timer;
00832 timer=wkf_timer_create();
00833 printf("Thread/device %d running...\n", threadid);
00834 cudaStream_t devstream;
00835 cudaStreamCreate(&devstream);
00836
00837 char *hostbuf = (char *) calloc(1, 65536 * sizeof(char));
00838 char *gpubuf = NULL;
00839 cudaMalloc((void**)&gpubuf, 65536 * sizeof(char));
00840
00841 dim3 Gsz(1,1,0);
00842 dim3 Bsz(8,8,1);
00843
00844
00845 wkf_timer_start(timer);
00846 int i;
00847 for (i=0; i<parms->testloops; i++) {
00848
00849 nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
00850 }
00851
00852 cudaStreamSynchronize(devstream);
00853 wkf_timer_stop(timer);
00854 parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);
00855
00856
00857 wkf_timer_start(timer);
00858 for (i=0; i<parms->testloops; i++) {
00859 cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
00860 cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
00861 }
00862
00863 cudaStreamSynchronize(devstream);
00864 wkf_timer_stop(timer);
00865 parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);
00866
00867 printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
00868
00869 cudaStreamDestroy(devstream);
00870 cudaFree(gpubuf);
00871 free(hostbuf);
00872 wkf_timer_destroy(timer);
00873 }
00874
00875 return NULL;
00876 }
00877
00878
00879 int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
00880 latthrparms thrparms;
00881 int workers = wkf_threadpool_get_workercount(devpool);
00882 int i;
00883 printf("vmd_cuda_measure_latencies()...\n");
00884 for (i=0; i<workers; i++) {
00885 memset(&thrparms, 0, sizeof(thrparms));
00886 thrparms.deviceid = i;
00887 thrparms.testloops = 2500;
00888 wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &thrparms, 1);
00889 }
00890
00891 return 0;
00892 }
00893
00894
00895
00896