00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <tcl.h>
00022 #include <ctype.h>
00023 #include <stdio.h>
00024 #include <stdlib.h>
00025 #include <string.h>
00026 #include "Benchmark.h"
00027 #include "config.h"
00028 #include "VMDApp.h"
00029 #include "TclCommands.h"
00030 #include "CUDAKernels.h"
00031 #include "CUDAAccel.h"
00032 #include "WKFThreads.h"
00033
00034 static void cmd_vmdbench_usage(Tcl_Interp *interp) {
00035 Tcl_AppendResult(interp,
00036 "usage: vmdbench <command> [args...]\n"
00037 "vmdbench stream [N] - built-in STREAM memory bandwidth test\n",
00038 "vmdbench cudamadd [devices] - CUDA multiply-add arithmetic (*)\n",
00039 "vmdbench cudabusbw [devices] - CUDA host/device bus bandwidth (*)\n",
00040 "vmdbench cudaglobmembw [devices] - CUDA global memory bandwidth (*)\n",
00041 "vmdbench cudadevpool [N] - CUDA threadpool run-cycle latency (*)\n",
00042 "(*) Only available in CUDA-enabled builds of VMD\n",
00043 NULL);
00044 }
00045
00046 int text_cmd_vmdbench(ClientData cd, Tcl_Interp *interp, int argc,
00047 const char *argv[]) {
00048
00049 VMDApp *app = (VMDApp *)cd;
00050
00051 if (argc == 1) {
00052 cmd_vmdbench_usage(interp);
00053 return TCL_ERROR;
00054 }
00055
00056 if (argc >= 2) {
00057 if (!strupncmp(argv[1], "stream", CMDLEN)) {
00058 double times[8], mbsec[8];
00059 int N = 1024*1024 * 16;
00060
00061 if (argc == 3) {
00062 if (Tcl_GetInt(interp, argv[2], &N) != TCL_OK) {
00063 Tcl_AppendResult(interp, " in vmdbench stream", NULL);
00064 return TCL_ERROR;
00065 }
00066 }
00067
00068 int rc = stream_bench(N, times, mbsec);
00069 if (rc) {
00070 Tcl_AppendResult(interp,
00071 "unable to complete stream benchmark, out of memory", NULL);
00072 return TCL_ERROR;
00073 }
00074
00075 Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
00076 const char *benchnames[] = {
00077 "copy (double)",
00078 "scale (double)",
00079 "add (double)",
00080 "triad (double)",
00081 "copy (float)",
00082 "scale (float)",
00083 "add (float)",
00084 "triad (float)"
00085 };
00086
00087 Tcl_Obj *colNameObj = Tcl_NewListObj(0, NULL);
00088 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Test", -1));
00089 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Time", -1));
00090 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("MB/sec", -1));
00091 Tcl_ListObjAppendElement(interp, tcl_result, colNameObj);
00092
00093 int i;
00094 for (i=0; i<8; i++) {
00095 Tcl_Obj *rowListObj = Tcl_NewListObj(0, NULL);
00096 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewStringObj(benchnames[i], -1));
00097 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(times[i]));
00098 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(mbsec[i]));
00099 Tcl_ListObjAppendElement(interp, tcl_result, rowListObj);
00100
00101 }
00102 Tcl_SetObjResult(interp, tcl_result);
00103
00104 return TCL_OK;
00105 } else if (!strupncmp(argv[1], "cudamadd", CMDLEN)) {
00106 #if defined(VMDCUDA)
00107 int numdevs, physnumdevs;
00108 int *devlist = NULL;
00109 vmd_cuda_num_devices(&physnumdevs);
00110 numdevs = physnumdevs;
00111 #if !defined(VMDTHREADS)
00112 numdevs = 1;
00113 #endif
00114
00115
00116 if (argc > 2) {
00117 if ((argc-2) > numdevs) {
00118 Tcl_AppendResult(interp, "vmdbench: bad device argument", NULL);
00119 return TCL_ERROR;
00120 } else {
00121 numdevs = argc-2;
00122 }
00123 devlist = (int *) malloc(numdevs * sizeof(int));
00124 int arg, dev;
00125 for (arg=0; arg<numdevs; arg++) {
00126 if (Tcl_GetInt(interp, argv[arg+2], &dev) != TCL_OK) {
00127 Tcl_AppendResult(interp, "vmdbench: bad device argument", NULL);
00128 free(devlist);
00129 return TCL_ERROR;
00130 }
00131 if (dev < 0 || dev >= physnumdevs) {
00132 Tcl_AppendResult(interp, "vmdbench: device argument out of range", NULL);
00133 free(devlist);
00134 return TCL_ERROR;
00135 }
00136 devlist[arg] = dev;
00137 }
00138 }
00139
00140 double *gflops = (double *) malloc(numdevs * sizeof(double));
00141 int testloops=1;
00142 if (getenv("VMDMADDLOOPS") != NULL)
00143 testloops = atoi(getenv("VMDMADDLOOPS"));
00144
00145 vmd_cuda_madd_gflops(numdevs, devlist, gflops, testloops);
00146
00147 Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
00148 Tcl_Obj *colNameObj = Tcl_NewListObj(0, NULL);
00149 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device", -1));
00150 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("GFLOPS", -1));
00151 Tcl_ListObjAppendElement(interp, tcl_result, colNameObj);
00152
00153 int i;
00154 for (i=0; i<numdevs; i++) {
00155 Tcl_Obj *rowListObj = Tcl_NewListObj(0, NULL);
00156 if (devlist != NULL)
00157 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewIntObj(devlist[i]));
00158 else
00159 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewIntObj(i));
00160 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(gflops[i]));
00161 Tcl_ListObjAppendElement(interp, tcl_result, rowListObj);
00162 }
00163 Tcl_SetObjResult(interp, tcl_result);
00164
00165 if (devlist)
00166 free(devlist);
00167
00168 return TCL_OK;
00169 #else
00170 Tcl_AppendResult(interp, "CUDA Acceleration not available in this build", NULL);
00171 return TCL_ERROR;
00172 #endif
00173 } else if (!strupncmp(argv[1], "cudabusbw", CMDLEN)) {
00174 #if defined(VMDCUDA)
00175 int numdevs, physnumdevs;
00176 int *devlist = NULL;
00177 vmd_cuda_num_devices(&physnumdevs);
00178 numdevs = physnumdevs;
00179 #if !defined(VMDTHREADS)
00180 numdevs = 1;
00181 #endif
00182
00183
00184 if (argc > 2) {
00185 if ((argc-2) > numdevs) {
00186 Tcl_AppendResult(interp, "vmdbench: bad device argument", NULL);
00187 return TCL_ERROR;
00188 } else {
00189 numdevs = argc-2;
00190 }
00191 devlist = (int *) malloc(numdevs * sizeof(int));
00192 int arg, dev;
00193 for (arg=0; arg<numdevs; arg++) {
00194 if (Tcl_GetInt(interp, argv[arg+2], &dev) != TCL_OK) {
00195 Tcl_AppendResult(interp, "vmdbench: bad device argument", NULL);
00196 free(devlist);
00197 return TCL_ERROR;
00198 }
00199 if (dev < 0 || dev >= physnumdevs) {
00200 Tcl_AppendResult(interp, "vmdbench: device argument out of range", NULL);
00201 free(devlist);
00202 return TCL_ERROR;
00203 }
00204 devlist[arg] = dev;
00205 }
00206 }
00207
00208 double *hdmbsec = (double *) malloc(numdevs * sizeof(double));
00209 double *hdlatusec = (double *) malloc(numdevs * sizeof(double));
00210 double *phdmbsec = (double *) malloc(numdevs * sizeof(double));
00211 double *phdlatusec = (double *) malloc(numdevs * sizeof(double));
00212 double *dhmbsec = (double *) malloc(numdevs * sizeof(double));
00213 double *dhlatusec = (double *) malloc(numdevs * sizeof(double));
00214 double *pdhmbsec = (double *) malloc(numdevs * sizeof(double));
00215 double *pdhlatusec = (double *) malloc(numdevs * sizeof(double));
00216
00217 vmd_cuda_bus_bw(numdevs, devlist,
00218 hdmbsec, hdlatusec, phdmbsec, phdlatusec,
00219 dhmbsec, dhlatusec, pdhmbsec, pdhlatusec);
00220
00221 Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
00222 Tcl_Obj *colNameObj = Tcl_NewListObj(0, NULL);
00223 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device", -1));
00224 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Host-device bandwidth (MB/sec)", -1));
00225 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Host-device latency (usec)", -1));
00226 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Host-device pinned bandwidth (MB/sec)", -1));
00227 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Host-device pinned latency (usec)", -1));
00228 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device-host bandwidth (MB/sec)", -1));
00229 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device-host latency (usec)", -1));
00230 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device-host pinned bandwidth (MB/sec)", -1));
00231 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device-host pinned latency (usec)", -1));
00232 Tcl_ListObjAppendElement(interp, tcl_result, colNameObj);
00233
00234 int i;
00235 for (i=0; i<numdevs; i++) {
00236 Tcl_Obj *rowListObj = Tcl_NewListObj(0, NULL);
00237 if (devlist != NULL)
00238 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewIntObj(devlist[i]));
00239 else
00240 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewIntObj(i));
00241
00242 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(hdmbsec[i]));
00243 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(hdlatusec[i]));
00244 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(phdmbsec[i]));
00245 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(phdlatusec[i]));
00246 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(dhmbsec[i]));
00247 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(dhlatusec[i]));
00248 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(pdhmbsec[i]));
00249 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(pdhlatusec[i]));
00250 Tcl_ListObjAppendElement(interp, tcl_result, rowListObj);
00251 }
00252 Tcl_SetObjResult(interp, tcl_result);
00253 return TCL_OK;
00254 #else
00255 Tcl_AppendResult(interp, "CUDA Acceleration not available in this build", NULL);
00256 return TCL_ERROR;
00257 #endif
00258 } else if (!strupncmp(argv[1], "cudaglobmembw", CMDLEN)) {
00259 #if defined(VMDCUDA)
00260 int numdevs, physnumdevs;
00261 int *devlist = NULL;
00262 vmd_cuda_num_devices(&physnumdevs);
00263 numdevs = physnumdevs;
00264 #if !defined(VMDTHREADS)
00265 numdevs = 1;
00266 #endif
00267
00268
00269 if (argc > 2) {
00270 if ((argc-2) > numdevs) {
00271 Tcl_AppendResult(interp, "vmdbench: bad device argument", NULL);
00272 return TCL_ERROR;
00273 } else {
00274 numdevs = argc-2;
00275 }
00276 devlist = (int *) malloc(numdevs * sizeof(int));
00277 int arg, dev;
00278 for (arg=0; arg<numdevs; arg++) {
00279 if (Tcl_GetInt(interp, argv[arg+2], &dev) != TCL_OK) {
00280 Tcl_AppendResult(interp, "vmdbench: bad device argument", NULL);
00281 free(devlist);
00282 return TCL_ERROR;
00283 }
00284 if (dev < 0 || dev >= physnumdevs) {
00285 Tcl_AppendResult(interp, "vmdbench: device argument out of range", NULL);
00286 free(devlist);
00287 return TCL_ERROR;
00288 }
00289 devlist[arg] = dev;
00290 }
00291 }
00292
00293 double *memsetgbsec = (double *) malloc(numdevs * sizeof(double));
00294 double *memcpygbsec = (double *) malloc(numdevs * sizeof(double));
00295
00296 vmd_cuda_globmem_bw(numdevs, devlist, memsetgbsec, memcpygbsec);
00297
00298 Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
00299 Tcl_Obj *colNameObj = Tcl_NewListObj(0, NULL);
00300 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Device", -1));
00301 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Memory set bandwidth (GB/sec)", -1));
00302 Tcl_ListObjAppendElement(interp, colNameObj, Tcl_NewStringObj("Memory copy bandwidth (GB/sec)", -1));
00303 Tcl_ListObjAppendElement(interp, tcl_result, colNameObj);
00304
00305 int i;
00306 for (i=0; i<numdevs; i++) {
00307 Tcl_Obj *rowListObj = Tcl_NewListObj(0, NULL);
00308 if (devlist != NULL)
00309 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewIntObj(devlist[i]));
00310 else
00311 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewIntObj(i));
00312
00313 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(memsetgbsec[i]));
00314 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(memcpygbsec[i]));
00315 Tcl_ListObjAppendElement(interp, tcl_result, rowListObj);
00316 }
00317 Tcl_SetObjResult(interp, tcl_result);
00318 return TCL_OK;
00319 #else
00320 Tcl_AppendResult(interp, "CUDA Acceleration not available in this build", NULL);
00321 return TCL_ERROR;
00322 #endif
00323 } else if (!strupncmp(argv[1], "cudadevpool", CMDLEN)) {
00324 #if defined(VMDCUDA)
00325 int N=1;
00326 if (argc == 3) {
00327 if (Tcl_GetInt(interp, argv[2], &N) != TCL_OK) {
00328 Tcl_AppendResult(interp, " in vmdbench cudadevpool", NULL);
00329 return TCL_ERROR;
00330 }
00331 }
00332
00333 wkf_threadpool_t * devpool = app->cuda->get_cuda_devpool();
00334 Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
00335 Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj("Empty kernel launch latency (usec)", -1));
00336 Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj("Device pool barrier latency (usec)", -1));
00337 Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj("Device pool empty run cycle latency (usec)", -1));
00338 Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj("Device pool tile run latency (usec)", -1));
00339 Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj("Device pool GPU kernel tile latency (usec)", -1));
00340
00341 int i;
00342 double kernlaunchlatency, barlatency;
00343 double cyclelatency, tilelatency;
00344 double kernellatency;
00345 for (i=0; i<2; i++) {
00346 vmd_cuda_devpool_latency(devpool, N, &kernlaunchlatency,
00347 &barlatency, &cyclelatency,
00348 &tilelatency, &kernellatency);
00349
00350
00351 if (i < 1)
00352 continue;
00353
00354
00355 Tcl_Obj *rowListObj = Tcl_NewListObj(0, NULL);
00356 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(kernlaunchlatency*1000000));
00357 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(barlatency*1000000));
00358 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(cyclelatency*1000000));
00359 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(tilelatency*1000000));
00360 Tcl_ListObjAppendElement(interp, rowListObj, Tcl_NewDoubleObj(kernellatency*1000000));
00361 Tcl_ListObjAppendElement(interp, tcl_result, rowListObj);
00362 }
00363
00364 Tcl_SetObjResult(interp, tcl_result);
00365 return TCL_OK;
00366 #else
00367 Tcl_AppendResult(interp, "CUDA Acceleration not available in this build", NULL);
00368 return TCL_ERROR;
00369 #endif
00370
00371 } else {
00372 cmd_vmdbench_usage(interp);
00373 return TCL_ERROR;
00374 }
00375 } else {
00376 cmd_vmdbench_usage(interp);
00377 return TCL_ERROR;
00378 }
00379
00380
00381 return TCL_OK;
00382 }
00383
00384