00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025
00026 #if CUDART_VERSION < 4000
00027 #error The VMD QuickSurf feature requires CUDA 4.0 or later
00028 #endif
00029
00030 #include "Inform.h"
00031 #include "utilities.h"
00032 #include "WKFThreads.h"
00033 #include "WKFUtils.h"
00034 #include "CUDAKernels.h"
00035 #include "CUDAMarchingCubes.h"
00036 #include "CUDAQuickSurf.h"
00037
00038 #include "DispCmds.h"
00039 #include "VMDDisplayList.h"
00040
00041
00042
00043
00044
00045 typedef struct {
00046 float isovalue;
00047 float radscale;
00048 float4 *xyzr;
00049 float4 *colors;
00050 float *volmap;
00051 float *voltexmap;
00052 int numplane;
00053 int numcol;
00054 int numpt;
00055 long int natoms;
00056 float gridspacing;
00057 } enthrparms;
00058
00059
00060 static void * cudadensitythread(void *);
00061
00062 #if 1
00063 #define CUERR { cudaError_t err; \
00064 if ((err = cudaGetLastError()) != cudaSuccess) { \
00065 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00066 printf("Thread aborting...\n"); \
00067 return NULL; }}
00068 #else
00069 #define CUERR
00070 #endif
00071
00072
00073
00074
00075 #if 1
00076
00077 #define DUNROLLX 8
00078 #define DBLOCKSZX 8 // make large enough to allow coalesced global mem ops
00079 #define DBLOCKSZY 8 // make as small as possible for finer granularity
00080
00081 #else
00082
00083 #define DUNROLLX 4
00084 #define DBLOCKSZX 16 // make large enough to allow coalesced global mem ops
00085 #define DBLOCKSZY 16 // make as small as possible for finer granularity
00086
00087 #endif
00088
00089 #define DBLOCKSZ (DBLOCKSZX*DBLOCKSZY)
00090
00091
00092 #if DUNROLLX == 8
00093 #define FLOPSPERATOMEVALTEX (109.0/8.0)
00094 #define FLOPSPERATOMEVAL (53.0/8.0)
00095 #elif DUNROLLX == 4
00096 #define FLOPSPERATOMEVALTEX (57.0/4.0)
00097 #define FLOPSPERATOMEVAL (29.0/4.0)
00098 #elif DUNROLLX == 2
00099 #define FLOPSPERATOMEVALTEX (30.0/2.0)
00100 #define FLOPSPERATOMEVAL (17.0/2.0)
00101 #endif
00102
00103 __global__ static void gaussdensity_direct_tex(int natoms,
00104 const float4 *xyzr,
00105 const float4 *colors,
00106 float gridspacing, unsigned int z,
00107 float *densitygrid,
00108 float3 *voltexmap,
00109 float invisovalue) {
00110 unsigned int xindex = (blockIdx.x * blockDim.x) * DUNROLLX + threadIdx.x;
00111 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00112 unsigned int zindex = (blockIdx.z * blockDim.z) + threadIdx.z;
00113 unsigned int outaddr =
00114 ((gridDim.x * blockDim.x) * DUNROLLX) * (gridDim.y * blockDim.y) * zindex +
00115 ((gridDim.x * blockDim.x) * DUNROLLX) * yindex + xindex;
00116 zindex += z;
00117
00118 float coorx = gridspacing * xindex;
00119 float coory = gridspacing * yindex;
00120 float coorz = gridspacing * zindex;
00121
00122 float densityvalx1=0.0f;
00123 float densityvalx2=0.0f;
00124 float3 densitycolx1;
00125 densitycolx1=make_float3(0.0f, 0.0f, 0.0f);
00126 float3 densitycolx2=densitycolx1;
00127
00128 #if DUNROLLX >= 4
00129 float densityvalx3=0.0f;
00130 float densityvalx4=0.0f;
00131 float3 densitycolx3=densitycolx1;
00132 float3 densitycolx4=densitycolx1;
00133 #endif
00134 #if DUNROLLX >= 8
00135 float densityvalx5=0.0f;
00136 float densityvalx6=0.0f;
00137 float densityvalx7=0.0f;
00138 float densityvalx8=0.0f;
00139
00140 float3 densitycolx5=densitycolx1;
00141 float3 densitycolx6=densitycolx1;
00142 float3 densitycolx7=densitycolx1;
00143 float3 densitycolx8=densitycolx1;
00144 #endif
00145
00146 float gridspacing_coalesce = gridspacing * DBLOCKSZX;
00147
00148 int atomid;
00149 for (atomid=0; atomid<natoms; atomid++) {
00150 float4 atom = xyzr[atomid];
00151 float4 color = colors[atomid];
00152
00153 float dy = coory - atom.y;
00154 float dz = coorz - atom.z;
00155 float dyz2 = dy*dy + dz*dz;
00156
00157 float dx1 = coorx - atom.x;
00158 float r21 = (dx1*dx1 + dyz2) * atom.w;
00159 float tmp1 = exp2f(-r21);
00160 densityvalx1 += tmp1;
00161 tmp1 *= invisovalue;
00162 densitycolx1.x += tmp1 * color.x;
00163 densitycolx1.y += tmp1 * color.y;
00164 densitycolx1.z += tmp1 * color.z;
00165
00166 float dx2 = dx1 + gridspacing_coalesce;
00167 float r22 = (dx2*dx2 + dyz2) * atom.w;
00168 float tmp2 = exp2f(-r22);
00169 densityvalx2 += tmp2;
00170 tmp2 *= invisovalue;
00171 densitycolx2.x += tmp2 * color.x;
00172 densitycolx2.y += tmp2 * color.y;
00173 densitycolx2.z += tmp2 * color.z;
00174
00175 #if DUNROLLX >= 4
00176 float dx3 = dx2 + gridspacing_coalesce;
00177 float r23 = (dx3*dx3 + dyz2) * atom.w;
00178 float tmp3 = exp2f(-r23);
00179 densityvalx3 += tmp3;
00180 tmp3 *= invisovalue;
00181 densitycolx3.x += tmp3 * color.x;
00182 densitycolx3.y += tmp3 * color.y;
00183 densitycolx3.z += tmp3 * color.z;
00184
00185 float dx4 = dx3 + gridspacing_coalesce;
00186 float r24 = (dx4*dx4 + dyz2) * atom.w;
00187 float tmp4 = exp2f(-r24);
00188 densityvalx4 += tmp4;
00189 tmp4 *= invisovalue;
00190 densitycolx4.x += tmp4 * color.x;
00191 densitycolx4.y += tmp4 * color.y;
00192 densitycolx4.z += tmp4 * color.z;
00193 #endif
00194 #if DUNROLLX >= 8
00195 float dx5 = dx4 + gridspacing_coalesce;
00196 float r25 = (dx5*dx5 + dyz2) * atom.w;
00197 float tmp5 = exp2f(-r25);
00198 densityvalx5 += tmp5;
00199 tmp5 *= invisovalue;
00200 densitycolx5.x += tmp5 * color.x;
00201 densitycolx5.y += tmp5 * color.y;
00202 densitycolx5.z += tmp5 * color.z;
00203
00204 float dx6 = dx5 + gridspacing_coalesce;
00205 float r26 = (dx6*dx6 + dyz2) * atom.w;
00206 float tmp6 = exp2f(-r26);
00207 densityvalx6 += tmp6;
00208 tmp6 *= invisovalue;
00209 densitycolx6.x += tmp6 * color.x;
00210 densitycolx6.y += tmp6 * color.y;
00211 densitycolx6.z += tmp6 * color.z;
00212
00213 float dx7 = dx6 + gridspacing_coalesce;
00214 float r27 = (dx7*dx7 + dyz2) * atom.w;
00215 float tmp7 = exp2f(-r27);
00216 densityvalx7 += tmp7;
00217 tmp7 *= invisovalue;
00218 densitycolx7.x += tmp7 * color.x;
00219 densitycolx7.y += tmp7 * color.y;
00220 densitycolx7.z += tmp7 * color.z;
00221
00222 float dx8 = dx7 + gridspacing_coalesce;
00223 float r28 = (dx8*dx8 + dyz2) * atom.w;
00224 float tmp8 = exp2f(-r28);
00225 densityvalx8 += tmp8;
00226 tmp8 *= invisovalue;
00227 densitycolx8.x += tmp8 * color.x;
00228 densitycolx8.y += tmp8 * color.y;
00229 densitycolx8.z += tmp8 * color.z;
00230 #endif
00231 }
00232
00233 densitygrid[outaddr ] += densityvalx1;
00234 voltexmap[outaddr ].x += densitycolx1.x;
00235 voltexmap[outaddr ].y += densitycolx1.y;
00236 voltexmap[outaddr ].z += densitycolx1.z;
00237
00238 densitygrid[outaddr+1*DBLOCKSZX] += densityvalx2;
00239 voltexmap[outaddr+1*DBLOCKSZX].x += densitycolx2.x;
00240 voltexmap[outaddr+1*DBLOCKSZX].y += densitycolx2.y;
00241 voltexmap[outaddr+1*DBLOCKSZX].z += densitycolx2.z;
00242
00243 #if DUNROLLX >= 4
00244 densitygrid[outaddr+2*DBLOCKSZX] += densityvalx3;
00245 voltexmap[outaddr+2*DBLOCKSZX].x += densitycolx3.x;
00246 voltexmap[outaddr+2*DBLOCKSZX].y += densitycolx3.y;
00247 voltexmap[outaddr+2*DBLOCKSZX].z += densitycolx3.z;
00248
00249 densitygrid[outaddr+3*DBLOCKSZX] += densityvalx4;
00250 voltexmap[outaddr+3*DBLOCKSZX].x += densitycolx4.x;
00251 voltexmap[outaddr+3*DBLOCKSZX].y += densitycolx4.y;
00252 voltexmap[outaddr+3*DBLOCKSZX].z += densitycolx4.z;
00253 #endif
00254 #if DUNROLLX >= 8
00255 densitygrid[outaddr+4*DBLOCKSZX] += densityvalx5;
00256 voltexmap[outaddr+4*DBLOCKSZX].x += densitycolx5.x;
00257 voltexmap[outaddr+4*DBLOCKSZX].y += densitycolx5.y;
00258 voltexmap[outaddr+4*DBLOCKSZX].z += densitycolx5.z;
00259
00260 densitygrid[outaddr+5*DBLOCKSZX] += densityvalx6;
00261 voltexmap[outaddr+5*DBLOCKSZX].x += densitycolx6.x;
00262 voltexmap[outaddr+5*DBLOCKSZX].y += densitycolx6.y;
00263 voltexmap[outaddr+5*DBLOCKSZX].z += densitycolx6.z;
00264
00265 densitygrid[outaddr+6*DBLOCKSZX] += densityvalx7;
00266 voltexmap[outaddr+6*DBLOCKSZX].x += densitycolx7.x;
00267 voltexmap[outaddr+6*DBLOCKSZX].y += densitycolx7.y;
00268 voltexmap[outaddr+6*DBLOCKSZX].z += densitycolx7.z;
00269
00270 densitygrid[outaddr+7*DBLOCKSZX] += densityvalx8;
00271 voltexmap[outaddr+7*DBLOCKSZX].x += densitycolx8.x;
00272 voltexmap[outaddr+7*DBLOCKSZX].y += densitycolx8.y;
00273 voltexmap[outaddr+7*DBLOCKSZX].z += densitycolx8.z;
00274 #endif
00275 }
00276
00277
00278 __global__ static void gaussdensity_direct(int natoms,
00279 const float4 *xyzr,
00280 float gridspacing, unsigned int z,
00281 float *densitygrid) {
00282 unsigned int xindex = (blockIdx.x * blockDim.x) * DUNROLLX + threadIdx.x;
00283 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00284 unsigned int zindex = (blockIdx.z * blockDim.z) + threadIdx.z;
00285 unsigned int outaddr =
00286 ((gridDim.x * blockDim.x) * DUNROLLX) * (gridDim.y * blockDim.y) * zindex +
00287 ((gridDim.x * blockDim.x) * DUNROLLX) * yindex + xindex;
00288 zindex += z;
00289
00290 float coorx = gridspacing * xindex;
00291 float coory = gridspacing * yindex;
00292 float coorz = gridspacing * zindex;
00293
00294 float densityvalx1=0.0f;
00295 float densityvalx2=0.0f;
00296 #if DUNROLLX >= 4
00297 float densityvalx3=0.0f;
00298 float densityvalx4=0.0f;
00299 #endif
00300 #if DUNROLLX >= 8
00301 float densityvalx5=0.0f;
00302 float densityvalx6=0.0f;
00303 float densityvalx7=0.0f;
00304 float densityvalx8=0.0f;
00305 #endif
00306
00307 float gridspacing_coalesce = gridspacing * DBLOCKSZX;
00308
00309 int atomid;
00310 for (atomid=0; atomid<natoms; atomid++) {
00311 float4 atom = xyzr[atomid];
00312 float dy = coory - atom.y;
00313 float dz = coorz - atom.z;
00314 float dyz2 = dy*dy + dz*dz;
00315
00316 float dx1 = coorx - atom.x;
00317 float r21 = (dx1*dx1 + dyz2) * atom.w;
00318 densityvalx1 += exp2f(-r21);
00319
00320 float dx2 = dx1 + gridspacing_coalesce;
00321 float r22 = (dx2*dx2 + dyz2) * atom.w;
00322 densityvalx2 += exp2f(-r22);
00323
00324 #if DUNROLLX >= 4
00325 float dx3 = dx2 + gridspacing_coalesce;
00326 float r23 = (dx3*dx3 + dyz2) * atom.w;
00327 densityvalx3 += exp2f(-r23);
00328
00329 float dx4 = dx3 + gridspacing_coalesce;
00330 float r24 = (dx4*dx4 + dyz2) * atom.w;
00331 densityvalx4 += exp2f(-r24);
00332 #endif
00333 #if DUNROLLX >= 8
00334 float dx5 = dx4 + gridspacing_coalesce;
00335 float r25 = (dx5*dx5 + dyz2) * atom.w;
00336 densityvalx5 += exp2f(-r25);
00337
00338 float dx6 = dx5 + gridspacing_coalesce;
00339 float r26 = (dx6*dx6 + dyz2) * atom.w;
00340 densityvalx6 += exp2f(-r26);
00341
00342 float dx7 = dx6 + gridspacing_coalesce;
00343 float r27 = (dx7*dx7 + dyz2) * atom.w;
00344 densityvalx7 += exp2f(-r27);
00345
00346 float dx8 = dx7 + gridspacing_coalesce;
00347 float r28 = (dx8*dx8 + dyz2) * atom.w;
00348 densityvalx8 += exp2f(-r28);
00349 #endif
00350 }
00351
00352 densitygrid[outaddr ] += densityvalx1;
00353 densitygrid[outaddr+1*DBLOCKSZX] += densityvalx2;
00354 #if DUNROLLX >= 4
00355 densitygrid[outaddr+2*DBLOCKSZX] += densityvalx3;
00356 densitygrid[outaddr+3*DBLOCKSZX] += densityvalx4;
00357 #endif
00358 #if DUNROLLX >= 8
00359 densitygrid[outaddr+4*DBLOCKSZX] += densityvalx5;
00360 densitygrid[outaddr+5*DBLOCKSZX] += densityvalx6;
00361 densitygrid[outaddr+6*DBLOCKSZX] += densityvalx7;
00362 densitygrid[outaddr+7*DBLOCKSZX] += densityvalx8;
00363 #endif
00364 }
00365
00366
00367
00368
00369 #define TILESIZEX DBLOCKSZX*DUNROLLX
00370 #define TILESIZEY DBLOCKSZY
00371 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00372 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00373
00374 int vmd_cuda_gaussdensity_direct(long int natoms, float4 *xyzr, float4 *colors,
00375 float* volmap, float *voltexmap, int3 volsz,
00376 float radscale, float gridspacing,
00377 float isovalue, float gausslim) {
00378 enthrparms parms;
00379 wkf_timerhandle globaltimer;
00380 double totalruntime;
00381 int rc=0;
00382
00383 int numprocs = wkf_thread_numprocessors();
00384 int deviceCount = 0;
00385 if (vmd_cuda_num_devices(&deviceCount))
00386 return -1;
00387 if (deviceCount < 1)
00388 return -1;
00389
00390
00391
00392 if (deviceCount < numprocs) {
00393 numprocs = deviceCount;
00394 }
00395
00396 printf("Using %d CUDA GPUs\n", numprocs);
00397 printf("GPU padded grid size: %d x %d x %d\n",
00398 (volsz.x + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00399 (volsz.y + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00400 volsz.z);
00401
00402 parms.isovalue = isovalue;
00403 parms.radscale = radscale;
00404 parms.xyzr = xyzr;
00405 parms.colors = colors;
00406 parms.volmap = volmap;
00407 parms.voltexmap = voltexmap;
00408 parms.numplane = volsz.z;
00409 parms.numcol = volsz.y;
00410 parms.numpt = volsz.x;
00411 parms.natoms = natoms;
00412 parms.gridspacing = gridspacing;
00413
00414 globaltimer = wkf_timer_create();
00415 wkf_timer_start(globaltimer);
00416
00417
00418 wkf_tasktile_t tile;
00419 tile.start=0;
00420 tile.end=volsz.z;
00421 rc = wkf_threadlaunch(numprocs, &parms, cudadensitythread, &tile);
00422
00423
00424 wkf_timer_stop(globaltimer);
00425 totalruntime = wkf_timer_time(globaltimer);
00426 wkf_timer_destroy(globaltimer);
00427
00428 if (!rc) {
00429 double atomevalssec = ((double) volsz.x * volsz.y * volsz.z * natoms) / (totalruntime * 1000000000.0);
00430 printf(" %g billion atom evals/second, %g GFLOPS\n",
00431 atomevalssec,
00432 atomevalssec * ((voltexmap==NULL) ? FLOPSPERATOMEVAL : FLOPSPERATOMEVALTEX));
00433 } else {
00434 msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
00435 msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00436 }
00437 return rc;
00438 }
00439
00440
00441
00442
00443
00444 static void * cudadensitythread(void *voidparms) {
00445 dim3 volsize, Gsz, Bsz;
00446 float *devdensity = NULL;
00447 float *devtexmap = NULL;
00448 float *hostdensity = NULL;
00449 float *hosttexmap = NULL;
00450 enthrparms *parms = NULL;
00451 int threadid=0;
00452
00453 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00454 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00455
00456
00457
00458
00459 float isovalue = parms->isovalue;
00460
00461 const float4 *xyzr = parms->xyzr;
00462 const float4 *colors = parms->colors;
00463 float *volmap = parms->volmap;
00464 float *voltexmap = parms->voltexmap;
00465 const long int numplane = parms->numplane;
00466 const int numcol = parms->numcol;
00467 const int numpt = parms->numpt;
00468 const int natoms = parms->natoms;
00469 const float gridspacing = parms->gridspacing;
00470 double lasttime, totaltime;
00471
00472 cudaError_t rc;
00473 rc = cudaSetDevice(threadid);
00474 if (rc != cudaSuccess) {
00475 #if CUDART_VERSION >= 2010
00476 rc = cudaGetLastError();
00477 if (rc != cudaErrorSetOnActiveProcess)
00478 return NULL;
00479 #else
00480 cudaGetLastError();
00481
00482 #endif
00483 }
00484
00485
00486 volsize.x = (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00487 volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00488 volsize.z = 1;
00489
00490
00491 Bsz.x = DBLOCKSZX;
00492 Bsz.y = DBLOCKSZY;
00493 Bsz.z = 1;
00494 Gsz.x = volsize.x / (Bsz.x * DUNROLLX);
00495 Gsz.y = volsize.y / Bsz.y;
00496 Gsz.z = 1;
00497 int maxplanes = 4;
00498 int volplanesz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00499 int volmemsz = maxplanes * volplanesz;
00500
00501 printf("Thread %d started for CUDA device %d, grid size %dx%dx%d\n",
00502 threadid, threadid, Gsz.x, Gsz.y, Gsz.z);
00503 wkf_timerhandle timer = wkf_timer_create();
00504 wkf_timer_start(timer);
00505 wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00506
00507
00508 cudaMalloc((void**)&devdensity, volmemsz);
00509 if (voltexmap != NULL) cudaMalloc((void**)&devtexmap, volmemsz*3);
00510 CUERR
00511
00512 hostdensity = (float *) malloc(volmemsz);
00513 if (voltexmap != NULL) hosttexmap = (float *) malloc(volmemsz*3);
00514
00515 float *devatoms=NULL, *devradii=NULL, *devcolors=NULL;
00516 cudaMalloc((void**)&devatoms, natoms*4*sizeof(float));
00517 cudaMemcpy(devatoms, xyzr, natoms*4*sizeof(float), cudaMemcpyHostToDevice);
00518 if (colors) {
00519 cudaMalloc((void**)&devcolors, natoms*4*sizeof(float));
00520 cudaMemcpy(devcolors, colors, natoms*4*sizeof(float), cudaMemcpyHostToDevice);
00521 }
00522 CUERR
00523
00524
00525 int computedplanes=0;
00526 wkf_tasktile_t tile;
00527 while (wkf_threadlaunch_next_tile(voidparms, maxplanes, &tile) != WKF_SCHED_DONE) {
00528 int z, k;
00529 int numk=tile.end - tile.start;
00530 int runplanes = min(numk, maxplanes);
00531
00532 Gsz.z = runplanes;
00533 for (k=tile.start; k<tile.end; k+=runplanes) {
00534
00535 int y;
00536 computedplanes++;
00537
00538 #if 1
00539 cudaMemset(devdensity, 0, volmemsz);
00540 if (hosttexmap != NULL)
00541 cudaMemset(devtexmap, 0, 3*volmemsz);
00542 CUERR
00543 #else
00544
00545
00546 for (z=k; z<k+runplanes; z++) {
00547 for (y=0; y<numcol; y++) {
00548 long densaddr = z*numcol*numpt + y*numpt;
00549 memcpy(&hostdensity[(z-k)*volsize.x*volsize.y + y*volsize.x],
00550 &volmap[densaddr], numpt * sizeof(float));
00551 if (hosttexmap != NULL)
00552 memcpy(&hosttexmap[((z-k)*volsize.x*volsize.y + y*volsize.x)*3],
00553 &voltexmap[densaddr*3], 3 * numpt * sizeof(float));
00554 }
00555 }
00556
00557
00558 cudaMemcpy(devdensity, hostdensity, volmemsz, cudaMemcpyHostToDevice);
00559 if (devtexmap != NULL)
00560 cudaMemcpy(devtexmap, hosttexmap, volmemsz*3, cudaMemcpyHostToDevice);
00561 CUERR
00562 #endif
00563
00564 lasttime = wkf_timer_timenow(timer);
00565
00566 if (devtexmap != NULL) {
00567 gaussdensity_direct_tex<<<Gsz, Bsz, 0>>>(natoms,
00568 (float4*) devatoms,
00569 (float4*) devcolors,
00570 gridspacing, k,
00571 devdensity,
00572 (float3*) devtexmap,
00573 1.0f / isovalue);
00574 } else {
00575 gaussdensity_direct<<<Gsz, Bsz, 0>>>(natoms,
00576 (float4*) devatoms,
00577 gridspacing, k, devdensity);
00578 }
00579 cudaThreadSynchronize();
00580 CUERR
00581
00582
00583 cudaMemcpy(hostdensity, devdensity, volmemsz, cudaMemcpyDeviceToHost);
00584 if (devtexmap != NULL)
00585 cudaMemcpy(hosttexmap, devtexmap, volmemsz*3, cudaMemcpyDeviceToHost);
00586 CUERR
00587
00588
00589 for (z=k; z<k+runplanes; z++) {
00590 for (y=0; y<numcol; y++) {
00591 long densaddr = z*numcol*numpt + y*numpt;
00592 memcpy(&volmap[densaddr],
00593 &hostdensity[(z-k)*volsize.x*volsize.y + y*volsize.x],
00594 numpt * sizeof(float));
00595 if (hosttexmap != NULL)
00596 memcpy(&voltexmap[densaddr*3],
00597 &hosttexmap[((z-k)*volsize.x*volsize.y + y*volsize.x)*3],
00598 3 * numpt * sizeof(float));
00599 }
00600 }
00601
00602 totaltime = wkf_timer_timenow(timer);
00603 if (wkf_msg_timer_timeout(msgt)) {
00604
00605 printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00606 threadid, k, numplane, computedplanes,
00607 totaltime - lasttime, totaltime,
00608 totaltime * numplane / (k+1));
00609 }
00610 }
00611 }
00612
00613 wkf_timer_destroy(timer);
00614 wkf_msg_timer_destroy(msgt);
00615 free(hostdensity);
00616 if (hosttexmap != NULL)
00617 free(hosttexmap);
00618 cudaFree(devdensity);
00619 if (devtexmap != NULL)
00620 cudaFree(devtexmap);
00621 if (devatoms != NULL)
00622 cudaFree(devatoms);
00623 if (devradii != NULL)
00624 cudaFree(devradii);
00625 if (devcolors != NULL)
00626 cudaFree(devcolors);
00627
00628 CUERR
00629
00630 return NULL;
00631 }
00632
00633
00634
00635
00636
00637
00638
00639
00640 #include <thrust/sort.h>
00641
00642 #define GRID_CELL_EMPTY 0xffffffff
00643
00644
00645 __global__ static void hashAtoms(unsigned int natoms,
00646 const float4 *xyzr,
00647 int3 numvoxels,
00648 float invgridspacing,
00649 unsigned int *atomIndex,
00650 unsigned int *atomHash) {
00651 unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00652 if (index >= natoms)
00653 return;
00654
00655 float4 atom = xyzr[index];
00656
00657 int3 cell;
00658 cell.x = atom.x * invgridspacing;
00659 cell.y = atom.y * invgridspacing;
00660 cell.z = atom.z * invgridspacing;
00661
00662 cell.x = min(cell.x, numvoxels.x-1);
00663 cell.y = min(cell.y, numvoxels.y-1);
00664 cell.z = min(cell.z, numvoxels.z-1);
00665
00666 unsigned int hash = (cell.z * numvoxels.y * numvoxels.x) +
00667 (cell.y * numvoxels.x) + cell.x;
00668
00669 atomIndex[index] = index;
00670 atomHash[index] = hash;
00671 }
00672
00673
00674
00675 __global__ static void sortAtomsGenCellLists(unsigned int natoms,
00676 const float4 *xyzr_d,
00677 const float4 *color_d,
00678 const unsigned int *atomIndex_d,
00679 const unsigned int *atomHash_d,
00680 float4 *sorted_xyzr_d,
00681 float4 *sorted_color_d,
00682 uint2 *cellStartEnd_d) {
00683 extern __shared__ unsigned int hash_s[];
00684 unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00685 unsigned int hash;
00686
00687 if (index < natoms) {
00688 hash = atomHash_d[index];
00689 hash_s[threadIdx.x+1] = hash;
00690 if (index > 0 && threadIdx.x == 0) {
00691
00692 hash_s[0] = atomHash_d[index-1];
00693 }
00694 }
00695
00696 __syncthreads();
00697
00698 if (index < natoms) {
00699
00700
00701
00702 if (index == 0 || hash != hash_s[threadIdx.x]) {
00703 cellStartEnd_d[hash].x = index;
00704 if (index > 0)
00705 cellStartEnd_d[hash_s[threadIdx.x]].y = index;
00706 }
00707
00708 if (index == natoms - 1) {
00709 cellStartEnd_d[hash].y = index + 1;
00710 }
00711
00712
00713 unsigned int sortedIndex = atomIndex_d[index];
00714 float4 pos = xyzr_d[sortedIndex];
00715 sorted_xyzr_d[index] = pos;
00716
00717
00718 if (color_d != NULL) {
00719 float4 col = color_d[sortedIndex];
00720 sorted_color_d[index] = col;
00721 }
00722 }
00723 }
00724
00725
00726 static int vmd_cuda_build_density_atom_grid(int natoms,
00727 const float4 * xyzr_d,
00728 const float4 * color_d,
00729 float4 * sorted_xyzr_d,
00730 float4 * sorted_color_d,
00731 unsigned int *atomIndex_d,
00732 unsigned int *atomHash_d,
00733 uint2 * cellStartEnd_d,
00734 int3 volsz,
00735 float invgridspacing) {
00736
00737
00738 dim3 hBsz(256, 1, 1);
00739
00740
00741
00742
00743
00744
00745
00746 if (natoms > 16000000)
00747 hBsz.x = 512;
00748
00749 dim3 hGsz(((natoms+hBsz.x-1) / hBsz.x), 1, 1);
00750
00751
00752
00753
00754 hashAtoms<<<hGsz, hBsz>>>(natoms, xyzr_d, volsz, invgridspacing,
00755 atomIndex_d, atomHash_d);
00756
00757
00758
00759 #if 1
00760
00761
00762
00763
00764 try {
00765 thrust::sort_by_key(thrust::device_ptr<unsigned int>(atomHash_d),
00766 thrust::device_ptr<unsigned int>(atomHash_d + natoms),
00767 thrust::device_ptr<unsigned int>(atomIndex_d));
00768 }
00769 catch (std::bad_alloc) {
00770 printf("CUDA Thrust memory allocation failed: %s line %d\n", __FILE__, __LINE__);
00771 return -1;
00772 }
00773 #else
00774 thrust::sort_by_key(thrust::device_ptr<unsigned int>(atomHash_d),
00775 thrust::device_ptr<unsigned int>(atomHash_d + natoms),
00776 thrust::device_ptr<unsigned int>(atomIndex_d));
00777 #endif
00778
00779
00780 int ncells = volsz.x * volsz.y * volsz.z;
00781 cudaMemset(cellStartEnd_d, GRID_CELL_EMPTY, ncells*sizeof(uint2));
00782
00783
00784
00785
00786 unsigned int smemSize = sizeof(unsigned int)*(hBsz.x+1);
00787 sortAtomsGenCellLists<<<hGsz, hBsz, smemSize>>>(
00788 natoms, xyzr_d, color_d, atomIndex_d, atomHash_d,
00789 sorted_xyzr_d, sorted_color_d, cellStartEnd_d);
00790
00791 #if 1
00792
00793
00794
00795
00796 cudaThreadSynchronize();
00797 cudaError_t err = cudaGetLastError();
00798 if (err != cudaSuccess) {
00799 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
00800 return -1;
00801 }
00802 #endif
00803
00804 return 0;
00805 }
00806
00807
00808
00809
00810
00811 #define GGRIDSZ 8.0f
00812 #define GBLOCKSZX 8
00813 #define GBLOCKSZY 8
00814
00815 #if 1
00816 #define GTEXBLOCKSZZ 2
00817 #define GTEXUNROLL 4
00818 #define GBLOCKSZZ 2
00819 #define GUNROLL 4
00820 #else
00821 #define GTEXBLOCKSZZ 8
00822 #define GTEXUNROLL 1
00823 #define GBLOCKSZZ 8
00824 #define GUNROLL 1
00825 #endif
00826
00827 __global__ static void gaussdensity_fast_tex(int natoms,
00828 const float4 *sorted_xyzr,
00829 const float4 *sorted_color,
00830 int3 numvoxels,
00831 int3 acncells,
00832 float acgridspacing,
00833 float invacgridspacing,
00834 const uint2 * cellStartEnd,
00835 float gridspacing, unsigned int z,
00836 float *densitygrid,
00837 float3 *voltexmap,
00838 float invisovalue) {
00839 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00840 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00841 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GTEXUNROLL;
00842
00843
00844 unsigned int outaddr = zindex * numvoxels.x * numvoxels.y +
00845 yindex * numvoxels.x + xindex;
00846
00847
00848 if (xindex >= numvoxels.x || yindex >= numvoxels.y || zindex >= numvoxels.z)
00849 return;
00850
00851 zindex += z;
00852
00853
00854 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00855 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00856 int zabmin = ((z + blockIdx.z * blockDim.z * GTEXUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00857
00858
00859 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00860 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00861 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GTEXUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00862
00863 xabmin = (xabmin < 0) ? 0 : xabmin;
00864 yabmin = (yabmin < 0) ? 0 : yabmin;
00865 zabmin = (zabmin < 0) ? 0 : zabmin;
00866 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
00867 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
00868 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
00869
00870 float coorx = gridspacing * xindex;
00871 float coory = gridspacing * yindex;
00872 float coorz = gridspacing * zindex;
00873
00874 float densityval1=0.0f;
00875 float3 densitycol1=make_float3(0.0f, 0.0f, 0.0f);
00876 #if GTEXUNROLL >= 2
00877 float densityval2=0.0f;
00878 float3 densitycol2=densitycol1;
00879 #endif
00880 #if GTEXUNROLL >= 4
00881 float densityval3=0.0f;
00882 float3 densitycol3=densitycol1;
00883 float densityval4=0.0f;
00884 float3 densitycol4=densitycol1;
00885 #endif
00886
00887 int acplanesz = acncells.x * acncells.y;
00888 int xab, yab, zab;
00889 for (zab=zabmin; zab<=zabmax; zab++) {
00890 for (yab=yabmin; yab<=yabmax; yab++) {
00891 for (xab=xabmin; xab<=xabmax; xab++) {
00892 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00893 uint2 atomstartend = cellStartEnd[abcellidx];
00894 if (atomstartend.x != GRID_CELL_EMPTY) {
00895 unsigned int atomid;
00896 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00897 float4 atom = sorted_xyzr[atomid];
00898 float4 color = sorted_color[atomid];
00899 float dx = coorx - atom.x;
00900 float dy = coory - atom.y;
00901 float dxy2 = dx*dx + dy*dy;
00902
00903 float dz = coorz - atom.z;
00904 float r21 = (dxy2 + dz*dz) * atom.w;
00905 float tmp1 = exp2f(r21);
00906 densityval1 += tmp1;
00907 tmp1 *= invisovalue;
00908 densitycol1.x += tmp1 * color.x;
00909 densitycol1.y += tmp1 * color.y;
00910 densitycol1.z += tmp1 * color.z;
00911
00912 #if GTEXUNROLL >= 2
00913 float dz2 = dz + gridspacing;
00914 float r22 = (dxy2 + dz2*dz2) * atom.w;
00915 float tmp2 = exp2f(r22);
00916 densityval2 += tmp2;
00917 tmp2 *= invisovalue;
00918 densitycol2.x += tmp2 * color.x;
00919 densitycol2.y += tmp2 * color.y;
00920 densitycol2.z += tmp2 * color.z;
00921 #endif
00922 #if GTEXUNROLL >= 4
00923 float dz3 = dz2 + gridspacing;
00924 float r23 = (dxy2 + dz3*dz3) * atom.w;
00925 float tmp3 = exp2f(r23);
00926 densityval3 += tmp3;
00927 tmp3 *= invisovalue;
00928 densitycol3.x += tmp3 * color.x;
00929 densitycol3.y += tmp3 * color.y;
00930 densitycol3.z += tmp3 * color.z;
00931
00932 float dz4 = dz3 + gridspacing;
00933 float r24 = (dxy2 + dz4*dz4) * atom.w;
00934 float tmp4 = exp2f(r24);
00935 densityval4 += tmp4;
00936 tmp4 *= invisovalue;
00937 densitycol4.x += tmp4 * color.x;
00938 densitycol4.y += tmp4 * color.y;
00939 densitycol4.z += tmp4 * color.z;
00940 #endif
00941 }
00942 }
00943 }
00944 }
00945 }
00946
00947 densitygrid[outaddr ] = densityval1;
00948 voltexmap[outaddr ].x = densitycol1.x;
00949 voltexmap[outaddr ].y = densitycol1.y;
00950 voltexmap[outaddr ].z = densitycol1.z;
00951
00952 #if GTEXUNROLL >= 2
00953 int planesz = numvoxels.x * numvoxels.y;
00954 densitygrid[outaddr + planesz] = densityval2;
00955 voltexmap[outaddr + planesz].x = densitycol2.x;
00956 voltexmap[outaddr + planesz].y = densitycol2.y;
00957 voltexmap[outaddr + planesz].z = densitycol2.z;
00958 #endif
00959 #if GTEXUNROLL >= 4
00960 densitygrid[outaddr + 2*planesz] = densityval3;
00961 voltexmap[outaddr + 2*planesz].x = densitycol3.x;
00962 voltexmap[outaddr + 2*planesz].y = densitycol3.y;
00963 voltexmap[outaddr + 2*planesz].z = densitycol3.z;
00964
00965 densitygrid[outaddr + 3*planesz] = densityval4;
00966 voltexmap[outaddr + 3*planesz].x = densitycol4.x;
00967 voltexmap[outaddr + 3*planesz].y = densitycol4.y;
00968 voltexmap[outaddr + 3*planesz].z = densitycol4.z;
00969 #endif
00970 }
00971
00972
00973 __global__ static void gaussdensity_fast(int natoms,
00974 const float4 *sorted_xyzr,
00975 int3 numvoxels,
00976 int3 acncells,
00977 float acgridspacing,
00978 float invacgridspacing,
00979 const uint2 * cellStartEnd,
00980 float gridspacing, unsigned int z,
00981 float *densitygrid) {
00982 unsigned int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
00983 unsigned int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
00984 unsigned int zindex = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00985 unsigned int outaddr = zindex * numvoxels.x * numvoxels.y +
00986 yindex * numvoxels.x +
00987 xindex;
00988
00989
00990 if (xindex >= numvoxels.x || yindex >= numvoxels.y || zindex >= numvoxels.z)
00991 return;
00992
00993 zindex += z;
00994
00995
00996 int xabmin = ((blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00997 int yabmin = ((blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00998 int zabmin = ((z + blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00999
01000
01001 int xabmax = (((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
01002 int yabmax = (((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
01003 int zabmax = ((z + (blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
01004
01005 xabmin = (xabmin < 0) ? 0 : xabmin;
01006 yabmin = (yabmin < 0) ? 0 : yabmin;
01007 zabmin = (zabmin < 0) ? 0 : zabmin;
01008 xabmax = (xabmax >= acncells.x-1) ? acncells.x-1 : xabmax;
01009 yabmax = (yabmax >= acncells.y-1) ? acncells.y-1 : yabmax;
01010 zabmax = (zabmax >= acncells.z-1) ? acncells.z-1 : zabmax;
01011
01012 float coorx = gridspacing * xindex;
01013 float coory = gridspacing * yindex;
01014 float coorz = gridspacing * zindex;
01015
01016 float densityval1=0.0f;
01017 #if GUNROLL >= 2
01018 float densityval2=0.0f;
01019 #endif
01020 #if GUNROLL >= 4
01021 float densityval3=0.0f;
01022 float densityval4=0.0f;
01023 #endif
01024
01025 int acplanesz = acncells.x * acncells.y;
01026 int xab, yab, zab;
01027 for (zab=zabmin; zab<=zabmax; zab++) {
01028 for (yab=yabmin; yab<=yabmax; yab++) {
01029 for (xab=xabmin; xab<=xabmax; xab++) {
01030 int abcellidx = zab * acplanesz + yab * acncells.x + xab;
01031 uint2 atomstartend = cellStartEnd[abcellidx];
01032 if (atomstartend.x != GRID_CELL_EMPTY) {
01033 unsigned int atomid;
01034 for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
01035 float4 atom = sorted_xyzr[atomid];
01036 float dx = coorx - atom.x;
01037 float dy = coory - atom.y;
01038 float dxy2 = dx*dx + dy*dy;
01039
01040 float dz = coorz - atom.z;
01041 float r21 = (dxy2 + dz*dz) * atom.w;
01042 densityval1 += exp2f(r21);
01043
01044 #if GUNROLL >= 2
01045 float dz2 = dz + gridspacing;
01046 float r22 = (dxy2 + dz2*dz2) * atom.w;
01047 densityval2 += exp2f(r22);
01048 #endif
01049 #if GUNROLL >= 4
01050 float dz3 = dz2 + gridspacing;
01051 float r23 = (dxy2 + dz3*dz3) * atom.w;
01052 densityval3 += exp2f(r23);
01053
01054 float dz4 = dz3 + gridspacing;
01055 float r24 = (dxy2 + dz4*dz4) * atom.w;
01056 densityval4 += exp2f(r24);
01057 #endif
01058 }
01059 }
01060 }
01061 }
01062 }
01063
01064 densitygrid[outaddr ] = densityval1;
01065 #if GUNROLL >= 2
01066 int planesz = numvoxels.x * numvoxels.y;
01067 densitygrid[outaddr + planesz] = densityval2;
01068 #endif
01069 #if GUNROLL >= 4
01070 densitygrid[outaddr + 2*planesz] = densityval3;
01071 densitygrid[outaddr + 3*planesz] = densityval4;
01072 #endif
01073 }
01074
01075
01076
01077 typedef struct {
01079 long int natoms;
01080 int colorperatom;
01081 int gx;
01082 int gy;
01083 int gz;
01084
01085 CUDAMarchingCubes *mc;
01086
01087 float *devdensity;
01088 float *devvoltexmap;
01089 float4 *xyzr_d;
01090 float4 *sorted_xyzr_d;
01091 float4 *color_d;
01092 float4 *sorted_color_d;
01093
01094 unsigned int *atomIndex_d;
01095 unsigned int *atomHash_d;
01096 uint2 *cellStartEnd_d;
01097
01098 void *safety;
01099 float *v3f_d;
01100 float *n3f_d;
01101 float *c3f_d;
01102
01103 } qsurf_gpuhandle;
01104
01105
01106 CUDAQuickSurf::CUDAQuickSurf() {
01107 voidgpu = calloc(1, sizeof(qsurf_gpuhandle));
01108
01109 }
01110
01111
01112 CUDAQuickSurf::~CUDAQuickSurf() {
01113 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01114
01115
01116 free_bufs();
01117
01118
01119 delete gpuh->mc;
01120
01121 free(voidgpu);
01122 }
01123
01124
01125 int CUDAQuickSurf::free_bufs() {
01126 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01127
01128
01129 gpuh->natoms = 0;
01130 gpuh->colorperatom = 0;
01131 gpuh->gx = 0;
01132 gpuh->gy = 0;
01133 gpuh->gz = 0;
01134
01135 if (gpuh->safety != NULL)
01136 cudaFree(gpuh->safety);
01137 gpuh->safety=NULL;
01138
01139 if (gpuh->devdensity != NULL)
01140 cudaFree(gpuh->devdensity);
01141 gpuh->devdensity=NULL;
01142
01143 if (gpuh->devvoltexmap != NULL)
01144 cudaFree(gpuh->devvoltexmap);
01145 gpuh->devvoltexmap=NULL;
01146
01147 if (gpuh->xyzr_d != NULL)
01148 cudaFree(gpuh->xyzr_d);
01149 gpuh->xyzr_d=NULL;
01150
01151 if (gpuh->sorted_xyzr_d != NULL)
01152 cudaFree(gpuh->sorted_xyzr_d);
01153 gpuh->sorted_xyzr_d=NULL;
01154
01155 if (gpuh->color_d != NULL)
01156 cudaFree(gpuh->color_d);
01157 gpuh->color_d=NULL;
01158
01159 if (gpuh->sorted_color_d != NULL)
01160 cudaFree(gpuh->sorted_color_d);
01161 gpuh->sorted_color_d=NULL;
01162
01163 if (gpuh->atomIndex_d != NULL)
01164 cudaFree(gpuh->atomIndex_d);
01165 gpuh->atomIndex_d=NULL;
01166
01167 if (gpuh->atomHash_d != NULL)
01168 cudaFree(gpuh->atomHash_d);
01169 gpuh->atomHash_d=NULL;
01170
01171 if (gpuh->cellStartEnd_d != NULL)
01172 cudaFree(gpuh->cellStartEnd_d);
01173 gpuh->cellStartEnd_d=NULL;
01174
01175 if (gpuh->v3f_d != NULL)
01176 cudaFree(gpuh->v3f_d);
01177 gpuh->v3f_d=NULL;
01178
01179 if (gpuh->n3f_d != NULL)
01180 cudaFree(gpuh->n3f_d);
01181 gpuh->n3f_d=NULL;
01182
01183 if (gpuh->c3f_d != NULL)
01184 cudaFree(gpuh->c3f_d);
01185 gpuh->c3f_d=NULL;
01186
01187 return 0;
01188 }
01189
01190
01191 int CUDAQuickSurf::check_bufs(long int natoms, int colorperatom,
01192 int gx, int gy, int gz) {
01193 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01194
01195
01196
01197
01198
01199
01200 if (natoms <= gpuh->natoms &&
01201 colorperatom <= gpuh->colorperatom &&
01202 (gx*gy*gz) <= (gpuh->gx * gpuh->gy * gpuh->gz))
01203 return 0;
01204
01205 return -1;
01206 }
01207
01208
01209 int CUDAQuickSurf::alloc_bufs(long int natoms, int colorperatom,
01210 int gx, int gy, int gz) {
01211 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01212
01213
01214
01215 if (check_bufs(natoms, colorperatom, gx, gy, gz) == 0)
01216 return 0;
01217
01218
01219
01220 free_bufs();
01221
01222 long int ncells = gx * gy * gz;
01223 long int volmemsz = ncells * sizeof(float);
01224
01225
01226
01227
01228 cudaMalloc((void**)&gpuh->devdensity, volmemsz);
01229 if (colorperatom) {
01230 cudaMalloc((void**)&gpuh->devvoltexmap, 3*volmemsz);
01231 cudaMalloc((void**)&gpuh->color_d, natoms * sizeof(float4));
01232 cudaMalloc((void**)&gpuh->sorted_color_d, natoms * sizeof(float4));
01233 }
01234 cudaMalloc((void**)&gpuh->xyzr_d, natoms * sizeof(float4));
01235 cudaMalloc((void**)&gpuh->sorted_xyzr_d, natoms * sizeof(float4));
01236 cudaMalloc((void**)&gpuh->atomIndex_d, natoms * sizeof(unsigned int));
01237 cudaMalloc((void**)&gpuh->atomHash_d, natoms * sizeof(unsigned int));
01238 cudaMalloc((void**)&gpuh->cellStartEnd_d, ncells * sizeof(uint2));
01239
01240
01241 int chunkmaxverts = 3 * ncells;
01242 cudaMalloc((void**)&gpuh->v3f_d, 3 * chunkmaxverts * sizeof(float3));
01243 cudaMalloc((void**)&gpuh->n3f_d, 3 * chunkmaxverts * sizeof(float3));
01244 cudaMalloc((void**)&gpuh->c3f_d, 3 * chunkmaxverts * sizeof(float3));
01245
01246
01247
01248
01249
01250
01251 cudaMalloc(&gpuh->safety,
01252 natoms*sizeof(float4) +
01253 8 * gx * gy * sizeof(float) +
01254 CUDAMarchingCubes::MemUsageMC(gx, gy, gz));
01255
01256 cudaError_t err = cudaGetLastError();
01257 if (err != cudaSuccess)
01258 return -1;
01259
01260
01261
01262 gpuh->natoms = natoms;
01263 gpuh->colorperatom = colorperatom;
01264 gpuh->gx = gx;
01265 gpuh->gy = gy;
01266 gpuh->gz = gz;
01267
01268 return 0;
01269 }
01270
01271
01272 int CUDAQuickSurf::get_chunk_bufs(int testexisting,
01273 long int natoms, int colorperatom,
01274 int gx, int gy, int gz,
01275 int &cx, int &cy, int &cz,
01276 int &sx, int &sy, int &sz) {
01277 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01278 if (colorperatom)
01279 Bsz.z = GTEXBLOCKSZZ;
01280
01281 cudaError_t err = cudaGetLastError();
01282
01283
01284
01285
01286
01287 cz <<= 1;
01288 int chunkiters = 0;
01289 int chunkallocated = 0;
01290 while (!chunkallocated) {
01291
01292 chunkiters++;
01293 cz >>= 1;
01294
01295
01296
01297
01298
01299 if (cz != gz)
01300 cz-=4;
01301
01302
01303
01304 cz += (8 - (cz % 8));
01305
01306
01307
01308
01309 sx = cx;
01310 sy = cy;
01311 sz = cz;
01312
01313
01314
01315 cz+=4;
01316
01317 #if 0
01318 printf(" Trying slab size: %d (test: %d)\n", sz, testexisting);
01319 #endif
01320
01321 #if 1
01322
01323
01324 dim3 tGsz((sx+Bsz.x-1) / Bsz.x,
01325 (sy+Bsz.y-1) / Bsz.y,
01326 (sz+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01327 if (colorperatom) {
01328 tGsz.z = (sz+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01329 }
01330 if (tGsz.x * tGsz.y * tGsz.z > 65535)
01331 continue;
01332 #endif
01333
01334
01335
01336
01337 if (sz <= 8) {
01338 return -1;
01339 }
01340
01341 if (testexisting) {
01342 if (check_bufs(natoms, colorperatom, cx, cy, cz) != 0)
01343 continue;
01344 } else {
01345 if (alloc_bufs(natoms, colorperatom, cx, cy, cz) != 0)
01346 continue;
01347 }
01348
01349 chunkallocated=1;
01350 }
01351
01352 return 0;
01353 }
01354
01355
01356 int CUDAQuickSurf::calc_surf(long int natoms, const float *xyzr_f,
01357 const float *colors_f,
01358 int colorperatom,
01359 float *origin, int *numvoxels, float maxrad,
01360 float radscale, float gridspacing,
01361 float isovalue, float gausslim,
01362 VMDDisplayList *cmdList) {
01363 qsurf_gpuhandle *gpuh = (qsurf_gpuhandle *) voidgpu;
01364
01365 float4 *colors = (float4 *) colors_f;
01366 int3 volsz;
01367 volsz.x = numvoxels[0];
01368 volsz.y = numvoxels[1];
01369 volsz.z = numvoxels[2];
01370
01371 int chunkmaxverts=0;
01372 int chunknumverts=0;
01373 int numverts=0;
01374 int numfacets=0;
01375
01376 wkf_timerhandle globaltimer = wkf_timer_create();
01377 wkf_timer_start(globaltimer);
01378
01379 cudaError_t err;
01380 cudaDeviceProp deviceProp;
01381 int dev;
01382 if (cudaGetDevice(&dev) != cudaSuccess) {
01383 wkf_timer_destroy(globaltimer);
01384 return -1;
01385 }
01386
01387 memset(&deviceProp, 0, sizeof(cudaDeviceProp));
01388
01389 if (cudaGetDeviceProperties(&deviceProp, dev) != cudaSuccess) {
01390 wkf_timer_destroy(globaltimer);
01391 err = cudaGetLastError();
01392 return -1;
01393 }
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404 if ((deviceProp.major < 2) &&
01405 ((deviceProp.major == 1) && (deviceProp.minor < 3))) {
01406 wkf_timer_destroy(globaltimer);
01407 return -1;
01408 }
01409
01410
01411 float acgridspacing = gausslim * radscale * maxrad;
01412
01413
01414 if (acgridspacing < gridspacing)
01415 acgridspacing = gridspacing;
01416
01417
01418
01419
01420 int3 chunksz = volsz;
01421 int3 slabsz = volsz;
01422
01423 int3 accelcells;
01424 accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01425 accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01426 accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01427
01428 dim3 Bsz(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01429 if (colorperatom)
01430 Bsz.z = GTEXBLOCKSZZ;
01431
01432
01433
01434
01435 if (gpuh->natoms == 0 ||
01436 get_chunk_bufs(1, natoms, colorperatom,
01437 volsz.x, volsz.y, volsz.z,
01438 chunksz.x, chunksz.y, chunksz.z,
01439 slabsz.x, slabsz.y, slabsz.z) == -1) {
01440
01441
01442 chunksz = volsz;
01443 slabsz = volsz;
01444
01445
01446
01447 if (get_chunk_bufs(0, natoms, colorperatom,
01448 volsz.x, volsz.y, volsz.z,
01449 chunksz.x, chunksz.y, chunksz.z,
01450 slabsz.x, slabsz.y, slabsz.z) == -1) {
01451 wkf_timer_destroy(globaltimer);
01452 return -1;
01453 }
01454 }
01455 chunkmaxverts = 3 * chunksz.x * chunksz.y * chunksz.z;
01456
01457
01458
01459 if (gpuh->safety != NULL)
01460 cudaFree(gpuh->safety);
01461 gpuh->safety = NULL;
01462
01463 #if 0
01464 if (chunkiters > 1)
01465 printf(" Using GPU chunk size: %d\n", chunksz.z);
01466
01467 printf(" Accel grid(%d, %d, %d) spacing %f\n",
01468 accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01469 #endif
01470
01471
01472
01473 int i, i4;
01474 float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01475 float log2e = log2(2.718281828);
01476 for (i=0,i4=0; i<natoms; i++,i4+=4) {
01477 xyzr[i].x = xyzr_f[i4 ];
01478 xyzr[i].y = xyzr_f[i4 + 1];
01479 xyzr[i].z = xyzr_f[i4 + 2];
01480
01481 float scaledrad = xyzr_f[i4 + 3] * radscale;
01482 float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01483
01484 xyzr[i].w = arinv;
01485 }
01486 cudaMemcpy(gpuh->xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01487 free(xyzr);
01488
01489 if (colorperatom)
01490 cudaMemcpy(gpuh->color_d, colors, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01491
01492
01493 if (vmd_cuda_build_density_atom_grid(natoms, gpuh->xyzr_d, gpuh->color_d,
01494 gpuh->sorted_xyzr_d,
01495 gpuh->sorted_color_d,
01496 gpuh->atomIndex_d, gpuh->atomHash_d,
01497 gpuh->cellStartEnd_d,
01498 accelcells, 1.0f / acgridspacing) != 0) {
01499 wkf_timer_destroy(globaltimer);
01500 free_bufs();
01501 return -1;
01502 }
01503
01504 double sorttime = wkf_timer_timenow(globaltimer);
01505 double lastlooptime = sorttime;
01506
01507 double densitykerneltime = 0.0f;
01508 double densitytime = 0.0f;
01509 double mckerneltime = 0.0f;
01510 double mctime = 0.0f;
01511 double copycalltime = 0.0f;
01512 double copytime = 0.0f;
01513
01514 float *volslab_d = NULL;
01515 float *texslab_d = NULL;
01516
01517 int lzplane = GBLOCKSZZ * GUNROLL;
01518 if (colorperatom)
01519 lzplane = GTEXBLOCKSZZ * GTEXUNROLL;
01520
01521
01522 uint3 mgsz = make_uint3(chunksz.x, chunksz.y, chunksz.z);
01523 if (gpuh->mc == NULL) {
01524 gpuh->mc = new CUDAMarchingCubes();
01525 if (!gpuh->mc->Initialize(mgsz)) {
01526 printf("MC Initialize() failed\n");
01527 }
01528 } else {
01529 uint3 mcmaxgridsize = gpuh->mc->GetMaxGridSize();
01530 if (slabsz.x > mcmaxgridsize.x ||
01531 slabsz.y > mcmaxgridsize.y ||
01532 slabsz.z > mcmaxgridsize.z) {
01533 printf("*** Allocating new MC object...\n");
01534
01535 delete gpuh->mc;
01536
01537
01538 gpuh->mc = new CUDAMarchingCubes();
01539
01540 if (!gpuh->mc->Initialize(mgsz)) {
01541 printf("MC Initialize() failed while recreating MC object\n");
01542 }
01543 }
01544 }
01545
01546 int z;
01547 int chunkcount=0;
01548 for (z=0; z<volsz.z; z+=slabsz.z) {
01549 int3 curslab = slabsz;
01550 if (z+curslab.z > volsz.z)
01551 curslab.z = volsz.z - z;
01552
01553 int slabplanesz = curslab.x * curslab.y;
01554
01555 dim3 Gsz((curslab.x+Bsz.x-1) / Bsz.x,
01556 (curslab.y+Bsz.y-1) / Bsz.y,
01557 (curslab.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01558 if (colorperatom)
01559 Gsz.z = (curslab.z+(Bsz.z*GTEXUNROLL)-1) / (Bsz.z * GTEXUNROLL);
01560
01561
01562
01563
01564
01565 dim3 Gszslice = Gsz;
01566 if (deviceProp.major < 2)
01567 Gszslice.z = 1;
01568
01569 #if 0
01570 printf("CUDA device %d, grid size %dx%dx%d\n",
01571 0, Gsz.x, Gsz.y, Gsz.z);
01572 printf("CUDA: vol(%d,%d,%d) accel(%d,%d,%d)\n",
01573 curslab.x, curslab.y, curslab.z,
01574 accelcells.x, accelcells.y, accelcells.z);
01575 printf("Z=%d, curslab.z=%d\n", z, curslab.z);
01576 #endif
01577
01578
01579
01580
01581 if (z == 0) {
01582 volslab_d = gpuh->devdensity;
01583 if (colorperatom)
01584 texslab_d = gpuh->devvoltexmap;
01585 } else {
01586 cudaMemcpy(gpuh->devdensity,
01587 volslab_d + (slabsz.z-4) * slabplanesz,
01588 4 * slabplanesz * sizeof(float), cudaMemcpyDeviceToDevice);
01589 if (colorperatom)
01590 cudaMemcpy(gpuh->devvoltexmap,
01591 texslab_d + (slabsz.z-4) * 3 * slabplanesz,
01592 4*3 * slabplanesz * sizeof(float), cudaMemcpyDeviceToDevice);
01593
01594 volslab_d = gpuh->devdensity + (4 * slabplanesz);
01595 if (colorperatom)
01596 texslab_d = gpuh->devvoltexmap + (4 * 3 * slabplanesz);
01597 }
01598
01599 for (int lz=0; lz<Gsz.z; lz+=Gszslice.z) {
01600 int lzinc = lz * lzplane;
01601 float *volslice_d = volslab_d + lzinc * slabplanesz;
01602
01603 if (colorperatom) {
01604 float *texslice_d = texslab_d + lzinc * slabplanesz * 3;
01605 gaussdensity_fast_tex<<<Gszslice, Bsz, 0>>>(natoms,
01606 gpuh->sorted_xyzr_d, gpuh->sorted_color_d,
01607 curslab, accelcells, acgridspacing,
01608 1.0f / acgridspacing, gpuh->cellStartEnd_d, gridspacing, z+lzinc,
01609 volslice_d, (float3 *) texslice_d, 1.0f / isovalue);
01610 } else {
01611 gaussdensity_fast<<<Gszslice, Bsz, 0>>>(natoms,
01612 gpuh->sorted_xyzr_d,
01613 curslab, accelcells, acgridspacing, 1.0f / acgridspacing,
01614 gpuh->cellStartEnd_d, gridspacing, z+lzinc, volslice_d);
01615 }
01616 }
01617 cudaThreadSynchronize();
01618 densitykerneltime = wkf_timer_timenow(globaltimer);
01619
01620 #if 0
01621 printf(" CUDA mcubes..."); fflush(stdout);
01622 #endif
01623
01624 int vsz[3];
01625 vsz[0]=curslab.x;
01626 vsz[1]=curslab.y;
01627 vsz[2]=curslab.z;
01628
01629
01630
01631
01632 if (z != 0)
01633 vsz[2]=curslab.z + 4;
01634
01635 float bbox[3];
01636 bbox[0] = vsz[0] * gridspacing;
01637 bbox[1] = vsz[1] * gridspacing;
01638 bbox[2] = vsz[2] * gridspacing;
01639
01640
01641 float gorigin[3];
01642 gorigin[0] = origin[0];
01643 gorigin[1] = origin[1];
01644 gorigin[2] = origin[2] + (z * gridspacing);
01645
01646 if (z != 0)
01647 gorigin[2] = origin[2] + ((z-4) * gridspacing);
01648
01649 #if 0
01650 printf("\n ... vsz: %d %d %d\n", vsz[0], vsz[1], vsz[2]);
01651 printf(" ... org: %.2f %.2f %.2f\n", gorigin[0], gorigin[1], gorigin[2]);
01652 printf(" ... bxs: %.2f %.2f %.2f\n", bbox[0], bbox[1], bbox[2]);
01653 printf(" ... bbe: %.2f %.2f %.2f\n",
01654 gorigin[0]+bbox[0], gorigin[1]+bbox[1], gorigin[2]+bbox[2]);
01655 #endif
01656
01657
01658
01659
01660
01661 int skipstartplane=0;
01662 int skipendplane=0;
01663 if (chunksz.z < volsz.z) {
01664
01665 if (z != 0)
01666 skipstartplane=1;
01667
01668
01669 if (z+curslab.z < volsz.z)
01670 skipendplane=1;
01671 }
01672
01673
01674
01675
01676 uint3 gvsz = make_uint3(vsz[0], vsz[1], vsz[2]);
01677 float3 gorg = make_float3(gorigin[0], gorigin[1], gorigin[2]);
01678 float3 gbnds = make_float3(bbox[0], bbox[1], bbox[2]);
01679
01680 gpuh->mc->SetIsovalue(isovalue);
01681 if (!gpuh->mc->SetVolumeData(gpuh->devdensity, gpuh->devvoltexmap,
01682 gvsz, gorg, gbnds, true)) {
01683 printf("MC SetVolumeData() failed\n");
01684 }
01685
01686 if (skipstartplane || skipendplane) {
01687 uint3 volstart = make_uint3(0, 0, 0);
01688 uint3 volend = make_uint3(gvsz.x, gvsz.y, gvsz.z);
01689
01690 if (skipstartplane)
01691 volstart.z = 2;
01692
01693 if (skipendplane)
01694 volend.z = gvsz.z - 2;
01695
01696 gpuh->mc->SetSubVolume(volstart, volend);
01697 }
01698 gpuh->mc->computeIsosurface((float3 *) gpuh->v3f_d, (float3 *) gpuh->n3f_d,
01699 (float3 *) gpuh->c3f_d, chunkmaxverts);
01700
01701 chunknumverts = gpuh->mc->GetVertexCount();
01702
01703 #if 0
01704 printf("generated %d vertices, max vert limit: %d\n", chunknumverts, chunkmaxverts);
01705 #endif
01706 if (chunknumverts == chunkmaxverts)
01707 printf(" *** Exceeded marching cubes vertex limit (%d verts)\n", chunknumverts);
01708
01709 cudaThreadSynchronize();
01710 mckerneltime = wkf_timer_timenow(globaltimer);
01711
01712
01713 if (chunknumverts > 0) {
01714 DispCmdTriMesh cmdTriMesh;
01715 if (colorperatom) {
01716
01717 cmdTriMesh.cuda_putdata(gpuh->v3f_d, gpuh->n3f_d, gpuh->c3f_d,
01718 chunknumverts/3, cmdList);
01719 } else {
01720
01721 cmdTriMesh.cuda_putdata(gpuh->v3f_d, gpuh->n3f_d, NULL,
01722 chunknumverts/3, cmdList);
01723 }
01724 }
01725 numverts+=chunknumverts;
01726 numfacets+=chunknumverts/3;
01727
01728 #if 0
01729
01730
01731
01732 int l;
01733 int vertstart = 3 * numverts;
01734 int vertbufsz = 3 * (numverts + chunknumverts) * sizeof(float);
01735 int facebufsz = (numverts + chunknumverts) * sizeof(int);
01736 int chunkvertsz = 3 * chunknumverts * sizeof(float);
01737
01738 v = (float*) realloc(v, vertbufsz);
01739 n = (float*) realloc(n, vertbufsz);
01740 c = (float*) realloc(c, vertbufsz);
01741 f = (int*) realloc(f, facebufsz);
01742 cudaMemcpy(v+vertstart, gpuh->v3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01743 cudaMemcpy(n+vertstart, gpuh->n3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01744 if (colorperatom) {
01745 cudaMemcpy(c+vertstart, gpuh->c3f_d, chunkvertsz, cudaMemcpyDeviceToHost);
01746 } else {
01747 float *color = c+vertstart;
01748 for (l=0; l<chunknumverts*3; l+=3) {
01749 color[l + 0] = colors[0].x;
01750 color[l + 1] = colors[0].y;
01751 color[l + 2] = colors[0].z;
01752 }
01753 }
01754 for (l=numverts; l<numverts+chunknumverts; l++) {
01755 f[l]=l;
01756 }
01757 numverts+=chunknumverts;
01758 numfacets+=chunknumverts/3;
01759 #endif
01760
01761 copycalltime = wkf_timer_timenow(globaltimer);
01762
01763 densitytime += densitykerneltime - lastlooptime;
01764 mctime += mckerneltime - densitykerneltime;
01765 copytime += copycalltime - mckerneltime;
01766
01767 lastlooptime = wkf_timer_timenow(globaltimer);
01768
01769 chunkcount++;
01770 }
01771
01772
01773
01774 err = cudaGetLastError();
01775
01776 wkf_timer_stop(globaltimer);
01777 double totalruntime = wkf_timer_time(globaltimer);
01778 wkf_timer_destroy(globaltimer);
01779
01780
01781
01782 if (err != cudaSuccess) {
01783 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01784 return -1;
01785 }
01786
01787 #if 1
01788 printf(" GPU generated %d vertices, %d facets, in %d passes\n", numverts, numfacets, chunkcount);
01789
01790 printf(" GPU time (%s): %.3f [sort: %.3f density %.3f mcubes: %.3f copy: %.3f]\n",
01791 (deviceProp.major == 1 && deviceProp.minor == 3) ? "SM 1.3" : "SM 2.x",
01792 totalruntime, sorttime, densitytime, mctime, copytime);
01793 #endif
01794
01795 return 0;
01796 }
01797
01798
01799
01800
01801