QuickSurf.C Source File

00001 /***************************************************************************
00002  *cr                                                                       
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the           
00004  *cr                        University of Illinois                       
00005  *cr                         All Rights Reserved                        
00006  *cr                                                                   
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: QuickSurf.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.136 $      $Date: 2022/05/23 19:10:01 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   Fast gaussian surface representation
00019  ***************************************************************************/
00020 
00021 // pgcc 2016 has troubles with hand-vectorized x86 intrinsics presently
00022 #if !defined(__PGIC__)
00023 
00024 // Intel x86 hardware
00025 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00026 #if !defined(__SSE2__) && defined(_WIN64)
00027 #define __SSE2__ 1      /* MSVC fails to define SSE macros */
00028 #endif
00029 #endif
00030 
00031 #define VMDQSURFUSESSE 1
00032 #endif
00033 
00034 // The OpenPOWER VSX code path runs on POWER8 and later hardware, but is
00035 // untested on older platforms that support VSX instructions.
00036 // XXX GCC 4.8.5 breaks with conflicts between vec_xxx() routines
00037 //     defined in utilities.h vs. VSX intrinsics in altivec.h and similar.
00038 //     For now, we disable VSX for GCC for this source file.
00039 // IBM Power 8/9/10 Altivec/VSX instructions:
00040 //   https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
00041 #if !defined(__GNUC__) && defined(__VEC__)
00042 #define VMDQSURFUSEVSX 1
00043 #endif
00044 
00045 #include <stdio.h>
00046 #include <stdlib.h>
00047 #if VMDQSURFUSESSE && defined(__SSE2__) 
00048 #include <emmintrin.h>
00049 #endif
00050 
00051 #if (defined(VMDQSURFUSEVSX) && defined(__VSX__))
00052 #if defined(__GNUC__) && defined(__VEC__)
00053 #include <altivec.h>
00054 #endif
00055 #endif
00056 
00057 #include <string.h>
00058 #include <math.h>
00059 #include "QuickSurf.h"
00060 #if defined(VMDCUDA)
00061 #include "CUDAQuickSurf.h"
00062 #endif
00063 #include "Measure.h"
00064 #include "Inform.h"
00065 #include "utilities.h"
00066 #include "WKFUtils.h"
00067 #include "VolumetricData.h"
00068 
00069 #include "VMDDisplayList.h"
00070 #include "Displayable.h"
00071 #include "DispCmds.h"
00072 #include "ProfileHooks.h"
00073 #include "VMDApp.h" // access global CPU insn flags for dispatch
00074 
00075 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00076 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00077 
00078 // Function prototypes for the runtime-dispatched vector kernels (AVX2, NEON)
00079 void vmd_gaussdensity_avx2(int verbose,
00080                            int natoms, const float *xyzr,
00081                            const float *atomicnum,
00082                            const float *colors,
00083                            float *densitymap, float *voltexmap,
00084                            const int *numvoxels,
00085                            float radscale, float gridspacing,
00086                            float isovalue, float gausslim);
00087 
00088 void vmd_gaussdensity_neon(int verbose,
00089                            int natoms, const float *xyzr,
00090                            const float *atomicnum,
00091                            const float *colors,
00092                            float *densitymap, float *voltexmap,
00093                            const int *numvoxels,
00094                            float radscale, float gridspacing,
00095                            float isovalue, float gausslim);
00096 
00097 /*
00098  * David J. Hardy
00099  * 12 Dec 2008
00100  *
00101  * aexpfnx() - Approximate expf() for negative x.
00102  *
00103  * Assumes that x <= 0.
00104  *
00105  * Assumes IEEE format for single precision float, specifically:
00106  * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00107  *
00108  * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00109  * multiplication of a fast calculation for 2^(-N).  The interpolation
00110  * uses a linear blending of 3rd degree Taylor polynomials at the end
00111  * points, so the approximation is once differentiable.
00112  *
00113  * The error is small (max relative error per interval is calculated
00114  * to be 0.131%, with a max absolute error of -0.000716).
00115  *
00116  * The cutoff is chosen so as to speed up the computation by early
00117  * exit from function, with the value chosen to give less than
00118  * the max absolute error.  Use of a cutoff is unnecessary, except
00119  * for needing to shift smallest floating point numbers to zero,
00120  * i.e. you could remove cutoff and replace by:
00121  *
00122  * #define MINXNZ  -88.0296919311130  // -127 * log(2)
00123  *
00124  *   if (x < MINXNZ) return 0.f;
00125  *
00126  * Use of a cutoff causes a discontinuity which can be eliminated
00127  * through the use of a switching function.
00128  *
00129  * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00130  * the interval and weighting their respective Taylor polynomials by the
00131  * kth order Lagrange interpolant through those nodes.  The wiggle in the
00132  * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00133  * can be reduced by using Chebyshev nodes.
00134  */
00135 
00136 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00137 #define __align(X)  __attribute__((aligned(X) ))
00138 #if (__GNUC__ < 4)
00139 #define MISSING_mm_cvtsd_f64
00140 #endif
00141 #else
00142 #define __align(X) __declspec(align(X) )
00143 #endif
00144 
00145 #define MLOG2EF    -1.44269504088896f
00146 
00147 /*
00148  * Interpolating coefficients for linear blending of the
00149  * 3rd degree Taylor expansion of 2^x about 0 and -1.
00150  */
00151 #define SCEXP0     1.0000000000000000f
00152 #define SCEXP1     0.6987082824680118f
00153 #define SCEXP2     0.2633174272827404f
00154 #define SCEXP3     0.0923611991471395f
00155 #define SCEXP4     0.0277520543324108f
00156 
00157 /* for single precision float */
00158 #define EXPOBIAS   127
00159 #define EXPOSHIFT   23
00160 
00161 /* cutoff is optional, but can help avoid unnecessary work */
00162 #define ACUTOFF    -10
00163 
/*
 * flint - a float and an int overlaid on the same storage.
 *
 * Lets the exp approximation code write a raw IEEE single precision
 * bit pattern through 'n' (for example a biased exponent shifted into
 * place) and then read it back as a floating point value through 'f'.
 * The bit tricks in this file assume int is as wide as float.
 */
union flint_t {
  float f;  /* the storage viewed as a single precision float */
  int n;    /* the same storage viewed as an integer bit pattern */
};
typedef union flint_t flint;
00168 
00169 #if VMDQSURFUSESSE && defined(__SSE2__)
// SSEreg - four-lane SSE counterpart of the scalar 'flint' union:
// one 128-bit register that can be accessed either as packed floats
// or as packed 32-bit integers without any conversion of the bits.
union SSEreg_t {
  __m128  f;  // the register viewed as 4x float (SSE)
  __m128i i;  // the register viewed as 4x 32-bit int (SSE2)
};
typedef union SSEreg_t SSEreg;
00175 #endif
00176 
00177 
00178 #if 0
00179 static float aexpfnx(float x) {
00180   /* assume x <= 0 */
00181   float mb;
00182   int mbflr;
00183   float d;
00184   float sy;
00185   flint scalfac;
00186 
00187   if (x < ACUTOFF) return 0.f;
00188 
00189   mb = x * MLOG2EF;    /* change base to 2, mb >= 0 */
00190   mbflr = (int) mb;    /* get int part, floor() */
00191   d = mbflr - mb;      /* remaining exponent, -1 < d <= 0 */
00192   sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00193                        /* approx with linear blend of Taylor polys */
00194   scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  /* 2^(-mbflr) */
00195   return (sy * scalfac.f);  /* scaled approx */
00196 }
00197 
00198 
00199 static void vmd_gaussdensity(int verbose, 
00200                              int natoms, const float *xyzr,
00201                              const float *atomicnum,
00202                              const float *colors,
00203                              float *densitymap, float *voltexmap, 
00204                              const int *numvoxels, 
00205                              float radscale, float gridspacing, 
00206                              float isovalue, float gausslim) {
00207   int i, x, y, z;
00208   int maxvoxel[3];
00209   maxvoxel[0] = numvoxels[0]-1; 
00210   maxvoxel[1] = numvoxels[1]-1; 
00211   maxvoxel[2] = numvoxels[2]-1; 
00212   const float invgridspacing = 1.0f / gridspacing;
00213 
00214   // compute colors only if necessary, since they are costly
00215   if (voltexmap != NULL) {
00216     float invisovalue = 1.0f / isovalue;
00217     // compute both density map and floating point color texture map
00218     for (i=0; i<natoms; i++) {
00219       if (verbose && ((i & 0x3fff) == 0)) {
00220         printf("."); 
00221         fflush(stdout);
00222       }
00223 
00224       ptrdiff_t ind = i*4L;
00225       float scaledrad = xyzr[ind + 3L] * radscale;
00226 
00227       // MDFF atomic number weighted density factor
00228       float atomicnumfactor = 1.0f;
00229       if (atomicnum != NULL) {
00230         atomicnumfactor = atomicnum[i];
00231       }
00232 
00233       float arinv = 1.0f/(2.0f*scaledrad*scaledrad);
00234       float radlim = gausslim * scaledrad;
00235       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00236       radlim *= invgridspacing;
00237 
00238       float tmp;
00239       tmp = xyzr[ind  ] * invgridspacing;
00240       int xmin = MAX((int) (tmp - radlim), 0);
00241       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00242       tmp = xyzr[ind+1] * invgridspacing;
00243       int ymin = MAX((int) (tmp - radlim), 0);
00244       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00245       tmp = xyzr[ind+2] * invgridspacing;
00246       int zmin = MAX((int) (tmp - radlim), 0);
00247       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00248 
00249       float dz = zmin*gridspacing - xyzr[ind+2];
00250       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00251         float dy = ymin*gridspacing - xyzr[ind+1];
00252         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00253           float dy2dz2 = dy*dy + dz*dz;
00254 
00255           // early-exit when outside the cutoff radius in the Y-Z plane
00256           if (dy2dz2 >= radlim2) 
00257             continue;
00258 
00259           int addr = z * numvoxels[0] * numvoxels[1] + y * numvoxels[0];
00260           float dx = xmin*gridspacing - xyzr[ind];
00261           for (x=xmin; x<=xmax; x++,dx+=gridspacing) {
00262             float r2 = dx*dx + dy2dz2;
00263             float expval = -r2 * arinv;
00264 #if VMDUSEFULLEXP
00265             // use the math library exponential routine
00266             float density = exp(expval);
00267 #else
00268             // use our (much faster) fast exponential approximation
00269             float density = aexpfnx(expval);
00270 #endif
00271 
00272             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00273 
00274             // accumulate density value to density map
00275             densitymap[addr + x] += density;
00276 
00277             // Accumulate density-weighted color to texture map.
00278             // Pre-multiply colors by the inverse isovalue we will extract   
00279             // the surface on, to cause the final color to be normalized.
00280             density *= invisovalue;
00281             ptrdiff_t caddr = (addr + x) * 3L;
00282 
00283             // color by atom colors
00284             voltexmap[caddr    ] += density * colors[ind    ];
00285             voltexmap[caddr + 1] += density * colors[ind + 1];
00286             voltexmap[caddr + 2] += density * colors[ind + 2];
00287           }
00288         }
00289       }
00290     }
00291   } else {
00292     // compute density map only
00293     for (i=0; i<natoms; i++) {
00294       if (verbose && ((i & 0x3fff) == 0)) {
00295         printf("."); 
00296         fflush(stdout);
00297       }
00298 
00299       ptrdiff_t ind = i*4L;
00300       float scaledrad = xyzr[ind + 3] * radscale;
00301 
00302       // MDFF atomic number weighted density factor
00303       float atomicnumfactor = 1.0f;
00304       if (atomicnum != NULL) {
00305         atomicnumfactor = atomicnum[i];
00306       }
00307 
00308       float arinv = 1.0f/(2.0f*scaledrad*scaledrad);
00309       float radlim = gausslim * scaledrad;
00310       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00311       radlim *= invgridspacing;
00312 
00313       float tmp;
00314       tmp = xyzr[ind  ] * invgridspacing;
00315       int xmin = MAX((int) (tmp - radlim), 0);
00316       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00317       tmp = xyzr[ind+1] * invgridspacing;
00318       int ymin = MAX((int) (tmp - radlim), 0);
00319       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00320       tmp = xyzr[ind+2] * invgridspacing;
00321       int zmin = MAX((int) (tmp - radlim), 0);
00322       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00323 
00324       float dz = zmin*gridspacing - xyzr[ind+2];
00325       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00326         float dy = ymin*gridspacing - xyzr[ind+1];
00327         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00328           float dy2dz2 = dy*dy + dz*dz;
00329 
00330           // early-exit when outside the cutoff radius in the Y-Z plane
00331           if (dy2dz2 >= radlim2) 
00332             continue;
00333 
00334           int addr = z * numvoxels[0] * numvoxels[1] + y * numvoxels[0];
00335           float dx = xmin*gridspacing - xyzr[ind];
00336           for (x=xmin; x<=xmax; x++,dx+=gridspacing) {
00337             float r2 = dx*dx + dy2dz2;
00338             float expval = -r2 * arinv;
00339 #if VMDUSEFULLEXP
00340             // use the math library exponential routine
00341             float density = exp(expval);
00342 #else
00343             // use our (much faster) fast exponential approximation
00344             float density = aexpfnx(expval);
00345 #endif
00346 
00347             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00348 
00349             // accumulate density value to density map
00350             densitymap[addr + x] += density;
00351           }
00352         }
00353       }
00354     }
00355   }
00356 }
00357 #endif
00358 
00359 
00360 
00361 static void vmd_gaussdensity_opt(wkf_cpu_caps_t *cpucaps, int verbose,
00362                                  int natoms, const float *xyzr,
00363                                  const float *atomicnum,
00364                                  const float *colors,
00365                                  float *densitymap, float *voltexmap, 
00366                                  const int *numvoxels, 
00367                                  float radscale, float gridspacing, 
00368                                  float isovalue, float gausslim) {
00369   int i, x, y, z;
00370   int maxvoxel[3];
00371   maxvoxel[0] = numvoxels[0]-1; 
00372   maxvoxel[1] = numvoxels[1]-1; 
00373   maxvoxel[2] = numvoxels[2]-1; 
00374   const float invgridspacing = 1.0f / gridspacing;
00375 
00376   //
00377   // runtime CPU dispatch
00378   //   check for optional vector instructions and execute custom kernels
00379   //   for the fastest code path supported by the detected hardware
00380   //
00381 #if defined(VMDCPUDISPATCH)
00382 
00383 // Intel x86
00384 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00385 
00386   if ((cpucaps != NULL) && 
00387       (cpucaps->flags & CPU_FMA) && (cpucaps->flags & CPU_AVX2) && 
00388       (getenv("VMDNOAVX2")==NULL)) {
00389     if (verbose)
00390       printf("vmd_gaussdensity_avx2()\n");
00391 
00392     vmd_gaussdensity_avx2(verbose, natoms, xyzr, atomicnum, colors, 
00393                           densitymap, voltexmap, numvoxels, radscale, 
00394                           gridspacing, isovalue, gausslim);
00395     return;
00396   }
00397 #endif
00398 
00399 #if defined(VMDUSENEON)
00400 // if ((cpucaps->flags & CPU_NEON) && (getenv("VMDNONEON") == NULL)) {
00401   if (1 && (getenv("VMDNONEON") == NULL)) {
00402     if (verbose)
00403       printf("vmd_gaussdensity_neon()\n");
00404 
00405     vmd_gaussdensity_neon(verbose, natoms, xyzr, atomicnum, colors, 
00406                           densitymap, voltexmap, numvoxels, radscale, 
00407                           gridspacing, isovalue, gausslim);
00408     return;
00409   }
00410 #endif
00411 
00412 
00413   if (verbose)
00414     printf("vmd_gaussdensity_opt()\n");
00415 #endif
00416 
00417 #if VMDQSURFUSESSE && defined(__SSE2__)
00418   int usesse=1;
00419   if (getenv("VMDNOSSE")) {
00420     usesse=0;
00421   }
00422 #endif
00423 
00424 #if VMDQSURFUSEVSX && defined(__VEC__)
00425   int usevsx=1;
00426   if (getenv("VMDNOVSX")) {
00427     usevsx=0;
00428   }
00429 #endif
00430 
00431 #if VMDQSURFUSESSE && defined(__SSE2__)
00432   // Variables for SSE optimized inner loop
00433   __m128 gridspacing4_4;
00434   __align(16) float sxdelta4[4]; // 16-byte aligned for SSE
00435 
00436   if (usesse) {
00437     gridspacing4_4 = _mm_set1_ps(gridspacing * 4.0f);
00438     for (x=0; x<4; x++)
00439       sxdelta4[x] = ((float) x) * gridspacing;
00440   }
00441 #endif
00442 
00443 #if VMDQSURFUSEVSX && defined(__VEC__)
00444   // Variables for VSX optimized inner loop
00445   vector float gridspacing4_4;
00446   __attribute__((aligned(16))) float sxdelta4[4]; // 16-byte aligned for VSX
00447 
00448   if (usevsx) {
00449     gridspacing4_4 = vec_splats(gridspacing * 4.0f);
00450     for (x=0; x<4; x++)
00451       sxdelta4[x] = ((float) x) * gridspacing;
00452   }
00453 #endif
00454 
00455   // compute colors only if necessary, since they are costly
00456   if (voltexmap != NULL) {
00457     float invisovalue = 1.0f / isovalue;
00458     // compute both density map and floating point color texture map
00459     for (i=0; i<natoms; i++) {
00460       if (verbose && ((i & 0x3fff) == 0)) {
00461         printf("."); 
00462         fflush(stdout);
00463       }
00464 
00465       ptrdiff_t ind = i*4L;
00466       float scaledrad = xyzr[ind + 3] * radscale;
00467 
00468       // MDFF atomic number weighted density factor
00469       float atomicnumfactor = 1.0f;
00470       if (atomicnum != NULL) {
00471         atomicnumfactor = atomicnum[i];
00472       }
00473 
00474       // negate, precompute reciprocal, and change to base 2 from the outset
00475       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00476       float radlim = gausslim * scaledrad;
00477       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00478       radlim *= invgridspacing;
00479 
00480 #if VMDQSURFUSESSE && defined(__SSE2__)
00481       __m128 atomicnumfactor_4 = { 0 };
00482       __m128 arinv_4;
00483       if (usesse) {
00484         atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor);
00485 #if VMDUSESVMLEXP
00486         // Use of Intel's SVML requires changing the pre-scaling factor
00487         arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 
00488 #else
00489         // Use our fully inlined exp approximation
00490         arinv_4 = _mm_set1_ps(arinv);
00491 #endif
00492       }
00493 #endif
00494 
00495       float tmp;
00496       tmp = xyzr[ind  ] * invgridspacing;
00497       int xmin = MAX((int) (tmp - radlim), 0);
00498       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00499       tmp = xyzr[ind+1] * invgridspacing;
00500       int ymin = MAX((int) (tmp - radlim), 0);
00501       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00502       tmp = xyzr[ind+2] * invgridspacing;
00503       int zmin = MAX((int) (tmp - radlim), 0);
00504       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00505 
00506       float dz = zmin*gridspacing - xyzr[ind+2];
00507       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00508         float dy = ymin*gridspacing - xyzr[ind+1];
00509         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00510           float dy2dz2 = dy*dy + dz*dz;
00511 
00512           // early-exit when outside the cutoff radius in the Y-Z plane
00513           if (dy2dz2 >= radlim2) 
00514             continue;
00515 
00516           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00517           float dx = xmin*gridspacing - xyzr[ind];
00518           x=xmin;
00519 
00520 #if VMDQSURFUSESSE && defined(__SSE2__)
00521           // Use SSE when we have a multiple-of-4 to compute
00522           // finish all remaining density map points with regular non-SSE loop
00523           if (usesse) {
00524             __align(16) SSEreg n;
00525             __align(16) SSEreg y;
00526             __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2);
00527             __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0]));
00528 
00529             for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) {
00530               __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4);
00531               __m128 d;
00532 #if VMDUSESVMLEXP
00533               // use Intel's SVML exp2() routine
00534               y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4));
00535 #else
00536               // use our (much faster) fully inlined exponential approximation
00537               y.f = _mm_mul_ps(r2, arinv_4);         /* already negated and in base 2 */
00538               n.i = _mm_cvttps_epi32(y.f);
00539               d = _mm_cvtepi32_ps(n.i);
00540               d = _mm_sub_ps(d, y.f);
00541 
00542               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00543               // Perform Horner's method to evaluate interpolating polynomial.
00544 #if 0
00545               // SSE 4.x FMADD instructions are not universally available
00546               y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 
00547               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2));
00548               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1));
00549               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0));
00550 #else
00551               y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4));      /* for x^4 term */
00552               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    /* for x^3 term */
00553               y.f = _mm_mul_ps(y.f, d);
00554               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    /* for x^2 term */
00555               y.f = _mm_mul_ps(y.f, d);
00556               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    /* for x^1 term */
00557               y.f = _mm_mul_ps(y.f, d);
00558               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    /* for x^0 term */
00559 #endif
00560 
00561               // Calculate 2^N exactly by directly manipulating floating point exponent,
00562               // then use it to scale y for the final result.
00563               n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
00564               n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
00565               y.f = _mm_mul_ps(y.f, n.f);
00566 #endif
00567 
00568               // At present, we do unaligned loads/stores since we can't guarantee
00569               // that the X-dimension is always a multiple of 4.
00570               float *ufptr = &densitymap[addr + x];
00571               d = _mm_loadu_ps(ufptr);
00572               y.f = _mm_mul_ps(y.f, atomicnumfactor_4); // MDFF density maps
00573               _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00574 
00575               // Accumulate density-weighted color to texture map.
00576               // Pre-multiply colors by the inverse isovalue we will extract   
00577               // the surface on, to cause the final color to be normalized.
00578               d = _mm_mul_ps(y.f, _mm_set_ps1(invisovalue));
00579               ptrdiff_t caddr = (addr + x) * 3L;
00580 
00581 #if 1
00582               float *txptr = &voltexmap[caddr];
00583               // unaligned load of 4 consecutive rgb3f texture map texels
00584               __m128 r0g0b0r1 = _mm_loadu_ps(txptr+0);
00585               __m128 g1b1r2g2 = _mm_loadu_ps(txptr+4); 
00586               __m128 b2r3g3b3 = _mm_loadu_ps(txptr+8);
00587 
00588               // convert rgb3f AOS format to 4-element SOA vectors using shuffle instructions
00589               __m128 r2g2r3g3 = _mm_shuffle_ps(g1b1r2g2, b2r3g3b3, _MM_SHUFFLE(2, 1, 3, 2)); 
00590               __m128 g0b0g1b1 = _mm_shuffle_ps(r0g0b0r1, g1b1r2g2, _MM_SHUFFLE(1, 0, 2, 1));
00591               __m128 r        = _mm_shuffle_ps(r0g0b0r1, r2g2r3g3, _MM_SHUFFLE(2, 0, 3, 0)); // r0r1r2r3
00592               __m128 g        = _mm_shuffle_ps(g0b0g1b1, r2g2r3g3, _MM_SHUFFLE(3, 1, 2, 0)); // g0g1g2g3
00593               __m128 b        = _mm_shuffle_ps(g0b0g1b1, b2r3g3b3, _MM_SHUFFLE(3, 0, 3, 1)); // b0g1b2b3
00594 
00595               // accumulate density-scaled colors into texels
00596               r = _mm_add_ps(r, _mm_mul_ps(d, _mm_set_ps1(colors[ind    ])));
00597               g = _mm_add_ps(g, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 1])));
00598               b = _mm_add_ps(b, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 2])));
00599 
00600               // convert 4-element SOA vectors to rgb3f AOS format using shuffle instructions
00601               __m128 r0r2g0g2  = _mm_shuffle_ps(r, g, _MM_SHUFFLE(2, 0, 2, 0));
00602               __m128 g1g3b1b3  = _mm_shuffle_ps(g, b, _MM_SHUFFLE(3, 1, 3, 1));
00603               __m128 b0b2r1r3  = _mm_shuffle_ps(b, r, _MM_SHUFFLE(3, 1, 2, 0));
00604  
00605               __m128 rr0g0b0r1 = _mm_shuffle_ps(r0r2g0g2, b0b2r1r3, _MM_SHUFFLE(2, 0, 2, 0)); 
00606               __m128 rg1b1r2g2 = _mm_shuffle_ps(g1g3b1b3, r0r2g0g2, _MM_SHUFFLE(3, 1, 2, 0)); 
00607               __m128 rb2r3g3b3 = _mm_shuffle_ps(b0b2r1r3, g1g3b1b3, _MM_SHUFFLE(3, 1, 3, 1)); 
00608  
00609               // unaligned store of 4 consecutive rgb3f texture map texels
00610               _mm_storeu_ps(txptr+0, rr0g0b0r1);
00611               _mm_storeu_ps(txptr+4, rg1b1r2g2);
00612               _mm_storeu_ps(txptr+8, rb2r3g3b3);
00613 
00614 #else
00615 
00616               // color by atom colors
00617               float r, g, b;
00618               r = colors[ind    ];
00619               g = colors[ind + 1];
00620               b = colors[ind + 2];
00621 
00622               SSEreg tmp; 
00623               tmp.f = d;
00624               float density;
00625               density = tmp.floatreg.r0; 
00626               voltexmap[caddr     ] += density * r;
00627               voltexmap[caddr +  1] += density * g;
00628               voltexmap[caddr +  2] += density * b;
00629 
00630               density = tmp.floatreg.r1; 
00631               voltexmap[caddr +  3] += density * r;
00632               voltexmap[caddr +  4] += density * g;
00633               voltexmap[caddr +  5] += density * b;
00634 
00635               density = tmp.floatreg.r2; 
00636               voltexmap[caddr +  6] += density * r;
00637               voltexmap[caddr +  7] += density * g;
00638               voltexmap[caddr +  8] += density * b;
00639 
00640               density = tmp.floatreg.r3; 
00641               voltexmap[caddr +  9] += density * r;
00642               voltexmap[caddr + 10] += density * g;
00643               voltexmap[caddr + 11] += density * b;
00644 #endif
00645             }
00646           }
00647 #endif
00648 
00649           // finish all remaining density map points with regular non-SSE loop
00650           for (; x<=xmax; x++,dx+=gridspacing) {
00651             float r2 = dx*dx + dy2dz2;
00652 
00653             // use our (much faster) fully inlined exponential approximation
00654             float mb = r2 * arinv;         /* already negated and in base 2 */
00655             int mbflr = (int) mb;          /* get int part, floor() */
00656             float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */
00657 
00658             /* approx with linear blend of Taylor polys */
00659             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00660 
00661             /* 2^(-mbflr) */
00662             flint scalfac;
00663             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  
00664 
00665             // XXX assume we are never beyond the cutoff value in this loop
00666             float density = (sy * scalfac.f);
00667 
00668             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00669 
00670             // accumulate density value to density map
00671             densitymap[addr + x] += density;
00672 
00673             // Accumulate density-weighted color to texture map.
00674             // Pre-multiply colors by the inverse isovalue we will extract   
00675             // the surface on, to cause the final color to be normalized.
00676             density *= invisovalue;
00677             ptrdiff_t caddr = (addr + x) * 3L;
00678 
00679             // color by atom colors
00680             voltexmap[caddr    ] += density * colors[ind    ];
00681             voltexmap[caddr + 1] += density * colors[ind + 1];
00682             voltexmap[caddr + 2] += density * colors[ind + 2];
00683           }
00684         }
00685       }
00686     }
00687   } else {
00688     // compute density map only
00689     for (i=0; i<natoms; i++) {
00690       if (verbose && ((i & 0x3fff) == 0)) {
00691         printf("."); 
00692         fflush(stdout);
00693       }
00694 
00695       ptrdiff_t ind = i*4L;
00696       float scaledrad = xyzr[ind+3] * radscale;
00697 
00698       // MDFF atomic number weighted density factor
00699       float atomicnumfactor = 1.0f;
00700       if (atomicnum != NULL) {
00701         atomicnumfactor = atomicnum[i];
00702       }
00703 
00704       // negate, precompute reciprocal, and change to base 2 from the outset
00705       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00706       float radlim = gausslim * scaledrad;
00707       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00708       radlim *= invgridspacing;
00709 
00710 #if VMDQSURFUSESSE && defined(__SSE2__)
00711       __m128 atomicnumfactor_4 = { 0 };
00712       __m128 arinv_4;
00713       if (usesse) {
00714         atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor);
00715 #if VMDUSESVMLEXP
00716         // Use of Intel's SVML requires changing the pre-scaling factor
00717         arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 
00718 #else
00719         // Use our fully inlined exp approximation
00720         arinv_4 = _mm_set1_ps(arinv);
00721 #endif
00722       }
00723 #endif
00724 
00725 #if VMDQSURFUSEVSX && defined(__VEC__)
00726       vector float atomicnumfactor_4;
00727       vector float arinv_4;
00728       if (usevsx) {
00729         atomicnumfactor_4 = vec_splats(atomicnumfactor);
00730 
00731         // Use our fully inlined exp approximation
00732         arinv_4 = vec_splats(arinv);
00733       }
00734 #endif
00735 
00736       float tmp;
00737       tmp = xyzr[ind  ] * invgridspacing;
00738       int xmin = MAX((int) (tmp - radlim), 0);
00739       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00740       tmp = xyzr[ind+1] * invgridspacing;
00741       int ymin = MAX((int) (tmp - radlim), 0);
00742       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00743       tmp = xyzr[ind+2] * invgridspacing;
00744       int zmin = MAX((int) (tmp - radlim), 0);
00745       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00746 
00747       float dz = zmin*gridspacing - xyzr[ind+2];
00748       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00749         float dy = ymin*gridspacing - xyzr[ind+1];
00750         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00751           float dy2dz2 = dy*dy + dz*dz;
00752 
00753           // early-exit when outside the cutoff radius in the Y-Z plane
00754           if (dy2dz2 >= radlim2) 
00755             continue;
00756 
00757           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00758           float dx = xmin*gridspacing - xyzr[ind];
00759           x=xmin;
00760 
00761 #if VMDQSURFUSESSE && defined(__SSE2__)
00762           // Use SSE when we have a multiple-of-4 to compute
00763           // finish all remaining density map points with regular non-SSE loop
00764           if (usesse) {
00765             __align(16) SSEreg n;
00766             __align(16) SSEreg y;
00767             __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2);
00768             __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0]));
00769 
00770             for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) {
00771               __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4);
00772               __m128 d;
00773 #if VMDUSESVMLEXP
00774               // use Intel's SVML exp2() routine
00775               y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4));
00776 #else
00777               // use our (much faster) fully inlined exponential approximation
00778               y.f = _mm_mul_ps(r2, arinv_4);         /* already negated and in base 2 */
00779               n.i = _mm_cvttps_epi32(y.f);
00780               d = _mm_cvtepi32_ps(n.i);
00781               d = _mm_sub_ps(d, y.f);
00782 
00783               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00784               // Perform Horner's method to evaluate interpolating polynomial.
00785 #if 0
00786               // SSE 4.x FMADD instructions are not universally available
00787               y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 
00788               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2));
00789               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1));
00790               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0));
00791 #else
00792               y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4));      /* for x^4 term */
00793               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    /* for x^3 term */
00794               y.f = _mm_mul_ps(y.f, d);
00795               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    /* for x^2 term */
00796               y.f = _mm_mul_ps(y.f, d);
00797               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    /* for x^1 term */
00798               y.f = _mm_mul_ps(y.f, d);
00799               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    /* for x^0 term */
00800 #endif
00801 
00802               // Calculate 2^N exactly by directly manipulating floating point exponent,
00803               // then use it to scale y for the final result.
00804               n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
00805               n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
00806               y.f = _mm_mul_ps(y.f, n.f);
00807               y.f = _mm_mul_ps(y.f, atomicnumfactor_4); // MDFF density maps
00808 #endif
00809 
00810               // At present, we do unaligned loads/stores since we can't guarantee
00811               // that the X-dimension is always a multiple of 4.
00812               float *ufptr = &densitymap[addr + x];
00813               d = _mm_loadu_ps(ufptr); 
00814               _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00815             }
00816           }
00817 #endif
00818 
00819 
00820 #if VMDQSURFUSEVSX && defined(__VEC__)
00821           // Use VSX when we have a multiple-of-4 to compute
00822           // finish all remaining density map points with regular non-VSX loop
00823           //
00824           // XXX it may be useful to compare the speed/accuracy of the
00825           // polynomial approximation vs. the hardware-provided 
00826           // exp2f() approximation: vec_expte()
00827           //
00828           if (usevsx) {
00829             vector float dy2dz2_4 = vec_splats(dy2dz2);
00830             vector float tmpvsxdelta4 = *((__vector float *) &sxdelta4[0]);
00831             vector float dx_4 = vec_add(vec_splats(dx), tmpvsxdelta4);
00832 
00833             for (; (x+3)<=xmax; x+=4,dx_4=vec_add(dx_4, gridspacing4_4)) {
00834               vector float r2 = vec_add(vec_mul(dx_4, dx_4), dy2dz2_4);
00835 
00836               // use our (much faster) fully inlined exponential approximation
00837               vector float mb = vec_mul(r2, arinv_4);   /* already negated and in base 2 */
00838               vector float mbflr = vec_floor(mb);
00839               vector float d = vec_sub(mbflr, mb);
00840               vector float y;
00841 
00842               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00843               // Perform Horner's method to evaluate interpolating polynomial.
00844               y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3)); // x^4
00845               y = vec_madd(y, d, vec_splats(SCEXP2)); // x^2 
00846               y = vec_madd(y, d, vec_splats(SCEXP1)); // x^1 
00847               y = vec_madd(y, d, vec_splats(SCEXP0)); // x^0 
00848 
00849               // Calculate 2^N exactly via vec_expte()
00850               // then use it to scale y for the final result.
00851               y = vec_mul(y, vec_expte(-mbflr));
00852               y = vec_mul(y, atomicnumfactor_4); // MDFF density maps
00853 
00854               // At present, we do unaligned loads/stores since we can't 
00855               // guarantee that the X-dimension is always a multiple of 4.
00856               float *ufptr = &densitymap[addr + x];
00857               d = *((__vector float *) &ufptr[0]);
00858               // XXX there must be a cleaner way to implement this
00859               // d = _mm_loadu_ps(ufptr); 
00860               // _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00861               d = vec_add(d, y);
00862 
00863               ufptr[0] = d[0];
00864               ufptr[1] = d[1];
00865               ufptr[2] = d[2];
00866               ufptr[3] = d[3];
00867             }
00868           }
00869 #endif
00870 
00871           // finish all remaining density map points with regular non-SSE loop
00872           for (; x<=xmax; x++,dx+=gridspacing) {
00873             float r2 = dx*dx + dy2dz2;
00874 
00875             // use our (much faster) fully inlined exponential approximation
00876             float mb = r2 * arinv;         /* already negated and in base 2 */
00877             int mbflr = (int) mb;          /* get int part, floor() */
00878             float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */
00879 
00880             /* approx with linear blend of Taylor polys */
00881             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00882 
00883             /* 2^(-mbflr) */
00884             flint scalfac;
00885             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  
00886 
00887             // XXX assume we are never beyond the cutoff value in this loop
00888             float density = (sy * scalfac.f);
00889 
00890             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00891 
00892             densitymap[addr + x] += density;
00893           }
00894         }
00895       }
00896     }
00897   }
00898 }
00899 
00900 
00901 typedef struct {
00902   wkf_cpu_caps_t *cpucaps;
00903   int verbose;
00904   int natoms;
00905   float radscale;
00906   float gridspacing;
00907   float isovalue;
00908   float gausslim;
00909   const int *numvoxels;
00910   const float *xyzr; 
00911   const float *atomicnum;
00912   const float *colors;
00913   float **thrdensitymaps;
00914   float **thrvoltexmaps;
00915 } densitythrparms;
00916 
00917 
00918 static void * densitythread(void *voidparms) {
00919   wkf_tasktile_t tile;
00920   densitythrparms *parms = NULL;
00921   int threadid;
00922 
00923   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00924   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00925 
00926   while (wkf_threadlaunch_next_tile(voidparms, 16384, &tile) != WKF_SCHED_DONE) {
00927     int natoms = tile.end-tile.start;
00928     const float *atomicnum = (parms->atomicnum == NULL) ? NULL : &parms->atomicnum[tile.start]; 
00929     vmd_gaussdensity_opt(parms->cpucaps,
00930                          parms->verbose, natoms, 
00931                          &parms->xyzr[4L*tile.start],
00932                          atomicnum,
00933                          (parms->thrvoltexmaps[0]!=NULL) ? &parms->colors[4L*tile.start] : NULL,
00934                          parms->thrdensitymaps[threadid], 
00935                          parms->thrvoltexmaps[threadid], 
00936                          parms->numvoxels, 
00937                          parms->radscale, 
00938                          parms->gridspacing, 
00939                          parms->isovalue, 
00940                          parms->gausslim);
00941   }
00942 
00943   return NULL;
00944 }
00945 
00946 
00947 static void * reductionthread(void *voidparms) {
00948   wkf_tasktile_t tile;
00949   densitythrparms *parms = NULL;
00950   int threadid, numthreads;
00951 
00952   wkf_threadlaunch_getid(voidparms, &threadid, &numthreads);
00953   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00954 
00955   while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00956     // do a reduction over each of the individual density grids
00957     ptrdiff_t planesz = ptrdiff_t(parms->numvoxels[0]) * ptrdiff_t(parms->numvoxels[1]);
00958     ptrdiff_t start = ptrdiff_t(tile.start) * planesz;
00959     ptrdiff_t end   = ptrdiff_t(tile.end)   * planesz;
00960 
00961     ptrdiff_t i, x;
00962     for (x=start; x<end; x++) {
00963       float tmp = 0.0f;
00964       for (i=1; i<numthreads; i++) {
00965         tmp += parms->thrdensitymaps[i][x];
00966       }
00967       parms->thrdensitymaps[0][x] += tmp;
00968     }
00969 
00970     // do a reduction over each of the individual texture grids
00971     if (parms->thrvoltexmaps[0] != NULL) {
00972       for (x=start*3L; x<end*3L; x++) {
00973         float tmp = 0.0f;
00974         for (i=1; i<numthreads; i++) {
00975           tmp += parms->thrvoltexmaps[i][x];
00976         }
00977         parms->thrvoltexmaps[0][x] += tmp;
00978       }
00979     }
00980   }
00981 
00982   return NULL;
00983 }
00984 
00985 
00986 static int vmd_gaussdensity_threaded(wkf_cpu_caps_t *cpucaps, int verbose, 
00987                                      int natoms, const float *xyzr,
00988                                      const float *atomicnum,
00989                                      const float *colors,
00990                                      float *densitymap, float *voltexmap, 
00991                                      const int *numvoxels, 
00992                                      float radscale, float gridspacing, 
00993                                      float isovalue, float gausslim) {
00994   densitythrparms parms;
00995   memset(&parms, 0, sizeof(parms));
00996 
00997   parms.cpucaps = cpucaps;
00998   parms.verbose = verbose;
00999   parms.natoms = natoms;
01000   parms.radscale = radscale;
01001   parms.gridspacing = gridspacing;
01002   parms.isovalue = isovalue;
01003   parms.gausslim = gausslim;
01004   parms.numvoxels = numvoxels;
01005   parms.xyzr = xyzr;
01006   parms.atomicnum = atomicnum;
01007   parms.colors = colors;
01008  
01009   int physprocs = wkf_thread_numprocessors();
01010   int maxprocs = physprocs;
01011 
01012   // We can productively use only a few cores per socket due to the
01013   // limited memory bandwidth per socket. Also, hyperthreading
01014   // actually hurts performance.  These two considerations combined
01015   // with the linear increase in memory use prevent us from using large
01016   // numbers of cores with this simple approach, so if we've got more 
01017   // than 8 CPU cores, we'll iteratively cutting the core count in 
01018   // half until we're under 8 cores.
01019   while (maxprocs > 8) 
01020     maxprocs /= 2;
01021 
01022   // Limit the number of CPU cores used so we don't run the 
01023   // machine out of memory during surface computation.
01024   // Use either a dynamic or hard-coded heuristic to limit the
01025   // number of CPU threads we will spawn so that we don't run
01026   // the machine out of memory.  
01027   ptrdiff_t volsz = ptrdiff_t(numvoxels[0]) * 
01028                     ptrdiff_t(numvoxels[1]) * ptrdiff_t(numvoxels[2]);
01029   ptrdiff_t volmemsz = sizeof(float) * volsz;
01030   ptrdiff_t volmemszkb = volmemsz / 1024;
01031   ptrdiff_t volmemtexszkb = volmemszkb + ((voltexmap != NULL) ? 3L*volmemszkb : 0);
01032 
01033   // Platforms that don't have a means of determining available
01034   // physical memory will return -1, in which case we fall back to the
01035   // simple hard-coded 2GB-max-per-core heuristic.
01036   ptrdiff_t vmdcorefree = -1;
01037 
01038 #if defined(ARCH_BLUEWATERS) || defined(ARCH_CRAY_XC) || defined(ARCH_CRAY_XK) || defined(ARCH_LINUXAMD64) || defined(ARCH_SOLARIS2_64) || defined(ARCH_SOLARISX86_64) || defined(ARCH_AIX6_64) || defined(MACOSXARM64) || defined(ARCH_MACOSXX86_64) 
01039   // XXX The core-free query scheme has one weakness in that we might have a 
01040   // 32-bit version of VMD running on a 64-bit machine, where the available
01041   // physical memory may be much larger than is possible for a 
01042   // 32-bit VMD process to address.  To do this properly we must therefore
01043   // use conditional compilation safety checks here until we  have a better
01044   // way of determining this with a standardized helper routine.
01045   vmdcorefree = vmd_get_avail_physmem_mb();
01046 #endif
01047 
01048   if (vmdcorefree >= 0) {
01049     // Make sure QuickSurf uses no more than a fraction of the free memory
01050     // as an upper bound alternative to the hard-coded heuristic.
01051     // This should be highly preferable to the fixed-size heuristic
01052     // we had used in all cases previously.
01053     while ((volmemtexszkb * maxprocs) > (1024L*vmdcorefree/4)) {
01054       maxprocs /= 2;
01055     }
01056   } else {
01057     // Set a practical per-core maximum memory use limit to 2GB, for all cores
01058     while ((volmemtexszkb * maxprocs) > (2L * 1024L * 1024L))
01059       maxprocs /= 2;
01060   }
01061 
01062   if (maxprocs < 1) 
01063     maxprocs = 1;
01064 
01065   // Loop over number of physical processors and try to create 
01066   // per-thread volumetric maps for each of them.
01067   parms.thrdensitymaps = (float **) calloc(1,maxprocs * sizeof(float *));
01068   parms.thrvoltexmaps = (float **) calloc(1, maxprocs * sizeof(float *));
01069 
01070   // first thread is already ready to go
01071   parms.thrdensitymaps[0] = densitymap;
01072   parms.thrvoltexmaps[0] = voltexmap;
01073 
01074   int i;
01075   int numprocs = maxprocs; // ever the optimist
01076   for (i=1; i<maxprocs; i++) {
01077     parms.thrdensitymaps[i] = (float *) calloc(1, volmemsz);
01078     if (parms.thrdensitymaps[i] == NULL) {
01079       numprocs = i;
01080       break;
01081     }
01082     if (voltexmap != NULL) {
01083       parms.thrvoltexmaps[i] = (float *) calloc(1, 3L * volmemsz);
01084       if (parms.thrvoltexmaps[i] == NULL) {
01085         free(parms.thrdensitymaps[i]);
01086         parms.thrdensitymaps[i] = NULL;
01087         numprocs = i;
01088         break;
01089       }
01090     }
01091   }
01092 
01093   // launch independent thread calculations
01094   wkf_tasktile_t tile;
01095   tile.start = 0;
01096   tile.end = natoms;
01097   wkf_threadlaunch(numprocs, &parms, densitythread, &tile);
01098 
01099   // do a parallel reduction of the resulting density maps
01100   tile.start = 0;
01101   tile.end = numvoxels[2];
01102   wkf_threadlaunch(numprocs, &parms, reductionthread, &tile);
01103 
01104   // free work area
01105   for (i=1; i<maxprocs; i++) {
01106     if (parms.thrdensitymaps[i] != NULL)
01107       free(parms.thrdensitymaps[i]);
01108 
01109     if (parms.thrvoltexmaps[i] != NULL)
01110       free(parms.thrvoltexmaps[i]);
01111   }
01112   free(parms.thrdensitymaps);
01113   free(parms.thrvoltexmaps);
01114 
01115   return 0;
01116 }
01117 
01118 QuickSurf::QuickSurf(int forcecpuonly) {
01119   volmap = NULL;
01120   voltexmap = NULL;
01121   s.clear();
01122   isovalue = 0.5f;
01123 
01124   numvoxels[0] = 128;
01125   numvoxels[1] = 128;
01126   numvoxels[2] = 128;
01127 
01128   origin[0] = 0.0f;
01129   origin[1] = 0.0f;
01130   origin[2] = 0.0f;
01131 
01132   xaxis[0] = 1.0f;
01133   xaxis[1] = 0.0f;
01134   xaxis[2] = 0.0f;
01135 
01136   yaxis[0] = 0.0f;
01137   yaxis[1] = 1.0f;
01138   yaxis[2] = 0.0f;
01139 
01140   zaxis[0] = 0.0f;
01141   zaxis[1] = 0.0f;
01142   zaxis[2] = 1.0f;
01143    
01144   cudaqs = NULL;
01145   force_cpuonly = forcecpuonly;
01146 #if defined(VMDCUDA)
01147   if (!force_cpuonly && !getenv("VMDNOCUDA")) {
01148     cudaqs = new CUDAQuickSurf();
01149   }
01150 #endif
01151 
01152   timer = wkf_timer_create();
01153 }
01154 
01155 
01156 void QuickSurf::free_gpu_memory(void) {
01157   if (cudaqs) {
01158 #if defined(VMDCUDA)
01159     delete cudaqs; 
01160 #endif
01161     cudaqs = NULL; 
01162   }
01163 }
01164 
01165 
01166 int QuickSurf::calc_surf(AtomSel *atomSel, DrawMolecule *mol,
01167                          const float *atompos, const float *atomradii,
01168                          int quality, float radscale, float gridspacing,
01169                          float isoval, const int *colidx, const float *cmap,
01170                          VMDDisplayList *cmdList) {
01171   PROFILE_PUSH_RANGE("QuickSurf", 3);
01172 
01173   wkf_timer_start(timer);
01174   int colorperatom = (colidx != NULL && cmap != NULL);
01175   int usebeads=0;
01176 
01177   int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL);
01178 
01179   // Disable MDFF atomic number weighted densities until we implement
01180   // GUI controls for this if it turns out to be useful for more than
01181   // than just analytical usage.
01182   const float *atomicnum = NULL;
01183 
01184   // clean up any existing CPU arrays before going any further...
01185   if (voltexmap != NULL)
01186     free(voltexmap);
01187   voltexmap = NULL;
01188 
01189   ResizeArray<float> beadpos(64 + (3L * atomSel->selected) / 20);
01190   ResizeArray<float> beadradii(64 + (3L * atomSel->selected) / 20);
01191   ResizeArray<float> beadcolors(64 + (3L * atomSel->selected) / 20);
01192 
01193   if (getenv("VMDQUICKSURFBEADS")) {
01194     usebeads=1;
01195     if (verbose)
01196       printf("QuickSurf using residue beads representation...\n");
01197   }
01198 
01199   int numbeads = 0;
01200   if (usebeads) {
01201     int i, resid, numres;
01202 
01203     // draw a bead for each residue
01204     numres = mol->residueList.num();
01205     for (resid=0; resid<numres; resid++) {
01206       float com[3] = {0.0, 0.0, 0.0};
01207       const ResizeArray<int> &atoms = mol->residueList[resid]->atoms;
01208       int numatoms = atoms.num();
01209       int oncount = 0;
01210    
01211       // find COM for residue
01212       for (i=0; i<numatoms; i++) {
01213         int idx = atoms[i];
01214         if (atomSel->on[idx]) {
01215           oncount++;
01216           vec_add(com, com, atompos + 3L*idx);
01217         }
01218       }
01219 
01220       if (oncount < 1)
01221         continue; // exit if there weren't any atoms
01222 
01223       vec_scale(com, 1.0f / (float) oncount, com);
01224 
01225       // find radius of bounding sphere and save last atom index for color
01226       int atomcolorindex=0; // initialize, to please compilers
01227       float boundradsq = 0.0f;
01228       for (i=0; i<numatoms; i++) {
01229         int idx = atoms[i];
01230         if (atomSel->on[idx]) {
01231           float tmpdist[3];
01232           atomcolorindex = idx;
01233           vec_sub(tmpdist, com, atompos + 3L*idx);
01234           float distsq = dot_prod(tmpdist, tmpdist);
01235           if (distsq > boundradsq) {
01236             boundradsq = distsq;
01237           }
01238         }
01239       }
01240       beadpos.append3(&com[0]);
01241       beadradii.append(sqrtf(boundradsq) + 1.0f);
01242 
01243       if (colorperatom) {
01244         const float *cp = &cmap[colidx[atomcolorindex] * 3L];
01245         beadcolors.append3(&cp[0]);
01246       }
01247 
01248       // XXX still need to add pick points...
01249     }
01250 
01251     numbeads = beadpos.num() / 3;
01252   }
01253 
01254   // initialize class variables
01255   isovalue=isoval;
01256 
01257   // If no volumetric texture will be computed we will use the cmap
01258   // parameter to pass in the solid color to be applied to all vertices
01259   // Since QS can now also be called by MDFF, we have to check whether
01260   // display related parms are set or not before using them.
01261   if (cmap != NULL)
01262     vec_copy(solidcolor, cmap);
01263 
01264   // compute min/max atom radius, build list of selected atom radii,
01265   // and compute bounding box for the selected atoms
01266   float minx, miny, minz, maxx, maxy, maxz;
01267   float minrad, maxrad;
01268   int i;
01269   if (usebeads) {
01270     minx = maxx = beadpos[0];
01271     miny = maxy = beadpos[1];
01272     minz = maxz = beadpos[2];
01273     minrad = maxrad = beadradii[0];
01274     for (i=0; i<numbeads; i++) {
01275       ptrdiff_t ind = i * 3L;
01276       float tmpx = beadpos[ind  ];
01277       float tmpy = beadpos[ind+1];
01278       float tmpz = beadpos[ind+2];
01279 
01280       minx = (tmpx < minx) ? tmpx : minx;
01281       maxx = (tmpx > maxx) ? tmpx : maxx;
01282 
01283       miny = (tmpy < miny) ? tmpy : miny;
01284       maxy = (tmpy > maxy) ? tmpy : maxy;
01285 
01286       minz = (tmpz < minz) ? tmpz : minz;
01287       maxz = (tmpz > maxz) ? tmpz : maxz;
01288  
01289       // we always have to compute the rmin/rmax for beads
01290       // since these radii are defined on-the-fly
01291       float r = beadradii[i];
01292       minrad = (r < minrad) ? r : minrad;
01293       maxrad = (r > maxrad) ? r : maxrad;
01294     }
01295   } else {
01296     minx = maxx = atompos[atomSel->firstsel*3L  ];
01297     miny = maxy = atompos[atomSel->firstsel*3L+1];
01298     minz = maxz = atompos[atomSel->firstsel*3L+2];
01299 
01300     // Query min/max atom radii for the entire molecule
01301     mol->get_radii_minmax(minrad, maxrad);
01302 
01303     // We only compute rmin/rmax for the actual group of selected atoms if 
01304     // (rmax/rmin > 2.5) for the whole molecule, otherwise it's a small 
01305     // enough range that we don't care since it won't hurt our performance. 
01306     if (minrad <= 0.001 || maxrad/minrad > 2.5) {
01307       minrad = maxrad = atomradii[atomSel->firstsel];
01308       for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) {
01309         if (atomSel->on[i]) {
01310           ptrdiff_t ind = i * 3L;
01311           float tmpx = atompos[ind  ];
01312           float tmpy = atompos[ind+1];
01313           float tmpz = atompos[ind+2];
01314 
01315           minx = (tmpx < minx) ? tmpx : minx;
01316           maxx = (tmpx > maxx) ? tmpx : maxx;
01317 
01318           miny = (tmpy < miny) ? tmpy : miny;
01319           maxy = (tmpy > maxy) ? tmpy : maxy;
01320 
01321           minz = (tmpz < minz) ? tmpz : minz;
01322           maxz = (tmpz > maxz) ? tmpz : maxz;
01323   
01324           float r = atomradii[i];
01325           minrad = (r < minrad) ? r : minrad;
01326           maxrad = (r > maxrad) ? r : maxrad;
01327         }
01328       }
01329     } else {
01330 #if 1
01331       float fmin[3], fmax[3];
01332       minmax_selected_3fv_aligned(atompos, atomSel->on, atomSel->num_atoms,
01333                                   atomSel->firstsel, atomSel->lastsel,
01334                                   fmin, fmax);
01335       minx = fmin[0];
01336       miny = fmin[1];
01337       minz = fmin[2];
01338 
01339       maxx = fmax[0]; 
01340       maxy = fmax[1]; 
01341       maxz = fmax[2]; 
01342 #else
01343       for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) {
01344         if (atomSel->on[i]) {
01345           ptrdiff_t ind = i * 3L;
01346           float tmpx = atompos[ind  ];
01347           float tmpy = atompos[ind+1];
01348           float tmpz = atompos[ind+2];
01349 
01350           minx = (tmpx < minx) ? tmpx : minx;
01351           maxx = (tmpx > maxx) ? tmpx : maxx;
01352 
01353           miny = (tmpy < miny) ? tmpy : miny;
01354           maxy = (tmpy > maxy) ? tmpy : maxy;
01355 
01356           minz = (tmpz < minz) ? tmpz : minz;
01357           maxz = (tmpz > maxz) ? tmpz : maxz;
01358         }
01359       }
01360 #endif
01361     }
01362   }
01363 
01364   float mincoord[3], maxcoord[3];
01365   mincoord[0] = minx;
01366   mincoord[1] = miny;
01367   mincoord[2] = minz;
01368   maxcoord[0] = maxx;
01369   maxcoord[1] = maxy;
01370   maxcoord[2] = maxz;
01371 
01372   // crude estimate of the grid padding we require to prevent the
01373   // resulting isosurface from being clipped
01374   float gridpadding = radscale * maxrad * 1.70f;
01375   float padrad = gridpadding;
01376   padrad = 0.65f * sqrtf(4.0f/3.0f*((float) VMD_PI)*padrad*padrad*padrad);
01377   gridpadding = MAX(gridpadding, padrad);
01378 
01379   // Handle coarse-grained structures and whole-cell models
01380   // XXX The switch at 4.0A from an assumed all-atom scale structure to 
01381   //     CG or cell models is a simple heuristic at a somewhat arbitrary 
01382   //     threshold value.  
01383   //     For all-atom models the units shown in the GUI are in Angstroms
01384   //     and are absolute, but for CG or cell models the units in the GUI 
01385   //     are relative to the atom with the minimum radius.
01386   //     This code doesn't do anything to handle structures with a minrad 
01387   //     of zero, where perhaps only one particle has an unset radius.
01388   if (minrad > 4.0f) {
01389     gridspacing *= minrad;
01390   }
01391 
01392   if (verbose) {
01393     printf("QuickSurf: R*%.1f, I=%.1f, H=%.1f Pad: %.1f minR: %.1f maxR: %.1f)\n",
01394            radscale, isovalue, gridspacing, gridpadding, minrad, maxrad);
01395   }
01396 
01397   mincoord[0] -= gridpadding;
01398   mincoord[1] -= gridpadding;
01399   mincoord[2] -= gridpadding;
01400   maxcoord[0] += gridpadding;
01401   maxcoord[1] += gridpadding;
01402   maxcoord[2] += gridpadding;
01403 
01404   // compute the real grid dimensions from the selected atoms
01405   numvoxels[0] = (int) ceil((maxcoord[0]-mincoord[0]) / gridspacing);
01406   numvoxels[1] = (int) ceil((maxcoord[1]-mincoord[1]) / gridspacing);
01407   numvoxels[2] = (int) ceil((maxcoord[2]-mincoord[2]) / gridspacing);
01408 
01409   // recalc the grid dimensions from rounded/padded voxel counts
01410   xaxis[0] = (numvoxels[0]-1) * gridspacing;
01411   yaxis[1] = (numvoxels[1]-1) * gridspacing;
01412   zaxis[2] = (numvoxels[2]-1) * gridspacing;
01413   maxcoord[0] = mincoord[0] + xaxis[0];
01414   maxcoord[1] = mincoord[1] + yaxis[1];
01415   maxcoord[2] = mincoord[2] + zaxis[2];
01416 
01417   int boundserr=0;
01418   for (i=0; i<3; i++) {
01419     if (isnan(mincoord[i]) || isnan(maxcoord[i]) || (numvoxels[i] < 1))
01420       boundserr = 1;
01421   }
01422 
01423   if (boundserr)
01424     msgErr << "QuickSurf) NaN or illegal bounding box, grid size" << sendmsg;
01425 
01426   if (verbose || boundserr) {
01427     printf("  GridSZ: (%4d %4d %4d)  BBox: (%.1f %.1f %.1f)->(%.1f %.1f %.1f)\n",
01428            numvoxels[0], numvoxels[1], numvoxels[2],
01429            mincoord[0], mincoord[1], mincoord[2],
01430            maxcoord[0], maxcoord[1], maxcoord[2]);
01431   }
01432 
01433   if (boundserr)
01434     return -1; 
01435 
01436   vec_copy(origin, mincoord);
01437 
01438   // build compacted lists of bead coordinates, radii, and colors
01439   float *xyzr = NULL;
01440   float *colors = NULL;
01441   if (usebeads) { 
01442     int ind =0;
01443     int ind4=0; 
01444     xyzr = (float *) malloc(numbeads * sizeof(float) * 4L);
01445     if (colorperatom) {
01446       colors = (float *) malloc(numbeads * sizeof(float) * 4L);
01447 
01448       // build compacted lists of bead coordinates, radii, and colors
01449       for (i=0; i<numbeads; i++) {
01450         const float *fp = &beadpos[0] + ind;
01451         xyzr[ind4    ] = fp[0]-origin[0];
01452         xyzr[ind4 + 1] = fp[1]-origin[1];
01453         xyzr[ind4 + 2] = fp[2]-origin[2];
01454         xyzr[ind4 + 3] = beadradii[i];
01455  
01456         const float *cp = &beadcolors[0] + ind;
01457         colors[ind4    ] = cp[0];
01458         colors[ind4 + 1] = cp[1];
01459         colors[ind4 + 2] = cp[2];
01460         colors[ind4 + 3] = 1.0f;
01461         ind4 += 4;
01462         ind += 3;
01463       }
01464     } else {
01465       // build compacted lists of bead coordinates and radii only
01466       for (i=0; i<numbeads; i++) {
01467         const float *fp = &beadpos[0] + ind;
01468         xyzr[ind4    ] = fp[0]-origin[0];
01469         xyzr[ind4 + 1] = fp[1]-origin[1];
01470         xyzr[ind4 + 2] = fp[2]-origin[2];
01471         xyzr[ind4 + 3] = beadradii[i];
01472         ind4 += 4;
01473         ind += 3;
01474       }
01475     }
01476   } else {
01477     ptrdiff_t ind = atomSel->firstsel * 3L;
01478     ptrdiff_t ind4=0; 
01479     xyzr = (float *) malloc(atomSel->selected * sizeof(float) * 4L);
01480     if (colorperatom) {
01481       colors = (float *) malloc(atomSel->selected * sizeof(float) * 4L);
01482 
01483       // build compacted lists of atom coordinates, radii, and colors
01484       for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) {
01485         if (atomSel->on[i]) {
01486           const float *fp = atompos + ind;
01487           xyzr[ind4    ] = fp[0]-origin[0];
01488           xyzr[ind4 + 1] = fp[1]-origin[1];
01489           xyzr[ind4 + 2] = fp[2]-origin[2];
01490           xyzr[ind4 + 3] = atomradii[i];
01491  
01492           const float *cp = &cmap[colidx[i] * 3L];
01493           colors[ind4    ] = cp[0];
01494           colors[ind4 + 1] = cp[1];
01495           colors[ind4 + 2] = cp[2];
01496           colors[ind4 + 3] = 1.0f;
01497           ind4 += 4;
01498         }
01499         ind += 3;
01500       }
01501     } else {
01502       // build compacted lists of atom coordinates and radii only
01503       for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) {
01504         if (atomSel->on[i]) {
01505           const float *fp = atompos + ind;
01506           xyzr[ind4    ] = fp[0]-origin[0];
01507           xyzr[ind4 + 1] = fp[1]-origin[1];
01508           xyzr[ind4 + 2] = fp[2]-origin[2];
01509           xyzr[ind4 + 3] = atomradii[i];
01510           ind4 += 4;
01511         }
01512         ind += 3;
01513       }
01514     }
01515   }
01516 
01517   // set gaussian window size based on user-specified quality parameter
01518   float gausslim = 2.0f;
01519   switch (quality) {
01520     case 3: gausslim = 4.0f; break; // max quality
01521 
01522     case 2: gausslim = 3.0f; break; // high quality
01523 
01524     case 1: gausslim = 2.5f; break; // medium quality
01525 
01526     case 0: 
01527     default: gausslim = 2.0f; // low quality
01528       break;
01529   }
01530 
01531   pretime = wkf_timer_timenow(timer);
01532 
01533 #if defined(VMDCUDA)
01534   if (!force_cpuonly && !getenv("VMDNOCUDA")) {
01535     // allocate a new CUDAQuickSurf object if we destroyed the old one...
01536     if (cudaqs == NULL)
01537       cudaqs = new CUDAQuickSurf();
01538 
01539     // Assign texture format according to quality
01540     CUDAQuickSurf::VolTexFormat voltexformat = CUDAQuickSurf::RGB3F;
01541     switch (quality) {
01542       case 3: voltexformat = CUDAQuickSurf::RGB3F; break; // max quality
01543 
01544       case 2: voltexformat = CUDAQuickSurf::RGB3F; break; // high quality
01545 
01546       case 1: voltexformat = CUDAQuickSurf::RGB3F; break; // medium quality
01547 
01548       case 0: 
01549       default: voltexformat = CUDAQuickSurf::RGB4U; // low quality
01550         break;
01551     }
01552 
01553     // compute both density map and floating point color texture map
01554     int pcount = (usebeads) ? numbeads : atomSel->selected; 
01555     int rc = cudaqs->calc_surf(pcount, &xyzr[0],
01556                                (colorperatom) ? &colors[0] : &cmap[0],
01557                                colorperatom, voltexformat, 
01558                                origin, numvoxels, maxrad,
01559                                radscale, gridspacing, isovalue, gausslim,
01560                                cmdList);
01561 
01562     // If we're running in a memory-limited scenario, we can force
01563     // VMD to dump the QuickSurf GPU data to prevent out-of-memory
01564     // problems later on, either during other surface calcs or when 
01565     // using the GPU for things like OptiX ray tracing
01566     if (getenv("VMDQUICKSURFMINMEM")) {
01567       free_gpu_memory();
01568     }
01569 
01570     if (rc == 0) {
01571       free(xyzr);
01572       if (colors)
01573         free(colors);
01574   
01575       voltime = wkf_timer_timenow(timer);
01576 
01577       PROFILE_POP_RANGE(); // first return point
01578 
01579       return 0;
01580     }
01581   }
01582 #endif
01583 
01584   if (verbose) {
01585     printf("  Computing density map grid on CPUs ");
01586   }
01587 
01588   ptrdiff_t volsz = numvoxels[0] * numvoxels[1] * numvoxels[2];
01589   volmap = new float[volsz];
01590   if (colidx != NULL && cmap != NULL) {
01591     voltexmap = (float*) calloc(1, 3L * sizeof(float) * numvoxels[0] * numvoxels[1] * numvoxels[2]);
01592   }
01593 
01594   fflush(stdout);
01595   memset(volmap, 0, sizeof(float) * volsz);
01596   if ((volsz * atomSel->selected) > 20000000) {
01597     vmd_gaussdensity_threaded(mol->app->cpucaps, verbose, 
01598                               atomSel->selected, &xyzr[0], atomicnum,
01599                               (voltexmap!=NULL) ? &colors[0] : NULL,
01600                               volmap, voltexmap, numvoxels, radscale, 
01601                               gridspacing, isovalue, gausslim);
01602   } else {
01603     vmd_gaussdensity_opt(mol->app->cpucaps, verbose, 
01604                          atomSel->selected, &xyzr[0], atomicnum,
01605                          (voltexmap!=NULL) ? &colors[0] : NULL,
01606                          volmap, voltexmap, numvoxels, radscale, 
01607                          gridspacing, isovalue, gausslim);
01608   }
01609 
01610   free(xyzr);
01611   if (colors)
01612     free(colors);
01613 
01614   voltime = wkf_timer_timenow(timer);
01615 
01616   // draw the surface if the caller provided the display list
01617   if (cmdList != NULL) {
01618     draw_trimesh(cmdList);
01619   }
01620 
01621   if (verbose) {
01622     printf(" Done.\n");
01623   }
01624 
01625   PROFILE_POP_RANGE(); // second return point
01626 
01627   return 0;
01628 }
01629 
01630 
01631 // compute synthetic density map, but nothing else
01632 VolumetricData * QuickSurf::calc_density_map(AtomSel * atomSel, 
01633                                              DrawMolecule *mymol,  
01634                                              const float *atompos, 
01635                                              const float *atomradii,
01636                                              int quality, float radscale, 
01637                                              float gridspacing) {
01638   if (!calc_surf(atomSel, mymol, atompos, atomradii, quality, 
01639                  radscale, gridspacing, 1.0f, NULL, NULL, NULL)) {
01640     VolumetricData *surfvol;
01641     surfvol = new VolumetricData("density map", origin, xaxis, yaxis, zaxis,
01642                                  numvoxels[0], numvoxels[1], numvoxels[2],
01643                                  volmap);
01644     return surfvol;
01645   }
01646 
01647   return NULL;
01648 }
01649 
01650 
01651 // Extract the isosurface from the QuickSurf density map
01652 int QuickSurf::get_trimesh(int &numverts, 
01653                            float *&v3fv, float *&n3fv, float *&c3fv, 
01654                            int &numfacets, int *&fiv) {
01655 
01656   int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL);
01657 
01658   if (verbose)
01659     printf("Running marching cubes on CPU...\n");
01660 
01661   VolumetricData *surfvol; 
01662   surfvol = new VolumetricData("molecular surface",
01663                                origin, xaxis, yaxis, zaxis,
01664                                numvoxels[0], numvoxels[1], numvoxels[2],
01665                                volmap);
01666 
01667   // XXX we should calculate the volume gradient only for those
01668   //     vertices we extract, since for this rep any changes to settings
01669   //     will require recomputation of the entire volume
01670   surfvol->compute_volume_gradient(); // calc gradients: smooth vertex normals
01671   gradtime = wkf_timer_timenow(timer);
01672 
01673   // trimesh polygonalized surface, max of 6 triangles per voxel
01674   const int stepsize = 1;
01675   s.clear();                              // initialize isosurface data
01676   s.compute(surfvol, isovalue, stepsize); // compute the isosurface
01677 
01678   mctime = wkf_timer_timenow(timer);
01679 
01680   s.vertexfusion(9, 9);                   // eliminate duplicated vertices
01681   s.normalize();                          // normalize interpolated gradient/surface normals
01682 
01683   if (s.numtriangles > 0) {
01684     if (voltexmap != NULL) {
01685       // assign per-vertex colors by a 3-D texture map
01686       s.set_color_voltex_rgb3fv(voltexmap);
01687     } else {
01688       // use a single color for the entire mesh
01689       s.set_color_rgb3fv(solidcolor);
01690     }
01691   }
01692 
01693   numverts = s.v.num() / 3;
01694   v3fv=&s.v[0];
01695   n3fv=&s.n[0];
01696   c3fv=&s.c[0];
01697 
01698   numfacets = s.numtriangles;
01699   fiv=&s.f[0];
01700 
01701   delete surfvol;
01702 
01703   mcverttime = wkf_timer_timenow(timer);
01704   reptime = mcverttime;
01705 
01706   if (verbose) {
01707     char strmsg[1024];
01708     sprintf(strmsg, "QuickSurf: %.3f [pre:%.3f vol:%.3f gr:%.3f mc:%.2f mcv:%.3f]",
01709             reptime, pretime, voltime-pretime, gradtime-voltime, 
01710             mctime-gradtime, mcverttime-mctime);
01711 
01712     msgInfo << strmsg << sendmsg;
01713   }
01714  
01715   return 0;
01716 }
01717 
01718 
01719 int QuickSurf::draw_trimesh(VMDDisplayList *cmdList) {
01720   DispCmdTriMesh cmdTriMesh;
01721 
01722   int numverts=0;
01723   float *v=NULL, *n=NULL, *c=NULL;
01724   int numfacets=0;
01725   int *f=NULL;
01726 
01727   get_trimesh(numverts, v, n, c, numfacets, f);
01728 
01729   // Create a triangle mesh
01730   if (numfacets > 0) {
01731     cmdTriMesh.putdata(v, n, c, numverts, f, numfacets, 0, cmdList);
01732   }
01733  
01734   return 0;
01735 }
01736 
01737 
01738 QuickSurf::~QuickSurf() {
01739 #if defined(VMDCUDA)
01740   free_gpu_memory();
01741 #endif
01742 
01743   if (voltexmap != NULL)
01744     free(voltexmap);
01745   voltexmap = NULL;
01746 
01747   wkf_timer_destroy(timer);
01748 }
01749 
01750