QuickSurf_NEON.C Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: QuickSurf_NEON.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.2 $      $Date: 2021/11/12 16:36:33 $
00015  *
00016  ***************************************************************************/
00022 // The NEON code path requires FMA instructions
00023 // Due to differences in code generation between gcc/intelc/clang/msvc, we
00024 // don't have to check for a defined(__NEON__)
00025 #if defined(VMDCPUDISPATCH) && defined(VMDUSENEON)
00026 #include <arm_neon.h>
00027 
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 #include <math.h>
00032 #include "QuickSurf.h"
00033 #include "Inform.h"
00034 #include "utilities.h"
00035 #include "WKFUtils.h"
00036 #include "VolumetricData.h"
00037 
00038 #include "VMDDisplayList.h"
00039 #include "Displayable.h"
00040 #include "DispCmds.h"
00041 
00042 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00043 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00044 
00045 
00046 /*
00047  * David J. Hardy
00048  * 12 Dec 2008
00049  *
00050  * aexpfnx() - Approximate expf() for negative x.
00051  *
00052  * Assumes that x <= 0.
00053  *
00054  * Assumes IEEE format for single precision float, specifically:
00055  * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00056  *
00057  * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00058  * multiplication of a fast calculation for 2^(-N).  The interpolation
00059  * uses a linear blending of 3rd degree Taylor polynomials at the end
00060  * points, so the approximation is once differentiable.
00061  *
00062  * The error is small (max relative error per interval is calculated
00063  * to be 0.131%, with a max absolute error of -0.000716).
00064  *
00065  * The cutoff is chosen so as to speed up the computation by early
00066  * exit from function, with the value chosen to give less than the
00067  * the max absolute error.  Use of a cutoff is unnecessary, except
00068  * for needing to shift smallest floating point numbers to zero,
00069  * i.e. you could remove cutoff and replace by:
00070  *
00071  * #define MINXNZ  -88.0296919311130  // -127 * log(2)
00072  *
00073  *   if (x < MINXNZ) return 0.f;
00074  *
00075  * Use of a cutoff causes a discontinuity which can be eliminated
00076  * through the use of a switching function.
00077  *
00078  * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00079  * the interval and weighting their respective Taylor polynomials by the
00080  * kth order Lagrange interpolant through those nodes.  The wiggle in the
00081  * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00082  * can be reduced by using Chebyshev nodes.
00083  */
00084 
00085 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00086 #define __align(X)  __attribute__((aligned(X) ))
00087 #else
00088 #define __align(X) __declspec(align(X) )
00089 #endif
00090 
00091 #define MLOG2EF    -1.44269504088896f
00092 
00093 /*
00094  * Interpolating coefficients for linear blending of the
00095  * 3rd degree Taylor expansion of 2^x about 0 and -1.
00096  */
00097 #define SCEXP0     1.0000000000000000f
00098 #define SCEXP1     0.6987082824680118f
00099 #define SCEXP2     0.2633174272827404f
00100 #define SCEXP3     0.0923611991471395f
00101 #define SCEXP4     0.0277520543324108f
00102 
00103 /* for single precision float */
00104 #define EXPOBIAS   127
00105 #define EXPOSHIFT   23
00106 
00107 
00108 /* cutoff is optional, but can help avoid unnecessary work */
00109 #define ACUTOFF    -10
00110 
00111 typedef union flint_t {
00112   float f;
00113   int n;
00114 } flint;
00115 
00116 // NEON variant of the 'flint' union above
00117 typedef union NEONreg_t {
00118   float32x4_t f;  // 4x float (NEON)
00119   int32x4_t i;    // 4x 32-bit int (NEON)
00120   struct {
00121     float r0, r1, r2, r3;  // get the individual registers
00122   } floatreg;
00123 } NEONreg;
00124 
00125 void vmd_gaussdensity_neon(int verbose,
00126                            int natoms, const float *xyzr,
00127                            const float *atomicnum,
00128                            const float *colors,
00129                            float *densitymap, float *voltexmap,
00130                            const int *numvoxels,
00131                            float radscale, float gridspacing,
00132                            float isovalue, float gausslim) {
00133   int i, x, y, z;
00134   int maxvoxel[3];
00135   maxvoxel[0] = numvoxels[0]-1;
00136   maxvoxel[1] = numvoxels[1]-1;
00137   maxvoxel[2] = numvoxels[2]-1;
00138   const float invgridspacing = 1.0f / gridspacing;
00139 
00140   // Variables for NEON optimized inner loop
00141   float32x4_t gridspacing4_4;
00142   __attribute__((aligned(16))) float sxdelta4[4]; // 16-byte aligned for 
00143 
00144   gridspacing4_4 = vdupq_n_f32(gridspacing * 4.0f);
00145   for (x=0; x<4; x++)
00146     sxdelta4[x] = ((float) x) * gridspacing;
00147 
00148   // compute colors only if necessary, since they are costly
00149   if (voltexmap != NULL) {
00150     float invisovalue = 1.0f / isovalue;
00151     // compute both density map and floating point color texture map
00152     for (i=0; i<natoms; i++) {
00153       if (verbose && ((i & 0x3fff) == 0)) {
00154         printf(".");
00155         fflush(stdout);
00156       }
00157 
00158       ptrdiff_t ind = i*4L;
00159       float scaledrad = xyzr[ind + 3] * radscale;
00160 
00161       // MDFF atomic number weighted density factor
00162       float atomicnumfactor = 1.0f;
00163       if (atomicnum != NULL) {
00164         atomicnumfactor = atomicnum[i];
00165       }
00166 
00167       // negate, precompute reciprocal, and change to base 2 from the outset
00168       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00169       float radlim = gausslim * scaledrad;
00170       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00171       radlim *= invgridspacing;
00172 
00173       float32x4_t atomicnumfactor_4;
00174       float32x4_t arinv_4;
00175       atomicnumfactor_4 = vdupq_n_f32(atomicnumfactor);
00176 
00177       // Use our fully inlined exp approximation
00178       arinv_4 = vdupq_n_f32(arinv);
00179 
00180       float tmp;
00181       tmp = xyzr[ind  ] * invgridspacing;
00182       int xmin = MAX((int) (tmp - radlim), 0);
00183       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00184       tmp = xyzr[ind+1] * invgridspacing;
00185       int ymin = MAX((int) (tmp - radlim), 0);
00186       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00187       tmp = xyzr[ind+2] * invgridspacing;
00188       int zmin = MAX((int) (tmp - radlim), 0);
00189       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00190 
00191       float dz = zmin*gridspacing - xyzr[ind+2];
00192       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00193         float dy = ymin*gridspacing - xyzr[ind+1];
00194         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00195           float dy2dz2 = dy*dy + dz*dz;
00196 
00197           // early-exit when outside the cutoff radius in the Y-Z plane
00198           if (dy2dz2 >= radlim2)
00199             continue;
00200 
00201           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00202           float dx = xmin*gridspacing - xyzr[ind];
00203           x=xmin;
00204 
00205           // Use NEON when we have a multiple-of-4 to compute
00206           // finish all remaining density map points with regular C++ loop
00207           {
00208             __align(16) NEONreg n;
00209             __align(16) NEONreg y;
00210             float32x4_t dy2dz2_4 = vdupq_n_f32(dy2dz2);
00211             float32x4_t dx_4 = vaddq_f32(vdupq_n_f32(dx), vld1q_f32(&sxdelta4[0]));
00212 
00213             for (; (x+3)<=xmax; x+=4,dx_4=vaddq_f32(dx_4, gridspacing4_4)) {
00214               float32x4_t r2 = vfmaq_f32(dy2dz2_4, dx_4, dx_4);
00215               float32x4_t d;
00216 
00217               // use our (much faster) fully inlined exponential approximation
00218               y.f = vmulq_f32(r2, arinv_4);  // already negated and in base 2
00219               n.i = vcvtq_s32_f32(y.f);
00220               d = vcvtq_f32_s32(n.i);
00221               d = vsubq_f32(d, y.f);
00222 
00223               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00224               // Perform Horner's method to evaluate interpolating polynomial.
00225               y.f = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00226               y.f = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y.f);
00227               y.f = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y.f);
00228               y.f = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y.f);
00229 
00230               // Calculate 2^N exactly by directly manipulating floating point
00231               // exponent, then use it to scale y for the final result.
00232               n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00233               n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00234               y.f = vmulq_f32(y.f, n.f);
00235               y.f = vmulq_f32(y.f, atomicnumfactor_4); // MDFF density maps
00236 
00237               float *ufptr = &densitymap[addr + x];
00238               d = vld1q_f32(ufptr); // ARM64 NEON isn't alignment-sensitive
00239               vst1q_f32(ufptr, vaddq_f32(d, y.f));
00240 
00241               // Accumulate density-weighted color to texture map.
00242               // Pre-multiply colors by the inverse isovalue we will extract
00243               // the surface on, to cause the final color to be normalized.
00244               d = vmulq_f32(y.f, vdupq_n_f32(invisovalue));
00245               ptrdiff_t caddr = (addr + x) * 3L;
00246 
00247 #if 0
00248               // NEON AOS/SOA conversions
00249 #else
00250               // color by atom colors
00251               float r, g, b;
00252               r = colors[ind    ];
00253               g = colors[ind + 1];
00254               b = colors[ind + 2];
00255 
00256               NEONreg tmp;
00257               tmp.f = d;
00258               float density;
00259               density = tmp.floatreg.r0;
00260               voltexmap[caddr     ] += density * r;
00261               voltexmap[caddr +  1] += density * g;
00262               voltexmap[caddr +  2] += density * b;
00263 
00264               density = tmp.floatreg.r1;
00265               voltexmap[caddr +  3] += density * r;
00266               voltexmap[caddr +  4] += density * g;
00267               voltexmap[caddr +  5] += density * b;
00268 
00269               density = tmp.floatreg.r2;
00270               voltexmap[caddr +  6] += density * r;
00271               voltexmap[caddr +  7] += density * g;
00272               voltexmap[caddr +  8] += density * b;
00273 
00274               density = tmp.floatreg.r3;
00275               voltexmap[caddr +  9] += density * r;
00276               voltexmap[caddr + 10] += density * g;
00277               voltexmap[caddr + 11] += density * b;
00278 #endif
00279             }
00280           }
00281 
00282           // finish all remaining density map points with regular C++ loop
00283           for (; x<=xmax; x++,dx+=gridspacing) {
00284             float r2 = dx*dx + dy2dz2;
00285 
00286             // use our (much faster) fully inlined exponential approximation
00287             float mb = r2 * arinv;         /* already negated and in base 2 */
00288             int mbflr = (int) mb;          /* get int part, floor() */
00289             float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */
00290 
00291             /* approx with linear blend of Taylor polys */
00292             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00293 
00294             /* 2^(-mbflr) */
00295             flint scalfac;
00296             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00297 
00298             // XXX assume we are never beyond the cutoff value in this loop
00299             float density = (sy * scalfac.f);
00300 
00301             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00302 
00303             // accumulate density value to density map
00304             densitymap[addr + x] += density;
00305 
00306             // Accumulate density-weighted color to texture map.
00307             // Pre-multiply colors by the inverse isovalue we will extract
00308             // the surface on, to cause the final color to be normalized.
00309             density *= invisovalue;
00310             ptrdiff_t caddr = (addr + x) * 3L;
00311 
00312             // color by atom colors
00313             voltexmap[caddr    ] += density * colors[ind    ];
00314             voltexmap[caddr + 1] += density * colors[ind + 1];
00315             voltexmap[caddr + 2] += density * colors[ind + 2];
00316           }
00317         }
00318       }
00319     }
00320   } else {
00321     // compute density map only
00322     for (i=0; i<natoms; i++) {
00323       if (verbose && ((i & 0x3fff) == 0)) {
00324         printf(".");
00325         fflush(stdout);
00326       }
00327 
00328       ptrdiff_t ind = i*4L;
00329       float scaledrad = xyzr[ind+3] * radscale;
00330 
00331       // MDFF atomic number weighted density factor
00332       float atomicnumfactor = 1.0f;
00333       if (atomicnum != NULL) {
00334         atomicnumfactor = atomicnum[i];
00335       }
00336 
00337       // negate, precompute reciprocal, and change to base 2 from the outset
00338       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00339       float radlim = gausslim * scaledrad;
00340       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00341       radlim *= invgridspacing;
00342 
00343       float32x4_t atomicnumfactor_4 = vdupq_n_f32(atomicnumfactor);
00344       float32x4_t arinv_4= vdupq_n_f32(arinv); // Use inlined exp approximation
00345 
00346       float tmp;
00347       tmp = xyzr[ind  ] * invgridspacing;
00348       int xmin = MAX((int) (tmp - radlim), 0);
00349       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00350       tmp = xyzr[ind+1] * invgridspacing;
00351       int ymin = MAX((int) (tmp - radlim), 0);
00352       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00353       tmp = xyzr[ind+2] * invgridspacing;
00354       int zmin = MAX((int) (tmp - radlim), 0);
00355       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00356 
00357       float dz = zmin*gridspacing - xyzr[ind+2];
00358       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00359         float dy = ymin*gridspacing - xyzr[ind+1];
00360         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00361           float dy2dz2 = dy*dy + dz*dz;
00362 
00363           // early-exit when outside the cutoff radius in the Y-Z plane
00364           if (dy2dz2 >= radlim2)
00365             continue;
00366 
00367           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00368           float dx = xmin*gridspacing - xyzr[ind];
00369           x=xmin;
00370 
00371           // Use NEON when we have a multiple-of-4 to compute
00372           // finish all remaining density map points with C++ loop
00373           {
00374             __align(16) NEONreg n;
00375             __align(16) NEONreg y;
00376             float32x4_t dy2dz2_4 = vdupq_n_f32(dy2dz2);
00377             float32x4_t dx_4 = vaddq_f32(vdupq_n_f32(dx), vld1q_f32(&sxdelta4[0]));
00378 
00379             for (; (x+3)<=xmax; x+=4,dx_4=vaddq_f32(dx_4, gridspacing4_4)) {
00380               float32x4_t r2 = vfmaq_f32(dy2dz2_4, dx_4, dx_4);
00381               float32x4_t d;
00382 
00383               // use our (much faster) fully inlined exponential approximation
00384               y.f = vmulq_f32(r2, arinv_4);         /* already negated and in base 2 */
00385               n.i = vcvtq_s32_f32(y.f);
00386               d = vcvtq_f32_s32(n.i);
00387               d = vsubq_f32(d, y.f);
00388 
00389               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00390               // Perform Horner's method to evaluate interpolating polynomial.
00391 #if __ARM_FEATURE_FMA
00392               // use FMA3 instructions when possible
00393               y.f = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00394               y.f = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y.f);
00395               y.f = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y.f);
00396               y.f = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y.f);
00397 #else
00398               y.f = vmulq_f32(d, vdupq_n_f32(SCEXP4));      /* for x^4 term */
00399               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP3));    /* for x^3 term */
00400               y.f = vmulq_f32(y.f, d);
00401               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP2));    /* for x^2 term */
00402               y.f = vmulq_f32(y.f, d);
00403               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP1));    /* for x^1 term */
00404               y.f = vmulq_f32(y.f, d);
00405               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP0));    /* for x^0 term */
00406 #endif
00407 
00408               // Calculate 2^N exactly by directly manipulating floating point 
00409               // exponent, then use it to scale y for the final result.
00410               n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00411               n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00412               y.f = vmulq_f32(y.f, n.f);
00413               y.f = vmulq_f32(y.f, atomicnumfactor_4); // MDFF density maps
00414 
00415               float *ufptr = &densitymap[addr + x];
00416               d = vld1q_f32(ufptr); // ARM64 NEON isn't alignment-sensitive
00417               vst1q_f32(ufptr, vaddq_f32(d, y.f));
00418             }
00419           }
00420 
00421           // finish all remaining density map points with regular C++ loop
00422           for (; x<=xmax; x++,dx+=gridspacing) {
00423             float r2 = dx*dx + dy2dz2;
00424 
00425             // use our (much faster) fully inlined exponential approximation
00426             float mb = r2 * arinv;         /* already negated and in base 2 */
00427             int mbflr = (int) mb;          /* get int part, floor() */
00428             float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */
00429 
00430             /* approx with linear blend of Taylor polys */
00431             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00432 
00433             /* 2^(-mbflr) */
00434             flint scalfac;
00435             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00436 
00437             // XXX assume we are never beyond the cutoff value in this loop
00438             float density = (sy * scalfac.f);
00439 
00440             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00441 
00442             densitymap[addr + x] += density;
00443           }
00444         }
00445       }
00446     }
00447   }
00448 }
00449  
00450 
00451 #endif
00452