QuickSurf_AVX2.C Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: QuickSurf_AVX2.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.10 $      $Date: 2021/11/12 16:36:33 $
00015  *
00016  ***************************************************************************/
00023 // The x86 AVX code path requires FMA and AVX2 integer instructions
00024 // in order to achieve performance that actually beats SSE2.
00025 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
00026 #include <immintrin.h>
00027 #endif
00028 
00029 #include <stdio.h>
00030 #include <stdlib.h>
00031 #include <string.h>
00032 #include <math.h>
00033 #include "QuickSurf.h"
00034 #include "Inform.h"
00035 #include "utilities.h"
00036 #include "WKFUtils.h"
00037 #include "VolumetricData.h"
00038 
00039 #include "VMDDisplayList.h"
00040 #include "Displayable.h"
00041 #include "DispCmds.h"
00042 
00043 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00044 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00045 
00046 
00047 /*
00048  * David J. Hardy
00049  * 12 Dec 2008
00050  *
00051  * aexpfnx() - Approximate expf() for negative x.
00052  *
00053  * Assumes that x <= 0.
00054  *
00055  * Assumes IEEE format for single precision float, specifically:
00056  * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00057  *
00058  * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00059  * multiplication of a fast calculation for 2^(-N).  The interpolation
00060  * uses a linear blending of 3rd degree Taylor polynomials at the end
00061  * points, so the approximation is once differentiable.
00062  *
00063  * The error is small (max relative error per interval is calculated
00064  * to be 0.131%, with a max absolute error of -0.000716).
00065  *
00066  * The cutoff is chosen so as to speed up the computation by early
00067  * exit from function, with the value chosen to give less than the
00068  * the max absolute error.  Use of a cutoff is unnecessary, except
00069  * for needing to shift smallest floating point numbers to zero,
00070  * i.e. you could remove cutoff and replace by:
00071  *
00072  * #define MINXNZ  -88.0296919311130  // -127 * log(2)
00073  *
00074  *   if (x < MINXNZ) return 0.f;
00075  *
00076  * Use of a cutoff causes a discontinuity which can be eliminated
00077  * through the use of a switching function.
00078  *
00079  * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00080  * the interval and weighting their respective Taylor polynomials by the
00081  * kth order Lagrange interpolant through those nodes.  The wiggle in the
00082  * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00083  * can be reduced by using Chebyshev nodes.
00084  */
00085 
00086 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00087 #define __align(X)  __attribute__((aligned(X) ))
00088 #if (__GNUC__ < 4)
00089 #define MISSING_mm_cvtsd_f64
00090 #endif
00091 #else
00092 #define __align(X) __declspec(align(X) )
00093 #endif
00094 
00095 #define MLOG2EF    -1.44269504088896f
00096 
00097 /*
00098  * Interpolating coefficients for linear blending of the
00099  * 3rd degree Taylor expansion of 2^x about 0 and -1.
00100  */
00101 #define SCEXP0     1.0000000000000000f
00102 #define SCEXP1     0.6987082824680118f
00103 #define SCEXP2     0.2633174272827404f
00104 #define SCEXP3     0.0923611991471395f
00105 #define SCEXP4     0.0277520543324108f
00106 
00107 /* for single precision float */
00108 #define EXPOBIAS   127
00109 #define EXPOSHIFT   23
00110 
00111 
00112 /* cutoff is optional, but can help avoid unnecessary work */
00113 #define ACUTOFF    -10
00114 
00115 typedef union flint_t {
00116   float f;
00117   int n;
00118 } flint;
00119 
00120 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00121 // AVX variant of the 'flint' union above
00122 typedef union AVXreg_t {
00123   __m256  f;  // 8x float (AVX)
00124   __m256i i;  // 8x 32-bit int (AVX)
00125   struct {
00126     float r0, r1, r2, r3, r4, r5, r6, r7;  // get the individual registers
00127   } floatreg;
00128 } AVXreg;
00129 #endif
00130 
00131 
00132 
00133 void vmd_gaussdensity_avx2(int verbose,
00134                            int natoms, const float *xyzr,
00135                            const float *atomicnum,
00136                            const float *colors,
00137                            float *densitymap, float *voltexmap,
00138                            const int *numvoxels,
00139                            float radscale, float gridspacing,
00140                            float isovalue, float gausslim) {
00141   int i, x, y, z;
00142   int maxvoxel[3];
00143   maxvoxel[0] = numvoxels[0]-1;
00144   maxvoxel[1] = numvoxels[1]-1;
00145   maxvoxel[2] = numvoxels[2]-1;
00146   const float invgridspacing = 1.0f / gridspacing;
00147 
00148 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00149   // Variables for AVX2 optimized inner loop
00150   __m256 gridspacing8_8 = _mm256_set1_ps(gridspacing * 8.0f);
00151   __attribute__((aligned(16))) float sxdelta8[8]; // 16-byte aligned for AVX2
00152   for (x=0; x<8; x++)
00153     sxdelta8[x] = ((float) x) * gridspacing;
00154 #endif
00155 
00156   // compute colors only if necessary, since they are costly
00157   if (voltexmap != NULL) {
00158     float invisovalue = 1.0f / isovalue;
00159     // compute both density map and floating point color texture map
00160     for (i=0; i<natoms; i++) {
00161       if (verbose && ((i & 0x3fff) == 0)) {
00162         printf(".");
00163         fflush(stdout);
00164       }
00165 
00166       ptrdiff_t ind = i*4L;
00167       float scaledrad = xyzr[ind + 3] * radscale;
00168 
00169       // MDFF atomic number weighted density factor
00170       float atomicnumfactor = 1.0f;
00171       if (atomicnum != NULL) {
00172         atomicnumfactor = atomicnum[i];
00173       }
00174 
00175       // negate, precompute reciprocal, and change to base 2 from the outset
00176       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00177       float radlim = gausslim * scaledrad;
00178       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00179       radlim *= invgridspacing;
00180 
00181 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00182       __m256 atomicnumfactor_8 = _mm256_set1_ps(atomicnumfactor);
00183 #if VMDUSESVMLEXP
00184       // Use of Intel's SVML requires changing the pre-scaling factor
00185       __m256 arinv_8 = _mm256_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF);
00186 #else
00187       __m256 arinv_8 = _mm256_set1_ps(arinv); // Use inlined exp approximation
00188 #endif
00189 #endif
00190 
00191       float tmp;
00192       tmp = xyzr[ind  ] * invgridspacing;
00193       int xmin = MAX((int) (tmp - radlim), 0);
00194       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00195       tmp = xyzr[ind+1] * invgridspacing;
00196       int ymin = MAX((int) (tmp - radlim), 0);
00197       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00198       tmp = xyzr[ind+2] * invgridspacing;
00199       int zmin = MAX((int) (tmp - radlim), 0);
00200       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00201 
00202       float dz = zmin*gridspacing - xyzr[ind+2];
00203       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00204         float dy = ymin*gridspacing - xyzr[ind+1];
00205         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00206           float dy2dz2 = dy*dy + dz*dz;
00207 
00208           // early-exit when outside the cutoff radius in the Y-Z plane
00209           if (dy2dz2 >= radlim2)
00210             continue;
00211 
00212           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00213           float dx = xmin*gridspacing - xyzr[ind];
00214           x=xmin;
00215 
00216 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00217           // Use AVX when we have a multiple-of-8 to compute
00218           // finish all remaining density map points with SSE or regular non-SSE loop
00219           {
00220             __align(16) AVXreg n;
00221             __align(16) AVXreg y;
00222             __m256 dy2dz2_8 = _mm256_set1_ps(dy2dz2);
00223             __m256 dx_8 = _mm256_add_ps(_mm256_set1_ps(dx), _mm256_load_ps(&sxdelta8[0]));
00224 
00225             for (; (x+7)<=xmax; x+=8,dx_8=_mm256_add_ps(dx_8, gridspacing8_8)) {
00226               __m256 r2 = _mm256_fmadd_ps(dx_8, dx_8, dy2dz2_8);
00227               __m256 d;
00228 #if VMDUSESVMLEXP
00229               // use Intel's SVML exp2() routine
00230               y.f = _mm256_exp2_ps(_mm256_mul_ps(r2, arinv_8));
00231 #else
00232               // use our (much faster) fully inlined exponential approximation
00233               y.f = _mm256_mul_ps(r2, arinv_8);         /* already negated and in base 2 */
00234               n.i = _mm256_cvttps_epi32(y.f);
00235               d = _mm256_cvtepi32_ps(n.i);
00236               d = _mm256_sub_ps(d, y.f);
00237 
00238               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00239               // Perform Horner's method to evaluate interpolating polynomial.
00240               y.f = _mm256_fmadd_ps(d, _mm256_set1_ps(SCEXP4), _mm256_set1_ps(SCEXP3));
00241               y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP2));
00242               y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP1));
00243               y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP0));
00244 
00245               // Calculate 2^N exactly by directly manipulating floating point exponent,
00246               // then use it to scale y for the final result.
00247               // We need AVX2 instructions to be able to operate on
00248               // 8-wide integer types efficiently.
00249               n.i = _mm256_sub_epi32(_mm256_set1_epi32(EXPOBIAS), n.i);
00250               n.i = _mm256_slli_epi32(n.i, EXPOSHIFT);
00251               y.f = _mm256_mul_ps(y.f, n.f);
00252               y.f = _mm256_mul_ps(y.f, atomicnumfactor_8); // MDFF density maps
00253 #endif
00254 
00255               // At present, we do unaligned loads/stores since we can't guarantee
00256               // that the X-dimension is always a multiple of 8.
00257               float *ufptr = &densitymap[addr + x];
00258               d = _mm256_loadu_ps(ufptr);
00259               _mm256_storeu_ps(ufptr, _mm256_add_ps(d, y.f));
00260 
00261               // Accumulate density-weighted color to texture map.
00262               // Pre-multiply colors by the inverse isovalue we will extract
00263               // the surface on, to cause the final color to be normalized.
00264               d = _mm256_mul_ps(y.f, _mm256_set1_ps(invisovalue));
00265               ptrdiff_t caddr = (addr + x) * 3L;
00266 
00267 #if 0 && VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00268               // convert rgb3f AOS format to 8-element SOA vectors using shuffle instructions
00269               float *txptr = &voltexmap[caddr];
00270               __m128 *m = (__m128 *) txptr;
00271               // unaligned load of 8 consecutive rgb3f texture map texels
00272               __m256 m03 = _mm256_castps128_ps256(m[0]); // load lower halves
00273               __m256 m14 = _mm256_castps128_ps256(m[1]);
00274               __m256 m25 = _mm256_castps128_ps256(m[2]);
00275               m03  = _mm256_insertf128_ps(m03, m[3], 1); // load upper halves
00276               m14  = _mm256_insertf128_ps(m14, m[4], 1);
00277               m25  = _mm256_insertf128_ps(m25, m[5], 1);
00278 
00279               // upper Rs and Gs
00280               __m256 rg = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2,1,3,2));
00281               // lower Gs and Bs
00282               __m256 gb = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1,0,2,1));
00283               __m256 r  = _mm256_shuffle_ps(m03, rg , _MM_SHUFFLE(2,0,3,0));
00284               __m256 g  = _mm256_shuffle_ps(gb , rg , _MM_SHUFFLE(3,1,2,0));
00285               __m256 b  = _mm256_shuffle_ps(gb , m25, _MM_SHUFFLE(3,0,3,1));
00286 
00287               // accumulate density-scaled colors into texels
00288               r = _mm256_fmadd_ps(d, _mm256_set1_ps(colors[ind    ]), r);
00289               g = _mm256_fmadd_ps(d, _mm256_set1_ps(colors[ind + 1]), g);
00290               b = _mm256_fmadd_ps(d, _mm256_set1_ps(colors[ind + 2]), b);
00291 
00292               // convert 8-element SOA vectors to rgb3f AOS format using shuffle instructions
00293               __m256 rrg = _mm256_shuffle_ps(r, g, _MM_SHUFFLE(2,0,2,0));
00294               __m256 rgb = _mm256_shuffle_ps(g, b, _MM_SHUFFLE(3,1,3,1));
00295               __m256 rbr = _mm256_shuffle_ps(b, r, _MM_SHUFFLE(3,1,2,0));
00296               __m256 r03 = _mm256_shuffle_ps(rrg, rbr, _MM_SHUFFLE(2,0,2,0));
00297               __m256 r14 = _mm256_shuffle_ps(rgb, rrg, _MM_SHUFFLE(3,1,2,0));
00298               __m256 r25 = _mm256_shuffle_ps(rbr, rgb, _MM_SHUFFLE(3,1,3,1));
00299 
00300               // unaligned store of consecutive rgb3f texture map texels
00301               m[0] = _mm256_castps256_ps128( r03 );
00302               m[1] = _mm256_castps256_ps128( r14 );
00303               m[2] = _mm256_castps256_ps128( r25 );
00304               m[3] = _mm256_extractf128_ps( r03 ,1);
00305               m[4] = _mm256_extractf128_ps( r14 ,1);
00306               m[5] = _mm256_extractf128_ps( r25 ,1);
00307 #else
00308               // color by atom colors
00309               float r, g, b;
00310               r = colors[ind    ];
00311               g = colors[ind + 1];
00312               b = colors[ind + 2];
00313 
00314               AVXreg tmp;
00315               tmp.f = d;
00316               float density;
00317               density = tmp.floatreg.r0;
00318               voltexmap[caddr     ] += density * r;
00319               voltexmap[caddr +  1] += density * g;
00320               voltexmap[caddr +  2] += density * b;
00321 
00322               density = tmp.floatreg.r1;
00323               voltexmap[caddr +  3] += density * r;
00324               voltexmap[caddr +  4] += density * g;
00325               voltexmap[caddr +  5] += density * b;
00326 
00327               density = tmp.floatreg.r2;
00328               voltexmap[caddr +  6] += density * r;
00329               voltexmap[caddr +  7] += density * g;
00330               voltexmap[caddr +  8] += density * b;
00331 
00332               density = tmp.floatreg.r3;
00333               voltexmap[caddr +  9] += density * r;
00334               voltexmap[caddr + 10] += density * g;
00335               voltexmap[caddr + 11] += density * b;
00336 
00337               density = tmp.floatreg.r4;
00338               voltexmap[caddr + 12] += density * r;
00339               voltexmap[caddr + 13] += density * g;
00340               voltexmap[caddr + 14] += density * b;
00341 
00342               density = tmp.floatreg.r5;
00343               voltexmap[caddr + 15] += density * r;
00344               voltexmap[caddr + 16] += density * g;
00345               voltexmap[caddr + 17] += density * b;
00346 
00347               density = tmp.floatreg.r6;
00348               voltexmap[caddr + 18] += density * r;
00349               voltexmap[caddr + 19] += density * g;
00350               voltexmap[caddr + 20] += density * b;
00351 
00352               density = tmp.floatreg.r7;
00353               voltexmap[caddr + 21] += density * r;
00354               voltexmap[caddr + 22] += density * g;
00355               voltexmap[caddr + 23] += density * b;
00356 #endif
00357             }
00358           }
00359 #endif
00360 
00361           // finish all remaining density map points with regular non-SSE loop
00362           for (; x<=xmax; x++,dx+=gridspacing) {
00363             float r2 = dx*dx + dy2dz2;
00364 
00365             // use our (much faster) fully inlined exponential approximation
00366             float mb = r2 * arinv;         /* already negated and in base 2 */
00367             int mbflr = (int) mb;          /* get int part, floor() */
00368             float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */
00369 
00370             /* approx with linear blend of Taylor polys */
00371             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00372 
00373             /* 2^(-mbflr) */
00374             flint scalfac;
00375             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00376 
00377             // XXX assume we are never beyond the cutoff value in this loop
00378             float density = (sy * scalfac.f);
00379 
00380             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00381 
00382             // accumulate density value to density map
00383             densitymap[addr + x] += density;
00384 
00385             // Accumulate density-weighted color to texture map.
00386             // Pre-multiply colors by the inverse isovalue we will extract
00387             // the surface on, to cause the final color to be normalized.
00388             density *= invisovalue;
00389             ptrdiff_t caddr = (addr + x) * 3L;
00390 
00391             // color by atom colors
00392             voltexmap[caddr    ] += density * colors[ind    ];
00393             voltexmap[caddr + 1] += density * colors[ind + 1];
00394             voltexmap[caddr + 2] += density * colors[ind + 2];
00395           }
00396         }
00397       }
00398     }
00399   } else {
00400     // compute density map only
00401     for (i=0; i<natoms; i++) {
00402       if (verbose && ((i & 0x3fff) == 0)) {
00403         printf(".");
00404         fflush(stdout);
00405       }
00406 
00407       ptrdiff_t ind = i*4L;
00408       float scaledrad = xyzr[ind+3] * radscale;
00409 
00410       // MDFF atomic number weighted density factor
00411       float atomicnumfactor = 1.0f;
00412       if (atomicnum != NULL) {
00413         atomicnumfactor = atomicnum[i];
00414       }
00415 
00416       // negate, precompute reciprocal, and change to base 2 from the outset
00417       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00418       float radlim = gausslim * scaledrad;
00419       float radlim2 = radlim * radlim; // cutoff test done in cartesian coords
00420       radlim *= invgridspacing;
00421 
00422 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00423       __m256 atomicnumfactor_8 = _mm256_set1_ps(atomicnumfactor);
00424 #if VMDUSESVMLEXP
00425       // Use of Intel's SVML requires changing the pre-scaling factor
00426       __m256 arinv_8 = _mm256_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF);
00427 #else
00428       __m256 arinv_8 = _mm256_set1_ps(arinv); // Use inlined exp approximation
00429 #endif
00430 #endif
00431 
00432       float tmp;
00433       tmp = xyzr[ind  ] * invgridspacing;
00434       int xmin = MAX((int) (tmp - radlim), 0);
00435       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00436       tmp = xyzr[ind+1] * invgridspacing;
00437       int ymin = MAX((int) (tmp - radlim), 0);
00438       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00439       tmp = xyzr[ind+2] * invgridspacing;
00440       int zmin = MAX((int) (tmp - radlim), 0);
00441       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00442 
00443       float dz = zmin*gridspacing - xyzr[ind+2];
00444       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00445         float dy = ymin*gridspacing - xyzr[ind+1];
00446         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00447           float dy2dz2 = dy*dy + dz*dz;
00448 
00449           // early-exit when outside the cutoff radius in the Y-Z plane
00450           if (dy2dz2 >= radlim2)
00451             continue;
00452 
00453           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00454           float dx = xmin*gridspacing - xyzr[ind];
00455           x=xmin;
00456 
00457 #if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00458           // Use AVX when we have a multiple-of-8 to compute
00459           // finish all remaining density map points with regular C++ loop
00460           {
00461             __align(16) AVXreg n;
00462             __align(16) AVXreg y;
00463             __m256 dy2dz2_8 = _mm256_set1_ps(dy2dz2);
00464             __m256 dx_8 = _mm256_add_ps(_mm256_set1_ps(dx), _mm256_load_ps(&sxdelta8[0]));
00465 
00466             for (; (x+7)<=xmax; x+=8,dx_8=_mm256_add_ps(dx_8, gridspacing8_8)) {
00467               __m256 r2 = _mm256_fmadd_ps(dx_8, dx_8, dy2dz2_8);
00468               __m256 d;
00469 #if VMDUSESVMLEXP
00470               // use Intel's SVML exp2() routine
00471               y.f = _mm256_exp2_ps(_mm256_mul_ps(r2, arinv_8));
00472 #else
00473               // use our (much faster) fully inlined exponential approximation
00474               y.f = _mm256_mul_ps(r2, arinv_8);         /* already negated and in base 2 */
00475               n.i = _mm256_cvttps_epi32(y.f);
00476               d = _mm256_cvtepi32_ps(n.i);
00477               d = _mm256_sub_ps(d, y.f);
00478 
00479               // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00480               // Perform Horner's method to evaluate interpolating polynomial.
00481 #if defined(__FMA__)
00482               // use FMA3 instructions when possible
00483               y.f = _mm256_fmadd_ps(d, _mm256_set1_ps(SCEXP4), _mm256_set1_ps(SCEXP3));
00484               y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP2));
00485               y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP1));
00486               y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP0));
00487 #else
00488               y.f = _mm256_mul_ps(d, _mm256_set1_ps(SCEXP4));      /* for x^4 term */
00489               y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP3));    /* for x^3 term */
00490               y.f = _mm256_mul_ps(y.f, d);
00491               y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP2));    /* for x^2 term */
00492               y.f = _mm256_mul_ps(y.f, d);
00493               y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP1));    /* for x^1 term */
00494               y.f = _mm256_mul_ps(y.f, d);
00495               y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP0));    /* for x^0 term */
00496 #endif
00497 
00498               // Calculate 2^N exactly by directly manipulating floating point exponent,
00499               // then use it to scale y for the final result.
00500               // We need AVX2 instructions to be able to operate on
00501               // 8-wide integer types efficiently.
00502               n.i = _mm256_sub_epi32(_mm256_set1_epi32(EXPOBIAS), n.i);
00503               n.i = _mm256_slli_epi32(n.i, EXPOSHIFT);
00504               y.f = _mm256_mul_ps(y.f, n.f);
00505               y.f = _mm256_mul_ps(y.f, atomicnumfactor_8); // MDFF density maps
00506 #endif
00507 
00508               // At present, we do unaligned loads/stores since we can't guarantee
00509               // that the X-dimension is always a multiple of 8.
00510               float *ufptr = &densitymap[addr + x];
00511               d = _mm256_loadu_ps(ufptr);
00512               _mm256_storeu_ps(ufptr, _mm256_add_ps(d, y.f));
00513             }
00514           }
00515 #endif
00516 
00517           // finish all remaining density map points with regular non-SSE loop
00518           for (; x<=xmax; x++,dx+=gridspacing) {
00519             float r2 = dx*dx + dy2dz2;
00520 
00521             // use our (much faster) fully inlined exponential approximation
00522             float mb = r2 * arinv;         /* already negated and in base 2 */
00523             int mbflr = (int) mb;          /* get int part, floor() */
00524             float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */
00525 
00526             /* approx with linear blend of Taylor polys */
00527             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00528 
00529             /* 2^(-mbflr) */
00530             flint scalfac;
00531             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00532 
00533             // XXX assume we are never beyond the cutoff value in this loop
00534             float density = (sy * scalfac.f);
00535 
00536             density *= atomicnumfactor; // MDFF Cryo-EM atomic number density
00537 
00538             densitymap[addr + x] += density;
00539           }
00540         }
00541       }
00542     }
00543   }
00544 
00545   // prevent x86 AVX-SSE transition performance loss due to CPU state 
00546   // transition penalties or false dependence on upper register state
00547   _mm256_zeroupper();
00548 }
00549  
00550 
00551 
00552