
util_simd.C

00001 /***************************************************************************
00002  *cr                                                                       
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the           
00004  *cr                        University of Illinois                       
00005  *cr                         All Rights Reserved                        
00006  *cr                                                                   
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: util_simd.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.32 $       $Date: 2022/05/23 19:10:02 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *
00019  * Hand-coded SIMD loops using compiler-provided intrinsics, or inline
00020  * assembly code, to generate highly optimized machine code for the
00021  * time-critical loops that crop up in commonly used features of VMD. 
00022  *
00023  ***************************************************************************/
00024 
00025 #include <stddef.h>     // ptrdiff_t and size_t
00026 #include "WKFThreads.h" // CPU capability flags
00027 
00028 // pgcc has trouble with hand-vectorized x86 intrinsics
00029 #if !defined(__PGIC__)
00030 #define VMDUSESSE 1     // enable SSE in combination with target macros
00031 // #define VMDUSEVSX 1
00032 #endif
00033 // #define VMDUSENEON 1
00034 
00035 
00036 // Intel x86 hardware
00037 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00038 #if !defined(__SSE2__) && defined(_WIN64)
00039 #define __SSE2__ 1      /* MSVC fails to define SSE macros */
00040 #endif
00041 #endif
00042 
00043 
00044 #if defined(VMDUSESSE) && defined(__SSE2__)
00045 #include <emmintrin.h>
00046 #endif
00047 #if defined(VMDUSEAVX) && defined(__AVX__)
00048 #include <immintrin.h>
00049 #endif
00050 #if defined(VMDUSENEON) && defined(__ARM_NEON__)
00051 #include <arm_neon.h>
00052 #endif
00053 #if (defined(VMDUSEVSX) && defined(__VSX__))
00054 #if defined(__GNUC__) && defined(__VEC__)
00055 // IBM Power 8/9/10 Altivec/VSX instructions:
00056 //   https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
00057 #include <altivec.h>
00058 #endif
00059 #endif
00060 
00061 // #include <string.h>
00062 // #include <ctype.h>
00063 #include <math.h>
00064 #include <stdio.h>
00065 #include <stdlib.h>
00066 
00067 #if defined(_MSC_VER)
00068 #include <windows.h>
00069 #include <conio.h>
00070 #else
00071 #include <unistd.h>
00072 #endif // _MSC_VER
00073 
00074 // fctn prototypes for AVX and AVX2 runtime dispatch
00075 int analyze_selection_aligned_avx(int n, const int *on,
00076                                   int *firstsel, int *lastsel, int *selected);
00077 int analyze_selection_aligned_avx2(int n, const int *on,
00078                                   int *firstsel, int *lastsel, int *selected);
00079 
00080 #if 0
00081 //
00082 // XXX array init/copy routines that avoid polluting cache, where possible
00083 //
00084 // Fast 16-byte-aligned integer assignment loop for use in the
00085 // VMD color scale routines
00086 void set_1fv_aligned(int *iv, int n, const int val) {
00087   int i=0;
00088 
00089 #if defined(VMDUSESSE) && defined(__SSE2__)
00090   __m128i val4 = _mm_set1_epi32(val);
00091   // do groups of four elements using cache-bypassing streaming stores
00092   for (; i<(n-3); i+=4)
00093     _mm_stream_si128((__m128i*) &iv[i], val4);
00094 #endif
  for (; i<n; i++) iv[i] = val;  // finish any remaining elements
00095 }
00096 #endif
00097 
00098 
00099 #if defined(VMDUSESSE) || defined(VMDUSEAVX) || defined(VMDUSEVSX) || defined(VMDUSENEON)
00100 
00101 //
00102 // Helper routines for use when coping with unaligned
00103 // buffers returned by malloc() on many GNU systems:
00104 //   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24261
00105 //   http://www.sourceware.org/bugzilla/show_bug.cgi?id=206
00106 //
00107 // XXX until all compilers support uintptr_t, we have to do 
00108 //     dangerous and ugly things with pointer casting here...
00109 //
00110 #if defined(_WIN64) /* sizeof(size_t) == sizeof(void*) */
00111 #define myintptrtype size_t
00112 #elif 1 /* sizeof(unsigned long) == sizeof(void*) */
00113 #define myintptrtype unsigned long
00114 #else /* C99 */
00115 #define myintptrtype uintptr_t
00116 #endif
00117 
00118 #if 0
00119 // arbitrary pointer alignment test
00120 static int is_Nbyte_aligned(const void *ptr, int N) {
00121   return ((((myintptrtype) ptr) % N) == 0);
00122 }
00123 #endif
00124 
00125 #if (defined(VMDUSESSE) && defined(__SSE2__)) || (defined(VMDUSEVSX) && defined(__VSX__)) || (defined(VMDUSEAVX) && defined(__AVX__)) || (defined(VMDUSENEON) && defined(__ARM_NEON__))
00126 // Alignment test routine for x86 16-byte SSE vector instructions
00127 static int is_16byte_aligned(const void *ptr) {
00128   return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0xf)));
00129 }
00130 #endif
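// The alignment tests above exist because, as noted in the GCC and glibc bug
// reports cited earlier, malloc() may hand back buffers that are only 8-byte
// aligned.  A minimal sketch (not part of the original source; the helper
// name is hypothetical) of one way a caller could force 16-byte alignment by
// over-allocating and rounding the returned pointer up:
#if 0
static void * alloc_aligned16(size_t sz, void **rawptr) {
  char *raw = (char *) malloc(sz + 15); // worst-case padding for 16-byte alignment
  if (raw == NULL)
    return NULL;
  *rawptr = raw;                        // caller must free() this original pointer
  return (void *) ((((myintptrtype) raw) + 15) & (~0xf)); // round up to boundary
}
#endif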
00131 
00132 #if defined(VMDUSEAVX)
00133 // Alignment test routine for x86 32-byte AVX vector instructions
00134 static int is_32byte_aligned(const void *ptr) {
00135   return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0x1f)));
00136 }
00137 #endif
00138 
00139 #if 0
00140 // Alignment test routine for x86 LRB/MIC 64-byte vector instructions
00141 static int is_64byte_aligned(const void *ptr) {
00142   return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0x3f)));
00143 }
00144 #endif
00145 #endif 
00146 
00147 
00148 //
00149 // Small inlinable SSE helper routines to make code easier to read
00150 //
00151 #if defined(VMDUSESSE) && defined(__SSE2__)
00152 
00153 #if 0
00154 static void print_m128i(__m128i mask4) {
00155   int * iv = (int *) &mask4;
00156   printf("vec: %08x %08x %08x %08x\n", iv[0], iv[1], iv[2], iv[3]);
00157 }
00158 
00159 static int hand_m128i(__m128i mask4) {
00160   __m128i tmp = mask4;
00161   tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
00162   tmp = _mm_and_si128(mask4, tmp);
00163   mask4 = tmp;
00164   tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
00165   tmp = _mm_and_si128(mask4, tmp);
00166   mask4 = tmp; // all 4 elements are now set to the reduced mask
00167 
00168   int mask = _mm_cvtsi128_si32(mask4); // return zeroth element
00169   return mask;
00170 }
00171 #endif
00172 
00173 
00174 static int hor_m128i(__m128i mask4) {
00175 #if 0
00176   int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mask4, _mm_set1_epi32(1)));
00177 #else
00178   __m128i tmp = mask4;
00179   tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
00180   tmp = _mm_or_si128(mask4, tmp);
00181   mask4 = tmp;
00182   tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
00183   tmp = _mm_or_si128(mask4, tmp);
00184   mask4 = tmp; // all 4 elements are now set to the reduced mask
00185 
00186   int mask = _mm_cvtsi128_si32(mask4); // return zeroth element
00187 #endif
00188   return mask;
00189 }
00190 
00191 
00192 static int hadd_m128i(__m128i sum4) {
00193   __m128i tmp = sum4;
00194   tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
00195   tmp = _mm_add_epi32(sum4, tmp);
00196   sum4 = tmp;
00197   tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
00198   tmp = _mm_add_epi32(sum4, tmp);
00199   sum4 = tmp; // all 4 elements are now set to the sum
00200 
00201   int sum = _mm_cvtsi128_si32(sum4); // return zeroth element
00202   return sum;
00203 }
00204 
00205 
00206 #if 0
00207 static __m128i _mm_sel_m128i(const __m128i &a, const __m128i &b, const __m128i &mask) {
00208   // (((b ^ a) & mask)^a)
00209   return _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(b, a)));
00210 }
00211 #endif
00212 
00213 
00214 static __m128 _mm_sel_ps(const __m128 &a, const __m128 &b, const __m128 &mask) {
00215   // (((b ^ a) & mask)^a)
00216   return _mm_xor_ps(a, _mm_and_ps(mask, _mm_xor_ps(b, a)));
00217 }
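// A scalar analogue (illustrative sketch, not part of the original source) of
// the branchless select used by _mm_sel_ps() above: with an all-ones or
// all-zeros mask, (((b ^ a) & mask) ^ a) yields b when the mask is set and a
// when it is clear, avoiding a per-element branch.
#if 0
static int sel_scalar(int a, int b, int mask) { // mask must be 0 or ~0
  return ((b ^ a) & mask) ^ a;
}
#endif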
00218 
00219 
00220 // helper routine to perform a min among all 4 elements of an __m128
00221 static float fmin_m128(__m128 min4) {
00222   __m128 tmp;
00223   tmp = min4;
00224   tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2, 3, 0, 1));
00225   tmp = _mm_min_ps(min4, tmp);
00226   min4 = tmp;
00227   tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2));
00228   tmp = _mm_min_ps(min4, tmp);
00229   min4 = tmp; // all 4 elements are now set to the min
00230 
00231   float fmin;
00232   _mm_store_ss(&fmin, min4);
00233   return fmin;
00234 }
00235 
00236 
00237 // helper routine to perform a max among all 4 elements of an __m128
00238 static float fmax_m128(__m128 max4) {
00239   __m128 tmp = max4;
00240   tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2, 3, 0, 1));
00241   tmp = _mm_max_ps(max4, tmp);
00242   max4 = tmp;
00243   tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2));
00244   tmp = _mm_max_ps(max4, tmp);
00245   max4 = tmp; // all 4 elements are now set to the max
00246 
00247   float fmax;
00248   _mm_store_ss(&fmax, max4);
00249   return fmax;
00250 }
00251 #endif
00252 
00253 
00254 //
00255 // Small inlinable ARM Neon helper routines to make code easier to read
00256 //
00257 #if defined(VMDUSENEON) && defined(__ARM_NEON__)
00258 
00259 // helper routine to perform a min among all 4 elements of a float32x4_t
00260 static float fmin_f32x4(float32x4_t min4) {
00261   float *f1 = (float *) &min4;
00262   float min1 = f1[0];
00263   if (f1[1] < min1) min1 = f1[1];
00264   if (f1[2] < min1) min1 = f1[2];
00265   if (f1[3] < min1) min1 = f1[3];
00266   return min1;
00267 }
00268 
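// helper routine to perform a max among all 4 elements of a float32x4_t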
00269 static float fmax_f32x4(float32x4_t max4) {
00270   float *f1 = (float *) &max4;
00271   float max1 = f1[0];
00272   if (f1[1] > max1) max1 = f1[1];
00273   if (f1[2] > max1) max1 = f1[2];
00274   if (f1[3] > max1) max1 = f1[3];
00275   return max1;
00276 }
00277 
00278 #endif
00279 
00280 
00281 // Find the first selected atom
00282 int find_first_selection_aligned(int n, const int *on, int *firstsel) {
00283   int i;
00284   *firstsel = 0;
00285 
00286   // find the first selected atom, if any
00287 #if defined(VMDUSESSE) && defined(__SSE2__)
00288   // roll up to the first 16-byte-aligned array index
00289   for (i=0; ((i<n) && !is_16byte_aligned(&on[i])); i++) {
00290     if (on[i]) {
00291       *firstsel = i; // found first selected atom
00292       return 0;
00293     }
00294   }
00295 
00296   // SSE vectorized search loop
00297   for (; i<(n-3); i+=4) {
00298     // aligned load of 4 selection flags
00299     __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
00300     if (hor_m128i(on4))
00301       break; // found a block containing the first selected atom
00302   }
00303 
00304   for (; i<n; i++) {
00305     if (on[i]) {
00306       *firstsel = i; // found first selected atom
00307       return 0;
00308     }
00309   }
00310 #elif 0 && (defined(VMDUSEVSX) && defined(__VSX__))
00311   // roll up to the first 16-byte-aligned array index
00312   for (i=0; ((i<n) && !is_16byte_aligned(&on[i])); i++) {
00313     if (on[i]) {
00314       *firstsel = i; // found first selected atom
00315       return 0;
00316     }
00317   }
00318 
00319   // VSX vectorized search loop
00320   for (; i<(n-3); i+=4) {
00321     // aligned load of 4 selection flags
00322     __vector signed int on4 = *((__vector signed int *) &on[i]);
00323     if (vec_extract(vec_max(on4, on4), 0))
00324       break; // found a block containing the first selected atom
00325   }
00326 
00327   for (; i<n; i++) {
00328     if (on[i]) {
00329       *firstsel = i; // found first selected atom
00330       return 0;
00331     }
00332   }
00333 #else
00334   // plain C...
00335   for (i=0; i<n; i++) {
00336     if (on[i]) {
00337       *firstsel = i; // found first selected atom
00338       return 0;
00339     }
00340   }
00341 #endif
00342 
00343   // no atoms were selected if we got here
00344   *firstsel = 0;
00345   return -1;
00346 }
00347 
00348 
00349 // Find the last selected atom
00350 int find_last_selection_aligned(int n, const int *on, int *lastsel) {
00351   int i;
00352   *lastsel =  -1;
00353 
00354   // find the last selected atom, if any
00355 #if defined(VMDUSESSE) && defined(__SSE2__)
00356   // SSE vectorized search loop
00357   // Roll down to next 16-byte boundary
00358   for (i=n-1; i>=0; i--) {
00359     if (on[i]) {
00360       *lastsel = i; // found last selected atom
00361       return 0;
00362     }
00363 
00364     // drop out of the alignment loop once we hit a 16-byte boundary
00365     if (is_16byte_aligned(&on[i]))
00366       break;
00367   }
00368 
00369   for (i-=4; i>=0; i-=4) {
00370     // aligned load of 4 selection flags
00371     __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
00372     if (hor_m128i(on4))
00373       break; // found a block containing the last selected atom
00374   }
00375 
00376   int last4 = (i < 0) ? 0 : i; // clamp index in case no vector block matched
00377   for (i=(((last4+3) < n) ? (last4+3) : (n-1)); i>=last4; i--) {
00378     if (on[i]) {
00379       *lastsel = i; // found last selected atom
00380       return 0;
00381     }
00382   }
00383 #elif 0 && (defined(VMDUSEVSX) && defined(__VSX__))
00384   // VSX vectorized search loop
00385   // Roll down to next 16-byte boundary
00386   for (i=n-1; i>=0; i--) {
00387     if (on[i]) {
00388       *lastsel = i; // found last selected atom
00389       return 0;
00390     }
00391 
00392     // drop out of the alignment loop once we hit a 16-byte boundary
00393     if (is_16byte_aligned(&on[i]))
00394       break;
00395   }
00396 
00397   for (i-=4; i>=0; i-=4) {
00398     // aligned load of 4 selection flags
00399     __vector signed int on4 = *((__vector signed int *) &on[i]);
00400     if (vec_extract(vec_max(on4, on4), 0))
00401       break; // found a block containing the last selected atom
00402   }
00403 
00404   int last4 = (i < 0) ? 0 : i; // clamp index in case no vector block matched
00405   for (i=(((last4+3) < n) ? (last4+3) : (n-1)); i>=last4; i--) {
00406     if (on[i]) {
00407       *lastsel = i; // found last selected atom
00408       return 0;
00409     }
00410   }
00411 #else
00412   // plain C...
00413   for (i=n-1; i>=0; i--) {
00414     if (on[i]) {
00415       *lastsel = i; // found last selected atom
00416       return 0;
00417     }
00418   }
00419 #endif
00420 
00421   // no atoms were selected if we got here
00422   *lastsel = -1;
00423   return -1;
00424 }
00425 
00426 
00427 // Find the first selected atom, the last selected atom,
00428 // and the total number of selected atoms.
00429 int analyze_selection_aligned(int n, const int *on, 
00430                               int *firstsel, int *lastsel, int *selected) {
00431   int sel   = 0;   // set count to zero in case of early-exit
00432   int first = 0;   // if we early-exit, firstsel is 0 
00433   int last  = -1;  // and lastsel is -1
00434   int i;
00435 
00436   // assign potential early-exit outcomes
00437   if (selected != NULL)
00438     *selected = sel; 
00439 
00440   if (firstsel != NULL)
00441     *firstsel = first;
00442 
00443   if (lastsel != NULL)
00444     *lastsel = last;
00445 
00446   // find the first selected atom, if any
00447   if ((firstsel != NULL) || (selected != NULL)) {
00448     if (find_first_selection_aligned(n, on, &first)) {
00449       return -1; // indicate that no selection was found
00450     }
00451   }
00452 
00453   // find the last selected atom, if any
00454   if ((lastsel != NULL) || (selected != NULL)) {
00455     if (find_last_selection_aligned(n, on, &last)) {
00456       return -1; // indicate that no selection was found
00457     }
00458   }
00459 
00460   // count the number of selected atoms (there are only 0s and 1s)
00461   // and determine the index of the last selected atom
00462   if (selected != NULL) {
00463     // XXX the Intel 12.x compiler is able to beat this code in some
00464     //     cases, but GCC 4.x cannot, so for Intel C/C++ we use the plain C 
00465     //     loop and let it autovectorize, but for GCC we do it by hand.
00466 #if !defined(__INTEL_COMPILER) && defined(VMDUSESSE) && defined(__SSE2__)
00467     // SSE vectorized search loop
00468     // Roll up to next 16-byte boundary
00469     for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
00470       sel += on[i];
00471     }
00472 
00473 #if 1
00474     // Process groups of 4 flags at a time
00475     __m128i sum4 = _mm_setzero_si128();
00476     for (; i<=(last-3); i+=4) {
00477       // aligned load of four selection flags
00478       __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
00479 
00480       // sum selected atoms vertically
00481       sum4 = _mm_add_epi32(sum4, on4);
00482     }
00483     sel += hadd_m128i(sum4); // sum horizontally to finalize count
00484 #else
00485     // Process groups of 4 flags at a time
00486     for (; i<=(last-3); i+=4) {
00487       // aligned load of four selection flags
00488       __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
00489 
00490       // count selected atoms
00491       sel += hadd_m128i(on4);
00492     }
00493 #endif
00494 
00495     // check the very end of the array (non-divisible by four)
00496     for (; i<=last; i++) {
00497       sel += on[i];
00498     }
00499 #elif 1 && (defined(VMDUSEVSX) && defined(__VSX__))
00500     // VSX vectorized search loop
00501     // Roll up to next 16-byte boundary
00502     for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
00503       sel += on[i];
00504     }
00505 
00506     // Process groups of 4 flags at a time
00507     vector signed int cnt4 = vec_splat_s32(0);
00508     for (; i<=(last-3); i+=4) {
00509       // aligned load of four selection flags
00510       vector signed int on4 = *((__vector signed int *) &on[i]);
00511 
00512       // count selected atoms
00513       cnt4 = vec_add(cnt4, on4);
00514     }
00515     sel += vec_extract(cnt4, 0) + vec_extract(cnt4, 1) + 
00516            vec_extract(cnt4, 2) + vec_extract(cnt4, 3);
00517 
00518     // check the very end of the array (non-divisible by four)
00519     for (; i<=last; i++) {
00520       sel += on[i];
00521     }
00522 #else
00523     // plain C...
00524     for (i=first; i<=last; i++) {
00525       sel += on[i];
00526     }
00527 #endif
00528   }
00529 
00530   if (selected != NULL)
00531     *selected = sel; 
00532 
00533   if (firstsel != NULL)
00534     *firstsel = first;
00535 
00536   if (lastsel != NULL)
00537     *lastsel = last;
00538 
00539   return 0;
00540 }
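// A hypothetical caller sketch (not from the original source) showing how the
// selection analysis routine above might be used; the flag array is assumed
// to contain only 0/1 values, as the counting loops require.
#if 0
static void example_selection_summary(int natoms, const int *flags) {
  int firstsel, lastsel, selected;
  if (analyze_selection_aligned(natoms, flags, &firstsel, &lastsel, &selected)) {
    printf("no atoms selected\n");
  } else {
    printf("%d atoms selected, first index %d, last index %d\n",
           selected, firstsel, lastsel);
  }
}
#endif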
00541 
00542 
00543 // Find the first selected atom, the last selected atom,
00544 // and the total number of selected atoms.
00545 int analyze_selection_aligned_dispatch(wkf_cpu_caps_t *cpucaps,
00546                                        int n, const int *on, 
00547                                        int *firstsel, int *lastsel, 
00548                                        int *selected) {
00549   int sel   = 0;   // set count to zero in case of early-exit
00550   int first = 0;   // if we early-exit, firstsel is 0 
00551   int last  = -1;  // and lastsel is -1
00552   int i;
00553 
00554 #if defined(VMDCPUDISPATCH)
00555   if (cpucaps != NULL) {
00556 //printf("analyze_selection_aligned_dispatch()\n");
00557 
00558     // Intel x86
00559 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00560     if (cpucaps->flags & CPU_AVX2) {
00561 //printf("analyze_selection_aligned_avx2()\n");
00562       return analyze_selection_aligned_avx2(n, on, firstsel, lastsel, selected);
00563     }
00564 
00565     if (cpucaps->flags & CPU_AVX) {
00566 //printf("analyze_selection_aligned_avx()\n");
00567       return analyze_selection_aligned_avx(n, on, firstsel, lastsel, selected);
00568     }
00569 #endif
00570 
00571   }
00572 #endif
00573 
00574   // assign potential early-exit outcomes
00575   if (selected != NULL)
00576     *selected = sel; 
00577 
00578   if (firstsel != NULL)
00579     *firstsel = first;
00580 
00581   if (lastsel != NULL)
00582     *lastsel = last;
00583 
00584   // find the first selected atom, if any
00585   if ((firstsel != NULL) || (selected != NULL)) {
00586     if (find_first_selection_aligned(n, on, &first)) {
00587       return -1; // indicate that no selection was found
00588     }
00589   }
00590 
00591   // find the last selected atom, if any
00592   if ((lastsel != NULL) || (selected != NULL)) {
00593     if (find_last_selection_aligned(n, on, &last)) {
00594       return -1; // indicate that no selection was found
00595     }
00596   }
00597 
00598   // count the number of selected atoms (there are only 0s and 1s)
00599   // and determine the index of the last selected atom
00600   if (selected != NULL) {
00601     // XXX the Intel 12.x compiler is able to beat this code in some
00602     //     cases, but GCC 4.x cannot, so for Intel C/C++ we use the plain C 
00603     //     loop and let it autovectorize, but for GCC we do it by hand.
00604 #if !defined(__INTEL_COMPILER) && defined(VMDUSESSE) && defined(__SSE2__)
00605     // SSE vectorized search loop
00606     // Roll up to next 16-byte boundary
00607     for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
00608       sel += on[i];
00609     }
00610 
00611     // Process groups of 4 flags at a time
00612     __m128i sum4 = _mm_setzero_si128();
00613     for (; i<=(last-3); i+=4) {
00614       // aligned load of four selection flags
00615       __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
00616 
00617       // sum selected atoms vertically
00618       sum4 = _mm_add_epi32(sum4, on4);
00619     }
00620     sel += hadd_m128i(sum4); // sum horizontally to finalize count
00621 
00622     // check the very end of the array (non-divisible by four)
00623     for (; i<=last; i++) {
00624       sel += on[i];
00625     }
00626 #elif 1 && (defined(VMDUSEVSX) && defined(__VSX__))
00627     // VSX vectorized search loop
00628     // Roll up to next 16-byte boundary
00629     for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
00630       sel += on[i];
00631     }
00632 
00633     // Process groups of 4 flags at a time
00634     vector signed int cnt4 = vec_splat_s32(0);
00635     for (; i<=(last-3); i+=4) {
00636       // aligned load of four selection flags
00637       vector signed int on4 = *((__vector signed int *) &on[i]);
00638 
00639       // count selected atoms
00640       cnt4 = vec_add(cnt4, on4);
00641     }
00642     sel += vec_extract(cnt4, 0) + vec_extract(cnt4, 1) + 
00643            vec_extract(cnt4, 2) + vec_extract(cnt4, 3);
00644 
00645     // check the very end of the array (non-divisible by four)
00646     for (; i<=last; i++) {
00647       sel += on[i];
00648     }
00649 #else
00650     // plain C...
00651     for (i=first; i<=last; i++) {
00652       sel += on[i];
00653     }
00654 #endif
00655   }
00656 
00657   if (selected != NULL)
00658     *selected = sel; 
00659 
00660   if (firstsel != NULL)
00661     *firstsel = first;
00662 
00663   if (lastsel != NULL)
00664     *lastsel = last;
00665 
00666   return 0;
00667 }
00668 
00669 
00670 // Compute min/max/mean values for a 16-byte-aligned array of floats
00671 void minmaxmean_1fv_aligned(const float *f, ptrdiff_t n, 
00672                             float *fmin, float *fmax, float *fmean) {
00673   if (n < 1) {
00674     *fmin = 0.0f;
00675     *fmax = 0.0f;
00676     *fmean = 0.0f;
00677     return;
00678   }
00679 
00680 #if defined(VMDUSEAVX) && defined(__AVX__)
00681   ptrdiff_t i=0;
00682   float min1 = f[0];
00683   float max1 = f[0];
00684   double mean1 = 0.0;  // the loops below accumulate every element, including f[0]
00685   
00686   // roll up to the first 32-byte-aligned array index
00687   for (i=0; ((i<n) && !is_32byte_aligned(&f[i])); i++) {
00688     if (f[i] < min1) min1 = f[i];
00689     if (f[i] > max1) max1 = f[i];
00690     mean1 += f[i];
00691   }
00692 
00693   // AVX vectorized min/max loop
00694   __m256 min8 = _mm256_set1_ps(min1);
00695   __m256 max8 = _mm256_set1_ps(max1);
00696   __m256d mean4d = _mm256_set1_pd(0.0);
00697 
00698   // do groups of 64 elements
00699   for (; i<(n-63); i+=64) {
00700     __m256 f8 = _mm256_load_ps(&f[i]); // assume 32-byte aligned array!
00701     min8 = _mm256_min_ps(min8, f8);
00702     max8 = _mm256_max_ps(max8, f8);
00703     __m256 mean8 = f8;
00704     f8 = _mm256_load_ps(&f[i+8]); // assume 32-byte aligned array!
00705     min8 = _mm256_min_ps(min8, f8);
00706     max8 = _mm256_max_ps(max8, f8);
00707     mean8 = _mm256_add_ps(mean8, f8);
00708     f8 = _mm256_load_ps(&f[i+16]); // assume 32-byte aligned array!
00709     min8 = _mm256_min_ps(min8, f8);
00710     max8 = _mm256_max_ps(max8, f8);
00711     mean8 = _mm256_add_ps(mean8, f8);
00712     f8 = _mm256_load_ps(&f[i+24]); // assume 32-byte aligned array!
00713     min8 = _mm256_min_ps(min8, f8);
00714     max8 = _mm256_max_ps(max8, f8);
00715     mean8 = _mm256_add_ps(mean8, f8);
00716 
00717     f8 = _mm256_load_ps(&f[i+32]); // assume 32-byte aligned array!
00718     min8 = _mm256_min_ps(min8, f8);
00719     max8 = _mm256_max_ps(max8, f8);
00720     mean8 = _mm256_add_ps(mean8, f8);
00721     f8 = _mm256_load_ps(&f[i+40]); // assume 32-byte aligned array!
00722     min8 = _mm256_min_ps(min8, f8);
00723     max8 = _mm256_max_ps(max8, f8);
00724     mean8 = _mm256_add_ps(mean8, f8);
00725     f8 = _mm256_load_ps(&f[i+48]); // assume 32-byte aligned array!
00726     min8 = _mm256_min_ps(min8, f8);
00727     max8 = _mm256_max_ps(max8, f8);
00728     mean8 = _mm256_add_ps(mean8, f8);
00729     f8 = _mm256_load_ps(&f[i+56]); // assume 32-byte aligned array!
00730     min8 = _mm256_min_ps(min8, f8);
00731     max8 = _mm256_max_ps(max8, f8);
00732     mean8 = _mm256_add_ps(mean8, f8);
00733 
00734     // sum mean8 into double-precision accumulator mean4d,
00735     // converting from single to double-precision, and 
00736     // appropriately shuffling the high and low subwords
00737     // so all eight elements are accumulated
00738     mean4d = _mm256_add_pd(mean4d, _mm256_cvtps_pd(_mm256_extractf128_ps(mean8, 0)));
00739     mean4d = _mm256_add_pd(mean4d, _mm256_cvtps_pd(_mm256_extractf128_ps(mean8, 1)));
00740   }
00741 
00742   // sum the mean4d against itself so all elements have the total,
00743   // write out the lower element, and sum it into mean1
00744   __m256d pairs4d = _mm256_hadd_pd(mean4d, mean4d);
00745   mean4d = _mm256_add_pd(pairs4d, 
00746                          _mm256_permute2f128_pd(pairs4d, pairs4d, 0x01));
00747 #if defined(__AVX2__)
00748   mean1 +=  _mm256_cvtsd_f64(mean4d); 
00749 #else
00750   double tmp;
00751   _mm_storel_pd(&tmp, _mm256_castpd256_pd128(mean4d));
00752   mean1 += tmp;
00753 #endif
00754 
00755   // finish last elements off
00756   for (; i<n; i++) {
00757     __m256 f8 = _mm256_set1_ps(f[i]);
00758     min8 = _mm256_min_ps(min8, f8);
00759     max8 = _mm256_max_ps(max8, f8);
00760     mean1 += f[i];
00761   }
00762 
00763   // compute min/max among the final 4-element vectors by shuffling
00764   // and reducing the elements within the vectors
00765   float t0, t1;
00766   t0 = fmin_m128(_mm256_extractf128_ps(min8, 0));
00767   t1 = fmin_m128(_mm256_extractf128_ps(min8, 1));
00768   *fmin = (t0 < t1) ? t0 : t1;
00769 
00770   t0 = fmax_m128(_mm256_extractf128_ps(max8, 0));
00771   t1 = fmax_m128(_mm256_extractf128_ps(max8, 1));
00772   *fmax = (t0 > t1) ? t0 : t1;
00773   *fmean = mean1 / n; 
00774 #elif defined(VMDUSESSE) && defined(__SSE2__) && (defined(__GNUC__) || defined(__INTEL_COMPILER))
00775   ptrdiff_t i=0;
00776   float min1 = f[0];
00777   float max1 = f[0];
00778   double mean1 = 0.0;  // the loops below accumulate every element, including f[0]
00779   
00780   // roll up to the first 16-byte-aligned array index
00781   for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
00782     if (f[i] < min1) min1 = f[i];
00783     if (f[i] > max1) max1 = f[i];
00784     mean1 += f[i];
00785   }
00786 
00787   // SSE vectorized min/max loop
00788   __m128 min4 = _mm_set_ps1(min1);
00789   __m128 max4 = _mm_set_ps1(max1);
00790 #if defined(__clang__) && (__clang_major__ < 5)
00791   // Clang bugs checked with compiler explorer:  http://godbolt.org/
00792   __m128d mean2d = _mm_cvtps_pd(_mm_set_ps1(0.0f)); // XXX Clang workaround
00793 #else
00794   //  __m128d mean2d = _mm_set_pd1(0.0); // XXX Clang v < 5.x misses this
00795   __m128d mean2d = _mm_set1_pd(0.0); // XXX Clang v < 5.x misses this
00796 #endif
00797 
00798   // do groups of 32 elements
00799   for (; i<(n-31); i+=32) {
00800     __m128 f4 = _mm_load_ps(&f[i]); // assume 16-byte aligned array!
00801     min4 = _mm_min_ps(min4, f4);
00802     max4 = _mm_max_ps(max4, f4);
00803     __m128 mean4 = f4;
00804     f4 = _mm_load_ps(&f[i+4]); // assume 16-byte aligned array!
00805     min4 = _mm_min_ps(min4, f4);
00806     max4 = _mm_max_ps(max4, f4);
00807     mean4 = _mm_add_ps(mean4, f4);
00808     f4 = _mm_load_ps(&f[i+8]); // assume 16-byte aligned array!
00809     min4 = _mm_min_ps(min4, f4);
00810     max4 = _mm_max_ps(max4, f4);
00811     mean4 = _mm_add_ps(mean4, f4);
00812     f4 = _mm_load_ps(&f[i+12]); // assume 16-byte aligned array!
00813     min4 = _mm_min_ps(min4, f4);
00814     max4 = _mm_max_ps(max4, f4);
00815     mean4 = _mm_add_ps(mean4, f4);
00816 
00817     f4 = _mm_load_ps(&f[i+16]); // assume 16-byte aligned array!
00818     min4 = _mm_min_ps(min4, f4);
00819     max4 = _mm_max_ps(max4, f4);
00820     mean4 = _mm_add_ps(mean4, f4);
00821     f4 = _mm_load_ps(&f[i+20]); // assume 16-byte aligned array!
00822     min4 = _mm_min_ps(min4, f4);
00823     max4 = _mm_max_ps(max4, f4);
00824     mean4 = _mm_add_ps(mean4, f4);
00825     f4 = _mm_load_ps(&f[i+24]); // assume 16-byte aligned array!
00826     min4 = _mm_min_ps(min4, f4);
00827     max4 = _mm_max_ps(max4, f4);
00828     mean4 = _mm_add_ps(mean4, f4);
00829     f4 = _mm_load_ps(&f[i+28]); // assume 16-byte aligned array!
00830     min4 = _mm_min_ps(min4, f4);
00831     max4 = _mm_max_ps(max4, f4);
00832     mean4 = _mm_add_ps(mean4, f4);
00833 
00834     // sum mean4 into double-precision accumulator mean2d,
00835     // converting from single to double-precision, and 
00836     // appropriately shuffling the high and low subwords
00837     // so all four elements are accumulated
00838     mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(mean4));
00839     mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(_mm_shuffle_ps(mean4, mean4, _MM_SHUFFLE(3, 2, 3, 2))));
00840   }
00841 
00842   // do groups of 4 elements
00843   for (; i<(n-3); i+=4) {
00844     __m128 f4 = _mm_load_ps(&f[i]); // assume 16-byte aligned array!
00845     min4 = _mm_min_ps(min4, f4);
00846     max4 = _mm_max_ps(max4, f4);
00847 
00848     // sum f4 into double-precision accumulator mean2d,
00849     // converting from single to double-precision, and 
00850     // appropriately shuffling the high and low subwords
00851     // so all four elements are accumulated
00852     mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(f4));
00853     mean2d = _mm_add_pd(mean2d, _mm_cvtps_pd(_mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3, 2, 3, 2))));
00854   }
00855 
00856   // sum the mean2d against itself so both elements have the total,
00857   // write out the lower element, and sum it into mean1
00858   mean2d = _mm_add_pd(mean2d, _mm_shuffle_pd(mean2d, mean2d, 1));
00859   double tmp;
00860   _mm_storel_pd(&tmp, mean2d);
00861   mean1 += tmp;
00862 
00863   // finish last elements off
00864   for (; i<n; i++) {
00865     __m128 f4 = _mm_set_ps1(f[i]);
00866     min4 = _mm_min_ps(min4, f4);
00867     max4 = _mm_max_ps(max4, f4);
00868     mean1 += f[i];
00869   }
00870 
00871   // compute min/max among the final 4-element vectors by shuffling
00872   // and reducing the elements within the vectors
00873   *fmin = fmin_m128(min4);
00874   *fmax = fmax_m128(max4);
00875   *fmean = mean1 / n; 
00876 #else
00877   // scalar min/max/mean loop
00878   float min1 = f[0];
00879   float max1 = f[0];
00880   double mean1 = f[0];
00881   for (ptrdiff_t i=1; i<n; i++) {
00882     if (f[i] < min1) min1 = f[i];
00883     if (f[i] > max1) max1 = f[i];
00884     mean1 += f[i];
00885   }
00886   *fmin = min1;
00887   *fmax = max1;
00888   *fmean = float(mean1 / n); 
00889 #endif
00890 }
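// The vector loops above accumulate the running sum in double precision
// (mean2d/mean4d) because summing a long float array in a float accumulator
// loses low-order bits.  A scalar sketch of the same idea, shown only for
// illustration and not part of the original source:
#if 0
static double mean_1fv_scalar(const float *f, ptrdiff_t n) {
  double sum = 0.0;               // double-precision accumulator
  for (ptrdiff_t i=0; i<n; i++)
    sum += f[i];                  // each float is promoted before the add
  return sum / (double) n;
}
#endif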
00891 
00892 
00893 // Compute min/max values for a 16-byte-aligned array of floats
00894 void minmax_1fv_aligned(const float *f, ptrdiff_t n, float *fmin, float *fmax) {
00895   if (n < 1)
00896     return;
00897 
00898 #if defined(VMDUSESSE) && defined(__SSE2__)
00899   ptrdiff_t i=0;
00900   float min1 = f[0];
00901   float max1 = f[0];
00902 
00903   // roll up to the first 16-byte-aligned array index
00904   for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
00905     if (f[i] < min1) min1 = f[i];
00906     if (f[i] > max1) max1 = f[i];
00907   }
00908 
00909   // SSE vectorized min/max loop
00910   __m128 min4 = _mm_set_ps1(min1);
00911   __m128 max4 = _mm_set_ps1(max1);
00912 
00913   // do groups of 32 elements
00914   for (; i<(n-31); i+=32) {
00915     __m128 f4 = _mm_load_ps(&f[i]); // assume 16-byte aligned array!
00916     min4 = _mm_min_ps(min4, f4);
00917     max4 = _mm_max_ps(max4, f4);
00918     f4 = _mm_load_ps(&f[i+4]); // assume 16-byte aligned array!
00919     min4 = _mm_min_ps(min4, f4);
00920     max4 = _mm_max_ps(max4, f4);
00921     f4 = _mm_load_ps(&f[i+8]); // assume 16-byte aligned array!
00922     min4 = _mm_min_ps(min4, f4);
00923     max4 = _mm_max_ps(max4, f4);
00924     f4 = _mm_load_ps(&f[i+12]); // assume 16-byte aligned array!
00925     min4 = _mm_min_ps(min4, f4);
00926     max4 = _mm_max_ps(max4, f4);
00927 
00928     f4 = _mm_load_ps(&f[i+16]); // assume 16-byte aligned array!
00929     min4 = _mm_min_ps(min4, f4);
00930     max4 = _mm_max_ps(max4, f4);
00931     f4 = _mm_load_ps(&f[i+20]); // assume 16-byte aligned array!
00932     min4 = _mm_min_ps(min4, f4);
00933     max4 = _mm_max_ps(max4, f4);
00934     f4 = _mm_load_ps(&f[i+24]); // assume 16-byte aligned array!
00935     min4 = _mm_min_ps(min4, f4);
00936     max4 = _mm_max_ps(max4, f4);
00937     f4 = _mm_load_ps(&f[i+28]); // assume 16-byte aligned array!
00938     min4 = _mm_min_ps(min4, f4);
00939     max4 = _mm_max_ps(max4, f4);
00940   }
00941 
00942   // do groups of 4 elements
00943   for (; i<(n-3); i+=4) {
00944     __m128 f4 = _mm_load_ps(&f[i]); // assume 16-byte aligned array!
00945     min4 = _mm_min_ps(min4, f4);
00946     max4 = _mm_max_ps(max4, f4);
00947   }
00948 
00949   // finish last elements off
00950   for (; i<n; i++) {
00951     __m128 f4 = _mm_set_ps1(f[i]);
00952     min4 = _mm_min_ps(min4, f4);
00953     max4 = _mm_max_ps(max4, f4);
00954   }
00955 
00956   // compute min/max among the final 4-element vectors by shuffling
00957   // and reducing the elements within the vectors
00958   *fmin = fmin_m128(min4);
00959   *fmax = fmax_m128(max4);
00960 #elif defined(VMDUSEVSX) && defined(__VSX__)
00961   ptrdiff_t i=0;
00962   float min1 = f[0];
00963   float max1 = f[0];
00964 
00965   // roll up to the first 16-byte-aligned array index
00966   for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
00967     if (f[i] < min1) min1 = f[i];
00968     if (f[i] > max1) max1 = f[i];
00969   }
00970 
00971   // VSX vectorized min/max loop
00972   vector float min4 = vec_splats(min1);
00973   vector float max4 = vec_splats(max1);
00974 
00975   // do groups of 4 elements
00976   for (; i<(n-3); i+=4) {
00977     vector float f4 = *((vector float *) &f[i]); // assume 16-byte aligned array!
00978     min4 = vec_min(min4, f4);
00979     max4 = vec_max(max4, f4);
00980   }
00981 
00982   // finish last elements off
00983   for (; i<n; i++) {
00984     vector float f4 = vec_splats(f[i]);
00985     min4 = vec_min(min4, f4);
00986     max4 = vec_max(max4, f4);
00987   }
00988 
00989   // compute min/max among the final 4-element vectors by shuffling
00990   // and reducing the elements within the vectors
00991   min1 = min4[0];
00992   min1 = (min1 < min4[1]) ? min1 : min4[1];
00993   min1 = (min1 < min4[2]) ? min1 : min4[2];
00994   min1 = (min1 < min4[3]) ? min1 : min4[3];
00995 
00996   max1 = max4[0];
00997   max1 = (max1 > max4[1]) ? max1 : max4[1];
00998   max1 = (max1 > max4[2]) ? max1 : max4[2];
00999   max1 = (max1 > max4[3]) ? max1 : max4[3];
01000 
01001   *fmin = min1;
01002   *fmax = max1;
01003 #elif defined(VMDUSENEON) && defined(__ARM_NEON__)
01004   ptrdiff_t i=0;
01005   float min1 = f[0];
01006   float max1 = f[0];
01007 
01008   // roll up to the first 16-byte-aligned array index
01009   for (i=0; ((i<n) && !is_16byte_aligned(&f[i])); i++) {
01010     if (f[i] < min1) min1 = f[i];
01011     if (f[i] > max1) max1 = f[i];
01012   }
01013 
01014   // NEON vectorized min/max loop
01015   float32x4_t min4 = vdupq_n_f32(min1);
01016   float32x4_t max4 = vdupq_n_f32(max1);
01017 
01018   // do groups of 32 elements
01019   for (; i<(n-31); i+=32) {
01020     float32x4_t f4;
01021     f4 = vld1q_f32(&f[i   ]); // assume 16-byte aligned array!
01022     min4 = vminq_f32(min4, f4);
01023     max4 = vmaxq_f32(max4, f4);
01024     f4 = vld1q_f32(&f[i+ 4]); // assume 16-byte aligned array!
01025     min4 = vminq_f32(min4, f4);
01026     max4 = vmaxq_f32(max4, f4);
01027     f4 = vld1q_f32(&f[i+ 8]); // assume 16-byte aligned array!
01028     min4 = vminq_f32(min4, f4);
01029     max4 = vmaxq_f32(max4, f4);
01030     f4 = vld1q_f32(&f[i+12]); // assume 16-byte aligned array!
01031     min4 = vminq_f32(min4, f4);
01032     max4 = vmaxq_f32(max4, f4);
01033 
01034     f4 = vld1q_f32(&f[i+16]); // assume 16-byte aligned array!
01035     min4 = vminq_f32(min4, f4);
01036     max4 = vmaxq_f32(max4, f4);
01037     f4 = vld1q_f32(&f[i+20]); // assume 16-byte aligned array!
01038     min4 = vminq_f32(min4, f4);
01039     max4 = vmaxq_f32(max4, f4);
01040     f4 = vld1q_f32(&f[i+24]); // assume 16-byte aligned array!
01041     min4 = vminq_f32(min4, f4);
01042     max4 = vmaxq_f32(max4, f4);
01043     f4 = vld1q_f32(&f[i+28]); // assume 16-byte aligned array!
01044     min4 = vminq_f32(min4, f4);
01045     max4 = vmaxq_f32(max4, f4);
01046   }
01047 
01048   // do groups of 4 elements
01049   for (; i<(n-3); i+=4) {
01050     float32x4_t f4 = vld1q_f32(&f[i]); // assume 16-byte aligned array!
01051     min4 = vminq_f32(min4, f4);
01052     max4 = vmaxq_f32(max4, f4);
01053   }
01054 
01055   // finish last elements off
01056   for (; i<n; i++) {
01057     float32x4_t f4 = vdupq_n_f32(f[i]);
01058     min4 = vminq_f32(min4, f4);
01059     max4 = vmaxq_f32(max4, f4);
01060   }
01061 
01062   // compute min/max among the final 4-element vectors by shuffling
01063   // and reducing the elements within the vectors
01064   *fmin = fmin_f32x4(min4);
01065   *fmax = fmax_f32x4(max4);
01066 #else
01067   // scalar min/max loop
01068   float min1 = f[0];
01069   float max1 = f[0];
01070   for (ptrdiff_t i=1; i<n; i++) {
01071     if (f[i] < min1) min1 = f[i];
01072     if (f[i] > max1) max1 = f[i];
01073   }
01074   *fmin = min1;
01075   *fmax = max1;
01076 #endif
01077 }
01078 
01079 
01080 // Compute min/max values for a 16-byte-aligned array of float3s
01081 // input value n3 is the number of 3-element vectors to process
01082 void minmax_3fv_aligned(const float *f, const ptrdiff_t n3, 
01083                         float *fmin, float *fmax) {
01084   float minx, maxx, miny, maxy, minz, maxz;
01085   const ptrdiff_t end = n3*3L;
01086 
01087   if (n3 < 1)
01088     return;
01089 
01090   ptrdiff_t i=0;
01091   minx=maxx=f[i  ];
01092   miny=maxy=f[i+1];
01093   minz=maxz=f[i+2];
01094 
01095 #if defined(VMDUSESSE) && defined(__SSE2__)
01096   // Since we may not be on a 16-byte boundary when we start, we roll 
01097   // through the first few items with plain C until we get to one.
01098   for (; i<end; i+=3L) {
01099     // exit if/when we reach a 16-byte boundary for both arrays
01100     if (is_16byte_aligned(&f[i])) {
01101       break;
01102     }
01103 
01104     float tmpx = f[i  ];
01105     if (tmpx < minx) minx = tmpx;
01106     if (tmpx > maxx) maxx = tmpx;
01107 
01108     float tmpy = f[i+1];
01109     if (tmpy < miny) miny = tmpy;
01110     if (tmpy > maxy) maxy = tmpy;
01111 
01112     float tmpz = f[i+2];
01113     if (tmpz < minz) minz = tmpz;
01114     if (tmpz > maxz) maxz = tmpz;
01115   }
01116 
01117   // initialize min/max values
01118   __m128 xmin4 = _mm_set_ps1(minx);
01119   __m128 xmax4 = _mm_set_ps1(maxx);
01120   __m128 ymin4 = _mm_set_ps1(miny);
01121   __m128 ymax4 = _mm_set_ps1(maxy);
01122   __m128 zmin4 = _mm_set_ps1(minz);
01123   __m128 zmax4 = _mm_set_ps1(maxz);
01124 
01125   for (; i<(end-11); i+=12) {
01126     // aligned load of four consecutive 3-element vectors into
01127     // three 4-element vectors
01128     __m128 x0y0z0x1 = _mm_load_ps(&f[i  ]);
01129     __m128 y1z1x2y2 = _mm_load_ps(&f[i+4]);
01130     __m128 z2x3y3z3 = _mm_load_ps(&f[i+8]);
01131 
01132     // convert xyz 3-float AOS data to 4-element SOA vectors using shuffle instructions
01133     __m128 x2y2x3y3 = _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
01134     __m128 y0z0y1z1 = _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
01135     __m128 x        = _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0)); // x0x1x2x3
01136     __m128 y        = _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0)); // y0y1y2y3
01137     __m128 z        = _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1)); // z0z1z2z3
01138 
01139     // compute mins and maxes
01140     xmin4 = _mm_min_ps(xmin4, x);
01141     xmax4 = _mm_max_ps(xmax4, x);
01142     ymin4 = _mm_min_ps(ymin4, y);
01143     ymax4 = _mm_max_ps(ymax4, y);
01144     zmin4 = _mm_min_ps(zmin4, z);
01145     zmax4 = _mm_max_ps(zmax4, z);
01146   }
01147 
01148   minx = fmin_m128(xmin4);
01149   miny = fmin_m128(ymin4);
01150   minz = fmin_m128(zmin4);
01151 
01152   maxx = fmax_m128(xmax4);
01153   maxy = fmax_m128(ymax4);
01154   maxz = fmax_m128(zmax4);
01155 #endif
01156 
01157   // regular C code... 
01158   for (; i<end; i+=3) {
01159     float tmpx = f[i  ];
01160     if (tmpx < minx) minx = tmpx;
01161     if (tmpx > maxx) maxx = tmpx;
01162 
01163     float tmpy = f[i+1];
01164     if (tmpy < miny) miny = tmpy;
01165     if (tmpy > maxy) maxy = tmpy;
01166 
01167     float tmpz = f[i+2];
01168     if (tmpz < minz) minz = tmpz;
01169     if (tmpz > maxz) maxz = tmpz;
01170   }
01171 
01172   fmin[0] = minx;
01173   fmax[0] = maxx;
01174   fmin[1] = miny;
01175   fmax[1] = maxy;
01176   fmin[2] = minz;
01177   fmax[2] = maxz;
01178 }
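// A hypothetical usage sketch (not from the original source): computing an
// axis-aligned bounding box from an xyz coordinate array with
// minmax_3fv_aligned(); fmin and fmax each receive three floats (x, y, z).
#if 0
static void example_bounding_box(const float *coords, ptrdiff_t natoms) {
  float bbmin[3], bbmax[3];
  minmax_3fv_aligned(coords, natoms, bbmin, bbmax);
  printf("bbox min: %g %g %g  max: %g %g %g\n",
         bbmin[0], bbmin[1], bbmin[2], bbmax[0], bbmax[1], bbmax[2]);
}
#endif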
01179 
01180 
01181 // Compute min/max values for the selected subset of a 16-byte-aligned
01182 // array of float3s; n3 is the number of 3-element vectors to process
01183 int minmax_selected_3fv_aligned(const float *f, const int *on, 
01184                                 const ptrdiff_t n3, 
01185                                 const ptrdiff_t firstsel, 
01186                                 const ptrdiff_t lastsel,
01187                                 float *fmin, float *fmax) {
01188   float minx, maxx, miny, maxy, minz, maxz;
01189 
01190   if ((n3 < 1) || (firstsel < 0) || (lastsel < firstsel) || (lastsel >= n3))
01191     return -1;
01192 
01193   // start at first selected atom
01194   ptrdiff_t i=firstsel;
01195   minx=maxx=f[i*3L  ];
01196   miny=maxy=f[i*3L+1];
01197   minz=maxz=f[i*3L+2];
01198 
01199   ptrdiff_t end=lastsel+1;
01200 
01201 // printf("Starting array alignment: on[%d]: %p f[%d]: %p\n",
01202 //        i, &on[i], i*3L, &f[i*3L]);
01203 
01204 #if defined(VMDUSESSE) && defined(__SSE2__)
01205   // since we may not be on a 16-byte boundary when we start, we roll 
01206   // through the first few items with plain C until we get to one.
01207   for (; i<end; i++) {
01208     ptrdiff_t ind3 = i * 3L;
01209 
01210 #if 1
01211     // exit if/when we reach a 16-byte boundary for the coordinate array only;
01212     // for now we do unaligned loads of the on array, since there are cases
01213     // where we get differently unaligned input arrays and they will never 
01214     // line up at a 16-byte boundary at the same time
01215     if (is_16byte_aligned(&f[ind3])) {
01216       break;
01217     }
01218 #else
01219     // exit if/when we reach a 16-byte boundary for both arrays
01220     if (is_16byte_aligned(&on[i]) && is_16byte_aligned(&f[ind3])) {
01221 // printf("Found alignment boundary: on[%d]: %p f[%d]: %p\n",
01222 //        i, &on[i], ind3, &f[ind3]);
01223       break;
01224     }
01225 #endif
01226 
01227     if (on[i]) {
01228       float tmpx = f[ind3  ];
01229       if (tmpx < minx) minx = tmpx;
01230       if (tmpx > maxx) maxx = tmpx;
01231 
01232       float tmpy = f[ind3+1];
01233       if (tmpy < miny) miny = tmpy;
01234       if (tmpy > maxy) maxy = tmpy;
01235 
01236       float tmpz = f[ind3+2];
01237       if (tmpz < minz) minz = tmpz;
01238       if (tmpz > maxz) maxz = tmpz;
01239     }
01240   }
01241 
01242   // initialize min/max values to results from scalar loop above
01243   __m128 xmin4 = _mm_set_ps1(minx);
01244   __m128 xmax4 = _mm_set_ps1(maxx);
01245   __m128 ymin4 = _mm_set_ps1(miny);
01246   __m128 ymax4 = _mm_set_ps1(maxy);
01247   __m128 zmin4 = _mm_set_ps1(minz);
01248   __m128 zmax4 = _mm_set_ps1(maxz);
01249 
01250   for (; i<(end-3); i+=4) {
01251 #if 1
01252     // XXX unaligned load of four selection flags, since there are cases
01253     //     where the input arrays can't achieve alignment simultaneously
01254     __m128i on4 = _mm_loadu_si128((__m128i*) &on[i]);
01255 #else
01256     // aligned load of four selection flags
01257     __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
01258 #endif
01259 
01260     // compute atom selection mask
01261     __m128i mask = _mm_cmpeq_epi32(_mm_set1_epi32(1), on4);
01262     if (!hor_m128i(mask))
01263       continue; // no atoms selected
01264 
01265     // aligned load of four consecutive 3-element vectors into
01266     // three 4-element vectors
01267     ptrdiff_t ind3 = i * 3L;
01268     __m128 x0y0z0x1 = _mm_load_ps(&f[ind3+0]);
01269     __m128 y1z1x2y2 = _mm_load_ps(&f[ind3+4]);
01270     __m128 z2x3y3z3 = _mm_load_ps(&f[ind3+8]);
01271 
01272     // convert xyz 3-float AOS data to 4-element SOA vectors using shuffle instructions
01273     __m128 x2y2x3y3 = _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
01274     __m128 y0z0y1z1 = _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
01275     __m128 x        = _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0)); // x0x1x2x3
01276     __m128 y        = _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0)); // y0y1y2y3
01277     __m128 z        = _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1)); // z0z1z2z3
01278 
01279     // compute mins and maxes
01280     xmin4 = _mm_sel_ps(xmin4, _mm_min_ps(xmin4, x), _mm_castsi128_ps(mask));
01281     xmax4 = _mm_sel_ps(xmax4, _mm_max_ps(xmax4, x), _mm_castsi128_ps(mask)); 
01282     ymin4 = _mm_sel_ps(ymin4, _mm_min_ps(ymin4, y), _mm_castsi128_ps(mask)); 
01283     ymax4 = _mm_sel_ps(ymax4, _mm_max_ps(ymax4, y), _mm_castsi128_ps(mask)); 
01284     zmin4 = _mm_sel_ps(zmin4, _mm_min_ps(zmin4, z), _mm_castsi128_ps(mask)); 
01285     zmax4 = _mm_sel_ps(zmax4, _mm_max_ps(zmax4, z), _mm_castsi128_ps(mask)); 
01286   }
01287 
01288   minx = fmin_m128(xmin4);
01289   miny = fmin_m128(ymin4);
01290   minz = fmin_m128(zmin4);
01291 
01292   maxx = fmax_m128(xmax4);
01293   maxy = fmax_m128(ymax4);
01294   maxz = fmax_m128(zmax4);
01295 #endif
01296 
01297   // regular C code... 
01298   for (; i<end; i++) {
01299     if (on[i]) {
01300       ptrdiff_t ind3 = i * 3L;
01301       float tmpx = f[ind3  ];
01302       if (tmpx < minx) minx = tmpx;
01303       if (tmpx > maxx) maxx = tmpx;
01304 
01305       float tmpy = f[ind3+1];
01306       if (tmpy < miny) miny = tmpy;
01307       if (tmpy > maxy) maxy = tmpy;
01308 
01309       float tmpz = f[ind3+2];
01310       if (tmpz < minz) minz = tmpz;
01311       if (tmpz > maxz) maxz = tmpz;
01312     }
01313   }
01314 
01315   fmin[0] = minx;
01316   fmax[0] = maxx;
01317   fmin[1] = miny;
01318   fmax[1] = maxy;
01319   fmin[2] = minz;
01320   fmax[2] = maxz;
01321 
01322   return 0;
01323 }
01324 
01325 
