
util_simd_AVX.C

/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: util_simd_AVX.C,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.8 $        $Date: 2020/11/01 05:40:57 $
 *
 ***************************************************************************
 * DESCRIPTION:
 *
 * Hand-coded SIMD loops using compiler-provided intrinsics, or inline
 * assembly code, to generate highly optimized machine code for
 * time-critical loops that crop up in commonly used features of VMD.
 *
 ***************************************************************************/

#include "WKFThreads.h" // CPU capability flags

#if defined(__SSE2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSESSE 1     // enable SSE in combination with target macros
#endif

#if defined(__AVX__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX 1     // enable AVX with runtime dispatch
#endif

#if defined(VMDUSESSE) && defined(VMDUSEAVX)

#if defined(VMDUSESSE)
#include <emmintrin.h>
#endif
#if defined(VMDUSEAVX)
#include <immintrin.h>
#endif

// #include <string.h>
// #include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(_MSC_VER)
#include <windows.h>
#include <conio.h>
#else
#include <unistd.h>
#endif // _MSC_VER


#if 0
//
// XXX array init/copy routines that avoid polluting cache, where possible
//
// Fast 16-byte-aligned integer assignment loop for use in the
// VMD color scale routines
void set_1fv_aligned(int *iv, int n, const int val) {
  int i=0;

#if defined(VMDUSESSE)
  // broadcast the fill value to all four vector lanes
  __m128i val4 = _mm_set1_epi32(val);

  // do groups of four elements with non-temporal stores,
  // bypassing the cache as the comment above intends
  for (; i<(n-3); i+=4) {
    _mm_stream_si128((__m128i*) &iv[i], val4);
  }
#endif

  // finish any remaining elements with scalar stores
  for (; i<n; i++)
    iv[i] = val;
}
#endif


//
// Helper routine for use when coping with unaligned
// buffers returned by malloc() on many GNU systems:
//   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24261
//   http://www.sourceware.org/bugzilla/show_bug.cgi?id=206
//
// XXX until all compilers support uintptr_t, we have to do
//     dangerous and ugly things with pointer casting here...
//
#if defined(_WIN64) /* sizeof(size_t) == sizeof(void*) */
#define myintptrtype size_t
#elif 1 /* sizeof(unsigned long) == sizeof(void*) */
#define myintptrtype unsigned long
#else /* C99 */
#define myintptrtype uintptr_t
#endif


//
// Alignment test routines for vector instructions
//
static int test_alignment_Nbyte_powertwo(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~alignmask)));
}

#if 0
static int bytes_prev_alignment(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  myintptrtype t = (((myintptrtype) ptr) & alignmask);
  return t;
}

static int bytes_next_alignment(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  myintptrtype t = (alignsz - (((myintptrtype) ptr) & (alignmask))) & alignmask;
  return t;
}
#endif
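
// A minimal usage sketch (not part of the original source) illustrating
// the mask arithmetic above: for a 16-byte check, alignmask is 0xF, so
// the test passes exactly when the low four address bits are zero.
#if 0
static void alignment_example(void) {
  int buf[8];
  // whether each pointer passes depends on where the compiler placed buf
  printf("16-byte aligned: %d\n", test_alignment_Nbyte_powertwo(&buf[0], 16));
  printf("off by 4 bytes:  %d\n", test_alignment_Nbyte_powertwo(&buf[1], 16));
}
#endif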

//
// Small inlinable SSE helper routines to make code easier to read
//
#if defined(VMDUSESSE)

static int hadd_m128i(__m128i sum4) {
  __m128i tmp = sum4;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 3, 0, 1));
  tmp = _mm_add_epi32(sum4, tmp);
  sum4 = tmp;
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
  tmp = _mm_add_epi32(sum4, tmp);
  sum4 = tmp; // all 4 elements are now set to the sum

  int sum = _mm_cvtsi128_si32(sum4); // return zeroth element
  return sum;
}

#endif
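
// A minimal usage sketch for hadd_m128i() (not part of the original
// source): reduce four packed 32-bit lanes to a scalar sum.
#if 0
static void hadd_example(void) {
  __m128i v = _mm_set_epi32(4, 3, 2, 1); // lanes hold 1, 2, 3, 4
  printf("sum = %d\n", hadd_m128i(v));   // prints "sum = 10"
}
#endif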


//
// Small inlinable AVX helper routines to make code easier to read
//
#if 0 && defined(VMDUSEAVX)

static int hadd_m256i(__m256i sum8) {
  int tmp[sizeof(__m256i)/sizeof(int)];
  // use an unaligned store here: a plain stack array is not guaranteed
  // the 32-byte alignment that _mm256_store_si256() requires
  _mm256_storeu_si256((__m256i *)tmp, sum8);
  int sum = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
  return sum;
}

#endif

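// An alternative sketch (not in the original source): sum the two
// 128-bit halves with AVX cast/extract intrinsics and reuse the SSE
// helper above, avoiding the round trip through memory.
#if 0
static int hadd_m256i_alt(__m256i sum8) {
  __m128i lo = _mm256_castsi256_si128(sum8);      // low 128 bits
  __m128i hi = _mm256_extractf128_si256(sum8, 1); // high 128 bits
  return hadd_m128i(_mm_add_epi32(lo, hi));
}
#endif
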
#if defined(VMDUSEAVX)
// Find the first selected atom, the last selected atom,
// and the total number of selected atoms.
int analyze_selection_aligned_avx(int n, const int *on,
                                  int *firstsel, int *lastsel, int *selected) {
  int sel   = 0;   // set count to zero in case of early-exit
  int first = 0;   // if we early-exit, firstsel is 0
  int last  = -1;  // and lastsel is -1
  int i;

  // assign potential early-exit outcomes
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // find the first selected atom, if any
  if ((firstsel != NULL) || (selected != NULL)) {
    // roll up to the first 32-byte-aligned array index
#if 1
    for (i=0; ((i<n) && !test_alignment_Nbyte_powertwo(&on[i], 32)); i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }
#else
    // XXX this code needs more debugging
    int nextalign = bytes_next_alignment(&on[0], 32) / sizeof(int);
    int endalign = (n < nextalign) ? n : nextalign;
    for (i=0; i<endalign; i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }
#endif


    // AVX vectorized search loop
    for (; i<(n-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the first selected atom
    }

    for (; i<n; i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundfirstsel:

  // find the last selected atom, if any
  if ((lastsel != NULL) || (selected != NULL)) {
    // AVX vectorized search loop
    // Roll down to next 32-byte boundary
#if 1
    for (i=n-1; i>=0; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }

      // drop out of the alignment loop once we hit a 32-byte boundary
      if (test_alignment_Nbyte_powertwo(&on[i], 32))
        break;
    }
#else
    // XXX this code needs more debugging
    int prevalign = bytes_prev_alignment(&on[0], 32) / sizeof(int);
    int startalign = (0 > n-prevalign-1) ? 0 : (n-prevalign-1);
    for (i=n-1; i>=startalign; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }
    }
#endif

    for (i-=8; i>=0; i-=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the last selected atom
    }

    int last8=i;
    for (i=last8+7; i>=last8; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundlastsel:

  // count the number of selected atoms (the flags are only 0s and 1s)
  if (selected != NULL) {
    // XXX VEX encoded SSE
    // Roll up to next 16-byte boundary
#if 1
    for (i=first; ((i<=last) && (!test_alignment_Nbyte_powertwo(&on[i], 16))); i++) {
      sel += on[i];
    }
#else
    // XXX this code failed in a recent test on a 305M-atom virus
    //     case, so it needs careful revisiting before it can
    //     be safely enabled with recent versions of GCC 8.x at least.
    int nextalign = bytes_next_alignment(&on[0], 16) / sizeof(int);
    int endalign = (last < first+nextalign) ? last : (first+nextalign);
    for (i=first; i<=endalign; i++) {
      sel += on[i];
    }
#endif

    // process groups of 4 flags at a time
    __m128i sum4 = _mm_setzero_si128();
    for (; i<=(last-3); i+=4) {
      // aligned load of four selection flags
      __m128i on4 = _mm_load_si128((__m128i*) &on[i]);

      // sum selected atoms vertically
      sum4 = _mm_add_epi32(sum4, on4);
    }
    sel += hadd_m128i(sum4); // sum horizontally to finalize count

    // check the very end of the array (non-divisible by four)
    for (; i<=last; i++) {
      sel += on[i];
    }
  }

  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // prevent x86 AVX-SSE transition performance loss due to CPU state
  // transition penalties or false dependence on upper register state
  _mm256_zeroupper();
  return 0;
}
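
// A minimal usage sketch (not part of the original source): the aligned
// _mm256_load_si256() calls above require the flag array to be 32-byte
// aligned, e.g. from posix_memalign() on POSIX systems (on Windows,
// _aligned_malloc() would serve the same purpose).
#if 0
static void analyze_selection_example(void) {
  int n = 1024;
  int *on = NULL;
  if (posix_memalign((void **) &on, 32, n * sizeof(int)) != 0)
    return;

  int i;
  for (i=0; i<n; i++)
    on[i] = 0;
  on[5] = 1;
  on[900] = 1;

  int first, last, count;
  if (analyze_selection_aligned_avx(n, on, &first, &last, &count) == 0)
    printf("first=%d last=%d count=%d\n", first, last, count); // 5 900 2

  free(on);
}
#endif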
#endif

#endif // SSE+AVX
