
util_simd_AVX2.C

/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: util_simd_AVX2.C,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.1 $        $Date: 2020/11/01 05:00:11 $
 *
 ***************************************************************************
 * DESCRIPTION:
 *
 * Hand-coded SIMD loops using compiler-provided intrinsics or inline
 * assembly to generate highly optimized machine code for time-critical
 * loops that crop up in commonly used features of VMD.
 *
 ***************************************************************************/

#include "WKFThreads.h" // CPU capability flags

#if defined(__SSE2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSESSE 1     // enable SSE in combination with target macros
#endif

#if defined(__AVX__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX 1     // enable AVX with runtime dispatch
#endif

#if defined(__AVX2__) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#define VMDUSEAVX2 1    // enable AVX2 with runtime dispatch
#endif
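
// Illustrative sketch (not from the original file): these macros only gate
// compilation; a caller is expected to consult the CPU capability flags
// (see WKFThreads.h) at runtime before taking the AVX2 path.  With a
// hypothetical cpu_has_avx2() predicate and a scalar fallback of the same
// shape, dispatch might look like:
#if 0
  if (cpu_has_avx2())
    analyze_selection_aligned_avx2(n, on, &first, &last, &sel);
  else
    analyze_selection_aligned(n, on, &first, &last, &sel); /* scalar fallback */
#endif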

#if defined(VMDUSESSE) && defined(VMDUSEAVX) && defined(VMDUSEAVX2)

#if defined(VMDUSESSE)
#include <emmintrin.h>
#endif
#if defined(VMDUSEAVX)
#include <immintrin.h>
#endif

// #include <string.h>
// #include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(_MSC_VER)
#include <windows.h>
#include <conio.h>
#else
#include <unistd.h>
#endif // _MSC_VER


//
// Helper routine for use when coping with unaligned
// buffers returned by malloc() on many GNU systems:
//   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24261
//   http://www.sourceware.org/bugzilla/show_bug.cgi?id=206
//
// XXX until all compilers support uintptr_t, we have to do
//     dangerous and ugly things with pointer casting here...
//
#if defined(_WIN64) /* sizeof(size_t) == sizeof(void*) */
#define myintptrtype size_t
#elif 1 /* sizeof(unsigned long) == sizeof(void*) */
#define myintptrtype unsigned long
#else /* C99 */
#define myintptrtype uintptr_t
#endif
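
// A compile-time check (an illustrative addition, not in the original file):
// the integer type chosen above must be at least pointer-sized so that the
// pointer casts in the alignment tests below are lossless.
typedef int myintptrtype_is_wide_enough[(sizeof(myintptrtype) >= sizeof(void *)) ? 1 : -1];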


//
// Alignment test routines for vector instructions
//
static int test_alignment_Nbyte_powertwo(const void *ptr, unsigned int alignsz) {
  unsigned int alignmask = alignsz - 1;
  return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~alignmask)));
}
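
// Illustrative sketch (not part of the original file): with alignsz=32 the
// mask clears the low five address bits, so the test accepts only pointers
// whose addresses are multiples of 32.
#if 0
static void example_alignment_test(void) {
  printf("64 is 32-byte aligned: %d\n",
         test_alignment_Nbyte_powertwo((void *) 64, 32));  /* prints 1 */
  printf("68 is 32-byte aligned: %d\n",
         test_alignment_Nbyte_powertwo((void *) 68, 32));  /* prints 0 */
}
#endif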


//
// Small inlinable AVX helper routines to make code easier to read
//
static int hadd_m256i(__m256i sum8) {
  int tmp[sizeof(__m256i)/sizeof(int)];
  // use an unaligned store here: a local int array is not guaranteed to be
  // 32-byte aligned, so the aligned _mm256_store_si256() could fault
  _mm256_storeu_si256((__m256i *)tmp, sum8);
  int sum = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
  return sum;
}
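
// An in-register alternative (a sketch, not from the original file): reduce
// the eight lanes with shuffles instead of a round-trip through memory.
// Either version returns the same horizontal sum.
#if 0
static int hadd_m256i_inreg(__m256i sum8) {
  // fold the high 128-bit lane onto the low lane, then reduce 4 -> 2 -> 1
  __m128i s4 = _mm_add_epi32(_mm256_castsi256_si128(sum8),
                             _mm256_extracti128_si256(sum8, 1));
  __m128i s2 = _mm_add_epi32(s4, _mm_shuffle_epi32(s4, _MM_SHUFFLE(1, 0, 3, 2)));
  __m128i s1 = _mm_add_epi32(s2, _mm_shuffle_epi32(s2, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtsi128_si32(s1);
}
#endif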


#if defined(VMDUSEAVX2)
// Find the first selected atom, the last selected atom,
// and the total number of selected atoms.
int analyze_selection_aligned_avx2(int n, const int *on,
                                   int *firstsel, int *lastsel, int *selected) {
  int sel   = 0;   // set count to zero in case of early-exit
  int first = 0;   // if we early-exit, firstsel is 0
  int last  = -1;  // and lastsel is -1
  int i;

  // assign potential early-exit outcomes
  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // find the first selected atom, if any
  if ((firstsel != NULL) || (selected != NULL)) {
    // roll up to the first 32-byte-aligned array index
    for (i=0; ((i<n) && !test_alignment_Nbyte_powertwo(&on[i], 32)); i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // AVX vectorized search loop
    for (; i<(n-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
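      // _mm256_testz_si256(a, b) yields 1 only when (a & b) is all-zero,
      // so testing on8 against itself detects whether any of the 8 flags
      // in this block is set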
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the first selected atom
    }

    for (; i<n; i++) {
      if (on[i]) {
        first = i; // found first selected atom
        goto foundfirstsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundfirstsel:

  // find the last selected atom, if any
  if ((lastsel != NULL) || (selected != NULL)) {
    // scalar loop: roll down to the next 32-byte boundary
    for (i=n-1; i>=0; i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }

      // drop out of the alignment loop once we hit a 32-byte boundary
      if (test_alignment_Nbyte_powertwo(&on[i], 32))
        break;
    }

    // AVX vectorized search loop, scanning downward in aligned blocks
    for (i-=8; i>=0; i-=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
      if (!_mm256_testz_si256(on8, on8))
        break; // found a block containing the last selected atom
    }

    int last8=i;
    // scan within the block just found; the (i>=0) guard protects the case
    // where no nonzero block was found and last8 went negative
    for (i=last8+7; ((i>=last8) && (i>=0)); i--) {
      if (on[i]) {
        last = i; // found last selected atom
        goto foundlastsel;
      }
    }

    // prevent x86 AVX-SSE transition performance loss due to CPU state
    // transition penalties or false dependence on upper register state
    _mm256_zeroupper();
    return -1; // indicate that no selection was found
  }
foundlastsel:

  // count the number of selected atoms (the flags are only 0s and 1s)
  if (selected != NULL) {
    // scalar loop: roll up to the next 32-byte boundary
    for (i=first; ((i<=last) && (!test_alignment_Nbyte_powertwo(&on[i], 32))); i++) {
      sel += on[i];
    }

    // AVX2 vectorized count loop: process groups of 8 flags at a time
    __m256i sum8 = _mm256_setzero_si256();
    for (; i<=(last-7); i+=8) {
      // aligned load of 8 selection flags
      __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);

      // sum selected atoms vertically
      sum8 = _mm256_add_epi32(sum8, on8);
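      // the flags are 0 or 1, so each of the eight 32-bit lanes accumulates
      // at most n/8, which cannot overflow for any valid int atom count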
    }
    sel += hadd_m256i(sum8); // sum horizontally to finalize count

    // check the very end of the array (remainder not divisible by eight)
    for (; i<=last; i++) {
      sel += on[i];
    }
  }

  if (selected != NULL)
    *selected = sel;

  if (firstsel != NULL)
    *firstsel = first;

  if (lastsel != NULL)
    *lastsel = last;

  // prevent x86 AVX-SSE transition performance loss due to CPU state
  // transition penalties or false dependence on upper register state
  _mm256_zeroupper();
  return 0;
}
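
// Example usage (an illustrative sketch, not part of the original file):
// 'flags' is a per-atom selection mask holding only 0s and 1s; the scalar
// prologue/epilogue loops above handle any buffer alignment, so an ordinary
// int array works.
#if 0
static void example_analyze_selection(int natoms, const int *flags) {
  int first, last, count;
  if (analyze_selection_aligned_avx2(natoms, flags,
                                     &first, &last, &count) == 0) {
    printf("first=%d last=%d count=%d\n", first, last, count);
  } else {
    printf("no atoms selected\n");
  }
}
#endif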
#endif

#endif // SSE+AVX+AVX2

