Orbital_SVE.C Source File

00001 /***************************************************************************
00002  *cr                                                                       
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the           
00004  *cr                        University of Illinois                       
00005  *cr                         All Rights Reserved                        
00006  *cr                                                                   
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: Orbital_SVE.C,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.10 $       $Date: 2022/05/06 03:43:05 $
00014  *
00015  ***************************************************************************/
00021 //
00022 // Notes:  
00023 //   ARM intrinsics documentation:
00024 //     https://developer.arm.com/architectures/instruction-sets/intrinsics
00025 //
00026 //   Tests on the ORNL Wombat cluster reveal that several 
00027 // compiler versions don't generate correct SVE code for 
00028 // particular intrinsics, such as svcvt_s32_f32_x(),
00029 // svcvt_s32_f32_z(), and likely others.
00030 //
00031 // Compilers found to generate bad code include:
00032 //   armclang 21.1
00033 //   armclang 22.0
00034 //   llvm/clang 10.0.1
00035 //
00036 // Compilers generating correct code include:
00037 //   armclang 20.3
00038 //   llvm/clang 11.0.1
00039 //
00040 
00041 #if defined(VMDCPUDISPATCH) && defined(__ARM_FEATURE_SVE) 
00042 #include <arm_sve.h>
00043 
00044 #include <math.h>
00045 #include <stdio.h>
00046 #include "Orbital.h"
00047 #include "DrawMolecule.h"
00048 #include "utilities.h"
00049 #include "Inform.h"
00050 #include "WKFThreads.h"
00051 #include "WKFUtils.h"
00052 #include "ProfileHooks.h"
00053 
00054 #define ANGS_TO_BOHR 1.88972612478289694072f
00055 
00056 #if defined(__GNUC__) 
00057 #define __align(X)  __attribute__((aligned(X) ))
00058 #else
00059 #define __align(X) __declspec(align(X) )
00060 #endif
00061 
00062 #define MLOG2EF    -1.44269504088896f
00063 
00064 #if 0
00065 static void print_svfloat32_t(svfloat32_t v) {
00066   float tmp[svcntw()]; // vector-size aligned for SVE
00067 
00068   svbool_t pg = svptrue_b32();
00069   svst1_f32(pg, &tmp[0], v);
00070 
00071   printf("print_svfloat32_t: ");
00072   for (int i=0; i<svcntw(); i++)
00073     printf("%g ", tmp[i]);
00074   printf("\n");
00075 }
00076 
00077 static void print_svint32_t(svint32_t v) {
00078   int tmp[svcntw()]; // vector-size-aligned for SVE
00079 
00080   svbool_t pg = svptrue_b32();
00081   svst1_s32(pg, &tmp[0], v);
00082 
00083   printf("print_svint32_t: ");
00084   for (int i=0; i<svcntw(); i++)
00085     printf("%d ", tmp[i]);
00086   printf("\n");
00087 }
00088 
00089 static void print_svhex32_t(svint32_t v) {
00090   int tmp[4]; // vector-size-aligned for SVE
00091 
00092   svbool_t pg = svptrue_b32();
00093   svst1_s32(pg, &tmp[0], v);
00094 
00095   printf("print_svhex32_t: ");
00096   for (int i=0; i<svcntw(); i++)
00097     printf("%08x ", tmp[i]);
00098   printf("\n");
00099 }
00100 #endif
00101 
00102 
00103 //
00104 // John Stone, April 2022
00105 //
00106 // aexpfnxsve() - ARM SVE version of aexpfnx().
00107 //
00108 
00109 /*
00110  * Interpolating coefficients for linear blending of the
00111  * 3rd degree Taylor expansion of 2^x about 0 and -1.
00112  */
00113 #define SCEXP0     1.0000000000000000f
00114 #define SCEXP1     0.6987082824680118f
00115 #define SCEXP2     0.2633174272827404f
00116 #define SCEXP3     0.0923611991471395f
00117 #define SCEXP4     0.0277520543324108f
00118 
00119 /* for single precision float */
00120 #define EXPOBIAS   127
00121 #define EXPOSHIFT   23
00122 
00123 /* cutoff is optional, but can help avoid unnecessary work */
00124 #define ACUTOFF    -10
00125 
00126 svfloat32_t aexpfnxarmsve(svfloat32_t x) {
00127   svbool_t pg = svptrue_b32();
00128 
00129   // If all x are outside of cutoff, return 0s.
00130   float fmv = svmaxv(pg, x);
00131   if (fmv < ACUTOFF) {
00132     return  svdup_f32(0.0f);
00133   }
00134 
00135   // if x < ACUTOFF, we return 0..
00136   pg = svcmpge_f32(pg, x, svdup_f32(ACUTOFF)); // Is x within cutoff?
00137 
00138   /*
00139    * Convert base:  exp(x) = 2^(N-d) where N is integer and 0 <= d < 1.
00140    *
00141    * Below we calculate n=N and x=-d, with "y" for temp storage,
00142    * calculate floor of x*log2(e) and subtract to get -d.
00143    */
00144   svfloat32_t mb = svmul_f32_x(pg, x, svdup_f32(MLOG2EF));
00145   svint32_t intflr = svcvt_s32_f32_x(pg, mb);
00146   svfloat32_t mbflr = svcvt_f32_s32_x(pg, intflr);
00147   svfloat32_t d = svsub_f32_x(pg, mbflr, mb);
00148 
00149   // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
00150   // Perform Horner's method to evaluate interpolating polynomial.
00151   svfloat32_t y;
00152   y = svmad_f32_x(pg, d, svdup_f32(SCEXP4), svdup_f32(SCEXP3));
00153   y = svmad_f32_x(pg, y, d, svdup_f32(SCEXP2));
00154   y = svmad_f32_x(pg, y, d, svdup_f32(SCEXP1));
00155   y = svmad_f32_x(pg, y, d, svdup_f32(SCEXP0));
00156 
00157   // Calculate 2^N exactly by directly manipulating floating point exponent,
00158   // then use it to scale y for the final result.
00159   svint32_t flint = svsub_s32_x(pg, svdup_s32(EXPOBIAS), intflr);
00160   flint = svlsl_s32_x(pg, flint, svdup_u32(EXPOSHIFT));
00161   y = svmul_f32_z(pg, y, svreinterpret_f32_s32(flint));
00162   return y;
00163 }
00164 
00165 
00166 //
00167 // ARM SVE implementation, uses exponential approximation
00168 //
00169 int evaluate_grid_sve(int numatoms,
00170                          const float *wave_f, const float *basis_array,
00171                          const float *atompos,
00172                          const int *atom_basis,
00173                          const int *num_shells_per_atom,
00174                          const int *num_prim_per_shell,
00175                          const int *shell_types,
00176                          const int *numvoxels,
00177                          float voxelsize,
00178                          const float *origin,
00179                          int density,
00180                          float * orbitalgrid) {
00181   if (!orbitalgrid)
00182     return -1;
00183 
00184   svbool_t pg = svptrue_b32();
00185 
00186   int nx, ny, nz;
00187   int vecsize = svcntw();
00188   float sxdelta[vecsize];
00189   for (nx=0; nx<vecsize; nx++) 
00190     sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
00191 
00192   // Calculate the value of the orbital at each gridpoint and store in 
00193   // the current oribtalgrid array
00194   int numgridxy = numvoxels[0]*numvoxels[1];
00195   for (nz=0; nz<numvoxels[2]; nz++) {
00196     float grid_x, grid_y, grid_z;
00197     grid_z = origin[2] + nz * voxelsize;
00198     for (ny=0; ny<numvoxels[1]; ny++) {
00199       grid_y = origin[1] + ny * voxelsize;
00200       int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
00201       for (nx=0; nx<numvoxels[0]; nx+=16) {
00202         grid_x = origin[0] + nx * voxelsize;
00203 
00204         // calculate the value of the wavefunction of the
00205         // selected orbital at the current grid point
00206         int at;
00207         int prim, shell;
00208 
00209         // initialize value of orbital at gridpoint
00210         svfloat32_t value = svdup_f32(0.0f);
00211 
00212         // initialize the wavefunction and shell counters
00213         int ifunc = 0; 
00214         int shell_counter = 0;
00215 
00216         // loop over all the QM atoms
00217         for (at=0; at<numatoms; at++) {
00218           int maxshell = num_shells_per_atom[at];
00219           int prim_counter = atom_basis[at];
00220 
00221           // calculate distance between grid point and center of atom
00222           float sxdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
00223           float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00224           float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00225 
00226           float sydist2 = sydist*sydist;
00227           float szdist2 = szdist*szdist;
00228           float yzdist2 = sydist2 + szdist2;
00229 
00230           svfloat32_t xdelta = svld1_f32(pg, &sxdelta[0]); // aligned load
00231           svfloat32_t xdist  = svdup_f32(sxdist);
00232           xdist = svadd_f32_z(pg, xdist, xdelta);
00233           svfloat32_t ydist  = svdup_f32(sydist);
00234           svfloat32_t zdist  = svdup_f32(szdist);
00235           svfloat32_t xdist2 = svmul_f32_z(pg, xdist, xdist);
00236           svfloat32_t ydist2 = svmul_f32_z(pg, ydist, ydist);
00237           svfloat32_t zdist2 = svmul_f32_z(pg, zdist, zdist);
00238           svfloat32_t dist2  = svdup_f32(yzdist2); 
00239           dist2 = svadd_f32_z(pg, dist2, xdist2);
00240  
00241           // loop over the shells belonging to this atom
00242           // XXX this is maybe a misnomer because in split valence
00243           //     basis sets like 6-31G we have more than one basis
00244           //     function per (valence-)shell and we are actually
00245           //     looping over the individual contracted GTOs
00246           for (shell=0; shell < maxshell; shell++) {
00247             svfloat32_t contracted_gto = svdup_f32(0.0f);
00248 
00249             // Loop over the Gaussian primitives of this contracted 
00250             // basis function to build the atomic orbital
00251             // 
00252             // XXX there's a significant opportunity here for further
00253             //     speedup if we replace the entire set of primitives
00254             //     with the single gaussian that they are attempting 
00255             //     to model.  This could give us another 6x speedup in 
00256             //     some of the common/simple cases.
00257             int maxprim = num_prim_per_shell[shell_counter];
00258             int shelltype = shell_types[shell_counter];
00259             for (prim=0; prim<maxprim; prim++) {
00260               // XXX pre-negate exponent value
00261               float exponent       = -basis_array[prim_counter    ];
00262               float contract_coeff =  basis_array[prim_counter + 1];
00263 
00264               // contracted_gto += contract_coeff * exp(-exponent*dist2);
00265               svfloat32_t expval = svmul_f32_z(pg, svdup_f32(exponent), dist2);
00266               // exp2f() equivalent required, use base-2 approximation
00267               svfloat32_t retval = aexpfnxarmsve(expval);
00268               contracted_gto = svmad_f32_z(pg, svdup_f32(contract_coeff), retval, contracted_gto);
00269 
00270               prim_counter += 2;
00271             }
00272 
00273             /* multiply with the appropriate wavefunction coefficient */
00274             svfloat32_t tmpshell = svdup_f32(0.0f);
00275             switch (shelltype) {
00276               // use FMADD instructions
00277               case S_SHELL:
00278                 value = svmad_f32_z(pg, svdup_f32(wave_f[ifunc++]), contracted_gto, value);
00279                 break;
00280 
00281               case P_SHELL:
00282                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), xdist, tmpshell);
00283                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), ydist, tmpshell);
00284                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), zdist, tmpshell);
00285                 value = svmad_f32_z(pg, tmpshell, contracted_gto, value);
00286                 break;
00287 
00288               case D_SHELL:
00289                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), xdist2, tmpshell);
00290                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist, ydist), tmpshell);
00291                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), ydist2, tmpshell);
00292                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist, zdist), tmpshell);
00293                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist, zdist), tmpshell);
00294                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), zdist2, tmpshell);
00295                 value = svmad_f32_z(pg, tmpshell, contracted_gto, value);
00296                 break;
00297 
00298               case F_SHELL:
00299                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist2, xdist), tmpshell);
00300                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist2, ydist), tmpshell);
00301                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist2, xdist), tmpshell);
00302                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist2, ydist), tmpshell);
00303                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist2, zdist), tmpshell);
00304                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, svmul_f32_x(pg, xdist, ydist), zdist), tmpshell);
00305                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist2, zdist), tmpshell);
00306                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, zdist2, xdist), tmpshell);
00307                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, zdist2, ydist), tmpshell);
00308                 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, zdist2, zdist), tmpshell);
00309                 value = svmad_f32_z(pg, tmpshell, contracted_gto, value);
00310                 break;
00311  
00312 #if 0
00313               default:
00314                 // avoid unnecessary branching and minimize use of pow()
00315                 int i, j; 
00316                 float xdp, ydp, zdp;
00317                 float xdiv = 1.0f / xdist;
00318                 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
00319                   int imax = shelltype - j; 
00320                   for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
00321                     tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
00322                   }
00323                 }
00324                 value += tmpshell * contracted_gto;
00325 #endif
00326             } // end switch
00327 
00328             shell_counter++;
00329           } // end shell
00330         } // end atom
00331 
00332         // return either orbital density or orbital wavefunction amplitude
00333         if (density) {
00334           pg = svcmplt_f32(pg, value, svdup_f32(0.0f));
00335           svfloat32_t sqdensity = svmul_f32_z(pg, value, value);
00336           svfloat32_t orbdensity = svsel_f32(pg, sqdensity, svmul_f32_z(pg, sqdensity, svdup_f32(-1.0f)));
00337           svst1_f32(pg, &orbitalgrid[gaddrzy + nx], orbdensity);
00338         } else {
00339           svst1_f32(pg, &orbitalgrid[gaddrzy + nx], value);
00340         }
00341       }
00342     }
00343   }
00344 
00345   return 0;
00346 }
00347 
00348 #else
00349 
00350 int evaluate_grid_sve(int numatoms,
00351                       const float *wave_f, const float *basis_array,
00352                       const float *atompos,
00353                       const int *atom_basis,
00354                       const int *num_shells_per_atom,
00355                       const int *num_prim_per_shell,
00356                       const int *shell_types,
00357                       const int *numvoxels,
00358                       float voxelsize,
00359                       const float *origin,
00360                       int density,
00361                       float * orbitalgrid) {
00362   return -1; // unimplemented by this compiler toolchain
00363 }
00364 
00365 #endif