Orbital.C Source File

00001 /***************************************************************************
00002  *cr                                                                       
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the           
00004  *cr                        University of Illinois                       
00005  *cr                         All Rights Reserved                        
00006  *cr                                                                   
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: Orbital.C,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.166 $      $Date: 2022/05/23 19:10:01 $
00014  *
00015  ***************************************************************************/
00021 // Intel x86 hardware 
00022 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00023 #if !defined(__SSE2__) && defined(_WIN64)
00024 #define __SSE2__ 1      /* MSVC fails to define SSE macros */
00025 #endif
00026 #if defined(__SSE2__)
00027 #include <emmintrin.h>
00028 #define VMDORBUSESSE 1  /* build SSE code for static launch */
00029 #endif
00030 
00031 #if defined(VMDCPUDISPATCH)
00032 #define VECPADSZ    16  /* max vec size is for x86 AVX512F or AVX512ER */
00033 #else
00034 #define VECPADSZ     4  /* fall-back to x86 SSE vector size */
00035 #endif
00036 
00037 // IBM Power 8/9/10 Altivec/VSX instructions:
00038 //   https://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
00039 #elif defined(__VSX__)
00040 #if defined(__GNUC__) && defined(__VEC__)
00041 #include <altivec.h>
00042 #endif
00043 // The OpenPOWER VSX code path runs on POWER8 and later hardware, but is
00044 // untested on older platforms that support VSX instructions.
00045 // XXX GCC 4.8.5 breaks with conflicts between vec_xxx() routines 
00046 //     defined in utilities.h vs. VSX intrinsics in altivec.h and similar.
00047 //     For now, we disable VSX for GCC for this source file.
00048 #define VECPADSZ     4  /* IBM POWER VSX vector size */
00049 #define VMDORBUSEVSX 1  /* build POWER VSX code for static launch */
00050 
00051 // ARM64 SVE... 
00052 #elif defined(__ARM_ARCH_ISA_A64) && !defined(ARCH_MACOSXARM64)
00053 #define VMDUSESVE 1  /* build SVE code for static launch */
00054 
00055 #if defined(VMDCPUDISPATCH)
00056 #define VECPADSZ    16  /* max vec size is for Fujitsu A64fx */
00057 #else
00058 #define VECPADSZ     4  /* fall-back to NEON vector size */
00059 #endif
00060 
00061 // generic scalar C++ code
00062 #else
00063 #define VECPADSZ     1  /* scalar code, no padding */
00064 #endif
00065 
00066 // padding mask determined from hardware-specific 
00067 #define VECPADMASK (VECPADSZ - 1)
00068 
00069 // #define DEBUGORBS 1
00070 
00071 #include <math.h>
00072 #include <stdio.h>
00073 #include "VMDApp.h"
00074 #include "Orbital.h"
00075 #include "DrawMolecule.h"
00076 #include "utilities.h"
00077 #include "Inform.h"
00078 #include "WKFThreads.h"
00079 #include "WKFUtils.h"
00080 #if defined(VMDCUDA)
00081 #include "CUDAOrbital.h"
00082 #endif
00083 #if defined(VMDOPENCL)
00084 #include "OpenCLUtils.h"
00085 #include "OpenCLKernels.h"
00086 #endif
00087 #include "ProfileHooks.h"
00088 
00089 //
00090 // fctn prototypes for CPU runtime dispatch kernels
00091 //
00092 
00093 // AVX-512ER implementation for Xeon Phi w/ special fctn units
00094 extern int evaluate_grid_avx512er(int numatoms,
00095                            const float *wave_f, const float *basis_array,
00096                            const float *atompos,
00097                            const int *atom_basis,
00098                            const int *num_shells_per_atom,
00099                            const int *num_prim_per_shell,
00100                            const int *shell_types,
00101                            const int *numvoxels,
00102                            float voxelsize,
00103                            const float *origin,
00104                            int density,
00105                            float * orbitalgrid);
00106 
00107 // AVX-512F implementation for CPUs without exponent/reciprocal support
00108 extern int evaluate_grid_avx512f(int numatoms,
00109                           const float *wave_f, const float *basis_array,
00110                           const float *atompos,
00111                           const int *atom_basis,
00112                           const int *num_shells_per_atom,
00113                           const int *num_prim_per_shell,
00114                           const int *shell_types,
00115                           const int *numvoxels,
00116                           float voxelsize,
00117                           const float *origin,
00118                           int density,
00119                           float * orbitalgrid);
00120 
00121 // AVX2 implementation 
00122 extern int evaluate_grid_avx2(int numatoms,
00123                           const float *wave_f, const float *basis_array,
00124                           const float *atompos,
00125                           const int *atom_basis,
00126                           const int *num_shells_per_atom,
00127                           const int *num_prim_per_shell,
00128                           const int *shell_types,
00129                           const int *numvoxels,
00130                           float voxelsize,
00131                           const float *origin,
00132                           int density,
00133                           float * orbitalgrid);
00134 
00135 // ARM NEON implementation 
00136 extern int evaluate_grid_neon(int numatoms,
00137                           const float *wave_f, const float *basis_array,
00138                           const float *atompos,
00139                           const int *atom_basis,
00140                           const int *num_shells_per_atom,
00141                           const int *num_prim_per_shell,
00142                           const int *shell_types,
00143                           const int *numvoxels,
00144                           float voxelsize,
00145                           const float *origin,
00146                           int density,
00147                           float * orbitalgrid);
00148 
00149 // ARM SVE implementation 
00150 extern int evaluate_grid_sve(int numatoms,
00151                           const float *wave_f, const float *basis_array,
00152                           const float *atompos,
00153                           const int *atom_basis,
00154                           const int *num_shells_per_atom,
00155                           const int *num_prim_per_shell,
00156                           const int *shell_types,
00157                           const int *numvoxels,
00158                           float voxelsize,
00159                           const float *origin,
00160                           int density,
00161                           float * orbitalgrid);
00162 
00163 
00164 #define ANGS_TO_BOHR 1.88972612478289694072f
00165 
00167 Orbital::Orbital(const float *pos,
00168                  const float *wfn,
00169                  const float *barray,
00170                  const basis_atom_t *bset,
00171                  const int *types,
00172                  const int *asort,
00173                  const int *abasis,
00174                  const float **norm,
00175                  const int *nshells,
00176                  const int *nprimshell,
00177                  const int *shelltypes, 
00178                  int natoms, int ntypes, int numwave, int numbasis, 
00179                  int orbid) :
00180   numatoms(natoms), atompos(pos),
00181   num_wave_f(numwave),
00182   wave_f(NULL),
00183   num_basis_funcs(numbasis),
00184   basis_array(barray),
00185   numtypes(ntypes), 
00186   basis_set(bset),
00187   atom_types(types),
00188   atom_sort(asort),
00189   atom_basis(abasis),
00190   norm_factors(norm),
00191   num_shells_per_atom(nshells),
00192   num_prim_per_shell(nprimshell),
00193   shell_types(shelltypes), 
00194   grid_data(NULL)
00195 {
00196   origin[0] = origin[1] = origin[2] = 0.0;
00197 
00198   // Multiply wavefunction coefficients with the
00199   // angular momentum dependent part of the basis set
00200   // normalization factors.
00201   normalize_wavefunction(wfn + num_wave_f*orbid);
00202 
00203   //print_wavefunction();
00204 }
00205 
00207 Orbital::~Orbital() {
00208   if (wave_f) delete [] wave_f;
00209 }
00210 
00211 
00212 // Multiply wavefunction coefficients with the
00213 // basis set normalization factors. We do this
00214 // here rather than normalizing the basisset itself
00215 // because we need different factors for the different
00216 // cartesian components of a shell and the basis set
00217 // stores data only per shell.
00218 // By doing the multiplication here we save a lot of
00219 // flops during orbital rendering.
00220 void Orbital::normalize_wavefunction(const float *wfn) {
00221 #ifdef DEBUGORBS
00222   char shellname[8] = {'S', 'P', 'D', 'F', 'G', 'H', 'I', 'K'};
00223 #endif
00224   int i, j, k;
00225   // Get size of the symmetry-expanded wavefunction array
00226 //   int wave_size = 0;
00227 //   for (i=0; i<numatoms; i++) {
00228 //     printf("atom[%d]: type = %d\n", i, atom_types[i]);
00229 //     const basis_atom_t *basis_atom = &basis_set[atom_types[i]];
00230 //     for (j=0; j<basis_atom->numshells; j++) {
00231 //       wave_size += basis_atom->shell[j].num_cart_func;
00232 //     }
00233 //   }
00234 //   printf("num_wave_f/wave_size = %d/%d\n", num_wave_f,  wave_size);
00235 
00236   wave_f = new float[num_wave_f];
00237   int ifunc = 0;
00238   for (i=0; i<numatoms; i++) {
00239     const basis_atom_t *basis_atom = &basis_set[atom_types[i]];
00240     for (j=0; j<basis_atom->numshells; j++) {
00241       int stype = basis_atom->shell[j].type;
00242 #ifdef DEBUGORBS
00243       printf("atom %i/%i,  %i/%i %c-shell\n", i+1, numatoms, j+1, basis_atom->numshells, shellname[stype]);
00244 #endif
00245       for (k=0; k<basis_atom->shell[j].num_cart_func; k++) {
00246         wave_f[ifunc] = wfn[ifunc] * norm_factors[stype][k];
00247 
00248 #ifdef DEBUGORBS
00249         printf("%3i %c %2i wave_f[%3i]=% f  norm=%.3f  normwave=% f\n",
00250                i, shellname[stype], k, ifunc, wfn[ifunc],
00251                norm_factors[stype][k], wave_f[ifunc]);
00252 #endif
00253         ifunc++;
00254       }
00255     }
00256   }
00257 }
00258 
00259 
00260 // Sets the grid dimensions to the bounding box of the given
00261 // set of atoms *pos including a padding in all dimensions.
00262 // The resulting grid dimensions will be rounded to a multiple
00263 // of the voxel size.
00264 int Orbital::set_grid_to_bbox(const float *pos, float padding,
00265                               float resolution) {
00266   int i = 0;
00267   float xyzdim[3];
00268 
00269   /* set initial values of temp values to the coordinates
00270    * of the first atom.  */
00271   origin[0] = xyzdim[0] = pos[0];
00272   origin[1] = xyzdim[1] = pos[1];
00273   origin[2] = xyzdim[2] = pos[2];
00274 
00275   /* now loop over the rest of the atoms to check if there's
00276    * something larger/smaller for the maximum and minimum
00277    * respectively */
00278   for(i=1; i<numatoms; i++) {
00279     if (pos[3*i  ] < origin[0]) origin[0] = pos[3*i];
00280     if (pos[3*i+1] < origin[1]) origin[1] = pos[3*i+1];
00281     if (pos[3*i+2] < origin[2]) origin[2] = pos[3*i+2];
00282     if (pos[3*i  ] > xyzdim[0]) xyzdim[0] = pos[3*i];
00283     if (pos[3*i+1] > xyzdim[1]) xyzdim[1] = pos[3*i+1];
00284     if (pos[3*i+2] > xyzdim[2]) xyzdim[2] = pos[3*i+2];
00285   }
00286 
00287   // Apply padding in each direction
00288   origin[0] -= padding;
00289   origin[1] -= padding;
00290   origin[2] -= padding;
00291   gridsize[0] = xyzdim[0] + padding - origin[0];
00292   gridsize[1] = xyzdim[1] + padding - origin[1];
00293   gridsize[2] = xyzdim[2] + padding - origin[2];  
00294 
00295   set_resolution(resolution);
00296 
00297   return TRUE;
00298 }
00299 
00300 
00301 // Set the dimensions and resolution of the grid for which 
00302 // the orbital shall be computed.
00303 // The given grid dimensions will be rounded to a multiple
00304 // of the voxel size.
00305 void Orbital::set_grid(float newori[3], float newdim[3], float newvoxelsize) {
00306   origin[0] = newori[0];
00307   origin[1] = newori[1];
00308   origin[2] = newori[2];
00309   gridsize[0] = newdim[0];
00310   gridsize[1] = newdim[1];
00311   gridsize[2] = newdim[2];
00312   set_resolution(newvoxelsize);
00313 }
00314 
00315 // Change the resolution of the grid
00316 void Orbital::set_resolution(float resolution) {
00317   voxelsize = resolution;
00318   int i;
00319   for (i=0; i<3; i++) {
00320     numvoxels[i] = (int)(gridsize[i]/voxelsize) + 1;
00321     gridsize[i] = voxelsize*(numvoxels[i]-1);
00322   }
00323 }
00324 
00325 #define XNEG 0
00326 #define YNEG 1
00327 #define ZNEG 2
00328 #define XPOS 3
00329 #define YPOS 4
00330 #define ZPOS 5
00331 
00332 // Check if all values in the boundary plane given by dir 
00333 // are below threshold.
00334 // If not, jump back, decrease the stepsize and test again.
00335 int Orbital::check_plane(int dir, float threshold, int minstepsize,
00336                          int &stepsize) {
00337   bool repeat=0;
00338   int u, v, w, nu, nv;
00339   // w is the dimension we want to adjust,
00340   // u and v are the other two, i.e. the plane in which we test
00341   // the orbital values. 
00342   u = (dir+1)%3;
00343   v = (dir+2)%3;
00344   w = dir%3;
00345 
00346   // for debugging
00347   //char axis[3] = {'X', 'Y', 'Z'};
00348   //char sign[2] = {'-', '+'};
00349   //printf("%c%c: ", sign[dir/3], axis[w]);
00350 
00351   do {
00352     int success = 0;
00353     int gridstep = stepsize;
00354 
00355     if (repeat) {
00356       // We are repeating the test on the previous slate but with
00357       // twice the resolution. Hence we only have to test the new
00358       // grid points lying between the old ones.
00359       gridstep = 2*stepsize;
00360     }
00361 
00362 
00363     float grid[3];
00364     grid[w] = origin[w] + (dir/3)*(numvoxels[w]-1) * voxelsize;
00365 
00366     // Search for a value of the wave function larger than threshold.
00367     for (nu=0; nu<numvoxels[u]; nu+=gridstep) {
00368       grid[u] = origin[u] + nu * voxelsize;
00369     
00370       for (nv=0; nv<numvoxels[v]; nv+=gridstep) {
00371         grid[v] = origin[v] + nv * voxelsize;
00372       
00373         if (fabs(evaluate_grid_point(grid[0], grid[1], grid[2])) > threshold) {
00374           success = 1;
00375           break;
00376         }
00377       }
00378       if (success) break;
00379     }
00380 
00381     if (success) {
00382       // Found an orbital value higher than the threshold.
00383       // We want the threshold isosurface to be completely inside the grid.
00384       // The boundary must be between the previous and this plane.
00385       if (!(dir/3)) origin[w] -= stepsize*voxelsize;
00386       numvoxels[w] += stepsize;
00387       if (stepsize<=minstepsize) {
00388         //printf("success!\n");
00389         return 1;
00390       }
00391       stepsize /=2;
00392       repeat = 1;
00393       //printf("increase by %i, reduce stepsize to %i.\n", 2*stepsize, stepsize);
00394 
00395     } else {
00396       // All values lower than threshold, we decrease the grid size.
00397       if (!(dir/3)) origin[w] += stepsize*voxelsize;
00398       numvoxels[w] -= stepsize;
00399       //printf("decrease by %i\n", stepsize);
00400       repeat = 0;
00401       if (numvoxels[w] <= 1) {
00402         // Here we ended up with a zero grid size.
00403         // We must have missed something. Let's increase grid again and 
00404         // try a smaller step size.
00405         numvoxels[w] = stepsize; 
00406         if (!(dir/3)) origin[w] -= stepsize*voxelsize;
00407         stepsize /=2;
00408         repeat = 1;
00409         //printf("zero grid size - increase to %i, reduce stepsize to %i.\n", 2*stepsize, stepsize);
00410       }
00411     }
00412 
00413   } while (repeat);
00414 
00415   return 0;
00416 }
00417 
00418 
00419 // Optimize position and dimension of current grid so that all orbital
00420 // values higher than threshold are contained in the grid.
00421 //
00422 // Algorithm:
00423 // Based on the idea that the wave function trails off in a distance of
00424 // a few Angstroms from the molecule.
00425 // We start from the current grid size (which could be for instance the
00426 // molecular bounding box plus a padding region) and test the values on
00427 // each of the six boundary planes. If there is no value larger than the
00428 // given threshold in a plane then we shrink the system along the plane
00429 // normal. In the distance the wave function tends to be smoother so we
00430 // start the testing on a coarser grid. A parameter maxstepsize=4 means
00431 // to begin with a grid using a four times higher voxel side length than
00432 // the original grid. When we find the first value above the threshold we
00433 // jump back one step and continue with half of the previous stepsize.
00434 // When stepsize has reached minstepsize then we consider the corresponding
00435 // boundary plane optimal. Note that starting out with a too coarse
00436 // grid one might miss some features of the wave function.
00437 // If you want to be sure not to miss anything then use the voxelsize
00438 // for both minstepsize and maxstepsize.
00439 void Orbital::find_optimal_grid(float threshold, 
00440                                 int minstepsize, int maxstepsize) {
00441   int optimal[6] = {0, 0, 0, 0, 0, 0};
00442   int stepsize[6];
00443   int i;
00444   for (i=0; i<6; i++) stepsize[i] = maxstepsize;
00445 
00446 #ifdef DEBUGORBS
00447   printf("origin = {%f %f %f}\n", origin[0], origin[1], origin[2]);
00448   printf("gridsize = {%f %f %f}\n", gridsize[0], gridsize[1], gridsize[2]);
00449 #endif  
00450   
00451   
00452   // Loop until we have optimal grid boundaries in all
00453   // dimensions
00454   int iter = 0;
00455   while ( !optimal[0] || !optimal[1] || !optimal[2] ||
00456           !optimal[3] || !optimal[4] || !optimal[5] )
00457   {
00458     if (iter>100) {
00459       msgInfo << "WARNING: Could not optimize orbital grid boundaries in"
00460               << iter << "steps!" << sendmsg; 
00461       break;
00462     }
00463     iter++;
00464 
00465     // Examine the current grid boundaries and shrink if
00466     // all values are smaller than threshold    .
00467     if (!optimal[XNEG])
00468       optimal[XNEG] = check_plane(XNEG, threshold, minstepsize, stepsize[XNEG]);
00469 
00470     if (!optimal[XPOS])
00471       optimal[XPOS] = check_plane(XPOS, threshold, minstepsize, stepsize[XPOS]);
00472 
00473     if (!optimal[YNEG])
00474       optimal[YNEG] = check_plane(YNEG, threshold, minstepsize, stepsize[YNEG]);
00475 
00476     if (!optimal[YPOS])
00477       optimal[YPOS] = check_plane(YPOS, threshold, minstepsize, stepsize[YPOS]);
00478 
00479     if (!optimal[ZNEG])
00480       optimal[ZNEG] = check_plane(ZNEG, threshold, minstepsize, stepsize[ZNEG]);
00481 
00482     if (!optimal[ZPOS])
00483       optimal[ZPOS] = check_plane(ZPOS, threshold, minstepsize, stepsize[ZPOS]);
00484 
00485 #if defined(DEBUGORBS)
00486     printf("origin {%f %f %f}\n", origin[0], origin[1], origin[2]);
00487     printf("ngrid {%i %i %i}\n", numvoxels[0], numvoxels[1], numvoxels[2]);
00488     printf("stepsize {%i %i %i %i %i %i}\n", stepsize[0], stepsize[1], stepsize[2],
00489            stepsize[3], stepsize[4], stepsize[5]);
00490 #endif
00491   }
00492 
00493   
00494   gridsize[0] = numvoxels[0]*voxelsize;
00495   gridsize[1] = numvoxels[1]*voxelsize;
00496   gridsize[2] = numvoxels[2]*voxelsize;
00497 }
00498 
00499 
00500 // this function creates the orbital grid given the system dimensions
00501 int Orbital::calculate_mo(DrawMolecule *mol, int density) {
00502   PROFILE_PUSH_RANGE("Orbital", 4);
00503 
00504   wkf_timerhandle timer=wkf_timer_create();
00505   wkf_timer_start(timer);
00506 
00507   //
00508   // Force vectorized N-element padding for the X dimension to prevent
00509   // the possibility of an out-of-bounds orbital grid read/write operation
00510   //
00511   int vecpadmask = VECPADMASK;
00512   if ((mol->app->cpucaps != NULL) && (mol->app->cpucaps->flags & CPU_AVX2)) {
00513     vecpadmask = 7; // AVX2 kernels pad to multiples of 8 
00514   }
00515   if ((mol->app->cpucaps != NULL) && (mol->app->cpucaps->flags & (CPU_AVX512F | CPU_AVX512ER))) {
00516     vecpadmask = 15; // AVX512 kernels pad to multiples of 16
00517   }
00518 
00519   // pad the grid X dimension for vector-multiple length and memory alignment 
00520   numvoxels[0] = (numvoxels[0] + vecpadmask) & ~(vecpadmask);
00521   gridsize[0] = numvoxels[0]*voxelsize;
00522  
00523   // Allocate memory for the volumetric grid
00524   int numgridpoints = numvoxels[0] * numvoxels[1] * numvoxels[2];
00525   grid_data = new float[numgridpoints];
00526 
00527 #if defined(DEBUGORBS)
00528   printf("num_wave_f=%i\n", num_wave_f);
00529 
00530   int i=0;
00531   for (i=0; i<num_wave_f; i++) {
00532     printf("wave_f[%i] = %f\n", i, wave_f[i]);
00533   }
00534 
00535   // perhaps give the user a warning, since the calculation
00536   // could take a while, otherwise they might think the system is borked 
00537   printf("Calculating %ix%ix%i orbital grid.\n", 
00538          numvoxels[0], numvoxels[1], numvoxels[2]);
00539 #endif
00540 
00541 
00542   int rc=-1; // initialize to sentinel value
00543 
00544   // Calculate the value of the orbital at each gridpoint
00545 #if defined(VMDCUDA)
00546   // The CUDA kernel currently only handles up to "G" shells,
00547   // and up to 32 primitives per basis function
00548   if ((max_shell_type() <= G_SHELL) &&
00549       (max_primitives() <= 32) &&
00550       (!getenv("VMDNOCUDA"))) {
00551     rc = vmd_cuda_evaluate_orbital_grid(mol->cuda_devpool(), 
00552                                         numatoms, wave_f, num_wave_f,
00553                                         basis_array, num_basis_funcs,
00554                                         atompos, atom_basis,
00555                                         num_shells_per_atom, 
00556                                         num_prim_per_shell,
00557                                         shell_types, total_shells(),
00558                                         numvoxels, voxelsize, 
00559                                         origin, density, grid_data);
00560   }
00561 #endif
00562 #if defined(VMDOPENCL)
00563   // The OpenCL kernel currently only handles up to "G" shells,
00564   // and up to 32 primitives per basis function
00565   if (rc!=0 &&
00566       (max_shell_type() <= G_SHELL) &&
00567       (max_primitives() <= 32) &&
00568       (!getenv("VMDNOOPENCL"))) {
00569 
00570 #if 1
00571     // XXX this would be done during app startup normally...
00572     static vmd_opencl_orbital_handle *orbh = NULL;
00573     static cl_context clctx = NULL;
00574     static cl_command_queue clcmdq = NULL;
00575     static cl_device_id *cldevs = NULL;
00576     if (orbh == NULL) {
00577       printf("Attaching OpenCL device:\n");
00578       wkf_timer_start(timer);
00579       cl_int clerr = CL_SUCCESS;
00580 
00581       cl_platform_id clplatid = vmd_cl_get_platform_index(0);
00582       cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0};
00583       clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr);
00584 
00585       size_t parmsz;
00586       clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz);
00587       if (clerr != CL_SUCCESS) return -1;
00588       cldevs = (cl_device_id *) malloc(parmsz);
00589       if (clerr != CL_SUCCESS) return -1;
00590       clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL);
00591       if (clerr != CL_SUCCESS) return -1;
00592       clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr);
00593       if (clerr != CL_SUCCESS) return -1;
00594       wkf_timer_stop(timer);
00595       printf("  OpenCL context creation time: %.3f sec\n", wkf_timer_time(timer));
00596 
00597       wkf_timer_start(timer);
00598       orbh = vmd_opencl_create_orbital_handle(clctx, clcmdq, cldevs);
00599       wkf_timer_stop(timer);
00600       printf("  OpenCL kernel compilation time: %.3f sec\n", wkf_timer_time(timer));
00601 
00602       wkf_timer_start(timer);
00603     }
00604 #endif
00605 
00606     rc = vmd_opencl_evaluate_orbital_grid(mol->cuda_devpool(), orbh,
00607                                         numatoms, wave_f, num_wave_f,
00608                                         basis_array, num_basis_funcs,
00609                                         atompos, atom_basis,
00610                                         num_shells_per_atom, 
00611                                         num_prim_per_shell,
00612                                         shell_types, total_shells(),
00613                                         numvoxels, voxelsize, 
00614                                         origin, density, grid_data);
00615 
00616 #if 0
00617     // XXX this would normally be done at program shutdown
00618     vmd_opencl_destroy_orbital_handle(parms.orbh);
00619     clReleaseCommandQueue(clcmdq);
00620     clReleaseContext(clctx);
00621     free(cldevs);
00622 #endif
00623   }
00624 #endif
00625 #if 0
00626   int numprocs = 1;
00627   if (getenv("VMDDUMPORBITALS")) {
00628     write_orbital_data(getenv("VMDDUMPORBITALS"), numatoms,
00629                        wave_f, num_wave_f, basis_array, num_basis,
00630                        atompos, atom_basis, num_shells_per_atom,
00631                        num_prim_per_shell, shell_types,
00632                        num_shells, numvoxels, voxelsize, origin);
00633 
00634     read_calc_orbitals(devpool, getenv("VMDDUMPORBITALS"));
00635   }
00636 #endif
00637 
00638 
00639 #if !defined(VMDORBUSETHRPOOL)
00640 #if defined(VMDTHREADS)
00641   int numcputhreads = wkf_thread_numprocessors();
00642 #else
00643   int numcputhreads = 1;
00644 #endif
00645 #endif
00646   if (rc!=0)  rc = evaluate_grid_fast(mol->app->cpucaps,
00647 #if defined(VMDORBUSETHRPOOL)
00648                                       mol->cpu_threadpool(), 
00649 #else
00650                                       numcputhreads,
00651 #endif
00652                                       numatoms, wave_f, basis_array,
00653                                       atompos, atom_basis,
00654                                       num_shells_per_atom, num_prim_per_shell,
00655                                       shell_types, numvoxels, voxelsize, 
00656                                       origin, density, grid_data);
00657 
00658   if (rc!=0) {
00659     msgErr << "Error computing orbital grid" << sendmsg;
00660     delete [] grid_data;
00661     grid_data=NULL;
00662 
00663     PROFILE_POP_RANGE(); // first return point
00664 
00665     return FALSE;
00666   }
00667 
00668   wkf_timer_stop(timer);
00669 
00670 #if 1
00671   if (getenv("VMDORBTIMING") != NULL) { 
00672     double gflops = (numgridpoints * flops_per_gridpoint()) / (wkf_timer_time(timer) * 1000000000.0);
00673 
00674     char strbuf[1024];
00675     sprintf(strbuf, "Orbital calc. time %.3f secs, %.2f gridpoints/sec, %.2f GFLOPS",
00676             wkf_timer_time(timer), 
00677             (((double) numgridpoints) / wkf_timer_time(timer)),
00678             gflops);
00679     msgInfo << strbuf << sendmsg;
00680   }
00681 #endif
00682 
00683   wkf_timer_destroy(timer);
00684 
00685   PROFILE_POP_RANGE(); // second return point
00686 
00687   return TRUE;
00688 }
00689 
00690 
00691 /*********************************************************
00692  *
00693  * This function calculates the value of the wavefunction
00694  * corresponding to a particular orbital at grid point
00695  * grid_x, grid_y, grid_z.
00696 
00697 
00698  Here's an example of a basis set definition for one atom:
00699 
00700   SHELL TYPE  PRIMITIVE        EXPONENT          CONTRACTION COEFFICIENT(S)
00701 
00702  Oxygen
00703 
00704       1   S       1          5484.6716600    0.001831074430
00705       1   S       2           825.2349460    0.013950172200
00706       1   S       3           188.0469580    0.068445078098
00707       1   S       4            52.9645000    0.232714335992
00708       1   S       5            16.8975704    0.470192897984
00709       1   S       6             5.7996353    0.358520852987
00710 
00711       2   L       7            15.5396162   -0.110777549525    0.070874268231
00712       2   L       8             3.5999336   -0.148026262701    0.339752839147
00713       2   L       9             1.0137618    1.130767015354    0.727158577316
00714 
00715       3   L      10             0.2700058    1.000000000000    1.000000000000
00716 
00717  *********************************************************/
00718 float Orbital::evaluate_grid_point(float grid_x, float grid_y, float grid_z) {
00719   int at;
00720   int prim, shell;
00721 
00722   // initialize value of orbital at gridpoint
00723   float value = 0.0;
00724 
00725   // initialize the wavefunction and shell counters
00726   int ifunc = 0; 
00727   int shell_counter = 0;
00728 
00729   // loop over all the QM atoms
00730   for (at=0; at<numatoms; at++) {
00731     int maxshell = num_shells_per_atom[at];
00732     int prim_counter = atom_basis[at];
00733 
00734     // calculate distance between grid point and center of atom
00735     float xdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
00736     float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00737     float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00738     float dist2 = xdist*xdist + ydist*ydist + zdist*zdist;
00739 
00740     // loop over the shells belonging to this atom
00741     // XXX this is maybe a misnomer because in split valence
00742     //     basis sets like 6-31G we have more than one basis
00743     //     function per (valence-)shell and we are actually
00744     //     looping over the individual contracted GTOs
00745     for (shell=0; shell < maxshell; shell++) {
00746       float contracted_gto = 0.0f;
00747 
00748       // Loop over the Gaussian primitives of this contracted 
00749       // basis function to build the atomic orbital
00750       int maxprim = num_prim_per_shell[shell_counter];
00751       int shelltype = shell_types[shell_counter];
00752       for (prim=0; prim < maxprim;  prim++) {
00753         float exponent       = basis_array[prim_counter    ];
00754         float contract_coeff = basis_array[prim_counter + 1];
00755         contracted_gto += contract_coeff * expf(-exponent*dist2);
00756         prim_counter += 2;
00757       }
00758 
00759       /* multiply with the appropriate wavefunction coefficient */
00760       float tmpshell=0;
00761       // Loop over the cartesian angular momenta of the shell.
00762       // avoid unnecessary branching and minimize use of pow()
00763       int i, j; 
00764       float xdp, ydp, zdp;
00765       float xdiv = 1.0f / xdist;
00766       for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
00767         int imax = shelltype - j; 
00768         for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
00769           tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
00770         }
00771       }
00772       value += tmpshell * contracted_gto;
00773 
00774       shell_counter++;
00775     } 
00776   }
00777 
00778   /* return the final value at grid point */
00779   return value;
00780 }
00781 
00782 
00783 //
00784 // Return the max number of primitives that occur in a basis function
00785 //
00786 int Orbital::max_primitives(void) {
00787   int maxprim=-1;
00788 
00789   int shell_counter = 0;
00790   for (int at=0; at<numatoms; at++) {
00791     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00792       int numprim = num_prim_per_shell[shell_counter];
00793       if (numprim > maxprim)
00794         maxprim = numprim; 
00795     }
00796   }
00797 
00798   return maxprim;
00799 }
00800 
00801 
00802 //
00803 // Return the maximum shell type used
00804 //
00805 int Orbital::max_shell_type(void) {
00806   int maxshell=-1;
00807 
00808   int shell_counter = 0;
00809   for (int at=0; at<numatoms; at++) {
00810     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00811       int shelltype = shell_types[shell_counter];
00812       shell_counter++;
00813       if (shelltype > maxshell)
00814         maxshell=shelltype;
00815     }
00816   }
00817 
00818   return maxshell;
00819 }
00820 
00821 
00822 //
00823 // count the maximum number of wavefunction coefficient accesses
00824 // required for the highest shell types contained in this orbital
00825 //
00826 int Orbital::max_wave_f_count(void) {
00827   int maxcount=0;
00828 
00829   int shell_counter = 0;
00830   for (int at=0; at<numatoms; at++) {
00831     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00832       int shelltype = shell_types[shell_counter];
00833       int i, j; 
00834       int count=0;
00835       for (i=0; i<=shelltype; i++) {
00836         int jmax = shelltype - i; 
00837         for (j=0; j<=jmax; j++) {
00838           count++;
00839         }
00840       }
00841       shell_counter++;
00842       if (count > maxcount)
00843         maxcount=count;
00844     }
00845   }
00846 
00847   return maxcount;
00848 }
00849 
00850 
00851 //
00852 // compute the FLOPS per grid point for performance measurement purposes
00853 //
00854 double Orbital::flops_per_gridpoint() {
00855   double flops=0.0;
00856 
00857   int shell_counter = 0;
00858   for (int at=0; at<numatoms; at++) {
00859     flops += 7;
00860 
00861     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00862       for (int prim=0; prim < num_prim_per_shell[shell_counter];  prim++)
00863         flops += 4; // expf() costs far more, but we count as one.
00864 
00865       int shelltype = shell_types[shell_counter];
00866 
00867       switch (shelltype) {
00868         // separately count for the hand-optimized cases
00869         case S_SHELL: flops += 2; break;
00870         case P_SHELL: flops += 8; break;
00871         case D_SHELL: flops += 17; break;
00872         case F_SHELL: flops += 30; break;
00873         case G_SHELL: flops += 50; break;
00874 
00875         // count up for catch-all loop
00876         default:
00877           int i, j; 
00878           for (i=0; i<=shelltype; i++) {
00879             int jmax = shelltype - i; 
00880             flops += 1;
00881             for (j=0; j<=jmax; j++) {
00882               flops += 6;
00883             }
00884           }
00885           break;
00886       }
00887 
00888       shell_counter++;
00889     } 
00890   }
00891 
00892   return flops;
00893 }
00894 
00895 
00896 //
00897 // Fast single-precision expf() implementation
00898 // Adapted from the free cephes math library on Netlib
00899 //   http://www.netlib.org/cephes/
00900 //
00901 // Cephes Math Library Release 2.2:  June, 1992
00902 // Copyright 1984, 1987, 1989 by Stephen L. Moshier
00903 // Direct inquiries to 30 Frost Street, Cambridge, MA 02140
00904 //
00905 static const float MAXNUMF =    3.4028234663852885981170418348451692544e38f;
00906 static const float MAXLOGF =   88.72283905206835f;
00907 static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */
00908 static const float LOG2EF  =    1.44269504088896341f;
00909 static const float C1      =    0.693359375f;
00910 static const float C2      =   -2.12194440e-4f;
00911 
00912 static inline float cephesfastexpf(float x) {
00913   float z;
00914   int n;
00915   
00916   if(x > MAXLOGF) 
00917     return MAXNUMF;
00918 
00919   if(x < MINLOGF) 
00920     return 0.0;
00921 
00922   // Express e^x = e^g 2^n = e^g e^(n loge(2)) = e^(g + n loge(2))
00923   z = floorf( LOG2EF * x + 0.5f ); // floor() truncates toward -infinity.
00924   x -= z * C1;
00925   x -= z * C2;
00926   n = (int) z;
00927 
00928   z = x * x;
00929   // Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9.
00930   z = ((((( 1.9875691500E-4f  * x + 1.3981999507E-3f) * x
00931           + 8.3334519073E-3f) * x + 4.1665795894E-2f) * x
00932           + 1.6666665459E-1f) * x + 5.0000001201E-1f) * z + x + 1.0f;
00933 
00934   x = ldexpf(z, n); // multiply by power of 2
00935   return x;
00936 }
00937 
00938 
00939 
00940 
00941 /*
00942  * David J. Hardy
00943  * 12 Dec 2008
00944  *
00945  * aexpfnx() - Approximate expf() for negative x.
00946  *
00947  * Assumes that x <= 0.
00948  *
00949  * Assumes IEEE format for single precision float, specifically:
00950  * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00951  *
00952  * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00953  * multiplication of a fast calculation for 2^(-N).  The interpolation
00954  * uses a linear blending of 3rd degree Taylor polynomials at the end
00955  * points, so the approximation is once differentiable.
00956  *
00957  * The error is small (max relative error per interval is calculated
00958  * to be 0.131%, with a max absolute error of -0.000716).
00959  *
00960  * The cutoff is chosen so as to speed up the computation by early
00961  * exit from function, with the value chosen to give less than the
00962  * the max absolute error.  Use of a cutoff is unnecessary, except
00963  * for needing to shift smallest floating point numbers to zero,
00964  * i.e. you could remove cutoff and replace by:
00965  *
00966  * #define MINXNZ  -88.0296919311130  // -127 * log(2)
00967  *
00968  *   if (x < MINXNZ) return 0.f;
00969  *
00970  * Use of a cutoff causes a discontinuity which can be eliminated
00971  * through the use of a switching function.
00972  *
00973  * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00974  * the interval and weighting their respective Taylor polynomials by the
00975  * kth order Lagrange interpolant through those nodes.  The wiggle in the
00976  * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00977  * can be reduced by using Chebyshev nodes.
00978  */
00979 
00980 #define MLOG2EF    -1.44269504088896f
00981 
00982 /*
00983  * Interpolating coefficients for linear blending of the
00984  * 3rd degree Taylor expansion of 2^x about 0 and -1.
00985  */
00986 #define SCEXP0     1.0000000000000000f
00987 #define SCEXP1     0.6987082824680118f
00988 #define SCEXP2     0.2633174272827404f
00989 #define SCEXP3     0.0923611991471395f
00990 #define SCEXP4     0.0277520543324108f
00991 
00992 /* for single precision float */
00993 #define EXPOBIAS   127
00994 #define EXPOSHIFT   23
00995 
00996 /* cutoff is optional, but can help avoid unnecessary work */
00997 #define ACUTOFF    -10
00998 
00999 typedef union flint_t {
01000   float f;
01001   int n;
01002 } flint;
01003 
01004 float aexpfnx(float x) {
01005   /* assume x <= 0 */
01006   float mb;
01007   int mbflr;
01008   float d;
01009   float sy;
01010   flint scalfac;
01011 
01012   if (x < ACUTOFF) return 0.f;
01013 
01014   mb = x * MLOG2EF;    /* change base to 2, mb >= 0 */
01015   mbflr = (int) mb;    /* get int part, floor() */
01016   d = mbflr - mb;      /* remaining exponent, -1 < d <= 0 */
01017   sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
01018                        /* approx with linear blend of Taylor polys */
01019   scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  /* 2^(-mbflr) */
01020   return (sy * scalfac.f);  /* scaled approx */
01021 }
01022 
01023 
01024 
01025 //
01026 // Optimized molecular orbital grid evaluation code
01027 //
01028 #define S_SHELL 0
01029 #define P_SHELL 1
01030 #define D_SHELL 2
01031 #define F_SHELL 3
01032 #define G_SHELL 4
01033 #define H_SHELL 5
01034 
01035 int evaluate_grid(int numatoms,
01036                   const float *wave_f, const float *basis_array,
01037                   const float *atompos,
01038                   const int *atom_basis,
01039                   const int *num_shells_per_atom,
01040                   const int *num_prim_per_shell,
01041                   const int *shell_types,
01042                   const int *numvoxels,
01043                   float voxelsize,
01044                   const float *origin,
01045                   int density,
01046                   float * orbitalgrid) {
01047   if (!orbitalgrid)
01048     return -1;
01049 
01050   int nx, ny, nz;
01051   // Calculate the value of the orbital at each gridpoint and store in 
01052   // the current oribtalgrid array
01053   int numgridxy = numvoxels[0]*numvoxels[1];
01054   for (nz=0; nz<numvoxels[2]; nz++) {
01055     float grid_x, grid_y, grid_z;
01056     grid_z = origin[2] + nz * voxelsize;
01057     for (ny=0; ny<numvoxels[1]; ny++) {
01058       grid_y = origin[1] + ny * voxelsize;
01059       int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01060       for (nx=0; nx<numvoxels[0]; nx++) {
01061         grid_x = origin[0] + nx * voxelsize;
01062 
01063         // calculate the value of the wavefunction of the
01064         // selected orbital at the current grid point
01065         int at;
01066         int prim, shell;
01067 
01068         // initialize value of orbital at gridpoint
01069         float value = 0.0;
01070 
01071         // initialize the wavefunction and shell counters
01072         int ifunc = 0; 
01073         int shell_counter = 0;
01074 
01075         // loop over all the QM atoms
01076         for (at=0; at<numatoms; at++) {
01077           int maxshell = num_shells_per_atom[at];
01078           int prim_counter = atom_basis[at];
01079 
01080           // calculate distance between grid point and center of atom
01081           float xdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
01082           float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01083           float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01084 
01085           float xdist2 = xdist*xdist;
01086           float ydist2 = ydist*ydist;
01087           float zdist2 = zdist*zdist;
01088           float xdist3 = xdist2*xdist;
01089           float ydist3 = ydist2*ydist;
01090           float zdist3 = zdist2*zdist;
01091 
01092           float dist2 = xdist2 + ydist2 + zdist2;
01093 
01094           // loop over the shells belonging to this atom
01095           // XXX this is maybe a misnomer because in split valence
01096           //     basis sets like 6-31G we have more than one basis
01097           //     function per (valence-)shell and we are actually
01098           //     looping over the individual contracted GTOs
01099           for (shell=0; shell < maxshell; shell++) {
01100             float contracted_gto = 0.0f;
01101 
01102             // Loop over the Gaussian primitives of this contracted 
01103             // basis function to build the atomic orbital
01104             // 
01105             // XXX there's a significant opportunity here for further
01106             //     speedup if we replace the entire set of primitives
01107             //     with the single gaussian that they are attempting 
01108             //     to model.  This could give us another 6x speedup in 
01109             //     some of the common/simple cases.
01110             int maxprim = num_prim_per_shell[shell_counter];
01111             int shelltype = shell_types[shell_counter];
01112             for (prim=0; prim<maxprim; prim++) {
01113               float exponent       = basis_array[prim_counter    ];
01114               float contract_coeff = basis_array[prim_counter + 1];
01115 
01116               // XXX By premultiplying the stored exponent factors etc,
01117               //     we should be able to use exp2f() rather than exp(),
01118               //     saving several FLOPS per iteration of this loop
01119 #if defined(__GNUC__) && !defined(__ICC)
01120               // Use David Hardy's fast spline approximation instead
01121               // Works well for GCC, but runs slower for Intel C.
01122               contracted_gto += contract_coeff * aexpfnx(-exponent*dist2);
01123 #elif defined(__ICC)
01124               // When compiling with ICC, we'll use an inlined 
01125               // single-precision expf() implementation based on the
01126               // cephes math library found on Netlib.  This outruns the
01127               // standard glibc expf() by over 2x in this algorithm.
01128               contracted_gto += contract_coeff * cephesfastexpf(-exponent*dist2);
01129 #else
01130               // XXX By far the most costly operation here is exp(),
01131               //     for gcc builds, exp() accounts for 90% of the runtime
01132               contracted_gto += contract_coeff * expf(-exponent*dist2);
01133 #endif
01134               prim_counter += 2;
01135             }
01136 
01137             /* multiply with the appropriate wavefunction coefficient */
01138             float tmpshell=0;
01139             switch (shelltype) {
01140               case S_SHELL:
01141                 value += wave_f[ifunc++] * contracted_gto;
01142                 break;
01143 
01144               case P_SHELL:
01145                 tmpshell += wave_f[ifunc++] * xdist;
01146                 tmpshell += wave_f[ifunc++] * ydist;
01147                 tmpshell += wave_f[ifunc++] * zdist;
01148                 value += tmpshell * contracted_gto;
01149                 break;
01150 
01151               case D_SHELL:
01152                 tmpshell += wave_f[ifunc++] * xdist2;
01153                 tmpshell += wave_f[ifunc++] * xdist * ydist;
01154                 tmpshell += wave_f[ifunc++] * ydist2;
01155                 tmpshell += wave_f[ifunc++] * xdist * zdist;
01156                 tmpshell += wave_f[ifunc++] * ydist * zdist;
01157                 tmpshell += wave_f[ifunc++] * zdist2;
01158                 value += tmpshell * contracted_gto;
01159                 break;
01160 
01161               case F_SHELL:
01162                 tmpshell += wave_f[ifunc++] * xdist3;         // xxx
01163                 tmpshell += wave_f[ifunc++] * xdist2 * ydist; // xxy
01164                 tmpshell += wave_f[ifunc++] * ydist2 * xdist; // xyy
01165                 tmpshell += wave_f[ifunc++] * ydist3;         // yyy
01166                 tmpshell += wave_f[ifunc++] * xdist2 * zdist; // xxz
01167                 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist; // xyz
01168                 tmpshell += wave_f[ifunc++] * ydist2 * zdist; // yyz
01169                 tmpshell += wave_f[ifunc++] * zdist2 * xdist; // xzz
01170                 tmpshell += wave_f[ifunc++] * zdist2 * ydist; // yzz
01171                 tmpshell += wave_f[ifunc++] * zdist3;         // zzz
01172                 value += tmpshell * contracted_gto;
01173                 break;
01174  
01175               case G_SHELL:
01176                 tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
01177                 tmpshell += wave_f[ifunc++] * xdist3 * ydist;  // xxxy
01178                 tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
01179                 tmpshell += wave_f[ifunc++] * ydist3 * xdist;  // xyyy
01180                 tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
01181                 tmpshell += wave_f[ifunc++] * xdist3 * zdist;  // xxxz
01182                 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
01183                 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
01184                 tmpshell += wave_f[ifunc++] * ydist3 * zdist;  // yyyz
01185                 tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
01186                 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
01187                 tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
01188                 tmpshell += wave_f[ifunc++] * zdist3 * xdist;  // zzzx
01189                 tmpshell += wave_f[ifunc++] * zdist3 * ydist;  // zzzy
01190                 tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
01191                 value += tmpshell * contracted_gto;
01192                 break;
01193 
01194               default:
01195 #if 1
01196                 // avoid unnecessary branching and minimize use of pow()
01197                 int i, j; 
01198                 float xdp, ydp, zdp;
01199                 float xdiv = 1.0f / xdist;
01200                 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01201                   int imax = shelltype - j; 
01202                   for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01203                     tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01204                   }
01205                 }
01206                 value += tmpshell * contracted_gto;
01207 #else
01208                 int i, j, k;
01209                 for (k=0; k<=shelltype; k++) {
01210                   for (j=0; j<=shelltype; j++) {
01211                     for (i=0; i<=shelltype; i++) {
01212                       if (i+j+k==shelltype) {
01213                         value += wave_f[ifunc++] * contracted_gto
01214                           * pow(xdist,i) * pow(ydist,j) * pow(zdist,k);
01215                       }
01216                     }
01217                   }
01218                 }
01219 #endif
01220             } // end switch
01221 
01222             shell_counter++;
01223           } // end shell
01224         } // end atom
01225 
01226         // return either orbital density or orbital wavefunction amplitude
01227         if (density) {
01228           float orbdensity = value * value;
01229           if (value < 0.0)
01230             orbdensity = -orbdensity;
01231           orbitalgrid[gaddrzy + nx] = orbdensity;
01232         } else {
01233           orbitalgrid[gaddrzy + nx] = value;
01234         }
01235       }
01236     }
01237   }
01238 
01239   return 0;
01240 }
01241 
01242 
01243 
01244 #if defined(VMDORBUSESSE) && defined(__SSE2__)
01245 
01246 #if 0 && !defined(_WIN64) // MSVC doesn't support old MMX intrinsics
01247 //
01248 // Adaptation of the Cephes exp() to an SSE-ized exp_ps() routine 
01249 // originally by Julien Pommier
01250 //   Copyright (C) 2007  Julien Pommier, ZLIB license
01251 //   http://gruntthepeon.free.fr/ssemath/
01252 // 
01253 #ifdef _MSC_VER /* visual c++ */
01254 # define ALIGN16_BEG __declspec(align(16))
01255 # define ALIGN16_END
01256 #else /* gcc or icc */
01257 # define ALIGN16_BEG
01258 # define ALIGN16_END __attribute__((aligned(16)))
01259 #endif
01260 
01261 #define _PS_CONST(Name, Val)                                            \
01262   static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
01263 #define _PI32_CONST(Name, Val)                                            \
01264   static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
01265 
01266 _PS_CONST(exp_hi,       88.3762626647949f);
01267 _PS_CONST(exp_lo,       -88.3762626647949f);
01268 
01269 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
01270 _PS_CONST(cephes_exp_C1, 0.693359375);
01271 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
01272 
01273 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
01274 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
01275 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
01276 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
01277 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
01278 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
01279 _PS_CONST(one, 1.0);
01280 _PS_CONST(half, 0.5);
01281 
01282 _PI32_CONST(0x7f, 0x7f);
01283 
01284 typedef union xmm_mm_union {
01285   __m128 xmm;
01286   __m64 mm[2];
01287 } xmm_mm_union;
01288 
01289 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
01290     xmm_mm_union u; u.xmm = xmm_;                   \
01291     mm0_ = u.mm[0];                                 \
01292     mm1_ = u.mm[1];                                 \
01293 }
01294 
01295 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
01296     xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
01297   }
01298 
01299 __m128 exp_ps(__m128 x) {
01300   __m128 tmp = _mm_setzero_ps(), fx;
01301   __m64 mm0, mm1;
01302 
01303   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
01304   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
01305 
01306   /* express exp(x) as exp(g + n*log(2)) */
01307   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
01308   fx = _mm_add_ps(fx,*(__m128*)_ps_half);
01309 
01310   /* how to perform a floorf with SSE: just below */
01311   /* step 1 : cast to int */
01312   tmp = _mm_movehl_ps(tmp, fx);
01313   mm0 = _mm_cvttps_pi32(fx);
01314   mm1 = _mm_cvttps_pi32(tmp);
01315   /* step 2 : cast back to float */
01316   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
01317   /* if greater, substract 1 */
01318   __m128 mask = _mm_cmpgt_ps(tmp, fx);
01319   mask = _mm_and_ps(mask, *(__m128*)_ps_one);
01320   fx = _mm_sub_ps(tmp, mask);
01321 
01322   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
01323   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
01324   x = _mm_sub_ps(x, tmp);
01325   x = _mm_sub_ps(x, z);
01326 
01327   z = _mm_mul_ps(x,x);
01328 
01329   __m128 y = *(__m128*)_ps_cephes_exp_p0;
01330   y = _mm_mul_ps(y, x);
01331   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
01332   y = _mm_mul_ps(y, x);
01333   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
01334   y = _mm_mul_ps(y, x);
01335   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
01336   y = _mm_mul_ps(y, x);
01337   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
01338   y = _mm_mul_ps(y, x);
01339   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
01340   y = _mm_mul_ps(y, z);
01341   y = _mm_add_ps(y, x);
01342   y = _mm_add_ps(y, *(__m128*)_ps_one);
01343 
01344   /* build 2^n */
01345   z = _mm_movehl_ps(z, fx);
01346   mm0 = _mm_cvttps_pi32(fx);
01347   mm1 = _mm_cvttps_pi32(z);
01348   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
01349   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
01350   mm0 = _mm_slli_pi32(mm0, 23);
01351   mm1 = _mm_slli_pi32(mm1, 23);
01352 
01353   __m128 pow2n;
01354   COPY_MM_TO_XMM(mm0, mm1, pow2n);
01355 
01356   y = _mm_mul_ps(y, pow2n);
01357   _mm_empty();
01358   return y;
01359 }
01360 #endif // MSVC doesn't support old MMX intrinsics
01361 
01362 
01363 //
01364 // David J. Hardy
01365 // 12 Dec 2008
01366 //
01367 // aexpfnxsse() - SSE2 version of aexpfnx().
01368 //
01369 //
01370 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
01371 #define __align(X)  __attribute__((aligned(X) ))
01372 #if (__GNUC__ < 4)
01373 #define MISSING_mm_cvtsd_f64
01374 #endif
01375 #else
01376 #define __align(X) __declspec(align(X) )
01377 #endif
01378 
01379 #define MLOG2EF    -1.44269504088896f
01380 
01381 /*
01382  * Interpolating coefficients for linear blending of the
01383  * 3rd degree Taylor expansion of 2^x about 0 and -1.
01384  */
01385 #define SCEXP0     1.0000000000000000f
01386 #define SCEXP1     0.6987082824680118f
01387 #define SCEXP2     0.2633174272827404f
01388 #define SCEXP3     0.0923611991471395f
01389 #define SCEXP4     0.0277520543324108f
01390 
01391 /* for single precision float */
01392 #define EXPOBIAS   127
01393 #define EXPOSHIFT   23
01394 
01395 /* cutoff is optional, but can help avoid unnecessary work */
01396 #define ACUTOFF    -10
01397 
01398 typedef union SSEreg_t {
01399   __m128  f;  // 4x float (SSE)
01400   __m128i i;  // 4x 32-bit int (SSE2)
01401 } SSEreg;
01402 
01403 __m128 aexpfnxsse(__m128 x) {
01404   __align(16) SSEreg scal;
01405   __align(16) SSEreg n;
01406   __align(16) SSEreg y;
01407 
01408   scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF));  /* Is x within cutoff? */
01409 
01410   /* If all x are outside of cutoff, return 0s. */
01411   if (_mm_movemask_ps(scal.f) == 0) {
01412     return _mm_setzero_ps();
01413   }
01414   /* Otherwise, scal.f contains mask to be ANDed with the scale factor */
01415 
01416   /*
01417    * Convert base:  exp(x) = 2^(N-d) where N is integer and 0 <= d < 1.
01418    *
01419    * Below we calculate n=N and x=-d, with "y" for temp storage,
01420    * calculate floor of x*log2(e) and subtract to get -d.
01421    */
01422   y.f = _mm_mul_ps(x, _mm_set_ps1(MLOG2EF));
01423   n.i = _mm_cvttps_epi32(y.f);
01424   x = _mm_cvtepi32_ps(n.i);
01425   x = _mm_sub_ps(x, y.f);
01426 
01427   /*
01428    * Approximate 2^{-d}, 0 <= d < 1, by interpolation.
01429    * Perform Horner's method to evaluate interpolating polynomial.
01430    */
01431   y.f = _mm_mul_ps(x, _mm_set_ps1(SCEXP4));      /* for x^4 term */
01432   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    /* for x^3 term */
01433   y.f = _mm_mul_ps(y.f, x);
01434   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    /* for x^2 term */
01435   y.f = _mm_mul_ps(y.f, x);
01436   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    /* for x^1 term */
01437   y.f = _mm_mul_ps(y.f, x);
01438   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    /* for x^0 term */
01439 
01440   /*
01441    * Calculate 2^N exactly by directly manipulating floating point exponent.
01442    * Bitwise AND the result with scal.f mask to create the scale factor,
01443    * then use it to scale y for the final result.
01444    */
01445   n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
01446   n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
01447   scal.f = _mm_and_ps(scal.f, n.f);
01448   y.f = _mm_mul_ps(y.f, scal.f);
01449 
01450   return y.f;
01451 }
01452 
01453 
01454 int evaluate_grid_sse(int numatoms,
01455                   const float *wave_f, const float *basis_array,
01456                   const float *atompos,
01457                   const int *atom_basis,
01458                   const int *num_shells_per_atom,
01459                   const int *num_prim_per_shell,
01460                   const int *shell_types,
01461                   const int *numvoxels,
01462                   float voxelsize,
01463                   const float *origin,
01464                   int density,
01465                   float * orbitalgrid) {
01466   if (!orbitalgrid)
01467     return -1;
01468 
01469   int nx, ny, nz;
01470   __align(16) float sxdelta[4]; // 16-byte aligned for SSE
01471   for (nx=0; nx<4; nx++) 
01472     sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
01473 
01474   // Calculate the value of the orbital at each gridpoint and store in 
01475   // the current oribtalgrid array
01476   int numgridxy = numvoxels[0]*numvoxels[1];
01477   for (nz=0; nz<numvoxels[2]; nz++) {
01478     float grid_x, grid_y, grid_z;
01479     grid_z = origin[2] + nz * voxelsize;
01480     for (ny=0; ny<numvoxels[1]; ny++) {
01481       grid_y = origin[1] + ny * voxelsize;
01482       int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01483       for (nx=0; nx<numvoxels[0]; nx+=4) {
01484         grid_x = origin[0] + nx * voxelsize;
01485 
01486         // calculate the value of the wavefunction of the
01487         // selected orbital at the current grid point
01488         int at;
01489         int prim, shell;
01490 
01491         // initialize value of orbital at gridpoint
01492         __m128 value = _mm_setzero_ps();
01493 
01494         // initialize the wavefunction and shell counters
01495         int ifunc = 0; 
01496         int shell_counter = 0;
01497 
01498         // loop over all the QM atoms
01499         for (at=0; at<numatoms; at++) {
01500           int maxshell = num_shells_per_atom[at];
01501           int prim_counter = atom_basis[at];
01502 
01503           // calculate distance between grid point and center of atom
01504           float sxdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
01505           float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01506           float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01507 
01508           float sydist2 = sydist*sydist;
01509           float szdist2 = szdist*szdist;
01510           float yzdist2 = sydist2 + szdist2;
01511 
01512           __m128 xdelta = _mm_load_ps(&sxdelta[0]); // aligned load
01513           __m128 xdist  = _mm_load_ps1(&sxdist);
01514           xdist = _mm_add_ps(xdist, xdelta);
01515           __m128 ydist  = _mm_load_ps1(&sydist);
01516           __m128 zdist  = _mm_load_ps1(&szdist);
01517           __m128 xdist2 = _mm_mul_ps(xdist, xdist);
01518           __m128 ydist2 = _mm_mul_ps(ydist, ydist);
01519           __m128 zdist2 = _mm_mul_ps(zdist, zdist);
01520           __m128 dist2  = _mm_load_ps1(&yzdist2); 
01521           dist2 = _mm_add_ps(dist2, xdist2);
01522  
01523           // loop over the shells belonging to this atom
01524           // XXX this is maybe a misnomer because in split valence
01525           //     basis sets like 6-31G we have more than one basis
01526           //     function per (valence-)shell and we are actually
01527           //     looping over the individual contracted GTOs
01528           for (shell=0; shell < maxshell; shell++) {
01529             __m128 contracted_gto = _mm_setzero_ps();
01530 
01531             // Loop over the Gaussian primitives of this contracted 
01532             // basis function to build the atomic orbital
01533             // 
01534             // XXX there's a significant opportunity here for further
01535             //     speedup if we replace the entire set of primitives
01536             //     with the single gaussian that they are attempting 
01537             //     to model.  This could give us another 6x speedup in 
01538             //     some of the common/simple cases.
01539             int maxprim = num_prim_per_shell[shell_counter];
01540             int shelltype = shell_types[shell_counter];
01541             for (prim=0; prim<maxprim; prim++) {
01542               // XXX pre-negate exponent value
01543               float exponent       = -basis_array[prim_counter    ];
01544               float contract_coeff =  basis_array[prim_counter + 1];
01545 
01546               // contracted_gto += contract_coeff * exp(-exponent*dist2);
01547               __m128 expval = _mm_mul_ps(_mm_load_ps1(&exponent), dist2);
01548               // SSE expf() required here
01549 #if 1
01550               __m128 retval = aexpfnxsse(expval);
01551 #else
01552               __m128 retval = exp_ps(expval);
01553 #endif
01554               __m128 ctmp = _mm_mul_ps(_mm_load_ps1(&contract_coeff), retval);
01555               contracted_gto = _mm_add_ps(contracted_gto, ctmp);
01556               prim_counter += 2;
01557             }
01558 
01559             /* multiply with the appropriate wavefunction coefficient */
01560             __m128 tmpshell = _mm_setzero_ps();
01561             switch (shelltype) {
01562               case S_SHELL:
01563                 value = _mm_add_ps(value, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), contracted_gto));
01564                 break;
01565 
01566               case P_SHELL:
01567                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist));
01568                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist));
01569                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist));
01570                 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01571                 break;
01572 
01573               case D_SHELL:
01574                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist2));
01575                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, ydist)));
01576                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist2));
01577                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, zdist)));
01578                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist, zdist)));
01579                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist2));
01580                 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01581                 break;
01582 
01583               case F_SHELL:
01584                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, xdist)));
01585                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, ydist)));
01586                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, xdist)));
01587                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, ydist)));
01588                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, zdist)));
01589                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(_mm_mul_ps(xdist, ydist), zdist)));
01590                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, zdist)));
01591                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, xdist)));
01592                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, ydist)));
01593                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, zdist)));
01594                 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01595                 break;
01596  
01597 #if 0
01598               default:
01599                 // avoid unnecessary branching and minimize use of pow()
01600                 int i, j; 
01601                 float xdp, ydp, zdp;
01602                 float xdiv = 1.0f / xdist;
01603                 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01604                   int imax = shelltype - j; 
01605                   for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01606                     tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01607                   }
01608                 }
01609                 value += tmpshell * contracted_gto;
01610 #endif
01611             } // end switch
01612 
01613             shell_counter++;
01614           } // end shell
01615         } // end atom
01616 
01617         // return either orbital density or orbital wavefunction amplitude
01618         if (density) {
01619           __m128 mask = _mm_cmplt_ps(value, _mm_setzero_ps());
01620           __m128 sqdensity = _mm_mul_ps(value, value);
01621           __m128 orbdensity = sqdensity;
01622           __m128 nsqdensity = _mm_and_ps(sqdensity, mask);
01623           orbdensity = _mm_sub_ps(orbdensity, nsqdensity);
01624           orbdensity = _mm_sub_ps(orbdensity, nsqdensity);
01625           _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], orbdensity);
01626         } else {
01627           _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], value);
01628         }
01629       }
01630     }
01631   }
01632 
01633   return 0;
01634 }
01635 
01636 #endif
01637 
01638 
01639 
01640 #if defined(VMDORBUSEVSX) && defined(__VSX__)
01641 //
01642 // John Stone, June 2016
01643 //
01644 // aexpfnxsse() - VSX version of aexpfnx().
01645 //
01646 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
01647 #define __align(X)  __attribute__((aligned(X) ))
01648 #else
01649 #define __align(X) __declspec(align(X) )
01650 #endif
01651 
01652 #define MLOG2EF    -1.44269504088896f
01653 
01654 /*
01655  * Interpolating coefficients for linear blending of the
01656  * 3rd degree Taylor expansion of 2^x about 0 and -1.
01657  */
01658 #define SCEXP0     1.0000000000000000f
01659 #define SCEXP1     0.6987082824680118f
01660 #define SCEXP2     0.2633174272827404f
01661 #define SCEXP3     0.0923611991471395f
01662 #define SCEXP4     0.0277520543324108f
01663 
01664 /* for single precision float */
01665 #define EXPOBIAS   127
01666 #define EXPOSHIFT   23
01667 
01668 /* cutoff is optional, but can help avoid unnecessary work */
01669 #define ACUTOFF    -10
01670 
01671 #if 0
01672 vector float ref_expf(vector float x) {
01673   vector float result;
01674 
01675   int i;
01676   for (i=0; i<4; i++) {
01677     result[i] = expf(x[i]);
01678   }
01679 
01680   return result;
01681 }
01682 #endif
01683 
01684 vector float aexpfnxvsx(vector float x) {
01685   // scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF));  /* Is x within cutoff? */
01686   // 
01687   // If all x are outside of cutoff, return 0s.
01688   // if (_mm_movemask_ps(scal.f) == 0) {
01689   //   return _mm_setzero_ps();
01690   // }
01691   // Otherwise, scal.f contains mask to be ANDed with the scale factor
01692 
01693   /*
01694    * Convert base:  exp(x) = 2^(N-d) where N is integer and 0 <= d < 1.
01695    *
01696    * Below we calculate n=N and x=-d, with "y" for temp storage,
01697    * calculate floor of x*log2(e) and subtract to get -d.
01698    */
01699   vector float mb = vec_mul(x, vec_splats(MLOG2EF));
01700   vector float mbflr = vec_floor(mb);
01701   vector float d = vec_sub(mbflr, mb);
01702   vector float y;
01703 
01704   // Approximate 2^{-d}, 0 <= d < 1, by interpolation.
01705   // Perform Horner's method to evaluate interpolating polynomial.
01706   y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3));
01707   y = vec_madd(y, d, vec_splats(SCEXP2));
01708   y = vec_madd(y, d, vec_splats(SCEXP1));
01709   y = vec_madd(y, d, vec_splats(SCEXP0));
01710 
01711   return vec_mul(y, vec_expte(-mbflr));
01712 }
01713 
01714 
01715 int evaluate_grid_vsx(int numatoms,
01716                   const float *wave_f, const float *basis_array,
01717                   const float *atompos,
01718                   const int *atom_basis,
01719                   const int *num_shells_per_atom,
01720                   const int *num_prim_per_shell,
01721                   const int *shell_types,
01722                   const int *numvoxels,
01723                   float voxelsize,
01724                   const float *origin,
01725                   int density,
01726                   float * orbitalgrid) {
01727   if (!orbitalgrid)
01728     return -1;
01729 
01730   int nx, ny, nz;
01731   __attribute__((aligned(16))) float sxdelta[4]; // 16-byte aligned for VSX
01732   for (nx=0; nx<4; nx++) 
01733     sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
01734 
01735   // Calculate the value of the orbital at each gridpoint and store in 
01736   // the current oribtalgrid array
01737   int numgridxy = numvoxels[0]*numvoxels[1];
01738   for (nz=0; nz<numvoxels[2]; nz++) {
01739     float grid_x, grid_y, grid_z;
01740     grid_z = origin[2] + nz * voxelsize;
01741     for (ny=0; ny<numvoxels[1]; ny++) {
01742       grid_y = origin[1] + ny * voxelsize;
01743       int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01744       for (nx=0; nx<numvoxels[0]; nx+=4) {
01745         grid_x = origin[0] + nx * voxelsize;
01746 
01747         // calculate the value of the wavefunction of the
01748         // selected orbital at the current grid point
01749         int at;
01750         int prim, shell;
01751 
01752         // initialize value of orbital at gridpoint
01753         vector float value = vec_splats(0.0f); 
01754 
01755         // initialize the wavefunction and shell counters
01756         int ifunc = 0; 
01757         int shell_counter = 0;
01758 
01759         // loop over all the QM atoms
01760         for (at=0; at<numatoms; at++) {
01761           int maxshell = num_shells_per_atom[at];
01762           int prim_counter = atom_basis[at];
01763 
01764           // calculate distance between grid point and center of atom
01765           float sxdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
01766           float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01767           float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01768 
01769           float sydist2 = sydist*sydist;
01770           float szdist2 = szdist*szdist;
01771           float yzdist2 = sydist2 + szdist2;
01772 
01773           vector float xdelta =  *((__vector float *) &sxdelta[0]); // aligned load
01774           vector float xdist  = vec_splats(sxdist);
01775           xdist = vec_add(xdist, xdelta);
01776           vector float ydist  = vec_splats(sydist);
01777           vector float zdist  = vec_splats(szdist);
01778           vector float xdist2 = vec_mul(xdist, xdist);
01779           vector float ydist2 = vec_mul(ydist, ydist);
01780           vector float zdist2 = vec_mul(zdist, zdist);
01781           vector float dist2  = vec_splats(yzdist2); 
01782           dist2 = vec_add(dist2, xdist2);
01783  
01784           // loop over the shells belonging to this atom
01785           // XXX this is maybe a misnomer because in split valence
01786           //     basis sets like 6-31G we have more than one basis
01787           //     function per (valence-)shell and we are actually
01788           //     looping over the individual contracted GTOs
01789           for (shell=0; shell < maxshell; shell++) {
01790             vector float contracted_gto = vec_splats(0.0f);
01791 
01792             // Loop over the Gaussian primitives of this contracted 
01793             // basis function to build the atomic orbital
01794             // 
01795             // XXX there's a significant opportunity here for further
01796             //     speedup if we replace the entire set of primitives
01797             //     with the single gaussian that they are attempting 
01798             //     to model.  This could give us another 6x speedup in 
01799             //     some of the common/simple cases.
01800             int maxprim = num_prim_per_shell[shell_counter];
01801             int shelltype = shell_types[shell_counter];
01802             for (prim=0; prim<maxprim; prim++) {
01803               // XXX pre-negate exponent value
01804               float exponent       = -basis_array[prim_counter    ];
01805               float contract_coeff =  basis_array[prim_counter + 1];
01806 
01807               // contracted_gto += contract_coeff * exp(-exponent*dist2);
01808               vector float expval = vec_mul(vec_splats(exponent), dist2);
01809 
01810               // VSX expf() required here
01811               vector float retval = aexpfnxvsx(expval);
01812 
01813               vector float ctmp = vec_mul(vec_splats(contract_coeff), retval);
01814               contracted_gto = vec_add(contracted_gto, ctmp);
01815               prim_counter += 2;
01816             }
01817 
01818             /* multiply with the appropriate wavefunction coefficient */
01819             vector float tmpshell = vec_splats(0.0f);
01820             switch (shelltype) {
01821               case S_SHELL:
01822                 value = vec_add(value, vec_mul(vec_splats(wave_f[ifunc++]), contracted_gto));
01823                 break;
01824 
01825               case P_SHELL:
01826                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), xdist));
01827                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), ydist));
01828                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), zdist));
01829                 value = vec_add(value, vec_mul(tmpshell, contracted_gto));
01830                 break;
01831 
01832               case D_SHELL:
01833                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), xdist2));
01834                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist, ydist)));
01835                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), ydist2));
01836                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist, zdist)));
01837                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist, zdist)));
01838                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), zdist2));
01839                 value = vec_add(value, vec_mul(tmpshell, contracted_gto));
01840                 break;
01841 
01842               case F_SHELL:
01843                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, xdist)));
01844                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, ydist)));
01845                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, xdist)));
01846                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, ydist)));
01847                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(xdist2, zdist)));
01848                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(vec_mul(xdist, ydist), zdist)));
01849                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(ydist2, zdist)));
01850                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, xdist)));
01851                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, ydist)));
01852                 tmpshell = vec_add(tmpshell, vec_mul(vec_splats(wave_f[ifunc++]), vec_mul(zdist2, zdist)));
01853                 value = vec_add(value, vec_mul(tmpshell, contracted_gto));
01854                 break;
01855  
01856 #if 0
01857               default:
01858                 // avoid unnecessary branching and minimize use of pow()
01859                 int i, j; 
01860                 float xdp, ydp, zdp;
01861                 float xdiv = 1.0f / xdist;
01862                 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01863                   int imax = shelltype - j; 
01864                   for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01865                     tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01866                   }
01867                 }
01868                 value += tmpshell * contracted_gto;
01869 #endif
01870             } // end switch
01871 
01872             shell_counter++;
01873           } // end shell
01874         } // end atom
01875 
01876         // return either orbital density or orbital wavefunction amplitude
01877         if (density) {
01878           value = vec_cpsgn(value, vec_mul(value, value));
01879 
01880           float *ufptr = &orbitalgrid[gaddrzy + nx];
01881           ufptr[0] = value[0];
01882           ufptr[1] = value[1];
01883           ufptr[2] = value[2];
01884           ufptr[3] = value[3];
01885         } else {
01886           float *ufptr = &orbitalgrid[gaddrzy + nx];
01887           ufptr[0] = value[0];
01888           ufptr[1] = value[1];
01889           ufptr[2] = value[2];
01890           ufptr[3] = value[3];
01891         }
01892       }
01893     }
01894   }
01895 
01896   return 0;
01897 }
01898 
01899 #endif
01900 
01901 
01902 
01903 //
01904 // Multithreaded molecular orbital computation engine
01905 //
01906 
01907 typedef struct {
01908   wkf_cpu_caps_t *cpucaps;
01909   int numatoms;
01910   const float *wave_f;
01911   const float *basis_array;
01912   const float *atompos;
01913   const int *atom_basis;
01914   const int *num_shells_per_atom;
01915   const int *num_prim_per_shell;
01916   const int *shell_types;
01917   const int *numvoxels;
01918   float voxelsize;
01919   int density;
01920   const float *origin;
01921   float *orbitalgrid;
01922 } orbthrparms;
01923 
01924 
01925 extern "C" void * orbitalthread(void *voidparms) {
01926   int numvoxels[3];
01927   float origin[3];
01928   orbthrparms *parms = NULL;
01929 #if defined(VMDORBUSETHRPOOL)
01930   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
01931 #else
01932   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
01933 #endif
01934 
01935 #if defined(VMDCPUDISPATCH)
01936   wkf_cpu_caps_t *cpucaps = parms->cpucaps;
01937 
01938 #if defined(VMDUSEAVX512)
01939   int dispatch_AVX512ER = 0;
01940   if ((cpucaps->flags & CPU_AVX512ER) && (getenv("VMDNOAVX512ER") == NULL)) {
01941 //    printf("evaluate_grid_avx512er\n");
01942     dispatch_AVX512ER = 1;
01943   }
01944 
01945   int dispatch_AVX512F = 0;
01946   if ((cpucaps->flags & CPU_AVX512F) && (getenv("VMDNOAVX512F") == NULL)) {
01947     dispatch_AVX512F = 1;
01948 //    printf("evaluate_grid_avx512f\n");
01949   }
01950 #endif
01951 
01952 #if defined(VMDUSEAVX2)
01953   int dispatch_AVX2 = 0;
01954   if ((cpucaps->flags & CPU_AVX2) && (getenv("VMDNOAVX2") == NULL)) {
01955     dispatch_AVX2 = 1;
01956 //    printf("evaluate_grid_avx2\n");
01957   }
01958 #endif
01959 
01960 #if defined(VMDUSESVE)
01961   int dispatch_SVE = 0;
01962   if ((cpucaps->flags & CPU_ARM64_SVE) && (getenv("VMDNOSVE") == NULL)) {
01963     dispatch_SVE = 1;
01964 //  printf("evaluate_grid_sve\n");
01965   }
01966 #endif
01967 
01968 #if defined(VMDUSENEON)
01969   int dispatch_NEON = 0;
01970   if ((cpucaps->flags & CPU_ARM64_ASIMD) && (getenv("VMDNONEON") == NULL)) {
01971     dispatch_NEON = 1;
01972 //  printf("evaluate_grid_neon\n");
01973   }
01974 #endif
01975 
01976 #endif // VMDCPUDISPATCH
01977 
01978   // 
01979   // Hard-coded compile-time fall-through vectorization paths
01980   //
01981 #if defined(VMDORBUSESSE) && defined(__SSE2__)
01982   int dispatch_SSE2 = 0;
01983   if ((getenv("VMDNOSSE2") == NULL)) {
01984 //  printf("evaluate_grid_sse\n");
01985     dispatch_SSE2 = 1;
01986   }
01987 #endif
01988 
01989 #if defined(VMDORBUSEVSX) && defined(__VSX__)
01990   int dispatch_VSX = 0;
01991   if (getenv("VMDNOVSX") == NULL) {
01992 //  printf("evaluate_grid_vsx\n");
01993     dispatch_VSX = 1;
01994   }
01995 #endif
01996 
01997   numvoxels[0] = parms->numvoxels[0];
01998   numvoxels[1] = parms->numvoxels[1];
01999   numvoxels[2] = 1; // we compute only a single plane
02000 
02001   origin[0] = parms->origin[0];
02002   origin[1] = parms->origin[1];
02003 
02004   // loop over orbital planes
02005   int planesize = numvoxels[0] * numvoxels[1];
02006   wkf_tasktile_t tile;
02007 #if defined(VMDORBUSETHRPOOL)
02008   while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
02009 #else
02010   while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
02011 #endif
02012     int k;
02013     for (k=tile.start; k<tile.end; k++) {
02014       origin[2] = parms->origin[2] + parms->voxelsize * k;
02015 
02016 #if defined(VMDCPUDISPATCH)
02017       //
02018       // runtime CPU dispatch
02019       //   check for optional vector instructions and execute custom kernels 
02020       //   for the fastest code path supported by the detected hardware
02021       // 
02022       if (cpucaps != NULL) {
02023 #if defined(VMDUSEAVX512)
02024         if (dispatch_AVX512ER) {
02025           evaluate_grid_avx512er(parms->numatoms, parms->wave_f, 
02026               parms->basis_array, parms->atompos, parms->atom_basis,
02027               parms->num_shells_per_atom, parms->num_prim_per_shell,
02028               parms->shell_types, numvoxels, parms->voxelsize,
02029               origin, parms->density, parms->orbitalgrid + planesize*k);
02030           continue;
02031         }
02032 
02033         if (dispatch_AVX512F) {
02034           evaluate_grid_avx512f(parms->numatoms, parms->wave_f, 
02035               parms->basis_array, parms->atompos, parms->atom_basis,
02036               parms->num_shells_per_atom, parms->num_prim_per_shell,
02037               parms->shell_types, numvoxels, parms->voxelsize,
02038               origin, parms->density, parms->orbitalgrid + planesize*k);
02039           continue;
02040         }
02041 #endif
02042 
02043 #if defined(VMDUSEAVX2)
02044         if (dispatch_AVX2) {
02045           evaluate_grid_avx2(parms->numatoms, parms->wave_f, 
02046               parms->basis_array, parms->atompos, parms->atom_basis,
02047               parms->num_shells_per_atom, parms->num_prim_per_shell,
02048               parms->shell_types, numvoxels, parms->voxelsize,
02049               origin, parms->density, parms->orbitalgrid + planesize*k);
02050           continue;
02051         }
02052 #endif
02053 
02054 #if defined(VMDUSESVE)
02055         if (dispatch_SVE) {
02056           evaluate_grid_sve(parms->numatoms, parms->wave_f,
02057               parms->basis_array, parms->atompos, parms->atom_basis,
02058               parms->num_shells_per_atom, parms->num_prim_per_shell,
02059               parms->shell_types, numvoxels, parms->voxelsize,
02060               origin, parms->density, parms->orbitalgrid + planesize*k);
02061           continue;
02062         }
02063 #endif
02064 
02065 #if defined(VMDUSENEON)
02066         if (dispatch_NEON) {
02067           evaluate_grid_neon(parms->numatoms, parms->wave_f, 
02068               parms->basis_array, parms->atompos, parms->atom_basis,
02069               parms->num_shells_per_atom, parms->num_prim_per_shell,
02070               parms->shell_types, numvoxels, parms->voxelsize,
02071               origin, parms->density, parms->orbitalgrid + planesize*k);
02072           continue;
02073         }
02074 #endif
02075 
02076       } // runtime cpucaps-based dispatch
02077 #endif
02078 
02079 
02080 //
02081 // hard-coded fall-through path if runtime CPU dispatch doesn't match up
02082 //
02083 #if defined(VMDORBUSESSE) && defined(__SSE2__)
02084       if (dispatch_SSE2) {
02085         evaluate_grid_sse(parms->numatoms, parms->wave_f, 
02086             parms->basis_array, parms->atompos, parms->atom_basis,
02087             parms->num_shells_per_atom, parms->num_prim_per_shell,
02088             parms->shell_types, numvoxels, parms->voxelsize,
02089             origin, parms->density, parms->orbitalgrid + planesize*k);
02090         continue;
02091       }
02092 #endif
02093 
02094 #if defined(VMDORBUSEVSX) && defined(__VSX__)
02095       if (dispatch_VSX) {
02096         evaluate_grid_vsx(parms->numatoms, parms->wave_f,
02097             parms->basis_array, parms->atompos, parms->atom_basis,
02098             parms->num_shells_per_atom, parms->num_prim_per_shell,
02099             parms->shell_types, numvoxels, parms->voxelsize,
02100             origin, parms->density, parms->orbitalgrid + planesize*k);
02101         continue;
02102       }
02103 #endif
02104 
02105       // Standard C++-based implementation that uses wither the
02106       // standard math library expf(), a faster modified Cephes expf(),
02107       // our our own fast exponential approximation routine.
02108       evaluate_grid(parms->numatoms, parms->wave_f, 
02109             parms->basis_array, parms->atompos, parms->atom_basis,
02110             parms->num_shells_per_atom, parms->num_prim_per_shell,
02111             parms->shell_types, numvoxels, parms->voxelsize,
02112             origin, parms->density, parms->orbitalgrid + planesize*k);
02113     }
02114   }
02115 
02116   return NULL;
02117 }
02118 
02119 
02120 int evaluate_grid_fast(wkf_cpu_caps_t *cpucaps,
02121 #if defined(VMDORBUSETHRPOOL) 
02122                        wkf_threadpool_t *thrpool, 
02123 #else
02124                        int numcputhreads,
02125 #endif
02126                        int numatoms,
02127                        const float *wave_f,
02128                        const float *basis_array,
02129                        const float *atompos,
02130                        const int *atom_basis,
02131                        const int *num_shells_per_atom,
02132                        const int *num_prim_per_shell,
02133                        const int *shell_types,
02134                        const int *numvoxels,
02135                        float voxelsize,
02136                        const float *origin,
02137                        int density,
02138                        float * orbitalgrid) {
02139   int rc=0;
02140   orbthrparms parms;
02141 
02142   parms.cpucaps = cpucaps;
02143   parms.numatoms = numatoms;
02144   parms.wave_f = wave_f;
02145   parms.basis_array = basis_array;
02146   parms.atompos = atompos;
02147   parms.atom_basis = atom_basis;
02148   parms.num_shells_per_atom = num_shells_per_atom;
02149   parms.num_prim_per_shell = num_prim_per_shell;
02150   parms.shell_types = shell_types;
02151   parms.numvoxels = numvoxels;
02152   parms.voxelsize = voxelsize;
02153   parms.origin = origin;
02154   parms.density = density;
02155   parms.orbitalgrid = orbitalgrid;
02156 
02157   /* spawn child threads to do the work */
02158   wkf_tasktile_t tile;
02159   tile.start = 0;
02160   tile.end = numvoxels[2];
02161 
02162 #if defined(VMDORBUSETHRPOOL) 
02163   wkf_threadpool_sched_dynamic(thrpool, &tile);
02164   rc = wkf_threadpool_launch(thrpool, orbitalthread, &parms, 1);
02165 #else
02166   rc = wkf_threadlaunch(numcputhreads, &parms, orbitalthread, &tile);
02167 #endif
02168 
02169   return rc;
02170 }
02171 
02172 
02173 void Orbital::print_wavefunction() {
02174   // XXX Android, IRIX, and Windows don't provide log2f(), nor log2() ?!?!?!?!
02175   // for now we'll just avoid compiling this debugging code
02176 #if !(defined(_MSC_VER) || defined(ARCH_IRIX6) || defined(ARCH_IRIX6_64) || defined(ARCH_ANDROIDARMV7A))
02177   char shellname[6] = {'S', 'P', 'D', 'F', 'G', 'H'};
02178   int ifunc = 0;
02179   int at;
02180   int shell;
02181   for (at=0; at<numatoms; at++) {
02182     for (shell=0; shell < num_shells_per_atom[at]; shell++) {
02183       int shelltype = basis_set[at].shell[shell].type;
02184 
02185       // avoid unnecessary branching and minimize use of pow()
02186       int i, j, iang=0; 
02187       float xdist=2.0;
02188       float ydist=2.0;
02189       float zdist=2.0;
02190       float xdp, ydp, zdp;
02191       float xdiv = 1.0f / xdist;
02192       for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
02193         int imax = shelltype - j; 
02194         for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
02195           printf("%3i %c", at, shellname[shelltype]);
02196           int k, m=0;
02197           char buf[20]; buf[0] = '\0';
02198           for (k=0; k<(int)log2f(xdp); k++, m++) sprintf(buf+m, "x");
02199           for (k=0; k<(int)log2f(ydp); k++, m++) sprintf(buf+m, "y");
02200           for (k=0; k<(int)log2f(zdp); k++, m++) sprintf(buf+m, "z");
02201           //char *ang = qmdata->get_angular_momentum(at, shell, iang);
02202           printf("%-5s (%1.0f%1.0f%1.0f) wave_f[%3i] = % 11.6f\n", buf,
02203                  log2f(xdp), log2f(ydp), log2f(zdp), ifunc, wave_f[ifunc]);
02204           //delete [] ang;
02205           iang++;
02206           ifunc++;
02207         }
02208       }
02209     }
02210   }
02211 #endif
02212 
02213 }