Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

Orbital.C

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr                                                                       
00003  *cr            (C) Copyright 1995-2011 The Board of Trustees of the           
00004  *cr                        University of Illinois                       
00005  *cr                         All Rights Reserved                        
00006  *cr                                                                   
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: Orbital.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.124 $      $Date: 2011/03/11 22:33:00 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *
00019  * The Orbital class, which stores orbitals, SCF energies, etc. for a
00020  * single timestep.
00021  *
00022  ***************************************************************************/
00023 
00024 #define VMDORBUSESSE 1
00025 //#define DEBUGORBS 1
00026 
00027 #include <math.h>
00028 #include <stdio.h>
00029 #if VMDORBUSESSE & defined(__SSE2__)
00030 #include <emmintrin.h>
00031 #endif
00032 #include "Orbital.h"
00033 #include "DrawMolecule.h"
00034 #include "utilities.h"
00035 #include "Inform.h"
00036 #include "WKFThreads.h"
00037 #include "WKFUtils.h"
00038 #if defined(VMDCUDA)
00039 #include "CUDAKernels.h"
00040 #endif
00041 #if defined(VMDOPENCL)
00042 #include "OpenCLUtils.h"
00043 #include "OpenCLKernels.h"
00044 #endif
00045 
00046 #define ANGS_TO_BOHR 1.88972612478289694072f
00047 
00049 Orbital::Orbital(const float *pos,
00050                  const float *wfn,
00051                  const float *barray,
00052                  const basis_atom_t *bset,
00053                  const int *types,
00054                  const int *asort,
00055                  const int *abasis,
00056                  const float **norm,
00057                  const int *nshells,
00058                  const int *nprimshell,
00059                  const int *shelltypes, 
00060                  int natoms, int ntypes, int numwave, int numbasis, 
00061                  int orbid) :
00062   atomid(-1), shellid(-1),
00063   numatoms(natoms), atompos(pos),
00064   num_wave_f(numwave),
00065   num_basis_funcs(numbasis),
00066   basis_array(barray),
00067   numtypes(ntypes), 
00068   basis_set(bset),
00069   atom_types(types),
00070   atom_sort(asort),
00071   atom_basis(abasis),
00072   norm_factors(norm),
00073   num_shells_per_atom(nshells),
00074   num_prim_per_shell(nprimshell),
00075   shell_types(shelltypes) 
00076 {
00077   grid_data = NULL;
00078   origin[0] = origin[1] = origin[2] = 0.0;
00079 
00080   // Multiply wavefunction coefficients with the
00081   // angular momentum dependent part of the basis set
00082   // normalization factors.
00083   normalize_wavefunction(wfn + num_wave_f*orbid);
00084 
00085   //print_wavefunction();
00086 }
00087 
00089 Orbital::~Orbital() {
00090   if (wave_f) delete [] wave_f;
00091 }
00092 
00093 
00094 // Multiply wavefunction coefficients with the
00095 // basis set normalization factors. We do this
00096 // here rather than normalizing the basisset itself
00097 // because we need different factors for the different
00098 // cartesian components of a shell and the basis set
00099 // stores data only per shell.
00100 // By doing the multiplication here we save a lot of
00101 // flops during orbital rendering.
00102 void Orbital::normalize_wavefunction(const float *wfn) {
00103 #ifdef DEBUGORBS
00104   char shellname[8] = {'S', 'P', 'D', 'F', 'G', 'H', 'I', 'K'};
00105 #endif
00106   int i, j, k;
00107   // Get size of the symmetry-expanded wavefunction array
00108 //   int wave_size = 0;
00109 //   for (i=0; i<numatoms; i++) {
00110 //     printf("atom[%d]: type = %d\n", i, atom_types[i]);
00111 //     const basis_atom_t *basis_atom = &basis_set[atom_types[i]];
00112 //     for (j=0; j<basis_atom->numshells; j++) {
00113 //       wave_size += basis_atom->shell[j].num_cart_func;
00114 //     }
00115 //   }
00116 //   printf("num_wave_f/wave_size = %d/%d\n", num_wave_f,  wave_size);
00117 
00118   wave_f = new float[num_wave_f];
00119   int ifunc = 0;
00120   for (i=0; i<numatoms; i++) {
00121     const basis_atom_t *basis_atom = &basis_set[atom_types[i]];
00122     for (j=0; j<basis_atom->numshells; j++) {
00123       int stype = basis_atom->shell[j].type;
00124 #ifdef DEBUGORBS
00125       printf("atom %i/%i,  %i/%i %c-shell\n", i+1, numatoms, j+1, basis_atom->numshells, shellname[stype]);
00126 #endif
00127       for (k=0; k<basis_atom->shell[j].num_cart_func; k++) {
00128         wave_f[ifunc] = wfn[ifunc] * norm_factors[stype][k];
00129 
00130 #ifdef DEBUGORBS
00131         printf("%3i %c %2i wave_f[%3i]=% f  norm=%.3f  normwave=% f\n",
00132                i, shellname[stype], k, ifunc, wfn[ifunc],
00133                norm_factors[stype][k], wave_f[ifunc]);
00134 #endif
00135         ifunc++;
00136       }
00137     }
00138   }
00139 }
00140 
00141 
00142 // Sets the grid dimensions to the bounding box of the given
00143 // set of atoms *pos including a padding in all dimensions.
00144 // The resulting grid dimensions will be rounded to a multiple
00145 // of the voxel size.
00146 int Orbital::set_grid_to_bbox(const float *pos, float padding,
00147                               float resolution) {
00148   int i = 0;
00149   float xyzdim[3];
00150 
00151   /* set initial values of temp values to the coordinates
00152    * of the first atom.  */
00153   origin[0] = xyzdim[0] = pos[0];
00154   origin[1] = xyzdim[1] = pos[1];
00155   origin[2] = xyzdim[2] = pos[2];
00156 
00157   /* now loop over the rest of the atoms to check if there's
00158    * something larger/smaller for the maximum and minimum
00159    * respectively */
00160   for(i=1; i<numatoms; i++) {
00161     if (pos[3*i  ] < origin[0]) origin[0] = pos[3*i];
00162     if (pos[3*i+1] < origin[1]) origin[1] = pos[3*i+1];
00163     if (pos[3*i+2] < origin[2]) origin[2] = pos[3*i+2];
00164     if (pos[3*i  ] > xyzdim[0]) xyzdim[0] = pos[3*i];
00165     if (pos[3*i+1] > xyzdim[1]) xyzdim[1] = pos[3*i+1];
00166     if (pos[3*i+2] > xyzdim[2]) xyzdim[2] = pos[3*i+2];
00167   }
00168 
00169   // Apply padding in each direction
00170   origin[0] -= padding;
00171   origin[1] -= padding;
00172   origin[2] -= padding;
00173   gridsize[0] = xyzdim[0] + padding - origin[0];
00174   gridsize[1] = xyzdim[1] + padding - origin[1];
00175   gridsize[2] = xyzdim[2] + padding - origin[2];  
00176 
00177   set_resolution(resolution);
00178 
00179   return TRUE;
00180 }
00181 
00182 
00183 // Set the dimensions and resolution of the grid for which 
00184 // the orbital shall be computed.
00185 // The given grid dimensions will be rounded to a multiple
00186 // of the voxel size.
00187 void Orbital::set_grid(float newori[3], float newdim[3], float newvoxelsize) {
00188   origin[0] = newori[0];
00189   origin[1] = newori[1];
00190   origin[2] = newori[2];
00191   gridsize[0] = newdim[0];
00192   gridsize[1] = newdim[1];
00193   gridsize[2] = newdim[2];
00194   set_resolution(newvoxelsize);
00195 }
00196 
00197 // Change the resolution of the grid
00198 void Orbital::set_resolution(float resolution) {
00199   voxelsize = resolution;
00200   int i;
00201   for (i=0; i<3; i++) {
00202     numvoxels[i] = (int)(gridsize[i]/voxelsize) + 1;
00203     gridsize[i] = voxelsize*(numvoxels[i]-1);
00204   }
00205 }
00206 
00207 #define XNEG 0
00208 #define YNEG 1
00209 #define ZNEG 2
00210 #define XPOS 3
00211 #define YPOS 4
00212 #define ZPOS 5
00213 
00214 // Check if all values in the boundary plane given by dir 
00215 // are below threshold.
00216 // If not, jump back, decrease the stepsize and test again.
00217 int Orbital::check_plane(int dir, float threshold, int minstepsize,
00218                          int &stepsize) {
00219   bool repeat=0;
00220   int u, v, w, nu, nv;
00221   // w is the dimension we want to adjust,
00222   // u and v are the other two, i.e. the plane in which we test
00223   // the orbital values. 
00224   u = (dir+1)%3;
00225   v = (dir+2)%3;
00226   w = dir%3;
00227 
00228   // for debugging
00229   //char axis[3] = {'X', 'Y', 'Z'};
00230   //char sign[2] = {'-', '+'};
00231   //printf("%c%c: ", sign[dir/3], axis[w]);
00232 
00233   do {
00234     int success = 0;
00235     int offset = 0;
00236     int gridstep = stepsize;
00237 
00238     if (repeat) {
00239       // We are repeating the test on the previous slate but with
00240       // twice the resolution. Hence we only have to test the new
00241       // grid points lying between the old ones.
00242       offset = stepsize;
00243       gridstep = 2*stepsize;
00244     }
00245 
00246 
00247     float grid[3];
00248     grid[w] = origin[w] + (dir/3)*(numvoxels[w]-1) * voxelsize;
00249 
00250     // Search for a value of the wave function larger than threshold.
00251     for (nu=0; nu<numvoxels[u]; nu+=gridstep) {
00252       grid[u] = origin[u] + nu * voxelsize;
00253     
00254       for (nv=0; nv<numvoxels[v]; nv+=gridstep) {
00255         grid[v] = origin[v] + nv * voxelsize;
00256       
00257         if (fabs(evaluate_grid_point(grid[0], grid[1], grid[2])) > threshold) {
00258           success = 1;
00259           break;
00260         }
00261       }
00262       if (success) break;
00263     }
00264 
00265     if (success) {
00266       // Found an orbital value higher than the threshold.
00267       // We want the threshold isosurface to be completely inside the grid.
00268       // The boundary must be between the previous and this plane.
00269       if (!(dir/3)) origin[w] -= stepsize*voxelsize;
00270       numvoxels[w] += stepsize;
00271       if (stepsize<=minstepsize) {
00272         //printf("success!\n");
00273         return 1;
00274       }
00275       stepsize /=2;
00276       repeat = 1;
00277       //printf("increase by %i, reduce stepsize to %i.\n", 2*stepsize, stepsize);
00278 
00279     } else {
00280       // All values lower than threshold, we decrease the grid size.
00281       if (!(dir/3)) origin[w] += stepsize*voxelsize;
00282       numvoxels[w] -= stepsize;
00283       //printf("decrease by %i\n", stepsize);
00284       repeat = 0;
00285       if (numvoxels[w] <= 1) {
00286         // Here we ended up with a zero grid size.
00287         // We must have missed something. Let's increase grid again and 
00288         // try a smaller step size.
00289         numvoxels[w] = stepsize; 
00290         if (!(dir/3)) origin[w] -= stepsize*voxelsize;
00291         stepsize /=2;
00292         repeat = 1;
00293         //printf("zero grid size - increase to %i, reduce stepsize to %i.\n", 2*stepsize, stepsize);
00294       }
00295     }
00296 
00297   } while (repeat);
00298 
00299   return 0;
00300 }
00301 
00302 
00303 // Optimize position and dimension of current grid so that all orbital
00304 // values higher than threshold are contained in the grid.
00305 //
00306 // Algorithm:
00307 // Based on the idea that the wave function trails off in a distance of
00308 // a few Angstroms from the molecule.
00309 // We start from the current grid size (which could be for instance the
00310 // molecular bounding box plus a padding region) and test the values on
00311 // each of the six boundary planes. If there is no value larger than the
00312 // given threshold in a plane then we shrink the system along the plane
00313 // normal. In the distance the wave function tends to be smoother so we
00314 // start the testing on a coarser grid. A parameter maxstepsize=4 means
00315 // to begin with a grid using a four times higher voxel side length than
00316 // the original grid. When we find the first value above the threshold we
00317 // jump back one step and continue with half of the previous stepsize.
00318 // When stepsize has reached minstepsize then we consider the corresponding
00319 // boundary plane optimal. Note that starting out with a too coarse
00320 // grid one might miss some features of the wave function.
00321 // If you want to be sure not to miss anything then use the voxelsize
00322 // for both minstepsize and maxstepsize.
00323 void Orbital::find_optimal_grid(float threshold, 
00324                                 int minstepsize, int maxstepsize) {
00325   int optimal[6] = {0, 0, 0, 0, 0, 0};
00326   int stepsize[6];
00327   int i;
00328   for (i=0; i<6; i++) stepsize[i] = maxstepsize;
00329 
00330 #ifdef DEBUGORBS
00331   printf("origin = {%f %f %f}\n", origin[0], origin[1], origin[2]);
00332   printf("gridsize = {%f %f %f}\n", gridsize[0], gridsize[1], gridsize[2]);
00333 #endif  
00334   
00335   
00336   // Loop until we have optimal grid boundaries in all
00337   // dimensions
00338   int iter = 0;
00339   while ( !optimal[0] || !optimal[1] || !optimal[2] ||
00340           !optimal[3] || !optimal[4] || !optimal[5] )
00341   {
00342     if (iter>100) {
00343       msgInfo << "WARNING: Could not optimize orbital grid boundaries in"
00344               << iter << "steps!" << sendmsg; 
00345       break;
00346     }
00347     iter++;
00348 
00349     // Examine the current grid boundaries and shrink if
00350     // all values are smaller than threshold    .
00351     if (!optimal[XNEG])
00352       optimal[XNEG] = check_plane(XNEG, threshold, minstepsize, stepsize[XNEG]);
00353 
00354     if (!optimal[XPOS])
00355       optimal[XPOS] = check_plane(XPOS, threshold, minstepsize, stepsize[XPOS]);
00356 
00357     if (!optimal[YNEG])
00358       optimal[YNEG] = check_plane(YNEG, threshold, minstepsize, stepsize[YNEG]);
00359 
00360     if (!optimal[YPOS])
00361       optimal[YPOS] = check_plane(YPOS, threshold, minstepsize, stepsize[YPOS]);
00362 
00363     if (!optimal[ZNEG])
00364       optimal[ZNEG] = check_plane(ZNEG, threshold, minstepsize, stepsize[ZNEG]);
00365 
00366     if (!optimal[ZPOS])
00367       optimal[ZPOS] = check_plane(ZPOS, threshold, minstepsize, stepsize[ZPOS]);
00368 
00369 #ifdef DEBUGORBS
00370     printf("origin {%f %f %f}\n", origin[0], origin[1], origin[2]);
00371     printf("ngrid {%i %i %i}\n", numvoxels[0], numvoxels[1], numvoxels[2]);
00372     printf("stepsize {%i %i %i %i %i %i}\n", stepsize[0], stepsize[1], stepsize[2],
00373            stepsize[3], stepsize[4], stepsize[5]);
00374 #endif
00375   }
00376 
00377   
00378   gridsize[0] = numvoxels[0]*voxelsize;
00379   gridsize[1] = numvoxels[1]*voxelsize;
00380   gridsize[2] = numvoxels[2]*voxelsize;
00381 }
00382 
00383 
00384 // this function creates the orbital grid given the system dimensions
00385 int Orbital::calculate_mo(DrawMolecule *mol, int density) {
00386   wkf_timerhandle timer;
00387 
00388   timer=wkf_timer_create();
00389   wkf_timer_start(timer);
00390 
00391 #ifdef DEBUGORBS
00392   printf("num_wave_f=%i\n", num_wave_f);
00393 #endif
00394 
00395 #ifdef DEBUGORBS
00396   int i=0;
00397   for (i=0; i<num_wave_f; i++) {
00398     printf("wave_f[%i] = %f\n", i, wave_f[i]);
00399   }
00400 #endif
00401  
00402 
00403   // Allocate memory for the volumetric grid
00404   int numgridpoints = numvoxels[0] * numvoxels[1] * numvoxels[2];
00405   grid_data = new float[numgridpoints];
00406 
00407   // let's give the user a warning, since the calculation
00408   // could take a while, otherwise they might think the system is borked 
00409 #ifdef DEBUGORBS
00410   printf("Calculating %ix%ix%i orbital grid. \n", numvoxels[0], numvoxels[1], numvoxels[2]);
00411 #endif
00412 
00413 
00414   int rc=-1; // initialize to sentinel value
00415 
00416   // Calculate the value of the orbital at each gridpoint
00417 #if defined(VMDCUDA)
00418   // The CUDA kernel currently only handles up to "G" shells,
00419   // and up to 32 primitives per basis function
00420   if ((max_shell_type() <= G_SHELL) &&
00421       (max_primitives() <= 32) &&
00422       (!getenv("VMDNOCUDA"))) {
00423     rc = vmd_cuda_evaluate_orbital_grid(mol->cuda_devpool(), 
00424                                         numatoms, wave_f, num_wave_f,
00425                                         basis_array, num_basis_funcs,
00426                                         atompos, atom_basis,
00427                                         num_shells_per_atom, 
00428                                         num_prim_per_shell,
00429                                         shell_types, total_shells(),
00430                                         numvoxels, voxelsize, 
00431                                         origin, density, grid_data);
00432   }
00433 #endif
00434 #if defined(VMDOPENCL)
00435   // The OpenCL kernel currently only handles up to "G" shells,
00436   // and up to 32 primitives per basis function
00437   if (rc!=0 &&
00438       (max_shell_type() <= G_SHELL) &&
00439       (max_primitives() <= 32) &&
00440       (!getenv("VMDNOOPENCL"))) {
00441 
00442 #if 1
00443     // XXX this would be done during app startup normally...
00444     static vmd_opencl_orbital_handle *orbh = NULL;
00445     static cl_context clctx = NULL;
00446     static cl_command_queue clcmdq = NULL;
00447     static cl_device_id *cldevs = NULL;
00448     if (orbh == NULL) {
00449       printf("Attaching OpenCL device:\n");
00450       wkf_timer_start(timer);
00451       cl_int clerr = CL_SUCCESS;
00452 
00453       cl_platform_id clplatid = vmd_cl_get_platform_index(0);
00454       cl_context_properties clctxprops[] = {(cl_context_properties) CL_CONTEXT_PLATFORM, (cl_context_properties) clplatid, (cl_context_properties) 0};
00455       clctx = clCreateContextFromType(clctxprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &clerr);
00456 
00457       size_t parmsz;
00458       clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, 0, NULL, &parmsz);
00459       if (clerr != CL_SUCCESS) return -1;
00460       cldevs = (cl_device_id *) malloc(parmsz);
00461       if (clerr != CL_SUCCESS) return -1;
00462       clerr |= clGetContextInfo(clctx, CL_CONTEXT_DEVICES, parmsz, cldevs, NULL);
00463       if (clerr != CL_SUCCESS) return -1;
00464       clcmdq = clCreateCommandQueue(clctx, cldevs[0], 0, &clerr);
00465       if (clerr != CL_SUCCESS) return -1;
00466       wkf_timer_stop(timer);
00467       printf("  OpenCL context creation time: %.3f sec\n", wkf_timer_time(timer));
00468 
00469       wkf_timer_start(timer);
00470       orbh = vmd_opencl_create_orbital_handle(clctx, clcmdq, cldevs);
00471       wkf_timer_stop(timer);
00472       printf("  OpenCL kernel compilation time: %.3f sec\n", wkf_timer_time(timer));
00473 
00474       wkf_timer_start(timer);
00475     }
00476 #endif
00477 
00478     rc = vmd_opencl_evaluate_orbital_grid(mol->cuda_devpool(), orbh,
00479                                         numatoms, wave_f, num_wave_f,
00480                                         basis_array, num_basis_funcs,
00481                                         atompos, atom_basis,
00482                                         num_shells_per_atom, 
00483                                         num_prim_per_shell,
00484                                         shell_types, total_shells(),
00485                                         numvoxels, voxelsize, 
00486                                         origin, density, grid_data);
00487 
00488 #if 0
00489     // XXX this would normally be done at program shutdown
00490     vmd_opencl_destroy_orbital_handle(parms.orbh);
00491     clReleaseCommandQueue(clcmdq);
00492     clReleaseContext(clctx);
00493     free(cldevs);
00494 #endif
00495   }
00496 #endif
00497 #if 0
00498   int numprocs = 1;
00499   if (getenv("VMDDUMPORBITALS")) {
00500     write_orbital_data(getenv("VMDDUMPORBITALS"), numatoms,
00501                        wave_f, num_wave_f, basis_array, num_basis,
00502                        atompos, atom_basis, num_shells_per_atom,
00503                        num_prim_per_shell, shell_types,
00504                        num_shells, numvoxels, voxelsize, origin);
00505 
00506     read_calc_orbitals(devpool, getenv("VMDDUMPORBITALS"));
00507   }
00508 #endif
00509   if (rc!=0)  rc = evaluate_grid_fast(numatoms, wave_f, basis_array,
00510                                       atompos, atom_basis,
00511                                       num_shells_per_atom, num_prim_per_shell,
00512                                       shell_types, numvoxels, voxelsize, 
00513                                       origin, density, grid_data);
00514 
00515   if (rc!=0) {
00516     msgErr << "Error computing orbital grid" << sendmsg;
00517     delete [] grid_data;
00518     grid_data=NULL;
00519     return FALSE;
00520   }
00521 
00522   wkf_timer_stop(timer);
00523 
00524 #if 0
00525   { 
00526     double gflops = (numgridpoints * flops_per_gridpoint()) / (wkf_timer_time(timer) * 1000000000.0);
00527 
00528     char strbuf[1024];
00529     sprintf(strbuf, "Orbital calc. time %.3f secs, %.2f gridpoints/sec, %.2f GFLOPS",
00530             wkf_timer_time(timer), 
00531             (((double) numgridpoints) / wkf_timer_time(timer)),
00532             gflops);
00533     msgInfo << strbuf << sendmsg;
00534   }
00535 #endif
00536 
00537   wkf_timer_destroy(timer);
00538   return TRUE;
00539 }
00540 
00541 
00542 /*********************************************************
00543  *
00544  * This function calculates the value of the wavefunction
00545  * corresponding to a particular orbital at grid point
00546  * grid_x, grid_y, grid_z.
00547 
00548 
00549  Here's an example of a basis set definition for one atom:
00550 
00551   SHELL TYPE  PRIMITIVE        EXPONENT          CONTRACTION COEFFICIENT(S)
00552 
00553  Oxygen
00554 
00555       1   S       1          5484.6716600    0.001831074430
00556       1   S       2           825.2349460    0.013950172200
00557       1   S       3           188.0469580    0.068445078098
00558       1   S       4            52.9645000    0.232714335992
00559       1   S       5            16.8975704    0.470192897984
00560       1   S       6             5.7996353    0.358520852987
00561 
00562       2   L       7            15.5396162   -0.110777549525    0.070874268231
00563       2   L       8             3.5999336   -0.148026262701    0.339752839147
00564       2   L       9             1.0137618    1.130767015354    0.727158577316
00565 
00566       3   L      10             0.2700058    1.000000000000    1.000000000000
00567 
00568  *********************************************************/
00569 float Orbital::evaluate_grid_point(float grid_x, float grid_y, float grid_z) {
00570   int at;
00571   int prim, shell;
00572 
00573   // initialize value of orbital at gridpoint
00574   float value = 0.0;
00575 
00576   // initialize the wavefunction and shell counters
00577   int ifunc = 0; 
00578   int shell_counter = 0;
00579 
00580   // loop over all the QM atoms
00581   for (at=0; at<numatoms; at++) {
00582     int maxshell = num_shells_per_atom[at];
00583     int prim_counter = atom_basis[at];
00584 
00585     // calculate distance between grid point and center of atom
00586     float xdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
00587     float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00588     float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00589     float dist2 = xdist*xdist + ydist*ydist + zdist*zdist;
00590 
00591     // loop over the shells belonging to this atom
00592     // XXX this is maybe a misnomer because in split valence
00593     //     basis sets like 6-31G we have more than one basis
00594     //     function per (valence-)shell and we are actually
00595     //     looping over the individual contracted GTOs
00596     for (shell=0; shell < maxshell; shell++) {
00597       float contracted_gto = 0.0f;
00598 
00599       // Loop over the Gaussian primitives of this contracted 
00600       // basis function to build the atomic orbital
00601       int maxprim = num_prim_per_shell[shell_counter];
00602       int shelltype = shell_types[shell_counter];
00603       for (prim=0; prim < maxprim;  prim++) {
00604         float exponent       = basis_array[prim_counter    ];
00605         float contract_coeff = basis_array[prim_counter + 1];
00606         contracted_gto += contract_coeff * expf(-exponent*dist2);
00607         prim_counter += 2;
00608       }
00609 
00610       /* multiply with the appropriate wavefunction coefficient */
00611       float tmpshell=0;
00612       // Loop over the cartesian angular momenta of the shell.
00613       // avoid unnecessary branching and minimize use of pow()
00614       int i, j; 
00615       float xdp, ydp, zdp;
00616       float xdiv = 1.0f / xdist;
00617       for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
00618         int imax = shelltype - j; 
00619         for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
00620           tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
00621         }
00622       }
00623       value += tmpshell * contracted_gto;
00624 
00625       shell_counter++;
00626     } 
00627   }
00628 
00629   /* return the final value at grid point */
00630   return value;
00631 }
00632 
00633 
00634 //
00635 // Return the max number of primitives that occur in a basis function
00636 //
00637 int Orbital::max_primitives(void) {
00638   int maxprim=-1;
00639 
00640   int shell_counter = 0;
00641   for (int at=0; at<numatoms; at++) {
00642     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00643       int numprim = num_prim_per_shell[shell_counter];
00644       if (numprim > maxprim)
00645         maxprim = numprim; 
00646     }
00647   }
00648 
00649   return maxprim;
00650 }
00651 
00652 
00653 //
00654 // Return the maximum shell type used
00655 //
00656 int Orbital::max_shell_type(void) {
00657   int maxshell=-1;
00658 
00659   int shell_counter = 0;
00660   for (int at=0; at<numatoms; at++) {
00661     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00662       int shelltype = shell_types[shell_counter];
00663       shell_counter++;
00664       if (shelltype > maxshell)
00665         maxshell=shelltype;
00666     }
00667   }
00668 
00669   return maxshell;
00670 }
00671 
00672 
00673 //
00674 // count the maximum number of wavefunction coefficient accesses
00675 // required for the highest shell types contained in this orbital
00676 //
00677 int Orbital::max_wave_f_count(void) {
00678   int maxcount=0;
00679 
00680   int shell_counter = 0;
00681   for (int at=0; at<numatoms; at++) {
00682     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00683       int shelltype = shell_types[shell_counter];
00684       int i, j; 
00685       int count=0;
00686       for (i=0; i<=shelltype; i++) {
00687         int jmax = shelltype - i; 
00688         for (j=0; j<=jmax; j++) {
00689           count++;
00690         }
00691       }
00692       shell_counter++;
00693       if (count > maxcount)
00694         maxcount=count;
00695     }
00696   }
00697 
00698   return maxcount;
00699 }
00700 
00701 
00702 //
00703 // compute the FLOPS per grid point for performance measurement purposes
00704 //
00705 double Orbital::flops_per_gridpoint() {
00706   double flops=0.0;
00707 
00708   int shell_counter = 0;
00709   for (int at=0; at<numatoms; at++) {
00710     flops += 7;
00711 
00712     for (int shell=0; shell < num_shells_per_atom[at]; shell++) {
00713       for (int prim=0; prim < num_prim_per_shell[shell_counter];  prim++)
00714         flops += 4; // expf() costs far more, but we count as one.
00715 
00716       int shelltype = shell_types[shell_counter];
00717 
00718       switch (shelltype) {
00719         // separately count for the hand-optimized cases
00720         case S_SHELL: flops += 2; break;
00721         case P_SHELL: flops += 8; break;
00722         case D_SHELL: flops += 17; break;
00723         case F_SHELL: flops += 30; break;
00724         case G_SHELL: flops += 50; break;
00725 
00726         // count up for catch-all loop
00727         default:
00728           int i, j; 
00729           for (i=0; i<=shelltype; i++) {
00730             int jmax = shelltype - i; 
00731             flops += 1;
00732             for (j=0; j<=jmax; j++) {
00733               flops += 6;
00734             }
00735           }
00736           break;
00737       }
00738 
00739       shell_counter++;
00740     } 
00741   }
00742 
00743   return flops;
00744 }
00745 
00746 
00747 //
00748 // Fast single-precision expf() implementation
00749 // Adapted from the free cephes math library on Netlib
00750 //   http://www.netlib.org/cephes/
00751 //
00752 // Cephes Math Library Release 2.2:  June, 1992
00753 // Copyright 1984, 1987, 1989 by Stephen L. Moshier
00754 // Direct inquiries to 30 Frost Street, Cambridge, MA 02140
00755 //
00756 static const float MAXNUMF =    3.4028234663852885981170418348451692544e38f;
00757 static const float MAXLOGF =   88.72283905206835f;
00758 static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */
00759 static const float LOG2EF  =    1.44269504088896341f;
00760 static const float C1      =    0.693359375f;
00761 static const float C2      =   -2.12194440e-4f;
00762 
00763 static inline float cephesfastexpf(float x) {
00764   float z;
00765   int n;
00766   
00767   if(x > MAXLOGF) 
00768     return MAXNUMF;
00769 
00770   if(x < MINLOGF) 
00771     return 0.0;
00772 
00773   // Express e^x = e^g 2^n = e^g e^(n loge(2)) = e^(g + n loge(2))
00774   z = floorf( LOG2EF * x + 0.5f ); // floor() truncates toward -infinity.
00775   x -= z * C1;
00776   x -= z * C2;
00777   n = (int) z;
00778 
00779   z = x * x;
00780   // Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9.
00781   z = ((((( 1.9875691500E-4f  * x + 1.3981999507E-3f) * x
00782           + 8.3334519073E-3f) * x + 4.1665795894E-2f) * x
00783           + 1.6666665459E-1f) * x + 5.0000001201E-1f) * z + x + 1.0f;
00784 
00785   x = ldexpf(z, n); // multiply by power of 2
00786   return x;
00787 }
00788 
00789 
00790 
00791 
00792 /*
00793  * David J. Hardy
00794  * 12 Dec 2008
00795  *
00796  * aexpfnx() - Approximate expf() for negative x.
00797  *
00798  * Assumes that x <= 0.
00799  *
00800  * Assumes IEEE format for single precision float, specifically:
00801  * 1 sign bit, 8 exponent bits biased by 127, and 23 mantissa bits.
00802  *
00803  * Interpolates exp() on interval (-1/log2(e), 0], then shifts it by
00804  * multiplication of a fast calculation for 2^(-N).  The interpolation
00805  * uses a linear blending of 3rd degree Taylor polynomials at the end
00806  * points, so the approximation is once differentiable.
00807  *
00808  * The error is small (max relative error per interval is calculated
00809  * to be 0.131%, with a max absolute error of -0.000716).
00810  *
00811  * The cutoff is chosen so as to speed up the computation by early
00812  * exit from function, with the value chosen to give less than the
00813  * the max absolute error.  Use of a cutoff is unnecessary, except
00814  * for needing to shift smallest floating point numbers to zero,
00815  * i.e. you could remove cutoff and replace by:
00816  *
00817  * #define MINXNZ  -88.0296919311130  // -127 * log(2)
00818  *
00819  *   if (x < MINXNZ) return 0.f;
00820  *
00821  * Use of a cutoff causes a discontinuity which can be eliminated
00822  * through the use of a switching function.
00823  *
00824  * We can obtain arbitrarily smooth approximation by taking k+1 nodes on
00825  * the interval and weighting their respective Taylor polynomials by the
00826  * kth order Lagrange interpolant through those nodes.  The wiggle in the
00827  * polynomial interpolation due to equidistant nodes (Runge's phenomenon)
00828  * can be reduced by using Chebyshev nodes.
00829  */
00830 
00831 #define MLOG2EF    -1.44269504088896f
00832 
00833 /*
00834  * Interpolating coefficients for linear blending of the
00835  * 3rd degree Taylor expansion of 2^x about 0 and -1.
00836  */
00837 #define SCEXP0     1.0000000000000000f
00838 #define SCEXP1     0.6987082824680118f
00839 #define SCEXP2     0.2633174272827404f
00840 #define SCEXP3     0.0923611991471395f
00841 #define SCEXP4     0.0277520543324108f
00842 
00843 /* for single precision float */
00844 #define EXPOBIAS   127
00845 #define EXPOSHIFT   23
00846 
00847 /* cutoff is optional, but can help avoid unnecessary work */
00848 #define ACUTOFF    -10
00849 
00850 typedef union flint_t {
00851   float f;
00852   int n;
00853 } flint;
00854 
00855 float aexpfnx(float x) {
00856   /* assume x <= 0 */
00857   float mb;
00858   int mbflr;
00859   float d;
00860   float sy;
00861   flint scalfac;
00862 
00863   if (x < ACUTOFF) return 0.f;
00864 
00865   mb = x * MLOG2EF;    /* change base to 2, mb >= 0 */
00866   mbflr = (int) mb;    /* get int part, floor() */
00867   d = mbflr - mb;      /* remaining exponent, -1 < d <= 0 */
00868   sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00869                        /* approx with linear blend of Taylor polys */
00870   scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  /* 2^(-mbflr) */
00871   return (sy * scalfac.f);  /* scaled approx */
00872 }
00873 
00874 
00875 
00876 //
00877 // Optimized molecular orbital grid evaluation code
00878 //
00879 #define S_SHELL 0
00880 #define P_SHELL 1
00881 #define D_SHELL 2
00882 #define F_SHELL 3
00883 #define G_SHELL 4
00884 #define H_SHELL 5
00885 
00886 int evaluate_grid(int numatoms,
00887                   const float *wave_f, const float *basis_array,
00888                   const float *atompos,
00889                   const int *atom_basis,
00890                   const int *num_shells_per_atom,
00891                   const int *num_prim_per_shell,
00892                   const int *shell_types,
00893                   const int *numvoxels,
00894                   float voxelsize,
00895                   const float *origin,
00896                   int density,
00897                   float * orbitalgrid) {
00898   if (!orbitalgrid)
00899     return -1;
00900 
00901   int nx, ny, nz;
00902   // Calculate the value of the orbital at each gridpoint and store in 
00903   // the current oribtalgrid array
00904   int numgridxy = numvoxels[0]*numvoxels[1];
00905   for (nz=0; nz<numvoxels[2]; nz++) {
00906     float grid_x, grid_y, grid_z;
00907     grid_z = origin[2] + nz * voxelsize;
00908     for (ny=0; ny<numvoxels[1]; ny++) {
00909       grid_y = origin[1] + ny * voxelsize;
00910       int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
00911       for (nx=0; nx<numvoxels[0]; nx++) {
00912         grid_x = origin[0] + nx * voxelsize;
00913 
00914         // calculate the value of the wavefunction of the
00915         // selected orbital at the current grid point
00916         int at;
00917         int prim, shell;
00918 
00919         // initialize value of orbital at gridpoint
00920         float value = 0.0;
00921 
00922         // initialize the wavefunction and shell counters
00923         int ifunc = 0; 
00924         int shell_counter = 0;
00925 
00926         // loop over all the QM atoms
00927         for (at=0; at<numatoms; at++) {
00928           int maxshell = num_shells_per_atom[at];
00929           int prim_counter = atom_basis[at];
00930 
00931           // calculate distance between grid point and center of atom
00932           float xdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
00933           float ydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00934           float zdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00935 
00936           float xdist2 = xdist*xdist;
00937           float ydist2 = ydist*ydist;
00938           float zdist2 = zdist*zdist;
00939           float xdist3 = xdist2*xdist;
00940           float ydist3 = ydist2*ydist;
00941           float zdist3 = zdist2*zdist;
00942 
00943           float dist2 = xdist2 + ydist2 + zdist2;
00944 
00945           // loop over the shells belonging to this atom
00946           // XXX this is maybe a misnomer because in split valence
00947           //     basis sets like 6-31G we have more than one basis
00948           //     function per (valence-)shell and we are actually
00949           //     looping over the individual contracted GTOs
00950           for (shell=0; shell < maxshell; shell++) {
00951             float contracted_gto = 0.0f;
00952 
00953             // Loop over the Gaussian primitives of this contracted 
00954             // basis function to build the atomic orbital
00955             // 
00956             // XXX there's a significant opportunity here for further
00957             //     speedup if we replace the entire set of primitives
00958             //     with the single gaussian that they are attempting 
00959             //     to model.  This could give us another 6x speedup in 
00960             //     some of the common/simple cases.
00961             int maxprim = num_prim_per_shell[shell_counter];
00962             int shelltype = shell_types[shell_counter];
00963             for (prim=0; prim<maxprim; prim++) {
00964               float exponent       = basis_array[prim_counter    ];
00965               float contract_coeff = basis_array[prim_counter + 1];
00966 
00967               // XXX By premultiplying the stored exponent factors etc,
00968               //     we should be able to use exp2f() rather than exp(),
00969               //     saving several FLOPS per iteration of this loop
00970 #if defined(__GNUC__) && !defined(__ICC)
00971               // Use David Hardy's fast spline approximation instead
00972               // Works well for GCC, but runs slower for Intel C.
00973               contracted_gto += contract_coeff * aexpfnx(-exponent*dist2);
00974 #elif defined(__ICC)
00975               // When compiling with ICC, we'll use an inlined 
00976               // single-precision expf() implementation based on the
00977               // cephes math library found on Netlib.  This outruns the
00978               // standard glibc expf() by over 2x in this algorithm.
00979               contracted_gto += contract_coeff * cephesfastexpf(-exponent*dist2);
00980 #else
00981               // XXX By far the most costly operation here is exp(),
00982               //     for gcc builds, exp() accounts for 90% of the runtime
00983               contracted_gto += contract_coeff * expf(-exponent*dist2);
00984 #endif
00985               prim_counter += 2;
00986             }
00987 
00988             /* multiply with the appropriate wavefunction coefficient */
00989             float tmpshell=0;
00990             switch (shelltype) {
00991               case S_SHELL:
00992                 value += wave_f[ifunc++] * contracted_gto;
00993                 break;
00994 
00995               case P_SHELL:
00996                 tmpshell += wave_f[ifunc++] * xdist;
00997                 tmpshell += wave_f[ifunc++] * ydist;
00998                 tmpshell += wave_f[ifunc++] * zdist;
00999                 value += tmpshell * contracted_gto;
01000                 break;
01001 
01002               case D_SHELL:
01003                 tmpshell += wave_f[ifunc++] * xdist2;
01004                 tmpshell += wave_f[ifunc++] * xdist * ydist;
01005                 tmpshell += wave_f[ifunc++] * ydist2;
01006                 tmpshell += wave_f[ifunc++] * xdist * zdist;
01007                 tmpshell += wave_f[ifunc++] * ydist * zdist;
01008                 tmpshell += wave_f[ifunc++] * zdist2;
01009                 value += tmpshell * contracted_gto;
01010                 break;
01011 
01012               case F_SHELL:
01013                 tmpshell += wave_f[ifunc++] * xdist3;         // xxx
01014                 tmpshell += wave_f[ifunc++] * xdist2 * ydist; // xxy
01015                 tmpshell += wave_f[ifunc++] * ydist2 * xdist; // xyy
01016                 tmpshell += wave_f[ifunc++] * ydist3;         // yyy
01017                 tmpshell += wave_f[ifunc++] * xdist2 * zdist; // xxz
01018                 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist; // xyz
01019                 tmpshell += wave_f[ifunc++] * ydist2 * zdist; // yyz
01020                 tmpshell += wave_f[ifunc++] * zdist2 * xdist; // xzz
01021                 tmpshell += wave_f[ifunc++] * zdist2 * ydist; // yzz
01022                 tmpshell += wave_f[ifunc++] * zdist3;         // zzz
01023                 value += tmpshell * contracted_gto;
01024                 break;
01025  
01026               case G_SHELL:
01027                 tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
01028                 tmpshell += wave_f[ifunc++] * xdist3 * ydist;  // xxxy
01029                 tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
01030                 tmpshell += wave_f[ifunc++] * ydist3 * xdist;  // xyyy
01031                 tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
01032                 tmpshell += wave_f[ifunc++] * xdist3 * zdist;  // xxxz
01033                 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
01034                 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
01035                 tmpshell += wave_f[ifunc++] * ydist3 * zdist;  // yyyz
01036                 tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
01037                 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
01038                 tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
01039                 tmpshell += wave_f[ifunc++] * zdist3 * xdist;  // zzzx
01040                 tmpshell += wave_f[ifunc++] * zdist3 * ydist;  // zzzy
01041                 tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
01042                 value += tmpshell * contracted_gto;
01043                 break;
01044 
01045               default:
01046 #if 1
01047                 // avoid unnecessary branching and minimize use of pow()
01048                 int i, j; 
01049                 float xdp, ydp, zdp;
01050                 float xdiv = 1.0f / xdist;
01051                 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01052                   int imax = shelltype - j; 
01053                   for (i=0, ydp=1.0f, xdp=powf(xdist, float(imax)); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01054                     tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01055                   }
01056                 }
01057                 value += tmpshell * contracted_gto;
01058 #else
01059                 int i, j, k;
01060                 for (k=0; k<=shelltype; k++) {
01061                   for (j=0; j<=shelltype; j++) {
01062                     for (i=0; i<=shelltype; i++) {
01063                       if (i+j+k==shelltype) {
01064                         value += wave_f[ifunc++] * contracted_gto
01065                           * pow(xdist,i) * pow(ydist,j) * pow(zdist,k);
01066                       }
01067                     }
01068                   }
01069                 }
01070 #endif
01071             } // end switch
01072 
01073             shell_counter++;
01074           } // end shell
01075         } // end atom
01076 
01077         // return either orbital density or orbital wavefunction amplitude
01078         if (density) {
01079           float orbdensity = value * value;
01080           if (value < 0.0)
01081             orbdensity = -orbdensity;
01082           orbitalgrid[gaddrzy + nx] = orbdensity;
01083         } else {
01084           orbitalgrid[gaddrzy + nx] = value;
01085         }
01086       }
01087     }
01088   }
01089 
01090   return 0;
01091 }
01092 
01093 
01094 
01095 #if VMDORBUSESSE & defined(__SSE2__)
01096 
01097 //
01098 // Adaptation of the Cephes exp() to an SSE-ized exp_ps() routine 
01099 // originally by Julien Pommier
01100 //   Copyright (C) 2007  Julien Pommier, ZLIB license
01101 //   http://gruntthepeon.free.fr/ssemath/
01102 // 
01103 #ifdef _MSC_VER /* visual c++ */
01104 # define ALIGN16_BEG __declspec(align(16))
01105 # define ALIGN16_END
01106 #else /* gcc or icc */
01107 # define ALIGN16_BEG
01108 # define ALIGN16_END __attribute__((aligned(16)))
01109 #endif
01110 
01111 #define _PS_CONST(Name, Val)                                            \
01112   static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
01113 #define _PI32_CONST(Name, Val)                                            \
01114   static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
01115 
01116 _PS_CONST(exp_hi,       88.3762626647949f);
01117 _PS_CONST(exp_lo,       -88.3762626647949f);
01118 
01119 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
01120 _PS_CONST(cephes_exp_C1, 0.693359375);
01121 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
01122 
01123 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
01124 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
01125 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
01126 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
01127 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
01128 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
01129 _PS_CONST(one, 1.0);
01130 _PS_CONST(half, 0.5);
01131 
01132 _PI32_CONST(0x7f, 0x7f);
01133 
01134 typedef union xmm_mm_union {
01135   __m128 xmm;
01136   __m64 mm[2];
01137 } xmm_mm_union;
01138 
01139 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
01140     xmm_mm_union u; u.xmm = xmm_;                   \
01141     mm0_ = u.mm[0];                                 \
01142     mm1_ = u.mm[1];                                 \
01143 }
01144 
01145 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
01146     xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
01147   }
01148 
01149 __m128 exp_ps(__m128 x) {
01150   __m128 tmp = _mm_setzero_ps(), fx;
01151   __m64 mm0, mm1;
01152 
01153   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
01154   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
01155 
01156   /* express exp(x) as exp(g + n*log(2)) */
01157   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
01158   fx = _mm_add_ps(fx,*(__m128*)_ps_half);
01159 
01160   /* how to perform a floorf with SSE: just below */
01161   /* step 1 : cast to int */
01162   tmp = _mm_movehl_ps(tmp, fx);
01163   mm0 = _mm_cvttps_pi32(fx);
01164   mm1 = _mm_cvttps_pi32(tmp);
01165   /* step 2 : cast back to float */
01166   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
01167   /* if greater, substract 1 */
01168   __m128 mask = _mm_cmpgt_ps(tmp, fx);
01169   mask = _mm_and_ps(mask, *(__m128*)_ps_one);
01170   fx = _mm_sub_ps(tmp, mask);
01171 
01172   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
01173   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
01174   x = _mm_sub_ps(x, tmp);
01175   x = _mm_sub_ps(x, z);
01176 
01177   z = _mm_mul_ps(x,x);
01178 
01179   __m128 y = *(__m128*)_ps_cephes_exp_p0;
01180   y = _mm_mul_ps(y, x);
01181   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
01182   y = _mm_mul_ps(y, x);
01183   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
01184   y = _mm_mul_ps(y, x);
01185   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
01186   y = _mm_mul_ps(y, x);
01187   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
01188   y = _mm_mul_ps(y, x);
01189   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
01190   y = _mm_mul_ps(y, z);
01191   y = _mm_add_ps(y, x);
01192   y = _mm_add_ps(y, *(__m128*)_ps_one);
01193 
01194   /* build 2^n */
01195   z = _mm_movehl_ps(z, fx);
01196   mm0 = _mm_cvttps_pi32(fx);
01197   mm1 = _mm_cvttps_pi32(z);
01198   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
01199   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
01200   mm0 = _mm_slli_pi32(mm0, 23);
01201   mm1 = _mm_slli_pi32(mm1, 23);
01202 
01203   __m128 pow2n;
01204   COPY_MM_TO_XMM(mm0, mm1, pow2n);
01205 
01206   y = _mm_mul_ps(y, pow2n);
01207   _mm_empty();
01208   return y;
01209 }
01210 
01211 
01212 //
01213 // David J. Hardy
01214 // 12 Dec 2008
01215 //
01216 // aexpfnxsse() - SSE2 version of aexpfnx().
01217 //
01218 //
01219 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
01220 #define __align(X)  __attribute__((aligned(X) ))
01221 #if (__GNUC__ < 4)
01222 #define MISSING_mm_cvtsd_f64
01223 #endif
01224 #else
01225 #define __align(X) __declspec(align(X) )
01226 #endif
01227 
01228 typedef union SSEreg_t {
01229   __m128  f;  // 4x float (SSE)
01230   __m128i i;  // 4x 32-bit int (SSE2)
01231   struct {
01232     int r0, r1, r2, r3;  // get the individual registers
01233   } reg;
01234 } SSEreg;
01235 
01236 #define MLOG2EF    -1.44269504088896f
01237 
01238 /*
01239  * Interpolating coefficients for linear blending of the
01240  * 3rd degree Taylor expansion of 2^x about 0 and -1.
01241  */
01242 #define SCEXP0     1.0000000000000000f
01243 #define SCEXP1     0.6987082824680118f
01244 #define SCEXP2     0.2633174272827404f
01245 #define SCEXP3     0.0923611991471395f
01246 #define SCEXP4     0.0277520543324108f
01247 
01248 /* for single precision float */
01249 #define EXPOBIAS   127
01250 #define EXPOSHIFT   23
01251 
01252 /* cutoff is optional, but can help avoid unnecessary work */
01253 #define ACUTOFF    -10
01254 
01255 __m128 aexpfnxsse(__m128 x) {
01256   __align(16) SSEreg scal;
01257   __align(16) SSEreg n;
01258   __align(16) SSEreg y;
01259 
01260   scal.f = _mm_cmpge_ps(x, _mm_set_ps1(ACUTOFF));  /* Is x within cutoff? */
01261 
01262   /* If all x are outside of cutoff, return 0s. */
01263   if (_mm_movemask_ps(scal.f) == 0) {
01264     return _mm_setzero_ps();
01265   }
01266   /* Otherwise, scal.f contains mask to be ANDed with the scale factor */
01267 
01268   /*
01269    * Convert base:  exp(x) = 2^(N-d) where N is integer and 0 <= d < 1.
01270    *
01271    * Below we calculate n=N and x=-d, with "y" for temp storage,
01272    * calculate floor of x*log2(e) and subtract to get -d.
01273    */
01274   y.f = _mm_mul_ps(x, _mm_set_ps1(MLOG2EF));
01275   n.i = _mm_cvttps_epi32(y.f);
01276   x = _mm_cvtepi32_ps(n.i);
01277   x = _mm_sub_ps(x, y.f);
01278 
01279   /*
01280    * Approximate 2^{-d}, 0 <= d < 1, by interpolation.
01281    * Perform Horner's method to evaluate interpolating polynomial.
01282    */
01283   y.f = _mm_mul_ps(x, _mm_set_ps1(SCEXP4));      /* for x^4 term */
01284   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    /* for x^3 term */
01285   y.f = _mm_mul_ps(y.f, x);
01286   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    /* for x^2 term */
01287   y.f = _mm_mul_ps(y.f, x);
01288   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    /* for x^1 term */
01289   y.f = _mm_mul_ps(y.f, x);
01290   y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    /* for x^0 term */
01291 
01292   /*
01293    * Calculate 2^N exactly by directly manipulating floating point exponent.
01294    * Bitwise AND the result with scal.f mask to create the scale factor,
01295    * then use it to scale y for the final result.
01296    */
01297   n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
01298   n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
01299   scal.f = _mm_and_ps(scal.f, n.f);
01300   y.f = _mm_mul_ps(y.f, scal.f);
01301 
01302   return y.f;
01303 }
01304 
01305 
01306 int evaluate_grid_sse(int numatoms,
01307                   const float *wave_f, const float *basis_array,
01308                   const float *atompos,
01309                   const int *atom_basis,
01310                   const int *num_shells_per_atom,
01311                   const int *num_prim_per_shell,
01312                   const int *shell_types,
01313                   const int *numvoxels,
01314                   float voxelsize,
01315                   const float *origin,
01316                   int density,
01317                   float * orbitalgrid) {
01318   if (!orbitalgrid)
01319     return -1;
01320 
01321   int nx, ny, nz;
01322   __attribute__((aligned(16))) float sxdelta[4]; // 16-byte aligned for SSE
01323   for (nx=0; nx<4; nx++) 
01324     sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
01325 
01326   // Calculate the value of the orbital at each gridpoint and store in 
01327   // the current oribtalgrid array
01328   int numgridxy = numvoxels[0]*numvoxels[1];
01329   for (nz=0; nz<numvoxels[2]; nz++) {
01330     float grid_x, grid_y, grid_z;
01331     grid_z = origin[2] + nz * voxelsize;
01332     for (ny=0; ny<numvoxels[1]; ny++) {
01333       grid_y = origin[1] + ny * voxelsize;
01334       int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
01335       for (nx=0; nx<numvoxels[0]; nx+=4) {
01336         grid_x = origin[0] + nx * voxelsize;
01337 
01338         // calculate the value of the wavefunction of the
01339         // selected orbital at the current grid point
01340         int at;
01341         int prim, shell;
01342 
01343         // initialize value of orbital at gridpoint
01344         __m128 value = _mm_setzero_ps();
01345 
01346         // initialize the wavefunction and shell counters
01347         int ifunc = 0; 
01348         int shell_counter = 0;
01349 
01350         // loop over all the QM atoms
01351         for (at=0; at<numatoms; at++) {
01352           int maxshell = num_shells_per_atom[at];
01353           int prim_counter = atom_basis[at];
01354 
01355           // calculate distance between grid point and center of atom
01356           float sxdist = (grid_x - atompos[3*at  ])*ANGS_TO_BOHR;
01357           float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
01358           float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
01359 
01360           float sydist2 = sydist*sydist;
01361           float szdist2 = szdist*szdist;
01362           float yzdist2 = sydist2 + szdist2;
01363 
01364           __m128 xdelta = _mm_load_ps(&sxdelta[0]); // aligned load
01365           __m128 xdist  = _mm_load_ps1(&sxdist);
01366           xdist = _mm_add_ps(xdist, xdelta);
01367           __m128 ydist  = _mm_load_ps1(&sydist);
01368           __m128 zdist  = _mm_load_ps1(&szdist);
01369           __m128 xdist2 = _mm_mul_ps(xdist, xdist);
01370           __m128 ydist2 = _mm_mul_ps(ydist, ydist);
01371           __m128 zdist2 = _mm_mul_ps(zdist, zdist);
01372           __m128 dist2  = _mm_load_ps1(&yzdist2); 
01373           dist2 = _mm_add_ps(dist2, xdist2);
01374  
01375           // loop over the shells belonging to this atom
01376           // XXX this is maybe a misnomer because in split valence
01377           //     basis sets like 6-31G we have more than one basis
01378           //     function per (valence-)shell and we are actually
01379           //     looping over the individual contracted GTOs
01380           for (shell=0; shell < maxshell; shell++) {
01381             __m128 contracted_gto = _mm_setzero_ps();
01382 
01383             // Loop over the Gaussian primitives of this contracted 
01384             // basis function to build the atomic orbital
01385             // 
01386             // XXX there's a significant opportunity here for further
01387             //     speedup if we replace the entire set of primitives
01388             //     with the single gaussian that they are attempting 
01389             //     to model.  This could give us another 6x speedup in 
01390             //     some of the common/simple cases.
01391             int maxprim = num_prim_per_shell[shell_counter];
01392             int shelltype = shell_types[shell_counter];
01393             for (prim=0; prim<maxprim; prim++) {
01394               // XXX pre-negate exponent value
01395               float exponent       = -basis_array[prim_counter    ];
01396               float contract_coeff =  basis_array[prim_counter + 1];
01397 
01398               // contracted_gto += contract_coeff * exp(-exponent*dist2);
01399               __m128 expval = _mm_mul_ps(_mm_load_ps1(&exponent), dist2);
01400               // SSE expf() required here
01401 #if 1
01402               __m128 retval = aexpfnxsse(expval);
01403 #else
01404               __m128 retval = exp_ps(expval);
01405 #endif
01406               __m128 ctmp = _mm_mul_ps(_mm_load_ps1(&contract_coeff), retval);
01407               contracted_gto = _mm_add_ps(contracted_gto, ctmp);
01408               prim_counter += 2;
01409             }
01410 
01411             /* multiply with the appropriate wavefunction coefficient */
01412             __m128 tmpshell = _mm_setzero_ps();
01413             switch (shelltype) {
01414               case S_SHELL:
01415                 value = _mm_add_ps(value, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), contracted_gto));
01416                 break;
01417 
01418               case P_SHELL:
01419                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist));
01420                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist));
01421                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist));
01422                 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01423                 break;
01424 
01425               case D_SHELL:
01426                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), xdist2));
01427                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, ydist)));
01428                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), ydist2));
01429                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist, zdist)));
01430                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist, zdist)));
01431                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), zdist2));
01432                 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01433                 break;
01434 
01435               case F_SHELL:
01436                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, xdist)));
01437                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, ydist)));
01438                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, xdist)));
01439                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, ydist)));
01440                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(xdist2, zdist)));
01441                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(_mm_mul_ps(xdist, ydist), zdist)));
01442                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(ydist2, zdist)));
01443                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, xdist)));
01444                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, ydist)));
01445                 tmpshell = _mm_add_ps(tmpshell, _mm_mul_ps(_mm_load_ps1(&wave_f[ifunc++]), _mm_mul_ps(zdist2, zdist)));
01446                 value = _mm_add_ps(value, _mm_mul_ps(tmpshell, contracted_gto));
01447                 break;
01448  
01449 #if 0
01450               default:
01451                 // avoid unnecessary branching and minimize use of pow()
01452                 int i, j; 
01453                 float xdp, ydp, zdp;
01454                 float xdiv = 1.0f / xdist;
01455                 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01456                   int imax = shelltype - j; 
01457                   for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01458                     tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
01459                   }
01460                 }
01461                 value += tmpshell * contracted_gto;
01462 #endif
01463             } // end switch
01464 
01465             shell_counter++;
01466           } // end shell
01467         } // end atom
01468 
01469         // return either orbital density or orbital wavefunction amplitude
01470         if (density) {
01471           __m128 mask = _mm_cmplt_ps(value, _mm_setzero_ps());
01472           __m128 sqdensity = _mm_mul_ps(value, value);
01473           __m128 orbdensity = sqdensity;
01474           __m128 nsqdensity = _mm_and_ps(sqdensity, mask);
01475           orbdensity = _mm_sub_ps(orbdensity, nsqdensity);
01476           orbdensity = _mm_sub_ps(orbdensity, nsqdensity);
01477           _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], orbdensity);
01478         } else {
01479           _mm_storeu_ps(&orbitalgrid[gaddrzy + nx], value);
01480         }
01481       }
01482     }
01483   }
01484 
01485   return 0;
01486 }
01487 
01488 #endif
01489 
01490 
01491 //
01492 // Multithreaded molecular orbital computation engine
01493 //
01494 
01495 typedef struct {
01496   int numatoms;
01497   const float *wave_f;
01498   const float *basis_array;
01499   const float *atompos;
01500   const int *atom_basis;
01501   const int *num_shells_per_atom;
01502   const int *num_prim_per_shell;
01503   const int *shell_types;
01504   const int *numvoxels;
01505   float voxelsize;
01506   int density;
01507   const float *origin;
01508   float *orbitalgrid;
01509 } orbthrparms;
01510 
01511 
01512 extern "C" void * orbitalthread(void *voidparms) {
01513   int numvoxels[3];
01514   float origin[3];
01515   orbthrparms *parms = NULL;
01516   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
01517 
01518   numvoxels[0] = parms->numvoxels[0];
01519   numvoxels[1] = parms->numvoxels[1];
01520   numvoxels[2] = 1;
01521 
01522   origin[0] = parms->origin[0];
01523   origin[1] = parms->origin[1];
01524 
01525   // loop over orbital planes
01526   int planesize = numvoxels[0] * numvoxels[1];
01527   wkf_tasktile_t tile;
01528   while (wkf_threadlaunch_next_tile(voidparms, 2, &tile) != WKF_SCHED_DONE) {
01529     int k;
01530     for (k=tile.start; k<tile.end; k++) {
01531       origin[2] = parms->origin[2] + parms->voxelsize * k;
01532 #if VMDORBUSESSE & defined(__SSE2__)
01533       evaluate_grid_sse(parms->numatoms,
01534                     parms->wave_f, parms->basis_array,
01535                     parms->atompos,
01536                     parms->atom_basis,
01537                     parms->num_shells_per_atom,
01538                     parms->num_prim_per_shell,
01539                     parms->shell_types,
01540                     numvoxels,
01541                     parms->voxelsize,
01542                     origin,
01543                     parms->density,
01544                     parms->orbitalgrid + planesize*k);
01545 #else
01546       evaluate_grid(parms->numatoms,
01547                     parms->wave_f, parms->basis_array,
01548                     parms->atompos,
01549                     parms->atom_basis,
01550                     parms->num_shells_per_atom,
01551                     parms->num_prim_per_shell,
01552                     parms->shell_types,
01553                     numvoxels,
01554                     parms->voxelsize,
01555                     origin,
01556                     parms->density,
01557                     parms->orbitalgrid + planesize*k);
01558 #endif
01559     }
01560   }
01561 
01562   return NULL;
01563 }
01564 
01565 
01566 int evaluate_grid_fast(int numatoms,
01567                        const float *wave_f,
01568                        const float *basis_array,
01569                        const float *atompos,
01570                        const int *atom_basis,
01571                        const int *num_shells_per_atom,
01572                        const int *num_prim_per_shell,
01573                        const int *shell_types,
01574                        const int *numvoxels,
01575                        float voxelsize,
01576                        const float *origin,
01577                        int density,
01578                        float * orbitalgrid) {
01579   int rc=0;
01580   orbthrparms parms;
01581 
01582   parms.numatoms = numatoms;
01583   parms.wave_f = wave_f;
01584   parms.basis_array = basis_array;
01585   parms.atompos = atompos;
01586   parms.atom_basis = atom_basis;
01587   parms.num_shells_per_atom = num_shells_per_atom;
01588   parms.num_prim_per_shell = num_prim_per_shell;
01589   parms.shell_types = shell_types;
01590   parms.numvoxels = numvoxels;
01591   parms.voxelsize = voxelsize;
01592   parms.origin = origin;
01593   parms.density = density;
01594   parms.orbitalgrid = orbitalgrid;
01595 
01596 #if defined(VMDTHREADS)
01597   int numprocs = wkf_thread_numprocessors();
01598 #else
01599   int numprocs = 1;
01600 #endif
01601 
01602   /* spawn child threads to do the work */
01603   wkf_tasktile_t tile;
01604   tile.start = 0;
01605   tile.end = numvoxels[2];
01606   rc = wkf_threadlaunch(numprocs, &parms, orbitalthread, &tile);
01607 
01608   return rc;
01609 }
01610 
01611 
01612 void Orbital::print_wavefunction() {
01613   // XXX Windows and IRIX don't provide log2f(), nor log2() ?!?!?!?!
01614   // for now we'll just avoid compiling this debugging code
01615 #if !(defined(_MSC_VER) || defined(ARCH_IRIX6) || defined(ARCH_IRIX6_64))
01616   char shellname[6] = {'S', 'P', 'D', 'F', 'G', 'H'};
01617   int ifunc = 0;
01618   int at;
01619   int shell;
01620   for (at=0; at<numatoms; at++) {
01621     for (shell=0; shell < num_shells_per_atom[at]; shell++) {
01622       int shelltype = basis_set[at].shell[shell].type;
01623 
01624       // avoid unnecessary branching and minimize use of pow()
01625       int i, j, iang=0; 
01626       float xdist=2.0;
01627       float ydist=2.0;
01628       float zdist=2.0;
01629       float xdp, ydp, zdp;
01630       float xdiv = 1.0f / xdist;
01631       for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
01632         int imax = shelltype - j; 
01633         for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
01634           printf("%3i %c", at, shellname[shelltype]);
01635           int k, m=0;
01636           char buf[20]; buf[0] = '\0';
01637           for (k=0; k<(int)log2f(xdp); k++, m++) sprintf(buf+m, "x");
01638           for (k=0; k<(int)log2f(ydp); k++, m++) sprintf(buf+m, "y");
01639           for (k=0; k<(int)log2f(zdp); k++, m++) sprintf(buf+m, "z");
01640           //char *ang = qmdata->get_angular_momentum(at, shell, iang);
01641           printf("%-5s (%1.0f%1.0f%1.0f) wave_f[%3i] = % 11.6f\n", buf,
01642                  log2f(xdp), log2f(ydp), log2f(zdp), ifunc, wave_f[ifunc]);
01643           //delete [] ang;
01644           iang++;
01645           ifunc++;
01646         }
01647       }
01648     }
01649   }
01650 #endif
01651 
01652 }

Generated on Sat May 26 01:48:17 2012 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002