OpenCLOrbital.C Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: OpenCLOrbital.C,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.32 $      $Date: 2019/01/17 21:38:55 $
00014  *
00015  ***************************************************************************
00016  * DESCRIPTION:
00017  *   This source file contains the OpenCL kernels for computing molecular
00018  *  orbital amplitudes on a uniformly spaced grid, using one ore more GPUs.
00019  *
00020  ***************************************************************************/
00021 
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <math.h>
00026 #if defined(__APPLE__)
00027 #include <OpenCL/cl.h>
00028 #else
00029 #include <CL/cl.h>
00030 #endif
00031 
00032 #include "WKFThreads.h"
00033 #include "OpenCLKernels.h"
00034 
00035 #if 1
00036 #define CLERR \
00037   if (clerr != CL_SUCCESS) {                     \
00038     printf("opencl error %d, %s line %d\n", clerr, __FILE__, __LINE__); \
00039     return NULL;                                   \
00040   }
00041 #else
00042 #define CLERR
00043 #endif
00044 
00045 typedef union flint_t {
00046   float f;
00047   int i;
00048 } flint;
00049 
00050 typedef struct {
00051   int numatoms;
00052   const float *wave_f; 
00053   int num_wave_f;
00054   const float *basis_array;
00055   int num_basis;
00056   const float *atompos;
00057   const int *atom_basis;
00058   const int *num_shells_per_atom;
00059   const int *num_prim_per_shell;
00060   const int *shell_symmetry;
00061   int num_shells;
00062   const int *numvoxels;
00063   float voxelsize;
00064   const float *origin;
00065   int density;
00066   float *orbitalgrid;
00067   vmd_opencl_orbital_handle *orbh;
00068 } orbthrparms;
00069 
00070 /* thread prototype */
00071 static void * openclorbitalthread(void *);
00072 
00073 // GPU block layout
00074 #define UNROLLX       1
00075 #define UNROLLY       1
00076 #define BLOCKSIZEX    8 
00077 #define BLOCKSIZEY    8
00078 #define BLOCKSIZE     BLOCKSIZEX * BLOCKSIZEY
00079 
00080 // required GPU array padding to match thread block size
00081 #define TILESIZEX BLOCKSIZEX*UNROLLX
00082 #define TILESIZEY BLOCKSIZEY*UNROLLY
00083 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00084 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00085 
00086 #define V4UNROLLX       4
00087 #define V4UNROLLY       1
00088 #define V4BLOCKSIZEX    2 
00089 #define V4BLOCKSIZEY    8
00090 #define V4BLOCKSIZE     BLOCKSIZEX * BLOCKSIZEY
00091 
00092 // required GPU array padding to match thread block size
00093 #define V4TILESIZEX V4BLOCKSIZEX*V4UNROLLX
00094 #define V4TILESIZEY V4BLOCKSIZEY*V4UNROLLY
00095 #define V4GPU_X_ALIGNMASK (V4TILESIZEX - 1)
00096 #define V4GPU_Y_ALIGNMASK (V4TILESIZEY - 1)
00097 
00098 #define MEMCOALESCE  384
00099 
00100 // orbital shell types 
00101 #define S_SHELL 0
00102 #define P_SHELL 1
00103 #define D_SHELL 2
00104 #define F_SHELL 3
00105 #define G_SHELL 4
00106 #define H_SHELL 5
00107 
00108 //
00109 // Constant arrays to store 
00110 //
00111 #define MAX_ATOM_SZ 256
00112 #define MAX_ATOMPOS_SZ (MAX_ATOM_SZ)
00113 #define MAX_ATOM_BASIS_SZ (MAX_ATOM_SZ)
00114 #define MAX_ATOMSHELL_SZ (MAX_ATOM_SZ)
00115 #define MAX_BASIS_SZ 6144 
00116 #define MAX_SHELL_SZ 1024
00117 #define MAX_WAVEF_SZ 6144
00118 
00119 
00120 // Shared memory tiling parameters:
00121 //   We need a tile size of at least 16 elements for coalesced memory access,
00122 // and it must be the next power of two larger than the largest number of 
00123 // wavefunction coefficients that will be consumed at once for a given 
00124 // shell type.  The current code is designed to handle up to "G" shells 
00125 // (15 coefficients), since that's the largest shell type supported by
00126 // GAMESS, which is the only format we can presently read.
00127 
00128 // The maximum shell coefficient count specifies the largest number
00129 // of coefficients that might have to be read for a single shell,
00130 // e.g. for the highest supported shell type.
00131 // This is a "G" shell coefficient count
00132 #define MAXSHELLCOUNT 15   
00133 
00134 // Mask out the lower N bits of the array index
00135 // to compute which tile to start loading shared memory with.
00136 // This must be large enough to gaurantee coalesced addressing, but
00137 // be half or less of the minimum tile size
00138 #define MEMCOAMASK  (~15)
00139 
00140 // The shared memory basis set and wavefunction coefficient arrays 
00141 // must contain a minimum of at least two tiles, but can be any 
00142 // larger multiple thereof which is also a multiple of the thread block size.
00143 // Larger arrays reduce duplicative data loads/fragmentation 
00144 // at the beginning and end of the array
00145 #define SHAREDSIZE 256
00146 
00147 //
00148 // OpenCL kernel source code as a giant string
00149 //
00150 const char *clorbitalsrc = 
00151   "// unit conversion                                                      \n"
00152   "#define ANGS_TO_BOHR 1.8897259877218677f                                \n"
00153   "                                                                        \n"
00154   "// orbital shell types                                                  \n"
00155   "#define S_SHELL 0                                                       \n"
00156   "#define P_SHELL 1                                                       \n"
00157   "#define D_SHELL 2                                                       \n"
00158   "#define F_SHELL 3                                                       \n"
00159   "#define G_SHELL 4                                                       \n"
00160   "#define H_SHELL 5                                                       \n"
00161   "                                                                        \n"
00162   "//                                                                      \n"
00163   "// OpenCL using const memory for almost all of the key arrays           \n"
00164   "//                                                                      \n"
00165   "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
00166   "void clorbitalconstmem(int numatoms,                                    \n"
00167 #if defined(__APPLE__)
00168   // Workaround for Apple OpenCL compiler bug related to constant memory
00169   //   Although the OpenCL full profile requires a minimum of 8 constant args
00170   //   of minimum size 64-kB each, Apple's compiler has problems with more 
00171   //   than 3, giving wrong answers for 4 or more.
00172   "                       __global float *const_atompos,                 \n"
00173   "                       __global int *const_atom_basis,                \n"
00174   "                       __global int *const_num_shells_per_atom,       \n"
00175   "                       __global float *const_basis_array,             \n"
00176   "                       __global int *const_num_prim_per_shell,        \n"
00177   "                       __global int *const_shell_symmetry,            \n"
00178   "                       __global float *const_wave_f,                  \n"
00179 #else
00180   "                       __constant float *const_atompos,                 \n"
00181   "                       __constant int *const_atom_basis,                \n"
00182   "                       __constant int *const_num_shells_per_atom,       \n"
00183   "                       __constant float *const_basis_array,             \n"
00184   "                       __constant int *const_num_prim_per_shell,        \n"
00185   "                       __constant int *const_shell_symmetry,            \n"
00186   "                       __constant float *const_wave_f,                  \n"
00187 #endif
00188   "                       float voxelsize,                                 \n"
00189   "                       float originx,                                   \n"
00190   "                       float originy,                                   \n"
00191   "                       float grid_z,                                    \n"
00192   "                       int density,                                     \n"
00193   "                       __global float * orbitalgrid) {                  \n"
00194   "  unsigned int xindex  = get_global_id(0);                              \n"
00195   "  unsigned int yindex  = get_global_id(1);                              \n"
00196   "  unsigned int outaddr = get_global_size(0) * yindex + xindex;          \n"
00197   "  float grid_x = originx + voxelsize * xindex;                          \n"
00198   "  float grid_y = originy + voxelsize * yindex;                          \n"
00199   "                                                                        \n"
00200   "  // similar to C version                                               \n"
00201   "  int at;                                                               \n"
00202   "  int prim, shell;                                                      \n"
00203   "                                                                        \n"
00204   "  // initialize value of orbital at gridpoint                           \n"
00205   "  float value = 0.0f;                                                   \n"
00206   "                                                                        \n"
00207   "  // initialize the wavefunction and shell counters                     \n"
00208   "  int ifunc = 0;                                                        \n"
00209   "  int shell_counter = 0;                                                \n"
00210   "  // loop over all the QM atoms                                         \n"
00211   "  for (at = 0; at < numatoms; at++) {                                   \n"
00212   "    // calculate distance between grid point and center of atom         \n"
00213   "    int maxshell = const_num_shells_per_atom[at];                       \n"
00214   "    int prim_counter = const_atom_basis[at];                            \n"
00215   "                                                                        \n"
00216   "    float xdist = (grid_x - const_atompos[3*at  ])*ANGS_TO_BOHR;        \n"
00217   "    float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR;        \n"
00218   "    float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR;        \n"
00219   "                                                                        \n"
00220   "    float xdist2 = xdist*xdist;                                         \n"
00221   "    float ydist2 = ydist*ydist;                                         \n"
00222   "    float zdist2 = zdist*zdist;                                         \n"
00223   "                                                                        \n"
00224   "    float dist2 = xdist2 + ydist2 + zdist2;                             \n"
00225   "                                                                        \n"
00226   "    // loop over the shells belonging to this atom (or basis function)  \n"
00227   "    for (shell=0; shell < maxshell; shell++) {                          \n"
00228   "      float contracted_gto = 0.0f;                                      \n"
00229   "                                                                        \n"
00230   "      // Loop over the Gaussian primitives of this contracted           \n"
00231   "      // basis function to build the atomic orbital                     \n"
00232   "      int maxprim = const_num_prim_per_shell[shell_counter];            \n"
00233   "      int shell_type = const_shell_symmetry[shell_counter];             \n"
00234   "      for (prim=0; prim < maxprim;  prim++) {                           \n"
00235   "        float exponent       = const_basis_array[prim_counter    ];     \n"
00236   "        float contract_coeff = const_basis_array[prim_counter + 1];     \n"
00237   "                                                                        \n"
00238   "        // By premultiplying the stored exponent factors etc,           \n"
00239   "        // we can use exp2f() rather than exp(), giving us full         \n"
00240   "        // precision, but with the speed of __expf()                    \n"
00241   "        contracted_gto += contract_coeff * native_exp2(-exponent*dist2);\n"
00242   "        prim_counter += 2;                                              \n"
00243   "      }                                                                 \n"
00244   "                                                                        \n"
00245   "      /* multiply with the appropriate wavefunction coefficient */      \n"
00246   "      float tmpshell=0.0f;                                              \n"
00247   "      switch (shell_type) {                                             \n"
00248   "        case S_SHELL:                                                   \n"
00249   "          value += const_wave_f[ifunc++] * contracted_gto;              \n"
00250   "          break;                                                        \n"
00251   "                                                                        \n"
00252   "        case P_SHELL:                                                   \n"
00253   "          tmpshell += const_wave_f[ifunc++] * xdist;                    \n"
00254   "          tmpshell += const_wave_f[ifunc++] * ydist;                    \n"
00255   "          tmpshell += const_wave_f[ifunc++] * zdist;                    \n"
00256   "          value += tmpshell * contracted_gto;                           \n"
00257   "          break;                                                        \n"
00258   "                                                                        \n"
00259   "        case D_SHELL:                                                   \n"
00260   "          tmpshell += const_wave_f[ifunc++] * xdist2;                   \n"
00261   "          tmpshell += const_wave_f[ifunc++] * xdist * ydist;            \n"
00262   "          tmpshell += const_wave_f[ifunc++] * ydist2;                   \n"
00263   "          tmpshell += const_wave_f[ifunc++] * xdist * zdist;            \n"
00264   "          tmpshell += const_wave_f[ifunc++] * ydist * zdist;            \n"
00265   "          tmpshell += const_wave_f[ifunc++] * zdist2;                   \n"
00266   "          value += tmpshell * contracted_gto;                           \n"
00267   "          break;                                                        \n"
00268   "                                                                        \n"
00269   "        case F_SHELL:                                                   \n"
00270   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist;           \n"
00271   "          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist;           \n"
00272   "          tmpshell += const_wave_f[ifunc++] * ydist2 * xdist;           \n"
00273   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist;           \n"
00274   "          tmpshell += const_wave_f[ifunc++] * xdist2 * zdist;           \n"
00275   "          tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist;    \n"
00276   "          tmpshell += const_wave_f[ifunc++] * ydist2 * zdist;           \n"
00277   "          tmpshell += const_wave_f[ifunc++] * zdist2 * xdist;           \n"
00278   "          tmpshell += const_wave_f[ifunc++] * zdist2 * ydist;           \n"
00279   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist;           \n"
00280   "          value += tmpshell * contracted_gto;                           \n"
00281   "          break;                                                        \n"
00282   "                                                                        \n"
00283   "        case G_SHELL:                                                   \n"
00284   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2;          \n"
00285   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist;   \n"
00286   "          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2;          \n"
00287   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist;   \n"
00288   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2;          \n"
00289   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist;   \n"
00290   "          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist;   \n"
00291   "          tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist;   \n"
00292   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist;   \n"
00293   "          tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2;          \n"
00294   "          tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist;   \n"
00295   "          tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2;          \n"
00296   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist;   \n"
00297   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist;   \n"
00298   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2;          \n"
00299   "          value += tmpshell * contracted_gto;                           \n"
00300   "          break;                                                        \n"
00301   "      } // end switch                                                   \n"
00302   "                                                                        \n"
00303   "      shell_counter++;                                                  \n"
00304   "    }                                                                   \n"
00305   "  }                                                                     \n"
00306   "                                                                        \n"
00307   "  // return either orbital density or orbital wavefunction amplitude    \n"
00308   "  if (density) {                                                        \n"
00309   "    orbitalgrid[outaddr] = copysign(value*value, value);                \n"
00310   "  } else {                                                              \n"
00311   "    orbitalgrid[outaddr] = value;                                       \n"
00312   "  }                                                                     \n"
00313   "}                                                                       \n"
00314   "                                                                        \n"
00315   "                                                                        \n"
00316   "//                                                                      \n"
00317   "// OpenCL using const memory for almost all of the key arrays           \n"
00318   "//                                                                      \n"
00319   "__kernel __attribute__((reqd_work_group_size(V4BLOCKSIZEX, V4BLOCKSIZEY, 1))) \n"
00320   "void clorbitalconstmem_vec4(int numatoms,                               \n"
00321 #if defined(__APPLE__)
00322   // Workaround for Apple OpenCL compiler bug related to constant memory
00323   //   Although the OpenCL full profile requires a minimum of 8 constant args
00324   //   of minimum size 64-kB each, Apple's compiler has problems with more 
00325   //   than 3, giving wrong answers for 4 or more.
00326   "                       __global float *const_atompos,                 \n"
00327   "                       __global int *const_atom_basis,                \n"
00328   "                       __global int *const_num_shells_per_atom,       \n"
00329   "                       __global float *const_basis_array,             \n"
00330   "                       __global int *const_num_prim_per_shell,        \n"
00331   "                       __global int *const_shell_symmetry,            \n"
00332   "                       __global float *const_wave_f,                  \n"
00333 #else
00334   "                       __constant float *const_atompos,                 \n"
00335   "                       __constant int *const_atom_basis,                \n"
00336   "                       __constant int *const_num_shells_per_atom,       \n"
00337   "                       __constant float *const_basis_array,             \n"
00338   "                       __constant int *const_num_prim_per_shell,        \n"
00339   "                       __constant int *const_shell_symmetry,            \n"
00340   "                       __constant float *const_wave_f,                  \n"
00341 #endif
00342   "                       float voxelsize,                                 \n"
00343   "                       float originx,                                   \n"
00344   "                       float originy,                                   \n"
00345   "                       float grid_z,                                    \n"
00346   "                       int density,                                     \n"
00347   "                       __global float * orbitalgrid) {                  \n"
00348   "  unsigned int xindex  = (get_global_id(0) - get_local_id(0)) * V4UNROLLX + get_local_id(0); \n"
00349   "  unsigned int yindex  = get_global_id(1);                              \n"
00350   "  unsigned int outaddr = get_global_size(0) * V4UNROLLX * yindex + xindex;\n"
00351   "  float4 gridspacing_v4 = { 0.f, 1.f, 2.f, 3.f };                       \n"
00352   "  gridspacing_v4 *= (voxelsize * V4BLOCKSIZEX);                         \n"
00353   "  float4 grid_x = originx + voxelsize * xindex + gridspacing_v4;        \n"
00354   "  float grid_y = originy + voxelsize * yindex;                          \n"
00355   "                                                                        \n"
00356   "  // similar to C version                                               \n"
00357   "  int at;                                                               \n"
00358   "  int prim, shell;                                                      \n"
00359   "                                                                        \n"
00360   "  // initialize value of orbital at gridpoint                           \n"
00361   "  float4 value = 0.0f;                                                  \n"
00362   "                                                                        \n"
00363   "  // initialize the wavefunction and shell counters                     \n"
00364   "  int ifunc = 0;                                                        \n"
00365   "  int shell_counter = 0;                                                \n"
00366   "  // loop over all the QM atoms                                         \n"
00367   "  for (at = 0; at < numatoms; at++) {                                   \n"
00368   "    // calculate distance between grid point and center of atom         \n"
00369   "    int maxshell = const_num_shells_per_atom[at];                       \n"
00370   "    int prim_counter = const_atom_basis[at];                            \n"
00371   "                                                                        \n"
00372   "    float4 xdist = (grid_x - const_atompos[3*at  ])*ANGS_TO_BOHR;       \n"
00373   "    float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR;        \n"
00374   "    float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR;        \n"
00375   "                                                                        \n"
00376   "    float4 xdist2 = xdist*xdist;                                        \n"
00377   "    float ydist2 = ydist*ydist;                                         \n"
00378   "    float zdist2 = zdist*zdist;                                         \n"
00379   "                                                                        \n"
00380   "    float4 dist2 = xdist2 + ydist2 + zdist2;                            \n"
00381   "                                                                        \n"
00382   "    // loop over the shells belonging to this atom (or basis function)  \n"
00383   "    for (shell=0; shell < maxshell; shell++) {                          \n"
00384   "      float4 contracted_gto = 0.0f;                                     \n"
00385   "                                                                        \n"
00386   "      // Loop over the Gaussian primitives of this contracted           \n"
00387   "      // basis function to build the atomic orbital                     \n"
00388   "      int maxprim = const_num_prim_per_shell[shell_counter];            \n"
00389   "      int shell_type = const_shell_symmetry[shell_counter];             \n"
00390   "      for (prim=0; prim < maxprim;  prim++) {                           \n"
00391   "        float exponent       = const_basis_array[prim_counter    ];     \n"
00392   "        float contract_coeff = const_basis_array[prim_counter + 1];     \n"
00393   "                                                                        \n"
00394   "        // By premultiplying the stored exponent factors etc,           \n"
00395   "        // we can use exp2f() rather than exp(), giving us full         \n"
00396   "        // precision, but with the speed of __expf()                    \n"
00397   "        contracted_gto += contract_coeff * native_exp2(-exponent*dist2);\n"
00398   "        prim_counter += 2;                                              \n"
00399   "      }                                                                 \n"
00400   "                                                                        \n"
00401   "      /* multiply with the appropriate wavefunction coefficient */      \n"
00402   "      float4 tmpshell=0.0f;                                             \n"
00403   "      switch (shell_type) {                                             \n"
00404   "        case S_SHELL:                                                   \n"
00405   "          value += const_wave_f[ifunc++] * contracted_gto;              \n"
00406   "          break;                                                        \n"
00407   "                                                                        \n"
00408   "        case P_SHELL:                                                   \n"
00409   "          tmpshell += const_wave_f[ifunc++] * xdist;                    \n"
00410   "          tmpshell += const_wave_f[ifunc++] * ydist;                    \n"
00411   "          tmpshell += const_wave_f[ifunc++] * zdist;                    \n"
00412   "          value += tmpshell * contracted_gto;                           \n"
00413   "          break;                                                        \n"
00414   "                                                                        \n"
00415   "        case D_SHELL:                                                   \n"
00416   "          tmpshell += const_wave_f[ifunc++] * xdist2;                   \n"
00417   "          tmpshell += const_wave_f[ifunc++] * xdist * ydist;            \n"
00418   "          tmpshell += const_wave_f[ifunc++] * ydist2;                   \n"
00419   "          tmpshell += const_wave_f[ifunc++] * xdist * zdist;            \n"
00420   "          tmpshell += const_wave_f[ifunc++] * ydist * zdist;            \n"
00421   "          tmpshell += const_wave_f[ifunc++] * zdist2;                   \n"
00422   "          value += tmpshell * contracted_gto;                           \n"
00423   "          break;                                                        \n"
00424   "                                                                        \n"
00425   "        case F_SHELL:                                                   \n"
00426   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist;           \n"
00427   "          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist;           \n"
00428   "          tmpshell += const_wave_f[ifunc++] * ydist2 * xdist;           \n"
00429   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist;           \n"
00430   "          tmpshell += const_wave_f[ifunc++] * xdist2 * zdist;           \n"
00431   "          tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist;    \n"
00432   "          tmpshell += const_wave_f[ifunc++] * ydist2 * zdist;           \n"
00433   "          tmpshell += const_wave_f[ifunc++] * zdist2 * xdist;           \n"
00434   "          tmpshell += const_wave_f[ifunc++] * zdist2 * ydist;           \n"
00435   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist;           \n"
00436   "          value += tmpshell * contracted_gto;                           \n"
00437   "          break;                                                        \n"
00438   "                                                                        \n"
00439   "        case G_SHELL:                                                   \n"
00440   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2;          \n"
00441   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist;   \n"
00442   "          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2;          \n"
00443   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist;   \n"
00444   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2;          \n"
00445   "          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist;   \n"
00446   "          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist;   \n"
00447   "          tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist;   \n"
00448   "          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist;   \n"
00449   "          tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2;          \n"
00450   "          tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist;   \n"
00451   "          tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2;          \n"
00452   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist;   \n"
00453   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist;   \n"
00454   "          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2;          \n"
00455   "          value += tmpshell * contracted_gto;                           \n"
00456   "          break;                                                        \n"
00457   "      } // end switch                                                   \n"
00458   "                                                                        \n"
00459   "      shell_counter++;                                                  \n"
00460   "    }                                                                   \n"
00461   "  }                                                                     \n"
00462   "                                                                        \n"
00463   "  // return either orbital density or orbital wavefunction amplitude    \n"
00464   "  if (density) {                                                        \n"
00465   "    orbitalgrid[outaddr               ] = copysign(value.x*value.x, value.x); \n"
00466   "    orbitalgrid[outaddr+1*V4BLOCKSIZEX] = copysign(value.y*value.y, value.y); \n"
00467   "    orbitalgrid[outaddr+2*V4BLOCKSIZEX] = copysign(value.z*value.z, value.z); \n"
00468   "    orbitalgrid[outaddr+3*V4BLOCKSIZEX] = copysign(value.w*value.w, value.w); \n"
00469   "  } else {                                                              \n"
00470   "    orbitalgrid[outaddr               ] = value.x;                      \n"
00471   "    orbitalgrid[outaddr+1*V4BLOCKSIZEX] = value.y;                      \n"
00472   "    orbitalgrid[outaddr+2*V4BLOCKSIZEX] = value.z;                      \n"
00473   "    orbitalgrid[outaddr+3*V4BLOCKSIZEX] = value.w;                      \n"
00474   "  }                                                                     \n"
00475   "}                                                                       \n"
00476   "                                                                        \n"
00477   "                                                                        \n"
00478   "//                                                                      \n"
00479   "// OpenCL loading from global memory into shared memory for all arrays  \n"
00480   "//                                                                      \n"
00481   "                                                                        \n"
00482   "typedef union flint_t {                                                 \n"
00483   "  float f;                                                              \n"
00484   "  int i;                                                                \n"
00485   "} flint;                                                                \n"
00486   "                                                                        \n"
00487   "__kernel __attribute__((reqd_work_group_size(BLOCKSIZEX, BLOCKSIZEY, 1))) \n"
00488   "void clorbitaltiledshared(int numatoms,                                 \n"
00489   "                          __global float *wave_f,                       \n"
00490   "                          __global float *basis_array,                  \n"
00491   "                          __global flint *atominfo,                     \n"
00492   "                          __global int *shellinfo,                      \n"
00493   "                          float voxelsize,                              \n"
00494   "                          float originx,                                \n"
00495   "                          float originy,                                \n"
00496   "                          float grid_z,                                 \n"
00497   "                          int density,                                  \n"
00498   "                          __global float *orbitalgrid) {                \n"
00499   "  unsigned int xindex  = get_global_id(0);                              \n"
00500   "  unsigned int yindex  = get_global_id(1);                              \n"
00501   "  unsigned int outaddr = get_global_size(0) * yindex + xindex;          \n"
00502   "  float grid_x = originx + voxelsize * xindex;                          \n"
00503   "  float grid_y = originy + voxelsize * yindex;                          \n"
00504   "                                                                        \n"
00505   "  // XXX NVIDIA-specific warp logic                                     \n"
00506   "  int sidx = get_local_id(1) * get_local_size(0) + get_local_id(0);     \n"
00507   "                                                                        \n"
00508   "  __local float s_wave_f[SHAREDSIZE];                                   \n"
00509   "  int sblock_wave_f = 0;                                                \n"
00510   "  s_wave_f[sidx      ] = wave_f[sidx      ];                            \n"
00511   "  s_wave_f[sidx +  64] = wave_f[sidx +  64];                            \n"
00512   "  s_wave_f[sidx + 128] = wave_f[sidx + 128];                            \n"
00513   "  s_wave_f[sidx + 192] = wave_f[sidx + 192];                            \n"
00514   "  barrier(CLK_LOCAL_MEM_FENCE);                                         \n"
00515   "                                                                        \n"
00516   "  // similar to C version                                               \n"
00517   "  int at;                                                               \n"
00518   "  int prim, shell;                                                      \n"
00519   "                                                                        \n"
00520   "  // initialize value of orbital at gridpoint                           \n"
00521   "  float value = 0.0f;                                                   \n"
00522   "                                                                        \n"
00523   "  // initialize the wavefunction and primitive counters                 \n"
00524   "  int ifunc = 0;                                                        \n"
00525   "  int shell_counter = 0;                                                \n"
00526   "  int sblock_prim_counter = -1; // sentinel indicates no data loaded    \n"
00527   "  // loop over all the QM atoms                                         \n"
00528   "  for (at = 0; at < numatoms; at++) {                                   \n"
00529   "    __local flint s_atominfo[5];                                        \n"
00530   "    __local float s_basis_array[SHAREDSIZE];                            \n"
00531   "    barrier(CLK_LOCAL_MEM_FENCE);                                       \n"
00532   "    if (sidx < 5)                                                       \n"
00533   "      s_atominfo[sidx].i = atominfo[(at<<4) + sidx].i;                  \n"
00534   "    barrier(CLK_LOCAL_MEM_FENCE);                                       \n"
00535   "    int prim_counter = s_atominfo[3].i;                                 \n"
00536   "    int maxshell     = s_atominfo[4].i;                                 \n"
00537   "    int new_sblock_prim_counter = prim_counter & MEMCOAMASK;            \n"
00538   "    if (sblock_prim_counter != new_sblock_prim_counter) {               \n"
00539   "      sblock_prim_counter = new_sblock_prim_counter;                    \n"
00540   "      s_basis_array[sidx      ] = basis_array[sblock_prim_counter + sidx      ]; \n"
00541   "      s_basis_array[sidx +  64] = basis_array[sblock_prim_counter + sidx +  64]; \n"
00542   "      s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128]; \n"
00543   "      s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192]; \n"
00544   "      prim_counter -= sblock_prim_counter;                              \n"
00545   "      barrier(CLK_LOCAL_MEM_FENCE);                                     \n"
00546   "    }                                                                   \n"
00547   "                                                                        \n"
00548   "    // calculate distance between grid point and center of atom         \n"
00549   "    float xdist = (grid_x - s_atominfo[0].f)*ANGS_TO_BOHR;              \n"
00550   "    float ydist = (grid_y - s_atominfo[1].f)*ANGS_TO_BOHR;              \n"
00551   "    float zdist = (grid_z - s_atominfo[2].f)*ANGS_TO_BOHR;              \n"
00552   "                                                                        \n"
00553   "    float xdist2 = xdist*xdist;                                         \n"
00554   "    float ydist2 = ydist*ydist;                                         \n"
00555   "    float zdist2 = zdist*zdist;                                         \n"
00556   "                                                                        \n"
00557   "    float dist2 = xdist2 + ydist2 + zdist2;                             \n"
00558   "                                                                        \n"
00559   "    // loop over the shells belonging to this atom (or basis function)  \n"
00560   "    for (shell=0; shell < maxshell; shell++) {                          \n"
00561   "      float contracted_gto = 0.0f;                                      \n"
00562   "                                                                        \n"
00563   "      // Loop over the Gaussian primitives of this contracted           \n"
00564   "      // basis function to build the atomic orbital                     \n"
00565   "      __local int s_shellinfo[2];                                       \n"
00566   "                                                                        \n"
00567   "      barrier(CLK_LOCAL_MEM_FENCE);                                     \n"
00568   "      if (sidx < 2)                                                     \n"
00569   "        s_shellinfo[sidx] = shellinfo[(shell_counter<<4) + sidx];       \n"
00570   "      barrier(CLK_LOCAL_MEM_FENCE);                                     \n"
00571   "      int maxprim = s_shellinfo[0];                                     \n"
00572   "      int shell_type = s_shellinfo[1];                                  \n"
00573   "                                                                        \n"
00574   "      if ((prim_counter + (maxprim<<1)) >= SHAREDSIZE) {                \n"
00575   "        prim_counter += sblock_prim_counter;                            \n"
00576   "        sblock_prim_counter = prim_counter & MEMCOAMASK;                \n"
00577   "        s_basis_array[sidx      ] = basis_array[sblock_prim_counter + sidx      ]; \n"
00578   "        s_basis_array[sidx +  64] = basis_array[sblock_prim_counter + sidx +  64]; \n"
00579   "        s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128]; \n"
00580   "        s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192]; \n"
00581   "        prim_counter -= sblock_prim_counter;                            \n"
00582   "        barrier(CLK_LOCAL_MEM_FENCE);                                   \n"
00583   "      }                                                                 \n"
00584   "      for (prim=0; prim < maxprim;  prim++) {                           \n"
00585   "        float exponent       = s_basis_array[prim_counter    ];         \n"
00586   "        float contract_coeff = s_basis_array[prim_counter + 1];         \n"
00587   "                                                                        \n"
00588   "        // By premultiplying the stored exponent factors etc,           \n"
00589   "        // we can use exp2f() rather than exp(), giving us full         \n"
00590   "        // precision, but with the speed of __expf()                    \n"
00591   "        contracted_gto += contract_coeff * native_exp2(-exponent*dist2);\n"
00592   "                                                                        \n"
00593   "        prim_counter += 2;                                              \n"
00594   "      }                                                                 \n"
00595   "                                                                        \n"
00596   "      // XXX should use a constant memory lookup table to store         \n"
00597   "      // shared mem refill constants, and dynamically lookup the        \n"
00598   "      // number of elements referenced in the next iteration.           \n"
00599   "      if ((ifunc + MAXSHELLCOUNT) >= SHAREDSIZE) {                      \n"
00600   "        ifunc += sblock_wave_f;                                         \n"
00601   "        sblock_wave_f = ifunc & MEMCOAMASK;                             \n"
00602   "        barrier(CLK_LOCAL_MEM_FENCE);                                   \n"
00603   "        s_wave_f[sidx      ] = wave_f[sblock_wave_f + sidx      ];      \n"
00604   "        s_wave_f[sidx +  64] = wave_f[sblock_wave_f + sidx +  64];      \n"
00605   "        s_wave_f[sidx + 128] = wave_f[sblock_wave_f + sidx + 128];      \n"
00606   "        s_wave_f[sidx + 192] = wave_f[sblock_wave_f + sidx + 192];      \n"
00607   "        barrier(CLK_LOCAL_MEM_FENCE);                                   \n"
00608   "        ifunc -= sblock_wave_f;                                         \n"
00609   "      }                                                                 \n"
00610   "                                                                        \n"
00611   "      /* multiply with the appropriate wavefunction coefficient */      \n"
00612   "      float tmpshell=0.0f;                                              \n"
00613   "      switch (shell_type) {                                             \n"
00614   "        case S_SHELL:                                                   \n"
00615   "          value += s_wave_f[ifunc++] * contracted_gto;                  \n"
00616   "          break;                                                        \n"
00617   "                                                                        \n"
00618   "        case P_SHELL:                                                   \n"
00619   "          tmpshell += s_wave_f[ifunc++] * xdist;                        \n"
00620   "          tmpshell += s_wave_f[ifunc++] * ydist;                        \n"
00621   "          tmpshell += s_wave_f[ifunc++] * zdist;                        \n"
00622   "          value += tmpshell * contracted_gto;                           \n"
00623   "          break;                                                        \n"
00624   "                                                                        \n"
00625   "        case D_SHELL:                                                   \n"
00626   "          tmpshell += s_wave_f[ifunc++] * xdist2;                       \n"
00627   "          tmpshell += s_wave_f[ifunc++] * xdist * ydist;                \n"
00628   "          tmpshell += s_wave_f[ifunc++] * ydist2;                       \n"
00629   "          tmpshell += s_wave_f[ifunc++] * xdist * zdist;                \n"
00630   "          tmpshell += s_wave_f[ifunc++] * ydist * zdist;                \n"
00631   "          tmpshell += s_wave_f[ifunc++] * zdist2;                       \n"
00632   "          value += tmpshell * contracted_gto;                           \n"
00633   "          break;                                                        \n"
00634   "                                                                        \n"
00635   "        case F_SHELL:                                                   \n"
00636   "          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist;               \n"
00637   "          tmpshell += s_wave_f[ifunc++] * xdist2 * ydist;               \n"
00638   "          tmpshell += s_wave_f[ifunc++] * ydist2 * xdist;               \n"
00639   "          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist;               \n"
00640   "          tmpshell += s_wave_f[ifunc++] * xdist2 * zdist;               \n"
00641   "          tmpshell += s_wave_f[ifunc++] * xdist * ydist * zdist;        \n"
00642   "          tmpshell += s_wave_f[ifunc++] * ydist2 * zdist;               \n"
00643   "          tmpshell += s_wave_f[ifunc++] * zdist2 * xdist;               \n"
00644   "          tmpshell += s_wave_f[ifunc++] * zdist2 * ydist;               \n"
00645   "          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist;               \n"
00646   "          value += tmpshell * contracted_gto;                           \n"
00647   "          break;                                                        \n"
00648   "                                                                        \n"
00649   "        case G_SHELL:                                                   \n"
00650   "          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist2;              \n"
00651   "          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * ydist;       \n"
00652   "          tmpshell += s_wave_f[ifunc++] * xdist2 * ydist2;              \n"
00653   "          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * xdist;       \n"
00654   "          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist2;              \n"
00655   "          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * zdist;       \n"
00656   "          tmpshell += s_wave_f[ifunc++] * xdist2 * ydist * zdist;       \n"
00657   "          tmpshell += s_wave_f[ifunc++] * ydist2 * xdist * zdist;       \n"
00658   "          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * zdist;       \n"
00659   "          tmpshell += s_wave_f[ifunc++] * xdist2 * zdist2;              \n"
00660   "          tmpshell += s_wave_f[ifunc++] * zdist2 * xdist * ydist;       \n"
00661   "          tmpshell += s_wave_f[ifunc++] * ydist2 * zdist2;              \n"
00662   "          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * xdist;       \n"
00663   "          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * ydist;       \n"
00664   "          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist2;              \n"
00665   "          value += tmpshell * contracted_gto;                           \n"
00666   "          break;                                                        \n"
00667   "      } // end switch                                                   \n"
00668   "                                                                        \n"
00669   "      shell_counter++;                                                  \n"
00670   "    }                                                                   \n"
00671   "  }                                                                     \n"
00672   "                                                                        \n"
00673   "  // return either orbital density or orbital wavefunction amplitude    \n"
00674   "  if (density) {                                                        \n"
00675   "    orbitalgrid[outaddr] = copysign(value*value, value);                \n"
00676   "  } else {                                                              \n"
00677   "    orbitalgrid[outaddr] = value;                                       \n"
00678   "  }                                                                     \n"
00679   "}                                                                       \n"
00680   "                                                                        \n"
00681   "                                                                        \n";
00682 
00683 
00684 cl_program vmd_opencl_compile_orbital_pgm(cl_context clctx, cl_device_id *cldevs, int &clerr) {
00685   cl_program clpgm = NULL;
00686 
00687   clpgm = clCreateProgramWithSource(clctx, 1, &clorbitalsrc, NULL, &clerr);
00688   CLERR
00689 
00690   char clcompileflags[4096];
00691   sprintf(clcompileflags, "-DUNROLLX=%d -DUNROLLY=%d -DBLOCKSIZEX=%d -DBLOCKSIZEY=%d -DBLOCKSIZE=%d -DSHAREDSIZE=%d -DMEMCOAMASK=%d -DMAXSHELLCOUNT=%d -DV4UNROLLX=%d -DV4UNROLLY=%d -DV4BLOCKSIZEX=%d -DV4BLOCKSIZEY=%d -cl-fast-relaxed-math -cl-single-precision-constant -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros", UNROLLX, UNROLLY, BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZE, SHAREDSIZE, MEMCOAMASK, MAXSHELLCOUNT, V4UNROLLX, V4UNROLLY, V4BLOCKSIZEX, V4BLOCKSIZEY);
00692   clerr = clBuildProgram(clpgm, 0, NULL, clcompileflags, NULL, NULL);
00693   if (clerr != CL_SUCCESS)
00694     printf("  compilation failed!\n");
00695 
00696   if (cldevs) {
00697     char buildlog[8192];
00698     size_t len=0;
00699     clerr = clGetProgramBuildInfo(clpgm, cldevs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildlog), buildlog, &len);
00700     if (len > 1) {
00701       printf("OpenCL compilation log:\n");
00702       printf("  '%s'\n", buildlog);
00703     }
00704     CLERR
00705   }
00706 
00707   return clpgm;
00708 }
00709 
00710 
00711 vmd_opencl_orbital_handle * vmd_opencl_create_orbital_handle(cl_context ctx, cl_command_queue cmdq, cl_device_id *devs) {
00712   vmd_opencl_orbital_handle * orbh;
00713   cl_int clerr = CL_SUCCESS;
00714 
00715   orbh = (vmd_opencl_orbital_handle *) malloc(sizeof(vmd_opencl_orbital_handle));
00716   orbh->ctx = ctx;
00717   orbh->cmdq = cmdq;
00718   orbh->devs = devs;
00719 
00720   orbh->pgm = vmd_opencl_compile_orbital_pgm(orbh->ctx, orbh->devs, clerr);
00721   CLERR
00722   orbh->kconstmem = clCreateKernel(orbh->pgm, "clorbitalconstmem", &clerr);
00723   CLERR
00724   orbh->kconstmem_vec4 = clCreateKernel(orbh->pgm, "clorbitalconstmem_vec4", &clerr);
00725   CLERR
00726   orbh->ktiledshared = clCreateKernel(orbh->pgm, "clorbitaltiledshared", &clerr);
00727   CLERR
00728 
00729   return orbh;
00730 }
00731 
00732 
00733 int vmd_opencl_destroy_orbital_handle(vmd_opencl_orbital_handle * orbh) {
00734   clReleaseKernel(orbh->kconstmem);
00735   clReleaseKernel(orbh->kconstmem_vec4);
00736   clReleaseKernel(orbh->ktiledshared);
00737   clReleaseProgram(orbh->pgm);
00738   free(orbh);
00739 
00740   return 0;
00741 }
00742 
00743 
00744 
00745 static int computepaddedsize(int orig, int tilesize) {
00746   int alignmask = tilesize - 1;
00747   int paddedsz = (orig + alignmask) & ~alignmask;  
00748 //printf("orig: %d  padded: %d  tile: %d\n", orig, paddedsz, tilesize);
00749   return paddedsz;
00750 }
00751 
00752 static void * openclorbitalthread(void *voidparms) {
00753   size_t volsize[3], Gsz[3], Bsz[3];
00754   cl_mem d_wave_f = NULL;
00755   cl_mem d_basis_array = NULL;
00756   cl_mem d_atominfo = NULL;
00757   cl_mem d_shellinfo = NULL;
00758   cl_mem d_origin = NULL;
00759   cl_mem d_numvoxels = NULL;
00760   cl_mem d_orbitalgrid = NULL;
00761   cl_mem const_wave_f=NULL;
00762   cl_mem const_atompos=NULL;
00763   cl_mem const_atom_basis=NULL;
00764   cl_mem const_num_shells_per_atom=NULL;
00765   cl_mem const_basis_array=NULL;
00766   cl_mem const_num_prim_per_shell=NULL;
00767   cl_mem const_shell_symmetry=NULL;
00768   float *h_orbitalgrid = NULL;
00769   float *h_basis_array_exp2f = NULL;
00770   int numvoxels[3];
00771   float origin[3];
00772   orbthrparms *parms = NULL;
00773   int usefastconstkernel=0;
00774   int threadid=0;
00775   int tilesize = 1; // default tile size to use in absence of other info
00776   cl_int clerr = CL_SUCCESS;
00777 
00778 #if defined(USE_OPENCLDEVPOOL)
00779   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00780   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00781 
00782   // scale tile size by device performance
00783   tilesize=4; // GTX 280, Tesla C1060 starting point tile size
00784   wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
00785 #else
00786   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00787   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00788 #endif
00789 
00790   // get various existing OpenCL state from handle provided by caller
00791   vmd_opencl_orbital_handle * orbh = parms->orbh;
00792   cl_context clctx = parms->orbh->ctx;
00793   cl_command_queue clcmdq = parms->orbh->cmdq;
00794 
00795   numvoxels[0] = parms->numvoxels[0];
00796   numvoxels[1] = parms->numvoxels[1];
00797   numvoxels[2] = 1;
00798 
00799   origin[0] = parms->origin[0];
00800   origin[1] = parms->origin[1];
00801 
00802   int density = parms->density;
00803  
00804   // setup OpenCL grid and block sizes
00805   if (getenv("VMDMOVEC4")) {
00806     // setup energy grid size, padding out arrays for peak GPU memory performance
00807     volsize[0] = (parms->numvoxels[0] + V4GPU_X_ALIGNMASK) & ~(V4GPU_X_ALIGNMASK);
00808     volsize[1] = (parms->numvoxels[1] + V4GPU_Y_ALIGNMASK) & ~(V4GPU_Y_ALIGNMASK);
00809     volsize[2] = 1;      // we only do one plane at a time
00810     Bsz[0] = V4BLOCKSIZEX;
00811     Bsz[1] = V4BLOCKSIZEY;
00812     Bsz[2] = 1;
00813     Gsz[0] = volsize[0] / V4UNROLLX;
00814     Gsz[1] = volsize[1] / V4UNROLLY;
00815     Gsz[2] = 1;
00816   } else {
00817     volsize[0] = (parms->numvoxels[0] + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00818     volsize[1] = (parms->numvoxels[1] + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00819     volsize[2] = 1;      // we only do one plane at a time
00820     Bsz[0] = BLOCKSIZEX;
00821     Bsz[1] = BLOCKSIZEY;
00822     Bsz[2] = 1;
00823     Gsz[0] = volsize[0] / UNROLLX;
00824     Gsz[1] = volsize[1] / UNROLLY;
00825     Gsz[2] = 1;
00826   }
00827   int volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];
00828 
00829   // determine which runtime strategy is workable
00830   // given the data sizes involved
00831   if ((parms->num_wave_f < MAX_WAVEF_SZ) &&
00832       (parms->numatoms < MAX_ATOM_SZ) &&
00833       (parms->numatoms < MAX_ATOMSHELL_SZ) &&
00834       (2*parms->num_basis < MAX_BASIS_SZ) &&
00835       (parms->num_shells < MAX_SHELL_SZ)) {
00836     usefastconstkernel=1;
00837   }
00838 
00839   // allow overrides for testing purposes
00840   if (getenv("VMDFORCEMOTILEDSHARED") != NULL) {
00841     usefastconstkernel=0; 
00842   }
00843 
00844   // allocate and copy input data to GPU arrays
00845   int padsz;
00846   padsz = computepaddedsize(2 * parms->num_basis, MEMCOALESCE);
00847   h_basis_array_exp2f = (float *) malloc(padsz * sizeof(float));
00848   float log2e = log2(2.718281828);
00849   for (int ll=0; ll<(2*parms->num_basis); ll+=2) {
00850 #if 1
00851     // use exp2f() rather than expf()
00852     h_basis_array_exp2f[ll  ] = parms->basis_array[ll  ] * log2e;
00853 #else
00854     h_basis_array_exp2f[ll  ] = parms->basis_array[ll  ];
00855 #endif
00856     h_basis_array_exp2f[ll+1] = parms->basis_array[ll+1];
00857   }
00858 
00859   if (usefastconstkernel) {
00860     const_wave_f=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->num_wave_f * sizeof(float), NULL, NULL);
00861     clEnqueueWriteBuffer(clcmdq, const_wave_f, CL_TRUE, 0, parms->num_wave_f * sizeof(float), parms->wave_f, 0, NULL, NULL);
00862 
00863     const_atompos=clCreateBuffer(clctx, CL_MEM_READ_ONLY, 3 * parms->numatoms * sizeof(float), NULL, NULL);
00864     clEnqueueWriteBuffer(clcmdq, const_atompos, CL_TRUE, 0, 3 * parms->numatoms * sizeof(float), parms->atompos, 0, NULL, NULL);
00865    
00866     const_atom_basis=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->numatoms * sizeof(int), NULL, NULL);
00867     clEnqueueWriteBuffer(clcmdq, const_atom_basis, CL_TRUE, 0, parms->numatoms * sizeof(int), parms->atom_basis, 0, NULL, NULL);
00868    
00869     const_num_shells_per_atom=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->numatoms * sizeof(int), NULL, NULL);
00870     clEnqueueWriteBuffer(clcmdq, const_num_shells_per_atom, CL_TRUE, 0,  parms->numatoms * sizeof(int), parms->num_shells_per_atom, 0, NULL, NULL);
00871     
00872     const_basis_array=clCreateBuffer(clctx, CL_MEM_READ_ONLY, 2 * parms->num_basis * sizeof(float), NULL, NULL);
00873     clEnqueueWriteBuffer(clcmdq, const_basis_array, CL_TRUE, 0, 2 * parms->num_basis * sizeof(float), h_basis_array_exp2f, 0, NULL, NULL);
00874     
00875     const_num_prim_per_shell=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->num_shells * sizeof(int), NULL, NULL);
00876     clEnqueueWriteBuffer(clcmdq, const_num_prim_per_shell, CL_TRUE, 0, parms->num_shells * sizeof(int), parms->num_prim_per_shell, 0, NULL, NULL);
00877     
00878     const_shell_symmetry=clCreateBuffer(clctx, CL_MEM_READ_ONLY, parms->num_shells * sizeof(int), NULL, NULL);
00879     clEnqueueWriteBuffer(clcmdq, const_shell_symmetry, CL_TRUE, 0, parms->num_shells * sizeof(int), parms->shell_symmetry, 0, NULL, NULL);
00880   } else {
00881     padsz = computepaddedsize(parms->num_wave_f, MEMCOALESCE);
00882     d_wave_f = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(float), NULL, NULL);
00883     clEnqueueWriteBuffer(clcmdq, d_wave_f, CL_TRUE, 0, parms->num_wave_f * sizeof(float), parms->wave_f, 0, NULL, NULL);
00884 
00885     // pack atom data into a tiled array
00886     padsz = computepaddedsize(16 * parms->numatoms, MEMCOALESCE);
00887     flint * h_atominfo = (flint *) calloc(1, padsz * sizeof(flint));
00888     d_atominfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(flint), NULL, NULL);
00889     int ll;
00890     for (ll=0; ll<parms->numatoms; ll++) {
00891       int addr = ll * 16;
00892       h_atominfo[addr    ].f = parms->atompos[ll*3    ];
00893       h_atominfo[addr + 1].f = parms->atompos[ll*3 + 1];
00894       h_atominfo[addr + 2].f = parms->atompos[ll*3 + 2];
00895       h_atominfo[addr + 3].i = parms->atom_basis[ll];
00896       h_atominfo[addr + 4].i = parms->num_shells_per_atom[ll];
00897     }
00898     clEnqueueWriteBuffer(clcmdq, d_atominfo, CL_TRUE, 0, padsz * sizeof(flint), h_atominfo, 0, NULL, NULL);
00899     free(h_atominfo);
00900 
00901     padsz = computepaddedsize(16 * parms->num_shells, MEMCOALESCE);
00902     int * h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
00903     d_shellinfo = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(int), NULL, NULL);
00904     for (ll=0; ll<parms->num_shells; ll++) {
00905       h_shellinfo[ll*16    ] = parms->num_prim_per_shell[ll];
00906       h_shellinfo[ll*16 + 1] = parms->shell_symmetry[ll];
00907     }
00908     clEnqueueWriteBuffer(clcmdq, d_shellinfo, CL_TRUE, 0, padsz * sizeof(int), h_shellinfo, 0, NULL, NULL);
00909     free(h_shellinfo);
00910 
00911     d_basis_array = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(float), NULL, NULL);
00912     clEnqueueWriteBuffer(clcmdq, d_basis_array, CL_TRUE, 0, 2 * parms->num_basis * sizeof(float), h_basis_array_exp2f, 0, NULL, NULL);
00913     
00914     padsz = computepaddedsize(3, MEMCOALESCE);
00915     d_origin = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(float), NULL, NULL);
00916     clEnqueueWriteBuffer(clcmdq, d_origin, CL_TRUE, 0, 3 * sizeof(float), origin, 0, NULL, NULL);
00917 
00918     d_numvoxels = clCreateBuffer(clctx, CL_MEM_READ_ONLY, padsz * sizeof(int), NULL, NULL);
00919     clEnqueueWriteBuffer(clcmdq, d_numvoxels, CL_TRUE, 0, 3 * sizeof(int), numvoxels, 0, NULL, NULL);
00920   }
00921 
00922   // allocate and initialize the GPU output array
00923   d_orbitalgrid = clCreateBuffer(clctx, CL_MEM_READ_WRITE, volmemsz, NULL, NULL);
00924   CLERR // check and clear any existing errors
00925 
00926   // allocate working buffer
00927   h_orbitalgrid = (float *) malloc(volmemsz);
00928 
00929 #if 0
00930   if (threadid == 0) {
00931     // inform on which kernel we're actually going to run
00932     printf("atoms[%d] ", parms->numatoms);
00933     printf("wavef[%d] ", parms->num_wave_f);
00934     printf("basis[%d] ", parms->num_basis);
00935     printf("shell[%d] ", parms->num_shells);
00936     if (usefastconstkernel) {
00937       printf("GPU constant memory");
00938     } else {
00939       printf("GPU tiled shared memory:");
00940     }
00941     printf(" Gsz:%dx%d\n", (int) Gsz[0], (int) Gsz[1]);
00942   }
00943 #endif
00944 
00945   // loop over orbital planes
00946   wkf_tasktile_t tile;
00947   int planesize = numvoxels[0] * numvoxels[1];
00948 
00949 #if defined(USE_OPENCLDEVPOOL)
00950   while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00951 #else
00952   while (wkf_threadlaunch_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00953 #endif
00954     int k;
00955     for (k=tile.start; k<tile.end; k++) {
00956       origin[2] = parms->origin[2] + parms->voxelsize * k;
00957 
00958 
00959       // RUN the kernel...
00960       if (usefastconstkernel) {
00961         cl_event event;
00962         if (getenv("VMDMOVEC4")) {
00963           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 0, sizeof(int), &parms->numatoms);
00964           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 1, sizeof(cl_mem), &const_atompos);
00965           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 2, sizeof(cl_mem), &const_atom_basis);
00966           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 3, sizeof(cl_mem), &const_num_shells_per_atom);
00967           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 4, sizeof(cl_mem), &const_basis_array);
00968           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 5, sizeof(cl_mem), &const_num_prim_per_shell);
00969           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 6, sizeof(cl_mem), &const_shell_symmetry);
00970           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 7, sizeof(cl_mem), &const_wave_f);
00971           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 8, sizeof(float), &parms->voxelsize);
00972           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 9, sizeof(float), &origin[0]);
00973           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 10, sizeof(float), &origin[1]);
00974           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 11, sizeof(float), &origin[2]);
00975           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 12, sizeof(cl_int), &density);
00976           clerr |= clSetKernelArg(orbh->kconstmem_vec4, 13, sizeof(cl_mem), &d_orbitalgrid);
00977           clerr = clEnqueueNDRangeKernel(clcmdq, orbh->kconstmem_vec4, 2, NULL, Gsz, Bsz, 0, NULL, &event);
00978         } else {
00979           clerr |= clSetKernelArg(orbh->kconstmem, 0, sizeof(int), &parms->numatoms);
00980           clerr |= clSetKernelArg(orbh->kconstmem, 1, sizeof(cl_mem), &const_atompos);
00981           clerr |= clSetKernelArg(orbh->kconstmem, 2, sizeof(cl_mem), &const_atom_basis);
00982           clerr |= clSetKernelArg(orbh->kconstmem, 3, sizeof(cl_mem), &const_num_shells_per_atom);
00983           clerr |= clSetKernelArg(orbh->kconstmem, 4, sizeof(cl_mem), &const_basis_array);
00984           clerr |= clSetKernelArg(orbh->kconstmem, 5, sizeof(cl_mem), &const_num_prim_per_shell);
00985           clerr |= clSetKernelArg(orbh->kconstmem, 6, sizeof(cl_mem), &const_shell_symmetry);
00986           clerr |= clSetKernelArg(orbh->kconstmem, 7, sizeof(cl_mem), &const_wave_f);
00987           clerr |= clSetKernelArg(orbh->kconstmem, 8, sizeof(float), &parms->voxelsize);
00988           clerr |= clSetKernelArg(orbh->kconstmem, 9, sizeof(float), &origin[0]);
00989           clerr |= clSetKernelArg(orbh->kconstmem, 10, sizeof(float), &origin[1]);
00990           clerr |= clSetKernelArg(orbh->kconstmem, 11, sizeof(float), &origin[2]);
00991           clerr |= clSetKernelArg(orbh->kconstmem, 12, sizeof(cl_int), &density);
00992           clerr |= clSetKernelArg(orbh->kconstmem, 13, sizeof(cl_mem), &d_orbitalgrid);
00993           clerr = clEnqueueNDRangeKernel(clcmdq, orbh->kconstmem, 2, NULL, Gsz, Bsz, 0, NULL, &event);
00994         }
00995         CLERR // check and clear any existing errors
00996         clerr |= clWaitForEvents(1, &event);
00997         clerr |= clReleaseEvent(event);
00998         CLERR // check and clear any existing errors
00999       } else {
01000         clerr |= clSetKernelArg(orbh->ktiledshared, 0, sizeof(int), &parms->numatoms);
01001         clerr |= clSetKernelArg(orbh->ktiledshared, 1, sizeof(cl_mem), &d_wave_f);
01002         clerr |= clSetKernelArg(orbh->ktiledshared, 2, sizeof(cl_mem), &d_basis_array);
01003         clerr |= clSetKernelArg(orbh->ktiledshared, 3, sizeof(cl_mem), &d_atominfo);
01004         clerr |= clSetKernelArg(orbh->ktiledshared, 4, sizeof(cl_mem), &d_shellinfo);
01005         clerr |= clSetKernelArg(orbh->ktiledshared, 5, sizeof(float), &parms->voxelsize);
01006         clerr |= clSetKernelArg(orbh->ktiledshared, 6, sizeof(float), &origin[0]);
01007         clerr |= clSetKernelArg(orbh->ktiledshared, 7, sizeof(float), &origin[1]);
01008         clerr |= clSetKernelArg(orbh->ktiledshared, 8, sizeof(float), &origin[2]);
01009         clerr |= clSetKernelArg(orbh->ktiledshared, 9, sizeof(cl_int), &density);
01010         clerr |= clSetKernelArg(orbh->ktiledshared, 10, sizeof(cl_mem), &d_orbitalgrid);
01011         cl_event event;
01012         clerr = clEnqueueNDRangeKernel(clcmdq, orbh->ktiledshared, 2, NULL, Gsz, Bsz, 0, NULL, &event);
01013         CLERR // check and clear any existing errors
01014         clerr |= clWaitForEvents(1, &event);
01015         clerr |= clReleaseEvent(event);
01016         CLERR // check and clear any existing errors
01017       }
01018       CLERR // check and clear any existing errors
01019 
01020       // Copy the GPU output data back to the host and use/store it..
01021       clEnqueueReadBuffer(clcmdq, d_orbitalgrid, CL_TRUE, 0, volmemsz, h_orbitalgrid, 0, NULL, NULL);
01022       CLERR // check and clear any existing errors
01023       clFinish(clcmdq);
01024 
01025       // Copy GPU blocksize padded array back down to the original size
01026       int y;
01027       for (y=0; y<numvoxels[1]; y++) {
01028         long orbaddr = k*planesize + y*numvoxels[0];
01029         memcpy(&parms->orbitalgrid[orbaddr], &h_orbitalgrid[y*volsize[0]], numvoxels[0] * sizeof(float));
01030       }
01031     }
01032   }
01033   clFinish(clcmdq);
01034 
01035 // printf("worker[%d] done.\n", threadid);
01036 
01037 // printf("freeing host memory objects\n");
01038   free(h_basis_array_exp2f);
01039   free(h_orbitalgrid);
01040 
01041   if (usefastconstkernel) {
01042 // printf("freeing device constant memory objects\n");
01043     clReleaseMemObject(const_wave_f);
01044     clReleaseMemObject(const_atompos);
01045     clReleaseMemObject(const_atom_basis);
01046     clReleaseMemObject(const_num_shells_per_atom);
01047     clReleaseMemObject(const_basis_array);
01048     clReleaseMemObject(const_num_prim_per_shell);
01049     clReleaseMemObject(const_shell_symmetry);
01050   } else {
01051 // printf("freeing device global memory objects\n");
01052     clReleaseMemObject(d_wave_f);
01053     clReleaseMemObject(d_basis_array);
01054     clReleaseMemObject(d_atominfo);
01055     clReleaseMemObject(d_shellinfo);
01056     clReleaseMemObject(d_numvoxels);
01057     clReleaseMemObject(d_origin);
01058   }
01059   clReleaseMemObject(d_orbitalgrid);
01060   CLERR // check and clear any existing errors
01061 
01062   return NULL;
01063 }
01064 
01065 
01066 
01067 int vmd_opencl_evaluate_orbital_grid(wkf_threadpool_t *devpool,
01068                        vmd_opencl_orbital_handle *orbh,
01069                        int numatoms,
01070                        const float *wave_f, int num_wave_f,
01071                        const float *basis_array, int num_basis,
01072                        const float *atompos,
01073                        const int *atom_basis,
01074                        const int *num_shells_per_atom,
01075                        const int *num_prim_per_shell,
01076                        const int *shell_symmetry,
01077                        int num_shells,
01078                        const int *numvoxels,
01079                        float voxelsize,
01080                        const float *origin,
01081                        int density,
01082                        float *orbitalgrid) {
01083   int rc=0;
01084   orbthrparms parms;
01085 
01086   parms.numatoms = numatoms;
01087   parms.wave_f = wave_f;
01088   parms.num_wave_f = num_wave_f;
01089   parms.basis_array = basis_array;
01090   parms.num_basis = num_basis;
01091   parms.atompos = atompos;
01092   parms.atom_basis = atom_basis;
01093   parms.num_shells_per_atom = num_shells_per_atom;
01094   parms.num_prim_per_shell = num_prim_per_shell;
01095   parms.shell_symmetry = shell_symmetry;
01096   parms.num_shells = num_shells;
01097   parms.numvoxels = numvoxels;
01098   parms.voxelsize = voxelsize;
01099   parms.origin = origin;
01100   parms.density = density;
01101   parms.orbitalgrid = orbitalgrid;
01102   parms.orbh = orbh;
01103 
01104   wkf_tasktile_t tile;
01105   tile.start=0;
01106   tile.end=numvoxels[2];
01107 
01108   // XXX hard-coded for single GPU/CPU device
01109   // multi-GPU is disabled at present as we have to add a bunch of 
01110   // explicit infrastructure to gaurantee thread safety of various
01111   // OpenCL API calls among multiple worker threads.
01112 #if defined(USE_OPENCLDEVPOOL)
01113   wkf_threadpool_sched_dynamic(devpool, &tile);
01114   wkf_threadpool_launch(devpool, openclorbitalthread, &parms, 1); 
01115 #else
01116   /* spawn child threads to do the work */
01117   rc = wkf_threadlaunch(1, &parms, openclorbitalthread, &tile);
01118 #endif
01119 
01120   return rc;
01121 }
01122 
01123