Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

CUDAOrbital.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAOrbital.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.128 $      $Date: 2024/03/01 01:50:25 $
00014  *
00015  ***************************************************************************/
00057 #include <stdio.h>
00058 #include <stdlib.h>
00059 #include <string.h>
00060 #include <math.h>
00061 #include <cuda.h>
00062 
00063 #include "WKFThreads.h"
00064 #include "CUDAKernels.h"
00065 #include "OrbitalJIT.h"
00066 
00067 // Multi-GPU pinned memory bugs seem to have been fixed as of CUDA 2.3 drivers
00068 #if CUDART_VERSION >= 2030
00069 #define USE_PINNED_MEMORY 1
00070 // #define USE_ZERO_COPY 1
00071 #endif
00072 
00073 #if 1 && (CUDART_VERSION >= 4000)
00074 #define RESTRICT __restrict__
00075 #else
00076 #define RESTRICT 
00077 #endif
00078 
#if 1
// Error-checking macro used inside the worker thread functions: if any
// CUDA error is pending, report it (with source location) and abort the
// calling thread by returning NULL.  Diagnostics go to stderr rather than
// stdout so they are not lost when normal output is redirected.
// Kept as a bare block (not do/while) since call sites use it as a
// standalone statement.
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  fprintf(stderr, "CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  fprintf(stderr, "Thread aborting...\n"); \
  return NULL; }}
#else
#define CUERR
#endif
00088 
// Unit conversion factor from Angstroms to Bohr (atomic units); grid/atom
// distances are converted before being fed to the basis set exponents.
#define ANGS_TO_BOHR 1.8897259877218677f

// 32-bit float/int pun, used so mixed per-atom data (float coordinates
// plus integer offsets/counts) can be packed into a single array and
// loaded uniformly; see the s_atominfo[..].f / .i accesses in the kernels.
typedef union flint_t {
  float f;   // float view (e.g. atom coordinates)
  int i;     // int view (e.g. basis offsets, shell counts)
} flint;
00095 
// Parameter record handed to each CPU worker thread (one per GPU);
// consumed by cudaorbitalthread() (prototype below).
typedef struct {
  int numatoms;                    // number of QM atoms
  const float *wave_f;             // wavefunction coefficient array
  int num_wave_f;                  // presumably element count of wave_f -- confirm in host code
  const float *basis_array;        // basis set data (exponent/coeff pairs)
  int num_basis;                   // presumably element count of basis_array -- confirm in host code
  const float *atompos;            // atom coordinates, 3 floats (x,y,z) per atom
  const int *atom_basis;           // per-atom starting offset into basis_array
  const int *num_shells_per_atom;  // per-atom shell count
  const int *num_prim_per_shell;   // per-shell Gaussian primitive count
  const int *shell_types;          // per-shell type code (S_SHELL..H_SHELL)
  int num_shells;                  // total number of shells
  const int *numvoxels;            // grid dimensions -- presumably 3 ints; verify against caller
  float voxelsize;                 // voxel edge length (same units as origin/atompos)
  const float *origin;             // grid origin coordinates
  int density;                     // nonzero: emit signed orbital density; zero: amplitude
  float *orbitalgrid;              // output grid buffer
} orbthrparms;
00114 
00115 /* thread prototype */
00116 static void * cudaorbitalthread(void *);
00117 
// GPU block layout.
// All derived macros are fully parenthesized so they expand safely inside
// larger expressions (e.g. "n / BLOCKSIZE" or "x & GPU_X_ALIGNMASK");
// the unparenthesized forms silently mis-evaluate under operator
// precedence (n / 8 * 8 instead of n / 64).
#define UNROLLX       1
#define UNROLLY       1
#define BLOCKSIZEX    8 
#define BLOCKSIZEY    8
#define BLOCKSIZE     (BLOCKSIZEX * BLOCKSIZEY)

// required GPU array padding to match thread block size
#define TILESIZEX (BLOCKSIZEX * UNROLLX)
#define TILESIZEY (BLOCKSIZEY * UNROLLY)
#define GPU_X_ALIGNMASK (TILESIZEX - 1)
#define GPU_Y_ALIGNMASK (TILESIZEY - 1)

// minimum global memory size worth staging through coalesced loads
#define MEMCOALESCE  384
00132 
// orbital shell types (angular momentum codes used in the kernel switches)
#define S_SHELL 0
#define P_SHELL 1
#define D_SHELL 2
#define F_SHELL 3
#define G_SHELL 4
#define H_SHELL 5

//
// Constant arrays used to store the QM data (atom positions, per-atom
// basis offsets and shell counts, the basis set, per-shell metadata, and
// wavefunction coefficients) for the cuorbitalconstmem kernel.  The
// MAX_*_SZ values are hard capacity limits; datasets that exceed them
// must use one of the global/shared memory kernels instead.
//
#define MAX_ATOM_SZ 256

#define MAX_ATOMPOS_SZ (MAX_ATOM_SZ)
__constant__ static float const_atompos[MAX_ATOMPOS_SZ * 3];          // x,y,z per atom

#define MAX_ATOM_BASIS_SZ (MAX_ATOM_SZ)
__constant__ static int const_atom_basis[MAX_ATOM_BASIS_SZ];          // per-atom start offset into const_basis_array

#define MAX_ATOMSHELL_SZ (MAX_ATOM_SZ)
__constant__ static int const_num_shells_per_atom[MAX_ATOMSHELL_SZ];  // per-atom shell count

#define MAX_BASIS_SZ 6144 
__constant__ static float const_basis_array[MAX_BASIS_SZ];            // exponent/coeff pairs

#define MAX_SHELL_SZ 1024
__constant__ static int const_num_prim_per_shell[MAX_SHELL_SZ];       // primitives per shell
__constant__ static int const_shell_types[MAX_SHELL_SZ];              // S/P/D/F/G type per shell

#define MAX_WAVEF_SZ 6144
__constant__ static float const_wave_f[MAX_WAVEF_SZ];                 // wavefunction coefficients
00164 
00165 
00166 
00167 //
00168 // Only enabled for testing an emulation of JIT code generation approach
00169 //
00170 // #define VMDMOJIT 1
00171 // #define VMDMOJITSRC "/home/johns/mojit.cu"
00172 
00173 //
00174 // If we're testing performance of a JIT kernel, include the code here
00175 //
00176 #if defined(VMDMOJIT) && defined(VMDMOJITSRC)
00177 #include VMDMOJITSRC
00178 #endif
00179 
00180 
00181 
//
// CUDA kernel using constant memory for almost all of the key arrays
// (atom positions, per-atom basis offsets/shell counts, basis set,
// per-shell metadata, wavefunction coefficients).  Only usable when the
// dataset fits within the MAX_*_SZ constant memory limits declared above.
//
// Each thread computes the orbital amplitude (or signed density) for one
// grid point of a single Z slice (grid_z).  NOTE(review): outaddr is not
// bounds-checked, so the host is presumably required to pad the grid X/Y
// dimensions to multiples of the thread block tile size -- confirm at the
// launch site.
//
__global__ static void cuorbitalconstmem(int numatoms, 
                          float voxelsize, 
                          float originx,
                          float originy,
                          float grid_z, 
                          int density,
                          float * orbitalgrid) {
  // map this thread onto its grid point and flattened output index
  unsigned int xindex  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
  float grid_x = originx + voxelsize * xindex;
  float grid_y = originy + voxelsize * yindex;

  // similar to C version
  int at;
  int prim, shell;

  // initialize value of orbital at gridpoint
  float value = 0.0f;

  // initialize the wavefunction and shell counters; both advance
  // monotonically as atoms/shells are consumed, so the per-shell-type
  // coefficient ordering in the switch below must not be reordered
  int ifunc = 0;
  int shell_counter = 0;

  // loop over all the QM atoms
  for (at = 0; at < numatoms; at++) {
    // calculate distance between grid point and center of atom
    int maxshell = const_num_shells_per_atom[at];
    int prim_counter = const_atom_basis[at];  // start of this atom's basis data

    // distances are converted to Bohr to match the basis set exponents
    float xdist = (grid_x - const_atompos[3*at  ])*ANGS_TO_BOHR;
    float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR;
    float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR;

    float xdist2 = xdist*xdist;
    float ydist2 = ydist*ydist;
    float zdist2 = zdist*zdist;

    float dist2 = xdist2 + ydist2 + zdist2;

    // loop over the shells belonging to this atom (or basis function)
    for (shell=0; shell < maxshell; shell++) {
      float contracted_gto = 0.0f;

      // Loop over the Gaussian primitives of this contracted
      // basis function to build the atomic orbital
      int maxprim = const_num_prim_per_shell[shell_counter];
      int shelltype = const_shell_types[shell_counter];
      for (prim=0; prim < maxprim;  prim++) {
        // basis data is stored as (exponent, contraction coeff) pairs
        float exponent       = const_basis_array[prim_counter    ];
        float contract_coeff = const_basis_array[prim_counter + 1];

        // By premultiplying the stored exponent factors etc,
        // we can use exp2f() rather than exp(), giving us full
        // precision, but with the speed of __expf()
        contracted_gto += contract_coeff * exp2f(-exponent*dist2);
        prim_counter += 2;
      }

      /* multiply with the appropriate wavefunction coefficient */
      float tmpshell=0.0f;
      switch (shelltype) {
        case S_SHELL:
          value += const_wave_f[ifunc++] * contracted_gto;
          break;

        case P_SHELL:
          tmpshell += const_wave_f[ifunc++] * xdist;
          tmpshell += const_wave_f[ifunc++] * ydist;
          tmpshell += const_wave_f[ifunc++] * zdist;
          value += tmpshell * contracted_gto;
          break;

        case D_SHELL:
          tmpshell += const_wave_f[ifunc++] * xdist2;
          tmpshell += const_wave_f[ifunc++] * xdist * ydist;
          tmpshell += const_wave_f[ifunc++] * ydist2;
          tmpshell += const_wave_f[ifunc++] * xdist * zdist;
          tmpshell += const_wave_f[ifunc++] * ydist * zdist;
          tmpshell += const_wave_f[ifunc++] * zdist2;
          value += tmpshell * contracted_gto;
          break;

        case F_SHELL:
          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist;
          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist;
          tmpshell += const_wave_f[ifunc++] * ydist2 * xdist;
          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist;
          tmpshell += const_wave_f[ifunc++] * xdist2 * zdist;
          tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist;
          tmpshell += const_wave_f[ifunc++] * ydist2 * zdist;
          tmpshell += const_wave_f[ifunc++] * zdist2 * xdist;
          tmpshell += const_wave_f[ifunc++] * zdist2 * ydist;
          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist;
          value += tmpshell * contracted_gto;
          break;

        case G_SHELL:
          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2; // xxxx
          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2; // xxyy
          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2; // yyyy
          tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
          tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
          tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
          tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
          tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2; // xxzz
          tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
          tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2; // yyzz
          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
          tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2; // zzzz
          value += tmpshell * contracted_gto;
          break;
      } // end switch

      shell_counter++;
    }
  }

  // return either orbital density or orbital wavefunction amplitude;
  // density output preserves the wavefunction's sign via copysignf()
  if (density) {
    orbitalgrid[outaddr] = copysignf(value*value, value);
  } else {
    orbitalgrid[outaddr] = value;
  }
}
00313 
00314 
00315 
00316 //
00317 // CUDA loading from global memory into shared memory for all arrays
00318 //
00319 
00320 // Shared memory tiling parameters:
00321 //   We need a tile size of at least 16 elements for coalesced memory access,
00322 // and it must be the next power of two larger than the largest number of 
00323 // wavefunction coefficients that will be consumed at once for a given 
00324 // shell type.  The current code is designed to handle up to "G" shells 
00325 // (15 coefficients), since that's the largest shell type supported by
00326 // GAMESS, which is the only format we can presently read.
00327 
00328 // The maximum shell coefficient count specifies the largest number
00329 // of coefficients that might have to be read for a single shell,
00330 // e.g. for the highest supported shell type.
00331 // This is a "G" shell coefficient count
00332 #define MAXSHELLCOUNT 15   
00333 
00334 // Mask out the lower N bits of the array index
00335 // to compute which tile to start loading shared memory with.
00336 // This must be large enough to guarantee coalesced addressing, but
00337 // be half or less of the minimum tile size
00338 #define MEMCOAMASK  (~15)
00339 
00340 // The shared memory basis set and wavefunction coefficient arrays 
00341 // must contain a minimum of at least two tiles, but can be any 
00342 // larger multiple thereof which is also a multiple of the thread block size.
00343 // Larger arrays reduce duplicative data loads/fragmentation 
00344 // at the beginning and end of the array
00345 #define SHAREDSIZE 256
00346 
00347 
//
// Kernel that stages the wavefunction and basis coefficient arrays
// through shared memory tiles (see MAXSHELLCOUNT / MEMCOAMASK /
// SHAREDSIZE commentary above), for devices where repeated global
// memory access is expensive.
//
// Packed layouts (established by the indexing below):
//   atominfo:  16 elements per atom (at<<4); [0..2] = x/y/z coords
//              (float view), [3] = basis array start offset, [4] = shell
//              count (int views).
//   shellinfo: 16 ints per shell (shell_counter<<4); [0] = primitive
//              count, [1] = shell type.
// The cooperative tile loads assume a 64-thread (8x8) block, and that
// wave_f/basis_array are padded so reads up to the next SHAREDSIZE
// boundary are legal -- presumably guaranteed by the host setup code.
//
__global__ static void cuorbitaltiledshared(int numatoms, 
                          const float *wave_f,
                          const float *basis_array,
                          const flint *atominfo, 
                          const int *shellinfo, 
                          float voxelsize, 
                          float originx,
                          float originy,
                          float grid_z, 
                          int density, 
                          float * orbitalgrid) {
  // map this thread onto its grid point and flattened output index
  unsigned int xindex  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
  float grid_x = originx + voxelsize * xindex;
  float grid_y = originy + voxelsize * yindex;

  // flat thread index within the block, used for cooperative tile loads
  int sidx = __mul24(threadIdx.y, blockDim.x) + threadIdx.x;

  // preload the first tile of wavefunction coefficients
  __shared__ float s_wave_f[SHAREDSIZE];
  int sblock_wave_f = 0;   // global offset of the resident s_wave_f tile
  s_wave_f[sidx      ] = wave_f[sidx      ];
  s_wave_f[sidx +  64] = wave_f[sidx +  64];
  s_wave_f[sidx + 128] = wave_f[sidx + 128];
  s_wave_f[sidx + 192] = wave_f[sidx + 192];
  __syncthreads();

  // similar to C version
  int at;
  int prim, shell;

  // initialize value of orbital at gridpoint
  float value = 0.0f;

  // initialize the wavefunction and primitive counters
  int ifunc = 0;                // tile-relative index into s_wave_f
  int shell_counter = 0;        // global shell index
  int sblock_prim_counter = -1; // sentinel value to indicate no data loaded
  // loop over all the QM atoms
  for (at = 0; at < numatoms; at++) {
    __shared__ flint s_atominfo[5];
    __shared__ float s_basis_array[SHAREDSIZE];

    // barrier keeps the previous atom's shared data readable until all
    // threads are done with it
    __syncthreads();
    if (sidx < 5)
      s_atominfo[sidx].i = atominfo[(at<<4) + sidx].i;
    __syncthreads();

    int prim_counter = s_atominfo[3].i;   // absolute basis_array offset
    int maxshell     = s_atominfo[4].i;
    int new_sblock_prim_counter = prim_counter & MEMCOAMASK;
    if (sblock_prim_counter != new_sblock_prim_counter) {
      sblock_prim_counter = new_sblock_prim_counter;
      s_basis_array[sidx      ] = basis_array[sblock_prim_counter + sidx      ];
      s_basis_array[sidx +  64] = basis_array[sblock_prim_counter + sidx +  64];
      s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
      s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
      __syncthreads();
    }
    // Rebase prim_counter from an absolute basis_array offset to an index
    // relative to the resident shared memory tile.
    // BUGFIX: this rebasing was previously performed only inside the
    // reload branch above, so when an atom's basis data began in the tile
    // that was already resident (e.g. consecutive atoms sharing the same
    // basis offset), prim_counter remained an absolute offset and indexed
    // s_basis_array at the wrong (potentially out-of-bounds) location
    // whenever the resident tile did not start at offset zero.
    prim_counter -= sblock_prim_counter;

    // calculate distance between grid point and center of atom,
    // converted to Bohr to match the basis set exponents
    float xdist = (grid_x - s_atominfo[0].f)*ANGS_TO_BOHR;
    float ydist = (grid_y - s_atominfo[1].f)*ANGS_TO_BOHR;
    float zdist = (grid_z - s_atominfo[2].f)*ANGS_TO_BOHR;

    float xdist2 = xdist*xdist;
    float ydist2 = ydist*ydist;
    float zdist2 = zdist*zdist;

    float dist2 = xdist2 + ydist2 + zdist2;

    // loop over the shells belonging to this atom (or basis function)
    for (shell=0; shell < maxshell; shell++) {
      float contracted_gto = 0.0f;

      // Loop over the Gaussian primitives of this contracted
      // basis function to build the atomic orbital
      __shared__ int s_shellinfo[2];
     
      __syncthreads();
      if (sidx < 2)
        s_shellinfo[sidx] = shellinfo[(shell_counter<<4) + sidx];
      __syncthreads();

      int maxprim = s_shellinfo[0];
      int shelltype = s_shellinfo[1];

      // refill the basis tile if this shell's primitive pairs would run
      // past the end of the resident tile; the branch predicate is
      // uniform across the block, so the __syncthreads() inside is safe
      if ((prim_counter + (maxprim<<1)) >= SHAREDSIZE) {
        prim_counter += sblock_prim_counter;         // back to absolute
        sblock_prim_counter = prim_counter & MEMCOAMASK;
        s_basis_array[sidx      ] = basis_array[sblock_prim_counter + sidx      ];
        s_basis_array[sidx +  64] = basis_array[sblock_prim_counter + sidx +  64];
        s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
        s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
        prim_counter -= sblock_prim_counter;         // tile-relative again
        __syncthreads();
      } 
      for (prim=0; prim < maxprim;  prim++) {
        float exponent       = s_basis_array[prim_counter    ];
        float contract_coeff = s_basis_array[prim_counter + 1];

        // By premultiplying the stored exponent factors etc,
        // we can use exp2f() rather than exp(), giving us full
        // precision, but with the speed of __expf()
        contracted_gto += contract_coeff * exp2f(-exponent*dist2);

        prim_counter += 2;
      }

      // XXX should use a constant memory lookup table to store
      // shared mem refill constants, and dynamically lookup the
      // number of elements referenced in the next iteration.
      // Refill the wavefunction tile if the largest possible shell
      // (MAXSHELLCOUNT coefficients) could run past the resident tile.
      if ((ifunc + MAXSHELLCOUNT) >= SHAREDSIZE) { 
        ifunc += sblock_wave_f;                      // back to absolute
        sblock_wave_f = ifunc & MEMCOAMASK;
        __syncthreads();
        s_wave_f[sidx      ] = wave_f[sblock_wave_f + sidx      ];
        s_wave_f[sidx +  64] = wave_f[sblock_wave_f + sidx +  64];
        s_wave_f[sidx + 128] = wave_f[sblock_wave_f + sidx + 128];
        s_wave_f[sidx + 192] = wave_f[sblock_wave_f + sidx + 192];
        __syncthreads();
        ifunc -= sblock_wave_f;                      // tile-relative again
      }

      /* multiply with the appropriate wavefunction coefficient */
      float tmpshell=0.0f;
      switch (shelltype) {
        case S_SHELL:
          value += s_wave_f[ifunc++] * contracted_gto;
          break;

        case P_SHELL:
          tmpshell += s_wave_f[ifunc++] * xdist;
          tmpshell += s_wave_f[ifunc++] * ydist;
          tmpshell += s_wave_f[ifunc++] * zdist;
          value += tmpshell * contracted_gto;
          break;

        case D_SHELL:
          tmpshell += s_wave_f[ifunc++] * xdist2;
          tmpshell += s_wave_f[ifunc++] * xdist * ydist;
          tmpshell += s_wave_f[ifunc++] * ydist2;
          tmpshell += s_wave_f[ifunc++] * xdist * zdist;
          tmpshell += s_wave_f[ifunc++] * ydist * zdist;
          tmpshell += s_wave_f[ifunc++] * zdist2;
          value += tmpshell * contracted_gto;
          break;

        case F_SHELL:
          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist;
          tmpshell += s_wave_f[ifunc++] * xdist2 * ydist;
          tmpshell += s_wave_f[ifunc++] * ydist2 * xdist;
          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist;
          tmpshell += s_wave_f[ifunc++] * xdist2 * zdist;
          tmpshell += s_wave_f[ifunc++] * xdist * ydist * zdist;
          tmpshell += s_wave_f[ifunc++] * ydist2 * zdist;
          tmpshell += s_wave_f[ifunc++] * zdist2 * xdist;
          tmpshell += s_wave_f[ifunc++] * zdist2 * ydist;
          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist;
          value += tmpshell * contracted_gto;
          break;

        case G_SHELL:
          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist2; // xxxx
          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
          tmpshell += s_wave_f[ifunc++] * xdist2 * ydist2; // xxyy
          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist2; // yyyy
          tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
          tmpshell += s_wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
          tmpshell += s_wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
          tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
          tmpshell += s_wave_f[ifunc++] * xdist2 * zdist2; // xxzz
          tmpshell += s_wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
          tmpshell += s_wave_f[ifunc++] * ydist2 * zdist2; // yyzz
          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
          tmpshell += s_wave_f[ifunc++] * zdist2 * zdist2; // zzzz
          value += tmpshell * contracted_gto;
          break;
      } // end switch

      shell_counter++;
    }
  }

  // return either orbital density or orbital wavefunction amplitude;
  // density output preserves the wavefunction's sign via copysignf()
  if (density) {
    orbitalgrid[outaddr] = copysignf(value*value, value);
  } else {
    orbitalgrid[outaddr] = value;
  }
}
00542 
00543 
//
// This is a Fermi-specific kernel designed to rely entirely on
// L1 cache for access to various coefficient arrays.  It would perform
// terribly on older devices with compute capability < 2.0, so it should not
// be used on such hardware.
//
// All coefficient arrays are read straight from global memory through
// RESTRICT-qualified pointers, allowing cached read-only loads.
// Packed layouts (established by the indexing below):
//   atominfo:  16 elements per atom (at<<4); [0..2] = x/y/z coords
//              (float view), [3] = basis array start offset, [4] = shell
//              count (int views).
//   shellinfo: 16 ints per shell (shell_counter<<4); [0] = primitive
//              count, [1] = shell type.
// NOTE(review): outaddr is not bounds-checked; the host is presumably
// required to pad the grid X/Y dimensions to tile multiples.
//
__global__ static void cuorbitalcachedglobmem(int numatoms,
                          const float * RESTRICT wave_f,
                          const float * RESTRICT basis_array,
                          const flint * RESTRICT atominfo, 
                          const int * RESTRICT shellinfo, 
                          float voxelsize, 
                          float originx,
                          float originy,
                          float grid_z, 
                          int density, 
                          float *  RESTRICT orbitalgrid) {
  // map this thread onto its grid point and flattened output index
  // (UNROLLX is currently 1, so this matches the other kernels)
  unsigned int xindex  = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;
  float grid_x = originx + voxelsize * xindex;
  float grid_y = originy + voxelsize * yindex;

  // similar to C version
  int at;
  int prim, shell;

  // initialize value of orbital at gridpoint
  float value = 0.0f;

  // initialize the wavefunction and primitive counters; both advance
  // monotonically, so the coefficient ordering in the switch below
  // must not be reordered
  int ifunc = 0;
  int shell_counter = 0;

  // loop over all the QM atoms
  for (at = 0; at < numatoms; at++) {
    // calculate distance between grid point and center of atom,
    // converted to Bohr to match the basis set exponents
    int atidx = at << 4;
    float xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
    float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
    float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;

    int prim_counter = atominfo[atidx + 3].i;  // start of this atom's basis data
    int maxshell     = atominfo[atidx + 4].i;

    float xdist2 = xdist*xdist;
    float ydist2 = ydist*ydist;
    float zdist2 = zdist*zdist;

    float dist2 = xdist2 + ydist2 + zdist2;

    // loop over the shells belonging to this atom (or basis function)
    for (shell=0; shell < maxshell; shell++) {
      float contracted_gto = 0.0f;

      int maxprim    = shellinfo[(shell_counter<<4)    ];
      int shell_type = shellinfo[(shell_counter<<4) + 1];
      for (prim=0; prim < maxprim;  prim++) {
        // basis data is stored as (exponent, contraction coeff) pairs
        float exponent       = basis_array[prim_counter    ];
        float contract_coeff = basis_array[prim_counter + 1];

        // By premultiplying the stored exponent factors etc,
        // we can use exp2f() rather than exp(), giving us full
        // precision, but with the speed of __expf()
        contracted_gto += contract_coeff * exp2f(-exponent*dist2);
        prim_counter += 2;
      }

      /* multiply with the appropriate wavefunction coefficient */
      float tmpshell=0;
      switch (shell_type) {
        case S_SHELL:
          value += wave_f[ifunc++] * contracted_gto;
          break;

        case P_SHELL:
          tmpshell += wave_f[ifunc++] * xdist;
          tmpshell += wave_f[ifunc++] * ydist;
          tmpshell += wave_f[ifunc++] * zdist;
          value += tmpshell * contracted_gto;
          break;

        case D_SHELL:
          tmpshell += wave_f[ifunc++] * xdist2;
          tmpshell += wave_f[ifunc++] * xdist * ydist;
          tmpshell += wave_f[ifunc++] * ydist2;
          tmpshell += wave_f[ifunc++] * xdist * zdist;
          tmpshell += wave_f[ifunc++] * ydist * zdist;
          tmpshell += wave_f[ifunc++] * zdist2;
          value += tmpshell * contracted_gto;
          break;

        case F_SHELL:
          tmpshell += wave_f[ifunc++] * xdist2 * xdist;
          tmpshell += wave_f[ifunc++] * xdist2 * ydist;
          tmpshell += wave_f[ifunc++] * ydist2 * xdist;
          tmpshell += wave_f[ifunc++] * ydist2 * ydist;
          tmpshell += wave_f[ifunc++] * xdist2 * zdist;
          tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
          tmpshell += wave_f[ifunc++] * ydist2 * zdist;
          tmpshell += wave_f[ifunc++] * zdist2 * xdist;
          tmpshell += wave_f[ifunc++] * zdist2 * ydist;
          tmpshell += wave_f[ifunc++] * zdist2 * zdist;
          value += tmpshell * contracted_gto;
          break;

        case G_SHELL:
          tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
          tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
          tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
          tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
          tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
          tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
          tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
          tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
          value += tmpshell * contracted_gto;
          break;
      } // end switch

      shell_counter++;
    }
  }

  // return either orbital density or orbital wavefunction amplitude;
  // density output preserves the wavefunction's sign via copysignf()
  if (density) {
    orbitalgrid[outaddr] = copysignf(value*value, value);
  } else {
    orbitalgrid[outaddr] = value;
  }
}
00681 
00682 
00683 
00684 
00685 
00686 //
00687 // This is an Ampere-specific kernel designed to rely entirely on
00688 // L1 cache for access to various coefficient arrays.  It would perform
00689 // terribly on older devices so it should not be used on such hardware.
00690 //
00691 __global__ static void cuorbitalcachedglobmem_packed(int numatoms,
00692                           const float * RESTRICT wave_f,
00693                           const float2 * RESTRICT basis_array,
00694                           const flint * RESTRICT atominfo,
00695                           const int2 * RESTRICT shellinfo,
00696                           float voxelsize,
00697                           float originx,
00698                           float originy,
00699                           float grid_z,
00700                           int density,
00701                           float *  RESTRICT orbitalgrid) {
00702   unsigned int xindex  = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
00703   unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;
00704   unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;
00705   float grid_x = originx + voxelsize * xindex;
00706   float grid_y = originy + voxelsize * yindex;
00707 
00708   // similar to C version
00709   int at;
00710   int prim, shell;
00711 
00712   // initialize value of orbital at gridpoint
00713   float value = 0.0f;
00714 
00715   // initialize the wavefunction and primitive counters
00716   int ifunc = 0;
00717   int shell_counter = 0;
00718 
00719   // loop over all the QM atoms
00720   for (at = 0; at < numatoms; at++) {
00721     // calculate distance between grid point and center of atom
00722     int atidx = at << 4;
00723     float xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
00724     float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
00725     float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
00726 
00727     int prim_counter = atominfo[atidx + 3].i >> 1;
00728     int maxshell     = atominfo[atidx + 4].i;
00729 
00730     float xdist2 = xdist*xdist;
00731     float ydist2 = ydist*ydist;
00732     float zdist2 = zdist*zdist;
00733 
00734     float dist2 = xdist2 + ydist2 + zdist2;
00735 
00736     // loop over the shells belonging to this atom (or basis function)
00737     for (shell=0; shell < maxshell; shell++) {
00738       int2 tmpshellinfo = shellinfo[shell_counter];
00739       int maxprim    = tmpshellinfo.x;
00740       int shell_type = tmpshellinfo.y;
00741 
00742       float contracted_gto = 0.0f;
00743       for (prim=prim_counter; prim < maxprim;  prim++) {
00744         float2 tmpbasis = basis_array[prim];
00745         float exponent       = tmpbasis.x;
00746         float contract_coeff = tmpbasis.y;
00747 
00748         // By premultiplying the stored exponent factors etc,
00749         // we can use exp2f() rather than exp(), giving us full
00750         // precision, but with the speed of __expf()
00751         contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00752       }
00753 
00754       /* multiply with the appropriate wavefunction coefficient */
00755       float tmpshell=0;
00756       switch (shell_type) {
00757         case S_SHELL:
00758           value += wave_f[ifunc++] * contracted_gto;
00759           break;
00760 
00761         case P_SHELL:
00762           tmpshell += wave_f[ifunc++] * xdist;
00763           tmpshell += wave_f[ifunc++] * ydist;
00764           tmpshell += wave_f[ifunc++] * zdist;
00765           value += tmpshell * contracted_gto;
00766           break;
00767 
00768         case D_SHELL:
00769           tmpshell += wave_f[ifunc++] * xdist2;
00770           tmpshell += wave_f[ifunc++] * xdist * ydist;
00771           tmpshell += wave_f[ifunc++] * ydist2;
00772           tmpshell += wave_f[ifunc++] * xdist * zdist;
00773           tmpshell += wave_f[ifunc++] * ydist * zdist;
00774           tmpshell += wave_f[ifunc++] * zdist2;
00775           value += tmpshell * contracted_gto;
00776           break;
00777 
00778         case F_SHELL:
00779           tmpshell += wave_f[ifunc++] * xdist2 * xdist;
00780           tmpshell += wave_f[ifunc++] * xdist2 * ydist;
00781           tmpshell += wave_f[ifunc++] * ydist2 * xdist;
00782           tmpshell += wave_f[ifunc++] * ydist2 * ydist;
00783           tmpshell += wave_f[ifunc++] * xdist2 * zdist;
00784           tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
00785           tmpshell += wave_f[ifunc++] * ydist2 * zdist;
00786           tmpshell += wave_f[ifunc++] * zdist2 * xdist;
00787           tmpshell += wave_f[ifunc++] * zdist2 * ydist;
00788           tmpshell += wave_f[ifunc++] * zdist2 * zdist;
00789           value += tmpshell * contracted_gto;
00790           break;
00791 
00792         case G_SHELL:
00793           tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
00794           tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
00795           tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
00796           tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
00797           tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
00798           tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
00799           tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
00800           tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
00801           tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
00802           tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
00803           tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
00804           tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
00805           tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
00806           tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
00807           tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
00808           value += tmpshell * contracted_gto;
00809           break;
00810       } // end switch
00811 
00812       shell_counter++;
00813     }
00814   }
00815 
00816   // return either orbital density or orbital wavefunction amplitude
00817   if (density) {
00818     orbitalgrid[outaddr] = copysignf(value*value, value);
00819   } else {
00820     orbitalgrid[outaddr] = value;
00821   }
00822 }
00823 
00824 
00825 
00826 //
00827 // float2 vector operators
00828 //
// Broadcast a single scalar into both components of a float2.
inline __host__ __device__ float2 make_float2(const float s) {
  float2 v;
  v.x = v.y = s;
  return v;
}
00832 
// Component-wise sum of two float2 values.
inline __host__ __device__ float2 operator+(const float2& a, const float2& b) {
  float2 r;
  r.x = a.x + b.x;
  r.y = a.y + b.y;
  return r;
}
00836 
// In-place addition of a scalar to both components of a float2.
inline __host__ __device__ void operator+=(float2& a, const float& b) {
  a.x = a.x + b;
  a.y = a.y + b;
}
00840 
// In-place component-wise addition of two float2 values.
inline __host__ __device__ void operator+=(float2& a, const float2& b) {
  a.x = a.x + b.x;
  a.y = a.y + b.y;
}
00844 
// Subtract a scalar from both components of a float2.
inline __host__ __device__ float2 operator-(const float2& a, const float s) {
  float2 r;
  r.x = a.x - s;
  r.y = a.y - s;
  return r;
}
00848 
// Component-wise product of two float2 values.
inline __host__ __device__ float2 operator*(const float2& a, const float2& b) {
  float2 r;
  r.x = a.x * b.x;
  r.y = a.y * b.y;
  return r;
}
00852 
// Scale a float2 by a scalar (scalar on the left).
inline __host__ __device__ float2 operator*(const float s, const float2& a) {
  float2 r;
  r.x = a.x * s;
  r.y = a.y * s;
  return r;
}
00856 
// Scale a float2 by a scalar (scalar on the right).
inline __host__ __device__ float2 operator*(const float2& a, const float s) {
  float2 r;
  r.x = a.x * s;
  r.y = a.y * s;
  return r;
}
00860 
00861 
00862 
//
// cuorbitalcachedglobmem_packed_2x()
// Evaluate a molecular orbital wavefunction amplitude (or, when density != 0,
// its signed squared amplitude) on one z plane of a uniform grid.  Each
// thread computes two x-adjacent voxels and writes them with a single
// packed float2 store.
//
// This is an Ampere-specific kernel designed to rely entirely on
// L1 cache for access to various coefficient arrays.  It would perform
// terribly on older devices so it should not be used on such hardware.
//
// Data layout (established by the host setup code later in this file):
//   - atominfo: 16 flint entries per atom; [0..2] x/y/z position (float),
//     [3] basis-array start index (int), [4] shell count (int), rest padding
//   - basis_array: float2 (exponent, contraction coefficient) pairs, with
//     exponents premultiplied by log2(e) so exp2f() computes exp()
//   - shellinfo: one int2 per shell; .x is the primitive-loop bound
//     (NOTE(review): the kernel uses .x as an absolute end index into
//     basis_array, while the visible packed host setup stores per-shell
//     primitive counts -- verify against the actual launch path), and
//     .y is the shell type (S/P/D/F/G)
//   - NOTE(review): the float2 output implies this kernel expects
//     UNROLLX == 2 -- confirm against the launch configuration
//
__global__ static void cuorbitalcachedglobmem_packed_2x(int numatoms,
                          const float * RESTRICT wave_f,
                          const float2 * RESTRICT basis_array,
                          const flint * RESTRICT atominfo,
                          const int2 * RESTRICT shellinfo,
                          float voxelsize,
                          float originx,
                          float originy,
                          float grid_z,
                          int density,
                          float2 *  RESTRICT orbitalgrid) {
  unsigned int xindex  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;

  // outaddr is for a float2, so no unroll factor here
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
  // x coordinates of the two voxels handled by this thread
  float2 grid_x = make_float2(originx) +
                    voxelsize * make_float2(UNROLLX * xindex,
                                            UNROLLX * xindex + 1);
  float grid_y = originy + voxelsize * yindex;

  // similar to C version
  int at;
  int prim, shell;

  // initialize value of orbital at gridpoint
  float2 value = make_float2(0.0f);

  // initialize the wavefunction and primitive counters
  int ifunc = 0;
  int shell_counter = 0;

  // loop over all the QM atoms
  for (at = 0; at < numatoms; at++) {
    // calculate distance between grid point and center of atom
    int atidx = at << 4;  // 16 flint entries per atom in the tiled array
    float2 xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
    float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
    float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;

    // atom_basis indices are stored in float units; halve to index the
    // float2-packed basis array
    int prim_counter = atominfo[atidx + 3].i >> 1;
    int maxshell     = atominfo[atidx + 4].i;

    float2 xdist2 = xdist*xdist;
    float ydist2 = ydist*ydist;
    float zdist2 = zdist*zdist;

    // squared distance to the atom, per-voxel (x varies, y/z shared)
    float2 dist2 = xdist2 + make_float2(ydist2 + zdist2);

    // loop over the shells belonging to this atom (or basis function)
    for (shell=0; shell < maxshell; shell++) {
      int2 tmpshellinfo = shellinfo[shell_counter];
      int maxprim    = tmpshellinfo.x;
      int shell_type = tmpshellinfo.y;

      // sum of contracted Gaussian primitives for this shell
      float2 contracted_gto = make_float2(0.0f);
      for (prim=prim_counter; prim < maxprim;  prim++) {
        float2 tmpbasis = basis_array[prim];
        float exponent       = tmpbasis.x;
        float contract_coeff = tmpbasis.y;

        // By premultiplying the stored exponent factors etc,
        // we can use exp2f() rather than exp(), giving us full
        // precision, but with the speed of __expf()
        float2 negexpdist2 = -exponent*dist2;
        float2 expval;
        expval.x = exp2f(negexpdist2.x);
        expval.y = exp2f(negexpdist2.y);
        contracted_gto += contract_coeff * expval;
      }

      /* multiply with the appropriate wavefunction coefficient */
      float2 tmpshell = make_float2(0.0f);
      switch (shell_type) {
        case S_SHELL:
          value += wave_f[ifunc++] * contracted_gto;
          break;

        case P_SHELL:  // 3 Cartesian p components
          tmpshell += wave_f[ifunc++] * xdist;
          tmpshell += wave_f[ifunc++] * ydist;
          tmpshell += wave_f[ifunc++] * zdist;
          value += tmpshell * contracted_gto;
          break;

        case D_SHELL:  // 6 Cartesian d components
          tmpshell += wave_f[ifunc++] * xdist2;
          tmpshell += wave_f[ifunc++] * xdist * ydist;
          tmpshell += wave_f[ifunc++] * ydist2;
          tmpshell += wave_f[ifunc++] * xdist * zdist;
          tmpshell += wave_f[ifunc++] * ydist * zdist;
          tmpshell += wave_f[ifunc++] * zdist2;
          value += tmpshell * contracted_gto;
          break;

        case F_SHELL:  // 10 Cartesian f components
          tmpshell += wave_f[ifunc++] * xdist2 * xdist;
          tmpshell += wave_f[ifunc++] * xdist2 * ydist;
          tmpshell += wave_f[ifunc++] * ydist2 * xdist;
          tmpshell += wave_f[ifunc++] * ydist2 * ydist;
          tmpshell += wave_f[ifunc++] * xdist2 * zdist;
          tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
          tmpshell += wave_f[ifunc++] * ydist2 * zdist;
          tmpshell += wave_f[ifunc++] * zdist2 * xdist;
          tmpshell += wave_f[ifunc++] * zdist2 * ydist;
          tmpshell += wave_f[ifunc++] * zdist2 * zdist;
          value += tmpshell * contracted_gto;
          break;

        case G_SHELL:  // 15 Cartesian g components
          tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
          tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
          tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
          tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
          tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
          tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
          tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
          tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
          value += tmpshell * contracted_gto;
          break;
      } // end switch

      shell_counter++;
    }
  }

  // return either orbital density or orbital wavefunction amplitude;
  // the density form is the signed square, preserving wavefunction phase
  if (density) {
    float2 tmpout;
    tmpout.x = copysignf(value.x*value.x, value.x);
    tmpout.y = copysignf(value.y*value.y, value.y);
    orbitalgrid[outaddr] = tmpout;
  } else {
    orbitalgrid[outaddr] = value;
  }
}
01011 
01012 
01013 
01014 //
01015 // float4 vector operators
01016 //
// Broadcast a single scalar into all four components of a float4.
inline __host__ __device__ float4 make_float4(const float s) {
  float4 v;
  v.x = v.y = v.z = v.w = s;
  return v;
}
01020 
// Component-wise sum of two float4 values.
inline __host__ __device__ float4 operator+(const float4& a, const float4& b) {
  float4 r;
  r.x = a.x + b.x;
  r.y = a.y + b.y;
  r.z = a.z + b.z;
  r.w = a.w + b.w;
  return r;
}
01024 
// Build a float4 from a float3 plus a fourth scalar component.
inline __host__ __device__ float4 make_float4(const float3 &a, const float &b) {
  float4 r;
  r.x = a.x;
  r.y = a.y;
  r.z = a.z;
  r.w = b;
  return r;
}
01028 
// In-place addition of a scalar to all four components of a float4.
inline __host__ __device__ void operator+=(float4& a, const float& b) {
  a.x = a.x + b;
  a.y = a.y + b;
  a.z = a.z + b;
  a.w = a.w + b;
}
01032 
// In-place component-wise addition of two float4 values.
inline __host__ __device__ void operator+=(float4& a, const float4& b) {
  a.x = a.x + b.x;
  a.y = a.y + b.y;
  a.z = a.z + b.z;
  a.w = a.w + b.w;
}
01036 
// Subtract a scalar from all four components of a float4.
inline __host__ __device__ float4 operator-(const float4& a, const float s) {
  float4 r;
  r.x = a.x - s;
  r.y = a.y - s;
  r.z = a.z - s;
  r.w = a.w - s;
  return r;
}
01040 
// In-place scaling of a float4 by a scalar.
inline __host__ __device__ void operator*=(float4& a, const float &b) {
  a.x = a.x * b;
  a.y = a.y * b;
  a.z = a.z * b;
  a.w = a.w * b;
}
01044 
// Component-wise product of two float4 values.
inline __host__ __device__ float4 operator*(const float4& a, const float4& b) {
  float4 r;
  r.x = a.x * b.x;
  r.y = a.y * b.y;
  r.z = a.z * b.z;
  r.w = a.w * b.w;
  return r;
}
01048 
// Scale a float4 by a scalar (scalar on the right).
inline __host__ __device__ float4 operator*(const float4& a, const float s) {
  float4 r;
  r.x = a.x * s;
  r.y = a.y * s;
  r.z = a.z * s;
  r.w = a.w * s;
  return r;
}
01052 
// Scale a float4 by a scalar (scalar on the left).
inline __host__ __device__ float4 operator*(const float s, const float4& a) {
  float4 r;
  r.x = a.x * s;
  r.y = a.y * s;
  r.z = a.z * s;
  r.w = a.w * s;
  return r;
}
01056 
01057 
//
// cuorbitalcachedglobmem_packed_4x_3D()
// Evaluate a molecular orbital wavefunction amplitude (or, when density != 0,
// its signed squared amplitude) on one or more z planes of a uniform grid.
// Each thread computes four x-adjacent voxels and writes them with a single
// packed float4 store.
//
// This is an Ampere-specific kernel designed to rely entirely on
// L1 cache for access to various coefficient arrays.  It would perform
// terribly on older devices so it should not be used on such hardware.
//
// 3-D launch semantics: grid_z is the z coordinate of the zindex==0 plane;
// successive planes are voxelsize apart (assumes cubic voxels -- matches the
// single voxelsize parameter) and are stored back-to-back in orbitalgrid.
// A 2-D launch (gridDim.z == 1) behaves identically to the 2-D kernels.
//
__global__ static void cuorbitalcachedglobmem_packed_4x_3D(int numatoms,
                          const float * RESTRICT wave_f,
                          const float2 * RESTRICT basis_array,
                          const flint * RESTRICT atominfo,
                          const int2 * RESTRICT shellinfo,
                          float voxelsize,
                          float originx,
                          float originy,
                          float grid_z,
                          int density,
                          float4 *  RESTRICT orbitalgrid) {
  unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int zindex = blockIdx.z * blockDim.z + threadIdx.z;

  // outaddr is for a float4, so no x unroll factor here.
  // Bug fix: the original code computed zindex but never used it, so for
  // launches with gridDim.z > 1 every z slice evaluated the same plane and
  // wrote the same output addresses.  Fold zindex into both the output
  // address and the plane z coordinate; 2-D launches (zindex == 0) see
  // exactly the original behavior.
  unsigned int planesz = gridDim.x * blockDim.x * gridDim.y * blockDim.y;
  unsigned int outaddr = planesz * zindex +
                         gridDim.x * blockDim.x * yindex + xindex;
  const int unrollxidx = UNROLLX * xindex;
  float4 grid_x = make_float4(originx) +
                    voxelsize * make_float4(unrollxidx,
                                            unrollxidx + 1,
                                            unrollxidx + 2,
                                            unrollxidx + 3);
  float grid_y = originy + voxelsize * yindex;
  // advance to this thread's z plane
  grid_z += voxelsize * zindex;

  // similar to C version
  int at;
  int prim, shell;

  // initialize value of orbital at gridpoint
  float4 value = make_float4(0.0f);

  // initialize the wavefunction and primitive counters
  int ifunc = 0;
  int shell_counter = 0;

  // loop over all the QM atoms
  for (at = 0; at < numatoms; at++) {
    // calculate distance between grid point and center of atom
    int atidx = at << 4;  // 16 flint entries per atom in the tiled array
    float4 xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
    float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
    float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;

    // atom_basis indices are stored in float units; halve to index the
    // float2-packed basis array
    int prim_counter = atominfo[atidx + 3].i >> 1;
    int maxshell     = atominfo[atidx + 4].i;

    float4 xdist2 = xdist*xdist;
    float ydist2 = ydist*ydist;
    float zdist2 = zdist*zdist;

    // squared distance to the atom, per-voxel (x varies, y/z shared)
    float4 dist2 = xdist2 + make_float4(ydist2 + zdist2);

    // loop over the shells belonging to this atom (or basis function)
    for (shell=0; shell < maxshell; shell++) {
      int2 tmpshellinfo = shellinfo[shell_counter];
      int maxprim    = tmpshellinfo.x;
      int shell_type = tmpshellinfo.y;

      // sum of contracted Gaussian primitives for this shell
      float4 contracted_gto = make_float4(0.0f);
      for (prim=prim_counter; prim < maxprim;  prim++) {
        float2 tmpbasis = basis_array[prim];
        float exponent       = tmpbasis.x;
        float contract_coeff = tmpbasis.y;

        // By premultiplying the stored exponent factors etc,
        // we can use exp2f() rather than exp(), giving us full
        // precision, but with the speed of __expf()
        float4 negexpdist2 = -exponent*dist2;
        float4 expval;
        expval.x = exp2f(negexpdist2.x);
        expval.y = exp2f(negexpdist2.y);
        expval.z = exp2f(negexpdist2.z);
        expval.w = exp2f(negexpdist2.w);
        contracted_gto += contract_coeff * expval;
      }

      /* multiply with the appropriate wavefunction coefficient */
      float4 tmpshell = make_float4(0.0f);
      switch (shell_type) {
        case S_SHELL:
          value += wave_f[ifunc++] * contracted_gto;
          break;

        case P_SHELL:  // 3 Cartesian p components
          tmpshell += wave_f[ifunc++] * xdist;
          tmpshell += wave_f[ifunc++] * ydist;
          tmpshell += wave_f[ifunc++] * zdist;
          value += tmpshell * contracted_gto;
          break;

        case D_SHELL:  // 6 Cartesian d components
          tmpshell += wave_f[ifunc++] * xdist2;
          tmpshell += wave_f[ifunc++] * xdist * ydist;
          tmpshell += wave_f[ifunc++] * ydist2;
          tmpshell += wave_f[ifunc++] * xdist * zdist;
          tmpshell += wave_f[ifunc++] * ydist * zdist;
          tmpshell += wave_f[ifunc++] * zdist2;
          value += tmpshell * contracted_gto;
          break;

        case F_SHELL:  // 10 Cartesian f components
          tmpshell += wave_f[ifunc++] * xdist2 * xdist;
          tmpshell += wave_f[ifunc++] * xdist2 * ydist;
          tmpshell += wave_f[ifunc++] * ydist2 * xdist;
          tmpshell += wave_f[ifunc++] * ydist2 * ydist;
          tmpshell += wave_f[ifunc++] * xdist2 * zdist;
          tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
          tmpshell += wave_f[ifunc++] * ydist2 * zdist;
          tmpshell += wave_f[ifunc++] * zdist2 * xdist;
          tmpshell += wave_f[ifunc++] * zdist2 * ydist;
          tmpshell += wave_f[ifunc++] * zdist2 * zdist;
          value += tmpshell * contracted_gto;
          break;

        case G_SHELL:  // 15 Cartesian g components
          tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
          tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
          tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
          tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
          tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
          tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
          tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
          tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
          value += tmpshell * contracted_gto;
          break;
      } // end switch

      shell_counter++;
    }
  }

  // return either orbital density or orbital wavefunction amplitude;
  // the density form is the signed square, preserving wavefunction phase
  if (density) {
    float4 tmpout;
    tmpout.x = copysignf(value.x*value.x, value.x);
    tmpout.y = copysignf(value.y*value.y, value.y);
    tmpout.z = copysignf(value.z*value.z, value.z);
    tmpout.w = copysignf(value.w*value.w, value.w);
    orbitalgrid[outaddr] = tmpout;
  } else {
    orbitalgrid[outaddr] = value;
  }
}
01214 
01215 
01216 
//
// cuorbitalcachedglobmem_packed_4x()
// Evaluate a molecular orbital wavefunction amplitude (or, when density != 0,
// its signed squared amplitude) on one z plane of a uniform grid.  Each
// thread computes four x-adjacent voxels and writes them with a single
// packed float4 store.
//
// This is an Ampere-specific kernel designed to rely entirely on
// L1 cache for access to various coefficient arrays.  It would perform
// terribly on older devices so it should not be used on such hardware.
//
// Data layout (established by the host setup code later in this file):
//   - atominfo: 16 flint entries per atom; [0..2] x/y/z position (float),
//     [3] basis-array start index (int), [4] shell count (int), rest padding
//   - basis_array: float2 (exponent, contraction coefficient) pairs, with
//     exponents premultiplied by log2(e) so exp2f() computes exp()
//   - shellinfo: one int2 per shell; .x is the primitive-loop bound
//     (NOTE(review): the kernel uses .x as an absolute end index into
//     basis_array, while the visible packed host setup stores per-shell
//     primitive counts -- verify against the actual launch path), and
//     .y is the shell type (S/P/D/F/G)
//   - NOTE(review): the float4 output implies this kernel expects
//     UNROLLX == 4 -- confirm against the launch configuration
//
__global__ static void cuorbitalcachedglobmem_packed_4x(int numatoms,
                          const float * RESTRICT wave_f,
                          const float2 * RESTRICT basis_array,
                          const flint * RESTRICT atominfo,
                          const int2 * RESTRICT shellinfo,
                          float voxelsize,
                          float originx,
                          float originy,
                          float grid_z,
                          int density,
                          float4 *  RESTRICT orbitalgrid) {
  unsigned int xindex  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex  = blockIdx.y * blockDim.y + threadIdx.y;

  // outaddr is for a float4, so no unroll factor here
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
  // x coordinates of the four voxels handled by this thread
  float4 grid_x = make_float4(originx) +
                    voxelsize * make_float4(UNROLLX * xindex,
                                            UNROLLX * xindex + 1,
                                            UNROLLX * xindex + 2,
                                            UNROLLX * xindex + 3);
  float grid_y = originy + voxelsize * yindex;

  // similar to C version
  int at;
  int prim, shell;

  // initialize value of orbital at gridpoint
  float4 value = make_float4(0.0f);

  // initialize the wavefunction and primitive counters
  int ifunc = 0;
  int shell_counter = 0;

  // loop over all the QM atoms
  for (at = 0; at < numatoms; at++) {
    // calculate distance between grid point and center of atom
    int atidx = at << 4;  // 16 flint entries per atom in the tiled array
    float4 xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
    float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
    float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;

    // atom_basis indices are stored in float units; halve to index the
    // float2-packed basis array
    int prim_counter = atominfo[atidx + 3].i >> 1;
    int maxshell     = atominfo[atidx + 4].i;

    float4 xdist2 = xdist*xdist;
    float ydist2 = ydist*ydist;
    float zdist2 = zdist*zdist;

    // squared distance to the atom, per-voxel (x varies, y/z shared)
    float4 dist2 = xdist2 + make_float4(ydist2 + zdist2);

    // loop over the shells belonging to this atom (or basis function)
    for (shell=0; shell < maxshell; shell++) {
      int2 tmpshellinfo = shellinfo[shell_counter];
      int maxprim    = tmpshellinfo.x;
      int shell_type = tmpshellinfo.y;

      // sum of contracted Gaussian primitives for this shell
      float4 contracted_gto = make_float4(0.0f);
      for (prim=prim_counter; prim < maxprim;  prim++) {
        float2 tmpbasis = basis_array[prim];
        float exponent       = tmpbasis.x;
        float contract_coeff = tmpbasis.y;

        // By premultiplying the stored exponent factors etc,
        // we can use exp2f() rather than exp(), giving us full
        // precision, but with the speed of __expf()
        float4 negexpdist2 = -exponent*dist2;
        float4 expval;
        expval.x = exp2f(negexpdist2.x);
        expval.y = exp2f(negexpdist2.y);
        expval.z = exp2f(negexpdist2.z);
        expval.w = exp2f(negexpdist2.w);
        contracted_gto += contract_coeff * expval;
      }

      /* multiply with the appropriate wavefunction coefficient */
      float4 tmpshell = make_float4(0.0f);
      switch (shell_type) {
        case S_SHELL:
          value += wave_f[ifunc++] * contracted_gto;
          break;

        case P_SHELL:  // 3 Cartesian p components
          tmpshell += wave_f[ifunc++] * xdist;
          tmpshell += wave_f[ifunc++] * ydist;
          tmpshell += wave_f[ifunc++] * zdist;
          value += tmpshell * contracted_gto;
          break;

        case D_SHELL:  // 6 Cartesian d components
          tmpshell += wave_f[ifunc++] * xdist2;
          tmpshell += wave_f[ifunc++] * xdist * ydist;
          tmpshell += wave_f[ifunc++] * ydist2;
          tmpshell += wave_f[ifunc++] * xdist * zdist;
          tmpshell += wave_f[ifunc++] * ydist * zdist;
          tmpshell += wave_f[ifunc++] * zdist2;
          value += tmpshell * contracted_gto;
          break;

        case F_SHELL:  // 10 Cartesian f components
          tmpshell += wave_f[ifunc++] * xdist2 * xdist;
          tmpshell += wave_f[ifunc++] * xdist2 * ydist;
          tmpshell += wave_f[ifunc++] * ydist2 * xdist;
          tmpshell += wave_f[ifunc++] * ydist2 * ydist;
          tmpshell += wave_f[ifunc++] * xdist2 * zdist;
          tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
          tmpshell += wave_f[ifunc++] * ydist2 * zdist;
          tmpshell += wave_f[ifunc++] * zdist2 * xdist;
          tmpshell += wave_f[ifunc++] * zdist2 * ydist;
          tmpshell += wave_f[ifunc++] * zdist2 * zdist;
          value += tmpshell * contracted_gto;
          break;

        case G_SHELL:  // 15 Cartesian g components
          tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
          tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
          tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
          tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
          tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
          tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
          tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
          tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
          tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
          tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
          tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
          tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
          value += tmpshell * contracted_gto;
          break;
      } // end switch

      shell_counter++;
    }
  }

  // return either orbital density or orbital wavefunction amplitude;
  // the density form is the signed square, preserving wavefunction phase
  if (density) {
    float4 tmpout;
    tmpout.x = copysignf(value.x*value.x, value.x);
    tmpout.y = copysignf(value.y*value.y, value.y);
    tmpout.z = copysignf(value.z*value.z, value.z);
    tmpout.w = copysignf(value.w*value.w, value.w);
    orbitalgrid[outaddr] = tmpout;
  } else {
    orbitalgrid[outaddr] = value;
  }
}
01371 
01372 
01373 
01374 
// Round orig up to the next multiple of tilesize.
// tilesize must be a power of two for the bitmask arithmetic to hold.
static int computepaddedsize(int orig, int tilesize) {
  const int mask = tilesize - 1;
  if ((orig & mask) == 0)
    return orig;                 // already aligned
  return (orig | mask) + 1;      // fill low bits, bump to next multiple
}
01381 
01382 static void * cudaorbitalthread(void *voidparms) {
01383   dim3 volsize, Gsz, Bsz;
01384   float *d_wave_f = NULL;
01385   float *d_basis_array = NULL;
01386   flint *d_atominfo = NULL;
01387   int *d_shellinfo = NULL;
01388   int *d_shellinfo_packed = NULL;
01389   float *d_origin = NULL;
01390   int *d_numvoxels = NULL;
01391   float *d_orbitalgrid = NULL;
01392   float *h_orbitalgrid = NULL;
01393   float *h_basis_array_exp2f = NULL;
01394   int numvoxels[3];
01395   float origin[3];
01396   orbthrparms *parms = NULL;
01397   int hwpreferscachekernel=0;
01398   int usefastconstkernel=0;
01399   int usecachekernel=0;
01400   int threadid=0;
01401 #if defined(USE_PINNED_MEMORY)
01402   int h_orbitalgrid_pinnedalloc=0;
01403 #endif
01404 #if defined(USE_ZERO_COPY)
01405   int h_orbitalgrid_zerocopy=0;
01406 #endif
01407 
01408   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
01409   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
01410 
01411   // query hardware to favor algorithms accordingly
01412   cudaDeviceProp deviceProp;
01413   memset(&deviceProp, 0, sizeof(cudaDeviceProp));
01414 
01415   int dev=0;
01416   if ((cudaGetDevice(&dev) == cudaSuccess) &&
01417       (cudaGetDeviceProperties(&deviceProp, dev) == cudaSuccess)) {
01418     cudaError_t err = cudaGetLastError(); // eat error so next CUDA op succeeds
01419 
01420     if (deviceProp.major >= 7) 
01421       hwpreferscachekernel=7; // Volta hardware prefers the cache kernel
01422 
01423     // allow overrides for testing purposes
01424     if (getenv("VMDFORCEMOPACKED") != NULL) {
01425       if (deviceProp.major >= 8) 
01426         hwpreferscachekernel=8; // Ampere hardware prefers the cache kernel
01427     }
01428   }
01429 
01430 
01431   // default tile size to use in absence of other info
01432   int maxtilesize=4; // GTX 280, Tesla C1060 starting point tile size
01433 
01434   // scale tile size by device performance
01435   wkf_threadpool_worker_devscaletile(voidparms, &maxtilesize);
01436 
01437   numvoxels[0] = parms->numvoxels[0];
01438   numvoxels[1] = parms->numvoxels[1];
01439   numvoxels[2] = 1;
01440 
01441   origin[0] = parms->origin[0];
01442   origin[1] = parms->origin[1];
01443 
01444   // setup energy grid size, padding out arrays for peak GPU memory performance
01445   volsize.x = (parms->numvoxels[0] + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
01446   volsize.y = (parms->numvoxels[1] + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
01447   volsize.z = 1;      // we only do one plane at a time
01448  
01449   // setup CUDA grid and block sizes
01450   Bsz.x = BLOCKSIZEX;
01451   Bsz.y = BLOCKSIZEY;
01452   Bsz.z = 1;
01453   Gsz.x = volsize.x / (Bsz.x * UNROLLX);
01454   Gsz.y = volsize.y / (Bsz.y * UNROLLY);
01455   Gsz.z = 1;
01456   int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
01457 
01458   // determine which runtime strategy is workable
01459   // given the data sizes involved
01460   if (hwpreferscachekernel) {
01461     usecachekernel=1;
01462   } else if ((parms->num_wave_f < MAX_WAVEF_SZ) &&
01463              (parms->numatoms < MAX_ATOM_SZ) &&
01464              (parms->numatoms < MAX_ATOMSHELL_SZ) &&
01465              (2*parms->num_basis < MAX_BASIS_SZ) &&
01466              (parms->num_shells < MAX_SHELL_SZ)) {
01467     usefastconstkernel=1;
01468   }
01469 
01470   // allow overrides for testing purposes
01471   if (getenv("VMDFORCEMOCONSTMEM") != NULL) {
01472     usefastconstkernel=1; 
01473     usecachekernel=0;
01474   }
01475   if (getenv("VMDFORCEMOTILEDSHARED") != NULL) {
01476     usefastconstkernel=0; 
01477     usecachekernel=0;
01478   }
01479   if (getenv("VMDFORCEMOL1CACHE") != NULL) {
01480     usefastconstkernel=0; 
01481     usecachekernel=1;
01482   }
01483 
01484   // allocate and copy input data to GPU arrays
01485   int padsz;
01486   padsz = computepaddedsize(2 * parms->num_basis, MEMCOALESCE);
01487   h_basis_array_exp2f = (float *) malloc(padsz * sizeof(float));
01488   float log2e = log2f(2.718281828f);
01489   for (int ll=0; ll<(2*parms->num_basis); ll+=2) {
01490 #if 1
01491     // use exp2f() rather than expf()
01492     h_basis_array_exp2f[ll  ] = parms->basis_array[ll  ] * log2e;
01493 #else
01494     h_basis_array_exp2f[ll  ] = parms->basis_array[ll  ];
01495 #endif
01496     h_basis_array_exp2f[ll+1] = parms->basis_array[ll+1];
01497   }
01498 
01499   if (usefastconstkernel) {
01500     cudaMemcpyToSymbol(const_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), 0);
01501     cudaMemcpyToSymbol(const_atompos, parms->atompos, 3 * parms->numatoms * sizeof(float), 0);
01502     cudaMemcpyToSymbol(const_atom_basis, parms->atom_basis, parms->numatoms * sizeof(int), 0);
01503     cudaMemcpyToSymbol(const_num_shells_per_atom, parms->num_shells_per_atom, parms->numatoms * sizeof(int), 0);
01504     cudaMemcpyToSymbol(const_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), 0);
01505     cudaMemcpyToSymbol(const_num_prim_per_shell, parms->num_prim_per_shell, parms->num_shells * sizeof(int), 0);
01506     cudaMemcpyToSymbol(const_shell_types, parms->shell_types, parms->num_shells * sizeof(int), 0);
01507   } else {
01508     padsz = computepaddedsize(parms->num_wave_f, MEMCOALESCE);
01509     cudaMalloc((void**)&d_wave_f, padsz * sizeof(float));
01510     cudaMemcpy(d_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), cudaMemcpyHostToDevice);
01511 
01512     // pack atom data into a tiled array
01513     padsz = computepaddedsize(16 * parms->numatoms, MEMCOALESCE);
01514     flint * h_atominfo = (flint *) calloc(1, padsz * sizeof(flint));
01515     cudaMalloc((void**)&d_atominfo, padsz * sizeof(flint));
01516     for (int ll=0; ll<parms->numatoms; ll++) {
01517       int addr = ll * 16;
01518       h_atominfo[addr    ].f = parms->atompos[ll*3    ];
01519       h_atominfo[addr + 1].f = parms->atompos[ll*3 + 1];
01520       h_atominfo[addr + 2].f = parms->atompos[ll*3 + 2];
01521       h_atominfo[addr + 3].i = parms->atom_basis[ll];
01522       h_atominfo[addr + 4].i = parms->num_shells_per_atom[ll];
01523     }
01524     cudaMemcpy(d_atominfo, h_atominfo, padsz * sizeof(flint), cudaMemcpyHostToDevice);
01525     free(h_atominfo);
01526 
01527     padsz = computepaddedsize(16 * parms->num_shells, MEMCOALESCE);
01528     int * h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
01529     cudaMalloc((void**)&d_shellinfo, padsz * sizeof(int));
01530     for (int ll=0; ll<parms->num_shells; ll++) {
01531       h_shellinfo[ll*16    ] = parms->num_prim_per_shell[ll];
01532       h_shellinfo[ll*16 + 1] = parms->shell_types[ll];
01533     }
01534     cudaMemcpy(d_shellinfo, h_shellinfo, padsz * sizeof(int), cudaMemcpyHostToDevice);
01535     free(h_shellinfo);
01536 
01537 
01538 #if 1
01539     //
01540     // Packed shellinfo array for latest Volta/Ampere GPUs
01541     //
01542     padsz = computepaddedsize(2 * parms->num_shells, MEMCOALESCE);
01543     h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
01544     cudaMalloc((void**)&d_shellinfo_packed, padsz * sizeof(int));
01545     for (int ll=0; ll<parms->num_shells; ll++) {
01546       h_shellinfo[ll*2    ] = parms->num_prim_per_shell[ll];
01547       h_shellinfo[ll*2 + 1] = parms->shell_types[ll];
01548     }
01549     cudaMemcpy(d_shellinfo_packed, h_shellinfo, padsz * sizeof(int), cudaMemcpyHostToDevice);
01550     free(h_shellinfo);
01551 #endif
01552 
01553 
01554     cudaMalloc((void**)&d_basis_array, padsz * sizeof(float));
01555     cudaMemcpy(d_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), cudaMemcpyHostToDevice);
01556 
01557     padsz = computepaddedsize(3, MEMCOALESCE);
01558     cudaMalloc((void**)&d_origin, padsz * sizeof(float));
01559     cudaMemcpy(d_origin, origin, 3 * sizeof(float), cudaMemcpyHostToDevice);
01560 
01561     cudaMalloc((void**)&d_numvoxels, padsz * sizeof(int));
01562     cudaMemcpy(d_numvoxels, numvoxels, 3 * sizeof(int), cudaMemcpyHostToDevice);
01563   }
01564 
01565 
01566 #if defined(USE_ZERO_COPY)
01567   if ((getenv("VMDCUDANOZEROCOPY") == NULL) && 
01568       (cudaHostAlloc((void **) &h_orbitalgrid, volmemsz, cudaHostAllocMapped) == cudaSuccess)) {
01569     h_orbitalgrid_zerocopy=1;
01570     cudaHostGetDevicePointer(&d_orbitalgrid, h_orbitalgrid, 0);
01571     CUERR // check and clear any existing errors
01572   } else {
01573     printf("WARNING: CUDA zero-copy pinned memory allocation failed!\n"); 
01574 #else
01575     // allocate and initialize the GPU output array
01576     cudaMalloc((void**)&d_orbitalgrid, volmemsz);
01577     CUERR // check and clear any existing errors
01578 #endif
01579 #if defined(USE_PINNED_MEMORY)
01580     // try to allocate working buffer in pinned host memory
01581     if ((getenv("VMDCUDANOPINNEDMEMORY") == NULL) && 
01582         (cudaMallocHost((void **) &h_orbitalgrid, volmemsz) == cudaSuccess)) {
01583       h_orbitalgrid_pinnedalloc=1;
01584     } else {
01585       printf("WARNING: CUDA pinned memory allocation failed!\n"); 
01586       h_orbitalgrid_pinnedalloc=0;
01587       h_orbitalgrid = (float *) malloc(volmemsz);
01588     } 
01589 #else
01590     // allocate working buffer
01591     h_orbitalgrid = (float *) malloc(volmemsz);
01592 #endif
01593 #if defined(USE_ZERO_COPY)
01594   }
01595 #endif
01596 
01597 #if 0
01598   if (threadid == 0) {
01599     // inform on which kernel we're actually going to run
01600     printf("atoms[%d] ", parms->numatoms);
01601     printf("wavef[%d] ", parms->num_wave_f);
01602     printf("basis[%d] ", parms->num_basis);
01603     printf("shell[%d] ", parms->num_shells);
01604     if (usefastconstkernel) {
01605 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
01606       printf("GPU constant memory (JIT)");
01607 #else
01608       printf("GPU constant memory");
01609 #endif
01610     } else {
01611       printf("GPU tiled shared memory:");
01612     }
01613     printf(" Gsz:%dx%d\n", Gsz.x, Gsz.y);
01614   }
01615 #endif
01616 
01617 #if 0
01618   if (usefastconstkernel) {
01619     printf("Using constant memory\n");
01620   } else if (!usefastconstkernel && usecachekernel) {
01621     printf("Using Fermi/Kepler L1 cache\n");
01622   }  else if (!usefastconstkernel) {
01623     printf("Using tiled shared memory kernel\n");
01624   }
01625 #endif
01626 
01627   // loop over orbital planes
01628   wkf_tasktile_t tile;
01629   while (wkf_threadpool_next_tile(voidparms, maxtilesize, &tile) != WKF_SCHED_DONE) {
01630     for (int k=tile.start; k<tile.end; k++) {
01631       origin[2] = parms->origin[2] + parms->voxelsize * k;
01632 
01633       // RUN the kernel...
01634       if (usefastconstkernel) {
01635 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
01636         cuorbitalconstmem_jit<<<Gsz, Bsz, 0>>>(parms->numatoms, 
01637                               parms->voxelsize,
01638                               origin[0], origin[1], origin[2],
01639                               parms->density, d_orbitalgrid);
01640 #else
01641         cuorbitalconstmem<<<Gsz, Bsz, 0>>>(parms->numatoms, 
01642                           parms->voxelsize,
01643                           origin[0], origin[1], origin[2],
01644                           parms->density, d_orbitalgrid);
01645 #endif
01646       } else {
01647         if (usecachekernel) {
01648           // Ampere hardware prefers the "packed" cache kernel
01649           if (hwpreferscachekernel >= 8) {
01650 #if 0
01651             // 3-D grid launch with all planes at once...
01652             int ktilesize = tile.end - tile.start;
01653             Gsz.z = ktilesize; // fetch dynamically assigned tile size
01654             cuorbitalcachedglobmem_packed_4x_3D<<<Gsz, Bsz, 0>>>(parms->numatoms,
01655                                  d_wave_f, (float2 *) d_basis_array,
01656                                  d_atominfo, (int2 *) d_shellinfo_packed,
01657                                  parms->voxelsize,
01658                                  origin[0], origin[1], origin[2],
01659                                  parms->density, (float4 *) d_orbitalgrid);
01660             k = tile.end; // set counter for completion of all iterations
01661 #elif 1
01662             cuorbitalcachedglobmem_packed_4x<<<Gsz, Bsz, 0>>>(parms->numatoms,
01663                                  d_wave_f, (float2 *) d_basis_array,
01664                                  d_atominfo, (int2 *) d_shellinfo_packed,
01665                                  parms->voxelsize,
01666                                  origin[0], origin[1], origin[2],
01667                                  parms->density, (float4 *) d_orbitalgrid);
01668 #elif 1
01669             cuorbitalcachedglobmem_packed_2x<<<Gsz, Bsz, 0>>>(parms->numatoms,
01670                                  d_wave_f, (float2 *) d_basis_array,
01671                                  d_atominfo, (int2 *) d_shellinfo_packed,
01672                                  parms->voxelsize,
01673                                  origin[0], origin[1], origin[2],
01674                                  parms->density, (float2 *) d_orbitalgrid);
01675 #else
01676             cuorbitalcachedglobmem_packed<<<Gsz, Bsz, 0>>>(parms->numatoms,
01677                                  d_wave_f, (float2 *) d_basis_array,
01678                                  d_atominfo, (int2 *) d_shellinfo_packed,
01679                                  parms->voxelsize,
01680                                  origin[0], origin[1], origin[2],
01681                                  parms->density, d_orbitalgrid);
01682 #endif
01683           } else {
01684             // This is a Fermi/Kepler-specific kernel and should only be used
01685             // on devices with compute capability >= 2.0
01686             cuorbitalcachedglobmem<<<Gsz, Bsz, 0>>>(parms->numatoms, 
01687                                  d_wave_f, d_basis_array,
01688                                  d_atominfo, d_shellinfo,
01689                                  parms->voxelsize,
01690                                  origin[0], origin[1], origin[2],
01691                                  parms->density, d_orbitalgrid);
01692           }
01693         } else {
01694           // Use shared memory as a software-managed cache with a 
01695           // high performance tiling scheme
01696           cuorbitaltiledshared<<<Gsz, Bsz, 0>>>(parms->numatoms, 
01697                                d_wave_f, d_basis_array,
01698                                d_atominfo, d_shellinfo,
01699                                parms->voxelsize,
01700                                origin[0], origin[1], origin[2],
01701                                parms->density, d_orbitalgrid);
01702         }
01703       }
01704 
01705 #if defined(USE_ZERO_COPY)
01706       if (h_orbitalgrid_zerocopy) {
01707         cudaDeviceSynchronize();
01708       } else {
01709 #endif
01710         CUERR // check and clear any existing errors
01711 
01712         // Copy the GPU output data back to the host and use/store it..
01713         cudaMemcpy(h_orbitalgrid, d_orbitalgrid, volmemsz, cudaMemcpyDeviceToHost);
01714         CUERR // check and clear any existing errors
01715 #if defined(USE_ZERO_COPY)
01716       }
01717 #endif
01718 
01719       // Copy GPU blocksize padded array back down to the original size
01720       int planesz = numvoxels[0] * numvoxels[1];
01721       for (int y=0; y<numvoxels[1]; y++) {
01722         long orbaddr = k*planesz + y*numvoxels[0];
01723         memcpy(&parms->orbitalgrid[orbaddr], &h_orbitalgrid[y*volsize.x], numvoxels[0] * sizeof(float));
01724       }
01725     }
01726     cudaDeviceSynchronize();
01727   }
01728 
01729   free(h_basis_array_exp2f);
01730 
01731 #if defined(USE_ZERO_COPY)
01732   if (h_orbitalgrid_zerocopy) {
01733     cudaFreeHost(h_orbitalgrid);
01734   } else {
01735 #endif
01736 #if defined(USE_PINNED_MEMORY)
01737     if (h_orbitalgrid_pinnedalloc)
01738       cudaFreeHost(h_orbitalgrid);
01739     else
01740       free(h_orbitalgrid);
01741 #else
01742     free(h_orbitalgrid);
01743 #endif
01744     cudaFree(d_orbitalgrid);
01745 #if defined(USE_ZERO_COPY)
01746   }
01747 #endif
01748 
01749   if (!usefastconstkernel) {
01750     cudaFree(d_wave_f);
01751     cudaFree(d_basis_array);
01752     cudaFree(d_atominfo);
01753     cudaFree(d_shellinfo);
01754     cudaFree(d_numvoxels);
01755     cudaFree(d_origin);
01756   }
01757 
01758   CUERR // check and clear any existing errors
01759 
01760   return NULL;
01761 }
01762 
01763 
// Host-side entry point: evaluate a molecular orbital on a regularly
// spaced voxel grid using every GPU in the caller-provided device pool.
// The heavy lifting is done by cudaorbitalthread() worker threads, which
// dynamically claim tiles of Z planes from the shared scheduler.
//
// Parameters mirror the orbthrparms fields consumed by the workers:
//   devpool      - worker thread pool bound to the available CUDA devices
//   numatoms     - number of atoms contributing basis functions
//   wave_f       - wavefunction coefficient array (num_wave_f entries)
//   basis_array  - packed (exponent, contraction coeff) basis pairs
//   atompos      - atom coordinates, 3 floats per atom
//   atom_basis   - per-atom index into basis_array
//   num_shells_per_atom, num_prim_per_shell, shell_types, num_shells -
//                  shell topology of the basis set
//   numvoxels    - grid dimensions (x, y, z)
//   voxelsize    - lattice spacing
//   origin       - grid origin coordinates
//   density      - if nonzero, compute electron density (orbital squared)
//   orbitalgrid  - output buffer for the computed grid values
//
// Returns 0 on success, -1 if no device pool exists.
int vmd_cuda_evaluate_orbital_grid(wkf_threadpool_t *devpool,
                       int numatoms,
                       const float *wave_f, int num_wave_f,
                       const float *basis_array, int num_basis,
                       const float *atompos,
                       const int *atom_basis,
                       const int *num_shells_per_atom,
                       const int *num_prim_per_shell,
                       const int *shell_types,
                       int num_shells,
                       const int *numvoxels,
                       float voxelsize,
                       const float *origin,
                       int density,
                       float *orbitalgrid) {
  int rc=0;
  orbthrparms parms;

  if (devpool == NULL) {
    return -1; // abort if no device pool exists
  }

#if 0
  /* XXX hackish method of supporting standalone runs outside of VMD... */
  /* init device context and settings */
  if (getenv("VMDCUDADEV")) {
    rc=cudaSetDevice(atoi(getenv("VMDCUDADEV")));
  } else {
    rc=cudaSetDevice(0);
  }

  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError(); // query last error and reset error state
    if (rc != cudaErrorSetOnActiveProcess)
      return -1; // abort and return an error
                 // (fixed: was 'return NULL' in an int-returning function)
#else
    cudaGetLastError(); // just ignore and reset error state, since older CUDA
                        // revs don't have a cudaErrorSetOnActiveProcess enum
#endif
  }

  /* set blocking/yielding API behavior and enable mapped host memory */
  cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost);

#if CUDART_VERSION >= 3000
  /* we prefer to have a large 48kB L1 cache for the Fermi cache kernel */
  cudaFuncSetCacheConfig(cuorbitalcachedglobmem, cudaFuncCachePreferL1);

  /* we prefer to have a large 48kB shared mem for the tiled kernel */
  cudaFuncSetCacheConfig(cuorbitaltiledshared, cudaFuncCachePreferShared);
#endif
#endif

#if defined(VMDMOJITSRC)
  // When JIT code generation is requested, emit specialized kernel source
  // for this particular basis set instead of evaluating the grid.
  if (getenv("VMDMOJIT") != NULL) {
    // generate CUDA kernel
    orbital_jit_generate(ORBITAL_JIT_CUDA, getenv("VMDMOJITSRCFILE"), 
                         numatoms, wave_f, basis_array, atom_basis, 
                         num_shells_per_atom, num_prim_per_shell, shell_types);

    // generate OpenCL kernel
    orbital_jit_generate(ORBITAL_JIT_OPENCL, "/tmp/mojit.cl", 
                         numatoms, wave_f, basis_array, atom_basis, 
                         num_shells_per_atom, num_prim_per_shell, shell_types);
    return 0;
  }
#endif

  // package all worker-thread inputs into a single parameter struct
  parms.numatoms = numatoms;
  parms.wave_f = wave_f;
  parms.num_wave_f = num_wave_f;
  parms.basis_array = basis_array;
  parms.num_basis = num_basis;
  parms.atompos = atompos;
  parms.atom_basis = atom_basis;
  parms.num_shells_per_atom = num_shells_per_atom;
  parms.num_prim_per_shell = num_prim_per_shell;
  parms.shell_types = shell_types;
  parms.num_shells = num_shells;
  parms.numvoxels = numvoxels;
  parms.voxelsize = voxelsize;
  parms.origin = origin;
  parms.density = density;
  parms.orbitalgrid = orbitalgrid;

  // schedule the full range of Z planes as dynamically-assigned tiles,
  // then run the GPU worker threads to completion (blocking launch)
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=numvoxels[2];
  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_threadpool_launch(devpool, cudaorbitalthread, &parms, 1); 

  return rc;
}
01858 
01859 

Generated on Tue Apr 23 04:22:57 2024 for VMD (current) by doxygen 1.2.14, written by Dimitri van Heesch, © 1997-2002