Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDAOrbital.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2008-2009 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAOrbital.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.109 $      $Date: 2010/06/10 20:34:27 $
00014  *
00015  ***************************************************************************
00016  * DESCRIPTION:
00017  *   This source file contains the CUDA kernels for computing molecular
00018  *  orbital amplitudes on a uniformly spaced grid, using one or more GPUs.
00019  *
00020  ***************************************************************************/
00021 
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <math.h>
00026 #include <cuda.h>
00027 
00028 #include "WKFThreads.h"
00029 #include "CUDAKernels.h"
00030 #include "OrbitalJIT.h"
00031 
00032 // Multi-GPU pinned memory bugs seem to have been fixed as of CUDA 2.3 drivers
00033 #if CUDART_VERSION >= 2030
00034 #define USE_PINNED_MEMORY 1
00035 // #define USE_ZERO_COPY 1
00036 // #define USE_FERMI_CACHE 1
00037 #endif
00038 
00039 #if 1
00040 #define CUERR { cudaError_t err; \
00041   if ((err = cudaGetLastError()) != cudaSuccess) { \
00042   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00043   printf("Thread aborting...\n"); \
00044   return NULL; }}
00045 #else
00046 #define CUERR
00047 #endif
00048 
00049 #define ANGS_TO_BOHR 1.8897259877218677f
00050 
00051 typedef union flint_t {
00052   float f;
00053   int i;
00054 } flint;
00055 
00056 typedef struct {
00057   int numatoms;
00058   const float *wave_f; 
00059   int num_wave_f;
00060   const float *basis_array;
00061   int num_basis;
00062   const float *atompos;
00063   const int *atom_basis;
00064   const int *num_shells_per_atom;
00065   const int *num_prim_per_shell;
00066   const int *shell_types;
00067   int num_shells;
00068   const int *numvoxels;
00069   float voxelsize;
00070   const float *origin;
00071   int density;
00072   float *orbitalgrid;
00073 } orbthrparms;
00074 
00075 /* thread prototype */
00076 static void * cudaorbitalthread(void *);
00077 
00078 // GPU block layout
00079 #define UNROLLX       1
00080 #define UNROLLY       1
00081 #define BLOCKSIZEX    8 
00082 #define BLOCKSIZEY    8
00083 #define BLOCKSIZE     BLOCKSIZEX * BLOCKSIZEY
00084 
00085 // required GPU array padding to match thread block size
00086 #define TILESIZEX BLOCKSIZEX*UNROLLX
00087 #define TILESIZEY BLOCKSIZEY*UNROLLY
00088 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00089 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00090 
00091 #define MEMCOALESCE  384
00092 
00093 // orbital shell types 
00094 #define S_SHELL 0
00095 #define P_SHELL 1
00096 #define D_SHELL 2
00097 #define F_SHELL 3
00098 #define G_SHELL 4
00099 #define H_SHELL 5
00100 
00101 //
00102 // Constant arrays to store 
00103 //
00104 #define MAX_ATOM_SZ 256
00105 
00106 #define MAX_ATOMPOS_SZ (MAX_ATOM_SZ)
00107 __constant__ static float const_atompos[MAX_ATOMPOS_SZ * 3];
00108 
00109 #define MAX_ATOM_BASIS_SZ (MAX_ATOM_SZ)
00110 __constant__ static int const_atom_basis[MAX_ATOM_BASIS_SZ];
00111 
00112 #define MAX_ATOMSHELL_SZ (MAX_ATOM_SZ)
00113 __constant__ static int const_num_shells_per_atom[MAX_ATOMSHELL_SZ];
00114 
00115 #define MAX_BASIS_SZ 6144 
00116 __constant__ static float const_basis_array[MAX_BASIS_SZ];
00117 
00118 #define MAX_SHELL_SZ 1024
00119 __constant__ static int const_num_prim_per_shell[MAX_SHELL_SZ];
00120 __constant__ static int const_shell_types[MAX_SHELL_SZ];
00121 
00122 #define MAX_WAVEF_SZ 6144
00123 __constant__ static float const_wave_f[MAX_WAVEF_SZ];
00124 
00125 
00126 
00127 //
00128 // Only enabled for testing an emulation of JIT code generation approach
00129 //
00130 // #define VMDMOJIT 1
00131 // #define VMDMOJITSRC "/home/johns/mojit.cu"
00132 
00133 //
00134 // If we're testing performance of a JIT kernel, include the code here
00135 //
00136 #if defined(VMDMOJIT) && defined(VMDMOJITSRC)
00137 #include VMDMOJITSRC
00138 #endif
00139 
00140 
00141 
00142 //
00143 // CUDA using const memory for almost all of the key arrays
00144 //
00145 __global__ static void cuorbitalconstmem(int numatoms, 
00146                           float voxelsize, 
00147                           float originx,
00148                           float originy,
00149                           float grid_z, 
00150                           int density,
00151                           float * orbitalgrid) {
00152   unsigned int xindex  = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
00153   unsigned int yindex  = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00154   unsigned int outaddr = __umul24(gridDim.x, blockDim.x) * yindex + xindex;
00155   float grid_x = originx + voxelsize * xindex;
00156   float grid_y = originy + voxelsize * yindex;
00157 
00158   // similar to C version
00159   int at;
00160   int prim, shell;
00161 
00162   // initialize value of orbital at gridpoint
00163   float value = 0.0f;
00164 
00165   // initialize the wavefunction and shell counters
00166   int ifunc = 0;
00167   int shell_counter = 0;
00168 
00169   // loop over all the QM atoms
00170   for (at = 0; at < numatoms; at++) {
00171     // calculate distance between grid point and center of atom
00172     int maxshell = const_num_shells_per_atom[at];
00173     int prim_counter = const_atom_basis[at];
00174 
00175     float xdist = (grid_x - const_atompos[3*at  ])*ANGS_TO_BOHR;
00176     float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR;
00177     float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR;
00178 
00179     float xdist2 = xdist*xdist;
00180     float ydist2 = ydist*ydist;
00181     float zdist2 = zdist*zdist;
00182 
00183     float dist2 = xdist2 + ydist2 + zdist2;
00184 
00185     // loop over the shells belonging to this atom (or basis function)
00186     for (shell=0; shell < maxshell; shell++) {
00187       float contracted_gto = 0.0f;
00188 
00189       // Loop over the Gaussian primitives of this contracted
00190       // basis function to build the atomic orbital
00191       int maxprim = const_num_prim_per_shell[shell_counter];
00192       int shelltype = const_shell_types[shell_counter];
00193       for (prim=0; prim < maxprim;  prim++) {
00194         float exponent       = const_basis_array[prim_counter    ];
00195         float contract_coeff = const_basis_array[prim_counter + 1];
00196 
00197         // By premultiplying the stored exponent factors etc,
00198         // we can use exp2f() rather than exp(), giving us full
00199         // precision, but with the speed of __expf()
00200         contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00201         prim_counter += 2;
00202       }
00203 
00204       /* multiply with the appropriate wavefunction coefficient */
00205       float tmpshell=0.0f;
00206       switch (shelltype) {
00207         case S_SHELL:
00208           value += const_wave_f[ifunc++] * contracted_gto;
00209           break;
00210 
00211         case P_SHELL:
00212           tmpshell += const_wave_f[ifunc++] * xdist;
00213           tmpshell += const_wave_f[ifunc++] * ydist;
00214           tmpshell += const_wave_f[ifunc++] * zdist;
00215           value += tmpshell * contracted_gto;
00216           break;
00217 
00218         case D_SHELL:
00219           tmpshell += const_wave_f[ifunc++] * xdist2;
00220           tmpshell += const_wave_f[ifunc++] * xdist * ydist;
00221           tmpshell += const_wave_f[ifunc++] * ydist2;
00222           tmpshell += const_wave_f[ifunc++] * xdist * zdist;
00223           tmpshell += const_wave_f[ifunc++] * ydist * zdist;
00224           tmpshell += const_wave_f[ifunc++] * zdist2;
00225           value += tmpshell * contracted_gto;
00226           break;
00227 
00228         case F_SHELL:
00229           tmpshell += const_wave_f[ifunc++] * xdist2 * xdist;
00230           tmpshell += const_wave_f[ifunc++] * xdist2 * ydist;
00231           tmpshell += const_wave_f[ifunc++] * ydist2 * xdist;
00232           tmpshell += const_wave_f[ifunc++] * ydist2 * ydist;
00233           tmpshell += const_wave_f[ifunc++] * xdist2 * zdist;
00234           tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist;
00235           tmpshell += const_wave_f[ifunc++] * ydist2 * zdist;
00236           tmpshell += const_wave_f[ifunc++] * zdist2 * xdist;
00237           tmpshell += const_wave_f[ifunc++] * zdist2 * ydist;
00238           tmpshell += const_wave_f[ifunc++] * zdist2 * zdist;
00239           value += tmpshell * contracted_gto;
00240           break;
00241 
00242         case G_SHELL:
00243           tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2; // xxxx
00244           tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
00245           tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2; // xxyy
00246           tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
00247           tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2; // yyyy
00248           tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
00249           tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
00250           tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
00251           tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
00252           tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2; // xxzz
00253           tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
00254           tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2; // yyzz
00255           tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
00256           tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
00257           tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2; // zzzz
00258           value += tmpshell * contracted_gto;
00259           break;
00260       } // end switch
00261 
00262       shell_counter++;
00263     }
00264   }
00265 
00266   // return either orbital density or orbital wavefunction amplitude
00267   if (density) {
00268     orbitalgrid[outaddr] = copysignf(value*value, value);
00269   } else {
00270     orbitalgrid[outaddr] = value;
00271   }
00272 }
00273 
00274 
00275 
00276 //
00277 // CUDA loading from global memory into shared memory for all arrays
00278 //
00279 
00280 // Shared memory tiling parameters:
00281 //   We need a tile size of at least 16 elements for coalesced memory access,
00282 // and it must be the next power of two larger than the largest number of 
00283 // wavefunction coefficients that will be consumed at once for a given 
00284 // shell type.  The current code is designed to handle up to "G" shells 
00285 // (15 coefficients), since that's the largest shell type supported by
00286 // GAMESS, which is the only format we can presently read.
00287 
00288 // The maximum shell coefficient count specifies the largest number
00289 // of coefficients that might have to be read for a single shell,
00290 // e.g. for the highest supported shell type.
00291 // This is a "G" shell coefficient count
00292 #define MAXSHELLCOUNT 15   
00293 
00294 // Mask out the lower N bits of the array index
00295 // to compute which tile to start loading shared memory with.
00296 // This must be large enough to gaurantee coalesced addressing, but
00297 // be half or less of the minimum tile size
00298 #define MEMCOAMASK  (~15)
00299 
00300 // The shared memory basis set and wavefunction coefficient arrays 
00301 // must contain a minimum of at least two tiles, but can be any 
00302 // larger multiple thereof which is also a multiple of the thread block size.
00303 // Larger arrays reduce duplicative data loads/fragmentation 
00304 // at the beginning and end of the array
00305 #define SHAREDSIZE 256
00306 
00307 
00308 __global__ static void cuorbitaltiledshared(int numatoms, 
00309                           const float *wave_f,
00310                           const float *basis_array,
00311                           const flint *atominfo, 
00312                           const int *shellinfo, 
00313                           float voxelsize, 
00314                           float originx,
00315                           float originy,
00316                           float grid_z, 
00317                           int density, 
00318                           float * orbitalgrid) {
00319   unsigned int xindex  = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
00320   unsigned int yindex  = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00321   unsigned int outaddr = __umul24(gridDim.x, blockDim.x) * yindex + xindex;
00322   float grid_x = originx + voxelsize * xindex;
00323   float grid_y = originy + voxelsize * yindex;
00324 
00325   int sidx = __mul24(threadIdx.y, blockDim.x) + threadIdx.x;
00326 
00327   __shared__ float s_wave_f[SHAREDSIZE];
00328   int sblock_wave_f = 0;
00329   s_wave_f[sidx      ] = wave_f[sidx      ];
00330   s_wave_f[sidx +  64] = wave_f[sidx +  64];
00331   s_wave_f[sidx + 128] = wave_f[sidx + 128];
00332   s_wave_f[sidx + 192] = wave_f[sidx + 192];
00333   __syncthreads();
00334 
00335   // similar to C version
00336   int at;
00337   int prim, shell;
00338 
00339   // initialize value of orbital at gridpoint
00340   float value = 0.0f;
00341 
00342   // initialize the wavefunction and primitive counters
00343   int ifunc = 0;
00344   int shell_counter = 0;
00345   int sblock_prim_counter = -1; // sentinel value to indicate no data loaded
00346   // loop over all the QM atoms
00347   for (at = 0; at < numatoms; at++) {
00348     __shared__ flint s_atominfo[5];
00349     __shared__ float s_basis_array[SHAREDSIZE];
00350 
00351     __syncthreads();
00352     if (sidx < 5)
00353       s_atominfo[sidx].i = atominfo[(at<<4) + sidx].i;
00354     __syncthreads();
00355 
00356     int prim_counter = s_atominfo[3].i;
00357     int maxshell     = s_atominfo[4].i;
00358     int new_sblock_prim_counter = prim_counter & MEMCOAMASK;
00359     if (sblock_prim_counter != new_sblock_prim_counter) {
00360       sblock_prim_counter = new_sblock_prim_counter;
00361       s_basis_array[sidx      ] = basis_array[sblock_prim_counter + sidx      ];
00362       s_basis_array[sidx +  64] = basis_array[sblock_prim_counter + sidx +  64];
00363       s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
00364       s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
00365       prim_counter -= sblock_prim_counter;
00366       __syncthreads();
00367     }
00368 
00369     // calculate distance between grid point and center of atom
00370     float xdist = (grid_x - s_atominfo[0].f)*ANGS_TO_BOHR;
00371     float ydist = (grid_y - s_atominfo[1].f)*ANGS_TO_BOHR;
00372     float zdist = (grid_z - s_atominfo[2].f)*ANGS_TO_BOHR;
00373 
00374     float xdist2 = xdist*xdist;
00375     float ydist2 = ydist*ydist;
00376     float zdist2 = zdist*zdist;
00377 
00378     float dist2 = xdist2 + ydist2 + zdist2;
00379 
00380     // loop over the shells belonging to this atom (or basis function)
00381     for (shell=0; shell < maxshell; shell++) {
00382       float contracted_gto = 0.0f;
00383 
00384       // Loop over the Gaussian primitives of this contracted
00385       // basis function to build the atomic orbital
00386       __shared__ int s_shellinfo[2];
00387      
00388       __syncthreads();
00389       if (sidx < 2)
00390         s_shellinfo[sidx] = shellinfo[(shell_counter<<4) + sidx];
00391       __syncthreads();
00392 
00393       int maxprim = s_shellinfo[0];
00394       int shelltype = s_shellinfo[1];
00395 
00396       if ((prim_counter + (maxprim<<1)) >= SHAREDSIZE) {
00397         prim_counter += sblock_prim_counter;
00398         sblock_prim_counter = prim_counter & MEMCOAMASK;
00399         s_basis_array[sidx      ] = basis_array[sblock_prim_counter + sidx      ];
00400         s_basis_array[sidx +  64] = basis_array[sblock_prim_counter + sidx +  64];
00401         s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
00402         s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
00403         prim_counter -= sblock_prim_counter;
00404         __syncthreads();
00405       } 
00406       for (prim=0; prim < maxprim;  prim++) {
00407         float exponent       = s_basis_array[prim_counter    ];
00408         float contract_coeff = s_basis_array[prim_counter + 1];
00409 
00410         // By premultiplying the stored exponent factors etc,
00411         // we can use exp2f() rather than exp(), giving us full
00412         // precision, but with the speed of __expf()
00413         contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00414 
00415         prim_counter += 2;
00416       }
00417 
00418       // XXX should use a constant memory lookup table to store
00419       // shared mem refill constants, and dynamically lookup the
00420       // number of elements referenced in the next iteration.
00421       if ((ifunc + MAXSHELLCOUNT) >= SHAREDSIZE) { 
00422         ifunc += sblock_wave_f;
00423         sblock_wave_f = ifunc & MEMCOAMASK;
00424         __syncthreads();
00425         s_wave_f[sidx      ] = wave_f[sblock_wave_f + sidx      ];
00426         s_wave_f[sidx +  64] = wave_f[sblock_wave_f + sidx +  64];
00427         s_wave_f[sidx + 128] = wave_f[sblock_wave_f + sidx + 128];
00428         s_wave_f[sidx + 192] = wave_f[sblock_wave_f + sidx + 192];
00429         __syncthreads();
00430         ifunc -= sblock_wave_f;
00431       }
00432 
00433       /* multiply with the appropriate wavefunction coefficient */
00434       float tmpshell=0.0f;
00435       switch (shelltype) {
00436         case S_SHELL:
00437           value += s_wave_f[ifunc++] * contracted_gto;
00438           break;
00439 
00440         case P_SHELL:
00441           tmpshell += s_wave_f[ifunc++] * xdist;
00442           tmpshell += s_wave_f[ifunc++] * ydist;
00443           tmpshell += s_wave_f[ifunc++] * zdist;
00444           value += tmpshell * contracted_gto;
00445           break;
00446 
00447         case D_SHELL:
00448           tmpshell += s_wave_f[ifunc++] * xdist2;
00449           tmpshell += s_wave_f[ifunc++] * xdist * ydist;
00450           tmpshell += s_wave_f[ifunc++] * ydist2;
00451           tmpshell += s_wave_f[ifunc++] * xdist * zdist;
00452           tmpshell += s_wave_f[ifunc++] * ydist * zdist;
00453           tmpshell += s_wave_f[ifunc++] * zdist2;
00454           value += tmpshell * contracted_gto;
00455           break;
00456 
00457         case F_SHELL:
00458           tmpshell += s_wave_f[ifunc++] * xdist2 * xdist;
00459           tmpshell += s_wave_f[ifunc++] * xdist2 * ydist;
00460           tmpshell += s_wave_f[ifunc++] * ydist2 * xdist;
00461           tmpshell += s_wave_f[ifunc++] * ydist2 * ydist;
00462           tmpshell += s_wave_f[ifunc++] * xdist2 * zdist;
00463           tmpshell += s_wave_f[ifunc++] * xdist * ydist * zdist;
00464           tmpshell += s_wave_f[ifunc++] * ydist2 * zdist;
00465           tmpshell += s_wave_f[ifunc++] * zdist2 * xdist;
00466           tmpshell += s_wave_f[ifunc++] * zdist2 * ydist;
00467           tmpshell += s_wave_f[ifunc++] * zdist2 * zdist;
00468           value += tmpshell * contracted_gto;
00469           break;
00470 
00471         case G_SHELL:
00472           tmpshell += s_wave_f[ifunc++] * xdist2 * xdist2; // xxxx
00473           tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
00474           tmpshell += s_wave_f[ifunc++] * xdist2 * ydist2; // xxyy
00475           tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
00476           tmpshell += s_wave_f[ifunc++] * ydist2 * ydist2; // yyyy
00477           tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
00478           tmpshell += s_wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
00479           tmpshell += s_wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
00480           tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
00481           tmpshell += s_wave_f[ifunc++] * xdist2 * zdist2; // xxzz
00482           tmpshell += s_wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
00483           tmpshell += s_wave_f[ifunc++] * ydist2 * zdist2; // yyzz
00484           tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
00485           tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
00486           tmpshell += s_wave_f[ifunc++] * zdist2 * zdist2; // zzzz
00487           value += tmpshell * contracted_gto;
00488           break;
00489       } // end switch
00490 
00491       shell_counter++;
00492     }
00493   }
00494 
00495   // return either orbital density or orbital wavefunction amplitude
00496   if (density) {
00497     orbitalgrid[outaddr] = copysignf(value*value, value);
00498   } else {
00499     orbitalgrid[outaddr] = value;
00500   }
00501 }
00502 
00503 
00504 //
00505 // This is a Fermi-specific kernel designed to rely entirely on
00506 // L1 cache for access to various coefficient arrays.  It would perform
00507 // terribly on older devices with compute capability < 2.0, so it should not
00508 // be used on such hardware.
00509 //
00510 __global__ static void cuorbitalcachedglobmem(int numatoms,
00511                           const float *wave_f,
00512                           const float *basis_array,
00513                           const flint *atominfo, 
00514                           const int *shellinfo, 
00515                           float voxelsize, 
00516                           float originx,
00517                           float originy,
00518                           float grid_z, 
00519                           int density, 
00520                           float * orbitalgrid) {
00521   unsigned int xindex  = __umul24(blockIdx.x, blockDim.x) * UNROLLX
00522                          + threadIdx.x;
00523   unsigned int yindex  = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00524   unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex
00525                          + xindex;
00526   float grid_x = originx + voxelsize * xindex;
00527   float grid_y = originy + voxelsize * yindex;
00528 
00529   // similar to C version
00530   int at;
00531   int prim, shell;
00532 
00533   // initialize value of orbital at gridpoint
00534   float value = 0.0f;
00535 
00536   // initialize the wavefunction and primitive counters
00537   int ifunc = 0;
00538   int shell_counter = 0;
00539 
00540   // loop over all the QM atoms
00541   for (at = 0; at < numatoms; at++) {
00542     // calculate distance between grid point and center of atom
00543     int atidx = at << 4;
00544     float xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
00545     float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
00546     float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
00547 
00548     int prim_counter = atominfo[atidx + 3].i;
00549     int maxshell     = atominfo[atidx + 4].i;
00550 
00551     float xdist2 = xdist*xdist;
00552     float ydist2 = ydist*ydist;
00553     float zdist2 = zdist*zdist;
00554 
00555     float dist2 = xdist2 + ydist2 + zdist2;
00556 
00557     // loop over the shells belonging to this atom (or basis function)
00558     for (shell=0; shell < maxshell; shell++) {
00559       float contracted_gto = 0.0f;
00560 
00561       int maxprim    = shellinfo[(shell_counter<<4)    ];
00562       int shell_type = shellinfo[(shell_counter<<4) + 1];
00563       for (prim=0; prim < maxprim;  prim++) {
00564         float exponent       = basis_array[prim_counter    ];
00565         float contract_coeff = basis_array[prim_counter + 1];
00566 
00567         // By premultiplying the stored exponent factors etc,
00568         // we can use exp2f() rather than exp(), giving us full
00569         // precision, but with the speed of __expf()
00570         contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00571         prim_counter += 2;
00572       }
00573 
00574       /* multiply with the appropriate wavefunction coefficient */
00575       float tmpshell=0;
00576       switch (shell_type) {
00577         case S_SHELL:
00578           value += wave_f[ifunc++] * contracted_gto;
00579           break;
00580 
00581         case P_SHELL:
00582           tmpshell += wave_f[ifunc++] * xdist;
00583           tmpshell += wave_f[ifunc++] * ydist;
00584           tmpshell += wave_f[ifunc++] * zdist;
00585           value += tmpshell * contracted_gto;
00586           break;
00587 
00588         case D_SHELL:
00589           tmpshell += wave_f[ifunc++] * xdist2;
00590           tmpshell += wave_f[ifunc++] * xdist * ydist;
00591           tmpshell += wave_f[ifunc++] * ydist2;
00592           tmpshell += wave_f[ifunc++] * xdist * zdist;
00593           tmpshell += wave_f[ifunc++] * ydist * zdist;
00594           tmpshell += wave_f[ifunc++] * zdist2;
00595           value += tmpshell * contracted_gto;
00596           break;
00597 
00598         case F_SHELL:
00599           tmpshell += wave_f[ifunc++] * xdist2 * xdist;
00600           tmpshell += wave_f[ifunc++] * xdist2 * ydist;
00601           tmpshell += wave_f[ifunc++] * ydist2 * xdist;
00602           tmpshell += wave_f[ifunc++] * ydist2 * ydist;
00603           tmpshell += wave_f[ifunc++] * xdist2 * zdist;
00604           tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
00605           tmpshell += wave_f[ifunc++] * ydist2 * zdist;
00606           tmpshell += wave_f[ifunc++] * zdist2 * xdist;
00607           tmpshell += wave_f[ifunc++] * zdist2 * ydist;
00608           tmpshell += wave_f[ifunc++] * zdist2 * zdist;
00609           value += tmpshell * contracted_gto;
00610           break;
00611 
00612         case G_SHELL:
00613           tmpshell += wave_f[ifunc++] * xdist2 * xdist2; // xxxx
00614           tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;  // xxxy
00615           tmpshell += wave_f[ifunc++] * xdist2 * ydist2; // xxyy
00616           tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;  // xyyy
00617           tmpshell += wave_f[ifunc++] * ydist2 * ydist2; // yyyy
00618           tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist; // xxxz
00619           tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist; // xxyz
00620           tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist; // xyyz
00621           tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist; // yyyz
00622           tmpshell += wave_f[ifunc++] * xdist2 * zdist2; // xxzz
00623           tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist; // xyzz
00624           tmpshell += wave_f[ifunc++] * ydist2 * zdist2; // yyzz
00625           tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist; // zzzx
00626           tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist; // zzzy
00627           tmpshell += wave_f[ifunc++] * zdist2 * zdist2; // zzzz
00628           value += tmpshell * contracted_gto;
00629           break;
00630       } // end switch
00631 
00632       shell_counter++;
00633     }
00634   }
00635 
00636   // return either orbital density or orbital wavefunction amplitude
00637   if (density) {
00638     orbitalgrid[outaddr] = copysignf(value*value, value);
00639   } else {
00640     orbitalgrid[outaddr] = value;
00641   }
00642 }
00643 
00644 
00645 
00646 
00647 
00648 
00649 static int computepaddedsize(int orig, int tilesize) {
00650   int alignmask = tilesize - 1;
00651   int paddedsz = (orig + alignmask) & ~alignmask;  
00652 //printf("orig: %d  padded: %d  tile: %d\n", orig, paddedsz, tilesize);
00653   return paddedsz;
00654 }
00655 
00656 static void * cudaorbitalthread(void *voidparms) {
00657   dim3 volsize, Gsz, Bsz;
00658   float *d_wave_f = NULL;
00659   float *d_basis_array = NULL;
00660   flint *d_atominfo = NULL;
00661   int *d_shellinfo = NULL;
00662   float *d_origin = NULL;
00663   int *d_numvoxels = NULL;
00664   float *d_orbitalgrid = NULL;
00665   float *h_orbitalgrid = NULL;
00666   float *h_basis_array_exp2f = NULL;
00667   int numvoxels[3];
00668   float origin[3];
00669   orbthrparms *parms = NULL;
00670   int usefastconstkernel=0;
00671   int threadid=0;
00672 #if defined(USE_PINNED_MEMORY)
00673   int h_orbitalgrid_pinnedalloc=0;
00674   int h_orbitalgrid_zerocopy=0;
00675 #endif
00676   int tilesize = 1; // default tile size to use in absence of other info
00677 
00678   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00679   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00680 
00681   // scale tile size by device performance
00682   tilesize=4; // GTX 280, Tesla C1060 starting point tile size
00683   wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
00684 
00685   numvoxels[0] = parms->numvoxels[0];
00686   numvoxels[1] = parms->numvoxels[1];
00687   numvoxels[2] = 1;
00688 
00689   origin[0] = parms->origin[0];
00690   origin[1] = parms->origin[1];
00691 
00692   // setup energy grid size, padding out arrays for peak GPU memory performance
00693   volsize.x = (parms->numvoxels[0] + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00694   volsize.y = (parms->numvoxels[1] + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00695   volsize.z = 1;      // we only do one plane at a time
00696  
00697   // setup CUDA grid and block sizes
00698   Bsz.x = BLOCKSIZEX;
00699   Bsz.y = BLOCKSIZEY;
00700   Bsz.z = 1;
00701   Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00702   Gsz.y = volsize.y / (Bsz.y * UNROLLY);
00703   Gsz.z = 1;
00704   int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00705 
00706   // determine which runtime strategy is workable
00707   // given the data sizes involved
00708   if ((parms->num_wave_f < MAX_WAVEF_SZ) &&
00709       (parms->numatoms < MAX_ATOM_SZ) &&
00710       (parms->numatoms < MAX_ATOMSHELL_SZ) &&
00711       (2*parms->num_basis < MAX_BASIS_SZ) &&
00712       (parms->num_shells < MAX_SHELL_SZ)) {
00713     usefastconstkernel=1;
00714   }
00715 
00716   // allow overrides for testing purposes
00717   if (getenv("VMDFORCEMOTILEDSHARED") != NULL) {
00718     usefastconstkernel=0; 
00719   }
00720 
00721   // allocate and copy input data to GPU arrays
00722   int padsz;
00723   padsz = computepaddedsize(2 * parms->num_basis, MEMCOALESCE);
00724   h_basis_array_exp2f = (float *) malloc(padsz * sizeof(float));
00725   float log2e = log2(2.718281828);
00726   for (int ll=0; ll<(2*parms->num_basis); ll+=2) {
00727 #if 1
00728     // use exp2f() rather than expf()
00729     h_basis_array_exp2f[ll  ] = parms->basis_array[ll  ] * log2e;
00730 #else
00731     h_basis_array_exp2f[ll  ] = parms->basis_array[ll  ];
00732 #endif
00733     h_basis_array_exp2f[ll+1] = parms->basis_array[ll+1];
00734   }
00735 
00736   if (usefastconstkernel) {
00737     cudaMemcpyToSymbol(const_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), 0);
00738     cudaMemcpyToSymbol(const_atompos, parms->atompos, 3 * parms->numatoms * sizeof(float), 0);
00739     cudaMemcpyToSymbol(const_atom_basis, parms->atom_basis, parms->numatoms * sizeof(int), 0);
00740     cudaMemcpyToSymbol(const_num_shells_per_atom, parms->num_shells_per_atom, parms->numatoms * sizeof(int), 0);
00741     cudaMemcpyToSymbol(const_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), 0);
00742     cudaMemcpyToSymbol(const_num_prim_per_shell, parms->num_prim_per_shell, parms->num_shells * sizeof(int), 0);
00743     cudaMemcpyToSymbol(const_shell_types, parms->shell_types, parms->num_shells * sizeof(int), 0);
00744   } else {
00745     padsz = computepaddedsize(parms->num_wave_f, MEMCOALESCE);
00746     cudaMalloc((void**)&d_wave_f, padsz * sizeof(float));
00747     cudaMemcpy(d_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), cudaMemcpyHostToDevice);
00748 
00749     // pack atom data into a tiled array
00750     padsz = computepaddedsize(16 * parms->numatoms, MEMCOALESCE);
00751     flint * h_atominfo = (flint *) calloc(1, padsz * sizeof(flint));
00752     cudaMalloc((void**)&d_atominfo, padsz * sizeof(flint));
00753     for (int ll=0; ll<parms->numatoms; ll++) {
00754       int addr = ll * 16;
00755       h_atominfo[addr    ].f = parms->atompos[ll*3    ];
00756       h_atominfo[addr + 1].f = parms->atompos[ll*3 + 1];
00757       h_atominfo[addr + 2].f = parms->atompos[ll*3 + 2];
00758       h_atominfo[addr + 3].i = parms->atom_basis[ll];
00759       h_atominfo[addr + 4].i = parms->num_shells_per_atom[ll];
00760     }
00761     cudaMemcpy(d_atominfo, h_atominfo, padsz * sizeof(flint), cudaMemcpyHostToDevice);
00762     free(h_atominfo);
00763 
00764     padsz = computepaddedsize(16 * parms->num_shells, MEMCOALESCE);
00765     int * h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
00766     cudaMalloc((void**)&d_shellinfo, padsz * sizeof(int));
00767     for (int ll=0; ll<parms->num_shells; ll++) {
00768       h_shellinfo[ll*16    ] = parms->num_prim_per_shell[ll];
00769       h_shellinfo[ll*16 + 1] = parms->shell_types[ll];
00770     }
00771     cudaMemcpy(d_shellinfo, h_shellinfo, padsz * sizeof(int), cudaMemcpyHostToDevice);
00772     free(h_shellinfo);
00773 
00774     cudaMalloc((void**)&d_basis_array, padsz * sizeof(float));
00775     cudaMemcpy(d_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), cudaMemcpyHostToDevice);
00776 
00777     padsz = computepaddedsize(3, MEMCOALESCE);
00778     cudaMalloc((void**)&d_origin, padsz * sizeof(float));
00779     cudaMemcpy(d_origin, origin, 3 * sizeof(float), cudaMemcpyHostToDevice);
00780 
00781     cudaMalloc((void**)&d_numvoxels, padsz * sizeof(int));
00782     cudaMemcpy(d_numvoxels, numvoxels, 3 * sizeof(int), cudaMemcpyHostToDevice);
00783   }
00784 
00785 
00786 #if defined(USE_ZERO_COPY)
00787   if ((getenv("VMDCUDANOZEROCOPY") == NULL) && 
00788       (cudaHostAlloc((void **) &h_orbitalgrid, volmemsz, cudaHostAllocMapped) == cudaSuccess)) {
00789     h_orbitalgrid_zerocopy=1;
00790     cudaHostGetDevicePointer(&d_orbitalgrid, h_orbitalgrid, 0);
00791     CUERR // check and clear any existing errors
00792   } else {
00793     printf("WARNING: CUDA zero-copy pinned memory allocation failed!\n"); 
00794 #else
00795     // allocate and initialize the GPU output array
00796     cudaMalloc((void**)&d_orbitalgrid, volmemsz);
00797     CUERR // check and clear any existing errors
00798 #endif
00799 #if defined(USE_PINNED_MEMORY)
00800     // try to allocate working buffer in pinned host memory
00801     if ((getenv("VMDCUDANOPINNEDMEMORY") == NULL) && 
00802         (cudaMallocHost((void **) &h_orbitalgrid, volmemsz) == cudaSuccess)) {
00803       h_orbitalgrid_pinnedalloc=1;
00804     } else {
00805       printf("WARNING: CUDA pinned memory allocation failed!\n"); 
00806       h_orbitalgrid_pinnedalloc=0;
00807       h_orbitalgrid = (float *) malloc(volmemsz);
00808     } 
00809 #else
00810     // allocate working buffer
00811     h_orbitalgrid = (float *) malloc(volmemsz);
00812 #endif
00813 #if defined(USE_ZERO_COPY)
00814   }
00815 #endif
00816 
00817 #if 0
00818   if (threadid == 0) {
00819     // inform on which kernel we're actually going to run
00820     printf("atoms[%d] ", parms->numatoms);
00821     printf("wavef[%d] ", parms->num_wave_f);
00822     printf("basis[%d] ", parms->num_basis);
00823     printf("shell[%d] ", parms->num_shells);
00824     if (usefastconstkernel) {
00825 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
00826       printf("GPU constant memory (JIT)");
00827 #else
00828       printf("GPU constant memory");
00829 #endif
00830     } else {
00831       printf("GPU tiled shared memory:");
00832     }
00833     printf(" Gsz:%dx%d\n", Gsz.x, Gsz.y);
00834   }
00835 #endif
00836 
00837   // loop over orbital planes
00838   wkf_tasktile_t tile;
00839   int planesize = numvoxels[0] * numvoxels[1];
00840 
00841   while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00842     int k;
00843     for (k=tile.start; k<tile.end; k++) {
00844       origin[2] = parms->origin[2] + parms->voxelsize * k;
00845 
00846 
00847       // RUN the kernel...
00848       if (usefastconstkernel) {
00849 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
00850         cuorbitalconstmem_jit<<<Gsz, Bsz, 0>>>(parms->numatoms, 
00851                               parms->voxelsize,
00852                               origin[0],
00853                               origin[1],
00854                               origin[2],
00855                               parms->density,
00856                               d_orbitalgrid);
00857 #else
00858         cuorbitalconstmem<<<Gsz, Bsz, 0>>>(parms->numatoms, 
00859                           parms->voxelsize,
00860                           origin[0],
00861                           origin[1],
00862                           origin[2],
00863                           parms->density,
00864                           d_orbitalgrid);
00865 #endif
00866       } else {
00867 #if defined(USE_FERMI_CACHE)
00868         // This is a Fermi-specific kernel and should only be used
00869         // on devices with compute capability >= 2.0
00870         cuorbitalcachedglobmem<<<Gsz, Bsz, 0>>>(parms->numatoms, 
00871                              d_wave_f,
00872                              d_basis_array,
00873                              d_atominfo,
00874                              d_shellinfo,
00875                              parms->voxelsize,
00876                              origin[0],
00877                              origin[1],
00878                              origin[2],
00879                              parms->density,
00880                              d_orbitalgrid);
00881 #else
00882         cuorbitaltiledshared<<<Gsz, Bsz, 0>>>(parms->numatoms, 
00883                              d_wave_f,
00884                              d_basis_array,
00885                              d_atominfo,
00886                              d_shellinfo,
00887                              parms->voxelsize,
00888                              origin[0],
00889                              origin[1],
00890                              origin[2],
00891                              parms->density,
00892                              d_orbitalgrid);
00893 #endif
00894       }
00895 #if defined(USE_ZERO_COPY)
00896       if (h_orbitalgrid_zerocopy) {
00897         cudaThreadSynchronize();
00898       } else {
00899 #endif
00900         CUERR // check and clear any existing errors
00901 
00902         // Copy the GPU output data back to the host and use/store it..
00903         cudaMemcpy(h_orbitalgrid, d_orbitalgrid, volmemsz, cudaMemcpyDeviceToHost);
00904         CUERR // check and clear any existing errors
00905 #if defined(USE_ZERO_COPY)
00906       }
00907 #endif
00908 
00909       // Copy GPU blocksize padded array back down to the original size
00910       int y;
00911       for (y=0; y<numvoxels[1]; y++) {
00912         long orbaddr = k*planesize + y*numvoxels[0];
00913         memcpy(&parms->orbitalgrid[orbaddr], &h_orbitalgrid[y*volsize.x], numvoxels[0] * sizeof(float));
00914       }
00915     }
00916     cudaThreadSynchronize();
00917   }
00918 
00919   free(h_basis_array_exp2f);
00920 
00921 #if defined(USE_ZERO_COPY)
00922   if (h_orbitalgrid_zerocopy) {
00923     cudaFreeHost(h_orbitalgrid);
00924   } else {
00925 #endif
00926 #if defined(USE_PINNED_MEMORY)
00927     if (h_orbitalgrid_pinnedalloc)
00928       cudaFreeHost(h_orbitalgrid);
00929     else
00930       free(h_orbitalgrid);
00931 #else
00932     free(h_orbitalgrid);
00933 #endif
00934     cudaFree(d_orbitalgrid);
00935 #if defined(USE_ZERO_COPY)
00936   }
00937 #endif
00938 
00939   if (!usefastconstkernel) {
00940     cudaFree(d_wave_f);
00941     cudaFree(d_basis_array);
00942     cudaFree(d_atominfo);
00943     cudaFree(d_shellinfo);
00944     cudaFree(d_numvoxels);
00945     cudaFree(d_origin);
00946   }
00947 
00948   CUERR // check and clear any existing errors
00949 
00950   return NULL;
00951 }
00952 
00953 
00954 int vmd_cuda_evaluate_orbital_grid(wkf_threadpool_t *devpool,
00955                        int numatoms,
00956                        const float *wave_f, int num_wave_f,
00957                        const float *basis_array, int num_basis,
00958                        const float *atompos,
00959                        const int *atom_basis,
00960                        const int *num_shells_per_atom,
00961                        const int *num_prim_per_shell,
00962                        const int *shell_types,
00963                        int num_shells,
00964                        const int *numvoxels,
00965                        float voxelsize,
00966                        const float *origin,
00967                        int density,
00968                        float *orbitalgrid) {
00969   int rc=0;
00970   orbthrparms parms;
00971 
00972   if (devpool == NULL) {
00973     return -1; // abort if no device pool exists
00974   }
00975 
00976 #if 0
00977   /* XXX hackish method of supporting standlone runs outside of VMD... */
00978   /* init device context and settings */
00979   if (getenv("VMDCUDADEV")) {
00980     rc=cudaSetDevice(atoi(getenv("VMDCUDADEV")));
00981   } else {
00982     rc=cudaSetDevice(0);
00983   }
00984 
00985   if (rc != cudaSuccess) {
00986 #if CUDART_VERSION >= 2010
00987     rc = cudaGetLastError(); // query last error and reset error state
00988     if (rc != cudaErrorSetOnActiveProcess)
00989       return NULL; // abort and return an error
00990 #else
00991     cudaGetLastError(); // just ignore and reset error state, since older CUDA
00992                         // revs don't have a cudaErrorSetOnActiveProcess enum
00993 #endif
00994   }
00995 
00996   /* set blocking/yielding API behavior and enable mapped host memory */
00997   cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost);
00998 
00999 #if CUDART_VERSION >= 3000
01000   /* we prefer to have a large 48kB L1 cache for the Fermi cache kernel */
01001   cudaFuncSetCacheConfig(cuorbitalcachedglobmem, cudaFuncCachePreferL1);
01002 
01003   /* we prefer to have a large 48kB shared mem for the tiled kernel */
01004   cudaFuncSetCacheConfig(cuorbitaltiledshared, cudaFuncCachePreferShared);
01005 #endif
01006 #endif
01007 
01008 #if defined(VMDMOJITSRC)
01009   if (getenv("VMDMOJIT") != NULL) {
01010     // generate CUDA kernel
01011     orbital_jit_generate(ORBITAL_JIT_CUDA, getenv("VMDMOJITSRCFILE"), 
01012                          numatoms, wave_f, basis_array, atom_basis, 
01013                          num_shells_per_atom, num_prim_per_shell, shell_types);
01014 
01015     // generate OpenCL kernel
01016     orbital_jit_generate(ORBITAL_JIT_OPENCL, "/tmp/mojit.cl", 
01017                          numatoms, wave_f, basis_array, atom_basis, 
01018                          num_shells_per_atom, num_prim_per_shell, shell_types);
01019     return 0;
01020   }
01021 #endif
01022 
01023   parms.numatoms = numatoms;
01024   parms.wave_f = wave_f;
01025   parms.num_wave_f = num_wave_f;
01026   parms.basis_array = basis_array;
01027   parms.num_basis = num_basis;
01028   parms.atompos = atompos;
01029   parms.atom_basis = atom_basis;
01030   parms.num_shells_per_atom = num_shells_per_atom;
01031   parms.num_prim_per_shell = num_prim_per_shell;
01032   parms.shell_types = shell_types;
01033   parms.num_shells = num_shells;
01034   parms.numvoxels = numvoxels;
01035   parms.voxelsize = voxelsize;
01036   parms.origin = origin;
01037   parms.density = density;
01038   parms.orbitalgrid = orbitalgrid;
01039 
01040   wkf_tasktile_t tile;
01041   tile.start=0;
01042   tile.end=numvoxels[2];
01043   wkf_threadpool_sched_dynamic(devpool, &tile);
01044   wkf_threadpool_launch(devpool, cudaorbitalthread, &parms, 1); 
01045 
01046   return rc;
01047 }
01048 
01049 

Generated on Wed May 16 01:49:09 2012 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002