CUDAVolMapCreateILS.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAVolMapCreateILS.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.53 $      $Date: 2020/02/26 19:26:47 $
00014  *
00015  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025 
00026 #include "WKFThreads.h"
00027 #include "WKFUtils.h"
00028 #include "CUDAKernels.h"
00029 #include "utilities.h"
00030 
00031 //#define DEBUG 1
00032 
00033 #if defined(VMDTHREADS)
00034 #define USE_CUDADEVPOOL 1
00035 #endif
00036 
00037 #if 1
00038 #define CUERR \
00039   do { \
00040     cudaError_t err; \
00041     if ((err = cudaGetLastError()) != cudaSuccess) { \
00042       printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00043       printf("Thread aborting...\n"); \
00044       return NULL; \
00045     } \
00046   } while (0)
00047 // to be terminated with semi-colon
00048 #else
00049 #define CUERR
00050 #endif
00051 
00052 //#define DEBUG
00053 
00054 typedef union flint_t {
00055   float f;
00056   int i;
00057 } flint;
00058 
00059 
00060 //
00061 // algorithm optimizations
00062 //
00063 
00064 // Computing distance-based exclusions either along with or instead of
00065 // energy-based exclusions turns out to decrease performance.
00066 //#define USE_DISTANCE_EXCL  // use the distance-based exclusion kernel
00067 #define USE_ENERGY_EXCL    // use the energy-based exclusion kernel
00068 
00069 #ifdef USE_DISTANCE_EXCL
00070 #define TEST_BLOCK_EXCL    // early exit if entire thread block is excluded
00071 // don't enable unless we have distance-based exclusions
00072 #endif
00073 
00074 #if defined(USE_ENERGY_EXCL) || defined(USE_DISTANCE_EXCL)
00075 #define TEST_MBLOCK_EXCL     // early exit from multiatom kernel
00076 #define TEST_SHIFT_EXCL
00077 #endif
00078 
00079 #ifdef TEST_SHIFT_EXCL
00080 #undef TEST_MBLOCK_EXCL
00081 #endif
00082 
00083 
00084 #define MAX_THREADBLOCKS  65536
00085 
00086 #define BIN_DEPTH     8  // number of slots per bin
00087 #define BIN_SLOTSIZE  4  // slot permits x, y, z, vdwtype  (4 elements)
00088 #define BIN_SIZE      (BIN_DEPTH * BIN_SLOTSIZE)  // size given in "flints"
00089 #define BIN_SHIFT     5  // log2(BIN_SIZE)
00090 #define BIN_SLOTSHIFT 2  // log2(BIN_SLOTSIZE)
00091 
00092 typedef struct occthrparms_t {
00093   int errcode;                   // 0 for success, -1 for failure
00094 
00095   int mx, my, mz;                // map dimensions
00096   float *map;                    // buffer space for occupancy map
00097                                  // (length mx*my*mz floats)
00098 
00099   float max_energy;              // max energy threshold
00100   float cutoff;                  // vdw cutoff distance
00101   float min_excldist;            // min exclusion distance
00102   float hx, hy, hz;              // map lattice spacing
00103   float x0, y0, z0;              // map origin
00104   float bx_1, by_1, bz_1;        // inverse of atom bin lengths
00105 
00106   int nbx, nby, nbz;             // bin dimensions
00107   const flint *bin;              // atom bins
00108                                  // (length BIN_SIZE*nbx*nby*nbz)
00109   const flint *bin_zero;         // bin pointer shifted to origin
00110 
00111   int num_binoffsets;            // number of offsets
00112   const char *binoffsets;        // bin neighborhood index offsets
00113                                  // (length 3*num_bin_offsets)
00114 
00115   int num_extras;                // number of extra atoms
00116   const flint *extra;            // extra atoms from overfilled bins
00117                                  // (length BIN_SLOTSIZE*num_extras)
00118 
00119   int num_vdwparms;              // number of vdw parameter types
00120   const float *vdwparms;         // vdw parameters
00121                                  // (length 2*num_vdw_params)
00122 
00123   int num_probes;                // number of probe atoms
00124   const float *probevdwparms;    // vdw parameters of probe atoms
00125                                  // (length 2*num_probes)
00126 
00127   int num_conformers;            // number of conformers
00128   const float *conformers;       // probe atom offsets for conformers
00129                                  // (length 3*num_probes*num_conformers)
00130 } occthrparms;
00131 
00132 
00133 // constant memory, page 1 (8K)
00134 //
00135 __constant__ static float const_max_energy;     // energy threshold
00136 __constant__ static float const_min_occupancy;  // occupancy threshold
00137 __constant__ static float const_cutoff;         // cutoff distance
00138 __constant__ static float const_min_excldist;   // min exclusion distance
00139 __constant__ static float const_hx, const_hy, const_hz;  // map spacings
00140 __constant__ static float const_x0, const_y0, const_z0;  // map origin
00141 __constant__ static float const_bx_1, const_by_1, const_bz_1;
00142                                                 // inverse of bin lengths
00143 __constant__ static float const_inv_numconf;    // inverse number conformers
00144 
00145 __constant__ static int const_num_binoffsets;   // need use lengths of arrays
00146 __constant__ static int const_num_extras;
00147 __constant__ static int const_num_probes;
00148 __constant__ static int const_num_conformers;
00149 
00150 #define MAX_VDWPARMS  160
00151 __constant__ static float const_vdwparms[2 * MAX_VDWPARMS];
00152 
00153 #define MAX_PROBES  8
00154 __constant__ static float const_probevdwparms[2 * MAX_PROBES];
00155 
00156 #define MAX_EXTRAS  50
00157 __constant__ static flint const_extra[BIN_SLOTSIZE * MAX_EXTRAS];
00158 
00159 // each offset is a single flat index offset into atom bin array
00160 #define MAX_BINOFFSETS  1467
00161 __constant__ static int const_binoffsets[MAX_BINOFFSETS];
00162 
00163 // nearest neighbor list for distance-based exclusions
00164 #define MAX_EXCLOFFSETS  27
00165 __constant__ static int const_excloffsets[MAX_EXCLOFFSETS];
00166 
00167 // constant memory, pages 2 - 7
00168 //
00169 // the max number of conformers is chosen to leave 1K available
00170 // to the runtime library
00171 //
00172 #define MAX_CONFORMERS 586
00173 __constant__ static float const_conformers[3*MAX_PROBES*MAX_CONFORMERS];
00174 
00175 
00176 #define DO_ONE_PLANE         // optimze when z-slabs are slices of thickness 1
00177 #undef DO_ONE_PLANE
00178 
00179 #define THBLEN   4           // thread block length of cube
00180 #define NTHREADSPERBLOCK  (THBLEN*THBLEN*THBLEN)
00181 
00182 #define BSHIFT   2           // shift for map points to blocks = log2(THBLEN)
00183 
00184 #define NTPBSHIFT  6         // do bit shifting instead of int multiplies
00185                              //  = log2(NTHREADPERBLOCK)
00186 
00187 // Performance is improved for exclusion optimizations
00188 // if we reduce shared memory use to < 4K
00189 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00190 #define NCONFPERLOOP       5
00191 #else
00192 #define NCONFPERLOOP       8
00193 #endif
00194 
00195 #define NTPB_TIMES_NCPL  (NTHREADSPERBLOCK * NCONFPERLOOP)
00196 
00197 #define PADMASK  (THBLEN-1)  // bit mask for padding up to multiple of THBLEN
00198 
00199 #define MAX_ALLOWABLE_ENERGY  87.f  // otherwise expf(-u) gives 0
00200 
00201 
00203 //
00204 //     ROUTINES FOR EXCLUSIONS
00205 //
00207 
00208 #ifdef USE_DISTANCE_EXCL
00209 __global__ static void cuda_find_distance_exclusions(
00210     int *excl,              // exclusions, stored in contiguous thread blocks
00211     const flint *bin_zero,  // points to bin containing map origin
00212     int nbx, int nby,       // number of bins along x and y dimensions
00213 #ifndef DO_ONE_PLANE
00214     int mzblocks,           // number of thread blocks in z dimension
00215 #endif
00216     int mzblockoffset       // offset (number of planes) to starting z
00217     ) {
00218 
00219   // space for 1 atom bin in shared memory
00220   __shared__ flint atombin[BIN_SIZE];
00221 
00222 #ifndef DO_ONE_PLANE
00223   const int nblockx = gridDim.x;
00224   const int nblocky = gridDim.y / mzblocks;
00225 
00226   // 3D index for this thread block
00227   const int bidx = blockIdx.x;
00228   const int bidy = blockIdx.y % nblocky;
00229   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00230 
00231   // block ID, flat index
00232   const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00233 
00234   // thread ID, flat index
00235   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00236 
00237   // position of lower left corner of this thread block
00238   float px = (bidx<<BSHIFT) * const_hx;
00239   float py = (bidy<<BSHIFT) * const_hy;
00240   float pz = (bidz<<BSHIFT) * const_hz;
00241 #else
00242   // can optimize from above for case when mzblocks = 1
00243 
00244   // block ID, flat index
00245   const int bid
00246     = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00247 
00248   // thread ID, flat index
00249   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00250 
00251   // position of lower left corner of this thread block
00252   float px = (blockIdx.x<<BSHIFT) * const_hx;
00253   float py = (blockIdx.y<<BSHIFT) * const_hy;
00254   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00255 #endif
00256 
00257   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00258   int isexcluded = 0;  // assume it isn't excluded
00259 
00260   const float mindis2 = const_min_excldist * const_min_excldist;
00261 
00262   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00263   const int jb = (int) floorf(py * const_by_1);  // thread block
00264   const int kb = (int) floorf(pz * const_bz_1);
00265 
00266   const int binid = (kb * nby + jb)*nbx + ib;    // flat index of my bin
00267 
00268   int n, nb;
00269 
00270   px += threadIdx.x * const_hx;  // adjust position for this thread
00271   py += threadIdx.y * const_hy;
00272   pz += threadIdx.z * const_hz;
00273 
00274   px += const_x0;
00275   py += const_y0;
00276   pz += const_z0;  // translate to absolute position
00277 
00278   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00279 
00280   for (nb = 0;  nb < MAX_EXCLOFFSETS;  nb++) {
00281 
00282     if (tid < BIN_SIZE) {  // cache next atom bin
00283       const flint *p_bin = bin_zero
00284         + ((binid + const_excloffsets[nb])<<BIN_SHIFT);
00285       atombin[tid] = p_bin[tid];  // copy from global to shared memory
00286     }
00287     __syncthreads();  // now we have atom bin loaded
00288 
00289     for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00290       
00291       if (-1 == atombin[n+3].i) break;  // no more atoms in bin
00292       const float dx = px - atombin[n  ].f;
00293       const float dy = py - atombin[n+1].f;
00294       const float dz = pz - atombin[n+2].f;
00295       const float r2 = dx*dx + dy*dy + dz*dz;
00296 
00297       if (r2 <= mindis2) isexcluded = 1;
00298 
00299     } // end loop over atoms in bin
00300 
00301     __syncthreads();  // wait for thread block before loading next bin
00302   } // end loop over bin neighborhood
00303 
00304   for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00305     const float dx = px - const_extra[n  ].f;
00306     const float dy = py - const_extra[n+1].f;
00307     const float dz = pz - const_extra[n+2].f;
00308     const float r2 = dx*dx + dy*dy + dz*dz;
00309 
00310     if (r2 <= mindis2) isexcluded = 1;
00311 
00312   } // end loop over extra atoms
00313 
00314   excl[mindex] = isexcluded;
00315 }
00316 #endif // USE_DISTANCE_EXCL
00317 
00318 
00319 #ifdef USE_ENERGY_EXCL
00320 __global__ static void cuda_find_energy_exclusions(
00321     int *excl,              // exclusions, stored in contiguous thread blocks
00322     const flint *bin_zero,  // points to bin containing map origin
00323     int nbx, int nby,       // number of bins along x and y dimensions
00324 #ifndef DO_ONE_PLANE
00325     int mzblocks,           // number of thread blocks in z dimension
00326 #endif
00327     int mzblockoffset       // offset (number of planes) to starting z
00328     ) {
00329 
00330   // space for 1 atom bin in shared memory
00331   __shared__ flint atombin[BIN_SIZE];
00332 
00333 #ifdef TEST_BLOCK_EXCL
00334 #if 0
00335   __shared__ int sh_isall_upper_warp;
00336 #endif
00337   __shared__ int sh_sum[NTHREADSPERBLOCK];
00338 #endif
00339 
00340 #ifndef DO_ONE_PLANE
00341   const int nblockx = gridDim.x;
00342   const int nblocky = gridDim.y / mzblocks;
00343 
00344   // 3D index for this thread block
00345   const int bidx = blockIdx.x;
00346   const int bidy = blockIdx.y % nblocky;
00347   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00348 
00349   // block ID, flat index
00350   const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00351 
00352   // thread ID, flat index
00353   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00354 
00355   // position of lower left corner of this thread block
00356   float px = (bidx<<BSHIFT) * const_hx;
00357   float py = (bidy<<BSHIFT) * const_hy;
00358   float pz = (bidz<<BSHIFT) * const_hz;
00359 #else
00360   // can optimize from above for case when mzblocks = 1
00361 
00362   // block ID, flat index
00363   const int bid
00364     = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00365 
00366   // thread ID, flat index
00367   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00368 
00369   // position of lower left corner of this thread block
00370   float px = (blockIdx.x<<BSHIFT) * const_hx;
00371   float py = (blockIdx.y<<BSHIFT) * const_hy;
00372   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00373 #endif
00374 
00375   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00376 #ifdef USE_DISTANCE_EXCL
00377   int isexcluded = excl[mindex];  // read exclusion from global memory
00378 #else
00379   int isexcluded = 0;  // assume it isn't excluded
00380 #endif
00381 
00382 #ifdef TEST_BLOCK_EXCL
00383 #if 0
00384   // warp vote is available only in devices 1.2 and later
00385   // warp vote to see if we continue
00386   int isall = __all(isexcluded);
00387   if (32==tid) sh_isall_upper_warp = isall;
00388   if (0==isany+sh_isall_upper_warp) return; // all points excluded, exit
00389 #endif
00390   sh_sum[tid] = !(isexcluded);  // count inclusions
00391   int nbrsum = sh_sum[tid^1];
00392   sh_sum[tid] += nbrsum;
00393   nbrsum = sh_sum[tid^2];
00394   sh_sum[tid] += nbrsum;
00395   nbrsum = sh_sum[tid^4];
00396   sh_sum[tid] += nbrsum;
00397   nbrsum = sh_sum[tid^8];
00398   sh_sum[tid] += nbrsum;
00399   nbrsum = sh_sum[tid^16];
00400   sh_sum[tid] += nbrsum;
00401   __syncthreads();
00402   nbrsum = sh_sum[tid^32];
00403   __syncthreads();
00404   sh_sum[tid] += nbrsum;
00405 
00406   if (0==sh_sum[tid]) return;  // all points excluded, exit
00407 #endif
00408 
00409   const float cutoff2 = const_cutoff * const_cutoff;  // cutoff2 in register
00410   const float probe_vdweps = const_probevdwparms[0];
00411   const float probe_vdwrmin = const_probevdwparms[1];
00412 
00413   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00414   const int jb = (int) floorf(py * const_by_1);  // thread block
00415   const int kb = (int) floorf(pz * const_bz_1);
00416 
00417   const int binid = (kb * nby + jb)*nbx + ib;    // flat index of my bin
00418 
00419   int n, nb;
00420 
00421   px += threadIdx.x * const_hx;  // adjust position for this thread
00422   py += threadIdx.y * const_hy;
00423   pz += threadIdx.z * const_hz;
00424 
00425   px += const_x0;
00426   py += const_y0;
00427   pz += const_z0;  // translate to absolute position
00428 
00429   float u = 0.f;  // sum potential energy
00430 
00431   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00432 
00433   for (nb = 0;  nb < const_num_binoffsets;  nb++) {
00434 
00435     if (tid < BIN_SIZE) {  // cache next atom bin
00436       const flint *p_bin = bin_zero
00437         + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00438       atombin[tid] = p_bin[tid];  // copy from global to shared memory
00439     }
00440     __syncthreads();  // now we have atom bin loaded
00441 
00442     for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00443       int vdwtype = atombin[n+3].i;
00444       if (-1 == vdwtype) break;  // no more atoms in bin
00445       vdwtype <<= 1;  // multiply by 2 for indexing VDW parms
00446 
00447       const float dx = px - atombin[n  ].f;
00448       const float dy = py - atombin[n+1].f;
00449       const float dz = pz - atombin[n+2].f;
00450       const float r2 = dx*dx + dy*dy + dz*dz;
00451       if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00452         const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00453         const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00454         float rm6 = rmin*rmin / r2;
00455         rm6 = rm6 * rm6 * rm6;
00456         u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00457       }
00458     } // end loop over atoms in bin
00459 
00460     __syncthreads();  // wait for thread block before loading next bin
00461   } // end loop over bin neighborhood
00462 
00463   for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00464     const int vdwtype = const_extra[n+3].i << 1; // mult by 2 index VDW parms
00465     const float dx = px - const_extra[n  ].f;
00466     const float dy = py - const_extra[n+1].f;
00467     const float dz = pz - const_extra[n+2].f;
00468     const float r2 = dx*dx + dy*dy + dz*dz;
00469     if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00470       const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00471       const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00472       float rm6 = rmin*rmin / r2;
00473       rm6 = rm6 * rm6 * rm6;
00474       u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00475     }
00476   } // end loop over extra atoms
00477 
00478   if (u >= const_max_energy) isexcluded = 1;  // exclude this map point
00479 
00480   excl[mindex] = isexcluded;
00481 }
00482 #endif // USE_ENERGY_EXCL
00483 
00484 
00485 // For a monoatomic probe compute the occupancy rho
00486 // (probability of finding the probe)
00487 //
00488 // For each map point the occupancy is computed as
00489 //
00490 //   rho = exp(-U)
00491 //
00492 // where U is the interaction energy of the probe with the system
00493 // due to the VDW force field.
00494 //
00495 __global__ static void cuda_occupancy_monoatom(
00496     float *map,             // map buffer, stored in contiguous thread blocks
00497     const int *excl,        // exclusion buffer, stored same
00498     const flint *bin_zero,  // points to bin containing map origin
00499     int nbx, int nby,       // number of bins along x and y dimensions
00500 #ifndef DO_ONE_PLANE
00501     int mzblocks,           // number of thread blocks in z dimension
00502 #endif
00503     int mzblockoffset       // offset (number of planes) to starting z
00504     ) {
00505 
00506   // space for 1 atom bin in shared memory
00507   __shared__ flint atombin[BIN_SIZE];
00508 
00509 #ifdef TEST_BLOCK_EXCL
00510 #if 0
00511   __shared__ int sh_isall_upper_warp;
00512 #endif
00513   __shared__ int sh_sum[NTHREADSPERBLOCK];
00514 #endif
00515 
00516 #ifndef DO_ONE_PLANE
00517   const int nblockx = gridDim.x;
00518   const int nblocky = gridDim.y / mzblocks;
00519 
00520   // 3D index for this thread block
00521   const int bidx = blockIdx.x;
00522   const int bidy = blockIdx.y % nblocky;
00523   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00524 
00525   // block ID, flat index
00526   const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00527 
00528   // thread ID, flat index
00529   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00530 
00531   // position of lower left corner of this thread block
00532   float px = (bidx<<BSHIFT) * const_hx;
00533   float py = (bidy<<BSHIFT) * const_hy;
00534   float pz = (bidz<<BSHIFT) * const_hz;
00535 #else
00536   // can optimize from above for case when mzblocks = 1
00537 
00538   // block ID, flat index
00539   const int bid
00540     = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00541 
00542   // thread ID, flat index
00543   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00544 
00545   // position of lower left corner of this thread block
00546   float px = (blockIdx.x<<BSHIFT) * const_hx;
00547   float py = (blockIdx.y<<BSHIFT) * const_hy;
00548   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00549 #endif
00550 
00551   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00552   const int isexcluded = excl[mindex];  // read exclusion from global memory
00553 
00554 #ifdef TEST_BLOCK_EXCL
00555 #if 0
00556   // warp vote is available only in devices 1.2 and later
00557   // warp vote to see if we continue
00558   int isall = __all(isexcluded);
00559   if (32==tid) sh_isall_upper_warp = isall;
00560   if (0==isany+sh_isall_upper_warp) return; // all points excluded, exit
00561 #endif
00562   sh_sum[tid] = !(isexcluded);  // count inclusions
00563   int nbrsum = sh_sum[tid^1];
00564   sh_sum[tid] += nbrsum;
00565   nbrsum = sh_sum[tid^2];
00566   sh_sum[tid] += nbrsum;
00567   nbrsum = sh_sum[tid^4];
00568   sh_sum[tid] += nbrsum;
00569   nbrsum = sh_sum[tid^8];
00570   sh_sum[tid] += nbrsum;
00571   nbrsum = sh_sum[tid^16];
00572   sh_sum[tid] += nbrsum;
00573   __syncthreads();
00574   nbrsum = sh_sum[tid^32];
00575   __syncthreads();
00576   sh_sum[tid] += nbrsum;
00577 
00578   if (0==sh_sum[tid]) return;  // all points excluded, exit
00579 #endif
00580 
00581   const float cutoff2 = const_cutoff * const_cutoff;  // cutoff2 in register
00582   const float probe_vdweps = const_probevdwparms[0];
00583   const float probe_vdwrmin = const_probevdwparms[1];
00584 
00585   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00586   const int jb = (int) floorf(py * const_by_1);  // thread block
00587   const int kb = (int) floorf(pz * const_bz_1);
00588 
00589   const int binid = (kb * nby + jb)*nbx + ib;    // flat index of my bin
00590 
00591   int n, nb;
00592 
00593   px += threadIdx.x * const_hx;  // adjust position for this thread
00594   py += threadIdx.y * const_hy;
00595   pz += threadIdx.z * const_hz;
00596 
00597   px += const_x0;
00598   py += const_y0;
00599   pz += const_z0;  // translate to absolute position
00600 
00601   float u = 0.f;  // sum potential energy
00602 
00603   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00604 
00605   for (nb = 0;  nb < const_num_binoffsets;  nb++) {  // bin neighborhood
00606 
00607     if (tid < BIN_SIZE) {  // cache next atom bin
00608       const flint *p_bin = bin_zero
00609         + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00610       atombin[tid] = p_bin[tid];  // copy from global to shared memory
00611     }
00612     __syncthreads();  // now we have atom bin loaded
00613 
00614     for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00615       int vdwtype = atombin[n+3].i;
00616       if (-1 == vdwtype) break;  // no more atoms in bin
00617       vdwtype <<= 1;  // multiply by 2 for indexing VDW parms
00618 
00619       const float dx = px - atombin[n  ].f;
00620       const float dy = py - atombin[n+1].f;
00621       const float dz = pz - atombin[n+2].f;
00622       const float r2 = dx*dx + dy*dy + dz*dz;
00623       if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00624         const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00625         const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00626         float rm6 = rmin*rmin / r2;
00627         rm6 = rm6 * rm6 * rm6;
00628         u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00629       }
00630     } // end loop atoms in bin
00631 
00632     __syncthreads();  // wait for entire thread block before loading next bin
00633   } // end loop over bin neighborhood
00634 
00635   for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00636     const int vdwtype = const_extra[n+3].i << 1; // mult by 2 index VDW parms
00637     const float dx = px - const_extra[n  ].f;
00638     const float dy = py - const_extra[n+1].f;
00639     const float dz = pz - const_extra[n+2].f;
00640     const float r2 = dx*dx + dy*dy + dz*dz;
00641     if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00642       const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00643       const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00644       float rm6 = rmin*rmin / r2;
00645       rm6 = rm6 * rm6 * rm6;
00646       u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00647     }
00648   } // end loop over extra atoms
00649 
00650   float occ = 0.f;
00651   if (!isexcluded && u < const_max_energy) {
00652     occ = expf(-u);  // the occupancy
00653   }
00654   map[mindex] = occ;
00655 } // cuda_occupancy_monoatom()
00656 
00657 
00658 // For a multiatom probe compute the occupancy rho
00659 // (probability of finding the probe)
00660 //
00661 // For each map point the occupancy is computed as
00662 //
00663 //   rho = (1/m) sum_i exp(-U[i]), i = 1..m
00664 //
00665 // where U[i] is the potential energy of the i-th conformer.
00666 //
00667 __global__ static void cuda_occupancy_multiatom(
00668     float *map,             // map buffer, stored in contiguous thread blocks
00669     const int *excl,        // exclusion buffer, stored same
00670     const flint *bin_zero,  // points to bin containing map origin
00671     int nbx, int nby,       // number of bins along x and y dimensions
00672 #ifndef DO_ONE_PLANE
00673     int mzblocks,           // number of thread blocks in z dimension
00674 #endif
00675     int mzblockoffset       // offset (number of planes) to starting z
00676     ) {
00677 
00678   // space for 1 atom bin in shared memory
00679   __shared__ flint atombin[BIN_SIZE];
00680 
00681   // accumulate into shared memory the energy for NCONFPERLOOP conformers
00682   __shared__ float usum[NTPB_TIMES_NCPL];
00683 
00684   // cache const mem pages 2+ into shared memory for faster access
00685   // since we have to keep accessing const mem page 1 (8K)
00686   __shared__ float conf[3*MAX_PROBES * NCONFPERLOOP];
00687 
00688 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00689 #if 0
00690   __shared__ int sh_isany_upper_warp;
00691 #endif
00692   // we reuse this buffer:
00693   // (1) reduce total sum across thread block for prefix sum
00694   // (2) to accumulate occupancy
00695   __shared__ flint sh_buffer[NTHREADSPERBLOCK];
00696 #endif
00697 
00698 #ifndef DO_ONE_PLANE
00699   const int nblockx = gridDim.x;
00700   const int nblocky = gridDim.y / mzblocks;
00701 
00702   // 3D index for this thread block
00703   const int bidx = blockIdx.x;
00704   const int bidy = blockIdx.y % nblocky;
00705   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00706 
00707   // block ID, flat index
00708   const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00709 
00710   // thread ID, flat index
00711   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00712 
00713   // position of lower left corner of this thread block
00714   float px = (bidx<<BSHIFT) * const_hx;
00715   float py = (bidy<<BSHIFT) * const_hy;
00716   float pz = (bidz<<BSHIFT) * const_hz;
00717 #else
00718   // can optimize from above for case when mzblocks = 1
00719 
00720   // block ID, flat index
00721   const int bid
00722     = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00723 
00724   // thread ID, flat index
00725   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00726 
00727   // position of lower left corner of this thread block
00728   float px = (blockIdx.x<<BSHIFT) * const_hx;
00729   float py = (blockIdx.y<<BSHIFT) * const_hy;
00730   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00731 #endif
00732 
00733   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00734   const int isincluded = !(excl[mindex]);  // read exclusion from global memory
00735 
00736 #ifdef TEST_MBLOCK_EXCL
00737 #if 0
00738   // warp vote is available only in devices 1.2 and later
00739   // warp vote to see if we continue
00740   int isany = __any(isincluded);
00741   if (32==tid) sh_isany_upper_warp = isany;
00742   if (0==isany+sh_isany_upper_warp) return; // all points excluded, exit
00743 #endif
00744 
00745   sh_buffer[tid].i = isincluded;
00746   int nbrsum = sh_buffer[tid^1].i;
00747   sh_buffer[tid].i += nbrsum;
00748   nbrsum = sh_buffer[tid^2].i;
00749   sh_buffer[tid].i += nbrsum;
00750   nbrsum = sh_buffer[tid^4].i;
00751   sh_buffer[tid].i += nbrsum;
00752   nbrsum = sh_buffer[tid^8].i;
00753   sh_buffer[tid].i += nbrsum;
00754   nbrsum = sh_buffer[tid^16].i;
00755   sh_buffer[tid].i += nbrsum;
00756   __syncthreads();
00757   nbrsum = sh_buffer[tid^32].i;
00758   __syncthreads();
00759   sh_buffer[tid].i += nbrsum;
00760 
00761   if (0==sh_buffer[tid].i) {
00762     map[mindex] = 0.f;
00763     return;
00764   }
00765 #endif
00766 
00767 #ifdef TEST_SHIFT_EXCL
00768   int numincl = 0;  // number of included points in thread block
00769   {
00770     int psum = isincluded;  // will have prefix sum over isincluded for tid
00771     sh_buffer[tid].i = isincluded;
00772 
00773     int nbrid = tid^1;
00774     int nbrsum = sh_buffer[nbrid].i;
00775     if (nbrid < tid) psum += nbrsum;
00776     sh_buffer[tid].i += nbrsum;
00777 
00778     nbrid = tid^2;
00779     nbrsum = sh_buffer[nbrid].i;
00780     if (nbrid < tid) psum += nbrsum;
00781     sh_buffer[tid].i += nbrsum;
00782 
00783     nbrid = tid^4;
00784     nbrsum = sh_buffer[nbrid].i;
00785     if (nbrid < tid) psum += nbrsum;
00786     sh_buffer[tid].i += nbrsum;
00787 
00788     nbrid = tid^8;
00789     nbrsum = sh_buffer[nbrid].i;
00790     if (nbrid < tid) psum += nbrsum;
00791     sh_buffer[tid].i += nbrsum;
00792 
00793     nbrid = tid^16;
00794     nbrsum = sh_buffer[nbrid].i;
00795     if (nbrid < tid) psum += nbrsum;
00796     sh_buffer[tid].i += nbrsum;
00797 
00798     __syncthreads();
00799     nbrid = tid^32;
00800     nbrsum = sh_buffer[nbrid].i;
00801     if (nbrid < tid) psum += nbrsum;
00802     __syncthreads();
00803     sh_buffer[tid].i += nbrsum;
00804 
00805     numincl = sh_buffer[tid].i;  // number of included points
00806     if (0==numincl) {
00807       map[mindex] = 0.f;
00808       return;  // all points in thread block excluded so exit early
00809     }
00810     __syncthreads();
00811     if (isincluded) {
00812       sh_buffer[psum-1].i = tid;  // assign my work to thread #(psum-1)
00813     }
00814   }
00815   __syncthreads();
00816   const int sid = sh_buffer[tid].i;  // shifted thread id for whom i calculate
00817 #endif
00818 
00819   const float cutoff2 = const_cutoff * const_cutoff;  // cutoff2 in register
00820 
00821   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00822   const int jb = (int) floorf(py * const_by_1);  // thread block
00823   const int kb = (int) floorf(pz * const_bz_1);
00824 
00825   const int binid = (kb * nby + jb)*nbx + ib;    // flat index of my bin
00826 
00827   int n, nb;
00828   int m;
00829   int ma;
00830   int nc;
00831 
00832 #ifdef TEST_SHIFT_EXCL
00833   if (tid < numincl) {
00834     int ssid = sid;
00835     const int sid_x = ssid & PADMASK;
00836     ssid >>= BSHIFT;
00837     const int sid_y = ssid & PADMASK;
00838     ssid >>= BSHIFT;
00839     const int sid_z = ssid;
00840     px += sid_x * const_hx;  // adjust position for shifted thread
00841     py += sid_y * const_hy;
00842     pz += sid_z * const_hz;
00843   }
00844 #else
00845   px += threadIdx.x * const_hx;  // adjust position for this thread
00846   py += threadIdx.y * const_hy;
00847   pz += threadIdx.z * const_hz;
00848 #endif
00849 
00850   px += const_x0;
00851   py += const_y0;
00852   pz += const_z0;  // translate to absolute position
00853 
00854 #ifdef TEST_SHIFT_EXCL
00855   sh_buffer[tid].f = 0.f;  // for summing occupancy
00856 #else
00857   float occ = 0.f;  // sum occupancy
00858 #endif
00859 
00860   const int twice_num_probes = const_num_probes << 1;
00861   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00862 
00863   // number of floats from conformers to move into shared memory
00864   const int total_shmem_copy = 3*NCONFPERLOOP * const_num_probes;
00865 
00866   // loop over all conformers
00867   for (m = 0;  m < const_num_conformers;  m += NCONFPERLOOP) {
00868 
00869     // reset shared memory energy accumulators
00870     for (nc = 0;  nc < NTPB_TIMES_NCPL;  nc += NTHREADSPERBLOCK) {
00871       usum[nc + tid] = 0.f;  // these are NOT shared
00872     }
00873 
00874     // cooperatively copy conformers from constant memory page 2+
00875     // into shared memory for improved performance
00876     // while we continue to access constant memory page 1 (8K)
00877     __syncthreads();
00878     if (tid < 32) {  // use only first warp
00879 
00880       // number of loop iterations for all threads in warp
00881       const int npass = (total_shmem_copy >> 5);
00882 
00883       // remaining number of floats to move
00884       const int nleft = (total_shmem_copy & 31);
00885 
00886       // skip over previously computed conformers (3*const_num_probes*m)
00887       const int skip = (const_num_probes + twice_num_probes) * m;
00888 
00889       int off = 0;
00890       for (nc = 0;  nc < npass;  nc++, off += 32) {
00891         conf[off + tid] = const_conformers[skip + off + tid];
00892       }
00893       if (tid < nleft) {
00894         conf[off + tid] = const_conformers[skip + off + tid];
00895       }
00896     }
00897     __syncthreads();  // wait for entire thread block
00898 
00899     for (nb = 0;  nb < const_num_binoffsets;  nb++) {  // bin neighborhood
00900 
00901       __syncthreads();  // wait for entire thread block before loading next bin
00902       if (tid < BIN_SIZE) {  // cache next atom bin
00903         const flint *p_bin = bin_zero
00904           + ((binid + const_binoffsets[nb]) << BIN_SHIFT);
00905         atombin[tid] = p_bin[tid];  // copy from global to shared memory
00906       }
00907       __syncthreads();  // now we have atom bin loaded
00908 
00909 #ifdef TEST_SHIFT_EXCL
00910     if (tid < numincl) {
00911 #endif
00912 
00913       for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00914         int vdwtype = atombin[n+3].i;
00915         if (-1 == vdwtype) break;  // no more atoms in bin
00916         vdwtype <<= 1;  // multiply by 2 for indexing VDW parms
00917         const float epsilon_atom = const_vdwparms[vdwtype];
00918         const float rmin_atom = const_vdwparms[vdwtype+1];
00919         const float ax = px - atombin[n  ].f;
00920         const float ay = py - atombin[n+1].f;
00921         const float az = pz - atombin[n+2].f;
00922 
00923         int sindex = 0;  // index conformers
00924        
00925         // loop shared memory conformers
00926         for (nc = 0;  nc < NTPB_TIMES_NCPL;  nc += NTHREADSPERBLOCK) {
00927 
00928           for (ma = 0;  ma < twice_num_probes;  ma += 2) {  // probe atoms
00929             const float dx = conf[sindex++] + ax;
00930             const float dy = conf[sindex++] + ay;
00931             const float dz = conf[sindex++] + az;
00932             const float r2 = dx*dx + dy*dy + dz*dz;
00933             if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00934               const float epsilon = epsilon_atom * const_probevdwparms[ma];
00935               const float rmin = rmin_atom + const_probevdwparms[ma+1];
00936               float rm6 = rmin*rmin / r2;
00937               rm6 = rm6 * rm6 * rm6;
00938 
00939               // XXX rather than reading/writing to shared memory here,
00940               //     we should be working with a register.  Only when we
00941               //     complete the two innermost loops should we be storing
00942               //     back to shared memory.  Shared memory limits performance
00943               //     to 66% of what's possible with a register...
00944               //     The stores to usum[] should be moved out to the 'nc' loop.
00945               // XXX Accumulating usum in 'nc' loop is actually a bit slower.
00946               //     Note that we only access shared memory if the system atom
00947               //     is within the cutoff.  Even with small atom bins we still
00948               //     have a large percentage of atoms outside of cutoff.
00949               usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
00950             }
00951           } // end loop over probe atoms
00952 
00953         } // end loop over shared memory conformers
00954 
00955       } // end loop over atoms in bin
00956 #ifdef TEST_SHIFT_EXCL
00957     } // end if (tid < numincl)
00958 #endif
00959 
00960     } // end loop over bin neighborhood
00961 
00962 #ifdef TEST_SHIFT_EXCL
00963   if (tid < numincl) {
00964 #endif
00965 
00966     for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00967       const int vdwtype = const_extra[n+3].i << 1; // mult by 2 index VDW parms
00968       const float epsilon_atom = const_vdwparms[vdwtype];
00969       const float rmin_atom = const_vdwparms[vdwtype+1];
00970       const float ax = px - const_extra[n  ].f;
00971       const float ay = py - const_extra[n+1].f;
00972       const float az = pz - const_extra[n+2].f;
00973       int sindex = 0;
00974     
00975       // loop over shared memory conformers
00976       for (nc = 0;  nc < NTPB_TIMES_NCPL;  nc += NTHREADSPERBLOCK) {
00977 
00978         for (ma = 0;  ma < twice_num_probes;  ma += 2) {  // probe atoms
00979           const float dx = conf[sindex++] + ax;
00980           const float dy = conf[sindex++] + ay;
00981           const float dz = conf[sindex++] + az;
00982           const float r2 = dx*dx + dy*dy + dz*dz;
00983           if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00984             const float epsilon = epsilon_atom * const_probevdwparms[ma];
00985             const float rmin = rmin_atom + const_probevdwparms[ma+1];
00986             float rm6 = rmin*rmin / r2;
00987             rm6 = rm6 * rm6 * rm6;
00988             usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
00989           }
00990         } // end loop over probe atoms
00991 
00992       } // end loop over shared memory conformers
00993 
00994     } // end loop over extra atoms
00995 
00996     for (nc = 0;  nc < NCONFPERLOOP;  nc++) {  // loop over some conformers
00997 
00998       if (m + nc < const_num_conformers) {  // ensure this 'nc' is meaningful
00999         float u = usum[(nc << NTPBSHIFT) + tid];
01000 
01001 #ifdef TEST_SHIFT_EXCL
01002         if (u < MAX_ALLOWABLE_ENERGY) {  // XXX should we test?
01003           sh_buffer[sid].f += expf(-u);  // the occupancy
01004         }
01005 #else
01006         if (isincluded && u < MAX_ALLOWABLE_ENERGY) {  // XXX should we test?
01007           occ += expf(-u);  // the occupancy
01008         }
01009 #endif
01010       }
01011 
01012     } // end loop over some conformers
01013 #ifdef TEST_SHIFT_EXCL
01014   } // end if (tid < numincl)
01015 #endif
01016 
01017   } // end loop over all conformers
01018 
01019 #ifdef TEST_SHIFT_EXCL
01020   __syncthreads();  // must wait for thread block before finishing
01021   sh_buffer[tid].f *= const_inv_numconf;  // averaged occupancy
01022   if (sh_buffer[tid].f <= const_min_occupancy) sh_buffer[tid].f = 0.f;
01023 
01024   map[mindex] = sh_buffer[tid].f;
01025 #else
01026   occ = occ * const_inv_numconf;  // averaged occupancy
01027   if (occ <= const_min_occupancy) occ = 0.f;
01028 
01029   map[mindex] = occ;
01030 #endif
01031 
01032 } // cuda_occupancy_multiatom()
01033 
01034 
01035 
01036 static void *cuda_occupancy_thread(void *voidparms) {
01037   occthrparms *parms = NULL;
01038   int mxpad, mypad, mzpad, mappad;
01039   float *h_map;       // padded map buffer on host
01040   float *d_map;       // padded map buffer on device
01041   int *d_excl;        // padded exclusion map buffer on device
01042   int binsz;          // size of bin buffer
01043   flint *d_bin;       // atom bins stored on device
01044   flint *d_bin_zero;  // shifted bin pointer on device
01045   int gpuid = -1;     // default: let VMD choose.
01046   int numgpus = 0;
01047   int i, j, k;
01048 
01049   // override device choice from environment for backward compatibility.
01050   if (getenv("VMDILSCUDADEVICE")) {
01051     gpuid = atoi(getenv("VMDILSCUDADEVICE"));
01052   } 
01053 
01054   if (gpuid < 0) {
01055     if (vmd_cuda_num_devices(&numgpus) != VMDCUDA_ERR_NONE) {
01056       printf("ILS CUDA device init error\n");
01057       return NULL;
01058     }
01059     if (numgpus > 0) {
01060       // XXX: here we should pick the first available GPU
01061       //      and skip over device GPUs. but how to do this 
01062       //      cleanly from in here? 
01063       //      change the code to use the device pool?
01064       // AK 2010/03/15
01065       gpuid = 0;
01066     } else {
01067       return NULL;
01068     }    
01069   }
01070 
01071   if (gpuid >= 0) {
01072     printf("Using CUDA device: %d\n", gpuid);
01073   } else {
01074     // no suitable GPU.
01075     return NULL;
01076   }
01077   
01078   wkf_timerhandle timer = wkf_timer_create();
01079 
01080   parms = (occthrparms *) voidparms;
01081   parms->errcode = -1;  // be pessimistic until the very end
01082 
01083   if (getenv("VMDILSVERBOSE")) {
01084     // XXX look at parms
01085     printf("*****************************************************************\n");
01086     printf("mx = %d  my = %d  mz = %d\n", parms->mx, parms->my, parms->mz);
01087     printf("hx = %g  hy = %g  hz = %g\n", parms->hx, parms->hy, parms->hz);
01088     printf("x0 = %g  y0 = %g  z0 = %g\n", parms->x0, parms->y0, parms->z0);
01089     printf("bx_1 = %g  by_1 = %g  bz_1 = %g\n",
01090            parms->bx_1, parms->by_1, parms->bz_1);
01091     printf("nbx = %d  nby = %d  nbz = %d\n", parms->nbx, parms->nby, parms->nbz);
01092     printf("num_binoffsets = %d\n", parms->num_binoffsets);
01093     printf("num_extras = %d\n", parms->num_extras);
01094     printf("num_vdwparms = %d\n", parms->num_vdwparms);
01095     printf("num_probes = %d\n", parms->num_probes);
01096     printf("num_conformers = %d\n", parms->num_conformers);
01097     printf("*****************************************************************\n");
01098   }
01099 
01100   // check that data will fit constant memory
01101   if (parms->num_binoffsets > MAX_BINOFFSETS) {
01102     printf("***** ERROR: Exceeded MAX_BINOFFSETS for CUDA kernel\n");
01103     return NULL;  // XXX How do I raise an error / exception?
01104   }
01105   if (parms->num_extras > MAX_EXTRAS) {
01106     printf("***** ERROR: Exceeded MAX_EXTRAS for CUDA kernel\n");
01107     return NULL;  // XXX How do I raise an error / exception?
01108   }
01109   if (parms->num_vdwparms > MAX_VDWPARMS) {
01110     printf("***** ERROR: Exceeded MAX_VDWPARMS for CUDA kernel\n");
01111     return NULL;  // XXX How do I raise an error / exception?
01112   }
01113   if (parms->num_probes > MAX_PROBES) {
01114     printf("***** ERROR: Exceeded MAX_PROBES for CUDA kernel\n");
01115     return NULL;  // XXX How do I raise an error / exception?
01116   }
01117   if (parms->num_conformers > MAX_CONFORMERS) {
01118     printf("***** ERROR: Exceeded MAX_CONFORMERS for CUDA kernel\n");
01119     return NULL;  // XXX How do I raise an error / exception?
01120   }
01121 
01122   // attach to GPU and check for errors
01123   cudaError_t rc;
01124   rc = cudaSetDevice(gpuid);
01125   if (rc != cudaSuccess) {
01126 #if CUDART_VERSION >= 2010
01127     rc = cudaGetLastError();  // query last error and reset error state
01128     if (rc != cudaErrorSetOnActiveProcess) {
01129       return NULL;  // abort and return an error
01130     }
01131 #else
01132     cudaGetLastError();  // just ignore and reset error state, since older CUDA
01133                          // revs don't have a cudaErrorSetOnActiveProcess enum
01134 #endif
01135   }
01136 
01137   // pad the map for CUDA thread blocks
01138   mxpad = (parms->mx + PADMASK) & ~PADMASK;
01139   mypad = (parms->my + PADMASK) & ~PADMASK;
01140   mzpad = (parms->mz + PADMASK) & ~PADMASK;
01141   mappad = mxpad * mypad * mzpad;
01142 
01143   if (getenv("VMDILSVERBOSE")) {
01144     printf("mxpad=%d  mypad=%d  mzpad=%d  mappad=%d\n",
01145            mxpad, mypad, mzpad, mappad);
01146   }
01147 
01148   h_map = (float *) malloc(mappad * sizeof(float));
01149   if (getenv("VMDILSVERBOSE")) {
01150     printf("Allocate %g MB for map\n", mappad*sizeof(float)/(1024*1024.f));
01151   }
01152   cudaMalloc((void **) &d_map, mappad * sizeof(float));
01153   CUERR;
01154   if (getenv("VMDILSVERBOSE")) {
01155     printf("Allocate %g MB for exclusions\n", mappad*sizeof(int)/(1024*1024.f));
01156   }
01157   cudaMalloc((void **) &d_excl, mappad * sizeof(int));
01158   CUERR;
01159 
01160 #if !defined(USE_DISTANCE_EXCL) && !defined(USE_ENERGY_EXCL)
01161   // set all points to be excluded by default
01162   cudaMemset(d_excl, 0, mappad * sizeof(int));
01163   CUERR;
01164 #endif
01165 
01166   binsz = BIN_SIZE * parms->nbx * parms->nby * parms->nbz;
01167   if (getenv("VMDILSVERBOSE")) {
01168     printf("nbx=%d  nby=%d  nbz=%d  binsz=%d\n",
01169            parms->nbx, parms->nby, parms->nbz, binsz);
01170     printf("Allocate %g MB for atom bins\n", binsz*sizeof(flint)/(1024*1024.f));
01171   }
01172   cudaMalloc((void **) &d_bin, binsz * sizeof(flint));
01173   CUERR;
01174   cudaMemcpy(d_bin, parms->bin, binsz * sizeof(flint), cudaMemcpyHostToDevice);
01175   CUERR;
01176   d_bin_zero = d_bin + (parms->bin_zero - parms->bin);
01177 
01178   if (getenv("VMDILSVERBOSE")) {
01179     printf("parms delta bin = %d   delta bin = %d\n",
01180            (parms->bin_zero - parms->bin), (d_bin_zero - d_bin));
01181     
01182     printf("probe epsilon=%g rmin=%g\n",
01183            parms->probevdwparms[0], parms->probevdwparms[1]);
01184     printf("max_energy=%g\n", parms->max_energy);
01185     printf("cutoff=%g\n", parms->cutoff);
01186     printf("hx=%g hy=%g hz=%g\n", parms->hx, parms->hy, parms->hz);
01187     printf("x0=%g y0=%g z0=%g\n", parms->x0, parms->y0, parms->z0);
01188     printf("bx_1=%g by_1=%g bz_1=%g\n", parms->bx_1, parms->by_1, parms->bz_1);
01189   }
01190 
01191   cudaMemcpyToSymbol(const_max_energy, &(parms->max_energy), sizeof(float), 0);
01192   CUERR;
01193   float min_occ = expf(-parms->max_energy);  // occupancy threshold
01194   cudaMemcpyToSymbol(const_min_occupancy, &min_occ, sizeof(float), 0);
01195   CUERR;
01196   cudaMemcpyToSymbol(const_cutoff, &(parms->cutoff), sizeof(float), 0);
01197   CUERR;
01198   cudaMemcpyToSymbol(const_min_excldist, &(parms->min_excldist),
01199       sizeof(float), 0);
01200   CUERR;
01201   cudaMemcpyToSymbol(const_hx, &(parms->hx), sizeof(float), 0);
01202   CUERR;
01203   cudaMemcpyToSymbol(const_hy, &(parms->hy), sizeof(float), 0);
01204   CUERR;
01205   cudaMemcpyToSymbol(const_hz, &(parms->hz), sizeof(float), 0);
01206   CUERR;
01207   cudaMemcpyToSymbol(const_x0, &(parms->x0), sizeof(float), 0);
01208   CUERR;
01209   cudaMemcpyToSymbol(const_y0, &(parms->y0), sizeof(float), 0);
01210   CUERR;
01211   cudaMemcpyToSymbol(const_z0, &(parms->z0), sizeof(float), 0);
01212   CUERR;
01213   cudaMemcpyToSymbol(const_bx_1, &(parms->bx_1), sizeof(float), 0);
01214   CUERR;
01215   cudaMemcpyToSymbol(const_by_1, &(parms->by_1), sizeof(float), 0);
01216   CUERR;
01217   cudaMemcpyToSymbol(const_bz_1, &(parms->bz_1), sizeof(float), 0);
01218   CUERR;
01219   float inv_numconf = 1.f / parms->num_conformers;
01220   cudaMemcpyToSymbol(const_inv_numconf, &inv_numconf, sizeof(float), 0);
01221 
01222   if (getenv("VMDILSVERBOSE")) {
01223     printf("num_binoffsets = %d\n", parms->num_binoffsets);
01224   }
01225   cudaMemcpyToSymbol(const_num_binoffsets, &(parms->num_binoffsets),
01226       sizeof(int), 0);
01227   CUERR;
01228   if (getenv("VMDILSVERBOSE")) {
01229     printf("num_extras = %d\n", parms->num_extras);
01230   }
01231   cudaMemcpyToSymbol(const_num_extras, &(parms->num_extras), sizeof(int), 0);
01232   CUERR;
01233   if (getenv("VMDILSVERBOSE")) {
01234     printf("num_probes = %d\n", parms->num_probes);
01235   }
01236   cudaMemcpyToSymbol(const_num_probes, &(parms->num_probes), sizeof(int), 0);
01237   CUERR;
01238   if (getenv("VMDILSVERBOSE")) {
01239     printf("num_conformers = %d\n", parms->num_conformers);
01240   }
01241   cudaMemcpyToSymbol(const_num_conformers, &(parms->num_conformers),
01242       sizeof(int), 0);
01243   CUERR;
01244 
01245   if (getenv("VMDILSVERBOSE")) {
01246     printf("num_vdwparms = %d\n", parms->num_vdwparms);
01247     for (i = 0;  i < parms->num_vdwparms;  i++) {
01248       printf("  %2d:  epsilon=%g  rmin=%g\n", i,
01249              parms->vdwparms[2*i], parms->vdwparms[2*i+1]);
01250     }
01251   }
01252 
01253   cudaMemcpyToSymbol(const_vdwparms, parms->vdwparms,
01254       2 * parms->num_vdwparms * sizeof(float), 0);
01255   CUERR;
01256   cudaMemcpyToSymbol(const_probevdwparms, parms->probevdwparms,
01257       2 * (0==parms->num_probes ? 1 : parms->num_probes) * sizeof(float), 0);
01258   CUERR;
01259   cudaMemcpyToSymbol(const_extra, parms->extra,
01260       BIN_SLOTSIZE * parms->num_extras * sizeof(flint), 0);
01261   CUERR;
01262 
01263   // take 3D bin offset and calculate array of flat bin offset
01264   {
01265     const int nbx = parms->nbx;
01266     const int nby = parms->nby;
01267     int n;
01268     int *h_flatbinoffset = (int *) calloc(parms->num_binoffsets, sizeof(int));
01269     for (n = 0;  n < parms->num_binoffsets;  n++) {
01270       int i = (int) (parms->binoffsets[3*n  ]);
01271       int j = (int) (parms->binoffsets[3*n+1]);
01272       int k = (int) (parms->binoffsets[3*n+2]);
01273       int index = (k*nby + j)*nbx + i;
01274       h_flatbinoffset[n] = index;
01275     }
01276     cudaMemcpyToSymbol(const_binoffsets, h_flatbinoffset,
01277         parms->num_binoffsets * sizeof(int), 0);
01278     CUERR;
01279     free(h_flatbinoffset);
01280   }
01281 
01282   // calculate array of flat bin offsets for distance-based exclusions
01283   {
01284     const int nbx = parms->nbx;
01285     const int nby = parms->nby;
01286     int i, j, k, n=0;
01287     int h_flatexcloffset[27];  // 3x3x3 cube
01288     for (k = -1;  k <= 1;  k++) {
01289       for (j = -1;  j <= 1;  j++) {
01290         for (i = -1;  i <= 1;  i++) {
01291           int index = (k*nby + j)*nbx + i;
01292           h_flatexcloffset[n++] = index;
01293         }
01294       }
01295     }
01296     cudaMemcpyToSymbol(const_excloffsets, h_flatexcloffset, 27*sizeof(int), 0);
01297     CUERR;
01298   }
01299 
01300   cudaMemcpyToSymbol(const_conformers, parms->conformers,
01301       3 * parms->num_probes * parms->num_conformers * sizeof(float), 0);
01302   CUERR;
01303 
01304   // cluster slabs based on time per work unit;
01305   // the following constants empirically derived from
01306   // oxygen testcase run on GT200
01307 #define TIME_PER_MONOATOM_WORKUNIT   1.24553913519431e-10
01308 #define TIME_PER_MULTIATOM_WORKUNIT  5.24559386973180e-09
01309 //#define TIME_PER_MONOATOM_WORKUNIT   3.73661740558292e-09
01310 //#define TIME_PER_MULTIATOM_WORKUNIT  1.57367816091954e-07
01311 
01312   int mxblocks = (mxpad >> BSHIFT);
01313   int myblocks = (mypad >> BSHIFT);
01314   int mzblocks = (mzpad >> BSHIFT);
01315   int nblocksperplane = mxblocks * myblocks;
01316   int maxplanes = MAX_THREADBLOCKS / nblocksperplane;
01317   int numconf = (0==parms->num_conformers ? 1 : parms->num_conformers);
01318 
01319   double monoatom_time_per_plane = (double)(parms->num_probes * numconf *
01320       parms->num_binoffsets * nblocksperplane) * TIME_PER_MONOATOM_WORKUNIT;
01321 
01322   double multiatom_time_per_plane = (double)(parms->num_probes * numconf *
01323       parms->num_binoffsets * nblocksperplane) * TIME_PER_MULTIATOM_WORKUNIT;
01324 
01325   // number of planes to schedule per slab
01326 #define MAX_TIME_PER_KERNEL  1.f  // schedule up to 1 second per kernel call
01327   int max_mono_planes = (int)(MAX_TIME_PER_KERNEL / monoatom_time_per_plane);
01328   int max_multi_planes = (int)(MAX_TIME_PER_KERNEL / multiatom_time_per_plane);
01329 
01330   if (max_mono_planes > maxplanes) max_mono_planes = maxplanes;
01331   else if (max_mono_planes < 1)    max_mono_planes = 1;
01332 
01333   if (max_multi_planes > maxplanes) max_multi_planes = maxplanes;
01334   else if (max_multi_planes < 1)    max_multi_planes = 1;
01335 
01336   if (getenv("VMDILSVERBOSE")) {
01337     printf("mxblocks = %d  myblocks = %d  mzblocks = %d\n",
01338            mxblocks, myblocks, mzblocks);
01339   }
01340   if (nblocksperplane > MAX_THREADBLOCKS) {
01341     printf("***** ERROR: Too many blocks in X-Y plane for CUDA kernel\n");
01342     return NULL;  // XXX raise exception
01343   }
01344 
01345   if (0 == parms->num_conformers) {  // monoatom
01346     if (getenv("VMDILSVERBOSE")) {
01347       printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01348         "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01349     }
01350     int mzblockoffset = 0;
01351     int nplanes_remaining = mzblocks;
01352     while (nplanes_remaining > 0) {
01353 #ifndef DO_ONE_PLANE
01354       // do as many planes as possible given constraints
01355       int nplanes = nplanes_remaining;
01356       if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01357 #else
01358       // XXX do only one slice at a time
01359       int nplanes = 1;
01360 #endif
01361       dim3 gsz, bsz;
01362       gsz.x = mxblocks;
01363       gsz.y = myblocks * nplanes;
01364       gsz.z = 1;
01365       bsz.x = THBLEN;
01366       bsz.y = THBLEN;
01367       bsz.z = THBLEN;
01368 
01369       if ( ! getenv("VMDILSNOEXCL") ) {
01370 #ifdef USE_DISTANCE_EXCL
01371       // distance exclusions
01372 #ifdef DEBUG
01373       printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01374           "on %2d planes   ",
01375           nplanes);
01376 #endif
01377 #if 1
01378       cudaDeviceSynchronize();
01379 #endif
01380       wkf_timer_start(timer);
01381       cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01382           d_excl, d_bin_zero,
01383           parms->nbx, parms->nby,
01384 #ifndef DO_ONE_PLANE
01385           nplanes,
01386 #endif
01387           mzblockoffset);
01388 #if 1
01389       cudaDeviceSynchronize();
01390 #endif
01391 #ifdef DEBUG
01392       printf("%f s\n", wkf_timer_timenow(timer));
01393 #endif
01394       CUERR;
01395 #endif // USE_DISTANCE_EXCL
01396       }
01397 
01398       // monoatom occupancy
01399 #ifdef DEBUG
01400      printf("*** VMD CUDA: cuda_occupancy_monoatom()       "
01401           "on %2d planes   ",
01402           nplanes);
01403 #endif
01404 #if 1
01405       cudaDeviceSynchronize();
01406 #endif
01407       wkf_timer_start(timer);
01408       cuda_occupancy_monoatom<<<gsz, bsz, 0>>>(
01409           d_map, d_excl, d_bin_zero,
01410           parms->nbx, parms->nby,
01411 #ifndef DO_ONE_PLANE
01412           nplanes,
01413 #endif
01414           mzblockoffset);
01415 #if 1
01416       cudaDeviceSynchronize();
01417 #endif
01418 #ifdef DEBUG
01419       printf("%f s\n", wkf_timer_timenow(timer));
01420 #endif
01421       CUERR;
01422 
01423       nplanes_remaining -= nplanes;
01424       mzblockoffset += nplanes;
01425     }
01426   } // if monoatom
01427 
01428   else { // multiatom
01429     if (getenv("VMDILSVERBOSE")) {
01430       printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01431         "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01432       printf("Scheduling up to %d planes (%d thread blocks) per multiatom "
01433         "kernel call\n", max_multi_planes, nblocksperplane*max_multi_planes);
01434     }
01435 
01436     int mzblockoffset;
01437     int nplanes_remaining;
01438 
01439     if ( ! getenv("VMDILSNOEXCL") ) {
01440     // first do "monoatom" slabs for exclusions
01441     mzblockoffset = 0;
01442     nplanes_remaining = mzblocks;
01443     while (nplanes_remaining > 0) {
01444 #ifndef DO_ONE_PLANE
01445       // do as many planes as possible given constraints
01446       int nplanes = nplanes_remaining;
01447       if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01448 #else
01449       // XXX do only one slice at a time
01450       int nplanes = 1;
01451 #endif
01452       dim3 gsz, bsz;
01453       gsz.x = mxblocks;
01454       gsz.y = myblocks * nplanes;
01455       gsz.z = 1;
01456       bsz.x = THBLEN;
01457       bsz.y = THBLEN;
01458       bsz.z = THBLEN;
01459 
01460 #ifdef USE_DISTANCE_EXCL
01461       // distance exclusions
01462 #ifdef DEBUG
01463       printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01464           "on %2d planes   ",
01465           nplanes);
01466 #endif
01467 #if 1
01468       cudaDeviceSynchronize();
01469 #endif
01470       wkf_timer_start(timer);
01471       cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01472           d_excl, d_bin_zero,
01473           parms->nbx, parms->nby,
01474 #ifndef DO_ONE_PLANE
01475           nplanes,
01476 #endif
01477           mzblockoffset);
01478 #if 1
01479       cudaDeviceSynchronize();
01480 #endif
01481 #ifdef DEBUG
01482       printf("%f s\n", wkf_timer_timenow(timer));
01483 #endif
01484       CUERR;
01485 #endif // USE_DISTANCE_EXCL
01486 
01487 #ifdef USE_ENERGY_EXCL
01488       // energy exclusions
01489 #ifdef DEBUG
01490       printf("*** VMD CUDA: cuda_find_energy_exclusions()   "
01491           "on %2d planes   ",
01492           nplanes);
01493 #endif
01494 #if 1
01495       cudaDeviceSynchronize();
01496 #endif
01497       wkf_timer_start(timer);
01498       cuda_find_energy_exclusions<<<gsz, bsz, 0>>>(
01499           d_excl, d_bin_zero,
01500           parms->nbx, parms->nby,
01501 #ifndef DO_ONE_PLANE
01502           nplanes,
01503 #endif
01504           mzblockoffset);
01505 #if 1
01506       cudaDeviceSynchronize();
01507 #endif
01508 #ifdef DEBUG
01509       printf("%f s\n", wkf_timer_timenow(timer));
01510 #endif
01511       CUERR;
01512 #endif // USE_ENERGY_EXCL
01513 
01514       nplanes_remaining -= nplanes;
01515       mzblockoffset += nplanes;
01516     }
01517     } // ( ! getenv("VMDILSNOEXCL") )
01518 
01519     // next do "multiatom" slabs for exclusions
01520     mzblockoffset = 0;
01521     nplanes_remaining = mzblocks;
01522     while (nplanes_remaining > 0) {
01523 #ifndef DO_ONE_PLANE
01524       // do as many planes as possible given constraints
01525       int nplanes = nplanes_remaining;
01526       if (nplanes > max_multi_planes) nplanes = max_multi_planes;
01527 #else
01528       // XXX do only one slice at a time
01529       int nplanes = 1;
01530 #endif
01531       dim3 gsz, bsz;
01532       gsz.x = mxblocks;
01533       gsz.y = myblocks * nplanes;
01534       gsz.z = 1;
01535       bsz.x = THBLEN;
01536       bsz.y = THBLEN;
01537       bsz.z = THBLEN;
01538 
01539       // multiatom occupancy
01540 #ifdef DEBUG
01541       printf("*** VMD CUDA: cuda_occupancy_multiatom()      "
01542           "on %2d planes   ",
01543           nplanes);
01544 #endif
01545 #if 1
01546       cudaDeviceSynchronize();
01547 #endif
01548       wkf_timer_start(timer);
01549       cuda_occupancy_multiatom<<<gsz, bsz, 0>>>(
01550           d_map, d_excl, d_bin_zero,
01551           parms->nbx, parms->nby,
01552 #ifndef DO_ONE_PLANE
01553           nplanes,
01554 #endif
01555           mzblockoffset);
01556 #if 1
01557       cudaDeviceSynchronize();
01558 #endif
01559 #ifdef DEBUG
01560       printf("%f s\n", wkf_timer_timenow(timer));
01561 #endif
01562       CUERR;
01563 
01564       nplanes_remaining -= nplanes;
01565       mzblockoffset += nplanes;
01566     }
01567   } // else multiatom
01568 
01569   // retrieve padded map from GPU
01570   cudaMemcpy(h_map, d_map, mappad * sizeof(float), cudaMemcpyDeviceToHost);
01571   CUERR;
01572 
01573   // copy padded map into user map buffer
01574   // must transpose the thread blocks
01575   for (k = 0;  k < parms->mz;  k++) {
01576     for (j = 0;  j < parms->my;  j++) {
01577       for (i = 0;  i < parms->mx;  i++) {
01578         int mindex = (k*parms->my + j)*parms->mx + i;
01579         int block = ((k >> BSHIFT)*myblocks + (j >> BSHIFT))*mxblocks
01580           + (i >> BSHIFT);
01581         int tid = ((k & PADMASK)*THBLEN + (j & PADMASK))*THBLEN + (i & PADMASK);
01582         int pindex = block*(THBLEN*THBLEN*THBLEN) + tid;
01583         parms->map[mindex] = h_map[pindex];
01584       }
01585     }
01586   }
01587 
01588   cudaFree(d_excl);
01589   cudaFree(d_bin);
01590   cudaFree(d_map);
01591   CUERR;
01592 
01593   free(h_map);
01594   wkf_timer_destroy(timer);
01595 
01596   parms->errcode = 0;  // success!
01597   return NULL;
01598 }
01599 
01600 
01601 extern "C" int vmd_cuda_evaluate_occupancy_map(
01602     int mx, int my, int mz,             // map dimensions
01603     float *map,                         // buffer space for occupancy map
01604                                         // (length mx*my*mz floats)
01605 
01606     float max_energy,                   // max energy threshold
01607     float cutoff,                       // vdw cutoff distance
01608     float hx, float hy, float hz,       // map lattice spacing
01609     float x0, float y0, float z0,       // map origin
01610     float bx_1, float by_1, float bz_1, // inverse of atom bin lengths
01611 
01612     int nbx, int nby, int nbz,          // bin dimensions
01613     const float *bin,                   // atom bins XXX typecast to flint
01614                                         // (length BIN_SIZE*nbx*nby*nbz)
01615     const float *bin_zero,              // bin pointer shifted to origin
01616 
01617     int num_binoffsets,                 // number of offsets
01618     const char *binoffsets,             // bin neighborhood index offsets
01619                                         // (length 3*num_bin_offsets)
01620 
01621     int num_extras,                     // number of extra atoms
01622     const float *extra,                 // extra atoms from overfilled bins
01623                                         // XXX typecast to flint
01624                                         // (length BIN_SLOTSIZE*num_extras)
01625 
01626     int num_vdwparms,                   // number of vdw parameter types
01627     const float *vdwparms,              // vdw parameters
01628                                         // (length 2*num_vdw_params)
01629 
01630     int num_probes,                     // number of probe atoms
01631     const float *probevdwparms,         // vdw parameters of probe atoms
01632                                         // (length 2*num_probes)
01633 
01634     int num_conformers,                 // number of conformers
01635     const float *conformers             // probe atom offsets for conformers
01636                                         // (length 3*num_probes*num_conformers)
01637     ) {
01638   occthrparms parms;
01639 
01640   parms.mx = mx;
01641   parms.my = my;
01642   parms.mz = mz;
01643   parms.map = map;
01644   parms.max_energy = max_energy;
01645   parms.cutoff = cutoff;
01646 #define DEFAULT_EXCL_DIST  1.f
01647   parms.min_excldist = DEFAULT_EXCL_DIST;
01648   parms.hx = hx;
01649   parms.hy = hy;
01650   parms.hz = hz;
01651   parms.x0 = x0;
01652   parms.y0 = y0;
01653   parms.z0 = z0;
01654   parms.bx_1 = bx_1;
01655   parms.by_1 = by_1;
01656   parms.bz_1 = bz_1;
01657   parms.nbx = nbx;
01658   parms.nby = nby;
01659   parms.nbz = nbz;
01660   parms.bin = (flint *) bin;
01661   parms.bin_zero = (flint *) bin_zero;
01662   parms.num_binoffsets = num_binoffsets;
01663   parms.binoffsets = binoffsets;
01664   parms.num_extras = num_extras;
01665   parms.extra = (flint *) extra;
01666   parms.num_vdwparms = num_vdwparms;
01667   parms.vdwparms = vdwparms;
01668   parms.num_probes = num_probes;
01669   parms.probevdwparms = probevdwparms;
01670   parms.num_conformers = num_conformers;
01671   parms.conformers = conformers;
01672 
01673 #ifdef DEBUG
01674   printf("*** calling cuda_occupancy_thread()\n");
01675 #endif
01676   cuda_occupancy_thread(&parms);
01677 
01678   return parms.errcode;
01679 }