Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDAVolMapCreateILS.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2008-2009 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAVolMapCreateILS.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.49 $      $Date: 2011/02/04 18:49:07 $
00014  *
00015  ***************************************************************************
00016  * DESCRIPTION:
00017  * This source file contains CUDA kernels for calculating the
00018  * occupancy map for implicit ligand sampling.
00019  *
00020  ***************************************************************************/
00021 
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <cuda.h>
00026 
00027 #include "WKFThreads.h"
00028 #include "WKFUtils.h"
00029 #include "CUDAKernels.h"
00030 #include "utilities.h"
00031 
00032 //#define DEBUG 1
00033 
00034 #if defined(VMDTHREADS)
00035 #define USE_CUDADEVPOOL 1
00036 #endif
00037 
00038 #if 1
00039 #define CUERR \
00040   do { \
00041     cudaError_t err; \
00042     if ((err = cudaGetLastError()) != cudaSuccess) { \
00043       printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00044       printf("Thread aborting...\n"); \
00045       return NULL; \
00046     } \
00047   } while (0)
00048 // to be terminated with semi-colon
00049 #else
00050 #define CUERR
00051 #endif
00052 
00053 //#define DEBUG
00054 
00055 typedef union flint_t {
00056   float f;
00057   int i;
00058 } flint;
00059 
00060 
00061 //
00062 // algorithm optimizations
00063 //
00064 
00065 // Computing distance-based exclusions either along with or instead of
00066 // energy-based exclusions turns out to decrease performance.
00067 //#define USE_DISTANCE_EXCL  // use the distance-based exclusion kernel
00068 #define USE_ENERGY_EXCL    // use the energy-based exclusion kernel
00069 
00070 #ifdef USE_DISTANCE_EXCL
00071 #define TEST_BLOCK_EXCL    // early exit if entire thread block is excluded
00072 // don't enable unless we have distance-based exclusions
00073 #endif
00074 
00075 #if defined(USE_ENERGY_EXCL) || defined(USE_DISTANCE_EXCL)
00076 #define TEST_MBLOCK_EXCL     // early exit from multiatom kernel
00077 #define TEST_SHIFT_EXCL
00078 #endif
00079 
00080 #ifdef TEST_SHIFT_EXCL
00081 #undef TEST_MBLOCK_EXCL
00082 #endif
00083 
00084 
00085 #define MAX_THREADBLOCKS  65536
00086 
00087 #define BIN_DEPTH     8  // number of slots per bin
00088 #define BIN_SLOTSIZE  4  // slot permits x, y, z, vdwtype  (4 elements)
00089 #define BIN_SIZE      (BIN_DEPTH * BIN_SLOTSIZE)  // size given in "flints"
00090 #define BIN_SHIFT     5  // log2(BIN_SIZE)
00091 #define BIN_SLOTSHIFT 2  // log2(BIN_SLOTSIZE)
00092 
00093 typedef struct occthrparms_t {
00094   int errcode;                   // 0 for success, -1 for failure
00095 
00096   int mx, my, mz;                // map dimensions
00097   float *map;                    // buffer space for occupancy map
00098                                  // (length mx*my*mz floats)
00099 
00100   float max_energy;              // max energy threshold
00101   float cutoff;                  // vdw cutoff distance
00102   float min_excldist;            // min exclusion distance
00103   float hx, hy, hz;              // map lattice spacing
00104   float x0, y0, z0;              // map origin
00105   float bx_1, by_1, bz_1;        // inverse of atom bin lengths
00106 
00107   int nbx, nby, nbz;             // bin dimensions
00108   const flint *bin;              // atom bins
00109                                  // (length BIN_SIZE*nbx*nby*nbz)
00110   const flint *bin_zero;         // bin pointer shifted to origin
00111 
00112   int num_binoffsets;            // number of offsets
00113   const char *binoffsets;        // bin neighborhood index offsets
00114                                  // (length 3*num_bin_offsets)
00115 
00116   int num_extras;                // number of extra atoms
00117   const flint *extra;            // extra atoms from overfilled bins
00118                                  // (length BIN_SLOTSIZE*num_extras)
00119 
00120   int num_vdwparms;              // number of vdw parameter types
00121   const float *vdwparms;         // vdw parameters
00122                                  // (length 2*num_vdw_params)
00123 
00124   int num_probes;                // number of probe atoms
00125   const float *probevdwparms;    // vdw parameters of probe atoms
00126                                  // (length 2*num_probes)
00127 
00128   int num_conformers;            // number of conformers
00129   const float *conformers;       // probe atom offsets for conformers
00130                                  // (length 3*num_probes*num_conformers)
00131 } occthrparms;
00132 
00133 
00134 // constant memory, page 1 (8K)
00135 //
00136 __constant__ static float const_max_energy;     // energy threshold
00137 __constant__ static float const_min_occupancy;  // occupancy threshold
00138 __constant__ static float const_cutoff;         // cutoff distance
00139 __constant__ static float const_min_excldist;   // min exclusion distance
00140 __constant__ static float const_hx, const_hy, const_hz;  // map spacings
00141 __constant__ static float const_x0, const_y0, const_z0;  // map origin
00142 __constant__ static float const_bx_1, const_by_1, const_bz_1;
00143                                                 // inverse of bin lengths
00144 __constant__ static float const_inv_numconf;    // inverse number conformers
00145 
00146 __constant__ static int const_num_binoffsets;   // need use lengths of arrays
00147 __constant__ static int const_num_extras;
00148 __constant__ static int const_num_probes;
00149 __constant__ static int const_num_conformers;
00150 
00151 #define MAX_VDWPARMS  160
00152 __constant__ static float const_vdwparms[2 * MAX_VDWPARMS];
00153 
00154 #define MAX_PROBES  8
00155 __constant__ static float const_probevdwparms[2 * MAX_PROBES];
00156 
00157 #define MAX_EXTRAS  50
00158 __constant__ static flint const_extra[BIN_SLOTSIZE * MAX_EXTRAS];
00159 
00160 // each offset is a single flat index offset into atom bin array
00161 #define MAX_BINOFFSETS  1467
00162 __constant__ static int const_binoffsets[MAX_BINOFFSETS];
00163 
00164 // nearest neighbor list for distance-based exclusions
00165 #define MAX_EXCLOFFSETS  27
00166 __constant__ static int const_excloffsets[MAX_EXCLOFFSETS];
00167 
00168 // constant memory, pages 2 - 7
00169 //
00170 // the max number of conformers is chosen to leave 1K available
00171 // to the runtime library
00172 //
00173 #define MAX_CONFORMERS 586
00174 __constant__ static float const_conformers[3*MAX_PROBES*MAX_CONFORMERS];
00175 
00176 
00177 #define DO_ONE_PLANE         // optimze when z-slabs are slices of thickness 1
00178 #undef DO_ONE_PLANE
00179 
00180 #define THBLEN   4           // thread block length of cube
00181 #define NTHREADSPERBLOCK  (THBLEN*THBLEN*THBLEN)
00182 
00183 #define BSHIFT   2           // shift for map points to blocks = log2(THBLEN)
00184 
00185 #define NTPBSHIFT  6         // do bit shifting instead of int multiplies
00186                              //  = log2(NTHREADPERBLOCK)
00187 
00188 // Performance is improved for exclusion optimizations
00189 // if we reduce shared memory use to < 4K
00190 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00191 #define NCONFPERLOOP       5
00192 #else
00193 #define NCONFPERLOOP       8
00194 #endif
00195 
00196 #define NTPB_TIMES_NCPL  (NTHREADSPERBLOCK * NCONFPERLOOP)
00197 
00198 #define PADMASK  (THBLEN-1)  // bit mask for padding up to multiple of THBLEN
00199 
00200 #define MAX_ALLOWABLE_ENERGY  87.f  // otherwise expf(-u) gives 0
00201 
00202 
00204 //
00205 //     ROUTINES FOR EXCLUSIONS
00206 //
00208 
00209 #ifdef USE_DISTANCE_EXCL
00210 __global__ static void cuda_find_distance_exclusions(
00211     int *excl,              // exclusions, stored in contiguous thread blocks
00212     const flint *bin_zero,  // points to bin containing map origin
00213     int nbx, int nby,       // number of bins along x and y dimensions
00214 #ifndef DO_ONE_PLANE
00215     int mzblocks,           // number of thread blocks in z dimension
00216 #endif
00217     int mzblockoffset       // offset (number of planes) to starting z
00218     ) {
00219 
00220   // space for 1 atom bin in shared memory
00221   __shared__ flint atombin[BIN_SIZE];
00222 
00223 #ifndef DO_ONE_PLANE
00224   const int nblockx = gridDim.x;
00225   const int nblocky = gridDim.y / mzblocks;
00226 
00227   // 3D index for this thread block
00228   const int bidx = blockIdx.x;
00229   const int bidy = blockIdx.y % nblocky;
00230   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00231 
00232   // block ID, flat index
00233   // XXX assume first mult is within 24-bits
00234   const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00235 
00236   // thread ID, flat index
00237   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00238 
00239   // position of lower left corner of this thread block
00240   float px = (bidx<<BSHIFT) * const_hx;
00241   float py = (bidy<<BSHIFT) * const_hy;
00242   float pz = (bidz<<BSHIFT) * const_hz;
00243 #else
00244   // can optimize from above for case when mzblocks = 1
00245 
00246   // block ID, flat index
00247   // XXX assume first mult is within 24-bits
00248   const int bid
00249     = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00250 
00251   // thread ID, flat index
00252   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00253 
00254   // position of lower left corner of this thread block
00255   float px = (blockIdx.x<<BSHIFT) * const_hx;
00256   float py = (blockIdx.y<<BSHIFT) * const_hy;
00257   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00258 #endif
00259 
00260   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00261   int isexcluded = 0;  // assume it isn't excluded
00262 
00263   const float mindis2 = const_min_excldist * const_min_excldist;
00264 
00265   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00266   const int jb = (int) floorf(py * const_by_1);  // thread block
00267   const int kb = (int) floorf(pz * const_bz_1);
00268 
00269   // XXX assume first mult is within 24-bits
00270   const int binid = (__umul24(kb,nby) + jb)*nbx + ib;  // flat index of my bin
00271 
00272   int n, nb;
00273 
00274   px += threadIdx.x * const_hx;  // adjust position for this thread
00275   py += threadIdx.y * const_hy;
00276   pz += threadIdx.z * const_hz;
00277 
00278   px += const_x0;
00279   py += const_y0;
00280   pz += const_z0;  // translate to absolute position
00281 
00282   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00283 
00284   for (nb = 0;  nb < MAX_EXCLOFFSETS;  nb++) {
00285 
00286     if (tid < BIN_SIZE) {  // cache next atom bin
00287       const flint *p_bin = bin_zero
00288         + ((binid + const_excloffsets[nb])<<BIN_SHIFT);
00289       atombin[tid] = p_bin[tid];  // copy from global to shared memory
00290     }
00291     __syncthreads();  // now we have atom bin loaded
00292 
00293     for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00294       
00295       if (-1 == atombin[n+3].i) break;  // no more atoms in bin
00296       const float dx = px - atombin[n  ].f;
00297       const float dy = py - atombin[n+1].f;
00298       const float dz = pz - atombin[n+2].f;
00299       const float r2 = dx*dx + dy*dy + dz*dz;
00300 
00301       if (r2 <= mindis2) isexcluded = 1;
00302 
00303     } // end loop over atoms in bin
00304 
00305     __syncthreads();  // wait for thread block before loading next bin
00306   } // end loop over bin neighborhood
00307 
00308   for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00309     const float dx = px - const_extra[n  ].f;
00310     const float dy = py - const_extra[n+1].f;
00311     const float dz = pz - const_extra[n+2].f;
00312     const float r2 = dx*dx + dy*dy + dz*dz;
00313 
00314     if (r2 <= mindis2) isexcluded = 1;
00315 
00316   } // end loop over extra atoms
00317 
00318   excl[mindex] = isexcluded;
00319 }
00320 #endif // USE_DISTANCE_EXCL
00321 
00322 
00323 #ifdef USE_ENERGY_EXCL
00324 __global__ static void cuda_find_energy_exclusions(
00325     int *excl,              // exclusions, stored in contiguous thread blocks
00326     const flint *bin_zero,  // points to bin containing map origin
00327     int nbx, int nby,       // number of bins along x and y dimensions
00328 #ifndef DO_ONE_PLANE
00329     int mzblocks,           // number of thread blocks in z dimension
00330 #endif
00331     int mzblockoffset       // offset (number of planes) to starting z
00332     ) {
00333 
00334   // space for 1 atom bin in shared memory
00335   __shared__ flint atombin[BIN_SIZE];
00336 
00337 #ifdef TEST_BLOCK_EXCL
00338 #if 0
00339   __shared__ int sh_isall_upper_warp;
00340 #endif
00341   __shared__ int sh_sum[NTHREADSPERBLOCK];
00342 #endif
00343 
00344 #ifndef DO_ONE_PLANE
00345   const int nblockx = gridDim.x;
00346   const int nblocky = gridDim.y / mzblocks;
00347 
00348   // 3D index for this thread block
00349   const int bidx = blockIdx.x;
00350   const int bidy = blockIdx.y % nblocky;
00351   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00352 
00353   // block ID, flat index
00354   // XXX assume first mult is within 24-bits
00355   const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00356 
00357   // thread ID, flat index
00358   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00359 
00360   // position of lower left corner of this thread block
00361   float px = (bidx<<BSHIFT) * const_hx;
00362   float py = (bidy<<BSHIFT) * const_hy;
00363   float pz = (bidz<<BSHIFT) * const_hz;
00364 #else
00365   // can optimize from above for case when mzblocks = 1
00366 
00367   // block ID, flat index
00368   // XXX assume first mult is within 24-bits
00369   const int bid
00370     = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00371 
00372   // thread ID, flat index
00373   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00374 
00375   // position of lower left corner of this thread block
00376   float px = (blockIdx.x<<BSHIFT) * const_hx;
00377   float py = (blockIdx.y<<BSHIFT) * const_hy;
00378   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00379 #endif
00380 
00381   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00382 #ifdef USE_DISTANCE_EXCL
00383   int isexcluded = excl[mindex];  // read exclusion from global memory
00384 #else
00385   int isexcluded = 0;  // assume it isn't excluded
00386 #endif
00387 
00388 #ifdef TEST_BLOCK_EXCL
00389 #if 0
00390   // warp vote is available only in devices 1.2 and later
00391   // warp vote to see if we continue
00392   int isall = __all(isexcluded);
00393   if (32==tid) sh_isall_upper_warp = isall;
00394   if (0==isany+sh_isall_upper_warp) return; // all points excluded, exit
00395 #endif
00396   sh_sum[tid] = !(isexcluded);  // count inclusions
00397   int nbrsum = sh_sum[tid^1];
00398   sh_sum[tid] += nbrsum;
00399   nbrsum = sh_sum[tid^2];
00400   sh_sum[tid] += nbrsum;
00401   nbrsum = sh_sum[tid^4];
00402   sh_sum[tid] += nbrsum;
00403   nbrsum = sh_sum[tid^8];
00404   sh_sum[tid] += nbrsum;
00405   nbrsum = sh_sum[tid^16];
00406   sh_sum[tid] += nbrsum;
00407   __syncthreads();
00408   nbrsum = sh_sum[tid^32];
00409   __syncthreads();
00410   sh_sum[tid] += nbrsum;
00411 
00412   if (0==sh_sum[tid]) return;  // all points excluded, exit
00413 #endif
00414 
00415   const float cutoff2 = const_cutoff * const_cutoff;  // cutoff2 in register
00416   const float probe_vdweps = const_probevdwparms[0];
00417   const float probe_vdwrmin = const_probevdwparms[1];
00418 
00419   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00420   const int jb = (int) floorf(py * const_by_1);  // thread block
00421   const int kb = (int) floorf(pz * const_bz_1);
00422 
00423   // XXX assume first mult is within 24-bits
00424   const int binid = (__umul24(kb,nby) + jb)*nbx + ib;  // flat index of my bin
00425 
00426   int n, nb;
00427 
00428   px += threadIdx.x * const_hx;  // adjust position for this thread
00429   py += threadIdx.y * const_hy;
00430   pz += threadIdx.z * const_hz;
00431 
00432   px += const_x0;
00433   py += const_y0;
00434   pz += const_z0;  // translate to absolute position
00435 
00436   float u = 0.f;  // sum potential energy
00437 
00438   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00439 
00440   for (nb = 0;  nb < const_num_binoffsets;  nb++) {
00441 
00442     if (tid < BIN_SIZE) {  // cache next atom bin
00443       const flint *p_bin = bin_zero
00444         + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00445       atombin[tid] = p_bin[tid];  // copy from global to shared memory
00446     }
00447     __syncthreads();  // now we have atom bin loaded
00448 
00449     for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00450       int vdwtype = atombin[n+3].i;
00451       if (-1 == vdwtype) break;  // no more atoms in bin
00452       vdwtype <<= 1;  // multiply by 2 for indexing VDW parms
00453 
00454       const float dx = px - atombin[n  ].f;
00455       const float dy = py - atombin[n+1].f;
00456       const float dz = pz - atombin[n+2].f;
00457       const float r2 = dx*dx + dy*dy + dz*dz;
00458       if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00459         const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00460         const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00461         float rm6 = rmin*rmin / r2;
00462         rm6 = rm6 * rm6 * rm6;
00463         u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00464       }
00465     } // end loop over atoms in bin
00466 
00467     __syncthreads();  // wait for thread block before loading next bin
00468   } // end loop over bin neighborhood
00469 
00470   for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00471     const int vdwtype = const_extra[n+3].i << 1; // mult by 2 index VDW parms
00472     const float dx = px - const_extra[n  ].f;
00473     const float dy = py - const_extra[n+1].f;
00474     const float dz = pz - const_extra[n+2].f;
00475     const float r2 = dx*dx + dy*dy + dz*dz;
00476     if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00477       const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00478       const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00479       float rm6 = rmin*rmin / r2;
00480       rm6 = rm6 * rm6 * rm6;
00481       u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00482     }
00483   } // end loop over extra atoms
00484 
00485   if (u >= const_max_energy) isexcluded = 1;  // exclude this map point
00486 
00487   excl[mindex] = isexcluded;
00488 }
00489 #endif // USE_ENERGY_EXCL
00490 
00491 
00492 // For a monoatomic probe compute the occupancy rho
00493 // (probability of finding the probe)
00494 //
00495 // For each map point the occupancy is computed as
00496 //
00497 //   rho = exp(-U)
00498 //
00499 // where U is the interaction energy of the probe with the system
00500 // due to the VDW force field.
00501 //
00502 __global__ static void cuda_occupancy_monoatom(
00503     float *map,             // map buffer, stored in contiguous thread blocks
00504     const int *excl,        // exclusion buffer, stored same
00505     const flint *bin_zero,  // points to bin containing map origin
00506     int nbx, int nby,       // number of bins along x and y dimensions
00507 #ifndef DO_ONE_PLANE
00508     int mzblocks,           // number of thread blocks in z dimension
00509 #endif
00510     int mzblockoffset       // offset (number of planes) to starting z
00511     ) {
00512 
00513   // space for 1 atom bin in shared memory
00514   __shared__ flint atombin[BIN_SIZE];
00515 
00516 #ifdef TEST_BLOCK_EXCL
00517 #if 0
00518   __shared__ int sh_isall_upper_warp;
00519 #endif
00520   __shared__ int sh_sum[NTHREADSPERBLOCK];
00521 #endif
00522 
00523 #ifndef DO_ONE_PLANE
00524   const int nblockx = gridDim.x;
00525   const int nblocky = gridDim.y / mzblocks;
00526 
00527   // 3D index for this thread block
00528   const int bidx = blockIdx.x;
00529   const int bidy = blockIdx.y % nblocky;
00530   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00531 
00532   // block ID, flat index
00533   // XXX assume first mult is within 24-bits
00534   const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00535 
00536   // thread ID, flat index
00537   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00538 
00539   // position of lower left corner of this thread block
00540   float px = (bidx<<BSHIFT) * const_hx;
00541   float py = (bidy<<BSHIFT) * const_hy;
00542   float pz = (bidz<<BSHIFT) * const_hz;
00543 #else
00544   // can optimize from above for case when mzblocks = 1
00545 
00546   // block ID, flat index
00547   // XXX assume first mult is within 24-bits
00548   const int bid
00549     = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00550 
00551   // thread ID, flat index
00552   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00553 
00554   // position of lower left corner of this thread block
00555   float px = (blockIdx.x<<BSHIFT) * const_hx;
00556   float py = (blockIdx.y<<BSHIFT) * const_hy;
00557   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00558 #endif
00559 
00560   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00561   const int isexcluded = excl[mindex];  // read exclusion from global memory
00562 
00563 #ifdef TEST_BLOCK_EXCL
00564 #if 0
00565   // warp vote is available only in devices 1.2 and later
00566   // warp vote to see if we continue
00567   int isall = __all(isexcluded);
00568   if (32==tid) sh_isall_upper_warp = isall;
00569   if (0==isany+sh_isall_upper_warp) return; // all points excluded, exit
00570 #endif
00571   sh_sum[tid] = !(isexcluded);  // count inclusions
00572   int nbrsum = sh_sum[tid^1];
00573   sh_sum[tid] += nbrsum;
00574   nbrsum = sh_sum[tid^2];
00575   sh_sum[tid] += nbrsum;
00576   nbrsum = sh_sum[tid^4];
00577   sh_sum[tid] += nbrsum;
00578   nbrsum = sh_sum[tid^8];
00579   sh_sum[tid] += nbrsum;
00580   nbrsum = sh_sum[tid^16];
00581   sh_sum[tid] += nbrsum;
00582   __syncthreads();
00583   nbrsum = sh_sum[tid^32];
00584   __syncthreads();
00585   sh_sum[tid] += nbrsum;
00586 
00587   if (0==sh_sum[tid]) return;  // all points excluded, exit
00588 #endif
00589 
00590   const float cutoff2 = const_cutoff * const_cutoff;  // cutoff2 in register
00591   const float probe_vdweps = const_probevdwparms[0];
00592   const float probe_vdwrmin = const_probevdwparms[1];
00593 
00594   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00595   const int jb = (int) floorf(py * const_by_1);  // thread block
00596   const int kb = (int) floorf(pz * const_bz_1);
00597 
00598   // XXX assume first mult is within 24-bits
00599   const int binid = (__umul24(kb,nby) + jb)*nbx + ib;  // flat index of my bin
00600 
00601   int n, nb;
00602 
00603   px += threadIdx.x * const_hx;  // adjust position for this thread
00604   py += threadIdx.y * const_hy;
00605   pz += threadIdx.z * const_hz;
00606 
00607   px += const_x0;
00608   py += const_y0;
00609   pz += const_z0;  // translate to absolute position
00610 
00611   float u = 0.f;  // sum potential energy
00612 
00613   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00614 
00615   for (nb = 0;  nb < const_num_binoffsets;  nb++) {  // bin neighborhood
00616 
00617     if (tid < BIN_SIZE) {  // cache next atom bin
00618       const flint *p_bin = bin_zero
00619         + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00620       atombin[tid] = p_bin[tid];  // copy from global to shared memory
00621     }
00622     __syncthreads();  // now we have atom bin loaded
00623 
00624     for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00625       int vdwtype = atombin[n+3].i;
00626       if (-1 == vdwtype) break;  // no more atoms in bin
00627       vdwtype <<= 1;  // multiply by 2 for indexing VDW parms
00628 
00629       const float dx = px - atombin[n  ].f;
00630       const float dy = py - atombin[n+1].f;
00631       const float dz = pz - atombin[n+2].f;
00632       const float r2 = dx*dx + dy*dy + dz*dz;
00633       if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00634         const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00635         const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00636         float rm6 = rmin*rmin / r2;
00637         rm6 = rm6 * rm6 * rm6;
00638         u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00639       }
00640     } // end loop atoms in bin
00641 
00642     __syncthreads();  // wait for entire thread block before loading next bin
00643   } // end loop over bin neighborhood
00644 
00645   for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00646     const int vdwtype = const_extra[n+3].i << 1; // mult by 2 index VDW parms
00647     const float dx = px - const_extra[n  ].f;
00648     const float dy = py - const_extra[n+1].f;
00649     const float dz = pz - const_extra[n+2].f;
00650     const float r2 = dx*dx + dy*dy + dz*dz;
00651     if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00652       const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00653       const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00654       float rm6 = rmin*rmin / r2;
00655       rm6 = rm6 * rm6 * rm6;
00656       u += epsilon * rm6 * (rm6 - 2.f);  // sum vdw contribution
00657     }
00658   } // end loop over extra atoms
00659 
00660   float occ = 0.f;
00661   if (!isexcluded && u < const_max_energy) {
00662     occ = expf(-u);  // the occupancy
00663   }
00664   map[mindex] = occ;
00665 } // cuda_occupancy_monoatom()
00666 
00667 
00668 // For a multiatom probe compute the occupancy rho
00669 // (probability of finding the probe)
00670 //
00671 // For each map point the occupancy is computed as
00672 //
00673 //   rho = (1/m) sum_i exp(-U[i]), i = 1..m
00674 //
00675 // where U[i] is the potential energy of the i-th conformer.
00676 //
00677 __global__ static void cuda_occupancy_multiatom(
00678     float *map,             // map buffer, stored in contiguous thread blocks
00679     const int *excl,        // exclusion buffer, stored same
00680     const flint *bin_zero,  // points to bin containing map origin
00681     int nbx, int nby,       // number of bins along x and y dimensions
00682 #ifndef DO_ONE_PLANE
00683     int mzblocks,           // number of thread blocks in z dimension
00684 #endif
00685     int mzblockoffset       // offset (number of planes) to starting z
00686     ) {
00687 
00688   // space for 1 atom bin in shared memory
00689   __shared__ flint atombin[BIN_SIZE];
00690 
00691   // accumulate into shared memory the energy for NCONFPERLOOP conformers
00692   __shared__ float usum[NTPB_TIMES_NCPL];
00693 
00694   // cache const mem pages 2+ into shared memory for faster access
00695   // since we have to keep accessing const mem page 1 (8K)
00696   __shared__ float conf[3*MAX_PROBES * NCONFPERLOOP];
00697 
00698 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00699 #if 0
00700   __shared__ int sh_isany_upper_warp;
00701 #endif
00702   // we reuse this buffer:
00703   // (1) reduce total sum across thread block for prefix sum
00704   // (2) to accumulate occupancy
00705   __shared__ flint sh_buffer[NTHREADSPERBLOCK];
00706 #endif
00707 
00708 #ifndef DO_ONE_PLANE
00709   const int nblockx = gridDim.x;
00710   const int nblocky = gridDim.y / mzblocks;
00711 
00712   // 3D index for this thread block
00713   const int bidx = blockIdx.x;
00714   const int bidy = blockIdx.y % nblocky;
00715   const int bidz = blockIdx.y / nblocky + mzblockoffset;
00716 
00717   // block ID, flat index
00718   // XXX assume first mult is within 24-bits
00719   const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00720 
00721   // thread ID, flat index
00722   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00723 
00724   // position of lower left corner of this thread block
00725   float px = (bidx<<BSHIFT) * const_hx;
00726   float py = (bidy<<BSHIFT) * const_hy;
00727   float pz = (bidz<<BSHIFT) * const_hz;
00728 #else
00729   // can optimize from above for case when mzblocks = 1
00730 
00731   // block ID, flat index
00732   // XXX assume first mult is within 24-bits
00733   const int bid
00734     = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00735 
00736   // thread ID, flat index
00737   const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00738 
00739   // position of lower left corner of this thread block
00740   float px = (blockIdx.x<<BSHIFT) * const_hx;
00741   float py = (blockIdx.y<<BSHIFT) * const_hy;
00742   float pz = (mzblockoffset<<BSHIFT) * const_hz;
00743 #endif
00744 
00745   const int mindex = (bid<<NTPBSHIFT) + tid;  // index into map and excl
00746   const int isincluded = !(excl[mindex]);  // read exclusion from global memory
00747 
00748 #ifdef TEST_MBLOCK_EXCL
00749 #if 0
00750   // warp vote is available only in devices 1.2 and later
00751   // warp vote to see if we continue
00752   int isany = __any(isincluded);
00753   if (32==tid) sh_isany_upper_warp = isany;
00754   if (0==isany+sh_isany_upper_warp) return; // all points excluded, exit
00755 #endif
00756 
00757   sh_buffer[tid].i = isincluded;
00758   int nbrsum = sh_buffer[tid^1].i;
00759   sh_buffer[tid].i += nbrsum;
00760   nbrsum = sh_buffer[tid^2].i;
00761   sh_buffer[tid].i += nbrsum;
00762   nbrsum = sh_buffer[tid^4].i;
00763   sh_buffer[tid].i += nbrsum;
00764   nbrsum = sh_buffer[tid^8].i;
00765   sh_buffer[tid].i += nbrsum;
00766   nbrsum = sh_buffer[tid^16].i;
00767   sh_buffer[tid].i += nbrsum;
00768   __syncthreads();
00769   nbrsum = sh_buffer[tid^32].i;
00770   __syncthreads();
00771   sh_buffer[tid].i += nbrsum;
00772 
00773   if (0==sh_buffer[tid].i) {
00774     map[mindex] = 0.f;
00775     return;
00776   }
00777 #endif
00778 
00779 #ifdef TEST_SHIFT_EXCL
00780   int numincl = 0;  // number of included points in thread block
00781   {
00782     int psum = isincluded;  // will have prefix sum over isincluded for tid
00783     sh_buffer[tid].i = isincluded;
00784 
00785     int nbrid = tid^1;
00786     int nbrsum = sh_buffer[nbrid].i;
00787     if (nbrid < tid) psum += nbrsum;
00788     sh_buffer[tid].i += nbrsum;
00789 
00790     nbrid = tid^2;
00791     nbrsum = sh_buffer[nbrid].i;
00792     if (nbrid < tid) psum += nbrsum;
00793     sh_buffer[tid].i += nbrsum;
00794 
00795     nbrid = tid^4;
00796     nbrsum = sh_buffer[nbrid].i;
00797     if (nbrid < tid) psum += nbrsum;
00798     sh_buffer[tid].i += nbrsum;
00799 
00800     nbrid = tid^8;
00801     nbrsum = sh_buffer[nbrid].i;
00802     if (nbrid < tid) psum += nbrsum;
00803     sh_buffer[tid].i += nbrsum;
00804 
00805     nbrid = tid^16;
00806     nbrsum = sh_buffer[nbrid].i;
00807     if (nbrid < tid) psum += nbrsum;
00808     sh_buffer[tid].i += nbrsum;
00809 
00810     __syncthreads();
00811     nbrid = tid^32;
00812     nbrsum = sh_buffer[nbrid].i;
00813     if (nbrid < tid) psum += nbrsum;
00814     __syncthreads();
00815     sh_buffer[tid].i += nbrsum;
00816 
00817     numincl = sh_buffer[tid].i;  // number of included points
00818     if (0==numincl) {
00819       map[mindex] = 0.f;
00820       return;  // all points in thread block excluded so exit early
00821     }
00822     __syncthreads();
00823     if (isincluded) {
00824       sh_buffer[psum-1].i = tid;  // assign my work to thread #(psum-1)
00825     }
00826   }
00827   __syncthreads();
00828   const int sid = sh_buffer[tid].i;  // shifted thread id for whom i calculate
00829 #endif
00830 
00831   const float cutoff2 = const_cutoff * const_cutoff;  // cutoff2 in register
00832 
00833   const int ib = (int) floorf(px * const_bx_1);  // bin index for entire
00834   const int jb = (int) floorf(py * const_by_1);  // thread block
00835   const int kb = (int) floorf(pz * const_bz_1);
00836 
00837   // XXX assume first mult is within 24-bits
00838   const int binid = (__umul24(kb,nby) + jb)*nbx + ib;  // flat index of my bin
00839 
00840   int n, nb;
00841   int m;
00842   int ma;
00843   int nc;
00844 
00845 #ifdef TEST_SHIFT_EXCL
00846   if (tid < numincl) {
00847     int ssid = sid;
00848     const int sid_x = ssid & PADMASK;
00849     ssid >>= BSHIFT;
00850     const int sid_y = ssid & PADMASK;
00851     ssid >>= BSHIFT;
00852     const int sid_z = ssid;
00853     px += sid_x * const_hx;  // adjust position for shifted thread
00854     py += sid_y * const_hy;
00855     pz += sid_z * const_hz;
00856   }
00857 #else
00858   px += threadIdx.x * const_hx;  // adjust position for this thread
00859   py += threadIdx.y * const_hy;
00860   pz += threadIdx.z * const_hz;
00861 #endif
00862 
00863   px += const_x0;
00864   py += const_y0;
00865   pz += const_z0;  // translate to absolute position
00866 
00867 #ifdef TEST_SHIFT_EXCL
00868   sh_buffer[tid].f = 0.f;  // for summing occupancy
00869 #else
00870   float occ = 0.f;  // sum occupancy
00871 #endif
00872 
00873   const int twice_num_probes = const_num_probes << 1;
00874   const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00875 
00876   // number of floats from conformers to move into shared memory
00877   // product is within 24-bits
00878   const int total_shmem_copy = __umul24((3*NCONFPERLOOP),const_num_probes);
00879 
00880   // loop over all conformers
00881   for (m = 0;  m < const_num_conformers;  m += NCONFPERLOOP) {
00882 
00883     // reset shared memory energy accumulators
00884     for (nc = 0;  nc < NTPB_TIMES_NCPL;  nc += NTHREADSPERBLOCK) {
00885       usum[nc + tid] = 0.f;  // these are NOT shared
00886     }
00887 
00888     // cooperatively copy conformers from constant memory page 2+
00889     // into shared memory for improved performance
00890     // while we continue to access constant memory page 1 (8K)
00891     __syncthreads();
00892     if (tid < 32) {  // use only first warp
00893 
00894       // number of loop iterations for all threads in warp
00895       const int npass = (total_shmem_copy >> 5);
00896 
00897       // remaining number of floats to move
00898       const int nleft = (total_shmem_copy & 31);
00899 
00900       // skip over previously computed conformers (3*const_num_probes*m)
00901       const int skip = __umul24((const_num_probes + twice_num_probes), m);
00902 
00903       int off = 0;
00904       for (nc = 0;  nc < npass;  nc++, off += 32) {
00905         conf[off + tid] = const_conformers[skip + off + tid];
00906       }
00907       if (tid < nleft) {
00908         conf[off + tid] = const_conformers[skip + off + tid];
00909       }
00910     }
00911     __syncthreads();  // wait for entire thread block
00912 
00913     for (nb = 0;  nb < const_num_binoffsets;  nb++) {  // bin neighborhood
00914 
00915       __syncthreads();  // wait for entire thread block before loading next bin
00916       if (tid < BIN_SIZE) {  // cache next atom bin
00917         const flint *p_bin = bin_zero
00918           + ((binid + const_binoffsets[nb]) << BIN_SHIFT);
00919         atombin[tid] = p_bin[tid];  // copy from global to shared memory
00920       }
00921       __syncthreads();  // now we have atom bin loaded
00922 
00923 #ifdef TEST_SHIFT_EXCL
00924     if (tid < numincl) {
00925 #endif
00926 
00927       for (n = 0;  n < BIN_SIZE;  n += BIN_SLOTSIZE) { // loop over atoms in bin
00928         int vdwtype = atombin[n+3].i;
00929         if (-1 == vdwtype) break;  // no more atoms in bin
00930         vdwtype <<= 1;  // multiply by 2 for indexing VDW parms
00931         const float epsilon_atom = const_vdwparms[vdwtype];
00932         const float rmin_atom = const_vdwparms[vdwtype+1];
00933         const float ax = px - atombin[n  ].f;
00934         const float ay = py - atombin[n+1].f;
00935         const float az = pz - atombin[n+2].f;
00936 
00937         int sindex = 0;  // index conformers
00938        
00939         // loop shared memory conformers
00940         for (nc = 0;  nc < NTPB_TIMES_NCPL;  nc += NTHREADSPERBLOCK) {
00941 
00942           for (ma = 0;  ma < twice_num_probes;  ma += 2) {  // probe atoms
00943             const float dx = conf[sindex++] + ax;
00944             const float dy = conf[sindex++] + ay;
00945             const float dz = conf[sindex++] + az;
00946             const float r2 = dx*dx + dy*dy + dz*dz;
00947             if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00948               const float epsilon = epsilon_atom * const_probevdwparms[ma];
00949               const float rmin = rmin_atom + const_probevdwparms[ma+1];
00950               float rm6 = rmin*rmin / r2;
00951               rm6 = rm6 * rm6 * rm6;
00952 
00953               // XXX rather than reading/writing to shared memory here,
00954               //     we should be working with a register.  Only when we
00955               //     complete the two innermost loops should we be storing
00956               //     back to shared memory.  Shared memory limits performance
00957               //     to 66% of what's possible with a register...
00958               //     The stores to usum[] should be moved out to the 'nc' loop.
00959               // XXX Accumulating usum in 'nc' loop is actually a bit slower.
00960               //     Note that we only access shared memory if the system atom
00961               //     is within the cutoff.  Even with small atom bins we still
00962               //     have a large percentage of atoms outside of cutoff.
00963               usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
00964             }
00965           } // end loop over probe atoms
00966 
00967         } // end loop over shared memory conformers
00968 
00969       } // end loop over atoms in bin
00970 #ifdef TEST_SHIFT_EXCL
00971     } // end if (tid < numincl)
00972 #endif
00973 
00974     } // end loop over bin neighborhood
00975 
00976 #ifdef TEST_SHIFT_EXCL
00977   if (tid < numincl) {
00978 #endif
00979 
00980     for (n = 0;  n < num_extra_slots;  n += BIN_SLOTSIZE) {  // extra atoms
00981       const int vdwtype = const_extra[n+3].i << 1; // mult by 2 index VDW parms
00982       const float epsilon_atom = const_vdwparms[vdwtype];
00983       const float rmin_atom = const_vdwparms[vdwtype+1];
00984       const float ax = px - const_extra[n  ].f;
00985       const float ay = py - const_extra[n+1].f;
00986       const float az = pz - const_extra[n+2].f;
00987       int sindex = 0;
00988     
00989       // loop over shared memory conformers
00990       for (nc = 0;  nc < NTPB_TIMES_NCPL;  nc += NTHREADSPERBLOCK) {
00991 
00992         for (ma = 0;  ma < twice_num_probes;  ma += 2) {  // probe atoms
00993           const float dx = conf[sindex++] + ax;
00994           const float dy = conf[sindex++] + ay;
00995           const float dz = conf[sindex++] + az;
00996           const float r2 = dx*dx + dy*dy + dz*dz;
00997           if (r2 < cutoff2) {  // atom within cutoff, accumulate energy
00998             const float epsilon = epsilon_atom * const_probevdwparms[ma];
00999             const float rmin = rmin_atom + const_probevdwparms[ma+1];
01000             float rm6 = rmin*rmin / r2;
01001             rm6 = rm6 * rm6 * rm6;
01002             usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
01003           }
01004         } // end loop over probe atoms
01005 
01006       } // end loop over shared memory conformers
01007 
01008     } // end loop over extra atoms
01009 
01010     for (nc = 0;  nc < NCONFPERLOOP;  nc++) {  // loop over some conformers
01011 
01012       if (m + nc < const_num_conformers) {  // ensure this 'nc' is meaningful
01013         float u = usum[(nc << NTPBSHIFT) + tid];
01014 
01015 #ifdef TEST_SHIFT_EXCL
01016         if (u < MAX_ALLOWABLE_ENERGY) {  // XXX should we test?
01017           sh_buffer[sid].f += expf(-u);  // the occupancy
01018         }
01019 #else
01020         if (isincluded && u < MAX_ALLOWABLE_ENERGY) {  // XXX should we test?
01021           occ += expf(-u);  // the occupancy
01022         }
01023 #endif
01024       }
01025 
01026     } // end loop over some conformers
01027 #ifdef TEST_SHIFT_EXCL
01028   } // end if (tid < numincl)
01029 #endif
01030 
01031   } // end loop over all conformers
01032 
01033 #ifdef TEST_SHIFT_EXCL
01034   __syncthreads();  // must wait for thread block before finishing
01035   sh_buffer[tid].f *= const_inv_numconf;  // averaged occupancy
01036   if (sh_buffer[tid].f <= const_min_occupancy) sh_buffer[tid].f = 0.f;
01037 
01038   map[mindex] = sh_buffer[tid].f;
01039 #else
01040   occ = occ * const_inv_numconf;  // averaged occupancy
01041   if (occ <= const_min_occupancy) occ = 0.f;
01042 
01043   map[mindex] = occ;
01044 #endif
01045 
01046 } // cuda_occupancy_multiatom()
01047 
01048 
01049 
01050 static void *cuda_occupancy_thread(void *voidparms) {
01051   occthrparms *parms = NULL;
01052   int mxpad, mypad, mzpad, mappad;
01053   float *h_map;       // padded map buffer on host
01054   float *d_map;       // padded map buffer on device
01055   int *d_excl;        // padded exclusion map buffer on device
01056   int binsz;          // size of bin buffer
01057   flint *d_bin;       // atom bins stored on device
01058   flint *d_bin_zero;  // shifted bin pointer on device
01059   int gpuid = -1;     // default: let VMD choose.
01060   int numgpus = 0;
01061   int i, j, k;
01062 
01063   // override device choice from environment for backward compatibility.
01064   if (getenv("VMDILSCUDADEVICE")) {
01065     gpuid = atoi(getenv("VMDILSCUDADEVICE"));
01066   } 
01067 
01068   if (gpuid < 0) {
01069     if (vmd_cuda_num_devices(&numgpus) != VMDCUDA_ERR_NONE) {
01070       printf("ILS CUDA device init error\n");
01071       return NULL;
01072     }
01073     if (numgpus > 0) {
01074       // XXX: here we should pick the first available GPU
01075       //      and skip over device GPUs. but how to do this 
01076       //      cleanly from in here? 
01077       //      change the code to use the device pool?
01078       // AK 2010/03/15
01079       gpuid = 0;
01080     } else {
01081       return NULL;
01082     }    
01083   }
01084 
01085   if (gpuid >= 0) {
01086     printf("Using CUDA device: %d\n", gpuid);
01087   } else {
01088     // no suitable GPU.
01089     return NULL;
01090   }
01091   
01092   wkf_timerhandle timer = wkf_timer_create();
01093 
01094   parms = (occthrparms *) voidparms;
01095   parms->errcode = -1;  // be pessimistic until the very end
01096 
01097   if (getenv("VMDILSVERBOSE")) {
01098     // XXX look at parms
01099     printf("*****************************************************************\n");
01100     printf("mx = %d  my = %d  mz = %d\n", parms->mx, parms->my, parms->mz);
01101     printf("hx = %g  hy = %g  hz = %g\n", parms->hx, parms->hy, parms->hz);
01102     printf("x0 = %g  y0 = %g  z0 = %g\n", parms->x0, parms->y0, parms->z0);
01103     printf("bx_1 = %g  by_1 = %g  bz_1 = %g\n",
01104            parms->bx_1, parms->by_1, parms->bz_1);
01105     printf("nbx = %d  nby = %d  nbz = %d\n", parms->nbx, parms->nby, parms->nbz);
01106     printf("num_binoffsets = %d\n", parms->num_binoffsets);
01107     printf("num_extras = %d\n", parms->num_extras);
01108     printf("num_vdwparms = %d\n", parms->num_vdwparms);
01109     printf("num_probes = %d\n", parms->num_probes);
01110     printf("num_conformers = %d\n", parms->num_conformers);
01111     printf("*****************************************************************\n");
01112   }
01113 
01114   // check that data will fit constant memory
01115   if (parms->num_binoffsets > MAX_BINOFFSETS) {
01116     printf("***** ERROR: Exceeded MAX_BINOFFSETS for CUDA kernel\n");
01117     return NULL;  // XXX How do I raise an error / exception?
01118   }
01119   if (parms->num_extras > MAX_EXTRAS) {
01120     printf("***** ERROR: Exceeded MAX_EXTRAS for CUDA kernel\n");
01121     return NULL;  // XXX How do I raise an error / exception?
01122   }
01123   if (parms->num_vdwparms > MAX_VDWPARMS) {
01124     printf("***** ERROR: Exceeded MAX_VDWPARMS for CUDA kernel\n");
01125     return NULL;  // XXX How do I raise an error / exception?
01126   }
01127   if (parms->num_probes > MAX_PROBES) {
01128     printf("***** ERROR: Exceeded MAX_PROBES for CUDA kernel\n");
01129     return NULL;  // XXX How do I raise an error / exception?
01130   }
01131   if (parms->num_conformers > MAX_CONFORMERS) {
01132     printf("***** ERROR: Exceeded MAX_CONFORMERS for CUDA kernel\n");
01133     return NULL;  // XXX How do I raise an error / exception?
01134   }
01135 
01136   // attach to GPU and check for errors
01137   cudaError_t rc;
01138   rc = cudaSetDevice(gpuid);
01139   if (rc != cudaSuccess) {
01140 #if CUDART_VERSION >= 2010
01141     rc = cudaGetLastError();  // query last error and reset error state
01142     if (rc != cudaErrorSetOnActiveProcess) {
01143       return NULL;  // abort and return an error
01144     }
01145 #else
01146     cudaGetLastError();  // just ignore and reset error state, since older CUDA
01147                          // revs don't have a cudaErrorSetOnActiveProcess enum
01148 #endif
01149   }
01150 
01151   // pad the map for CUDA thread blocks
01152   mxpad = (parms->mx + PADMASK) & ~PADMASK;
01153   mypad = (parms->my + PADMASK) & ~PADMASK;
01154   mzpad = (parms->mz + PADMASK) & ~PADMASK;
01155   mappad = mxpad * mypad * mzpad;
01156 
01157   if (getenv("VMDILSVERBOSE")) {
01158     printf("mxpad=%d  mypad=%d  mzpad=%d  mappad=%d\n",
01159            mxpad, mypad, mzpad, mappad);
01160   }
01161 
01162   h_map = (float *) malloc(mappad * sizeof(float));
01163   if (getenv("VMDILSVERBOSE")) {
01164     printf("Allocate %g MB for map\n", mappad*sizeof(float)/(1024*1024.f));
01165   }
01166   cudaMalloc((void **) &d_map, mappad * sizeof(float));
01167   CUERR;
01168   if (getenv("VMDILSVERBOSE")) {
01169     printf("Allocate %g MB for exclusions\n", mappad*sizeof(int)/(1024*1024.f));
01170   }
01171   cudaMalloc((void **) &d_excl, mappad * sizeof(int));
01172   CUERR;
01173 
01174 #if !defined(USE_DISTANCE_EXCL) && !defined(USE_ENERGY_EXCL)
01175   // set all points to be excluded by default
01176   cudaMemset(d_excl, 0, mappad * sizeof(int));
01177   CUERR;
01178 #endif
01179 
01180   binsz = BIN_SIZE * parms->nbx * parms->nby * parms->nbz;
01181   if (getenv("VMDILSVERBOSE")) {
01182     printf("nbx=%d  nby=%d  nbz=%d  binsz=%d\n",
01183            parms->nbx, parms->nby, parms->nbz, binsz);
01184     printf("Allocate %g MB for atom bins\n", binsz*sizeof(flint)/(1024*1024.f));
01185   }
01186   cudaMalloc((void **) &d_bin, binsz * sizeof(flint));
01187   CUERR;
01188   cudaMemcpy(d_bin, parms->bin, binsz * sizeof(flint), cudaMemcpyHostToDevice);
01189   CUERR;
01190   d_bin_zero = d_bin + (parms->bin_zero - parms->bin);
01191 
01192   if (getenv("VMDILSVERBOSE")) {
01193     printf("parms delta bin = %d   delta bin = %d\n",
01194            (parms->bin_zero - parms->bin), (d_bin_zero - d_bin));
01195     
01196     printf("probe epsilon=%g rmin=%g\n",
01197            parms->probevdwparms[0], parms->probevdwparms[1]);
01198     printf("max_energy=%g\n", parms->max_energy);
01199     printf("cutoff=%g\n", parms->cutoff);
01200     printf("hx=%g hy=%g hz=%g\n", parms->hx, parms->hy, parms->hz);
01201     printf("x0=%g y0=%g z0=%g\n", parms->x0, parms->y0, parms->z0);
01202     printf("bx_1=%g by_1=%g bz_1=%g\n", parms->bx_1, parms->by_1, parms->bz_1);
01203   }
01204 
01205   cudaMemcpyToSymbol(const_max_energy, &(parms->max_energy), sizeof(float), 0);
01206   CUERR;
01207   float min_occ = expf(-parms->max_energy);  // occupancy threshold
01208   cudaMemcpyToSymbol(const_min_occupancy, &min_occ, sizeof(float), 0);
01209   CUERR;
01210   cudaMemcpyToSymbol(const_cutoff, &(parms->cutoff), sizeof(float), 0);
01211   CUERR;
01212   cudaMemcpyToSymbol(const_min_excldist, &(parms->min_excldist),
01213       sizeof(float), 0);
01214   CUERR;
01215   cudaMemcpyToSymbol(const_hx, &(parms->hx), sizeof(float), 0);
01216   CUERR;
01217   cudaMemcpyToSymbol(const_hy, &(parms->hy), sizeof(float), 0);
01218   CUERR;
01219   cudaMemcpyToSymbol(const_hz, &(parms->hz), sizeof(float), 0);
01220   CUERR;
01221   cudaMemcpyToSymbol(const_x0, &(parms->x0), sizeof(float), 0);
01222   CUERR;
01223   cudaMemcpyToSymbol(const_y0, &(parms->y0), sizeof(float), 0);
01224   CUERR;
01225   cudaMemcpyToSymbol(const_z0, &(parms->z0), sizeof(float), 0);
01226   CUERR;
01227   cudaMemcpyToSymbol(const_bx_1, &(parms->bx_1), sizeof(float), 0);
01228   CUERR;
01229   cudaMemcpyToSymbol(const_by_1, &(parms->by_1), sizeof(float), 0);
01230   CUERR;
01231   cudaMemcpyToSymbol(const_bz_1, &(parms->bz_1), sizeof(float), 0);
01232   CUERR;
01233   float inv_numconf = 1.f / parms->num_conformers;
01234   cudaMemcpyToSymbol(const_inv_numconf, &inv_numconf, sizeof(float), 0);
01235 
01236   if (getenv("VMDILSVERBOSE")) {
01237     printf("num_binoffsets = %d\n", parms->num_binoffsets);
01238   }
01239   cudaMemcpyToSymbol(const_num_binoffsets, &(parms->num_binoffsets),
01240       sizeof(int), 0);
01241   CUERR;
01242   if (getenv("VMDILSVERBOSE")) {
01243     printf("num_extras = %d\n", parms->num_extras);
01244   }
01245   cudaMemcpyToSymbol(const_num_extras, &(parms->num_extras), sizeof(int), 0);
01246   CUERR;
01247   if (getenv("VMDILSVERBOSE")) {
01248     printf("num_probes = %d\n", parms->num_probes);
01249   }
01250   cudaMemcpyToSymbol(const_num_probes, &(parms->num_probes), sizeof(int), 0);
01251   CUERR;
01252   if (getenv("VMDILSVERBOSE")) {
01253     printf("num_conformers = %d\n", parms->num_conformers);
01254   }
01255   cudaMemcpyToSymbol(const_num_conformers, &(parms->num_conformers),
01256       sizeof(int), 0);
01257   CUERR;
01258 
01259   if (getenv("VMDILSVERBOSE")) {
01260     printf("num_vdwparms = %d\n", parms->num_vdwparms);
01261     for (i = 0;  i < parms->num_vdwparms;  i++) {
01262       printf("  %2d:  epsilon=%g  rmin=%g\n", i,
01263              parms->vdwparms[2*i], parms->vdwparms[2*i+1]);
01264     }
01265   }
01266 
01267   cudaMemcpyToSymbol(const_vdwparms, parms->vdwparms,
01268       2 * parms->num_vdwparms * sizeof(float), 0);
01269   CUERR;
01270   cudaMemcpyToSymbol(const_probevdwparms, parms->probevdwparms,
01271       2 * (0==parms->num_probes ? 1 : parms->num_probes) * sizeof(float), 0);
01272   CUERR;
01273   cudaMemcpyToSymbol(const_extra, parms->extra,
01274       BIN_SLOTSIZE * parms->num_extras * sizeof(flint), 0);
01275   CUERR;
01276 
01277   // take 3D bin offset and calculate array of flat bin offset
01278   {
01279     const int nbx = parms->nbx;
01280     const int nby = parms->nby;
01281     int n;
01282     int *h_flatbinoffset = (int *) calloc(parms->num_binoffsets, sizeof(int));
01283     for (n = 0;  n < parms->num_binoffsets;  n++) {
01284       int i = (int) (parms->binoffsets[3*n  ]);
01285       int j = (int) (parms->binoffsets[3*n+1]);
01286       int k = (int) (parms->binoffsets[3*n+2]);
01287       int index = (k*nby + j)*nbx + i;
01288       h_flatbinoffset[n] = index;
01289     }
01290     cudaMemcpyToSymbol(const_binoffsets, h_flatbinoffset,
01291         parms->num_binoffsets * sizeof(int), 0);
01292     CUERR;
01293     free(h_flatbinoffset);
01294   }
01295 
01296   // calculate array of flat bin offsets for distance-based exclusions
01297   {
01298     const int nbx = parms->nbx;
01299     const int nby = parms->nby;
01300     int i, j, k, n=0;
01301     int h_flatexcloffset[27];  // 3x3x3 cube
01302     for (k = -1;  k <= 1;  k++) {
01303       for (j = -1;  j <= 1;  j++) {
01304         for (i = -1;  i <= 1;  i++) {
01305           int index = (k*nby + j)*nbx + i;
01306           h_flatexcloffset[n++] = index;
01307         }
01308       }
01309     }
01310     cudaMemcpyToSymbol(const_excloffsets, h_flatexcloffset, 27*sizeof(int), 0);
01311     CUERR;
01312   }
01313 
01314   cudaMemcpyToSymbol(const_conformers, parms->conformers,
01315       3 * parms->num_probes * parms->num_conformers * sizeof(float), 0);
01316   CUERR;
01317 
01318   // cluster slabs based on time per work unit;
01319   // the following constants empirically derived from
01320   // oxygen testcase run on GT200
01321 #define TIME_PER_MONOATOM_WORKUNIT   1.24553913519431e-10
01322 #define TIME_PER_MULTIATOM_WORKUNIT  5.24559386973180e-09
01323 //#define TIME_PER_MONOATOM_WORKUNIT   3.73661740558292e-09
01324 //#define TIME_PER_MULTIATOM_WORKUNIT  1.57367816091954e-07
01325 
01326   int mxblocks = (mxpad >> BSHIFT);
01327   int myblocks = (mypad >> BSHIFT);
01328   int mzblocks = (mzpad >> BSHIFT);
01329   int nblocksperplane = mxblocks * myblocks;
01330   int maxplanes = MAX_THREADBLOCKS / nblocksperplane;
01331   int numconf = (0==parms->num_conformers ? 1 : parms->num_conformers);
01332 
01333   double monoatom_time_per_plane = (double)(parms->num_probes * numconf *
01334       parms->num_binoffsets * nblocksperplane) * TIME_PER_MONOATOM_WORKUNIT;
01335 
01336   double multiatom_time_per_plane = (double)(parms->num_probes * numconf *
01337       parms->num_binoffsets * nblocksperplane) * TIME_PER_MULTIATOM_WORKUNIT;
01338 
01339   // number of planes to schedule per slab
01340 #define MAX_TIME_PER_KERNEL  1.f  // schedule up to 1 second per kernel call
01341   int max_mono_planes = (int)(MAX_TIME_PER_KERNEL / monoatom_time_per_plane);
01342   int max_multi_planes = (int)(MAX_TIME_PER_KERNEL / multiatom_time_per_plane);
01343 
01344   if (max_mono_planes > maxplanes) max_mono_planes = maxplanes;
01345   else if (max_mono_planes < 1)    max_mono_planes = 1;
01346 
01347   if (max_multi_planes > maxplanes) max_multi_planes = maxplanes;
01348   else if (max_multi_planes < 1)    max_multi_planes = 1;
01349 
01350   if (getenv("VMDILSVERBOSE")) {
01351     printf("mxblocks = %d  myblocks = %d  mzblocks = %d\n",
01352            mxblocks, myblocks, mzblocks);
01353   }
01354   if (nblocksperplane > MAX_THREADBLOCKS) {
01355     printf("***** ERROR: Too many blocks in X-Y plane for CUDA kernel\n");
01356     return NULL;  // XXX raise exception
01357   }
01358 
01359   if (0 == parms->num_conformers) {  // monoatom
01360     if (getenv("VMDILSVERBOSE")) {
01361       printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01362         "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01363     }
01364     int mzblockoffset = 0;
01365     int nplanes_remaining = mzblocks;
01366     while (nplanes_remaining > 0) {
01367 #ifndef DO_ONE_PLANE
01368       // do as many planes as possible given constraints
01369       int nplanes = nplanes_remaining;
01370       if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01371 #else
01372       // XXX do only one slice at a time
01373       int nplanes = 1;
01374 #endif
01375       dim3 gsz, bsz;
01376       gsz.x = mxblocks;
01377       gsz.y = myblocks * nplanes;
01378       gsz.z = 1;
01379       bsz.x = THBLEN;
01380       bsz.y = THBLEN;
01381       bsz.z = THBLEN;
01382 
01383       if ( ! getenv("VMDILSNOEXCL") ) {
01384 #ifdef USE_DISTANCE_EXCL
01385       // distance exclusions
01386 #ifdef DEBUG
01387       printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01388           "on %2d planes   ",
01389           nplanes);
01390 #endif
01391 #if 1
01392       cudaThreadSynchronize();
01393 #endif
01394       wkf_timer_start(timer);
01395       cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01396           d_excl, d_bin_zero,
01397           parms->nbx, parms->nby,
01398 #ifndef DO_ONE_PLANE
01399           nplanes,
01400 #endif
01401           mzblockoffset);
01402 #if 1
01403       cudaThreadSynchronize();
01404 #endif
01405 #ifdef DEBUG
01406       printf("%f s\n", wkf_timer_timenow(timer));
01407 #endif
01408       CUERR;
01409 #endif // USE_DISTANCE_EXCL
01410       }
01411 
01412       // monoatom occupancy
01413 #ifdef DEBUG
01414      printf("*** VMD CUDA: cuda_occupancy_monoatom()       "
01415           "on %2d planes   ",
01416           nplanes);
01417 #endif
01418 #if 1
01419       cudaThreadSynchronize();
01420 #endif
01421       wkf_timer_start(timer);
01422       cuda_occupancy_monoatom<<<gsz, bsz, 0>>>(
01423           d_map, d_excl, d_bin_zero,
01424           parms->nbx, parms->nby,
01425 #ifndef DO_ONE_PLANE
01426           nplanes,
01427 #endif
01428           mzblockoffset);
01429 #if 1
01430       cudaThreadSynchronize();
01431 #endif
01432 #ifdef DEBUG
01433       printf("%f s\n", wkf_timer_timenow(timer));
01434 #endif
01435       CUERR;
01436 
01437       nplanes_remaining -= nplanes;
01438       mzblockoffset += nplanes;
01439     }
01440   } // if monoatom
01441 
01442   else { // multiatom
01443     if (getenv("VMDILSVERBOSE")) {
01444       printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01445         "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01446       printf("Scheduling up to %d planes (%d thread blocks) per multiatom "
01447         "kernel call\n", max_multi_planes, nblocksperplane*max_multi_planes);
01448     }
01449 
01450     int mzblockoffset;
01451     int nplanes_remaining;
01452 
01453     if ( ! getenv("VMDILSNOEXCL") ) {
01454     // first do "monoatom" slabs for exclusions
01455     mzblockoffset = 0;
01456     nplanes_remaining = mzblocks;
01457     while (nplanes_remaining > 0) {
01458 #ifndef DO_ONE_PLANE
01459       // do as many planes as possible given constraints
01460       int nplanes = nplanes_remaining;
01461       if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01462 #else
01463       // XXX do only one slice at a time
01464       int nplanes = 1;
01465 #endif
01466       dim3 gsz, bsz;
01467       gsz.x = mxblocks;
01468       gsz.y = myblocks * nplanes;
01469       gsz.z = 1;
01470       bsz.x = THBLEN;
01471       bsz.y = THBLEN;
01472       bsz.z = THBLEN;
01473 
01474 #ifdef USE_DISTANCE_EXCL
01475       // distance exclusions
01476 #ifdef DEBUG
01477       printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01478           "on %2d planes   ",
01479           nplanes);
01480 #endif
01481 #if 1
01482       cudaThreadSynchronize();
01483 #endif
01484       wkf_timer_start(timer);
01485       cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01486           d_excl, d_bin_zero,
01487           parms->nbx, parms->nby,
01488 #ifndef DO_ONE_PLANE
01489           nplanes,
01490 #endif
01491           mzblockoffset);
01492 #if 1
01493       cudaThreadSynchronize();
01494 #endif
01495 #ifdef DEBUG
01496       printf("%f s\n", wkf_timer_timenow(timer));
01497 #endif
01498       CUERR;
01499 #endif // USE_DISTANCE_EXCL
01500 
01501 #ifdef USE_ENERGY_EXCL
01502       // energy exclusions
01503 #ifdef DEBUG
01504       printf("*** VMD CUDA: cuda_find_energy_exclusions()   "
01505           "on %2d planes   ",
01506           nplanes);
01507 #endif
01508 #if 1
01509       cudaThreadSynchronize();
01510 #endif
01511       wkf_timer_start(timer);
01512       cuda_find_energy_exclusions<<<gsz, bsz, 0>>>(
01513           d_excl, d_bin_zero,
01514           parms->nbx, parms->nby,
01515 #ifndef DO_ONE_PLANE
01516           nplanes,
01517 #endif
01518           mzblockoffset);
01519 #if 1
01520       cudaThreadSynchronize();
01521 #endif
01522 #ifdef DEBUG
01523       printf("%f s\n", wkf_timer_timenow(timer));
01524 #endif
01525       CUERR;
01526 #endif // USE_ENERGY_EXCL
01527 
01528       nplanes_remaining -= nplanes;
01529       mzblockoffset += nplanes;
01530     }
01531     } // ( ! getenv("VMDILSNOEXCL") )
01532 
01533     // next do "multiatom" slabs for exclusions
01534     mzblockoffset = 0;
01535     nplanes_remaining = mzblocks;
01536     while (nplanes_remaining > 0) {
01537 #ifndef DO_ONE_PLANE
01538       // do as many planes as possible given constraints
01539       int nplanes = nplanes_remaining;
01540       if (nplanes > max_multi_planes) nplanes = max_multi_planes;
01541 #else
01542       // XXX do only one slice at a time
01543       int nplanes = 1;
01544 #endif
01545       dim3 gsz, bsz;
01546       gsz.x = mxblocks;
01547       gsz.y = myblocks * nplanes;
01548       gsz.z = 1;
01549       bsz.x = THBLEN;
01550       bsz.y = THBLEN;
01551       bsz.z = THBLEN;
01552 
01553       // multiatom occupancy
01554 #ifdef DEBUG
01555       printf("*** VMD CUDA: cuda_occupancy_multiatom()      "
01556           "on %2d planes   ",
01557           nplanes);
01558 #endif
01559 #if 1
01560       cudaThreadSynchronize();
01561 #endif
01562       wkf_timer_start(timer);
01563       cuda_occupancy_multiatom<<<gsz, bsz, 0>>>(
01564           d_map, d_excl, d_bin_zero,
01565           parms->nbx, parms->nby,
01566 #ifndef DO_ONE_PLANE
01567           nplanes,
01568 #endif
01569           mzblockoffset);
01570 #if 1
01571       cudaThreadSynchronize();
01572 #endif
01573 #ifdef DEBUG
01574       printf("%f s\n", wkf_timer_timenow(timer));
01575 #endif
01576       CUERR;
01577 
01578       nplanes_remaining -= nplanes;
01579       mzblockoffset += nplanes;
01580     }
01581   } // else multiatom
01582 
01583   // retrieve padded map from GPU
01584   cudaMemcpy(h_map, d_map, mappad * sizeof(float), cudaMemcpyDeviceToHost);
01585   CUERR;
01586 
01587   // copy padded map into user map buffer
01588   // must transpose the thread blocks
01589   for (k = 0;  k < parms->mz;  k++) {
01590     for (j = 0;  j < parms->my;  j++) {
01591       for (i = 0;  i < parms->mx;  i++) {
01592         int mindex = (k*parms->my + j)*parms->mx + i;
01593         int block = ((k >> BSHIFT)*myblocks + (j >> BSHIFT))*mxblocks
01594           + (i >> BSHIFT);
01595         int tid = ((k & PADMASK)*THBLEN + (j & PADMASK))*THBLEN + (i & PADMASK);
01596         int pindex = block*(THBLEN*THBLEN*THBLEN) + tid;
01597         parms->map[mindex] = h_map[pindex];
01598       }
01599     }
01600   }
01601 
01602   cudaFree(d_excl);
01603   cudaFree(d_bin);
01604   cudaFree(d_map);
01605   CUERR;
01606 
01607   free(h_map);
01608   wkf_timer_destroy(timer);
01609 
01610   parms->errcode = 0;  // success!
01611   return NULL;
01612 }
01613 
01614 
01615 extern "C" int vmd_cuda_evaluate_occupancy_map(
01616     int mx, int my, int mz,             // map dimensions
01617     float *map,                         // buffer space for occupancy map
01618                                         // (length mx*my*mz floats)
01619 
01620     float max_energy,                   // max energy threshold
01621     float cutoff,                       // vdw cutoff distance
01622     float hx, float hy, float hz,       // map lattice spacing
01623     float x0, float y0, float z0,       // map origin
01624     float bx_1, float by_1, float bz_1, // inverse of atom bin lengths
01625 
01626     int nbx, int nby, int nbz,          // bin dimensions
01627     const float *bin,                   // atom bins XXX typecast to flint
01628                                         // (length BIN_SIZE*nbx*nby*nbz)
01629     const float *bin_zero,              // bin pointer shifted to origin
01630 
01631     int num_binoffsets,                 // number of offsets
01632     const char *binoffsets,             // bin neighborhood index offsets
01633                                         // (length 3*num_bin_offsets)
01634 
01635     int num_extras,                     // number of extra atoms
01636     const float *extra,                 // extra atoms from overfilled bins
01637                                         // XXX typecast to flint
01638                                         // (length BIN_SLOTSIZE*num_extras)
01639 
01640     int num_vdwparms,                   // number of vdw parameter types
01641     const float *vdwparms,              // vdw parameters
01642                                         // (length 2*num_vdw_params)
01643 
01644     int num_probes,                     // number of probe atoms
01645     const float *probevdwparms,         // vdw parameters of probe atoms
01646                                         // (length 2*num_probes)
01647 
01648     int num_conformers,                 // number of conformers
01649     const float *conformers             // probe atom offsets for conformers
01650                                         // (length 3*num_probes*num_conformers)
01651     ) {
01652   occthrparms parms;
01653 
01654   parms.mx = mx;
01655   parms.my = my;
01656   parms.mz = mz;
01657   parms.map = map;
01658   parms.max_energy = max_energy;
01659   parms.cutoff = cutoff;
01660 #define DEFAULT_EXCL_DIST  1.f
01661   parms.min_excldist = DEFAULT_EXCL_DIST;
01662   parms.hx = hx;
01663   parms.hy = hy;
01664   parms.hz = hz;
01665   parms.x0 = x0;
01666   parms.y0 = y0;
01667   parms.z0 = z0;
01668   parms.bx_1 = bx_1;
01669   parms.by_1 = by_1;
01670   parms.bz_1 = bz_1;
01671   parms.nbx = nbx;
01672   parms.nby = nby;
01673   parms.nbz = nbz;
01674   parms.bin = (flint *) bin;
01675   parms.bin_zero = (flint *) bin_zero;
01676   parms.num_binoffsets = num_binoffsets;
01677   parms.binoffsets = binoffsets;
01678   parms.num_extras = num_extras;
01679   parms.extra = (flint *) extra;
01680   parms.num_vdwparms = num_vdwparms;
01681   parms.vdwparms = vdwparms;
01682   parms.num_probes = num_probes;
01683   parms.probevdwparms = probevdwparms;
01684   parms.num_conformers = num_conformers;
01685   parms.conformers = conformers;
01686 
01687 #ifdef DEBUG
01688   printf("*** calling cuda_occupancy_thread()\n");
01689 #endif
01690   cuda_occupancy_thread(&parms);
01691 
01692   return parms.errcode;
01693 }

Generated on Mon May 20 01:45:34 2013 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002