00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025
00026 #include "WKFThreads.h"
00027 #include "WKFUtils.h"
00028 #include "CUDAKernels.h"
00029 #include "utilities.h"
00030
00031
00032
00033 #if defined(VMDTHREADS)
00034 #define USE_CUDADEVPOOL 1
00035 #endif
00036
00037 #if 1
00038 #define CUERR \
00039 do { \
00040 cudaError_t err; \
00041 if ((err = cudaGetLastError()) != cudaSuccess) { \
00042 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00043 printf("Thread aborting...\n"); \
00044 return NULL; \
00045 } \
00046 } while (0)
00047
00048 #else
00049 #define CUERR
00050 #endif
00051
00052
00053
00054 typedef union flint_t {
00055 float f;
00056 int i;
00057 } flint;
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067 #define USE_ENERGY_EXCL // use the energy-based exclusion kernel
00068
00069 #ifdef USE_DISTANCE_EXCL
00070 #define TEST_BLOCK_EXCL // early exit if entire thread block is excluded
00071
00072 #endif
00073
00074 #if defined(USE_ENERGY_EXCL) || defined(USE_DISTANCE_EXCL)
00075 #define TEST_MBLOCK_EXCL // early exit from multiatom kernel
00076 #define TEST_SHIFT_EXCL
00077 #endif
00078
00079 #ifdef TEST_SHIFT_EXCL
00080 #undef TEST_MBLOCK_EXCL
00081 #endif
00082
00083
00084 #define MAX_THREADBLOCKS 65536
00085
00086 #define BIN_DEPTH 8 // number of slots per bin
00087 #define BIN_SLOTSIZE 4 // slot permits x, y, z, vdwtype (4 elements)
00088 #define BIN_SIZE (BIN_DEPTH * BIN_SLOTSIZE) // size given in "flints"
00089 #define BIN_SHIFT 5 // log2(BIN_SIZE)
00090 #define BIN_SLOTSHIFT 2 // log2(BIN_SLOTSIZE)
00091
00092 typedef struct occthrparms_t {
00093 int errcode;
00094
00095 int mx, my, mz;
00096 float *map;
00097
00098
00099 float max_energy;
00100 float cutoff;
00101 float min_excldist;
00102 float hx, hy, hz;
00103 float x0, y0, z0;
00104 float bx_1, by_1, bz_1;
00105
00106 int nbx, nby, nbz;
00107 const flint *bin;
00108
00109 const flint *bin_zero;
00110
00111 int num_binoffsets;
00112 const char *binoffsets;
00113
00114
00115 int num_extras;
00116 const flint *extra;
00117
00118
00119 int num_vdwparms;
00120 const float *vdwparms;
00121
00122
00123 int num_probes;
00124 const float *probevdwparms;
00125
00126
00127 int num_conformers;
00128 const float *conformers;
00129
00130 } occthrparms;
00131
00132
00133
00134
00135 __constant__ static float const_max_energy;
00136 __constant__ static float const_min_occupancy;
00137 __constant__ static float const_cutoff;
00138 __constant__ static float const_min_excldist;
00139 __constant__ static float const_hx, const_hy, const_hz;
00140 __constant__ static float const_x0, const_y0, const_z0;
00141 __constant__ static float const_bx_1, const_by_1, const_bz_1;
00142
00143 __constant__ static float const_inv_numconf;
00144
00145 __constant__ static int const_num_binoffsets;
00146 __constant__ static int const_num_extras;
00147 __constant__ static int const_num_probes;
00148 __constant__ static int const_num_conformers;
00149
00150 #define MAX_VDWPARMS 160
00151 __constant__ static float const_vdwparms[2 * MAX_VDWPARMS];
00152
00153 #define MAX_PROBES 8
00154 __constant__ static float const_probevdwparms[2 * MAX_PROBES];
00155
00156 #define MAX_EXTRAS 50
00157 __constant__ static flint const_extra[BIN_SLOTSIZE * MAX_EXTRAS];
00158
00159
00160 #define MAX_BINOFFSETS 1467
00161 __constant__ static int const_binoffsets[MAX_BINOFFSETS];
00162
00163
00164 #define MAX_EXCLOFFSETS 27
00165 __constant__ static int const_excloffsets[MAX_EXCLOFFSETS];
00166
00167
00168
00169
00170
00171
00172 #define MAX_CONFORMERS 586
00173 __constant__ static float const_conformers[3*MAX_PROBES*MAX_CONFORMERS];
00174
00175
00176 #define DO_ONE_PLANE // optimze when z-slabs are slices of thickness 1
00177 #undef DO_ONE_PLANE
00178
00179 #define THBLEN 4 // thread block length of cube
00180 #define NTHREADSPERBLOCK (THBLEN*THBLEN*THBLEN)
00181
00182 #define BSHIFT 2 // shift for map points to blocks = log2(THBLEN)
00183
00184 #define NTPBSHIFT 6 // do bit shifting instead of int multiplies
00185
00186
00187
00188
00189 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00190 #define NCONFPERLOOP 5
00191 #else
00192 #define NCONFPERLOOP 8
00193 #endif
00194
00195 #define NTPB_TIMES_NCPL (NTHREADSPERBLOCK * NCONFPERLOOP)
00196
00197 #define PADMASK (THBLEN-1) // bit mask for padding up to multiple of THBLEN
00198
00199 #define MAX_ALLOWABLE_ENERGY 87.f // otherwise expf(-u) gives 0
00200
00201
00203
00204
00205
00207
00208 #ifdef USE_DISTANCE_EXCL
00209 __global__ static void cuda_find_distance_exclusions(
00210 int *excl,
00211 const flint *bin_zero,
00212 int nbx, int nby,
00213 #ifndef DO_ONE_PLANE
00214 int mzblocks,
00215 #endif
00216 int mzblockoffset
00217 ) {
00218
00219
00220 __shared__ flint atombin[BIN_SIZE];
00221
00222 #ifndef DO_ONE_PLANE
00223 const int nblockx = gridDim.x;
00224 const int nblocky = gridDim.y / mzblocks;
00225
00226
00227 const int bidx = blockIdx.x;
00228 const int bidy = blockIdx.y % nblocky;
00229 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00230
00231
00232 const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00233
00234
00235 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00236
00237
00238 float px = (bidx<<BSHIFT) * const_hx;
00239 float py = (bidy<<BSHIFT) * const_hy;
00240 float pz = (bidz<<BSHIFT) * const_hz;
00241 #else
00242
00243
00244
00245 const int bid
00246 = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00247
00248
00249 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00250
00251
00252 float px = (blockIdx.x<<BSHIFT) * const_hx;
00253 float py = (blockIdx.y<<BSHIFT) * const_hy;
00254 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00255 #endif
00256
00257 const int mindex = (bid<<NTPBSHIFT) + tid;
00258 int isexcluded = 0;
00259
00260 const float mindis2 = const_min_excldist * const_min_excldist;
00261
00262 const int ib = (int) floorf(px * const_bx_1);
00263 const int jb = (int) floorf(py * const_by_1);
00264 const int kb = (int) floorf(pz * const_bz_1);
00265
00266 const int binid = (kb * nby + jb)*nbx + ib;
00267
00268 int n, nb;
00269
00270 px += threadIdx.x * const_hx;
00271 py += threadIdx.y * const_hy;
00272 pz += threadIdx.z * const_hz;
00273
00274 px += const_x0;
00275 py += const_y0;
00276 pz += const_z0;
00277
00278 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00279
00280 for (nb = 0; nb < MAX_EXCLOFFSETS; nb++) {
00281
00282 if (tid < BIN_SIZE) {
00283 const flint *p_bin = bin_zero
00284 + ((binid + const_excloffsets[nb])<<BIN_SHIFT);
00285 atombin[tid] = p_bin[tid];
00286 }
00287 __syncthreads();
00288
00289 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00290
00291 if (-1 == atombin[n+3].i) break;
00292 const float dx = px - atombin[n ].f;
00293 const float dy = py - atombin[n+1].f;
00294 const float dz = pz - atombin[n+2].f;
00295 const float r2 = dx*dx + dy*dy + dz*dz;
00296
00297 if (r2 <= mindis2) isexcluded = 1;
00298
00299 }
00300
00301 __syncthreads();
00302 }
00303
00304 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00305 const float dx = px - const_extra[n ].f;
00306 const float dy = py - const_extra[n+1].f;
00307 const float dz = pz - const_extra[n+2].f;
00308 const float r2 = dx*dx + dy*dy + dz*dz;
00309
00310 if (r2 <= mindis2) isexcluded = 1;
00311
00312 }
00313
00314 excl[mindex] = isexcluded;
00315 }
00316 #endif // USE_DISTANCE_EXCL
00317
00318
00319 #ifdef USE_ENERGY_EXCL
00320 __global__ static void cuda_find_energy_exclusions(
00321 int *excl,
00322 const flint *bin_zero,
00323 int nbx, int nby,
00324 #ifndef DO_ONE_PLANE
00325 int mzblocks,
00326 #endif
00327 int mzblockoffset
00328 ) {
00329
00330
00331 __shared__ flint atombin[BIN_SIZE];
00332
00333 #ifdef TEST_BLOCK_EXCL
00334 #if 0
00335 __shared__ int sh_isall_upper_warp;
00336 #endif
00337 __shared__ int sh_sum[NTHREADSPERBLOCK];
00338 #endif
00339
00340 #ifndef DO_ONE_PLANE
00341 const int nblockx = gridDim.x;
00342 const int nblocky = gridDim.y / mzblocks;
00343
00344
00345 const int bidx = blockIdx.x;
00346 const int bidy = blockIdx.y % nblocky;
00347 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00348
00349
00350 const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00351
00352
00353 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00354
00355
00356 float px = (bidx<<BSHIFT) * const_hx;
00357 float py = (bidy<<BSHIFT) * const_hy;
00358 float pz = (bidz<<BSHIFT) * const_hz;
00359 #else
00360
00361
00362
00363 const int bid
00364 = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00365
00366
00367 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00368
00369
00370 float px = (blockIdx.x<<BSHIFT) * const_hx;
00371 float py = (blockIdx.y<<BSHIFT) * const_hy;
00372 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00373 #endif
00374
00375 const int mindex = (bid<<NTPBSHIFT) + tid;
00376 #ifdef USE_DISTANCE_EXCL
00377 int isexcluded = excl[mindex];
00378 #else
00379 int isexcluded = 0;
00380 #endif
00381
00382 #ifdef TEST_BLOCK_EXCL
00383 #if 0
00384
00385
00386 int isall = __all(isexcluded);
00387 if (32==tid) sh_isall_upper_warp = isall;
00388 if (0==isany+sh_isall_upper_warp) return;
00389 #endif
00390 sh_sum[tid] = !(isexcluded);
00391 int nbrsum = sh_sum[tid^1];
00392 sh_sum[tid] += nbrsum;
00393 nbrsum = sh_sum[tid^2];
00394 sh_sum[tid] += nbrsum;
00395 nbrsum = sh_sum[tid^4];
00396 sh_sum[tid] += nbrsum;
00397 nbrsum = sh_sum[tid^8];
00398 sh_sum[tid] += nbrsum;
00399 nbrsum = sh_sum[tid^16];
00400 sh_sum[tid] += nbrsum;
00401 __syncthreads();
00402 nbrsum = sh_sum[tid^32];
00403 __syncthreads();
00404 sh_sum[tid] += nbrsum;
00405
00406 if (0==sh_sum[tid]) return;
00407 #endif
00408
00409 const float cutoff2 = const_cutoff * const_cutoff;
00410 const float probe_vdweps = const_probevdwparms[0];
00411 const float probe_vdwrmin = const_probevdwparms[1];
00412
00413 const int ib = (int) floorf(px * const_bx_1);
00414 const int jb = (int) floorf(py * const_by_1);
00415 const int kb = (int) floorf(pz * const_bz_1);
00416
00417 const int binid = (kb * nby + jb)*nbx + ib;
00418
00419 int n, nb;
00420
00421 px += threadIdx.x * const_hx;
00422 py += threadIdx.y * const_hy;
00423 pz += threadIdx.z * const_hz;
00424
00425 px += const_x0;
00426 py += const_y0;
00427 pz += const_z0;
00428
00429 float u = 0.f;
00430
00431 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00432
00433 for (nb = 0; nb < const_num_binoffsets; nb++) {
00434
00435 if (tid < BIN_SIZE) {
00436 const flint *p_bin = bin_zero
00437 + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00438 atombin[tid] = p_bin[tid];
00439 }
00440 __syncthreads();
00441
00442 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00443 int vdwtype = atombin[n+3].i;
00444 if (-1 == vdwtype) break;
00445 vdwtype <<= 1;
00446
00447 const float dx = px - atombin[n ].f;
00448 const float dy = py - atombin[n+1].f;
00449 const float dz = pz - atombin[n+2].f;
00450 const float r2 = dx*dx + dy*dy + dz*dz;
00451 if (r2 < cutoff2) {
00452 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00453 const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00454 float rm6 = rmin*rmin / r2;
00455 rm6 = rm6 * rm6 * rm6;
00456 u += epsilon * rm6 * (rm6 - 2.f);
00457 }
00458 }
00459
00460 __syncthreads();
00461 }
00462
00463 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00464 const int vdwtype = const_extra[n+3].i << 1;
00465 const float dx = px - const_extra[n ].f;
00466 const float dy = py - const_extra[n+1].f;
00467 const float dz = pz - const_extra[n+2].f;
00468 const float r2 = dx*dx + dy*dy + dz*dz;
00469 if (r2 < cutoff2) {
00470 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00471 const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00472 float rm6 = rmin*rmin / r2;
00473 rm6 = rm6 * rm6 * rm6;
00474 u += epsilon * rm6 * (rm6 - 2.f);
00475 }
00476 }
00477
00478 if (u >= const_max_energy) isexcluded = 1;
00479
00480 excl[mindex] = isexcluded;
00481 }
00482 #endif // USE_ENERGY_EXCL
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495 __global__ static void cuda_occupancy_monoatom(
00496 float *map,
00497 const int *excl,
00498 const flint *bin_zero,
00499 int nbx, int nby,
00500 #ifndef DO_ONE_PLANE
00501 int mzblocks,
00502 #endif
00503 int mzblockoffset
00504 ) {
00505
00506
00507 __shared__ flint atombin[BIN_SIZE];
00508
00509 #ifdef TEST_BLOCK_EXCL
00510 #if 0
00511 __shared__ int sh_isall_upper_warp;
00512 #endif
00513 __shared__ int sh_sum[NTHREADSPERBLOCK];
00514 #endif
00515
00516 #ifndef DO_ONE_PLANE
00517 const int nblockx = gridDim.x;
00518 const int nblocky = gridDim.y / mzblocks;
00519
00520
00521 const int bidx = blockIdx.x;
00522 const int bidy = blockIdx.y % nblocky;
00523 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00524
00525
00526 const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00527
00528
00529 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00530
00531
00532 float px = (bidx<<BSHIFT) * const_hx;
00533 float py = (bidy<<BSHIFT) * const_hy;
00534 float pz = (bidz<<BSHIFT) * const_hz;
00535 #else
00536
00537
00538
00539 const int bid
00540 = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00541
00542
00543 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00544
00545
00546 float px = (blockIdx.x<<BSHIFT) * const_hx;
00547 float py = (blockIdx.y<<BSHIFT) * const_hy;
00548 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00549 #endif
00550
00551 const int mindex = (bid<<NTPBSHIFT) + tid;
00552 const int isexcluded = excl[mindex];
00553
00554 #ifdef TEST_BLOCK_EXCL
00555 #if 0
00556
00557
00558 int isall = __all(isexcluded);
00559 if (32==tid) sh_isall_upper_warp = isall;
00560 if (0==isany+sh_isall_upper_warp) return;
00561 #endif
00562 sh_sum[tid] = !(isexcluded);
00563 int nbrsum = sh_sum[tid^1];
00564 sh_sum[tid] += nbrsum;
00565 nbrsum = sh_sum[tid^2];
00566 sh_sum[tid] += nbrsum;
00567 nbrsum = sh_sum[tid^4];
00568 sh_sum[tid] += nbrsum;
00569 nbrsum = sh_sum[tid^8];
00570 sh_sum[tid] += nbrsum;
00571 nbrsum = sh_sum[tid^16];
00572 sh_sum[tid] += nbrsum;
00573 __syncthreads();
00574 nbrsum = sh_sum[tid^32];
00575 __syncthreads();
00576 sh_sum[tid] += nbrsum;
00577
00578 if (0==sh_sum[tid]) return;
00579 #endif
00580
00581 const float cutoff2 = const_cutoff * const_cutoff;
00582 const float probe_vdweps = const_probevdwparms[0];
00583 const float probe_vdwrmin = const_probevdwparms[1];
00584
00585 const int ib = (int) floorf(px * const_bx_1);
00586 const int jb = (int) floorf(py * const_by_1);
00587 const int kb = (int) floorf(pz * const_bz_1);
00588
00589 const int binid = (kb * nby + jb)*nbx + ib;
00590
00591 int n, nb;
00592
00593 px += threadIdx.x * const_hx;
00594 py += threadIdx.y * const_hy;
00595 pz += threadIdx.z * const_hz;
00596
00597 px += const_x0;
00598 py += const_y0;
00599 pz += const_z0;
00600
00601 float u = 0.f;
00602
00603 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00604
00605 for (nb = 0; nb < const_num_binoffsets; nb++) {
00606
00607 if (tid < BIN_SIZE) {
00608 const flint *p_bin = bin_zero
00609 + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00610 atombin[tid] = p_bin[tid];
00611 }
00612 __syncthreads();
00613
00614 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00615 int vdwtype = atombin[n+3].i;
00616 if (-1 == vdwtype) break;
00617 vdwtype <<= 1;
00618
00619 const float dx = px - atombin[n ].f;
00620 const float dy = py - atombin[n+1].f;
00621 const float dz = pz - atombin[n+2].f;
00622 const float r2 = dx*dx + dy*dy + dz*dz;
00623 if (r2 < cutoff2) {
00624 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00625 const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00626 float rm6 = rmin*rmin / r2;
00627 rm6 = rm6 * rm6 * rm6;
00628 u += epsilon * rm6 * (rm6 - 2.f);
00629 }
00630 }
00631
00632 __syncthreads();
00633 }
00634
00635 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00636 const int vdwtype = const_extra[n+3].i << 1;
00637 const float dx = px - const_extra[n ].f;
00638 const float dy = py - const_extra[n+1].f;
00639 const float dz = pz - const_extra[n+2].f;
00640 const float r2 = dx*dx + dy*dy + dz*dz;
00641 if (r2 < cutoff2) {
00642 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00643 const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00644 float rm6 = rmin*rmin / r2;
00645 rm6 = rm6 * rm6 * rm6;
00646 u += epsilon * rm6 * (rm6 - 2.f);
00647 }
00648 }
00649
00650 float occ = 0.f;
00651 if (!isexcluded && u < const_max_energy) {
00652 occ = expf(-u);
00653 }
00654 map[mindex] = occ;
00655 }
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667 __global__ static void cuda_occupancy_multiatom(
00668 float *map,
00669 const int *excl,
00670 const flint *bin_zero,
00671 int nbx, int nby,
00672 #ifndef DO_ONE_PLANE
00673 int mzblocks,
00674 #endif
00675 int mzblockoffset
00676 ) {
00677
00678
00679 __shared__ flint atombin[BIN_SIZE];
00680
00681
00682 __shared__ float usum[NTPB_TIMES_NCPL];
00683
00684
00685
00686 __shared__ float conf[3*MAX_PROBES * NCONFPERLOOP];
00687
00688 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00689 #if 0
00690 __shared__ int sh_isany_upper_warp;
00691 #endif
00692
00693
00694
00695 __shared__ flint sh_buffer[NTHREADSPERBLOCK];
00696 #endif
00697
00698 #ifndef DO_ONE_PLANE
00699 const int nblockx = gridDim.x;
00700 const int nblocky = gridDim.y / mzblocks;
00701
00702
00703 const int bidx = blockIdx.x;
00704 const int bidy = blockIdx.y % nblocky;
00705 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00706
00707
00708 const int bid = (bidz * nblocky + bidy)*nblockx + bidx;
00709
00710
00711 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00712
00713
00714 float px = (bidx<<BSHIFT) * const_hx;
00715 float py = (bidy<<BSHIFT) * const_hy;
00716 float pz = (bidz<<BSHIFT) * const_hz;
00717 #else
00718
00719
00720
00721 const int bid
00722 = (mzblockoffset * gridDim.y + blockIdx.y)*gridDim.x + blockIdx.x;
00723
00724
00725 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00726
00727
00728 float px = (blockIdx.x<<BSHIFT) * const_hx;
00729 float py = (blockIdx.y<<BSHIFT) * const_hy;
00730 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00731 #endif
00732
00733 const int mindex = (bid<<NTPBSHIFT) + tid;
00734 const int isincluded = !(excl[mindex]);
00735
00736 #ifdef TEST_MBLOCK_EXCL
00737 #if 0
00738
00739
00740 int isany = __any(isincluded);
00741 if (32==tid) sh_isany_upper_warp = isany;
00742 if (0==isany+sh_isany_upper_warp) return;
00743 #endif
00744
00745 sh_buffer[tid].i = isincluded;
00746 int nbrsum = sh_buffer[tid^1].i;
00747 sh_buffer[tid].i += nbrsum;
00748 nbrsum = sh_buffer[tid^2].i;
00749 sh_buffer[tid].i += nbrsum;
00750 nbrsum = sh_buffer[tid^4].i;
00751 sh_buffer[tid].i += nbrsum;
00752 nbrsum = sh_buffer[tid^8].i;
00753 sh_buffer[tid].i += nbrsum;
00754 nbrsum = sh_buffer[tid^16].i;
00755 sh_buffer[tid].i += nbrsum;
00756 __syncthreads();
00757 nbrsum = sh_buffer[tid^32].i;
00758 __syncthreads();
00759 sh_buffer[tid].i += nbrsum;
00760
00761 if (0==sh_buffer[tid].i) {
00762 map[mindex] = 0.f;
00763 return;
00764 }
00765 #endif
00766
00767 #ifdef TEST_SHIFT_EXCL
00768 int numincl = 0;
00769 {
00770 int psum = isincluded;
00771 sh_buffer[tid].i = isincluded;
00772
00773 int nbrid = tid^1;
00774 int nbrsum = sh_buffer[nbrid].i;
00775 if (nbrid < tid) psum += nbrsum;
00776 sh_buffer[tid].i += nbrsum;
00777
00778 nbrid = tid^2;
00779 nbrsum = sh_buffer[nbrid].i;
00780 if (nbrid < tid) psum += nbrsum;
00781 sh_buffer[tid].i += nbrsum;
00782
00783 nbrid = tid^4;
00784 nbrsum = sh_buffer[nbrid].i;
00785 if (nbrid < tid) psum += nbrsum;
00786 sh_buffer[tid].i += nbrsum;
00787
00788 nbrid = tid^8;
00789 nbrsum = sh_buffer[nbrid].i;
00790 if (nbrid < tid) psum += nbrsum;
00791 sh_buffer[tid].i += nbrsum;
00792
00793 nbrid = tid^16;
00794 nbrsum = sh_buffer[nbrid].i;
00795 if (nbrid < tid) psum += nbrsum;
00796 sh_buffer[tid].i += nbrsum;
00797
00798 __syncthreads();
00799 nbrid = tid^32;
00800 nbrsum = sh_buffer[nbrid].i;
00801 if (nbrid < tid) psum += nbrsum;
00802 __syncthreads();
00803 sh_buffer[tid].i += nbrsum;
00804
00805 numincl = sh_buffer[tid].i;
00806 if (0==numincl) {
00807 map[mindex] = 0.f;
00808 return;
00809 }
00810 __syncthreads();
00811 if (isincluded) {
00812 sh_buffer[psum-1].i = tid;
00813 }
00814 }
00815 __syncthreads();
00816 const int sid = sh_buffer[tid].i;
00817 #endif
00818
00819 const float cutoff2 = const_cutoff * const_cutoff;
00820
00821 const int ib = (int) floorf(px * const_bx_1);
00822 const int jb = (int) floorf(py * const_by_1);
00823 const int kb = (int) floorf(pz * const_bz_1);
00824
00825 const int binid = (kb * nby + jb)*nbx + ib;
00826
00827 int n, nb;
00828 int m;
00829 int ma;
00830 int nc;
00831
00832 #ifdef TEST_SHIFT_EXCL
00833 if (tid < numincl) {
00834 int ssid = sid;
00835 const int sid_x = ssid & PADMASK;
00836 ssid >>= BSHIFT;
00837 const int sid_y = ssid & PADMASK;
00838 ssid >>= BSHIFT;
00839 const int sid_z = ssid;
00840 px += sid_x * const_hx;
00841 py += sid_y * const_hy;
00842 pz += sid_z * const_hz;
00843 }
00844 #else
00845 px += threadIdx.x * const_hx;
00846 py += threadIdx.y * const_hy;
00847 pz += threadIdx.z * const_hz;
00848 #endif
00849
00850 px += const_x0;
00851 py += const_y0;
00852 pz += const_z0;
00853
00854 #ifdef TEST_SHIFT_EXCL
00855 sh_buffer[tid].f = 0.f;
00856 #else
00857 float occ = 0.f;
00858 #endif
00859
00860 const int twice_num_probes = const_num_probes << 1;
00861 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00862
00863
00864 const int total_shmem_copy = 3*NCONFPERLOOP * const_num_probes;
00865
00866
00867 for (m = 0; m < const_num_conformers; m += NCONFPERLOOP) {
00868
00869
00870 for (nc = 0; nc < NTPB_TIMES_NCPL; nc += NTHREADSPERBLOCK) {
00871 usum[nc + tid] = 0.f;
00872 }
00873
00874
00875
00876
00877 __syncthreads();
00878 if (tid < 32) {
00879
00880
00881 const int npass = (total_shmem_copy >> 5);
00882
00883
00884 const int nleft = (total_shmem_copy & 31);
00885
00886
00887 const int skip = (const_num_probes + twice_num_probes) * m;
00888
00889 int off = 0;
00890 for (nc = 0; nc < npass; nc++, off += 32) {
00891 conf[off + tid] = const_conformers[skip + off + tid];
00892 }
00893 if (tid < nleft) {
00894 conf[off + tid] = const_conformers[skip + off + tid];
00895 }
00896 }
00897 __syncthreads();
00898
00899 for (nb = 0; nb < const_num_binoffsets; nb++) {
00900
00901 __syncthreads();
00902 if (tid < BIN_SIZE) {
00903 const flint *p_bin = bin_zero
00904 + ((binid + const_binoffsets[nb]) << BIN_SHIFT);
00905 atombin[tid] = p_bin[tid];
00906 }
00907 __syncthreads();
00908
00909 #ifdef TEST_SHIFT_EXCL
00910 if (tid < numincl) {
00911 #endif
00912
00913 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00914 int vdwtype = atombin[n+3].i;
00915 if (-1 == vdwtype) break;
00916 vdwtype <<= 1;
00917 const float epsilon_atom = const_vdwparms[vdwtype];
00918 const float rmin_atom = const_vdwparms[vdwtype+1];
00919 const float ax = px - atombin[n ].f;
00920 const float ay = py - atombin[n+1].f;
00921 const float az = pz - atombin[n+2].f;
00922
00923 int sindex = 0;
00924
00925
00926 for (nc = 0; nc < NTPB_TIMES_NCPL; nc += NTHREADSPERBLOCK) {
00927
00928 for (ma = 0; ma < twice_num_probes; ma += 2) {
00929 const float dx = conf[sindex++] + ax;
00930 const float dy = conf[sindex++] + ay;
00931 const float dz = conf[sindex++] + az;
00932 const float r2 = dx*dx + dy*dy + dz*dz;
00933 if (r2 < cutoff2) {
00934 const float epsilon = epsilon_atom * const_probevdwparms[ma];
00935 const float rmin = rmin_atom + const_probevdwparms[ma+1];
00936 float rm6 = rmin*rmin / r2;
00937 rm6 = rm6 * rm6 * rm6;
00938
00939
00940
00941
00942
00943
00944
00945
00946
00947
00948
00949 usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
00950 }
00951 }
00952
00953 }
00954
00955 }
00956 #ifdef TEST_SHIFT_EXCL
00957 }
00958 #endif
00959
00960 }
00961
00962 #ifdef TEST_SHIFT_EXCL
00963 if (tid < numincl) {
00964 #endif
00965
00966 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00967 const int vdwtype = const_extra[n+3].i << 1;
00968 const float epsilon_atom = const_vdwparms[vdwtype];
00969 const float rmin_atom = const_vdwparms[vdwtype+1];
00970 const float ax = px - const_extra[n ].f;
00971 const float ay = py - const_extra[n+1].f;
00972 const float az = pz - const_extra[n+2].f;
00973 int sindex = 0;
00974
00975
00976 for (nc = 0; nc < NTPB_TIMES_NCPL; nc += NTHREADSPERBLOCK) {
00977
00978 for (ma = 0; ma < twice_num_probes; ma += 2) {
00979 const float dx = conf[sindex++] + ax;
00980 const float dy = conf[sindex++] + ay;
00981 const float dz = conf[sindex++] + az;
00982 const float r2 = dx*dx + dy*dy + dz*dz;
00983 if (r2 < cutoff2) {
00984 const float epsilon = epsilon_atom * const_probevdwparms[ma];
00985 const float rmin = rmin_atom + const_probevdwparms[ma+1];
00986 float rm6 = rmin*rmin / r2;
00987 rm6 = rm6 * rm6 * rm6;
00988 usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
00989 }
00990 }
00991
00992 }
00993
00994 }
00995
00996 for (nc = 0; nc < NCONFPERLOOP; nc++) {
00997
00998 if (m + nc < const_num_conformers) {
00999 float u = usum[(nc << NTPBSHIFT) + tid];
01000
01001 #ifdef TEST_SHIFT_EXCL
01002 if (u < MAX_ALLOWABLE_ENERGY) {
01003 sh_buffer[sid].f += expf(-u);
01004 }
01005 #else
01006 if (isincluded && u < MAX_ALLOWABLE_ENERGY) {
01007 occ += expf(-u);
01008 }
01009 #endif
01010 }
01011
01012 }
01013 #ifdef TEST_SHIFT_EXCL
01014 }
01015 #endif
01016
01017 }
01018
01019 #ifdef TEST_SHIFT_EXCL
01020 __syncthreads();
01021 sh_buffer[tid].f *= const_inv_numconf;
01022 if (sh_buffer[tid].f <= const_min_occupancy) sh_buffer[tid].f = 0.f;
01023
01024 map[mindex] = sh_buffer[tid].f;
01025 #else
01026 occ = occ * const_inv_numconf;
01027 if (occ <= const_min_occupancy) occ = 0.f;
01028
01029 map[mindex] = occ;
01030 #endif
01031
01032 }
01033
01034
01035
01036 static void *cuda_occupancy_thread(void *voidparms) {
01037 occthrparms *parms = NULL;
01038 int mxpad, mypad, mzpad, mappad;
01039 float *h_map;
01040 float *d_map;
01041 int *d_excl;
01042 int binsz;
01043 flint *d_bin;
01044 flint *d_bin_zero;
01045 int gpuid = -1;
01046 int numgpus = 0;
01047 int i, j, k;
01048
01049
01050 if (getenv("VMDILSCUDADEVICE")) {
01051 gpuid = atoi(getenv("VMDILSCUDADEVICE"));
01052 }
01053
01054 if (gpuid < 0) {
01055 if (vmd_cuda_num_devices(&numgpus) != VMDCUDA_ERR_NONE) {
01056 printf("ILS CUDA device init error\n");
01057 return NULL;
01058 }
01059 if (numgpus > 0) {
01060
01061
01062
01063
01064
01065 gpuid = 0;
01066 } else {
01067 return NULL;
01068 }
01069 }
01070
01071 if (gpuid >= 0) {
01072 printf("Using CUDA device: %d\n", gpuid);
01073 } else {
01074
01075 return NULL;
01076 }
01077
01078 wkf_timerhandle timer = wkf_timer_create();
01079
01080 parms = (occthrparms *) voidparms;
01081 parms->errcode = -1;
01082
01083 if (getenv("VMDILSVERBOSE")) {
01084
01085 printf("*****************************************************************\n");
01086 printf("mx = %d my = %d mz = %d\n", parms->mx, parms->my, parms->mz);
01087 printf("hx = %g hy = %g hz = %g\n", parms->hx, parms->hy, parms->hz);
01088 printf("x0 = %g y0 = %g z0 = %g\n", parms->x0, parms->y0, parms->z0);
01089 printf("bx_1 = %g by_1 = %g bz_1 = %g\n",
01090 parms->bx_1, parms->by_1, parms->bz_1);
01091 printf("nbx = %d nby = %d nbz = %d\n", parms->nbx, parms->nby, parms->nbz);
01092 printf("num_binoffsets = %d\n", parms->num_binoffsets);
01093 printf("num_extras = %d\n", parms->num_extras);
01094 printf("num_vdwparms = %d\n", parms->num_vdwparms);
01095 printf("num_probes = %d\n", parms->num_probes);
01096 printf("num_conformers = %d\n", parms->num_conformers);
01097 printf("*****************************************************************\n");
01098 }
01099
01100
01101 if (parms->num_binoffsets > MAX_BINOFFSETS) {
01102 printf("***** ERROR: Exceeded MAX_BINOFFSETS for CUDA kernel\n");
01103 return NULL;
01104 }
01105 if (parms->num_extras > MAX_EXTRAS) {
01106 printf("***** ERROR: Exceeded MAX_EXTRAS for CUDA kernel\n");
01107 return NULL;
01108 }
01109 if (parms->num_vdwparms > MAX_VDWPARMS) {
01110 printf("***** ERROR: Exceeded MAX_VDWPARMS for CUDA kernel\n");
01111 return NULL;
01112 }
01113 if (parms->num_probes > MAX_PROBES) {
01114 printf("***** ERROR: Exceeded MAX_PROBES for CUDA kernel\n");
01115 return NULL;
01116 }
01117 if (parms->num_conformers > MAX_CONFORMERS) {
01118 printf("***** ERROR: Exceeded MAX_CONFORMERS for CUDA kernel\n");
01119 return NULL;
01120 }
01121
01122
01123 cudaError_t rc;
01124 rc = cudaSetDevice(gpuid);
01125 if (rc != cudaSuccess) {
01126 #if CUDART_VERSION >= 2010
01127 rc = cudaGetLastError();
01128 if (rc != cudaErrorSetOnActiveProcess) {
01129 return NULL;
01130 }
01131 #else
01132 cudaGetLastError();
01133
01134 #endif
01135 }
01136
01137
01138 mxpad = (parms->mx + PADMASK) & ~PADMASK;
01139 mypad = (parms->my + PADMASK) & ~PADMASK;
01140 mzpad = (parms->mz + PADMASK) & ~PADMASK;
01141 mappad = mxpad * mypad * mzpad;
01142
01143 if (getenv("VMDILSVERBOSE")) {
01144 printf("mxpad=%d mypad=%d mzpad=%d mappad=%d\n",
01145 mxpad, mypad, mzpad, mappad);
01146 }
01147
01148 h_map = (float *) malloc(mappad * sizeof(float));
01149 if (getenv("VMDILSVERBOSE")) {
01150 printf("Allocate %g MB for map\n", mappad*sizeof(float)/(1024*1024.f));
01151 }
01152 cudaMalloc((void **) &d_map, mappad * sizeof(float));
01153 CUERR;
01154 if (getenv("VMDILSVERBOSE")) {
01155 printf("Allocate %g MB for exclusions\n", mappad*sizeof(int)/(1024*1024.f));
01156 }
01157 cudaMalloc((void **) &d_excl, mappad * sizeof(int));
01158 CUERR;
01159
01160 #if !defined(USE_DISTANCE_EXCL) && !defined(USE_ENERGY_EXCL)
01161
01162 cudaMemset(d_excl, 0, mappad * sizeof(int));
01163 CUERR;
01164 #endif
01165
01166 binsz = BIN_SIZE * parms->nbx * parms->nby * parms->nbz;
01167 if (getenv("VMDILSVERBOSE")) {
01168 printf("nbx=%d nby=%d nbz=%d binsz=%d\n",
01169 parms->nbx, parms->nby, parms->nbz, binsz);
01170 printf("Allocate %g MB for atom bins\n", binsz*sizeof(flint)/(1024*1024.f));
01171 }
01172 cudaMalloc((void **) &d_bin, binsz * sizeof(flint));
01173 CUERR;
01174 cudaMemcpy(d_bin, parms->bin, binsz * sizeof(flint), cudaMemcpyHostToDevice);
01175 CUERR;
01176 d_bin_zero = d_bin + (parms->bin_zero - parms->bin);
01177
01178 if (getenv("VMDILSVERBOSE")) {
01179 printf("parms delta bin = %d delta bin = %d\n",
01180 (parms->bin_zero - parms->bin), (d_bin_zero - d_bin));
01181
01182 printf("probe epsilon=%g rmin=%g\n",
01183 parms->probevdwparms[0], parms->probevdwparms[1]);
01184 printf("max_energy=%g\n", parms->max_energy);
01185 printf("cutoff=%g\n", parms->cutoff);
01186 printf("hx=%g hy=%g hz=%g\n", parms->hx, parms->hy, parms->hz);
01187 printf("x0=%g y0=%g z0=%g\n", parms->x0, parms->y0, parms->z0);
01188 printf("bx_1=%g by_1=%g bz_1=%g\n", parms->bx_1, parms->by_1, parms->bz_1);
01189 }
01190
01191 cudaMemcpyToSymbol(const_max_energy, &(parms->max_energy), sizeof(float), 0);
01192 CUERR;
01193 float min_occ = expf(-parms->max_energy);
01194 cudaMemcpyToSymbol(const_min_occupancy, &min_occ, sizeof(float), 0);
01195 CUERR;
01196 cudaMemcpyToSymbol(const_cutoff, &(parms->cutoff), sizeof(float), 0);
01197 CUERR;
01198 cudaMemcpyToSymbol(const_min_excldist, &(parms->min_excldist),
01199 sizeof(float), 0);
01200 CUERR;
01201 cudaMemcpyToSymbol(const_hx, &(parms->hx), sizeof(float), 0);
01202 CUERR;
01203 cudaMemcpyToSymbol(const_hy, &(parms->hy), sizeof(float), 0);
01204 CUERR;
01205 cudaMemcpyToSymbol(const_hz, &(parms->hz), sizeof(float), 0);
01206 CUERR;
01207 cudaMemcpyToSymbol(const_x0, &(parms->x0), sizeof(float), 0);
01208 CUERR;
01209 cudaMemcpyToSymbol(const_y0, &(parms->y0), sizeof(float), 0);
01210 CUERR;
01211 cudaMemcpyToSymbol(const_z0, &(parms->z0), sizeof(float), 0);
01212 CUERR;
01213 cudaMemcpyToSymbol(const_bx_1, &(parms->bx_1), sizeof(float), 0);
01214 CUERR;
01215 cudaMemcpyToSymbol(const_by_1, &(parms->by_1), sizeof(float), 0);
01216 CUERR;
01217 cudaMemcpyToSymbol(const_bz_1, &(parms->bz_1), sizeof(float), 0);
01218 CUERR;
01219 float inv_numconf = 1.f / parms->num_conformers;
01220 cudaMemcpyToSymbol(const_inv_numconf, &inv_numconf, sizeof(float), 0);
01221
01222 if (getenv("VMDILSVERBOSE")) {
01223 printf("num_binoffsets = %d\n", parms->num_binoffsets);
01224 }
01225 cudaMemcpyToSymbol(const_num_binoffsets, &(parms->num_binoffsets),
01226 sizeof(int), 0);
01227 CUERR;
01228 if (getenv("VMDILSVERBOSE")) {
01229 printf("num_extras = %d\n", parms->num_extras);
01230 }
01231 cudaMemcpyToSymbol(const_num_extras, &(parms->num_extras), sizeof(int), 0);
01232 CUERR;
01233 if (getenv("VMDILSVERBOSE")) {
01234 printf("num_probes = %d\n", parms->num_probes);
01235 }
01236 cudaMemcpyToSymbol(const_num_probes, &(parms->num_probes), sizeof(int), 0);
01237 CUERR;
01238 if (getenv("VMDILSVERBOSE")) {
01239 printf("num_conformers = %d\n", parms->num_conformers);
01240 }
01241 cudaMemcpyToSymbol(const_num_conformers, &(parms->num_conformers),
01242 sizeof(int), 0);
01243 CUERR;
01244
01245 if (getenv("VMDILSVERBOSE")) {
01246 printf("num_vdwparms = %d\n", parms->num_vdwparms);
01247 for (i = 0; i < parms->num_vdwparms; i++) {
01248 printf(" %2d: epsilon=%g rmin=%g\n", i,
01249 parms->vdwparms[2*i], parms->vdwparms[2*i+1]);
01250 }
01251 }
01252
01253 cudaMemcpyToSymbol(const_vdwparms, parms->vdwparms,
01254 2 * parms->num_vdwparms * sizeof(float), 0);
01255 CUERR;
01256 cudaMemcpyToSymbol(const_probevdwparms, parms->probevdwparms,
01257 2 * (0==parms->num_probes ? 1 : parms->num_probes) * sizeof(float), 0);
01258 CUERR;
01259 cudaMemcpyToSymbol(const_extra, parms->extra,
01260 BIN_SLOTSIZE * parms->num_extras * sizeof(flint), 0);
01261 CUERR;
01262
01263
01264 {
01265 const int nbx = parms->nbx;
01266 const int nby = parms->nby;
01267 int n;
01268 int *h_flatbinoffset = (int *) calloc(parms->num_binoffsets, sizeof(int));
01269 for (n = 0; n < parms->num_binoffsets; n++) {
01270 int i = (int) (parms->binoffsets[3*n ]);
01271 int j = (int) (parms->binoffsets[3*n+1]);
01272 int k = (int) (parms->binoffsets[3*n+2]);
01273 int index = (k*nby + j)*nbx + i;
01274 h_flatbinoffset[n] = index;
01275 }
01276 cudaMemcpyToSymbol(const_binoffsets, h_flatbinoffset,
01277 parms->num_binoffsets * sizeof(int), 0);
01278 CUERR;
01279 free(h_flatbinoffset);
01280 }
01281
01282
01283 {
01284 const int nbx = parms->nbx;
01285 const int nby = parms->nby;
01286 int i, j, k, n=0;
01287 int h_flatexcloffset[27];
01288 for (k = -1; k <= 1; k++) {
01289 for (j = -1; j <= 1; j++) {
01290 for (i = -1; i <= 1; i++) {
01291 int index = (k*nby + j)*nbx + i;
01292 h_flatexcloffset[n++] = index;
01293 }
01294 }
01295 }
01296 cudaMemcpyToSymbol(const_excloffsets, h_flatexcloffset, 27*sizeof(int), 0);
01297 CUERR;
01298 }
01299
01300 cudaMemcpyToSymbol(const_conformers, parms->conformers,
01301 3 * parms->num_probes * parms->num_conformers * sizeof(float), 0);
01302 CUERR;
01303
01304
01305
01306
01307 #define TIME_PER_MONOATOM_WORKUNIT 1.24553913519431e-10
01308 #define TIME_PER_MULTIATOM_WORKUNIT 5.24559386973180e-09
01309
01310
01311
01312 int mxblocks = (mxpad >> BSHIFT);
01313 int myblocks = (mypad >> BSHIFT);
01314 int mzblocks = (mzpad >> BSHIFT);
01315 int nblocksperplane = mxblocks * myblocks;
01316 int maxplanes = MAX_THREADBLOCKS / nblocksperplane;
01317 int numconf = (0==parms->num_conformers ? 1 : parms->num_conformers);
01318
01319 double monoatom_time_per_plane = (double)(parms->num_probes * numconf *
01320 parms->num_binoffsets * nblocksperplane) * TIME_PER_MONOATOM_WORKUNIT;
01321
01322 double multiatom_time_per_plane = (double)(parms->num_probes * numconf *
01323 parms->num_binoffsets * nblocksperplane) * TIME_PER_MULTIATOM_WORKUNIT;
01324
01325
01326 #define MAX_TIME_PER_KERNEL 1.f // schedule up to 1 second per kernel call
01327 int max_mono_planes = (int)(MAX_TIME_PER_KERNEL / monoatom_time_per_plane);
01328 int max_multi_planes = (int)(MAX_TIME_PER_KERNEL / multiatom_time_per_plane);
01329
01330 if (max_mono_planes > maxplanes) max_mono_planes = maxplanes;
01331 else if (max_mono_planes < 1) max_mono_planes = 1;
01332
01333 if (max_multi_planes > maxplanes) max_multi_planes = maxplanes;
01334 else if (max_multi_planes < 1) max_multi_planes = 1;
01335
01336 if (getenv("VMDILSVERBOSE")) {
01337 printf("mxblocks = %d myblocks = %d mzblocks = %d\n",
01338 mxblocks, myblocks, mzblocks);
01339 }
01340 if (nblocksperplane > MAX_THREADBLOCKS) {
01341 printf("***** ERROR: Too many blocks in X-Y plane for CUDA kernel\n");
01342 return NULL;
01343 }
01344
01345 if (0 == parms->num_conformers) {
01346 if (getenv("VMDILSVERBOSE")) {
01347 printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01348 "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01349 }
01350 int mzblockoffset = 0;
01351 int nplanes_remaining = mzblocks;
01352 while (nplanes_remaining > 0) {
01353 #ifndef DO_ONE_PLANE
01354
01355 int nplanes = nplanes_remaining;
01356 if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01357 #else
01358
01359 int nplanes = 1;
01360 #endif
01361 dim3 gsz, bsz;
01362 gsz.x = mxblocks;
01363 gsz.y = myblocks * nplanes;
01364 gsz.z = 1;
01365 bsz.x = THBLEN;
01366 bsz.y = THBLEN;
01367 bsz.z = THBLEN;
01368
01369 if ( ! getenv("VMDILSNOEXCL") ) {
01370 #ifdef USE_DISTANCE_EXCL
01371
01372 #ifdef DEBUG
01373 printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01374 "on %2d planes ",
01375 nplanes);
01376 #endif
01377 #if 1
01378 cudaDeviceSynchronize();
01379 #endif
01380 wkf_timer_start(timer);
01381 cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01382 d_excl, d_bin_zero,
01383 parms->nbx, parms->nby,
01384 #ifndef DO_ONE_PLANE
01385 nplanes,
01386 #endif
01387 mzblockoffset);
01388 #if 1
01389 cudaDeviceSynchronize();
01390 #endif
01391 #ifdef DEBUG
01392 printf("%f s\n", wkf_timer_timenow(timer));
01393 #endif
01394 CUERR;
01395 #endif // USE_DISTANCE_EXCL
01396 }
01397
01398
01399 #ifdef DEBUG
01400 printf("*** VMD CUDA: cuda_occupancy_monoatom() "
01401 "on %2d planes ",
01402 nplanes);
01403 #endif
01404 #if 1
01405 cudaDeviceSynchronize();
01406 #endif
01407 wkf_timer_start(timer);
01408 cuda_occupancy_monoatom<<<gsz, bsz, 0>>>(
01409 d_map, d_excl, d_bin_zero,
01410 parms->nbx, parms->nby,
01411 #ifndef DO_ONE_PLANE
01412 nplanes,
01413 #endif
01414 mzblockoffset);
01415 #if 1
01416 cudaDeviceSynchronize();
01417 #endif
01418 #ifdef DEBUG
01419 printf("%f s\n", wkf_timer_timenow(timer));
01420 #endif
01421 CUERR;
01422
01423 nplanes_remaining -= nplanes;
01424 mzblockoffset += nplanes;
01425 }
01426 }
01427
01428 else {
01429 if (getenv("VMDILSVERBOSE")) {
01430 printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01431 "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01432 printf("Scheduling up to %d planes (%d thread blocks) per multiatom "
01433 "kernel call\n", max_multi_planes, nblocksperplane*max_multi_planes);
01434 }
01435
01436 int mzblockoffset;
01437 int nplanes_remaining;
01438
01439 if ( ! getenv("VMDILSNOEXCL") ) {
01440
01441 mzblockoffset = 0;
01442 nplanes_remaining = mzblocks;
01443 while (nplanes_remaining > 0) {
01444 #ifndef DO_ONE_PLANE
01445
01446 int nplanes = nplanes_remaining;
01447 if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01448 #else
01449
01450 int nplanes = 1;
01451 #endif
01452 dim3 gsz, bsz;
01453 gsz.x = mxblocks;
01454 gsz.y = myblocks * nplanes;
01455 gsz.z = 1;
01456 bsz.x = THBLEN;
01457 bsz.y = THBLEN;
01458 bsz.z = THBLEN;
01459
01460 #ifdef USE_DISTANCE_EXCL
01461
01462 #ifdef DEBUG
01463 printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01464 "on %2d planes ",
01465 nplanes);
01466 #endif
01467 #if 1
01468 cudaDeviceSynchronize();
01469 #endif
01470 wkf_timer_start(timer);
01471 cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01472 d_excl, d_bin_zero,
01473 parms->nbx, parms->nby,
01474 #ifndef DO_ONE_PLANE
01475 nplanes,
01476 #endif
01477 mzblockoffset);
01478 #if 1
01479 cudaDeviceSynchronize();
01480 #endif
01481 #ifdef DEBUG
01482 printf("%f s\n", wkf_timer_timenow(timer));
01483 #endif
01484 CUERR;
01485 #endif // USE_DISTANCE_EXCL
01486
01487 #ifdef USE_ENERGY_EXCL
01488
01489 #ifdef DEBUG
01490 printf("*** VMD CUDA: cuda_find_energy_exclusions() "
01491 "on %2d planes ",
01492 nplanes);
01493 #endif
01494 #if 1
01495 cudaDeviceSynchronize();
01496 #endif
01497 wkf_timer_start(timer);
01498 cuda_find_energy_exclusions<<<gsz, bsz, 0>>>(
01499 d_excl, d_bin_zero,
01500 parms->nbx, parms->nby,
01501 #ifndef DO_ONE_PLANE
01502 nplanes,
01503 #endif
01504 mzblockoffset);
01505 #if 1
01506 cudaDeviceSynchronize();
01507 #endif
01508 #ifdef DEBUG
01509 printf("%f s\n", wkf_timer_timenow(timer));
01510 #endif
01511 CUERR;
01512 #endif // USE_ENERGY_EXCL
01513
01514 nplanes_remaining -= nplanes;
01515 mzblockoffset += nplanes;
01516 }
01517 }
01518
01519
01520 mzblockoffset = 0;
01521 nplanes_remaining = mzblocks;
01522 while (nplanes_remaining > 0) {
01523 #ifndef DO_ONE_PLANE
01524
01525 int nplanes = nplanes_remaining;
01526 if (nplanes > max_multi_planes) nplanes = max_multi_planes;
01527 #else
01528
01529 int nplanes = 1;
01530 #endif
01531 dim3 gsz, bsz;
01532 gsz.x = mxblocks;
01533 gsz.y = myblocks * nplanes;
01534 gsz.z = 1;
01535 bsz.x = THBLEN;
01536 bsz.y = THBLEN;
01537 bsz.z = THBLEN;
01538
01539
01540 #ifdef DEBUG
01541 printf("*** VMD CUDA: cuda_occupancy_multiatom() "
01542 "on %2d planes ",
01543 nplanes);
01544 #endif
01545 #if 1
01546 cudaDeviceSynchronize();
01547 #endif
01548 wkf_timer_start(timer);
01549 cuda_occupancy_multiatom<<<gsz, bsz, 0>>>(
01550 d_map, d_excl, d_bin_zero,
01551 parms->nbx, parms->nby,
01552 #ifndef DO_ONE_PLANE
01553 nplanes,
01554 #endif
01555 mzblockoffset);
01556 #if 1
01557 cudaDeviceSynchronize();
01558 #endif
01559 #ifdef DEBUG
01560 printf("%f s\n", wkf_timer_timenow(timer));
01561 #endif
01562 CUERR;
01563
01564 nplanes_remaining -= nplanes;
01565 mzblockoffset += nplanes;
01566 }
01567 }
01568
01569
01570 cudaMemcpy(h_map, d_map, mappad * sizeof(float), cudaMemcpyDeviceToHost);
01571 CUERR;
01572
01573
01574
01575 for (k = 0; k < parms->mz; k++) {
01576 for (j = 0; j < parms->my; j++) {
01577 for (i = 0; i < parms->mx; i++) {
01578 int mindex = (k*parms->my + j)*parms->mx + i;
01579 int block = ((k >> BSHIFT)*myblocks + (j >> BSHIFT))*mxblocks
01580 + (i >> BSHIFT);
01581 int tid = ((k & PADMASK)*THBLEN + (j & PADMASK))*THBLEN + (i & PADMASK);
01582 int pindex = block*(THBLEN*THBLEN*THBLEN) + tid;
01583 parms->map[mindex] = h_map[pindex];
01584 }
01585 }
01586 }
01587
01588 cudaFree(d_excl);
01589 cudaFree(d_bin);
01590 cudaFree(d_map);
01591 CUERR;
01592
01593 free(h_map);
01594 wkf_timer_destroy(timer);
01595
01596 parms->errcode = 0;
01597 return NULL;
01598 }
01599
01600
01601 extern "C" int vmd_cuda_evaluate_occupancy_map(
01602 int mx, int my, int mz,
01603 float *map,
01604
01605
01606 float max_energy,
01607 float cutoff,
01608 float hx, float hy, float hz,
01609 float x0, float y0, float z0,
01610 float bx_1, float by_1, float bz_1,
01611
01612 int nbx, int nby, int nbz,
01613 const float *bin,
01614
01615 const float *bin_zero,
01616
01617 int num_binoffsets,
01618 const char *binoffsets,
01619
01620
01621 int num_extras,
01622 const float *extra,
01623
01624
01625
01626 int num_vdwparms,
01627 const float *vdwparms,
01628
01629
01630 int num_probes,
01631 const float *probevdwparms,
01632
01633
01634 int num_conformers,
01635 const float *conformers
01636
01637 ) {
01638 occthrparms parms;
01639
01640 parms.mx = mx;
01641 parms.my = my;
01642 parms.mz = mz;
01643 parms.map = map;
01644 parms.max_energy = max_energy;
01645 parms.cutoff = cutoff;
01646 #define DEFAULT_EXCL_DIST 1.f
01647 parms.min_excldist = DEFAULT_EXCL_DIST;
01648 parms.hx = hx;
01649 parms.hy = hy;
01650 parms.hz = hz;
01651 parms.x0 = x0;
01652 parms.y0 = y0;
01653 parms.z0 = z0;
01654 parms.bx_1 = bx_1;
01655 parms.by_1 = by_1;
01656 parms.bz_1 = bz_1;
01657 parms.nbx = nbx;
01658 parms.nby = nby;
01659 parms.nbz = nbz;
01660 parms.bin = (flint *) bin;
01661 parms.bin_zero = (flint *) bin_zero;
01662 parms.num_binoffsets = num_binoffsets;
01663 parms.binoffsets = binoffsets;
01664 parms.num_extras = num_extras;
01665 parms.extra = (flint *) extra;
01666 parms.num_vdwparms = num_vdwparms;
01667 parms.vdwparms = vdwparms;
01668 parms.num_probes = num_probes;
01669 parms.probevdwparms = probevdwparms;
01670 parms.num_conformers = num_conformers;
01671 parms.conformers = conformers;
01672
01673 #ifdef DEBUG
01674 printf("*** calling cuda_occupancy_thread()\n");
01675 #endif
01676 cuda_occupancy_thread(&parms);
01677
01678 return parms.errcode;
01679 }