00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <cuda.h>
00026
00027 #include "WKFThreads.h"
00028 #include "WKFUtils.h"
00029 #include "CUDAKernels.h"
00030 #include "utilities.h"
00031
00032
00033
00034 #if defined(VMDTHREADS)
00035 #define USE_CUDADEVPOOL 1
00036 #endif
00037
00038 #if 1
00039 #define CUERR \
00040 do { \
00041 cudaError_t err; \
00042 if ((err = cudaGetLastError()) != cudaSuccess) { \
00043 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00044 printf("Thread aborting...\n"); \
00045 return NULL; \
00046 } \
00047 } while (0)
00048
00049 #else
00050 #define CUERR
00051 #endif
00052
00053
00054
00055 typedef union flint_t {
00056 float f;
00057 int i;
00058 } flint;
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068 #define USE_ENERGY_EXCL // use the energy-based exclusion kernel
00069
00070 #ifdef USE_DISTANCE_EXCL
00071 #define TEST_BLOCK_EXCL // early exit if entire thread block is excluded
00072
00073 #endif
00074
00075 #if defined(USE_ENERGY_EXCL) || defined(USE_DISTANCE_EXCL)
00076 #define TEST_MBLOCK_EXCL // early exit from multiatom kernel
00077 #define TEST_SHIFT_EXCL
00078 #endif
00079
00080 #ifdef TEST_SHIFT_EXCL
00081 #undef TEST_MBLOCK_EXCL
00082 #endif
00083
00084
00085 #define MAX_THREADBLOCKS 65536
00086
00087 #define BIN_DEPTH 8 // number of slots per bin
00088 #define BIN_SLOTSIZE 4 // slot permits x, y, z, vdwtype (4 elements)
00089 #define BIN_SIZE (BIN_DEPTH * BIN_SLOTSIZE) // size given in "flints"
00090 #define BIN_SHIFT 5 // log2(BIN_SIZE)
00091 #define BIN_SLOTSHIFT 2 // log2(BIN_SLOTSIZE)
00092
00093 typedef struct occthrparms_t {
00094 int errcode;
00095
00096 int mx, my, mz;
00097 float *map;
00098
00099
00100 float max_energy;
00101 float cutoff;
00102 float min_excldist;
00103 float hx, hy, hz;
00104 float x0, y0, z0;
00105 float bx_1, by_1, bz_1;
00106
00107 int nbx, nby, nbz;
00108 const flint *bin;
00109
00110 const flint *bin_zero;
00111
00112 int num_binoffsets;
00113 const char *binoffsets;
00114
00115
00116 int num_extras;
00117 const flint *extra;
00118
00119
00120 int num_vdwparms;
00121 const float *vdwparms;
00122
00123
00124 int num_probes;
00125 const float *probevdwparms;
00126
00127
00128 int num_conformers;
00129 const float *conformers;
00130
00131 } occthrparms;
00132
00133
00134
00135
00136 __constant__ static float const_max_energy;
00137 __constant__ static float const_min_occupancy;
00138 __constant__ static float const_cutoff;
00139 __constant__ static float const_min_excldist;
00140 __constant__ static float const_hx, const_hy, const_hz;
00141 __constant__ static float const_x0, const_y0, const_z0;
00142 __constant__ static float const_bx_1, const_by_1, const_bz_1;
00143
00144 __constant__ static float const_inv_numconf;
00145
00146 __constant__ static int const_num_binoffsets;
00147 __constant__ static int const_num_extras;
00148 __constant__ static int const_num_probes;
00149 __constant__ static int const_num_conformers;
00150
00151 #define MAX_VDWPARMS 160
00152 __constant__ static float const_vdwparms[2 * MAX_VDWPARMS];
00153
00154 #define MAX_PROBES 8
00155 __constant__ static float const_probevdwparms[2 * MAX_PROBES];
00156
00157 #define MAX_EXTRAS 50
00158 __constant__ static flint const_extra[BIN_SLOTSIZE * MAX_EXTRAS];
00159
00160
00161 #define MAX_BINOFFSETS 1467
00162 __constant__ static int const_binoffsets[MAX_BINOFFSETS];
00163
00164
00165 #define MAX_EXCLOFFSETS 27
00166 __constant__ static int const_excloffsets[MAX_EXCLOFFSETS];
00167
00168
00169
00170
00171
00172
00173 #define MAX_CONFORMERS 586
00174 __constant__ static float const_conformers[3*MAX_PROBES*MAX_CONFORMERS];
00175
00176
00177 #define DO_ONE_PLANE // optimze when z-slabs are slices of thickness 1
00178 #undef DO_ONE_PLANE
00179
00180 #define THBLEN 4 // thread block length of cube
00181 #define NTHREADSPERBLOCK (THBLEN*THBLEN*THBLEN)
00182
00183 #define BSHIFT 2 // shift for map points to blocks = log2(THBLEN)
00184
00185 #define NTPBSHIFT 6 // do bit shifting instead of int multiplies
00186
00187
00188
00189
00190 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00191 #define NCONFPERLOOP 5
00192 #else
00193 #define NCONFPERLOOP 8
00194 #endif
00195
00196 #define NTPB_TIMES_NCPL (NTHREADSPERBLOCK * NCONFPERLOOP)
00197
00198 #define PADMASK (THBLEN-1) // bit mask for padding up to multiple of THBLEN
00199
00200 #define MAX_ALLOWABLE_ENERGY 87.f // otherwise expf(-u) gives 0
00201
00202
00204
00205
00206
00208
00209 #ifdef USE_DISTANCE_EXCL
00210 __global__ static void cuda_find_distance_exclusions(
00211 int *excl,
00212 const flint *bin_zero,
00213 int nbx, int nby,
00214 #ifndef DO_ONE_PLANE
00215 int mzblocks,
00216 #endif
00217 int mzblockoffset
00218 ) {
00219
00220
00221 __shared__ flint atombin[BIN_SIZE];
00222
00223 #ifndef DO_ONE_PLANE
00224 const int nblockx = gridDim.x;
00225 const int nblocky = gridDim.y / mzblocks;
00226
00227
00228 const int bidx = blockIdx.x;
00229 const int bidy = blockIdx.y % nblocky;
00230 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00231
00232
00233
00234 const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00235
00236
00237 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00238
00239
00240 float px = (bidx<<BSHIFT) * const_hx;
00241 float py = (bidy<<BSHIFT) * const_hy;
00242 float pz = (bidz<<BSHIFT) * const_hz;
00243 #else
00244
00245
00246
00247
00248 const int bid
00249 = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00250
00251
00252 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00253
00254
00255 float px = (blockIdx.x<<BSHIFT) * const_hx;
00256 float py = (blockIdx.y<<BSHIFT) * const_hy;
00257 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00258 #endif
00259
00260 const int mindex = (bid<<NTPBSHIFT) + tid;
00261 int isexcluded = 0;
00262
00263 const float mindis2 = const_min_excldist * const_min_excldist;
00264
00265 const int ib = (int) floorf(px * const_bx_1);
00266 const int jb = (int) floorf(py * const_by_1);
00267 const int kb = (int) floorf(pz * const_bz_1);
00268
00269
00270 const int binid = (__umul24(kb,nby) + jb)*nbx + ib;
00271
00272 int n, nb;
00273
00274 px += threadIdx.x * const_hx;
00275 py += threadIdx.y * const_hy;
00276 pz += threadIdx.z * const_hz;
00277
00278 px += const_x0;
00279 py += const_y0;
00280 pz += const_z0;
00281
00282 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00283
00284 for (nb = 0; nb < MAX_EXCLOFFSETS; nb++) {
00285
00286 if (tid < BIN_SIZE) {
00287 const flint *p_bin = bin_zero
00288 + ((binid + const_excloffsets[nb])<<BIN_SHIFT);
00289 atombin[tid] = p_bin[tid];
00290 }
00291 __syncthreads();
00292
00293 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00294
00295 if (-1 == atombin[n+3].i) break;
00296 const float dx = px - atombin[n ].f;
00297 const float dy = py - atombin[n+1].f;
00298 const float dz = pz - atombin[n+2].f;
00299 const float r2 = dx*dx + dy*dy + dz*dz;
00300
00301 if (r2 <= mindis2) isexcluded = 1;
00302
00303 }
00304
00305 __syncthreads();
00306 }
00307
00308 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00309 const float dx = px - const_extra[n ].f;
00310 const float dy = py - const_extra[n+1].f;
00311 const float dz = pz - const_extra[n+2].f;
00312 const float r2 = dx*dx + dy*dy + dz*dz;
00313
00314 if (r2 <= mindis2) isexcluded = 1;
00315
00316 }
00317
00318 excl[mindex] = isexcluded;
00319 }
00320 #endif // USE_DISTANCE_EXCL
00321
00322
00323 #ifdef USE_ENERGY_EXCL
00324 __global__ static void cuda_find_energy_exclusions(
00325 int *excl,
00326 const flint *bin_zero,
00327 int nbx, int nby,
00328 #ifndef DO_ONE_PLANE
00329 int mzblocks,
00330 #endif
00331 int mzblockoffset
00332 ) {
00333
00334
00335 __shared__ flint atombin[BIN_SIZE];
00336
00337 #ifdef TEST_BLOCK_EXCL
00338 #if 0
00339 __shared__ int sh_isall_upper_warp;
00340 #endif
00341 __shared__ int sh_sum[NTHREADSPERBLOCK];
00342 #endif
00343
00344 #ifndef DO_ONE_PLANE
00345 const int nblockx = gridDim.x;
00346 const int nblocky = gridDim.y / mzblocks;
00347
00348
00349 const int bidx = blockIdx.x;
00350 const int bidy = blockIdx.y % nblocky;
00351 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00352
00353
00354
00355 const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00356
00357
00358 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00359
00360
00361 float px = (bidx<<BSHIFT) * const_hx;
00362 float py = (bidy<<BSHIFT) * const_hy;
00363 float pz = (bidz<<BSHIFT) * const_hz;
00364 #else
00365
00366
00367
00368
00369 const int bid
00370 = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00371
00372
00373 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00374
00375
00376 float px = (blockIdx.x<<BSHIFT) * const_hx;
00377 float py = (blockIdx.y<<BSHIFT) * const_hy;
00378 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00379 #endif
00380
00381 const int mindex = (bid<<NTPBSHIFT) + tid;
00382 #ifdef USE_DISTANCE_EXCL
00383 int isexcluded = excl[mindex];
00384 #else
00385 int isexcluded = 0;
00386 #endif
00387
00388 #ifdef TEST_BLOCK_EXCL
00389 #if 0
00390
00391
00392 int isall = __all(isexcluded);
00393 if (32==tid) sh_isall_upper_warp = isall;
00394 if (0==isany+sh_isall_upper_warp) return;
00395 #endif
00396 sh_sum[tid] = !(isexcluded);
00397 int nbrsum = sh_sum[tid^1];
00398 sh_sum[tid] += nbrsum;
00399 nbrsum = sh_sum[tid^2];
00400 sh_sum[tid] += nbrsum;
00401 nbrsum = sh_sum[tid^4];
00402 sh_sum[tid] += nbrsum;
00403 nbrsum = sh_sum[tid^8];
00404 sh_sum[tid] += nbrsum;
00405 nbrsum = sh_sum[tid^16];
00406 sh_sum[tid] += nbrsum;
00407 __syncthreads();
00408 nbrsum = sh_sum[tid^32];
00409 __syncthreads();
00410 sh_sum[tid] += nbrsum;
00411
00412 if (0==sh_sum[tid]) return;
00413 #endif
00414
00415 const float cutoff2 = const_cutoff * const_cutoff;
00416 const float probe_vdweps = const_probevdwparms[0];
00417 const float probe_vdwrmin = const_probevdwparms[1];
00418
00419 const int ib = (int) floorf(px * const_bx_1);
00420 const int jb = (int) floorf(py * const_by_1);
00421 const int kb = (int) floorf(pz * const_bz_1);
00422
00423
00424 const int binid = (__umul24(kb,nby) + jb)*nbx + ib;
00425
00426 int n, nb;
00427
00428 px += threadIdx.x * const_hx;
00429 py += threadIdx.y * const_hy;
00430 pz += threadIdx.z * const_hz;
00431
00432 px += const_x0;
00433 py += const_y0;
00434 pz += const_z0;
00435
00436 float u = 0.f;
00437
00438 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00439
00440 for (nb = 0; nb < const_num_binoffsets; nb++) {
00441
00442 if (tid < BIN_SIZE) {
00443 const flint *p_bin = bin_zero
00444 + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00445 atombin[tid] = p_bin[tid];
00446 }
00447 __syncthreads();
00448
00449 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00450 int vdwtype = atombin[n+3].i;
00451 if (-1 == vdwtype) break;
00452 vdwtype <<= 1;
00453
00454 const float dx = px - atombin[n ].f;
00455 const float dy = py - atombin[n+1].f;
00456 const float dz = pz - atombin[n+2].f;
00457 const float r2 = dx*dx + dy*dy + dz*dz;
00458 if (r2 < cutoff2) {
00459 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00460 const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00461 float rm6 = rmin*rmin / r2;
00462 rm6 = rm6 * rm6 * rm6;
00463 u += epsilon * rm6 * (rm6 - 2.f);
00464 }
00465 }
00466
00467 __syncthreads();
00468 }
00469
00470 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00471 const int vdwtype = const_extra[n+3].i << 1;
00472 const float dx = px - const_extra[n ].f;
00473 const float dy = py - const_extra[n+1].f;
00474 const float dz = pz - const_extra[n+2].f;
00475 const float r2 = dx*dx + dy*dy + dz*dz;
00476 if (r2 < cutoff2) {
00477 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00478 const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00479 float rm6 = rmin*rmin / r2;
00480 rm6 = rm6 * rm6 * rm6;
00481 u += epsilon * rm6 * (rm6 - 2.f);
00482 }
00483 }
00484
00485 if (u >= const_max_energy) isexcluded = 1;
00486
00487 excl[mindex] = isexcluded;
00488 }
00489 #endif // USE_ENERGY_EXCL
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502 __global__ static void cuda_occupancy_monoatom(
00503 float *map,
00504 const int *excl,
00505 const flint *bin_zero,
00506 int nbx, int nby,
00507 #ifndef DO_ONE_PLANE
00508 int mzblocks,
00509 #endif
00510 int mzblockoffset
00511 ) {
00512
00513
00514 __shared__ flint atombin[BIN_SIZE];
00515
00516 #ifdef TEST_BLOCK_EXCL
00517 #if 0
00518 __shared__ int sh_isall_upper_warp;
00519 #endif
00520 __shared__ int sh_sum[NTHREADSPERBLOCK];
00521 #endif
00522
00523 #ifndef DO_ONE_PLANE
00524 const int nblockx = gridDim.x;
00525 const int nblocky = gridDim.y / mzblocks;
00526
00527
00528 const int bidx = blockIdx.x;
00529 const int bidy = blockIdx.y % nblocky;
00530 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00531
00532
00533
00534 const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00535
00536
00537 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00538
00539
00540 float px = (bidx<<BSHIFT) * const_hx;
00541 float py = (bidy<<BSHIFT) * const_hy;
00542 float pz = (bidz<<BSHIFT) * const_hz;
00543 #else
00544
00545
00546
00547
00548 const int bid
00549 = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00550
00551
00552 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00553
00554
00555 float px = (blockIdx.x<<BSHIFT) * const_hx;
00556 float py = (blockIdx.y<<BSHIFT) * const_hy;
00557 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00558 #endif
00559
00560 const int mindex = (bid<<NTPBSHIFT) + tid;
00561 const int isexcluded = excl[mindex];
00562
00563 #ifdef TEST_BLOCK_EXCL
00564 #if 0
00565
00566
00567 int isall = __all(isexcluded);
00568 if (32==tid) sh_isall_upper_warp = isall;
00569 if (0==isany+sh_isall_upper_warp) return;
00570 #endif
00571 sh_sum[tid] = !(isexcluded);
00572 int nbrsum = sh_sum[tid^1];
00573 sh_sum[tid] += nbrsum;
00574 nbrsum = sh_sum[tid^2];
00575 sh_sum[tid] += nbrsum;
00576 nbrsum = sh_sum[tid^4];
00577 sh_sum[tid] += nbrsum;
00578 nbrsum = sh_sum[tid^8];
00579 sh_sum[tid] += nbrsum;
00580 nbrsum = sh_sum[tid^16];
00581 sh_sum[tid] += nbrsum;
00582 __syncthreads();
00583 nbrsum = sh_sum[tid^32];
00584 __syncthreads();
00585 sh_sum[tid] += nbrsum;
00586
00587 if (0==sh_sum[tid]) return;
00588 #endif
00589
00590 const float cutoff2 = const_cutoff * const_cutoff;
00591 const float probe_vdweps = const_probevdwparms[0];
00592 const float probe_vdwrmin = const_probevdwparms[1];
00593
00594 const int ib = (int) floorf(px * const_bx_1);
00595 const int jb = (int) floorf(py * const_by_1);
00596 const int kb = (int) floorf(pz * const_bz_1);
00597
00598
00599 const int binid = (__umul24(kb,nby) + jb)*nbx + ib;
00600
00601 int n, nb;
00602
00603 px += threadIdx.x * const_hx;
00604 py += threadIdx.y * const_hy;
00605 pz += threadIdx.z * const_hz;
00606
00607 px += const_x0;
00608 py += const_y0;
00609 pz += const_z0;
00610
00611 float u = 0.f;
00612
00613 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00614
00615 for (nb = 0; nb < const_num_binoffsets; nb++) {
00616
00617 if (tid < BIN_SIZE) {
00618 const flint *p_bin = bin_zero
00619 + ((binid + const_binoffsets[nb])<<BIN_SHIFT);
00620 atombin[tid] = p_bin[tid];
00621 }
00622 __syncthreads();
00623
00624 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00625 int vdwtype = atombin[n+3].i;
00626 if (-1 == vdwtype) break;
00627 vdwtype <<= 1;
00628
00629 const float dx = px - atombin[n ].f;
00630 const float dy = py - atombin[n+1].f;
00631 const float dz = pz - atombin[n+2].f;
00632 const float r2 = dx*dx + dy*dy + dz*dz;
00633 if (r2 < cutoff2) {
00634 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00635 const float rmin = const_vdwparms[vdwtype+1] + probe_vdwrmin;
00636 float rm6 = rmin*rmin / r2;
00637 rm6 = rm6 * rm6 * rm6;
00638 u += epsilon * rm6 * (rm6 - 2.f);
00639 }
00640 }
00641
00642 __syncthreads();
00643 }
00644
00645 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00646 const int vdwtype = const_extra[n+3].i << 1;
00647 const float dx = px - const_extra[n ].f;
00648 const float dy = py - const_extra[n+1].f;
00649 const float dz = pz - const_extra[n+2].f;
00650 const float r2 = dx*dx + dy*dy + dz*dz;
00651 if (r2 < cutoff2) {
00652 const float epsilon = const_vdwparms[vdwtype] * probe_vdweps;
00653 const float rmin = const_vdwparms[vdwtype + 1] + probe_vdwrmin;
00654 float rm6 = rmin*rmin / r2;
00655 rm6 = rm6 * rm6 * rm6;
00656 u += epsilon * rm6 * (rm6 - 2.f);
00657 }
00658 }
00659
00660 float occ = 0.f;
00661 if (!isexcluded && u < const_max_energy) {
00662 occ = expf(-u);
00663 }
00664 map[mindex] = occ;
00665 }
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677 __global__ static void cuda_occupancy_multiatom(
00678 float *map,
00679 const int *excl,
00680 const flint *bin_zero,
00681 int nbx, int nby,
00682 #ifndef DO_ONE_PLANE
00683 int mzblocks,
00684 #endif
00685 int mzblockoffset
00686 ) {
00687
00688
00689 __shared__ flint atombin[BIN_SIZE];
00690
00691
00692 __shared__ float usum[NTPB_TIMES_NCPL];
00693
00694
00695
00696 __shared__ float conf[3*MAX_PROBES * NCONFPERLOOP];
00697
00698 #if defined(TEST_MBLOCK_EXCL) || defined(TEST_SHIFT_EXCL)
00699 #if 0
00700 __shared__ int sh_isany_upper_warp;
00701 #endif
00702
00703
00704
00705 __shared__ flint sh_buffer[NTHREADSPERBLOCK];
00706 #endif
00707
00708 #ifndef DO_ONE_PLANE
00709 const int nblockx = gridDim.x;
00710 const int nblocky = gridDim.y / mzblocks;
00711
00712
00713 const int bidx = blockIdx.x;
00714 const int bidy = blockIdx.y % nblocky;
00715 const int bidz = blockIdx.y / nblocky + mzblockoffset;
00716
00717
00718
00719 const int bid = (__umul24(bidz,nblocky) + bidy)*nblockx + bidx;
00720
00721
00722 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00723
00724
00725 float px = (bidx<<BSHIFT) * const_hx;
00726 float py = (bidy<<BSHIFT) * const_hy;
00727 float pz = (bidz<<BSHIFT) * const_hz;
00728 #else
00729
00730
00731
00732
00733 const int bid
00734 = (__umul24(mzblockoffset,gridDim.y) + blockIdx.y)*gridDim.x + blockIdx.x;
00735
00736
00737 const int tid = (((threadIdx.z<<BSHIFT) + threadIdx.y)<<BSHIFT) + threadIdx.x;
00738
00739
00740 float px = (blockIdx.x<<BSHIFT) * const_hx;
00741 float py = (blockIdx.y<<BSHIFT) * const_hy;
00742 float pz = (mzblockoffset<<BSHIFT) * const_hz;
00743 #endif
00744
00745 const int mindex = (bid<<NTPBSHIFT) + tid;
00746 const int isincluded = !(excl[mindex]);
00747
00748 #ifdef TEST_MBLOCK_EXCL
00749 #if 0
00750
00751
00752 int isany = __any(isincluded);
00753 if (32==tid) sh_isany_upper_warp = isany;
00754 if (0==isany+sh_isany_upper_warp) return;
00755 #endif
00756
00757 sh_buffer[tid].i = isincluded;
00758 int nbrsum = sh_buffer[tid^1].i;
00759 sh_buffer[tid].i += nbrsum;
00760 nbrsum = sh_buffer[tid^2].i;
00761 sh_buffer[tid].i += nbrsum;
00762 nbrsum = sh_buffer[tid^4].i;
00763 sh_buffer[tid].i += nbrsum;
00764 nbrsum = sh_buffer[tid^8].i;
00765 sh_buffer[tid].i += nbrsum;
00766 nbrsum = sh_buffer[tid^16].i;
00767 sh_buffer[tid].i += nbrsum;
00768 __syncthreads();
00769 nbrsum = sh_buffer[tid^32].i;
00770 __syncthreads();
00771 sh_buffer[tid].i += nbrsum;
00772
00773 if (0==sh_buffer[tid].i) {
00774 map[mindex] = 0.f;
00775 return;
00776 }
00777 #endif
00778
00779 #ifdef TEST_SHIFT_EXCL
00780 int numincl = 0;
00781 {
00782 int psum = isincluded;
00783 sh_buffer[tid].i = isincluded;
00784
00785 int nbrid = tid^1;
00786 int nbrsum = sh_buffer[nbrid].i;
00787 if (nbrid < tid) psum += nbrsum;
00788 sh_buffer[tid].i += nbrsum;
00789
00790 nbrid = tid^2;
00791 nbrsum = sh_buffer[nbrid].i;
00792 if (nbrid < tid) psum += nbrsum;
00793 sh_buffer[tid].i += nbrsum;
00794
00795 nbrid = tid^4;
00796 nbrsum = sh_buffer[nbrid].i;
00797 if (nbrid < tid) psum += nbrsum;
00798 sh_buffer[tid].i += nbrsum;
00799
00800 nbrid = tid^8;
00801 nbrsum = sh_buffer[nbrid].i;
00802 if (nbrid < tid) psum += nbrsum;
00803 sh_buffer[tid].i += nbrsum;
00804
00805 nbrid = tid^16;
00806 nbrsum = sh_buffer[nbrid].i;
00807 if (nbrid < tid) psum += nbrsum;
00808 sh_buffer[tid].i += nbrsum;
00809
00810 __syncthreads();
00811 nbrid = tid^32;
00812 nbrsum = sh_buffer[nbrid].i;
00813 if (nbrid < tid) psum += nbrsum;
00814 __syncthreads();
00815 sh_buffer[tid].i += nbrsum;
00816
00817 numincl = sh_buffer[tid].i;
00818 if (0==numincl) {
00819 map[mindex] = 0.f;
00820 return;
00821 }
00822 __syncthreads();
00823 if (isincluded) {
00824 sh_buffer[psum-1].i = tid;
00825 }
00826 }
00827 __syncthreads();
00828 const int sid = sh_buffer[tid].i;
00829 #endif
00830
00831 const float cutoff2 = const_cutoff * const_cutoff;
00832
00833 const int ib = (int) floorf(px * const_bx_1);
00834 const int jb = (int) floorf(py * const_by_1);
00835 const int kb = (int) floorf(pz * const_bz_1);
00836
00837
00838 const int binid = (__umul24(kb,nby) + jb)*nbx + ib;
00839
00840 int n, nb;
00841 int m;
00842 int ma;
00843 int nc;
00844
00845 #ifdef TEST_SHIFT_EXCL
00846 if (tid < numincl) {
00847 int ssid = sid;
00848 const int sid_x = ssid & PADMASK;
00849 ssid >>= BSHIFT;
00850 const int sid_y = ssid & PADMASK;
00851 ssid >>= BSHIFT;
00852 const int sid_z = ssid;
00853 px += sid_x * const_hx;
00854 py += sid_y * const_hy;
00855 pz += sid_z * const_hz;
00856 }
00857 #else
00858 px += threadIdx.x * const_hx;
00859 py += threadIdx.y * const_hy;
00860 pz += threadIdx.z * const_hz;
00861 #endif
00862
00863 px += const_x0;
00864 py += const_y0;
00865 pz += const_z0;
00866
00867 #ifdef TEST_SHIFT_EXCL
00868 sh_buffer[tid].f = 0.f;
00869 #else
00870 float occ = 0.f;
00871 #endif
00872
00873 const int twice_num_probes = const_num_probes << 1;
00874 const int num_extra_slots = const_num_extras << BIN_SLOTSHIFT;
00875
00876
00877
00878 const int total_shmem_copy = __umul24((3*NCONFPERLOOP),const_num_probes);
00879
00880
00881 for (m = 0; m < const_num_conformers; m += NCONFPERLOOP) {
00882
00883
00884 for (nc = 0; nc < NTPB_TIMES_NCPL; nc += NTHREADSPERBLOCK) {
00885 usum[nc + tid] = 0.f;
00886 }
00887
00888
00889
00890
00891 __syncthreads();
00892 if (tid < 32) {
00893
00894
00895 const int npass = (total_shmem_copy >> 5);
00896
00897
00898 const int nleft = (total_shmem_copy & 31);
00899
00900
00901 const int skip = __umul24((const_num_probes + twice_num_probes), m);
00902
00903 int off = 0;
00904 for (nc = 0; nc < npass; nc++, off += 32) {
00905 conf[off + tid] = const_conformers[skip + off + tid];
00906 }
00907 if (tid < nleft) {
00908 conf[off + tid] = const_conformers[skip + off + tid];
00909 }
00910 }
00911 __syncthreads();
00912
00913 for (nb = 0; nb < const_num_binoffsets; nb++) {
00914
00915 __syncthreads();
00916 if (tid < BIN_SIZE) {
00917 const flint *p_bin = bin_zero
00918 + ((binid + const_binoffsets[nb]) << BIN_SHIFT);
00919 atombin[tid] = p_bin[tid];
00920 }
00921 __syncthreads();
00922
00923 #ifdef TEST_SHIFT_EXCL
00924 if (tid < numincl) {
00925 #endif
00926
00927 for (n = 0; n < BIN_SIZE; n += BIN_SLOTSIZE) {
00928 int vdwtype = atombin[n+3].i;
00929 if (-1 == vdwtype) break;
00930 vdwtype <<= 1;
00931 const float epsilon_atom = const_vdwparms[vdwtype];
00932 const float rmin_atom = const_vdwparms[vdwtype+1];
00933 const float ax = px - atombin[n ].f;
00934 const float ay = py - atombin[n+1].f;
00935 const float az = pz - atombin[n+2].f;
00936
00937 int sindex = 0;
00938
00939
00940 for (nc = 0; nc < NTPB_TIMES_NCPL; nc += NTHREADSPERBLOCK) {
00941
00942 for (ma = 0; ma < twice_num_probes; ma += 2) {
00943 const float dx = conf[sindex++] + ax;
00944 const float dy = conf[sindex++] + ay;
00945 const float dz = conf[sindex++] + az;
00946 const float r2 = dx*dx + dy*dy + dz*dz;
00947 if (r2 < cutoff2) {
00948 const float epsilon = epsilon_atom * const_probevdwparms[ma];
00949 const float rmin = rmin_atom + const_probevdwparms[ma+1];
00950 float rm6 = rmin*rmin / r2;
00951 rm6 = rm6 * rm6 * rm6;
00952
00953
00954
00955
00956
00957
00958
00959
00960
00961
00962
00963 usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
00964 }
00965 }
00966
00967 }
00968
00969 }
00970 #ifdef TEST_SHIFT_EXCL
00971 }
00972 #endif
00973
00974 }
00975
00976 #ifdef TEST_SHIFT_EXCL
00977 if (tid < numincl) {
00978 #endif
00979
00980 for (n = 0; n < num_extra_slots; n += BIN_SLOTSIZE) {
00981 const int vdwtype = const_extra[n+3].i << 1;
00982 const float epsilon_atom = const_vdwparms[vdwtype];
00983 const float rmin_atom = const_vdwparms[vdwtype+1];
00984 const float ax = px - const_extra[n ].f;
00985 const float ay = py - const_extra[n+1].f;
00986 const float az = pz - const_extra[n+2].f;
00987 int sindex = 0;
00988
00989
00990 for (nc = 0; nc < NTPB_TIMES_NCPL; nc += NTHREADSPERBLOCK) {
00991
00992 for (ma = 0; ma < twice_num_probes; ma += 2) {
00993 const float dx = conf[sindex++] + ax;
00994 const float dy = conf[sindex++] + ay;
00995 const float dz = conf[sindex++] + az;
00996 const float r2 = dx*dx + dy*dy + dz*dz;
00997 if (r2 < cutoff2) {
00998 const float epsilon = epsilon_atom * const_probevdwparms[ma];
00999 const float rmin = rmin_atom + const_probevdwparms[ma+1];
01000 float rm6 = rmin*rmin / r2;
01001 rm6 = rm6 * rm6 * rm6;
01002 usum[nc + tid] += epsilon * rm6 * (rm6 - 2.f);
01003 }
01004 }
01005
01006 }
01007
01008 }
01009
01010 for (nc = 0; nc < NCONFPERLOOP; nc++) {
01011
01012 if (m + nc < const_num_conformers) {
01013 float u = usum[(nc << NTPBSHIFT) + tid];
01014
01015 #ifdef TEST_SHIFT_EXCL
01016 if (u < MAX_ALLOWABLE_ENERGY) {
01017 sh_buffer[sid].f += expf(-u);
01018 }
01019 #else
01020 if (isincluded && u < MAX_ALLOWABLE_ENERGY) {
01021 occ += expf(-u);
01022 }
01023 #endif
01024 }
01025
01026 }
01027 #ifdef TEST_SHIFT_EXCL
01028 }
01029 #endif
01030
01031 }
01032
01033 #ifdef TEST_SHIFT_EXCL
01034 __syncthreads();
01035 sh_buffer[tid].f *= const_inv_numconf;
01036 if (sh_buffer[tid].f <= const_min_occupancy) sh_buffer[tid].f = 0.f;
01037
01038 map[mindex] = sh_buffer[tid].f;
01039 #else
01040 occ = occ * const_inv_numconf;
01041 if (occ <= const_min_occupancy) occ = 0.f;
01042
01043 map[mindex] = occ;
01044 #endif
01045
01046 }
01047
01048
01049
01050 static void *cuda_occupancy_thread(void *voidparms) {
01051 occthrparms *parms = NULL;
01052 int mxpad, mypad, mzpad, mappad;
01053 float *h_map;
01054 float *d_map;
01055 int *d_excl;
01056 int binsz;
01057 flint *d_bin;
01058 flint *d_bin_zero;
01059 int gpuid = -1;
01060 int numgpus = 0;
01061 int i, j, k;
01062
01063
01064 if (getenv("VMDILSCUDADEVICE")) {
01065 gpuid = atoi(getenv("VMDILSCUDADEVICE"));
01066 }
01067
01068 if (gpuid < 0) {
01069 if (vmd_cuda_num_devices(&numgpus) != VMDCUDA_ERR_NONE) {
01070 printf("ILS CUDA device init error\n");
01071 return NULL;
01072 }
01073 if (numgpus > 0) {
01074
01075
01076
01077
01078
01079 gpuid = 0;
01080 } else {
01081 return NULL;
01082 }
01083 }
01084
01085 if (gpuid >= 0) {
01086 printf("Using CUDA device: %d\n", gpuid);
01087 } else {
01088
01089 return NULL;
01090 }
01091
01092 wkf_timerhandle timer = wkf_timer_create();
01093
01094 parms = (occthrparms *) voidparms;
01095 parms->errcode = -1;
01096
01097 if (getenv("VMDILSVERBOSE")) {
01098
01099 printf("*****************************************************************\n");
01100 printf("mx = %d my = %d mz = %d\n", parms->mx, parms->my, parms->mz);
01101 printf("hx = %g hy = %g hz = %g\n", parms->hx, parms->hy, parms->hz);
01102 printf("x0 = %g y0 = %g z0 = %g\n", parms->x0, parms->y0, parms->z0);
01103 printf("bx_1 = %g by_1 = %g bz_1 = %g\n",
01104 parms->bx_1, parms->by_1, parms->bz_1);
01105 printf("nbx = %d nby = %d nbz = %d\n", parms->nbx, parms->nby, parms->nbz);
01106 printf("num_binoffsets = %d\n", parms->num_binoffsets);
01107 printf("num_extras = %d\n", parms->num_extras);
01108 printf("num_vdwparms = %d\n", parms->num_vdwparms);
01109 printf("num_probes = %d\n", parms->num_probes);
01110 printf("num_conformers = %d\n", parms->num_conformers);
01111 printf("*****************************************************************\n");
01112 }
01113
01114
01115 if (parms->num_binoffsets > MAX_BINOFFSETS) {
01116 printf("***** ERROR: Exceeded MAX_BINOFFSETS for CUDA kernel\n");
01117 return NULL;
01118 }
01119 if (parms->num_extras > MAX_EXTRAS) {
01120 printf("***** ERROR: Exceeded MAX_EXTRAS for CUDA kernel\n");
01121 return NULL;
01122 }
01123 if (parms->num_vdwparms > MAX_VDWPARMS) {
01124 printf("***** ERROR: Exceeded MAX_VDWPARMS for CUDA kernel\n");
01125 return NULL;
01126 }
01127 if (parms->num_probes > MAX_PROBES) {
01128 printf("***** ERROR: Exceeded MAX_PROBES for CUDA kernel\n");
01129 return NULL;
01130 }
01131 if (parms->num_conformers > MAX_CONFORMERS) {
01132 printf("***** ERROR: Exceeded MAX_CONFORMERS for CUDA kernel\n");
01133 return NULL;
01134 }
01135
01136
01137 cudaError_t rc;
01138 rc = cudaSetDevice(gpuid);
01139 if (rc != cudaSuccess) {
01140 #if CUDART_VERSION >= 2010
01141 rc = cudaGetLastError();
01142 if (rc != cudaErrorSetOnActiveProcess) {
01143 return NULL;
01144 }
01145 #else
01146 cudaGetLastError();
01147
01148 #endif
01149 }
01150
01151
01152 mxpad = (parms->mx + PADMASK) & ~PADMASK;
01153 mypad = (parms->my + PADMASK) & ~PADMASK;
01154 mzpad = (parms->mz + PADMASK) & ~PADMASK;
01155 mappad = mxpad * mypad * mzpad;
01156
01157 if (getenv("VMDILSVERBOSE")) {
01158 printf("mxpad=%d mypad=%d mzpad=%d mappad=%d\n",
01159 mxpad, mypad, mzpad, mappad);
01160 }
01161
01162 h_map = (float *) malloc(mappad * sizeof(float));
01163 if (getenv("VMDILSVERBOSE")) {
01164 printf("Allocate %g MB for map\n", mappad*sizeof(float)/(1024*1024.f));
01165 }
01166 cudaMalloc((void **) &d_map, mappad * sizeof(float));
01167 CUERR;
01168 if (getenv("VMDILSVERBOSE")) {
01169 printf("Allocate %g MB for exclusions\n", mappad*sizeof(int)/(1024*1024.f));
01170 }
01171 cudaMalloc((void **) &d_excl, mappad * sizeof(int));
01172 CUERR;
01173
01174 #if !defined(USE_DISTANCE_EXCL) && !defined(USE_ENERGY_EXCL)
01175
01176 cudaMemset(d_excl, 0, mappad * sizeof(int));
01177 CUERR;
01178 #endif
01179
01180 binsz = BIN_SIZE * parms->nbx * parms->nby * parms->nbz;
01181 if (getenv("VMDILSVERBOSE")) {
01182 printf("nbx=%d nby=%d nbz=%d binsz=%d\n",
01183 parms->nbx, parms->nby, parms->nbz, binsz);
01184 printf("Allocate %g MB for atom bins\n", binsz*sizeof(flint)/(1024*1024.f));
01185 }
01186 cudaMalloc((void **) &d_bin, binsz * sizeof(flint));
01187 CUERR;
01188 cudaMemcpy(d_bin, parms->bin, binsz * sizeof(flint), cudaMemcpyHostToDevice);
01189 CUERR;
01190 d_bin_zero = d_bin + (parms->bin_zero - parms->bin);
01191
01192 if (getenv("VMDILSVERBOSE")) {
01193 printf("parms delta bin = %d delta bin = %d\n",
01194 (parms->bin_zero - parms->bin), (d_bin_zero - d_bin));
01195
01196 printf("probe epsilon=%g rmin=%g\n",
01197 parms->probevdwparms[0], parms->probevdwparms[1]);
01198 printf("max_energy=%g\n", parms->max_energy);
01199 printf("cutoff=%g\n", parms->cutoff);
01200 printf("hx=%g hy=%g hz=%g\n", parms->hx, parms->hy, parms->hz);
01201 printf("x0=%g y0=%g z0=%g\n", parms->x0, parms->y0, parms->z0);
01202 printf("bx_1=%g by_1=%g bz_1=%g\n", parms->bx_1, parms->by_1, parms->bz_1);
01203 }
01204
01205 cudaMemcpyToSymbol(const_max_energy, &(parms->max_energy), sizeof(float), 0);
01206 CUERR;
01207 float min_occ = expf(-parms->max_energy);
01208 cudaMemcpyToSymbol(const_min_occupancy, &min_occ, sizeof(float), 0);
01209 CUERR;
01210 cudaMemcpyToSymbol(const_cutoff, &(parms->cutoff), sizeof(float), 0);
01211 CUERR;
01212 cudaMemcpyToSymbol(const_min_excldist, &(parms->min_excldist),
01213 sizeof(float), 0);
01214 CUERR;
01215 cudaMemcpyToSymbol(const_hx, &(parms->hx), sizeof(float), 0);
01216 CUERR;
01217 cudaMemcpyToSymbol(const_hy, &(parms->hy), sizeof(float), 0);
01218 CUERR;
01219 cudaMemcpyToSymbol(const_hz, &(parms->hz), sizeof(float), 0);
01220 CUERR;
01221 cudaMemcpyToSymbol(const_x0, &(parms->x0), sizeof(float), 0);
01222 CUERR;
01223 cudaMemcpyToSymbol(const_y0, &(parms->y0), sizeof(float), 0);
01224 CUERR;
01225 cudaMemcpyToSymbol(const_z0, &(parms->z0), sizeof(float), 0);
01226 CUERR;
01227 cudaMemcpyToSymbol(const_bx_1, &(parms->bx_1), sizeof(float), 0);
01228 CUERR;
01229 cudaMemcpyToSymbol(const_by_1, &(parms->by_1), sizeof(float), 0);
01230 CUERR;
01231 cudaMemcpyToSymbol(const_bz_1, &(parms->bz_1), sizeof(float), 0);
01232 CUERR;
01233 float inv_numconf = 1.f / parms->num_conformers;
01234 cudaMemcpyToSymbol(const_inv_numconf, &inv_numconf, sizeof(float), 0);
01235
01236 if (getenv("VMDILSVERBOSE")) {
01237 printf("num_binoffsets = %d\n", parms->num_binoffsets);
01238 }
01239 cudaMemcpyToSymbol(const_num_binoffsets, &(parms->num_binoffsets),
01240 sizeof(int), 0);
01241 CUERR;
01242 if (getenv("VMDILSVERBOSE")) {
01243 printf("num_extras = %d\n", parms->num_extras);
01244 }
01245 cudaMemcpyToSymbol(const_num_extras, &(parms->num_extras), sizeof(int), 0);
01246 CUERR;
01247 if (getenv("VMDILSVERBOSE")) {
01248 printf("num_probes = %d\n", parms->num_probes);
01249 }
01250 cudaMemcpyToSymbol(const_num_probes, &(parms->num_probes), sizeof(int), 0);
01251 CUERR;
01252 if (getenv("VMDILSVERBOSE")) {
01253 printf("num_conformers = %d\n", parms->num_conformers);
01254 }
01255 cudaMemcpyToSymbol(const_num_conformers, &(parms->num_conformers),
01256 sizeof(int), 0);
01257 CUERR;
01258
01259 if (getenv("VMDILSVERBOSE")) {
01260 printf("num_vdwparms = %d\n", parms->num_vdwparms);
01261 for (i = 0; i < parms->num_vdwparms; i++) {
01262 printf(" %2d: epsilon=%g rmin=%g\n", i,
01263 parms->vdwparms[2*i], parms->vdwparms[2*i+1]);
01264 }
01265 }
01266
01267 cudaMemcpyToSymbol(const_vdwparms, parms->vdwparms,
01268 2 * parms->num_vdwparms * sizeof(float), 0);
01269 CUERR;
01270 cudaMemcpyToSymbol(const_probevdwparms, parms->probevdwparms,
01271 2 * (0==parms->num_probes ? 1 : parms->num_probes) * sizeof(float), 0);
01272 CUERR;
01273 cudaMemcpyToSymbol(const_extra, parms->extra,
01274 BIN_SLOTSIZE * parms->num_extras * sizeof(flint), 0);
01275 CUERR;
01276
01277
01278 {
01279 const int nbx = parms->nbx;
01280 const int nby = parms->nby;
01281 int n;
01282 int *h_flatbinoffset = (int *) calloc(parms->num_binoffsets, sizeof(int));
01283 for (n = 0; n < parms->num_binoffsets; n++) {
01284 int i = (int) (parms->binoffsets[3*n ]);
01285 int j = (int) (parms->binoffsets[3*n+1]);
01286 int k = (int) (parms->binoffsets[3*n+2]);
01287 int index = (k*nby + j)*nbx + i;
01288 h_flatbinoffset[n] = index;
01289 }
01290 cudaMemcpyToSymbol(const_binoffsets, h_flatbinoffset,
01291 parms->num_binoffsets * sizeof(int), 0);
01292 CUERR;
01293 free(h_flatbinoffset);
01294 }
01295
01296
01297 {
01298 const int nbx = parms->nbx;
01299 const int nby = parms->nby;
01300 int i, j, k, n=0;
01301 int h_flatexcloffset[27];
01302 for (k = -1; k <= 1; k++) {
01303 for (j = -1; j <= 1; j++) {
01304 for (i = -1; i <= 1; i++) {
01305 int index = (k*nby + j)*nbx + i;
01306 h_flatexcloffset[n++] = index;
01307 }
01308 }
01309 }
01310 cudaMemcpyToSymbol(const_excloffsets, h_flatexcloffset, 27*sizeof(int), 0);
01311 CUERR;
01312 }
01313
01314 cudaMemcpyToSymbol(const_conformers, parms->conformers,
01315 3 * parms->num_probes * parms->num_conformers * sizeof(float), 0);
01316 CUERR;
01317
01318
01319
01320
01321 #define TIME_PER_MONOATOM_WORKUNIT 1.24553913519431e-10
01322 #define TIME_PER_MULTIATOM_WORKUNIT 5.24559386973180e-09
01323
01324
01325
01326 int mxblocks = (mxpad >> BSHIFT);
01327 int myblocks = (mypad >> BSHIFT);
01328 int mzblocks = (mzpad >> BSHIFT);
01329 int nblocksperplane = mxblocks * myblocks;
01330 int maxplanes = MAX_THREADBLOCKS / nblocksperplane;
01331 int numconf = (0==parms->num_conformers ? 1 : parms->num_conformers);
01332
01333 double monoatom_time_per_plane = (double)(parms->num_probes * numconf *
01334 parms->num_binoffsets * nblocksperplane) * TIME_PER_MONOATOM_WORKUNIT;
01335
01336 double multiatom_time_per_plane = (double)(parms->num_probes * numconf *
01337 parms->num_binoffsets * nblocksperplane) * TIME_PER_MULTIATOM_WORKUNIT;
01338
01339
01340 #define MAX_TIME_PER_KERNEL 1.f // schedule up to 1 second per kernel call
01341 int max_mono_planes = (int)(MAX_TIME_PER_KERNEL / monoatom_time_per_plane);
01342 int max_multi_planes = (int)(MAX_TIME_PER_KERNEL / multiatom_time_per_plane);
01343
01344 if (max_mono_planes > maxplanes) max_mono_planes = maxplanes;
01345 else if (max_mono_planes < 1) max_mono_planes = 1;
01346
01347 if (max_multi_planes > maxplanes) max_multi_planes = maxplanes;
01348 else if (max_multi_planes < 1) max_multi_planes = 1;
01349
01350 if (getenv("VMDILSVERBOSE")) {
01351 printf("mxblocks = %d myblocks = %d mzblocks = %d\n",
01352 mxblocks, myblocks, mzblocks);
01353 }
01354 if (nblocksperplane > MAX_THREADBLOCKS) {
01355 printf("***** ERROR: Too many blocks in X-Y plane for CUDA kernel\n");
01356 return NULL;
01357 }
01358
01359 if (0 == parms->num_conformers) {
01360 if (getenv("VMDILSVERBOSE")) {
01361 printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01362 "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01363 }
01364 int mzblockoffset = 0;
01365 int nplanes_remaining = mzblocks;
01366 while (nplanes_remaining > 0) {
01367 #ifndef DO_ONE_PLANE
01368
01369 int nplanes = nplanes_remaining;
01370 if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01371 #else
01372
01373 int nplanes = 1;
01374 #endif
01375 dim3 gsz, bsz;
01376 gsz.x = mxblocks;
01377 gsz.y = myblocks * nplanes;
01378 gsz.z = 1;
01379 bsz.x = THBLEN;
01380 bsz.y = THBLEN;
01381 bsz.z = THBLEN;
01382
01383 if ( ! getenv("VMDILSNOEXCL") ) {
01384 #ifdef USE_DISTANCE_EXCL
01385
01386 #ifdef DEBUG
01387 printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01388 "on %2d planes ",
01389 nplanes);
01390 #endif
01391 #if 1
01392 cudaThreadSynchronize();
01393 #endif
01394 wkf_timer_start(timer);
01395 cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01396 d_excl, d_bin_zero,
01397 parms->nbx, parms->nby,
01398 #ifndef DO_ONE_PLANE
01399 nplanes,
01400 #endif
01401 mzblockoffset);
01402 #if 1
01403 cudaThreadSynchronize();
01404 #endif
01405 #ifdef DEBUG
01406 printf("%f s\n", wkf_timer_timenow(timer));
01407 #endif
01408 CUERR;
01409 #endif // USE_DISTANCE_EXCL
01410 }
01411
01412
01413 #ifdef DEBUG
01414 printf("*** VMD CUDA: cuda_occupancy_monoatom() "
01415 "on %2d planes ",
01416 nplanes);
01417 #endif
01418 #if 1
01419 cudaThreadSynchronize();
01420 #endif
01421 wkf_timer_start(timer);
01422 cuda_occupancy_monoatom<<<gsz, bsz, 0>>>(
01423 d_map, d_excl, d_bin_zero,
01424 parms->nbx, parms->nby,
01425 #ifndef DO_ONE_PLANE
01426 nplanes,
01427 #endif
01428 mzblockoffset);
01429 #if 1
01430 cudaThreadSynchronize();
01431 #endif
01432 #ifdef DEBUG
01433 printf("%f s\n", wkf_timer_timenow(timer));
01434 #endif
01435 CUERR;
01436
01437 nplanes_remaining -= nplanes;
01438 mzblockoffset += nplanes;
01439 }
01440 }
01441
01442 else {
01443 if (getenv("VMDILSVERBOSE")) {
01444 printf("Scheduling up to %d planes (%d thread blocks) per monoatom "
01445 "kernel call\n", max_mono_planes, nblocksperplane*max_mono_planes);
01446 printf("Scheduling up to %d planes (%d thread blocks) per multiatom "
01447 "kernel call\n", max_multi_planes, nblocksperplane*max_multi_planes);
01448 }
01449
01450 int mzblockoffset;
01451 int nplanes_remaining;
01452
01453 if ( ! getenv("VMDILSNOEXCL") ) {
01454
01455 mzblockoffset = 0;
01456 nplanes_remaining = mzblocks;
01457 while (nplanes_remaining > 0) {
01458 #ifndef DO_ONE_PLANE
01459
01460 int nplanes = nplanes_remaining;
01461 if (nplanes > max_mono_planes) nplanes = max_mono_planes;
01462 #else
01463
01464 int nplanes = 1;
01465 #endif
01466 dim3 gsz, bsz;
01467 gsz.x = mxblocks;
01468 gsz.y = myblocks * nplanes;
01469 gsz.z = 1;
01470 bsz.x = THBLEN;
01471 bsz.y = THBLEN;
01472 bsz.z = THBLEN;
01473
01474 #ifdef USE_DISTANCE_EXCL
01475
01476 #ifdef DEBUG
01477 printf("*** VMD CUDA: cuda_find_distance_exclusions() "
01478 "on %2d planes ",
01479 nplanes);
01480 #endif
01481 #if 1
01482 cudaThreadSynchronize();
01483 #endif
01484 wkf_timer_start(timer);
01485 cuda_find_distance_exclusions<<<gsz, bsz, 0>>>(
01486 d_excl, d_bin_zero,
01487 parms->nbx, parms->nby,
01488 #ifndef DO_ONE_PLANE
01489 nplanes,
01490 #endif
01491 mzblockoffset);
01492 #if 1
01493 cudaThreadSynchronize();
01494 #endif
01495 #ifdef DEBUG
01496 printf("%f s\n", wkf_timer_timenow(timer));
01497 #endif
01498 CUERR;
01499 #endif // USE_DISTANCE_EXCL
01500
01501 #ifdef USE_ENERGY_EXCL
01502
01503 #ifdef DEBUG
01504 printf("*** VMD CUDA: cuda_find_energy_exclusions() "
01505 "on %2d planes ",
01506 nplanes);
01507 #endif
01508 #if 1
01509 cudaThreadSynchronize();
01510 #endif
01511 wkf_timer_start(timer);
01512 cuda_find_energy_exclusions<<<gsz, bsz, 0>>>(
01513 d_excl, d_bin_zero,
01514 parms->nbx, parms->nby,
01515 #ifndef DO_ONE_PLANE
01516 nplanes,
01517 #endif
01518 mzblockoffset);
01519 #if 1
01520 cudaThreadSynchronize();
01521 #endif
01522 #ifdef DEBUG
01523 printf("%f s\n", wkf_timer_timenow(timer));
01524 #endif
01525 CUERR;
01526 #endif // USE_ENERGY_EXCL
01527
01528 nplanes_remaining -= nplanes;
01529 mzblockoffset += nplanes;
01530 }
01531 }
01532
01533
01534 mzblockoffset = 0;
01535 nplanes_remaining = mzblocks;
01536 while (nplanes_remaining > 0) {
01537 #ifndef DO_ONE_PLANE
01538
01539 int nplanes = nplanes_remaining;
01540 if (nplanes > max_multi_planes) nplanes = max_multi_planes;
01541 #else
01542
01543 int nplanes = 1;
01544 #endif
01545 dim3 gsz, bsz;
01546 gsz.x = mxblocks;
01547 gsz.y = myblocks * nplanes;
01548 gsz.z = 1;
01549 bsz.x = THBLEN;
01550 bsz.y = THBLEN;
01551 bsz.z = THBLEN;
01552
01553
01554 #ifdef DEBUG
01555 printf("*** VMD CUDA: cuda_occupancy_multiatom() "
01556 "on %2d planes ",
01557 nplanes);
01558 #endif
01559 #if 1
01560 cudaThreadSynchronize();
01561 #endif
01562 wkf_timer_start(timer);
01563 cuda_occupancy_multiatom<<<gsz, bsz, 0>>>(
01564 d_map, d_excl, d_bin_zero,
01565 parms->nbx, parms->nby,
01566 #ifndef DO_ONE_PLANE
01567 nplanes,
01568 #endif
01569 mzblockoffset);
01570 #if 1
01571 cudaThreadSynchronize();
01572 #endif
01573 #ifdef DEBUG
01574 printf("%f s\n", wkf_timer_timenow(timer));
01575 #endif
01576 CUERR;
01577
01578 nplanes_remaining -= nplanes;
01579 mzblockoffset += nplanes;
01580 }
01581 }
01582
01583
01584 cudaMemcpy(h_map, d_map, mappad * sizeof(float), cudaMemcpyDeviceToHost);
01585 CUERR;
01586
01587
01588
01589 for (k = 0; k < parms->mz; k++) {
01590 for (j = 0; j < parms->my; j++) {
01591 for (i = 0; i < parms->mx; i++) {
01592 int mindex = (k*parms->my + j)*parms->mx + i;
01593 int block = ((k >> BSHIFT)*myblocks + (j >> BSHIFT))*mxblocks
01594 + (i >> BSHIFT);
01595 int tid = ((k & PADMASK)*THBLEN + (j & PADMASK))*THBLEN + (i & PADMASK);
01596 int pindex = block*(THBLEN*THBLEN*THBLEN) + tid;
01597 parms->map[mindex] = h_map[pindex];
01598 }
01599 }
01600 }
01601
01602 cudaFree(d_excl);
01603 cudaFree(d_bin);
01604 cudaFree(d_map);
01605 CUERR;
01606
01607 free(h_map);
01608 wkf_timer_destroy(timer);
01609
01610 parms->errcode = 0;
01611 return NULL;
01612 }
01613
01614
01615 extern "C" int vmd_cuda_evaluate_occupancy_map(
01616 int mx, int my, int mz,
01617 float *map,
01618
01619
01620 float max_energy,
01621 float cutoff,
01622 float hx, float hy, float hz,
01623 float x0, float y0, float z0,
01624 float bx_1, float by_1, float bz_1,
01625
01626 int nbx, int nby, int nbz,
01627 const float *bin,
01628
01629 const float *bin_zero,
01630
01631 int num_binoffsets,
01632 const char *binoffsets,
01633
01634
01635 int num_extras,
01636 const float *extra,
01637
01638
01639
01640 int num_vdwparms,
01641 const float *vdwparms,
01642
01643
01644 int num_probes,
01645 const float *probevdwparms,
01646
01647
01648 int num_conformers,
01649 const float *conformers
01650
01651 ) {
01652 occthrparms parms;
01653
01654 parms.mx = mx;
01655 parms.my = my;
01656 parms.mz = mz;
01657 parms.map = map;
01658 parms.max_energy = max_energy;
01659 parms.cutoff = cutoff;
01660 #define DEFAULT_EXCL_DIST 1.f
01661 parms.min_excldist = DEFAULT_EXCL_DIST;
01662 parms.hx = hx;
01663 parms.hy = hy;
01664 parms.hz = hz;
01665 parms.x0 = x0;
01666 parms.y0 = y0;
01667 parms.z0 = z0;
01668 parms.bx_1 = bx_1;
01669 parms.by_1 = by_1;
01670 parms.bz_1 = bz_1;
01671 parms.nbx = nbx;
01672 parms.nby = nby;
01673 parms.nbz = nbz;
01674 parms.bin = (flint *) bin;
01675 parms.bin_zero = (flint *) bin_zero;
01676 parms.num_binoffsets = num_binoffsets;
01677 parms.binoffsets = binoffsets;
01678 parms.num_extras = num_extras;
01679 parms.extra = (flint *) extra;
01680 parms.num_vdwparms = num_vdwparms;
01681 parms.vdwparms = vdwparms;
01682 parms.num_probes = num_probes;
01683 parms.probevdwparms = probevdwparms;
01684 parms.num_conformers = num_conformers;
01685 parms.conformers = conformers;
01686
01687 #ifdef DEBUG
01688 printf("*** calling cuda_occupancy_thread()\n");
01689 #endif
01690 cuda_occupancy_thread(&parms);
01691
01692 return parms.errcode;
01693 }