ComputeNonbondedCUDAKernel.cu

#include "CudaUtils.h"
#include "ComputeNonbondedCUDAKernel.h"
#include <stdio.h>

#ifdef NAMD_CUDA

#ifdef WIN32
#define __thread __declspec(thread)
#endif

texture<unsigned int, 1, cudaReadModeElementType> tex_exclusions;
static __thread int exclusions_size;
static __thread unsigned int *exclusions;

__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS];
static __thread unsigned int *overflow_exclusions;

#define SET_EXCL(EXCL,BASE,DIFF) \
         (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))

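// SET_EXCL sets one bit in a packed exclusion bitmask: >>5 selects the 32-bit
// word that holds bit (BASE+DIFF), and &31 selects the bit within that word.
// Worked example (illustrative only, using a hypothetical mask array):
//
//   unsigned int mask[4] = {0, 0, 0, 0};
//   SET_EXCL(mask, 32, 1);   // bit 33: mask[1] |= 1<<1
//   SET_EXCL(mask, 0, 31);   // bit 31: mask[0] |= 1<<31
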
void cuda_bind_exclusions(const unsigned int *t, int n) {
  exclusions_size = n;
  static __thread int exclusions_alloc;
  if ( exclusions && exclusions_alloc < exclusions_size ) {
    cudaFree(exclusions);
    cuda_errcheck("freeing exclusions");
    cudaFree(overflow_exclusions);
    cuda_errcheck("freeing overflow_exclusions");
    exclusions = 0;
  }
  if ( ! exclusions ) {
    exclusions_alloc = exclusions_size;
    cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));
    cuda_errcheck("malloc exclusions");
    cudaMalloc((void**) &overflow_exclusions, n*sizeof(unsigned int));
    cuda_errcheck("malloc overflow_exclusions");
  }
  cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy exclusions");
  tex_exclusions.normalized = false;
  tex_exclusions.addressMode[0] = cudaAddressModeClamp;
  tex_exclusions.filterMode = cudaFilterModePoint;
  cudaBindTexture(NULL, tex_exclusions, exclusions, n*sizeof(unsigned int));
  cuda_errcheck("binding exclusions to texture");

  cudaMemcpy(overflow_exclusions, t,
                n*sizeof(unsigned int), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to overflow_exclusions");
  int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
  cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
  cuda_errcheck("memcpy to const_exclusions");
}
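
// The table is staged three ways: bound to tex_exclusions, mirrored in
// overflow_exclusions in global memory, and its first MAX_CONST_EXCLUSIONS
// words copied into constant memory. Illustrative sketch (not part of the
// original file) of reading back one bit under the packing SET_EXCL produces;
// the real exclusion test lives in ComputeNonbondedCUDAKernelBase.h and this
// hypothetical helper is unused here:
static __device__ bool cuda_excl_bit_is_set(const unsigned int *excl, int i) {
  return ( excl[i>>5] & (1u << (i & 31)) ) != 0;  // word i>>5, bit i&31
}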

texture<float2, 1, cudaReadModeElementType> lj_table;
static __thread int lj_table_size;

void cuda_bind_lj_table(const float2 *t, int _lj_table_size) {
    static __thread float2 *ct;
    static __thread int lj_table_alloc;
    lj_table_size = _lj_table_size;
    if ( ct && lj_table_alloc < lj_table_size ) {
      cudaFree(ct);
      cuda_errcheck("freeing lj table");
      ct = 0;
    }
    if ( ! ct ) {
      lj_table_alloc = lj_table_size;
      cudaMalloc((void**) &ct, lj_table_size*lj_table_size*sizeof(float2));
      cuda_errcheck("allocating lj table");
    }
    cudaMemcpy(ct, t, lj_table_size*lj_table_size*sizeof(float2),
                                            cudaMemcpyHostToDevice);
    cuda_errcheck("memcpy to lj table");

    lj_table.normalized = false;
    lj_table.addressMode[0] = cudaAddressModeClamp;
    lj_table.filterMode = cudaFilterModePoint;

    cudaBindTexture((size_t*)0, lj_table, ct,
        lj_table_size*lj_table_size*sizeof(float2));
    cuda_errcheck("binding lj table to texture");
}
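
// Illustrative sketch (not part of the original file): the LJ table is a
// flattened lj_table_size x lj_table_size matrix of float2 coefficients, one
// entry per ordered pair of vdW types. Point filtering with unnormalized
// coordinates makes tex1Dfetch an exact array read. Hypothetical helper,
// unused here:
static __device__ float2 lj_pair_params_sketch(int vdw_i, int vdw_j, int table_size) {
  return tex1Dfetch(lj_table, vdw_i*table_size + vdw_j);
}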

texture<float4, 1, cudaReadModeElementType> force_table;
texture<float4, 1, cudaReadModeElementType> energy_table;

void cuda_bind_force_table(const float4 *t, const float4 *et) {
    static __thread cudaArray *ct;
    static __thread cudaArray *ect;
    if ( ! ct ) {
      cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
      cuda_errcheck("allocating force table");
    }
    if ( ! ect ) {
      cudaMallocArray(&ect, &energy_table.channelDesc, FORCE_TABLE_SIZE, 1);
      cuda_errcheck("allocating energy table");
    }
    cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
    // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
    cuda_errcheck("memcpy to force table");
    cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
    cuda_errcheck("memcpy to energy table");

    force_table.normalized = true;
    force_table.addressMode[0] = cudaAddressModeClamp;
    force_table.addressMode[1] = cudaAddressModeClamp;
    force_table.filterMode = cudaFilterModeLinear;

    energy_table.normalized = true;
    energy_table.addressMode[0] = cudaAddressModeClamp;
    energy_table.addressMode[1] = cudaAddressModeClamp;
    energy_table.filterMode = cudaFilterModeLinear;

    cudaBindTextureToArray(force_table, ct);
    cuda_errcheck("binding force table to texture");

    cudaBindTextureToArray(energy_table, ect);
    cuda_errcheck("binding energy table to texture");
}
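
// Unlike the point-sampled tables above, these tables are bound to cudaArrays
// with normalized coordinates and linear filtering, so the texture hardware
// interpolates between adjacent float4 entries at no instruction cost.
// Illustrative lookup (hypothetical; the real coordinate computation is in
// the kernel headers):
//
//   float4 fi = tex1D(force_table, r2_table_coord);  // coord in [0,1)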

static __thread int num_patches;
static __thread int num_virials;
static __thread int num_atoms;
// Size of each device array, followed by the array pointer
static __thread int patch_pairs_size;
static __thread patch_pair* patch_pairs;
static __thread int atom_params_size;
static __thread atom_param* atom_params;
static __thread int vdw_types_size;
static __thread int* vdw_types;
static __thread int atoms_size;
static __thread atom* atoms;

static __thread int tmpforces_size;
static __thread float4* tmpforces;
static __thread int slow_tmpforces_size;
static __thread float4* slow_tmpforces;

static __thread int tmpvirials_size;
static __thread float* tmpvirials;
static __thread int slow_tmpvirials_size;
static __thread float* slow_tmpvirials;

static __thread int global_counters_size;
static __thread unsigned int* global_counters;
static __thread int plist_size;
static __thread unsigned int* plist;
static __thread int exclmasks_size;
static __thread exclmask* exclmasks;
// Device pointers to the page-locked host arrays (provided by the ComputeNonbondedCUDA class)
static __thread float4* forces;
static __thread float4* slow_forces;
static __thread int* force_ready_queue;
static __thread float* virials;
static __thread float* slow_virials;
static __thread int* block_order;

// GBIS arrays
static __thread int intRad0D_size;
static __thread float *intRad0D;

static __thread int intRadSD_size;
static __thread float *intRadSD;

static __thread GBReal *psiSumD;        // host-mapped memory

static __thread int tmp_psiSumD_size;
static __thread GBReal *tmp_psiSumD;

static __thread int bornRadD_size;
static __thread float *bornRadD;

static __thread GBReal *dEdaSumD;       // host-mapped memory

static __thread int tmp_dEdaSumD_size;
static __thread GBReal *tmp_dEdaSumD;

static __thread int dHdrPrefixD_size;
static __thread float *dHdrPrefixD;

static __thread int GBIS_P1_counters_size;
static __thread unsigned int *GBIS_P1_counters;

static __thread int GBIS_P2_counters_size;
static __thread unsigned int *GBIS_P2_counters;

static __thread int GBIS_P3_counters_size;
static __thread unsigned int *GBIS_P3_counters;

static __thread float *energy_gbis;     // host-mapped memory

static __thread int tmp_energy_gbis_size;
static __thread float *tmp_energy_gbis;

__thread int max_grid_size;

__thread cudaStream_t stream;
__thread cudaStream_t stream2;

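// All of the state above is __thread (thread-local): in NAMD's CUDA build a
// device is driven by its own host thread, so each device-managing thread
// keeps an independent copy of these sizes, pointers, and streams.
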
void cuda_init() {
  patch_pairs_size = 0;
  patch_pairs = NULL;

  atom_params_size = 0;
  atom_params = NULL;

  vdw_types_size = 0;
  vdw_types = NULL;

  atoms_size = 0;
  atoms = NULL;

  tmpforces_size = 0;
  tmpforces = NULL;

  slow_tmpforces_size = 0;
  slow_tmpforces = NULL;

  tmpvirials_size = 0;
  tmpvirials = NULL;

  slow_tmpvirials_size = 0;
  slow_tmpvirials = NULL;

  global_counters_size = 0;
  global_counters = NULL;

  plist_size = 0;
  plist = NULL;

  exclmasks_size = 0;
  exclmasks = NULL;

  forces = NULL;
  slow_forces = NULL;

  force_ready_queue = NULL;

  exclusions_size = 0;
  exclusions = NULL;

  // --------------------
  // For GBIS
  // --------------------
  intRad0D_size = 0;
  intRad0D = NULL;

  intRadSD_size = 0;
  intRadSD = NULL;

  psiSumD = NULL;        // host-mapped memory

  tmp_psiSumD_size = 0;
  tmp_psiSumD = NULL;

  bornRadD_size = 0;
  bornRadD = NULL;

  dEdaSumD = NULL;       // host-mapped memory

  tmp_dEdaSumD_size = 0;
  tmp_dEdaSumD = NULL;

  dHdrPrefixD_size = 0;
  dHdrPrefixD = NULL;

  GBIS_P1_counters_size = 0;
  GBIS_P1_counters = NULL;

  GBIS_P2_counters_size = 0;
  GBIS_P2_counters = NULL;

  GBIS_P3_counters_size = 0;
  GBIS_P3_counters = NULL;

  energy_gbis = NULL;     // host-mapped memory

  tmp_energy_gbis_size = 0;
  tmp_energy_gbis = NULL;

  int dev;
  cudaGetDevice(&dev);
  cuda_errcheck("cudaGetDevice");
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  cuda_errcheck("cudaGetDeviceProperties");
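  // maxGridSize[1] is the y-dimension limit (65535 on all CUDA hardware of
  // this era); the launch loops below use 1-D grids in x, so this serves as
  // a conservative bound that holds on every supported device.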
  max_grid_size = deviceProp.maxGridSize[1];
}
00292 
00293 void cuda_bind_patch_pairs(patch_pair *h_patch_pairs, int npatch_pairs,
00294                            int npatches, int natoms, int plist_len, 
00295                            int nexclmask) {
00296   num_patches = npatches;
00297   num_virials = npatches;
00298   num_atoms = natoms;
00299   reallocate_device<patch_pair>(&patch_pairs, &patch_pairs_size, npatch_pairs, 1.2f);
00300   reallocate_device<atom>(&atoms, &atoms_size, num_atoms, 1.2f);
00301   reallocate_device<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
00302   reallocate_device<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
00303   reallocate_device<unsigned int>(&global_counters, &global_counters_size, num_patches+2, 1.2f);
00304   reallocate_device<float4>(&tmpforces, &tmpforces_size, num_atoms, 1.2f);
00305   reallocate_device<float4>(&slow_tmpforces, &slow_tmpforces_size, num_atoms, 1.2f);
00306   reallocate_device<unsigned int>(&plist, &plist_size, plist_len, 1.2f);
00307   reallocate_device<exclmask>(&exclmasks, &exclmasks_size, nexclmask, 1.2f);
00308   reallocate_device<float>(&tmpvirials, &tmpvirials_size, num_patches*16, 1.2f);
00309   reallocate_device<float>(&slow_tmpvirials, &slow_tmpvirials_size, num_patches*16, 1.2f);
00310 
00311   // For GBIS
00312   reallocate_device<unsigned int>(&GBIS_P1_counters, &GBIS_P1_counters_size, num_patches, 1.2f);
00313   reallocate_device<unsigned int>(&GBIS_P2_counters, &GBIS_P2_counters_size, num_patches, 1.2f);
00314   reallocate_device<unsigned int>(&GBIS_P3_counters, &GBIS_P3_counters_size, num_patches, 1.2f);
00315   reallocate_device<float>(&intRad0D, &intRad0D_size, num_atoms, 1.2f);
00316   reallocate_device<float>(&intRadSD, &intRadSD_size, num_atoms, 1.2f);
00317   reallocate_device<GBReal>(&tmp_psiSumD, &tmp_psiSumD_size, num_atoms, 1.2f);
00318   reallocate_device<float>(&bornRadD, &bornRadD_size, num_atoms, 1.2f);
00319   reallocate_device<GBReal>(&tmp_dEdaSumD, &tmp_dEdaSumD_size, num_atoms, 1.2f);
00320   reallocate_device<float>(&dHdrPrefixD, &dHdrPrefixD_size, num_atoms, 1.2f);
00321   reallocate_device<float>(&tmp_energy_gbis, &tmp_energy_gbis_size, num_patches, 1.2f);
00322 
00323   cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);
00324   cuda_errcheck("memcpy to patch_pairs");
00325 
00326   cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
00327   cuda_errcheck("memset global_counters");
00328 
00329   cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
00330   cuda_errcheck("memset GBIS_P1_counters");
00331 
00332   cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
00333   cuda_errcheck("memset GBIS_P2_counters");
00334 
00335   cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
00336   cuda_errcheck("memset GBIS_P3_counters");
00337 
00338   cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
00339   cuda_errcheck("memset tmpforces");
00340 
00341   cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
00342   cuda_errcheck("memset tmpvirials");
00343 
00344   cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
00345   cuda_errcheck("memset slow_tmpforces");
00346 
00347   cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
00348   cuda_errcheck("memset slow_tmpvirials");
00349 
00350 }
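
// The reallocate_device calls above (declared in CudaUtils.h) grow a device
// array only when the requested element count exceeds its recorded capacity;
// the 1.2f factor overallocates by 20% so that gradual growth in atom and
// patch counts does not force a fresh cudaMalloc every step. A minimal sketch
// of the assumed semantics (hypothetical, not the CudaUtils.h implementation):
//
//   template <class T>
//   void reallocate_device_sketch(T **pp, int *curlen, int newlen, float fac) {
//     if ( newlen > *curlen ) {
//       if ( *pp ) cudaFree(*pp);
//       *curlen = (int)(fac*(float)newlen);
//       cudaMalloc((void**)pp, (*curlen)*sizeof(T));
//     }
//   }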

void cuda_bind_atom_params(const atom_param *t) {
  cudaMemcpyAsync(atom_params, t, num_atoms * sizeof(atom_param),
                                cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to atom_params");
}

void cuda_bind_vdw_types(const int *t) {
  cudaMemcpyAsync(vdw_types, t, num_atoms * sizeof(int),
                                cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to vdw_types");
}

void cuda_bind_atoms(const atom *a) {
  cuda_errcheck("before memcpy to atoms");
  cudaMemcpyAsync(atoms, a, num_atoms * sizeof(atom),
                                cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to atoms");
}

void cuda_bind_forces(float4 *f, float4 *f_slow) {
  cudaHostGetDevicePointer(&forces, f, 0);
  cuda_errcheck("cudaHostGetDevicePointer forces");
  cudaHostGetDevicePointer(&slow_forces, f_slow, 0);
  cuda_errcheck("cudaHostGetDevicePointer slow_forces");
}

void cuda_bind_virials(float *v, int *queue, int *blockorder) {
  cudaHostGetDevicePointer(&virials, v, 0);
  cuda_errcheck("cudaHostGetDevicePointer virials");
  slow_virials = virials + num_virials*16;
  cudaHostGetDevicePointer(&force_ready_queue, queue, 0);
  cuda_errcheck("cudaHostGetDevicePointer force_ready_queue");
  cudaHostGetDevicePointer(&block_order, blockorder, 0);
  cuda_errcheck("cudaHostGetDevicePointer block_order");
}
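
// forces, slow_forces, force_ready_queue, virials, and block_order are
// device-side aliases of page-locked host buffers: cudaHostGetDevicePointer
// returns the device pointer for memory allocated with cudaHostAlloc(...,
// cudaHostAllocMapped), so kernels write results straight into host memory
// (zero-copy) with no separate device-to-host copy. Illustrative host-side
// setup (hypothetical; the real allocation is in ComputeNonbondedCUDA):
//
//   float4 *f_host, *f_slow_host;
//   cudaHostAlloc((void**)&f_host, natoms*sizeof(float4), cudaHostAllocMapped);
//   cudaHostAlloc((void**)&f_slow_host, natoms*sizeof(float4), cudaHostAllocMapped);
//   cuda_bind_forces(f_host, f_slow_host);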

// GBIS bindings
void cuda_bind_GBIS_energy(float *e) {
  cudaHostGetDevicePointer(&energy_gbis, e, 0);
  cuda_errcheck("cudaHostGetDevicePointer energy_gbis");
}

void cuda_bind_GBIS_intRad(float *intRad0H, float *intRadSH) {
  cudaMemcpyAsync(intRad0D, intRad0H, num_atoms * sizeof(float),
                                cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(intRadSD, intRadSH, num_atoms * sizeof(float),
                                cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to intRad");
}

void cuda_bind_GBIS_psiSum(GBReal *psiSumH) {
  cudaHostGetDevicePointer(&psiSumD, psiSumH, 0);
  cuda_errcheck("cudaHostGetDevicePointer psiSum");
}

void cuda_bind_GBIS_bornRad(float *bornRadH) {
  cudaMemcpyAsync(bornRadD, bornRadH, num_atoms * sizeof(float),
                                cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to bornRad");
}

void cuda_bind_GBIS_dEdaSum(GBReal *dEdaSumH) {
  cudaHostGetDevicePointer(&dEdaSumD, dEdaSumH, 0);
  cuda_errcheck("cudaHostGetDevicePointer dEdaSum");
}

void cuda_bind_GBIS_dHdrPrefix(float *dHdrPrefixH) {
  cudaMemcpyAsync(dHdrPrefixD, dHdrPrefixH, num_atoms * sizeof(float),
                                cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to dHdrPrefix");
}
// end GBIS methods

#if 0
void cuda_load_forces(float4 *f, float4 *f_slow, int begin, int count) {
  // printf("load forces %d %d %d\n",begin,count,num_atoms);
  cudaMemcpyAsync(f+begin, forces+begin, count * sizeof(float4),
                                cudaMemcpyDeviceToHost, stream);
  if ( f_slow ) {
    cudaMemcpyAsync(f_slow+begin, slow_forces+begin, count * sizeof(float4),
                                cudaMemcpyDeviceToHost, stream);
  }
  cuda_errcheck("memcpy from forces");
}

void cuda_load_virials(float *v, int doSlow) {
  int count = force_lists_size;
  if ( doSlow ) count *= 2;
  cudaMemcpyAsync(v, virials, count * 16*sizeof(float),
                                cudaMemcpyDeviceToHost, stream);
  cuda_errcheck("memcpy from virials");
}
#endif

#if 0
__host__ __device__ static int3 patch_coords_from_id(
        dim3 PATCH_GRID, int id) {

  return make_int3( id % PATCH_GRID.x,
                ( id / PATCH_GRID.x ) % PATCH_GRID.y,
                id / ( PATCH_GRID.x * PATCH_GRID.y ) );
}

__host__ __device__ static int patch_id_from_coords(
        dim3 PATCH_GRID, int3 coords) {

  // handles periodic boundaries
  int x = (coords.x + 4 * PATCH_GRID.x) % PATCH_GRID.x;
  int y = (coords.y + 4 * PATCH_GRID.y) % PATCH_GRID.y;
  int z = (coords.z + 4 * PATCH_GRID.z) % PATCH_GRID.z;

  return ( z * PATCH_GRID.y + y ) * PATCH_GRID.x + x;
}

__host__ __device__ static int3 patch_offset_from_neighbor(int neighbor) {

  // int3 coords = patch_coords_from_id(make_uint3(3,3,3), 13 + neighbor);
  int3 coords = patch_coords_from_id(make_uint3(3,3,3), neighbor);
  return make_int3(coords.x - 1, coords.y - 1, coords.z - 1);

}
#endif

#define BLOCK_SIZE 128
#define SHARED_SIZE 32

#define MAKE_PAIRLIST
#define DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef MAKE_PAIRLIST
#define DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
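
// The eight includes above compile ComputeNonbondedCUDAKernelBase.h once per
// combination of MAKE_PAIRLIST, DO_SLOW, and DO_ENERGY, yielding the eight
// kernel specializations dispatched below (dev_nonbonded through
// dev_nonbonded_slow_energy_pairlist). Each variant has its unused features
// preprocessed away, keeping the inner loops branch-free at the cost of
// compiled code size.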

void cuda_nonbonded_forces(float3 lata, float3 latb, float3 latc,
                           float cutoff2, float plcutoff2,
                           int cbegin, int ccount, int ctotal,
                           int doSlow, int doEnergy, int usePairlists, int savePairlists,
                           int doStreaming, int saveOrder, cudaStream_t &strm) {

 if ( ccount ) {
   if ( usePairlists ) {
     if ( ! savePairlists ) plcutoff2 = 0.;
   } else {
     plcutoff2 = cutoff2;
   }

   cudaMemsetAsync(tmpforces, 0, num_atoms*sizeof(float4), strm);
   cudaMemsetAsync(tmpvirials, 0, num_patches*sizeof(float)*16, strm);
   if ( doSlow ) {
     cudaMemsetAsync(slow_tmpforces, 0, num_atoms*sizeof(float4), strm);
     cudaMemsetAsync(slow_tmpvirials, 0, num_patches*sizeof(float)*16, strm);
   }

   int grid_dim = max_grid_size;  // maximum allowed
   for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
     if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;

     dim3 nthread3(WARPSIZE, NUM_WARP, 1);

#define CALL(X) X<<< grid_dim, nthread3, 0, strm >>>                    \
       (patch_pairs, atoms, atom_params, vdw_types, plist,              \
        tmpforces, (doSlow?slow_tmpforces:NULL),                        \
        forces, (doSlow?slow_forces:NULL),                              \
        tmpvirials, (doSlow?slow_tmpvirials:NULL),                      \
        virials, (doSlow?slow_virials:NULL),                            \
        global_counters, (doStreaming?force_ready_queue:NULL),          \
        overflow_exclusions, num_patches,                               \
        cbegin+cstart, ctotal, (saveOrder?block_order:NULL),            \
        exclmasks, lj_table_size,                                       \
        lata, latb, latc, cutoff2, plcutoff2, doSlow)
// end definition

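     // Dispatch: plcutoff2 doubles as the pairlist flag. It is nonzero when
     // pairlists must be built this step (savePairlists, or pairlists
     // disabled entirely, in which case it equals cutoff2), selecting the
     // pairlist-building kernel variants; it is zeroed when stored pairlists
     // are simply reused. The enclosing cstart loop slices the launch into
     // chunks of at most max_grid_size blocks to respect the grid-size limit.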
     if ( doEnergy ) {
       if ( doSlow ) {
         if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
         else CALL(dev_nonbonded_slow_energy);
       } else {
         if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
         else CALL(dev_nonbonded_energy);
       }
     } else {
       if ( doSlow ) {
         if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
         else CALL(dev_nonbonded_slow);
       } else {
         if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
         else CALL(dev_nonbonded);
       }
     }

     cuda_errcheck("dev_nonbonded");
   }
 }

}

// import GBIS kernel definitions
#include "ComputeGBISCUDAKernel.h"

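// GBIS (Generalized Born Implicit Solvent) runs as a three-pass pipeline:
// cuda_GBIS_P1 accumulates the descreening sums (psiSum) from which the host
// computes Born radii, cuda_GBIS_P2 accumulates dEdaSum plus the GB energy
// and fast forces, and cuda_GBIS_P3 applies the dHdrPrefix terms to produce
// the slow force contributions.
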
//  GBIS P1
void cuda_GBIS_P1(
  int cbegin,
  int ccount,
  int pbegin,
  int pcount,
  float a_cut,
  float rho_0,
  float3 lata,
  float3 latb,
  float3 latc,
  cudaStream_t &strm
) {

 if ( ccount ) {
  cudaMemsetAsync(tmp_psiSumD, 0, num_atoms*sizeof(GBReal), strm);

  int grid_dim = max_grid_size;  // maximum allowed
  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
    if (grid_dim > ccount - cstart) {
      grid_dim = ccount - cstart;
    }

    dim3 nthread3(WARPSIZE, NUM_WARP, 1);
    GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(
      patch_pairs+cbegin+cstart,
      atoms,
      intRad0D,
      intRadSD,
      tmp_psiSumD,
      psiSumD,
      a_cut,
      rho_0,
      lata,
      latb,
      latc,
      GBIS_P1_counters
      );
    cuda_errcheck("dev_GBIS_P1");
  } // end for
 }
} // end GBIS P1

//  GBIS P2
void cuda_GBIS_P2(
  int cbegin,
  int ccount,
  int pbegin,
  int pcount,
  float a_cut,
  float r_cut,
  float scaling,
  float kappa,
  float smoothDist,
  float epsilon_p,
  float epsilon_s,
  float3 lata,
  float3 latb,
  float3 latc,
  int doEnergy,
  int doFullElec,
  cudaStream_t &strm
) {

 if ( ccount ) {
  cudaMemsetAsync(tmp_dEdaSumD, 0, num_atoms*sizeof(GBReal), strm);
  cudaMemsetAsync(tmp_energy_gbis, 0, num_patches*sizeof(float), strm);

  int grid_dim = max_grid_size;  // maximum allowed
  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
    if (grid_dim > ccount - cstart)
      grid_dim = ccount - cstart;

    dim3 nthread3(WARPSIZE, NUM_WARP, 1);
    GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(
      patch_pairs+cbegin+cstart,
      atoms,
      bornRadD,
      tmp_dEdaSumD,
      dEdaSumD,
      a_cut,
      r_cut,
      scaling,
      kappa,
      smoothDist,
      epsilon_p,
      epsilon_s,
      lata,
      latb,
      latc,
      doEnergy,
      doFullElec,
      tmpforces,
      forces,
      tmp_energy_gbis,
      energy_gbis,
      GBIS_P2_counters
      );
    cuda_errcheck("dev_GBIS_P2");
  } // end for
 }
} // end P2

//  GBIS P3
void cuda_GBIS_P3(
  int cbegin,
  int ccount,
  int pbegin,
  int pcount,
  float a_cut,
  float rho_0,
  float scaling,
  float3 lata,
  float3 latb,
  float3 latc,
  cudaStream_t &strm
) {
  int grid_dim = max_grid_size;  // maximum allowed
  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
    if (grid_dim > ccount - cstart)
      grid_dim = ccount - cstart;

    dim3 nthread3(WARPSIZE, NUM_WARP, 1);
    GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(
      patch_pairs+cbegin+cstart,
      atoms,
      intRad0D,
      intRadSD,
      dHdrPrefixD,
      a_cut,
      rho_0,
      scaling,
      lata,
      latb,
      latc,
      slow_tmpforces,
      slow_forces,
      GBIS_P3_counters
      );
    cuda_errcheck("dev_GBIS_P3");
  }
}

#if 0
int cuda_stream_finished() {
  return ( cudaStreamQuery(stream) == cudaSuccess );
}
#endif


#else  // NAMD_CUDA

// for make depends
#include "ComputeNonbondedCUDAKernelBase.h"
#include "ComputeGBISCUDAKernel.h"

#endif  // NAMD_CUDA

