ComputeNonbondedCUDAKernel.h File Reference

Go to the source code of this file.

Classes

struct  exclmask

Defines

#define __align__(X)
#define NUM_WARP   4
#define WARPSIZE   32
#define PATCH_PAIR_SIZE   (sizeof(patch_pair)/4)
#define COPY_ATOM(DEST, SOURCE)
#define COPY_PARAM(DEST, SOURCE)
#define COPY_ATOM_TO_SHARED(ATOM, PARAM, SHARED)
#define COPY_ATOM_FROM_SHARED(ATOM, PARAM, SHARED)
#define MAX_EXCLUSIONS   (1<<27)
#define MAX_CONST_EXCLUSIONS   2048
#define FORCE_TABLE_SIZE   4096

Typedefs

typedef float GBReal

Functions

void cuda_errcheck (const char *msg)
struct __align__ (16) patch_pair
void cuda_bind_exclusions (const unsigned int *t, int n)
void cuda_bind_lj_table (const float2 *t, int _lj_table_size)
void cuda_bind_force_table (const float4 *t, const float4 *et)
void cuda_init ()
void cuda_bind_patch_pairs (patch_pair *h_patch_pairs, int npatch_pairs, int npatches, int natoms, int nexclmask, int plist_len)
void cuda_bind_atom_params (const atom_param *t)
void cuda_bind_vdw_types (const int *t)
void cuda_bind_atoms (const atom *a)
void cuda_bind_forces (float4 *f, float4 *f_slow)
void cuda_bind_virials (float *v, int *queue, int *blockorder)
void cuda_nonbonded_forces (float3 lata, float3 latb, float3 latc, float cutoff2, float plcutoff2, int cbegin, int ccount, int ctotal, int doSlow, int doEnergy, int usePairlists, int savePairlists, int doStreaming, int saveOrder, cudaStream_t &strm)
void cuda_GBIS_P1 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
void cuda_GBIS_P2 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float r_cut, float scaling, float kappa, float smoothDist, float epsilon_p, float epsilon_s, float3 lata, float3 latb, float3 latc, int doEnergy, int doFullElec, cudaStream_t &strm)
void cuda_GBIS_P3 (int cbegin, int ccount, int pbegin, int pcount, float a_cut, float rho_0, float scaling, float3 lata, float3 latb, float3 latc, cudaStream_t &strm)
void cuda_bind_GBIS_intRad (float *intRad0H, float *intRadSH)
void cuda_bind_GBIS_energy (float *energy_gbis)
void cuda_bind_GBIS_psiSum (GBReal *psiSumH)
void cuda_bind_GBIS_bornRad (float *bornRadH)
void cuda_bind_GBIS_dEdaSum (GBReal *dEdaSumH)
void cuda_bind_GBIS_dHdrPrefix (float *dHdrPrefixH)
int cuda_stream_finished ()


Define Documentation

#define __align__ (  ) 

Definition at line 9 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM ( DEST,
SOURCE   ) 

Value:

{ \
  DEST.position.x = SOURCE.position.x; \
  DEST.position.y = SOURCE.position.y; \
  DEST.position.z = SOURCE.position.z; \
  DEST.charge = SOURCE.charge; \
  }

Definition at line 59 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM_FROM_SHARED ( ATOM,
PARAM,
SHARED   ) 

Value:

{ \
    COPY_ATOM( ATOM, SHARED ) \
    COPY_PARAM( PARAM, SHARED ) \
  }

Definition at line 79 of file ComputeNonbondedCUDAKernel.h.

#define COPY_ATOM_TO_SHARED ( ATOM,
PARAM,
SHARED   ) 

Value:

{ \
    COPY_ATOM( SHARED, ATOM ) \
    COPY_PARAM( SHARED, PARAM ) \
  }

Definition at line 74 of file ComputeNonbondedCUDAKernel.h.

#define COPY_PARAM ( DEST,
SOURCE   ) 

Value:

{ \
  DEST.sqrt_epsilon = SOURCE.sqrt_epsilon; \
  DEST.half_sigma = SOURCE.half_sigma; \
  DEST.index = SOURCE.index; \
  DEST.excl_index = SOURCE.excl_index; \
  DEST.excl_maxdiff = SOURCE.excl_maxdiff; \
  }

Definition at line 66 of file ComputeNonbondedCUDAKernel.h.

#define FORCE_TABLE_SIZE   4096

Definition at line 97 of file ComputeNonbondedCUDAKernel.h.

Referenced by ComputeNonbondedCUDA::build_force_table(), and cuda_bind_force_table().

#define MAX_CONST_EXCLUSIONS   2048

Definition at line 87 of file ComputeNonbondedCUDAKernel.h.

Referenced by CudaComputeNonbondedKernel::bindExclusions(), and cuda_bind_exclusions().

#define MAX_EXCLUSIONS   (1<<27)

Definition at line 86 of file ComputeNonbondedCUDAKernel.h.

Referenced by ComputeNonbondedCUDA::build_exclusions().

#define NUM_WARP   4

Definition at line 14 of file ComputeNonbondedCUDAKernel.h.

Referenced by cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), and reduceVariables().

#define PATCH_PAIR_SIZE   (sizeof(patch_pair)/4)

Definition at line 45 of file ComputeNonbondedCUDAKernel.h.

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel().

#define WARPSIZE   32

Definition at line 15 of file ComputeNonbondedCUDAKernel.h.

Referenced by ComputeBondedCUDAKernel::bondedForce(), bondedForcesKernel(), buildBoundingBoxesKernel(), CudaTileListKernel::buildTileLists(), buildTileListsBBKernel(), calcTileListPosKernel(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), ComputeNonbondedCUDA::doWork(), fillSortKeys(), GBIS_P1_Kernel(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), modifiedExclusionForcesKernel(), CudaComputeNonbondedKernel::nonbondedForce(), reduceVariables(), CudaComputeNonbondedKernel::reduceVirialEnergy(), repackTileListsKernel(), scalar_sum_kernel(), GBISResults< 3 >::shuffleNext(), GBISInput< 3 >::shuffleNext(), GBISResults< 2 >::shuffleNext(), GBISInput< 2 >::shuffleNext(), GBISResults< 1 >::shuffleNext(), GBISInput< 1 >::shuffleNext(), shuffleNext(), updatePatchesKernel(), and ComputeBondedCUDAKernel::warpAlign().


Typedef Documentation

typedef float GBReal

Definition at line 3 of file ComputeNonbondedCUDAKernel.h.


Function Documentation

struct __align__ ( 16   ) 

Definition at line 22 of file ComputeNonbondedCUDAKernel.h.

References plist_size.

00022                                 {
00023   float3 offset;
00024   int patch1_start;      // Coordinate/force start for this patch
00025   int patch1_size;       // Size of the patch
00026   int patch2_start;
00027   int patch2_size;
00028   int patch1_ind;        // Patch index
00029   int patch2_ind;
00030   int patch1_num_pairs;  // Number of pairs that involve this patch
00031   int patch2_num_pairs;
00032   union {
00033     bool patch_done[2];      // After-GPU-computation shared memory temporary storage
00034     struct {
00035       int plist_start;       // Pair list start
00036       int plist_size;        // Pair list size
00037     };
00038   };
00039   int exclmask_start;    // Exclusion mask start
00040   int patch1_free_size;  // Size of the free atoms in patch
00041   int patch2_free_size;  // Size of the free atoms in patch
00042 //  int pad1, pad2;
00043 };

void cuda_bind_atom_params ( const atom_param *  t  ) 

Definition at line 352 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00352                                                 {
00353   cudaMemcpyAsync(atom_params, t, num_atoms * sizeof(atom_param),
00354                                 cudaMemcpyHostToDevice, stream);
00355   cuda_errcheck("memcpy to atom_params");
00356 }

void cuda_bind_atoms ( const atom *  a  ) 

Definition at line 364 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00364                                     {
00365   cuda_errcheck("before memcpy to atoms");
00366   cudaMemcpyAsync(atoms, a, num_atoms * sizeof(atom),
00367                                 cudaMemcpyHostToDevice, stream);
00368   cuda_errcheck("memcpy to atoms");
00369 }

void cuda_bind_exclusions ( const unsigned int *  t,
int  n 
)

Definition at line 22 of file ComputeNonbondedCUDAKernel.cu.

References const_exclusions, cuda_errcheck(), exclusions, exclusions_size, MAX_CONST_EXCLUSIONS, overflow_exclusions, and tex_exclusions.

Referenced by ComputeNonbondedCUDA::build_exclusions().

00022                                                         {
00023   exclusions_size = n;
00024   static __thread int exclusions_alloc;
00025   if ( exclusions && exclusions_alloc < exclusions_size ) {
00026     cudaFree(exclusions);
00027     cuda_errcheck("freeing exclusions");
00028     cudaFree(overflow_exclusions);
00029     cuda_errcheck("freeing overflow_exclusions");
00030     exclusions = 0;
00031   }
00032   if ( ! exclusions ) {
00033     exclusions_alloc = exclusions_size;
00034     cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));
00035     cuda_errcheck("malloc exclusions");
00036     cudaMalloc((void**) &overflow_exclusions, n*sizeof(unsigned int));
00037     cuda_errcheck("malloc overflow_exclusions");
00038   }
00039   cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);
00040   cuda_errcheck("memcpy exclusions");
00041   tex_exclusions.normalized = false;
00042   tex_exclusions.addressMode[0] = cudaAddressModeClamp;
00043   tex_exclusions.filterMode = cudaFilterModePoint;
00044   cudaBindTexture(NULL, tex_exclusions, exclusions, n*sizeof(unsigned int));
00045   cuda_errcheck("binding exclusions to texture");
00046 
00047   cudaMemcpy(overflow_exclusions, t,
00048                 n*sizeof(unsigned int), cudaMemcpyHostToDevice);
00049   cuda_errcheck("memcpy to overflow_exclusions");
00050   int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
00051   cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
00052   cuda_errcheck("memcpy to const_exclusions");
00053 }

void cuda_bind_force_table ( const float4 *  t,
const float4 *  et 
)

Definition at line 90 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), energy_table, force_table, and FORCE_TABLE_SIZE.

Referenced by ComputeNonbondedCUDA::build_force_table().

00090                                                               {
00091     static __thread cudaArray *ct;
00092     static __thread cudaArray *ect;
00093     if ( ! ct ) {
00094       cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
00095       cuda_errcheck("allocating force table");
00096     }
00097     if ( ! ect ) {
00098       cudaMallocArray(&ect, &energy_table.channelDesc, FORCE_TABLE_SIZE, 1);
00099       cuda_errcheck("allocating energy table");
00100     }
00101     cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
00102     // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
00103     cuda_errcheck("memcpy to force table");
00104     cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
00105     cuda_errcheck("memcpy to energy table");
00106 
00107     force_table.normalized = true;
00108     force_table.addressMode[0] = cudaAddressModeClamp;
00109     force_table.addressMode[1] = cudaAddressModeClamp;
00110     force_table.filterMode = cudaFilterModeLinear;
00111 
00112     energy_table.normalized = true;
00113     energy_table.addressMode[0] = cudaAddressModeClamp;
00114     energy_table.addressMode[1] = cudaAddressModeClamp;
00115     energy_table.filterMode = cudaFilterModeLinear;
00116 
00117     cudaBindTextureToArray(force_table, ct);
00118     cuda_errcheck("binding force table to texture");
00119 
00120     cudaBindTextureToArray(energy_table, ect);
00121     cuda_errcheck("binding energy table to texture");
00122 }

void cuda_bind_forces ( float4 *  f,
float4 *  f_slow 
)

Definition at line 371 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), forces, and slow_forces.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00371                                                  {
00372   cudaHostGetDevicePointer(&forces, f, 0);
00373   cuda_errcheck("cudaHostGetDevicePointer forces");
00374   cudaHostGetDevicePointer(&slow_forces, f_slow, 0);
00375   cuda_errcheck("cudaHostGetDevicePointer slow_forces");
00376 }

void cuda_bind_GBIS_bornRad ( float *  bornRadH  ) 

Definition at line 406 of file ComputeNonbondedCUDAKernel.cu.

References bornRadD, cuda_errcheck(), num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00406                                              {
00407   cudaMemcpyAsync(bornRadD, bornRadH, num_atoms * sizeof(float),
00408                                 cudaMemcpyHostToDevice, stream);
00409   cuda_errcheck("memcpy to bornRad");
00410 }

void cuda_bind_GBIS_dEdaSum ( GBReal dEdaSumH  ) 

Definition at line 412 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and dEdaSumD.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00412                                               {
00413   cudaHostGetDevicePointer(&dEdaSumD, dEdaSumH, 0);
00414   cuda_errcheck("cudaHostGetDevicePointer dEdaSum");
00415 }

void cuda_bind_GBIS_dHdrPrefix ( float *  dHdrPrefixH  ) 

Definition at line 417 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), dHdrPrefixD, num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00417                                                    {
00418   cudaMemcpyAsync(dHdrPrefixD, dHdrPrefixH, num_atoms * sizeof(float),
00419                                 cudaMemcpyHostToDevice, stream);
00420   cuda_errcheck("memcpy to dHdrPrefix");
00421 }

void cuda_bind_GBIS_energy ( float *  energy_gbis  ) 

Definition at line 389 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and energy_gbis.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00389                                      {
00390   cudaHostGetDevicePointer(&energy_gbis, e, 0);
00391   cuda_errcheck("cudaHostGetDevicePointer energy_gbis");
00392 }

void cuda_bind_GBIS_intRad ( float *  intRad0H,
float *  intRadSH 
)

Definition at line 393 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), intRad0D, intRadSD, num_atoms, and stream.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00393                                                              {
00394   cudaMemcpyAsync(intRad0D, intRad0H, num_atoms * sizeof(float),
00395                                 cudaMemcpyHostToDevice, stream);
00396   cudaMemcpyAsync(intRadSD, intRadSH, num_atoms * sizeof(float),
00397                                 cudaMemcpyHostToDevice, stream);
00398   cuda_errcheck("memcpy to intRad");
00399 }

void cuda_bind_GBIS_psiSum ( GBReal psiSumH  ) 

Definition at line 401 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), and psiSumD.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00401                                             {
00402   cudaHostGetDevicePointer(&psiSumD, psiSumH, 0);
00403   cuda_errcheck("cudaHostGetDevicePointer psiSum");
00404 }

void cuda_bind_lj_table ( const float2 t,
int  _lj_table_size 
)

Definition at line 59 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), lj_table, and lj_table_size.

Referenced by ComputeNonbondedCUDA::build_lj_table().

00059                                                              {
00060     static __thread float2 *ct;
00061     static __thread int lj_table_alloc;
00062     lj_table_size = _lj_table_size;
00063     if ( ct && lj_table_alloc < lj_table_size ) {
00064       cudaFree(ct);
00065       cuda_errcheck("freeing lj table");
00066       ct = 0;
00067     }
00068     if ( ! ct ) {
00069       lj_table_alloc = lj_table_size;
00070       cudaMalloc((void**) &ct, lj_table_size*lj_table_size*sizeof(float2));
00071       cuda_errcheck("allocating lj table");
00072     }
00073     cudaMemcpy(ct, t, lj_table_size*lj_table_size*sizeof(float2),
00074                                             cudaMemcpyHostToDevice);
00075     cuda_errcheck("memcpy to lj table");
00076 
00077     lj_table.normalized = false;
00078     lj_table.addressMode[0] = cudaAddressModeClamp;
00079     lj_table.filterMode = cudaFilterModePoint;
00080 
00081     cudaBindTexture((size_t*)0, lj_table, ct,
00082         lj_table_size*lj_table_size*sizeof(float2));
00083     cuda_errcheck("binding lj table to texture");
00084 }

void cuda_bind_patch_pairs ( patch_pair *  h_patch_pairs,
int  npatch_pairs,
int  npatches,
int  natoms,
int  nexclmask,
int  plist_len 
)

Definition at line 293 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, atom_params_size, atoms, atoms_size, bornRadD, bornRadD_size, cuda_errcheck(), dHdrPrefixD, dHdrPrefixD_size, exclmasks, exclmasks_size, GBIS_P1_counters, GBIS_P1_counters_size, GBIS_P2_counters, GBIS_P2_counters_size, GBIS_P3_counters, GBIS_P3_counters_size, global_counters, global_counters_size, intRad0D, intRad0D_size, intRadSD, intRadSD_size, num_atoms, num_patches, num_virials, patch_pairs, patch_pairs_size, plist, plist_size, slow_tmpforces, slow_tmpforces_size, slow_tmpvirials, slow_tmpvirials_size, tmp_dEdaSumD, tmp_dEdaSumD_size, tmp_energy_gbis, tmp_energy_gbis_size, tmp_psiSumD, tmp_psiSumD_size, tmpforces, tmpforces_size, tmpvirials, tmpvirials_size, vdw_types, and vdw_types_size.

Referenced by ComputeNonbondedCUDA::doWork().

00295                                           {
00296   num_patches = npatches;
00297   num_virials = npatches;
00298   num_atoms = natoms;
00299   reallocate_device<patch_pair>(&patch_pairs, &patch_pairs_size, npatch_pairs, 1.2f);
00300   reallocate_device<atom>(&atoms, &atoms_size, num_atoms, 1.2f);
00301   reallocate_device<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
00302   reallocate_device<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
00303   reallocate_device<unsigned int>(&global_counters, &global_counters_size, num_patches+2, 1.2f);
00304   reallocate_device<float4>(&tmpforces, &tmpforces_size, num_atoms, 1.2f);
00305   reallocate_device<float4>(&slow_tmpforces, &slow_tmpforces_size, num_atoms, 1.2f);
00306   reallocate_device<unsigned int>(&plist, &plist_size, plist_len, 1.2f);
00307   reallocate_device<exclmask>(&exclmasks, &exclmasks_size, nexclmask, 1.2f);
00308   reallocate_device<float>(&tmpvirials, &tmpvirials_size, num_patches*16, 1.2f);
00309   reallocate_device<float>(&slow_tmpvirials, &slow_tmpvirials_size, num_patches*16, 1.2f);
00310 
00311   // For GBIS
00312   reallocate_device<unsigned int>(&GBIS_P1_counters, &GBIS_P1_counters_size, num_patches, 1.2f);
00313   reallocate_device<unsigned int>(&GBIS_P2_counters, &GBIS_P2_counters_size, num_patches, 1.2f);
00314   reallocate_device<unsigned int>(&GBIS_P3_counters, &GBIS_P3_counters_size, num_patches, 1.2f);
00315   reallocate_device<float>(&intRad0D, &intRad0D_size, num_atoms, 1.2f);
00316   reallocate_device<float>(&intRadSD, &intRadSD_size, num_atoms, 1.2f);
00317   reallocate_device<GBReal>(&tmp_psiSumD, &tmp_psiSumD_size, num_atoms, 1.2f);
00318   reallocate_device<float>(&bornRadD, &bornRadD_size, num_atoms, 1.2f);
00319   reallocate_device<GBReal>(&tmp_dEdaSumD, &tmp_dEdaSumD_size, num_atoms, 1.2f);
00320   reallocate_device<float>(&dHdrPrefixD, &dHdrPrefixD_size, num_atoms, 1.2f);
00321   reallocate_device<float>(&tmp_energy_gbis, &tmp_energy_gbis_size, num_patches, 1.2f);
00322 
00323   cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);
00324   cuda_errcheck("memcpy to patch_pairs");
00325 
00326   cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
00327   cuda_errcheck("memset global_counters");
00328 
00329   cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
00330   cuda_errcheck("memset GBIS_P1_counters");
00331 
00332   cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
00333   cuda_errcheck("memset GBIS_P2_counters");
00334 
00335   cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
00336   cuda_errcheck("memset GBIS_P3_counters");
00337 
00338   cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
00339   cuda_errcheck("memset tmpforces");
00340 
00341   cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
00342   cuda_errcheck("memset tmpvirials");
00343 
00344   cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
00345   cuda_errcheck("memset slow_tmpforces");
00346 
00347   cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
00348   cuda_errcheck("memset slow_tmpvirials");
00349 
00350 }

void cuda_bind_vdw_types ( const int *  t  ) 

Definition at line 358 of file ComputeNonbondedCUDAKernel.cu.

References cuda_errcheck(), num_atoms, stream, and vdw_types.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00358                                        {
00359   cudaMemcpyAsync(vdw_types, t, num_atoms * sizeof(int),
00360                                 cudaMemcpyHostToDevice, stream);
00361   cuda_errcheck("memcpy to vdw_types");
00362 }

void cuda_bind_virials ( float *  v,
int *  queue,
int *  blockorder 
)

Definition at line 378 of file ComputeNonbondedCUDAKernel.cu.

References block_order, cuda_errcheck(), force_ready_queue, num_virials, slow_virials, and virials.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00378                                                               {
00379   cudaHostGetDevicePointer(&virials, v, 0);
00380   cuda_errcheck("cudaHostGetDevicePointer virials");
00381   slow_virials = virials + num_virials*16;
00382   cudaHostGetDevicePointer(&force_ready_queue, queue, 0);
00383   cuda_errcheck("cudaHostGetDevicePointer force_ready_queue");
00384   cudaHostGetDevicePointer(&block_order, blockorder, 0);
00385   cuda_errcheck("cudaHostGetDevicePointer block_order");
00386 }

void cuda_errcheck ( const char *  msg  ) 

Definition at line 41 of file ComputeNonbondedCUDA.C.

00041                                     {
00042   cudaError_t err;
00043   if ((err = cudaGetLastError()) != cudaSuccess) {
00044     char host[128];
00045 #ifdef NOHOSTNAME
00046     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00047 #else
00048     gethostname(host, 128);  host[127] = 0;
00049 #endif
00050     char devstr[128] = "";
00051     int devnum;
00052     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00053       sprintf(devstr, " device %d", devnum);
00054     }
00055     char errmsg[1024];
00056     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00057     NAMD_die(errmsg);
00058   }
00059 }

void cuda_GBIS_P1 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  rho_0,
float3  lata,
float3  latb,
float3  latc,
cudaStream_t &  strm 
)

Definition at line 569 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), GBIS_P1_counters, intRad0D, intRadSD, max_grid_size, num_atoms, NUM_WARP, patch_pairs, psiSumD, tmp_psiSumD, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00580   {
00581 
00582  if ( ccount ) {
00583   cudaMemsetAsync(tmp_psiSumD, 0, num_atoms*sizeof(GBReal), strm);
00584 
00585   int grid_dim = max_grid_size;  // maximum allowed
00586   for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
00587     if (grid_dim > ccount - cstart) {
00588       grid_dim = ccount - cstart;
00589     }
00590 
00591     dim3 nthread3(WARPSIZE, NUM_WARP, 1);
00592     GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(
00593       patch_pairs+cbegin+cstart,
00594       atoms,
00595       intRad0D,
00596       intRadSD,
00597       tmp_psiSumD,
00598       psiSumD,
00599       a_cut,
00600       rho_0,
00601       lata,
00602       latb,
00603       latc,
00604       GBIS_P1_counters 
00605       );
00606     cuda_errcheck("dev_GBIS_P1");
00607   } // end for
00608  }
00609 } // end GBIS P1

void cuda_GBIS_P2 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  r_cut,
float  scaling,
float  kappa,
float  smoothDist,
float  epsilon_p,
float  epsilon_s,
float3  lata,
float3  latb,
float3  latc,
int  doEnergy,
int  doFullElec,
cudaStream_t &  strm 
)

Definition at line 614 of file ComputeNonbondedCUDAKernel.cu.

References atoms, bornRadD, cuda_errcheck(), dEdaSumD, energy_gbis, forces, GBIS_P2_counters, max_grid_size, num_atoms, num_patches, NUM_WARP, patch_pairs, tmp_dEdaSumD, tmp_energy_gbis, tmpforces, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00632   {
00633 
00634  if ( ccount ) {
00635   cudaMemsetAsync(tmp_dEdaSumD, 0, num_atoms*sizeof(GBReal), strm);
00636   cudaMemsetAsync(tmp_energy_gbis, 0, num_patches*sizeof(float), strm);
00637 
00638   int grid_dim = max_grid_size;  // maximum allowed
00639   for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
00640     if (grid_dim > ccount - cstart)
00641       grid_dim = ccount - cstart;
00642 
00643     dim3 nthread3(WARPSIZE, NUM_WARP, 1);
00644     GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(
00645       patch_pairs+cbegin+cstart,
00646       atoms,
00647       bornRadD,
00648       tmp_dEdaSumD,
00649       dEdaSumD,
00650       a_cut,
00651       r_cut,
00652       scaling,
00653       kappa,
00654       smoothDist,
00655       epsilon_p,
00656       epsilon_s,
00657       lata,
00658       latb,
00659       latc,
00660       doEnergy,
00661       doFullElec,
00662       tmpforces,
00663       forces,
00664       tmp_energy_gbis,
00665       energy_gbis,
00666       GBIS_P2_counters 
00667       );
00668     cuda_errcheck("dev_GBIS_P2");
00669   } // end for
00670  }
00671 } // end P2

void cuda_GBIS_P3 ( int  cbegin,
int  ccount,
int  pbegin,
int  pcount,
float  a_cut,
float  rho_0,
float  scaling,
float3  lata,
float3  latb,
float3  latc,
cudaStream_t &  strm 
)

Definition at line 676 of file ComputeNonbondedCUDAKernel.cu.

References atoms, cuda_errcheck(), dHdrPrefixD, GBIS_P3_counters, intRad0D, intRadSD, max_grid_size, NUM_WARP, patch_pairs, slow_forces, slow_tmpforces, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00688   {
00689   int grid_dim = max_grid_size;  // maximum allowed
00690   for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
00691     if (grid_dim > ccount - cstart)
00692       grid_dim = ccount - cstart;
00693 
00694     dim3 nthread3(WARPSIZE, NUM_WARP, 1);
00695     GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(
00696       patch_pairs+cbegin+cstart,
00697       atoms,
00698       intRad0D,
00699       intRadSD,
00700       dHdrPrefixD,
00701       a_cut,
00702       rho_0,
00703       scaling,
00704       lata,
00705       latb,
00706       latc,
00707       slow_tmpforces,
00708       slow_forces,
00709       GBIS_P3_counters 
00710       );
00711     cuda_errcheck("dev_GBIS_P3");
00712   }
00713 }

void cuda_init (  ) 

Definition at line 203 of file ComputeNonbondedCUDAKernel.cu.

References atom_params, atom_params_size, atoms, atoms_size, bornRadD, bornRadD_size, cuda_errcheck(), dEdaSumD, dHdrPrefixD, dHdrPrefixD_size, energy_gbis, exclmasks, exclmasks_size, exclusions, exclusions_size, force_ready_queue, forces, GBIS_P1_counters, GBIS_P1_counters_size, GBIS_P2_counters, GBIS_P2_counters_size, GBIS_P3_counters, GBIS_P3_counters_size, global_counters, global_counters_size, intRad0D, intRad0D_size, intRadSD, intRadSD_size, max_grid_size, patch_pairs, patch_pairs_size, plist, plist_size, psiSumD, slow_forces, slow_tmpforces, slow_tmpforces_size, slow_tmpvirials, slow_tmpvirials_size, tmp_dEdaSumD, tmp_dEdaSumD_size, tmp_energy_gbis, tmp_energy_gbis_size, tmp_psiSumD, tmp_psiSumD_size, tmpforces, tmpforces_size, tmpvirials, tmpvirials_size, vdw_types, and vdw_types_size.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

00203                  {
00204   patch_pairs_size = 0;
00205   patch_pairs = NULL;
00206 
00207   atom_params_size = 0;
00208   atom_params = NULL;
00209 
00210   vdw_types_size = 0;
00211   vdw_types = NULL;
00212 
00213   atoms_size = 0;
00214   atoms = NULL;  
00215 
00216   tmpforces_size = 0;
00217   tmpforces = NULL;
00218 
00219   slow_tmpforces_size = 0;
00220   slow_tmpforces = NULL;
00221 
00222   tmpvirials_size = 0;
00223   tmpvirials = NULL;
00224 
00225   slow_tmpvirials_size = 0;
00226   slow_tmpvirials = NULL;
00227 
00228   global_counters_size = 0;
00229   global_counters = NULL;
00230 
00231   plist_size = 0;
00232   plist = NULL;
00233 
00234   exclmasks_size = 0;
00235   exclmasks = NULL;
00236 
00237   forces = NULL;
00238   slow_forces = NULL;
00239 
00240   force_ready_queue = NULL;
00241 
00242   exclusions_size = 0;
00243   exclusions = NULL;
00244 
00245   // --------------------
00246   // For GBIS
00247   // --------------------
00248   intRad0D_size = 0;
00249   intRad0D = NULL;
00250 
00251   intRadSD_size = 0;
00252   intRadSD = NULL;
00253 
00254   psiSumD = NULL;        // host-mapped memory
00255 
00256   tmp_psiSumD_size = 0;
00257   tmp_psiSumD = NULL;
00258 
00259   bornRadD_size = 0;
00260   bornRadD = NULL;
00261 
00262   dEdaSumD = NULL;       // host-mapped memory
00263 
00264   tmp_dEdaSumD_size = 0;
00265   tmp_dEdaSumD = NULL;
00266 
00267   dHdrPrefixD_size = 0;
00268   dHdrPrefixD = NULL;
00269 
00270   GBIS_P1_counters_size = 0;
00271   GBIS_P1_counters = NULL;
00272 
00273   GBIS_P2_counters_size = 0;
00274   GBIS_P2_counters = NULL;
00275 
00276   GBIS_P3_counters_size = 0;
00277   GBIS_P3_counters = NULL;
00278 
00279   energy_gbis = NULL;     // host-mapped memory
00280 
00281   tmp_energy_gbis_size = 0;
00282   tmp_energy_gbis = NULL;
00283 
00284   int dev;
00285   cudaGetDevice(&dev);
00286   cuda_errcheck("cudaGetDevice");
00287   cudaDeviceProp deviceProp;
00288   cudaGetDeviceProperties(&deviceProp, dev);
00289   cuda_errcheck("cudaGetDeviceProperties");
00290   max_grid_size = deviceProp.maxGridSize[1];
00291 }

void cuda_nonbonded_forces ( float3  lata,
float3  latb,
float3  latc,
float  cutoff2,
float  plcutoff2,
int  cbegin,
int  ccount,
int  ctotal,
int  doSlow,
int  doEnergy,
int  usePairlists,
int  savePairlists,
int  doStreaming,
int  saveOrder,
cudaStream_t &  strm 
)

Definition at line 500 of file ComputeNonbondedCUDAKernel.cu.

References CALL, cuda_errcheck(), max_grid_size, num_atoms, num_patches, NUM_WARP, slow_tmpforces, slow_tmpvirials, tmpforces, tmpvirials, and WARPSIZE.

Referenced by ComputeNonbondedCUDA::recvYieldDevice().

00504                                                                                {
00505 
00506  if ( ccount ) {
00507    if ( usePairlists ) {
00508      if ( ! savePairlists ) plcutoff2 = 0.;
00509    } else {
00510      plcutoff2 = cutoff2;
00511    }
00512 
00513    cudaMemsetAsync(tmpforces, 0, num_atoms*sizeof(float4), strm);
00514    cudaMemsetAsync(tmpvirials, 0, num_patches*sizeof(float)*16, strm);
00515    if ( doSlow ) {
00516      cudaMemsetAsync(slow_tmpforces, 0, num_atoms*sizeof(float4), strm);
00517      cudaMemsetAsync(slow_tmpvirials, 0, num_patches*sizeof(float)*16, strm);
00518    }
00519 
00520    int grid_dim = max_grid_size;  // maximum allowed
00521    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
00522      if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;
00523 
00524      dim3 nthread3(WARPSIZE, NUM_WARP, 1);
00525 
00526 #define CALL(X) X<<< grid_dim, nthread3, 0, strm >>>                    \
00527        (patch_pairs, atoms, atom_params, vdw_types, plist,              \
00528         tmpforces, (doSlow?slow_tmpforces:NULL),                        \
00529         forces, (doSlow?slow_forces:NULL),                              \
00530         tmpvirials, (doSlow?slow_tmpvirials:NULL),                      \
00531         virials, (doSlow?slow_virials:NULL),                            \
00532         global_counters, (doStreaming?force_ready_queue:NULL),          \
00533         overflow_exclusions, num_patches,                               \
00534         cbegin+cstart, ctotal, (saveOrder?block_order:NULL),            \
00535         exclmasks, lj_table_size,                                       \
00536         lata, latb, latc, cutoff2, plcutoff2, doSlow)
00537 //end definition
00538 
00539      if ( doEnergy ) {
00540        if ( doSlow ) {
00541          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
00542          else CALL(dev_nonbonded_slow_energy);
00543        } else {
00544          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
00545          else CALL(dev_nonbonded_energy);
00546        }
00547      } else {
00548        if ( doSlow ) {
00549          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
00550          else CALL(dev_nonbonded_slow);
00551        } else {
00552          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
00553          else CALL(dev_nonbonded);
00554        }
00555      }
00556 
00557      cuda_errcheck("dev_nonbonded");
00558    }
00559  }
00560 
00561 }

int cuda_stream_finished (  ) 


Generated on Fri Sep 22 01:17:15 2017 for NAMD by  doxygen 1.4.7