NAMD
ComputeBondedCUDAKernel.cu File Reference
#include <math.h>
#include <cuda.h>
#include <namd_cub/cub.cuh>
#include "ComputeBondedCUDAKernel.h"
#include "DeviceCUDA.h"


Macros

#define BONDEDFORCESKERNEL_NUM_WARP   4
 
#define NONBONDEDTABLES
 
#define CALL(DOENERGY, DOVIRIAL)
 
#define CALL(DOENERGY, DOVIRIAL, DOELECT)
 

Functions

__device__ __forceinline__ float4 sampleTableTex (cudaTextureObject_t tex, float k)
 
template<typename T >
__forceinline__ __device__ void convertForces (const float x, const float y, const float z, T &Tx, T &Ty, T &Tz)
 
template<typename T >
__forceinline__ __device__ void calcComponentForces (float fij, const float dx, const float dy, const float dz, T &fxij, T &fyij, T &fzij)
 
template<typename T >
__forceinline__ __device__ void calcComponentForces (float fij1, const float dx1, const float dy1, const float dz1, float fij2, const float dx2, const float dy2, const float dz2, T &fxij, T &fyij, T &fzij)
 
__forceinline__ __device__ int warpAggregatingAtomicInc (int *counter)
 
template<typename T >
__forceinline__ __device__ void storeForces (const T fx, const T fy, const T fz, const int ind, const int stride, T *force, T *forceList, int *forceListCounter, int *forceListStarts, int *forceListNexts)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void bondForce (const int index, const CudaBond *__restrict__ bondList, const CudaBondValue *__restrict__ bondValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, T *__restrict__ forceList, int *forceListCounter, int *forceListStarts, int *forceListNexts, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce (const int index, const CudaExclusion *__restrict__ exclusionList, const bool doSlow, const float one_scale14, const int vdwCoefTableWidth, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, double &energyVdw, T *__restrict__ forceNbond, double &energyNbond, T *__restrict__ forceSlow, double &energySlow, T *__restrict__ forceList, int *forceListCounter, int *forceListStartsNbond, int *forceListStartsSlow, int *forceListNexts, ComputeBondedCUDAKernel::BondedVirial< double > &virialNbond, ComputeBondedCUDAKernel::BondedVirial< double > &virialSlow)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void exclusionForce (const int index, const CudaExclusion *__restrict__ exclusionList, const float r2_delta, const int r2_delta_expc, cudaTextureObject_t r2_table_tex, cudaTextureObject_t exclusionTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, T *__restrict__ forceSlow, double &energySlow, T *__restrict__ forceList, int *forceListCounter, int *forceListStartsSlow, int *forceListNexts, ComputeBondedCUDAKernel::BondedVirial< double > &virialSlow)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void angleForce (const int index, const CudaAngle *__restrict__ angleList, const CudaAngleValue *__restrict__ angleValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, T *__restrict__ forceList, int *forceListCounter, int *forceListStarts, int *forceListNexts, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
template<bool doEnergy>
__forceinline__ __device__ void diheComp (const CudaDihedralValue *dihedralValues, int ic, const float sin_phi, const float cos_phi, const float scale, float &df, double &e)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void diheForce (const int index, const CudaDihedral *__restrict__ diheList, const CudaDihedralValue *__restrict__ dihedralValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, T *__restrict__ forceList, int *forceListCounter, int *forceListStarts, int *forceListNexts, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
__device__ __forceinline__ float3 cross (const float3 v1, const float3 v2)
 
__device__ __forceinline__ float dot (const float3 v1, const float3 v2)
 
template<typename T , bool doEnergy, bool doVirial>
__device__ void crosstermForce (const int index, const CudaCrossterm *__restrict__ crosstermList, const CudaCrosstermValue *__restrict__ crosstermValues, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, double &energy, T *__restrict__ forceList, int *forceListCounter, int *forceListStarts, int *forceListNexts, ComputeBondedCUDAKernel::BondedVirial< double > &virial)
 
template<typename T , bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel (const int start, const int numBonds, const CudaBond *__restrict__ bonds, const CudaBondValue *__restrict__ bondValues, const int numAngles, const CudaAngle *__restrict__ angles, const CudaAngleValue *__restrict__ angleValues, const int numDihedrals, const CudaDihedral *__restrict__ dihedrals, const CudaDihedralValue *__restrict__ dihedralValues, const int numImpropers, const CudaDihedral *__restrict__ impropers, const CudaDihedralValue *__restrict__ improperValues, const int numExclusions, const CudaExclusion *__restrict__ exclusions, const int numCrossterms, const CudaCrossterm *__restrict__ crossterms, const CudaCrosstermValue *__restrict__ crosstermValues, const float cutoff2, const float r2_delta, const int r2_delta_expc, const float *__restrict__ r2_table, const float4 *__restrict__ exclusionTable, cudaTextureObject_t r2_table_tex, cudaTextureObject_t exclusionTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ force, T *__restrict__ forceSlow, T *__restrict__ forceList, int *forceListCounter, int *forceListStarts, int *forceListStartsSlow, int *forceListNexts, double *__restrict__ energies_virials)
 
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel (const int start, const int numModifiedExclusions, const CudaExclusion *__restrict__ modifiedExclusions, const bool doSlow, const float one_scale14, const float cutoff2, const int vdwCoefTableWidth, const float2 *__restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex, const float4 *__restrict__ xyzq, const int stride, const float3 lata, const float3 latb, const float3 latc, T *__restrict__ forceNbond, T *__restrict__ forceSlow, T *__restrict__ forceList, int *forceListCounter, int *forceListStartsNbond, int *forceListStartsSlow, int *forceListNexts, double *__restrict__ energies_virials)
 
template<typename T >
__global__ void gatherBondedForcesKernel (const int start, const int atomStorageSize, const int stride, const T *__restrict__ forceList, const int *__restrict__ forceListStarts, const int *__restrict__ forceListNexts, T *__restrict__ force)
 
__global__ void reduceBondedBinsKernel (double *energies_virials)
 

Variables

__thread DeviceCUDA * deviceCUDA
 

Macro Definition Documentation

#define BONDEDFORCESKERNEL_NUM_WARP   4
#define CALL(DOENERGY, DOVIRIAL)
Value:
bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
(start, numBonds, bonds, bondValues, \
numAngles, angles, angleValues, \
numDihedrals, dihedrals, dihedralValues, \
numImpropers, impropers, improperValues, \
numExclusionsDoSlow, exclusions, \
numCrossterms, crossterms, crosstermValues, \
cutoff2, \
r2_delta, r2_delta_expc, \
cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable() , \
cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex() , \
xyzq, forceStride, \
lata, latb, latc, \
forces, &forces[startSlow], \
forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
energies_virials);

Referenced by ComputeBondedCUDAKernel::bondedForce(), cuda_nonbonded_forces(), CudaComputeGBISKernel::GBISphase2(), and CudaComputeNonbondedKernel::nonbondedForce().
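
The host-side wrapper chooses the kernel instantiation from runtime flags. Below is a minimal sketch of how ComputeBondedCUDAKernel::bondedForce() might expand this macro over the (DOENERGY, DOVIRIAL) template booleans; the control flow shown is an assumption, and nblock_use, nthread, shmem_size and stream are assumed to be set up by the caller as in the macro body above.

    // Hypothetical dispatch: one kernel instantiation per boolean combination.
    if (doEnergy) {
      if (doVirial) { CALL(true, true);  } else { CALL(true, false);  }
    } else {
      if (doVirial) { CALL(false, true); } else { CALL(false, false); }
    }

Baking the flags in as template parameters lets the compiler strip the energy and virial code paths from instantiations that do not need them.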

#define CALL(DOENERGY, DOVIRIAL, DOELECT)
Value:
modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
<<< nblock_use, nthread, shmem_size, stream >>> (\
start, numModifiedExclusions, modifiedExclusions, \
doSlow, one_scale14, cutoff2, \
cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
xyzq, forceStride, lata, latb, latc, \
&forces[startNbond], &forces[startSlow], \
forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
energies_virials);
#define NONBONDEDTABLES
Value:
cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex()
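
This helper expands to the four exclusion-table arguments that bondedForcesKernel() expects between r2_delta_expc and xyzq; the first CALL macro above spells out the same four getter calls explicitly. A hedged sketch of the intended shorthand (surrounding arguments abbreviated):

    bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL>
      <<< nblock_use, nthread, shmem_size, stream >>>
      (start, numBonds, bonds, bondValues,
       /* ... remaining term arrays ... */
       cutoff2, r2_delta, r2_delta_expc,
       NONBONDEDTABLES,   // r2_table, exclusionTable, r2_table_tex, exclusionTableTex
       xyzq, forceStride, lata, latb, latc,
       /* ... force buffers and force lists ... */
       energies_virials);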

Function Documentation

template<typename T , bool doEnergy, bool doVirial>
__device__ void angleForce ( const int  index,
const CudaAngle *__restrict__  angleList,
const CudaAngleValue *__restrict__  angleValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStarts,
int *  forceListNexts,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 588 of file ComputeBondedCUDAKernel.cu.

References force_to_float, CudaAngle::i, CudaAngle::ioffsetXYZ, CudaAngle::itype, CudaAngle::j, CudaAngle::k, CudaAngleValue::k, CudaAngleValue::k_ub, CudaAngle::koffsetXYZ, CudaAngleValue::normal, CudaAngleValue::r_ub, CudaAngle::scale, and CudaAngleValue::theta0.

602  {
603 
604  CudaAngle al = angleList[index];
605 
606  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
607  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
608  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
609 
610  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
611  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
612  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
613 
614  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
615  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
616  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
617 
618  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
619  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
620  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
621 
622  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
623  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
624 
625  float xijr = xij*rij_inv;
626  float yijr = yij*rij_inv;
627  float zijr = zij*rij_inv;
628  float xkjr = xkj*rkj_inv;
629  float ykjr = ykj*rkj_inv;
630  float zkjr = zkj*rkj_inv;
631  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
632 
633  CudaAngleValue angleValue = angleValues[al.itype];
634  angleValue.k *= al.scale;
635 
636  float diff;
637  if (angleValue.normal == 1) {
638  // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
639  cos_theta = fminf(0.999f, fmaxf(-0.999f, cos_theta));
640  float theta = acosf(cos_theta);
641  diff = theta - angleValue.theta0;
642  } else {
643  diff = cos_theta - angleValue.theta0;
644  }
645 
646  if (doEnergy) {
647  energy += (double)(angleValue.k * diff * diff);
648  }
649  if (angleValue.normal == 1) {
650  float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
651  if (inv_sin_theta > 1.0e6) {
652  diff = (diff < 0.0f) ? 1.0f : -1.0f;
653  } else {
654  diff *= -inv_sin_theta;
655  }
656  }
657  diff *= -2.0f*angleValue.k;
658 
659  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
660  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
661  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
662  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
663  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
664  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
665 
666  T T_dtxi, T_dtyi, T_dtzi;
667  T T_dtxj, T_dtyj, T_dtzj;
668  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
669  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
670  T T_dtxk = -T_dtxi - T_dtxj;
671  T T_dtyk = -T_dtyi - T_dtyj;
672  T T_dtzk = -T_dtzi - T_dtzj;
673  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
674 
675  if (angleValue.k_ub) {
676  float xik = xij - xkj;
677  float yik = yij - ykj;
678  float zik = zij - zkj;
679  float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
680  float rik = 1.0f/rik_inv;
681  float diff_ub = rik - angleValue.r_ub;
682  if (doEnergy) {
683  energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
684  }
685  diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
686  T T_dxik, T_dyik, T_dzik;
687  calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
688  T_dtxi += T_dxik;
689  T_dtyi += T_dyik;
690  T_dtzi += T_dzik;
691  T_dtxj -= T_dxik;
692  T_dtyj -= T_dyik;
693  T_dtzj -= T_dzik;
694  }
695 
696  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
697  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
698 
699  // Store virial
700  if (doVirial) {
701 #ifdef WRITE_FULL_VIRIALS
702  float fxi = ((float)T_dtxi)*force_to_float;
703  float fyi = ((float)T_dtyi)*force_to_float;
704  float fzi = ((float)T_dtzi)*force_to_float;
705  float fxk = ((float)T_dtxj)*force_to_float;
706  float fyk = ((float)T_dtyj)*force_to_float;
707  float fzk = ((float)T_dtzj)*force_to_float;
708  virial.xx = (fxi*xij) + (fxk*xkj);
709  virial.xy = (fxi*yij) + (fxk*ykj);
710  virial.xz = (fxi*zij) + (fxk*zkj);
711  virial.yx = (fyi*xij) + (fyk*xkj);
712  virial.yy = (fyi*yij) + (fyk*ykj);
713  virial.yz = (fyi*zij) + (fyk*zkj);
714  virial.zx = (fzi*xij) + (fzk*xkj);
715  virial.zy = (fzi*yij) + (fzk*ykj);
716  virial.zz = (fzi*zij) + (fzk*zkj);
717 #endif
718  }
719 }
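
As a reading aid, in LaTeX notation the listing accumulates, for angleValue.normal == 1,

    E_{\mathrm{angle}} = k\,(\theta - \theta_0)^2, \qquad
    \cos\theta = \hat{r}_{ij} \cdot \hat{r}_{kj},

(for normal == 0 the cosine is used directly, E = k\,(\cos\theta - \theta_0)^2), plus the Urey-Bradley 1-3 term when k_ub is nonzero:

    E_{\mathrm{UB}} = k_{ub}\,(r_{ik} - r_{ub})^2, \qquad
    \vec{r}_{ik} = \vec{r}_{ij} - \vec{r}_{kj}.

After line 657, diff equals -dE/d(cos theta) in both cases, and the dt* terms are the cartesian derivatives of cos(theta), so the stored components are the forces on atoms i and k; atom j receives their negated sum.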
template<typename T , bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel ( const int  start,
const int  numBonds,
const CudaBond *__restrict__  bonds,
const CudaBondValue *__restrict__  bondValues,
const int  numAngles,
const CudaAngle *__restrict__  angles,
const CudaAngleValue *__restrict__  angleValues,
const int  numDihedrals,
const CudaDihedral *__restrict__  dihedrals,
const CudaDihedralValue *__restrict__  dihedralValues,
const int  numImpropers,
const CudaDihedral *__restrict__  impropers,
const CudaDihedralValue *__restrict__  improperValues,
const int  numExclusions,
const CudaExclusion *__restrict__  exclusions,
const int  numCrossterms,
const CudaCrossterm *__restrict__  crossterms,
const CudaCrosstermValue *__restrict__  crosstermValues,
const float  cutoff2,
const float  r2_delta,
const int  r2_delta_expc,
const float *__restrict__  r2_table,
const float4 *__restrict__  exclusionTable,
cudaTextureObject_t  r2_table_tex,
cudaTextureObject_t  exclusionTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
T *__restrict__  forceSlow,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStarts,
int *  forceListStartsSlow,
int *  forceListNexts,
double *__restrict__  energies_virials 
)

Definition at line 1195 of file ComputeBondedCUDAKernel.cu.

References ComputeBondedCUDAKernel::amdDiheVirialIndex_XX, ATOMIC_BINS, BLOCK_SYNC, BONDEDFORCESKERNEL_NUM_WARP, cutoff2, ComputeBondedCUDAKernel::energies_virials_SIZE, ComputeBondedCUDAKernel::energyIndex_ANGLE, ComputeBondedCUDAKernel::energyIndex_BOND, ComputeBondedCUDAKernel::energyIndex_CROSSTERM, ComputeBondedCUDAKernel::energyIndex_DIHEDRAL, ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW, ComputeBondedCUDAKernel::energyIndex_IMPROPER, exclusions, lata, latb, latc, ComputeBondedCUDAKernel::normalVirialIndex_XX, ComputeBondedCUDAKernel::slowVirialIndex_XX, WARP_FULL_MASK, WARP_SHUFFLE_XOR, WARPSIZE, ComputeBondedCUDAKernel::BondedVirial< T >::xx, ComputeBondedCUDAKernel::BondedVirial< T >::xy, xyzq, ComputeBondedCUDAKernel::BondedVirial< T >::xz, ComputeBondedCUDAKernel::BondedVirial< T >::yx, ComputeBondedCUDAKernel::BondedVirial< T >::yy, ComputeBondedCUDAKernel::BondedVirial< T >::yz, ComputeBondedCUDAKernel::BondedVirial< T >::zx, ComputeBondedCUDAKernel::BondedVirial< T >::zy, and ComputeBondedCUDAKernel::BondedVirial< T >::zz.

1242  {
1243 
1244  // Thread-block index
1245  int indexTB = start + blockIdx.x;
1246 
1247  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1248  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1249  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1250  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
1251  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
1252  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
1253 
1254  // Each thread computes a single bonded interaction.
1255  // Each thread block computes a single bonded type.
1256  double energy;
1257  int energyIndex;
1258 
1259  if (doEnergy) {
1260  energy = 0.0;
1261  energyIndex = -1;
1262  }
1263 
1264 #ifdef WRITE_FULL_VIRIALS
1265  ComputeBondedCUDAKernel::BondedVirial<double> virial;
1266  int virialIndex;
1267  if (doVirial) {
1268  virial.xx = 0.0;
1269  virial.xy = 0.0;
1270  virial.xz = 0.0;
1271  virial.yx = 0.0;
1272  virial.yy = 0.0;
1273  virial.yz = 0.0;
1274  virial.zx = 0.0;
1275  virial.zy = 0.0;
1276  virial.zz = 0.0;
1277  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
1278  }
1279 #endif
1280 
1281  if (indexTB < numBondsTB) {
1282  int i = threadIdx.x + indexTB*blockDim.x;
1283  if (doEnergy) {
1284  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
1285  }
1286  if (i < numBonds) {
1287  bondForce<T, doEnergy, doVirial>
1288  (i, bonds, bondValues, xyzq,
1289  stride, lata, latb, latc,
1290  force, energy,
1291  forceList, forceListCounter, forceListStarts, forceListNexts,
1292  virial);
1293  }
1294  goto reduce;
1295  }
1296  indexTB -= numBondsTB;
1297 
1298  if (indexTB < numAnglesTB) {
1299  int i = threadIdx.x + indexTB*blockDim.x;
1300  if (doEnergy) {
1301  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
1302  }
1303  if (i < numAngles) {
1304  angleForce<T, doEnergy, doVirial>
1305  (i, angles, angleValues, xyzq, stride,
1306  lata, latb, latc,
1307  force, energy,
1308  forceList, forceListCounter, forceListStarts, forceListNexts,
1309  virial);
1310  }
1311  goto reduce;
1312  }
1313  indexTB -= numAnglesTB;
1314 
1315  if (indexTB < numDihedralsTB) {
1316  int i = threadIdx.x + indexTB*blockDim.x;
1317  if (doEnergy) {
1318  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
1319  }
1320  if (doVirial) {
1321  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1322  }
1323  if (i < numDihedrals) {
1324  diheForce<T, doEnergy, doVirial>
1325  (i, dihedrals, dihedralValues, xyzq, stride,
1326  lata, latb, latc,
1327  force, energy,
1328  forceList, forceListCounter, forceListStarts, forceListNexts,
1329  virial);
1330  }
1331  goto reduce;
1332  }
1333  indexTB -= numDihedralsTB;
1334 
1335  if (indexTB < numImpropersTB) {
1336  int i = threadIdx.x + indexTB*blockDim.x;
1337  if (doEnergy) {
1338  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
1339  }
1340  if (i < numImpropers) {
1341  diheForce<T, doEnergy, doVirial>
1342  (i, impropers, improperValues, xyzq, stride,
1343  lata, latb, latc,
1344  force, energy,
1345  forceList, forceListCounter, forceListStarts, forceListNexts,
1346  virial);
1347  }
1348  goto reduce;
1349  }
1350  indexTB -= numImpropersTB;
1351 
1352  if (indexTB < numExclusionsTB) {
1353  int i = threadIdx.x + indexTB*blockDim.x;
1354  if (doEnergy) {
1355  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
1356  }
1357  if (doVirial) {
1358  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
1359  }
1360  if (i < numExclusions) {
1361  exclusionForce<T, doEnergy, doVirial>
1362  (i, exclusions, r2_delta, r2_delta_expc,
1363 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
1364  r2_table, exclusionTable,
1365 #else
1366  r2_table_tex, exclusionTableTex,
1367 #endif
1368  xyzq, stride, lata, latb, latc, cutoff2,
1369  forceSlow, energy,
1370  forceList, forceListCounter, forceListStartsSlow, forceListNexts,
1371  virial);
1372  }
1373  goto reduce;
1374  }
1375  indexTB -= numExclusionsTB;
1376 
1377  if (indexTB < numCrosstermsTB) {
1378  int i = threadIdx.x + indexTB*blockDim.x;
1379  if (doEnergy) {
1380  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
1381  }
1382  if (doVirial) {
1383  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1384  }
1385  if (i < numCrossterms) {
1386  crosstermForce<T, doEnergy, doVirial>
1387  (i, crossterms, crosstermValues,
1388  xyzq, stride, lata, latb, latc,
1389  force, energy,
1390  forceList, forceListCounter, forceListStarts, forceListNexts,
1391  virial);
1392  }
1393  goto reduce;
1394  }
1395  // indexTB -= numCrosstermsTB;
1396 
1397  reduce:
1398 
1399  // Write energies to global memory
1400  if (doEnergy) {
1401  // energyIndex is constant within thread-block
1402  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
1403 #pragma unroll
1404  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1405  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1406  }
1407  int laneID = (threadIdx.x & (WARPSIZE - 1));
1408  int warpID = threadIdx.x / WARPSIZE;
1409  if (laneID == 0) {
1410  shEnergy[warpID] = energy;
1411  }
1412  BLOCK_SYNC;
1413  if (warpID == 0) {
1414  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
1415 #pragma unroll
1416  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1417  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1418  }
1419  if (laneID == 0) {
1420  const int bin = blockIdx.x % ATOMIC_BINS;
1421  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
1422  }
1423  }
1424  }
1425 
1426  // Write virials to global memory
1427 #ifdef WRITE_FULL_VIRIALS
1428  if (doVirial) {
1429 #pragma unroll
1430  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1431  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1432  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1433  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1434  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1435  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1436  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1437  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1438  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1439  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1440  }
1441  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
1442  int laneID = (threadIdx.x & (WARPSIZE - 1));
1443  int warpID = threadIdx.x / WARPSIZE;
1444  if (laneID == 0) {
1445  shVirial[warpID] = virial;
1446  }
1447  BLOCK_SYNC;
1448 
1449  if (warpID == 0) {
1450  virial.xx = 0.0;
1451  virial.xy = 0.0;
1452  virial.xz = 0.0;
1453  virial.yx = 0.0;
1454  virial.yy = 0.0;
1455  virial.yz = 0.0;
1456  virial.zx = 0.0;
1457  virial.zy = 0.0;
1458  virial.zz = 0.0;
1459  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
1460 #pragma unroll
1461  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1462  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1463  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1464  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1465  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1466  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1467  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1468  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1469  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1470  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1471  }
1472 
1473  if (laneID == 0) {
1474 #ifdef USE_FP_VIRIAL
1475  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
1476  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
1477  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
1478  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
1479  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
1480  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
1481  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
1482  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
1483  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
1484 #else
1485  const int bin = blockIdx.x % ATOMIC_BINS;
1486  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
1487  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
1488  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
1489  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
1490  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
1491  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
1492  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
1493  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
1494  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
1495 #endif
1496  }
1497  }
1498  }
1499 #endif
1500 
1501 }
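
Each thread block services exactly one bonded term type: the kernel peels indexTB through the per-type block counts computed at lines 1247-1252, and every branch funnels into the shared reduction at the reduce label. A minimal host-side sketch of the matching grid size (illustrative names, not the actual NAMD host code):

    // One ceil(count / nthread) group of thread blocks per term type; the
    // kernel recovers the type by successive subtraction from indexTB.
    auto numTB = [&](int n) { return (n + nthread - 1) / nthread; };
    int nblock_use = numTB(numBonds) + numTB(numAngles) + numTB(numDihedrals)
                   + numTB(numImpropers) + numTB(numExclusionsDoSlow)
                   + numTB(numCrossterms);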
template<typename T , bool doEnergy, bool doVirial>
__device__ void bondForce ( const int  index,
const CudaBond *__restrict__  bondList,
const CudaBondValue *__restrict__  bondValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStarts,
int *  forceListNexts,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 254 of file ComputeBondedCUDAKernel.cu.

References CudaBond::i, CudaBond::ioffsetXYZ, CudaBond::itype, CudaBond::j, CudaBondValue::k, CudaBond::scale, CudaBondValue::x0, and CudaBondValue::x1.

269  {
270 
271  CudaBond bl = bondList[index];
272  CudaBondValue bondValue = bondValues[bl.itype];
273  if (bondValue.x0 == 0.0f) return;
274 
275  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
276  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
277  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
278 
279  float4 xyzqi = xyzq[bl.i];
280  float4 xyzqj = xyzq[bl.j];
281 
282  float xij = xyzqi.x + shx - xyzqj.x;
283  float yij = xyzqi.y + shy - xyzqj.y;
284  float zij = xyzqi.z + shz - xyzqj.z;
285 
286  float r = sqrtf(xij*xij + yij*yij + zij*zij);
287 
288  float db = r - bondValue.x0;
289  if (bondValue.x1) {
290  // in this case, the bond represents a harmonic wall potential
291  // where x0 is the lower wall and x1 is the upper
292  db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
293  }
294  float fij = db * bondValue.k * bl.scale;
295 
296  if (doEnergy) {
297  energy += (double)(fij*db);
298  }
299  fij *= -2.0f/r;
300 
301  T T_fxij, T_fyij, T_fzij;
302  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
303 
304  // Store forces
305  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
306  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
307 
308  // Store virial
309  if (doVirial) {
310 #ifdef WRITE_FULL_VIRIALS
311  float fxij = fij*xij;
312  float fyij = fij*yij;
313  float fzij = fij*zij;
314  virial.xx = (fxij*xij);
315  virial.xy = (fxij*yij);
316  virial.xz = (fxij*zij);
317  virial.yx = (fyij*xij);
318  virial.yy = (fyij*yij);
319  virial.yz = (fyij*zij);
320  virial.zx = (fzij*xij);
321  virial.zy = (fzij*yij);
322  virial.zz = (fzij*zij);
323 #endif
324  }
325 }
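
In LaTeX notation, with s = bl.scale, the listing accumulates

    E_{\mathrm{bond}} = s\,k\,(r - x_0)^2, \qquad
    \vec{F}_i = -\frac{2\,s\,k\,(r - x_0)}{r}\,\vec{r}_{ij} = -\vec{F}_j,

and a nonzero x1 turns the term into a flat-bottomed harmonic wall: zero force for x_0 < r < x_1, with db measured from whichever wall is crossed.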
template<typename T >
__forceinline__ __device__ void calcComponentForces ( float  fij,
const float  dx,
const float  dy,
const float  dz,
T &  fxij,
T &  fyij,
T &  fzij 
)

Definition at line 99 of file ComputeBondedCUDAKernel.cu.

102  {
103 
104  fxij = (T)(fij*dx);
105  fyij = (T)(fij*dy);
106  fzij = (T)(fij*dz);
107 
108 }
template<typename T >
__forceinline__ __device__ void calcComponentForces ( float  fij1,
const float  dx1,
const float  dy1,
const float  dz1,
float  fij2,
const float  dx2,
const float  dy2,
const float  dz2,
T &  fxij,
T &  fyij,
T &  fzij 
)

Definition at line 130 of file ComputeBondedCUDAKernel.cu.

135  {
136 
137  fxij = (T)(fij1*dx1 + fij2*dx2);
138  fyij = (T)(fij1*dy1 + fij2*dy2);
139  fzij = (T)(fij1*dz1 + fij2*dz2);
140 }
template<typename T >
__forceinline__ __device__ void convertForces ( const float  x,
const float  y,
const float  z,
T &  Tx,
T &  Ty,
T &  Tz 
)

Definition at line 77 of file ComputeBondedCUDAKernel.cu.

78  {
79 
80  Tx = (T)(x);
81  Ty = (T)(y);
82  Tz = (T)(z);
83 }
__device__ __forceinline__ float3 cross ( const float3  v1,
const float3  v2 
)
template<typename T , bool doEnergy, bool doVirial>
__device__ void crosstermForce ( const int  index,
const CudaCrossterm *__restrict__  crosstermList,
const CudaCrosstermValue *__restrict__  crosstermValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStarts,
int *  forceListNexts,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 926 of file ComputeBondedCUDAKernel.cu.

References A, B, CudaCrosstermValue::c, cross(), CudaCrosstermValue::dim, dot(), CudaCrossterm::i1, CudaCrossterm::i2, CudaCrossterm::i3, CudaCrossterm::i4, CudaCrossterm::i5, CudaCrossterm::i6, CudaCrossterm::i7, CudaCrossterm::i8, CudaCrossterm::itype, M_PI, CudaCrossterm::offset12XYZ, CudaCrossterm::offset23XYZ, CudaCrossterm::offset34XYZ, CudaCrossterm::offset56XYZ, CudaCrossterm::offset67XYZ, CudaCrossterm::offset78XYZ, and CudaCrossterm::scale.

940  {
941 
942  CudaCrossterm cl = crosstermList[index];
943 
944  // ----------------------------------------------------------------------------
945  // Angle between 1 - 2 - 3 - 4
946  //
947  // 1 - 2
948  float3 sh12 = make_float3(
949  cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
950  cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
951  cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
952 
953  float4 xyzq1 = xyzq[cl.i1];
954  float4 xyzq2 = xyzq[cl.i2];
955 
956  float3 r12 = make_float3(
957  xyzq1.x + sh12.x - xyzq2.x,
958  xyzq1.y + sh12.y - xyzq2.y,
959  xyzq1.z + sh12.z - xyzq2.z);
960 
961  // 2 - 3
962  float3 sh23 = make_float3(
963  cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
964  cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
965  cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
966 
967  float4 xyzq3 = xyzq[cl.i3];
968 
969  float3 r23 = make_float3(
970  xyzq2.x + sh23.x - xyzq3.x,
971  xyzq2.y + sh23.y - xyzq3.y,
972  xyzq2.z + sh23.z - xyzq3.z);
973 
974  // 3 - 4
975  float3 sh34 = make_float3(
976  cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
977  cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
978  cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
979 
980  float4 xyzq4 = xyzq[cl.i4];
981 
982  float3 r34 = make_float3(
983  xyzq3.x + sh34.x - xyzq4.x,
984  xyzq3.y + sh34.y - xyzq4.y,
985  xyzq3.z + sh34.z - xyzq4.z);
986 
987  // Calculate the cross products
988  float3 A = cross(r12, r23);
989  float3 B = cross(r23, r34);
990  float3 C = cross(r23, A);
991 
992  // Calculate the inverse distances
993  float inv_rA = rsqrtf(dot(A, A));
994  float inv_rB = rsqrtf(dot(B, B));
995  float inv_rC = rsqrtf(dot(C, C));
996 
997  // Calculate the sin and cos
998  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
999  float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1000 
1001  float phi = -atan2f(sin_phi,cos_phi);
1002 
1003  // ----------------------------------------------------------------------------
1004  // Angle between 5 - 6 - 7 - 8
1005  //
1006 
1007  // 5 - 6
1008  float3 sh56 = make_float3(
1009  cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1010  cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1011  cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1012 
1013  float4 xyzq5 = xyzq[cl.i5];
1014  float4 xyzq6 = xyzq[cl.i6];
1015 
1016  float3 r56 = make_float3(
1017  xyzq5.x + sh56.x - xyzq6.x,
1018  xyzq5.y + sh56.y - xyzq6.y,
1019  xyzq5.z + sh56.z - xyzq6.z);
1020 
1021  // 6 - 7
1022  float3 sh67 = make_float3(
1023  cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1024  cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1025  cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1026 
1027  float4 xyzq7 = xyzq[cl.i7];
1028 
1029  float3 r67 = make_float3(
1030  xyzq6.x + sh67.x - xyzq7.x,
1031  xyzq6.y + sh67.y - xyzq7.y,
1032  xyzq6.z + sh67.z - xyzq7.z);
1033 
1034  // 7 - 8
1035  float3 sh78 = make_float3(
1036  cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1037  cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1038  cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1039 
1040  float4 xyzq8 = xyzq[cl.i8];
1041 
1042  float3 r78 = make_float3(
1043  xyzq7.x + sh78.x - xyzq8.x,
1044  xyzq7.y + sh78.y - xyzq8.y,
1045  xyzq7.z + sh78.z - xyzq8.z);
1046 
1047  // Calculate the cross products
1048  float3 D = cross(r56, r67);
1049  float3 E = cross(r67, r78);
1050  float3 F = cross(r67, D);
1051 
1052  // Calculate the inverse distances
1053  float inv_rD = rsqrtf(dot(D, D));
1054  float inv_rE = rsqrtf(dot(E, E));
1055  float inv_rF = rsqrtf(dot(F, F));
1056 
1057  // Calculate the sin and cos
1058  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1059  float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1060 
1061  float psi = -atan2f(sin_psi,cos_psi);
1062 
1063  // ----------------------------------------------------------------------------
1064  // Calculate the energy
1065 
1066  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1067 
1068  // Shift angles
1069  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1070  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
1071 
1072  // distance measured in grid points between angle and smallest value
1073  float phi_h = phi * inv_h;
1074  float psi_h = psi * inv_h;
1075 
1076  // find smallest numbered grid point in stencil
1077  int iphi = (int)floor(phi_h);
1078  int ipsi = (int)floor(psi_h);
1079 
1080  const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1081 
1082  // Load coefficients
1083  float4 c[4];
1084  c[0] = cp.c[iphi][ipsi][0];
1085  c[1] = cp.c[iphi][ipsi][1];
1086  c[2] = cp.c[iphi][ipsi][2];
1087  c[3] = cp.c[iphi][ipsi][3];
1088 
1089  float dphi = phi_h - (float)iphi;
1090  float dpsi = psi_h - (float)ipsi;
1091 
1092  if (doEnergy) {
1093  float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1094  energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1095  energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1096  energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
1097  energy += energyf*cl.scale;
1098  }
1099 
1100  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1101  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1102  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1103  dEdphi *= cl.scale*inv_h;
1104 
1105  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1106  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1107  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1108  dEdpsi *= cl.scale*inv_h;
1109 
1110  // float normCross1 = dot(A, A);
1111  float square_r23 = dot(r23, r23);
1112  float norm_r23 = sqrtf(square_r23);
1113  float inv_square_r23 = 1.0f/square_r23;
1114  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1115  float ff2 = -dot(r12, r23)*inv_square_r23;
1116  float ff3 = -dot(r34, r23)*inv_square_r23;
1117  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
1118 
1119  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1120  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1121  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1122  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1123  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1124 
1125  T T_f1x, T_f1y, T_f1z;
1126  T T_f2x, T_f2y, T_f2z;
1127  T T_f3x, T_f3y, T_f3z;
1128  T T_f4x, T_f4y, T_f4z;
1129  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1130  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1131  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1132  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1133  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1134  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1135  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1136  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1137 
1138  float square_r67 = dot(r67, r67);
1139  float norm_r67 = sqrtf(square_r67);
1140  float inv_square_r67 = 1.0f/(square_r67);
1141  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1142  ff2 = -dot(r56, r67)*inv_square_r67;
1143  ff3 = -dot(r78, r67)*inv_square_r67;
1144  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1145 
1146  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1147  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1148  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1149  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1150  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1151 
1152  T T_f5x, T_f5y, T_f5z;
1153  T T_f6x, T_f6y, T_f6z;
1154  T T_f7x, T_f7y, T_f7z;
1155  T T_f8x, T_f8y, T_f8z;
1156  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1157  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1158  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1159  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1160  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1161  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1162  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1163  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1164 
1165  // Store virial
1166  if (doVirial) {
1167 #ifdef WRITE_FULL_VIRIALS
1168  float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1169  float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1170  virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1171  virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1172  virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1173  virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1174  virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1175  virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1176  virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1177  virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1178  virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1179  }
1180 #endif
1181 
1182 }
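
The dihedral pair (phi, psi) selects one bicubic patch of the dim x dim CMAP grid, with c[j].{x,y,z,w} holding the coefficients c_{0j}..c_{3j} of that patch. In LaTeX notation, the nested Horner passes evaluate

    E(\phi,\psi) = \mathrm{scale} \sum_{i=0}^{3}\sum_{j=0}^{3} c_{ij}\, d\phi^{i}\, d\psi^{j}, \qquad
    \frac{\partial E}{\partial \phi} = \frac{\mathrm{scale}}{h} \sum_{i,j} i\, c_{ij}\, d\phi^{i-1} d\psi^{j},

and symmetrically for \partial E/\partial\psi, where h = 2\pi/\mathrm{dim} is the grid spacing and (d\phi, d\psi) are the fractional offsets within the patch. The f1..f4 and f5..f8 blocks then distribute dE/dphi and dE/dpsi onto the eight atoms with the same cross-product construction as diheForce().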
template<bool doEnergy>
__forceinline__ __device__ void diheComp ( const CudaDihedralValue dihedralValues,
int  ic,
const float  sin_phi,
const float  cos_phi,
const float  scale,
float &  df,
double &  e 
)

Definition at line 728 of file ComputeBondedCUDAKernel.cu.

References CudaDihedralValue::delta, CudaDihedralValue::k, M_PI, and CudaDihedralValue::n.

729  {
730 
731  df = 0.0f;
732  if (doEnergy) e = 0.0;
733 
734  float phi = atan2f(sin_phi, cos_phi);
735 
736  bool lrep = true;
737  while (lrep) {
738  CudaDihedralValue dihedralValue = dihedralValues[ic];
739  dihedralValue.k *= scale;
740 
741  // The last term for this dihedral has the low bit of n set to 0
742  lrep = (dihedralValue.n & 1);
743  dihedralValue.n >>= 1;
744  if (dihedralValue.n) {
745  float nf = dihedralValue.n;
746  float x = nf*phi - dihedralValue.delta;
747  if (doEnergy) {
748  float sin_x, cos_x;
749  sincosf(x, &sin_x, &cos_x);
750  e += (double)(dihedralValue.k*(1.0f + cos_x));
751  df += (double)(nf*dihedralValue.k*sin_x);
752  } else {
753  float sin_x = sinf(x);
754  df += (double)(nf*dihedralValue.k*sin_x);
755  }
756  } else {
757  float diff = phi - dihedralValue.delta;
758  if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
759  if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
760  if (doEnergy) {
761  e += (double)(dihedralValue.k*diff*diff);
762  }
763  df -= 2.0f*dihedralValue.k*diff;
764  }
765  ic++;
766  }
767 
768 }
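
Each table entry contributes one term, and the low bit of n flags that another term for the same dihedral follows. In LaTeX notation, after the shift:

    n > 0: \quad E \mathrel{+}= k\,(1 + \cos(n\phi - \delta)), \qquad
    df \mathrel{+}= n\,k\,\sin(n\phi - \delta),

    n = 0: \quad E \mathrel{+}= k\,(\phi - \delta)^2, \qquad
    df \mathrel{-}= 2\,k\,(\phi - \delta),

with phi - delta wrapped into (-pi, pi] in the harmonic case, so df accumulates -dE/dphi across all terms (k arrives pre-multiplied by the per-dihedral scale).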
template<typename T , bool doEnergy, bool doVirial>
__device__ void diheForce ( const int  index,
const CudaDihedral *__restrict__  diheList,
const CudaDihedralValue *__restrict__  dihedralValues,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  force,
double &  energy,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStarts,
int *  forceListNexts,
ComputeBondedCUDAKernel::BondedVirial< double > &  virial 
)

Definition at line 772 of file ComputeBondedCUDAKernel.cu.

References force_to_float, CudaDihedral::i, CudaDihedral::ioffsetXYZ, CudaDihedral::itype, CudaDihedral::j, CudaDihedral::joffsetXYZ, CudaDihedral::k, CudaDihedral::l, CudaDihedral::loffsetXYZ, and CudaDihedral::scale.

785  {
786 
787  CudaDihedral dl = diheList[index];
788 
789  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
790  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
791  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
792 
793  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
794  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
795  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
796 
797  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
798  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
799  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
800 
801  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
802  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
803  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
804 
805  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
806  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
807  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
808 
809  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
810  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
811  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
812 
813  // A=F^G, B=H^G.
814  float ax = yij*zjk - zij*yjk;
815  float ay = zij*xjk - xij*zjk;
816  float az = xij*yjk - yij*xjk;
817  float bx = ylk*zjk - zlk*yjk;
818  float by = zlk*xjk - xlk*zjk;
819  float bz = xlk*yjk - ylk*xjk;
820 
821  float ra2 = ax*ax + ay*ay + az*az;
822  float rb2 = bx*bx + by*by + bz*bz;
823  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
824 
825  // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
826  // nlinear = nlinear + 1
827  // endif
828 
829  float rgr = 1.0f / rg;
830  float ra2r = 1.0f / ra2;
831  float rb2r = 1.0f / rb2;
832  float rabr = sqrtf(ra2r*rb2r);
833 
834  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
835  //
836  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
837  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
838  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
839  //
840  // Energy and derivative contributions.
841 
842  float df;
843  double e;
844  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
845 
846  if (doEnergy) energy += e;
847 
848  //
849  // Compute derivatives wrt cartesian coordinates.
850  //
851  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
852  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
853 
854  float fg = xij*xjk + yij*yjk + zij*zjk;
855  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
856  ra2r *= df;
857  rb2r *= df;
858  float fga = fg*ra2r*rgr;
859  float hgb = hg*rb2r*rgr;
860  float gaa = ra2r*rg;
861  float gbb = rb2r*rg;
862  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
863 
864  // Store forces
865  T T_fix, T_fiy, T_fiz;
866  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
867  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
868 
869  T dgx, dgy, dgz;
870  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
871  T T_fjx = dgx - T_fix;
872  T T_fjy = dgy - T_fiy;
873  T T_fjz = dgz - T_fiz;
874  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
875 
876  T dhx, dhy, dhz;
877  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
878  T T_fkx = -dhx - dgx;
879  T T_fky = -dhy - dgy;
880  T T_fkz = -dhz - dgz;
881  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
882  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
883 
884  // Store virial
885  if (doVirial) {
886 #ifdef WRITE_FULL_VIRIALS
887  float fix = ((float)T_fix)*force_to_float;
888  float fiy = ((float)T_fiy)*force_to_float;
889  float fiz = ((float)T_fiz)*force_to_float;
890  float fjx = ((float)dgx)*force_to_float;
891  float fjy = ((float)dgy)*force_to_float;
892  float fjz = ((float)dgz)*force_to_float;
893  float flx = ((float)dhx)*force_to_float;
894  float fly = ((float)dhy)*force_to_float;
895  float flz = ((float)dhz)*force_to_float;
896  virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
897  virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
898  virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
899  virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
900  virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
901  virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
902  virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
903  virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
904  virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
905 #endif
906  }
907 
908 }
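
With F = r_ij, G = r_jk and H = r_lk as in the comments, the listing uses the standard construction, in LaTeX notation,

    \vec{A} = \vec{F} \times \vec{G}, \quad
    \vec{B} = \vec{H} \times \vec{G}, \quad
    \cos\phi = \frac{\vec{A}\cdot\vec{B}}{|\vec{A}|\,|\vec{B}|}, \quad
    \sin\phi = \frac{|\vec{G}|\,(\vec{A}\cdot\vec{H})}{|\vec{A}|\,|\vec{B}|},

and distributes df = -dE/dphi from diheComp() onto the four atoms through the gaa/gbb/fga/hgb intermediates: the forces on i and l lie along A and B, and the j and k forces follow from momentum conservation along the i-j-k-l chain.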
__device__ __forceinline__ float dot ( const float3  v1,
const float3  v2 
)

Definition at line 918 of file ComputeBondedCUDAKernel.cu.

Referenced by crosstermForce(), ARestraint::GetDihe(), and AmberParm7Reader::parse_fortran_format().

918  {
919  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
920 }
template<typename T , bool doEnergy, bool doVirial>
__device__ void exclusionForce ( const int  index,
const CudaExclusion *__restrict__  exclusionList,
const float  r2_delta,
const int  r2_delta_expc,
cudaTextureObject_t  r2_table_tex,
cudaTextureObject_t  exclusionTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
T *__restrict__  forceSlow,
double &  energySlow,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStartsSlow,
int *  forceListNexts,
ComputeBondedCUDAKernel::BondedVirial< double > &  virialSlow 
)

Definition at line 492 of file ComputeBondedCUDAKernel.cu.

References __ldg, CudaExclusion::i, CudaExclusion::ioffsetXYZ, and CudaExclusion::j.

517  {
518 
519  CudaExclusion bl = exclusionList[index];
520 
521  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
522  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
523  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
524 
525  float4 xyzqi = xyzq[bl.i];
526  float4 xyzqj = xyzq[bl.j];
527 
528  float xij = xyzqi.x + shx - xyzqj.x;
529  float yij = xyzqi.y + shy - xyzqj.y;
530  float zij = xyzqi.z + shz - xyzqj.z;
531 
532  float r2 = xij*xij + yij*yij + zij*zij;
533  if (r2 < cutoff2) {
534  r2 += r2_delta;
535 
536  union { float f; int i; } r2i;
537  r2i.f = r2;
538  int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
539 
540 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
541  float r2_table_val = __ldg(&r2_table[table_i]);
542 #else
543  float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
544 #endif
545 
546  float diffa = r2 - r2_table_val;
547  float qq = xyzqi.w * xyzqj.w;
548 
549 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
550  float4 slow = __ldg(&exclusionTable[table_i]);
551 #else
552  float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
553 #endif
554 
555 
556  if (doEnergy) {
557  energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
558  }
559 
560  float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
561 
562  T T_fxij, T_fyij, T_fzij;
563  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
564  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
565  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
566 
567  // Store virial
568  if (doVirial) {
569 #ifdef WRITE_FULL_VIRIALS
570  float fxij = fSlow*xij;
571  float fyij = fSlow*yij;
572  float fzij = fSlow*zij;
573  virialSlow.xx = (fxij*xij);
574  virialSlow.xy = (fxij*yij);
575  virialSlow.xz = (fxij*zij);
576  virialSlow.yx = (fyij*xij);
577  virialSlow.yy = (fyij*yij);
578  virialSlow.yz = (fyij*zij);
579  virialSlow.zx = (fzij*xij);
580  virialSlow.zy = (fzij*yij);
581  virialSlow.zz = (fzij*zij);
582 #endif
583  }
584  }
585 }
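
The union read above indexes the exclusion table directly from the bit pattern of r2: shifting the IEEE-754 representation right by 17 keeps the exponent and the top 6 mantissa bits, giving 64 interpolation intervals per power of two of r2, while r2_delta_expc re-bases the exponent to the first table entry. A minimal host-side sketch of the same indexing (memcpy is the strict-aliasing-safe analogue of the union):

#include <stdint.h>
#include <string.h>

// Exponent-plus-mantissa table index; r2_delta_expc is the offset fixed
// when the table is built, exactly as in the kernel above.
int tableIndex(float r2, int r2_delta_expc) {
  int32_t bits;
  memcpy(&bits, &r2, sizeof(bits));
  return (bits >> 17) + r2_delta_expc;
}
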
template<typename T >
__global__ void gatherBondedForcesKernel ( const int  start,
const int  atomStorageSize,
const int  stride,
const T *__restrict__  forceList,
const int *__restrict__  forceListStarts,
const int *__restrict__  forceListNexts,
T *__restrict__  force 
)

Definition at line 1779 of file ComputeBondedCUDAKernel.cu.


1786  {
1787 
1788  int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
1789 
1790  if (i < atomStorageSize) {
1791  T fx = 0;
1792  T fy = 0;
1793  T fz = 0;
1794  int pos = forceListStarts[i];
1795  while (pos != -1) {
1796  fx += forceList[pos * 3 + 0];
1797  fy += forceList[pos * 3 + 1];
1798  fz += forceList[pos * 3 + 2];
1799  pos = forceListNexts[pos];
1800  }
1801  force[i ] = fx;
1802  force[i + stride ] = fy;
1803  force[i + stride * 2] = fz;
1804  }
1805 }
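
This kernel folds down the per-atom singly linked lists built by storeForces() when USE_BONDED_FORCE_ATOMIC_STORE is disabled: forceListStarts[i] is the head record for atom i (or -1), forceListNexts chains the remaining records, and each record is an x,y,z triple in forceList. A host-side sketch of the same traversal under those assumptions:

#include <vector>

// Sum every force record chained to atom i; mirrors the kernel's
// while (pos != -1) walk over the per-atom list.
template <typename T>
void gatherOne(int i, const std::vector<T>& forceList,
               const std::vector<int>& forceListStarts,
               const std::vector<int>& forceListNexts, T out[3]) {
  out[0] = out[1] = out[2] = T(0);
  for (int pos = forceListStarts[i]; pos != -1; pos = forceListNexts[pos]) {
    out[0] += forceList[pos * 3 + 0];
    out[1] += forceList[pos * 3 + 1];
    out[2] += forceList[pos * 3 + 2];
  }
}
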
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce ( const int  index,
const CudaExclusion *__restrict__  exclusionList,
const bool  doSlow,
const float  one_scale14,
const int  vdwCoefTableWidth,
cudaTextureObject_t  vdwCoefTableTex,
cudaTextureObject_t  forceTableTex,
cudaTextureObject_t  energyTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
double &  energyVdw,
T *__restrict__  forceNbond,
double &  energyNbond,
T *__restrict__  forceSlow,
double &  energySlow,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStartsNbond,
int *  forceListStartsSlow,
int *  forceListNexts,
ComputeBondedCUDAKernel::BondedVirial< double > &  virialNbond,
ComputeBondedCUDAKernel::BondedVirial< double > &  virialSlow 
)

Definition at line 352 of file ComputeBondedCUDAKernel.cu.

References __ldg, energyTableTex, forceTableTex, CudaExclusion::i, CudaExclusion::ioffsetXYZ, CudaExclusion::j, sampleTableTex(), vdwCoefTableTex, vdwCoefTableWidth, CudaExclusion::vdwtypei, CudaExclusion::vdwtypej, float2::x, ComputeBondedCUDAKernel::BondedVirial< T >::xx, ComputeBondedCUDAKernel::BondedVirial< T >::xy, ComputeBondedCUDAKernel::BondedVirial< T >::xz, float2::y, ComputeBondedCUDAKernel::BondedVirial< T >::yx, ComputeBondedCUDAKernel::BondedVirial< T >::yy, ComputeBondedCUDAKernel::BondedVirial< T >::yz, ComputeBondedCUDAKernel::BondedVirial< T >::zx, ComputeBondedCUDAKernel::BondedVirial< T >::zy, and ComputeBondedCUDAKernel::BondedVirial< T >::zz.

380  {
381 
382  CudaExclusion bl = exclusionList[index];
383 
384  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
385  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
386  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
387 
388  float4 xyzqi = xyzq[bl.i];
389  float4 xyzqj = xyzq[bl.j];
390 
391  float xij = xyzqi.x + shx - xyzqj.x;
392  float yij = xyzqi.y + shy - xyzqj.y;
393  float zij = xyzqi.z + shz - xyzqj.z;
394 
395  float r2 = xij*xij + yij*yij + zij*zij;
396  if (r2 < cutoff2) {
397 
398  float rinv = rsqrtf(r2);
399 
400  float qq;
401  if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
402 
403  int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
404 #if __CUDA_ARCH__ >= 350
405  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
406 #else
407  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
408 #endif
409 
410 #if defined(NAMD_CUDA)
411  float4 fi = tex1D<float4>(forceTableTex, rinv);
412 #else // NAMD_HIP
413  float4 fi = sampleTableTex(forceTableTex, rinv);
414 #endif
415  float4 ei;
416  if (doEnergy) {
417 #if defined NAMD_CUDA
418  ei = tex1D<float4>(energyTableTex, rinv);
419 #else
420  ei = sampleTableTex(energyTableTex, rinv);
421 #endif
422  energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
423  if (doElect) {
424  energyNbond += qq * ei.x;
425  if (doSlow) energySlow += qq * ei.w;
426  }
427  }
428 
429  float fNbond;
430  if (doElect) {
431  fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
432  } else {
433  fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
434  }
435  T T_fxij, T_fyij, T_fzij;
436  calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
437  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
438  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
439 
440  float fSlow;
441  if (doSlow && doElect) {
442  fSlow = -qq * fi.w;
443  T T_fxij, T_fyij, T_fzij;
444  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
445  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
446  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
447  }
448 
449  // Store virial
450  if (doVirial) {
451 #ifdef WRITE_FULL_VIRIALS
452  float fxij = fNbond*xij;
453  float fyij = fNbond*yij;
454  float fzij = fNbond*zij;
455  virialNbond.xx = (fxij*xij);
456  virialNbond.xy = (fxij*yij);
457  virialNbond.xz = (fxij*zij);
458  virialNbond.yx = (fyij*xij);
459  virialNbond.yy = (fyij*yij);
460  virialNbond.yz = (fyij*zij);
461  virialNbond.zx = (fzij*xij);
462  virialNbond.zy = (fzij*yij);
463  virialNbond.zz = (fzij*zij);
464 #endif
465  }
466 
467  // Store virial
468  if (doVirial && doSlow && doElect) {
469 #ifdef WRITE_FULL_VIRIALS
470  float fxij = fSlow*xij;
471  float fyij = fSlow*yij;
472  float fzij = fSlow*zij;
473  virialSlow.xx = (fxij*xij);
474  virialSlow.xy = (fxij*yij);
475  virialSlow.xz = (fxij*zij);
476  virialSlow.yx = (fyij*xij);
477  virialSlow.yy = (fyij*yij);
478  virialSlow.yz = (fyij*zij);
479  virialSlow.zx = (fzij*xij);
480  virialSlow.zy = (fzij*yij);
481  virialSlow.zz = (fzij*zij);
482 #endif
483  }
484 
485  }
486 }
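
Each float4 table sample here packs four interaction channels, as read off the kernel body: fi.x/ei.x the fast electrostatic term, fi.y and fi.z the two Lennard-Jones terms weighted by ljab.y and ljab.x, and fi.w/ei.w the slow (long-range correction) electrostatic term. A sketch of the per-pair scalar force assembly under that inferred channel assignment (names illustrative):

struct Float4 { float x, y, z, w; };

// Scalar force factors for one excluded pair; these multiply (xij, yij, zij)
// afterwards, exactly as calcComponentForces() does in the kernel.
void pairForceFactors(Float4 fi, float ljA, float ljB, float qq,
                      bool doElect, bool doSlow,
                      float& fNbond, float& fSlow) {
  fNbond = -(ljA * fi.z + ljB * fi.y + (doElect ? qq * fi.x : 0.0f));
  fSlow  = (doElect && doSlow) ? -qq * fi.w : 0.0f;
}
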
template<typename T , bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel ( const int  start,
const int  numModifiedExclusions,
const CudaExclusion *__restrict__  modifiedExclusions,
const bool  doSlow,
const float  one_scale14,
const float  cutoff2,
const int  vdwCoefTableWidth,
const float2 *__restrict__  vdwCoefTable,
cudaTextureObject_t  vdwCoefTableTex,
cudaTextureObject_t  modifiedExclusionForceTableTex,
cudaTextureObject_t  modifiedExclusionEnergyTableTex,
const float4 *__restrict__  xyzq,
const int  stride,
const float3  lata,
const float3  latb,
const float3  latc,
T *__restrict__  forceNbond,
T *__restrict__  forceSlow,
T *__restrict__  forceList,
int *  forceListCounter,
int *  forceListStartsNbond,
int *  forceListStartsSlow,
int *  forceListNexts,
double *__restrict__  energies_virials 
)

Definition at line 1504 of file ComputeBondedCUDAKernel.cu.

References ATOMIC_BINS, BLOCK_SYNC, BONDEDFORCESKERNEL_NUM_WARP, cutoff2, ComputeBondedCUDAKernel::energies_virials_SIZE, ComputeBondedCUDAKernel::energyIndex_ELECT, ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW, ComputeBondedCUDAKernel::energyIndex_LJ, lata, latb, latc, ComputeBondedCUDAKernel::nbondVirialIndex_XX, ComputeBondedCUDAKernel::nbondVirialIndex_XY, ComputeBondedCUDAKernel::nbondVirialIndex_XZ, ComputeBondedCUDAKernel::nbondVirialIndex_YX, ComputeBondedCUDAKernel::nbondVirialIndex_YY, ComputeBondedCUDAKernel::nbondVirialIndex_YZ, ComputeBondedCUDAKernel::nbondVirialIndex_ZX, ComputeBondedCUDAKernel::nbondVirialIndex_ZY, ComputeBondedCUDAKernel::nbondVirialIndex_ZZ, ComputeBondedCUDAKernel::slowVirialIndex_XX, ComputeBondedCUDAKernel::slowVirialIndex_XY, ComputeBondedCUDAKernel::slowVirialIndex_XZ, ComputeBondedCUDAKernel::slowVirialIndex_YX, ComputeBondedCUDAKernel::slowVirialIndex_YY, ComputeBondedCUDAKernel::slowVirialIndex_YZ, ComputeBondedCUDAKernel::slowVirialIndex_ZX, ComputeBondedCUDAKernel::slowVirialIndex_ZY, ComputeBondedCUDAKernel::slowVirialIndex_ZZ, vdwCoefTable, vdwCoefTableTex, vdwCoefTableWidth, WARP_FULL_MASK, WARP_SHUFFLE_XOR, WARPSIZE, ComputeBondedCUDAKernel::BondedVirial< T >::xx, ComputeBondedCUDAKernel::BondedVirial< T >::xy, xyzq, ComputeBondedCUDAKernel::BondedVirial< T >::xz, ComputeBondedCUDAKernel::BondedVirial< T >::yx, ComputeBondedCUDAKernel::BondedVirial< T >::yy, ComputeBondedCUDAKernel::BondedVirial< T >::yz, ComputeBondedCUDAKernel::BondedVirial< T >::zx, ComputeBondedCUDAKernel::BondedVirial< T >::zy, and ComputeBondedCUDAKernel::BondedVirial< T >::zz.

1527  {
1528 
1529  // index
1530  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
1531 
1532  double energyVdw, energyNbond, energySlow;
1533  if (doEnergy) {
1534  energyVdw = 0.0;
1535  if (doElect) {
1536  energyNbond = 0.0;
1537  energySlow = 0.0;
1538  }
1539  }
1540 
1541 #ifdef WRITE_FULL_VIRIALS
1542  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
1543  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
1544  if (doVirial) {
1545  virialNbond.xx = 0.0;
1546  virialNbond.xy = 0.0;
1547  virialNbond.xz = 0.0;
1548  virialNbond.yx = 0.0;
1549  virialNbond.yy = 0.0;
1550  virialNbond.yz = 0.0;
1551  virialNbond.zx = 0.0;
1552  virialNbond.zy = 0.0;
1553  virialNbond.zz = 0.0;
1554  if (doElect) {
1555  virialSlow.xx = 0.0;
1556  virialSlow.xy = 0.0;
1557  virialSlow.xz = 0.0;
1558  virialSlow.yx = 0.0;
1559  virialSlow.yy = 0.0;
1560  virialSlow.yz = 0.0;
1561  virialSlow.zx = 0.0;
1562  virialSlow.zy = 0.0;
1563  virialSlow.zz = 0.0;
1564  }
1565  }
1566 #endif
1567 
1568  if (i < numModifiedExclusions)
1569  {
1570  modifiedExclusionForce<T, doEnergy, doVirial, doElect>
1571  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
1572 #if __CUDA_ARCH__ >= 350
1573  vdwCoefTable,
1574 #else
1575  vdwCoefTableTex,
1576 #endif
1577  modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
1578  xyzq, stride, lata, latb, latc, cutoff2,
1579  energyVdw, forceNbond, energyNbond,
1580  forceSlow, energySlow,
1581  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
1582  virialNbond, virialSlow);
1583  }
1584 
1585  // Write energies to global memory
1586  if (doEnergy) {
1587  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
1588  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1589  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1590  // JM: Each warp reduce its energy, adds it to the shared memory array
1591  // and warp zero reduces the shared memory on all warps in the block and adds it to gmem
1592 #pragma unroll
1593  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1594  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
1595  if (doElect) {
1596  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
1597  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
1598  }
1599  }
1600  int laneID = (threadIdx.x & (WARPSIZE - 1));
1601  int warpID = threadIdx.x / WARPSIZE;
1602  if (laneID == 0) {
1603  shEnergyVdw[warpID] = energyVdw;
1604  if (doElect) {
1605  shEnergyNbond[warpID] = energyNbond;
1606  shEnergySlow[warpID] = energySlow;
1607  }
1608  }
1609  BLOCK_SYNC;
1610  if (warpID == 0) {
1611  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
1612  if (doElect) {
1613  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
1614  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
1615  }
1616 #pragma unroll
1617  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1618  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
1619  if (doElect) {
1620  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
1621  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
1622  }
1623  }
1624  if (laneID == 0) {
1625  const int bin = blockIdx.x % ATOMIC_BINS;
1626  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
1627  if (doElect) {
1628  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
1629  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
1630  }
1631  }
1632  }
1633  }
1634 
1635  // Write virials to global memory
1636 #ifdef WRITE_FULL_VIRIALS
1637  if (doVirial) {
1638 #pragma unroll
1639  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1640  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
1641  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
1642  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
1643  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
1644  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
1645  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
1646  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
1647  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
1648  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
1649  if (doElect && doSlow) {
1650  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
1651  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
1652  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
1653  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
1654  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
1655  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
1656  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
1657  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
1658  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
1659  }
1660  }
1661  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
1662  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1663  int laneID = (threadIdx.x & (WARPSIZE - 1));
1664  int warpID = threadIdx.x / WARPSIZE;
1665  if (laneID == 0) {
1666  shVirialNbond[warpID] = virialNbond;
1667  if (doElect && doSlow) {
1668  shVirialSlow[warpID] = virialSlow;
1669  }
1670  }
1671  BLOCK_SYNC;
1672 
1673  virialNbond.xx = 0.0;
1674  virialNbond.xy = 0.0;
1675  virialNbond.xz = 0.0;
1676  virialNbond.yx = 0.0;
1677  virialNbond.yy = 0.0;
1678  virialNbond.yz = 0.0;
1679  virialNbond.zx = 0.0;
1680  virialNbond.zy = 0.0;
1681  virialNbond.zz = 0.0;
1682  if (doElect && doSlow) {
1683  virialSlow.xx = 0.0;
1684  virialSlow.xy = 0.0;
1685  virialSlow.xz = 0.0;
1686  virialSlow.yx = 0.0;
1687  virialSlow.yy = 0.0;
1688  virialSlow.yz = 0.0;
1689  virialSlow.zx = 0.0;
1690  virialSlow.zy = 0.0;
1691  virialSlow.zz = 0.0;
1692  }
1693 
1694  if (warpID == 0) {
1695  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
1696  virialNbond = shVirialNbond[laneID];
1697  if (doElect && doSlow) {
1698  virialSlow = shVirialSlow[laneID];
1699  }
1700  }
1701 #pragma unroll
1702  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1703  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
1704  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
1705  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
1706  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
1707  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
1708  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
1709  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
1710  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
1711  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
1712  if (doElect && doSlow) {
1713  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
1714  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
1715  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
1716  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
1717  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
1718  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
1719  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
1720  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
1721  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
1722  }
1723  }
1724 
1725  if (laneID == 0)
1726  {
1727 #ifdef USE_FP_VIRIAL
1728  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], llitoulli(virialNbond.xx*double_to_virial));
1729  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], llitoulli(virialNbond.xy*double_to_virial));
1730  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], llitoulli(virialNbond.xz*double_to_virial));
1731  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], llitoulli(virialNbond.yx*double_to_virial));
1732  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], llitoulli(virialNbond.yy*double_to_virial));
1733  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], llitoulli(virialNbond.yz*double_to_virial));
1734  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], llitoulli(virialNbond.zx*double_to_virial));
1735  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], llitoulli(virialNbond.zy*double_to_virial));
1736  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], llitoulli(virialNbond.zz*double_to_virial));
1737  if (doElect && doSlow) {
1738  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], llitoulli(virialSlow.xx*double_to_virial));
1739  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], llitoulli(virialSlow.xy*double_to_virial));
1740  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], llitoulli(virialSlow.xz*double_to_virial));
1741  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], llitoulli(virialSlow.yx*double_to_virial));
1742  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], llitoulli(virialSlow.yy*double_to_virial));
1743  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], llitoulli(virialSlow.yz*double_to_virial));
1744  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], llitoulli(virialSlow.zx*double_to_virial));
1745  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], llitoulli(virialSlow.zy*double_to_virial));
1746  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], llitoulli(virialSlow.zz*double_to_virial));
1747  }
1748 #else
1749  const int bin = blockIdx.x % ATOMIC_BINS;
1750  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
1751  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
1752  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
1753  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
1754  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
1755  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
1756  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
1757  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
1758  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
1759  if (doElect && doSlow) {
1760  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
1761  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
1762  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
1763  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
1764  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
1765  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
1766  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
1767  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
1768  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
1769  }
1770 #endif
1771  }
1772  }
1773  }
1774 #endif
1775 
1776 }
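
The energy and virial reductions above use a butterfly (XOR) pattern: shuffling by strides WARPSIZE/2 down to 1 leaves every lane of a warp holding the warp-wide sum; lane 0 of each warp then stages its value in shared memory, and warp 0 repeats the pattern across the BONDEDFORCESKERNEL_NUM_WARP partial sums before issuing one atomicAdd per block. The plain-CUDA equivalent of a single warp pass (WARP_SHUFFLE_XOR is NAMD's portability macro over __shfl_xor_sync; a 32-lane warp is assumed here):

__device__ double warpSum(double v) {
  // After log2(32) = 5 butterfly steps every lane holds the full warp sum.
  for (int offset = 32 / 2; offset >= 1; offset /= 2)
    v += __shfl_xor_sync(0xffffffffu, v, offset);
  return v;
}
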
__global__ void reduceBondedBinsKernel ( double *  energies_virials)

Definition at line 1807 of file ComputeBondedCUDAKernel.cu.

References ATOMIC_BINS, ComputeBondedCUDAKernel::energies_virials_SIZE, and tempStorage.

1807  {
1808  const int bin = threadIdx.x;
1809 
1810  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
1811  __shared__ typename WarpReduce::TempStorage tempStorage;
1812 
1813  double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
1814  if (threadIdx.x == 0) {
1815  energies_virials[blockIdx.x] = v;
1816  }
1817 }
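
Each block here reduces the ATOMIC_BINS staged copies of one accumulator slot into the slot itself, so the natural launch is one block per slot and one thread per bin. A hypothetical host-side launch (a sketch; d_energies_virials and stream are assumed names, and the real call site lives in the host wrapper, not shown here):

reduceBondedBinsKernel<<<ComputeBondedCUDAKernel::energies_virials_SIZE,
                         ATOMIC_BINS, 0, stream>>>(d_energies_virials);
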
__device__ __forceinline__ float4 sampleTableTex ( cudaTextureObject_t  tex,
float  k 
)

Definition at line 30 of file ComputeBondedCUDAKernel.cu.

References FORCE_ENERGY_TABLE_SIZE, and x.

Referenced by calcForceEnergy(), and modifiedExclusionForce().

30  {
31  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
32  const float x = k * (float)tableSize - 0.5f;
33  const float f = floorf(x);
34  const float a = x - f;
35  const unsigned int i = (unsigned int)f;
36  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
37  const int i1 = i0 + 1;
38  const float4 t0 = tex1Dfetch<float4>(tex, i0);
39  const float4 t1 = tex1Dfetch<float4>(tex, i1);
40  return make_float4(
41  a * (t1.x - t0.x) + t0.x,
42  a * (t1.y - t0.y) + t0.y,
43  a * (t1.z - t0.z) + t0.z,
44  a * (t1.w - t0.w) + t0.w);
45 }
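
sampleTableTex() interpolates by hand, serving as the substitute for the hardware-filtered tex1D() fetch used on the CUDA path (see modifiedExclusionForce() above): it maps k to a texel coordinate, splits off the fractional weight, and lerps the two neighboring entries. A CPU sketch of the same lookup over a plain array (this sketch additionally clamps the upper index, which the texture fetch handles through its own out-of-range semantics):

#include <math.h>

struct F4 { float x, y, z, w; };

// Host analogue of the device lookup: integer part selects the texel pair,
// fractional part a is the linear interpolation weight.
F4 sampleTable(const F4* table, int n, float k) {
  float xf = k * (float)n - 0.5f;
  float fl = floorf(xf);
  float a = xf - fl;
  int i0 = (int)fl;
  if (i0 < 0) i0 = 0;
  if (i0 > n - 1) i0 = n - 1;
  int i1 = (i0 + 1 < n) ? i0 + 1 : n - 1;
  F4 t0 = table[i0], t1 = table[i1];
  F4 r = { a*(t1.x - t0.x) + t0.x, a*(t1.y - t0.y) + t0.y,
           a*(t1.z - t0.z) + t0.z, a*(t1.w - t0.w) + t0.w };
  return r;
}
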
template<typename T >
__forceinline__ __device__ void storeForces ( const T  fx,
const T  fy,
const T  fz,
const int  ind,
const int  stride,
T *  force,
T *  forceList,
int *  forceListCounter,
int *  forceListStarts,
int *  forceListNexts 
)

Definition at line 189 of file ComputeBondedCUDAKernel.cu.

References WARP_ALL, WARP_BALLOT, WARP_FULL_MASK, WARP_SHUFFLE, warpAggregatingAtomicInc(), and WARPSIZE.

193  {
194 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
195 #if defined(NAMD_HIP)
196  // Try to decrease conflicts between lanes if there are repeating ind
197  const WarpMask mask = WARP_BALLOT(WARP_FULL_MASK, 1);
198  if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
199  const int laneId = threadIdx.x % WARPSIZE;
200  const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
201  const bool isHead = laneId == 0 || ind != prevLaneInd;
202  if (!WARP_ALL(WARP_FULL_MASK, isHead)) {
203  // There are segments of repeating ind
204  typedef cub::WarpReduce<T> WarpReduce;
205  __shared__ typename WarpReduce::TempStorage temp_storage;
206  const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
207  const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
208  const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
209  if (isHead) {
210  atomicAdd(&force[ind ], sumfx);
211  atomicAdd(&force[ind + stride ], sumfy);
212  atomicAdd(&force[ind + stride*2], sumfz);
213  }
214  return;
215  }
216  }
217  // Not all lanes are active (the last block) or there is no repeating ind
218  atomicAdd(&force[ind ], fx);
219  atomicAdd(&force[ind + stride ], fy);
220  atomicAdd(&force[ind + stride*2], fz);
221 #else
222  atomicAdd(&force[ind ], fx);
223  atomicAdd(&force[ind + stride ], fy);
224  atomicAdd(&force[ind + stride*2], fz);
225 #endif
226 #else
227  const int newPos = warpAggregatingAtomicInc(forceListCounter);
228  forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
229  forceList[newPos * 3 + 0] = fx;
230  forceList[newPos * 3 + 1] = fy;
231  forceList[newPos * 3 + 2] = fz;
232 #endif
233 }
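
storeForces() thus has two accumulation strategies: with USE_BONDED_FORCE_ATOMIC_STORE, contributions go straight into the strided force array via atomicAdd (on HIP, runs of equal ind within a warp are first coalesced with a head-segmented sum so each segment issues one atomic); otherwise every contribution is prepended to a per-atom linked list that gatherBondedForcesKernel() later folds down. A host-side sketch of the linked-list push (on the device the head swap is an atomicExch and the counter bump is warpAggregatingAtomicInc()):

#include <vector>

// Prepend one force record to atom ind's list and return its slot.
int pushForce(int ind, float fx, float fy, float fz,
              std::vector<float>& forceList, std::vector<int>& forceListStarts,
              std::vector<int>& forceListNexts, int& forceListCounter) {
  int newPos = forceListCounter++;               // warp-aggregated atomic on the GPU
  forceListNexts[newPos] = forceListStarts[ind]; // link to the previous head
  forceListStarts[ind] = newPos;                 // this record becomes the head
  forceList[newPos * 3 + 0] = fx;
  forceList[newPos * 3 + 1] = fy;
  forceList[newPos * 3 + 2] = fz;
  return newPos;
}
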
__forceinline__ __device__ int warpAggregatingAtomicInc ( int *  counter)

Definition at line 167 of file ComputeBondedCUDAKernel.cu.

References WARP_SHUFFLE, and WARPSIZE.

Referenced by storeForces().

167  {
168 #if WARPSIZE == 64
169  WarpMask mask = __ballot(1);
170  int total = __popcll(mask);
171  int prefix = __popcll(mask & cub::LaneMaskLt());
172  int firstLane = __ffsll(mask) - 1;
173 #else
174  WarpMask mask = __activemask();
175  int total = __popc(mask);
176  int prefix = __popc(mask & cub::LaneMaskLt());
177  int firstLane = __ffs(mask) - 1;
178 #endif
179  int start = 0;
180  if (prefix == 0) {
181  start = atomicAdd(counter, total);
182  }
183  start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
184  return start + prefix;
185 }
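
The aggregation turns up to WARPSIZE per-lane atomicAdd(counter, 1) calls into a single atomicAdd of the active-lane count, with each active lane receiving the base plus its rank among active lanes; this is what lets storeForces() append records contiguously. An illustrative stream-compaction kernel built on it (a sketch, not NAMD code):

// Compact the nonzero elements of in[] into out[]; one atomic per warp
// instead of one per thread, and each active lane gets a distinct slot.
__global__ void compactNonzero(const int* in, int n, int* out, int* counter) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && in[i] != 0) {   // divergent: only some lanes take this branch
    int pos = warpAggregatingAtomicInc(counter);
    out[pos] = in[i];
  }
}
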

Variable Documentation

__thread DeviceCUDA* deviceCUDA