ComputeBondedCUDAKernel.cu

#ifdef WIN32
#define _USE_MATH_DEFINES
#define __thread __declspec(thread)
#endif
#include <math.h>
#include <cuda.h>
#include <cub/cub.cuh>
#include "ComputeBondedCUDAKernel.h"

#ifdef FUTURE_CUDADEVICE
#include "CudaDevice.h"
#else
#include "DeviceCUDA.h"
extern __thread DeviceCUDA *deviceCUDA;
#endif
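
// Helper PTX wrappers: lliroundf() converts a float to a signed 64-bit
// integer with round-to-nearest-even (cvt.rni.s64.f32), and llitoulli()
// reinterprets the bits of a signed 64-bit integer as unsigned so the
// value can be passed to atomicAdd(unsigned long long int*, ...).
// Together they let float forces be accumulated in fixed point, where
// integer addition is exact and independent of thread ordering.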

__device__ inline long long int lliroundf(float f)
{
    long long int l;
    asm("cvt.rni.s64.f32  %0, %1;" : "=l"(l) : "f"(f));
    return l;
}

__device__ inline unsigned long long int llitoulli(long long int l)
{
    unsigned long long int u;
    asm("mov.b64    %0, %1;" : "=l"(u) : "l"(l));
    return u;
}
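
// Force conversion helpers. The generic convertForces() is a plain cast;
// the long long int specialization scales by float_to_force and rounds,
// storing the force in fixed point. A minimal sketch of the round trip
// (float_to_force and force_to_float are the scaling constants used
// elsewhere in this file, declared in the accompanying headers):
//
//   long long int acc = lliroundf(1.5f * float_to_force);  // encode
//   float back = ((float)acc) * force_to_float;            // decode, ~1.5f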

template <typename T>
__forceinline__ __device__
void convertForces(const float x, const float y, const float z,
  T &Tx, T &Ty, T &Tz) {

  Tx = (T)(x);
  Ty = (T)(y);
  Tz = (T)(z);
}

template <>
__forceinline__ __device__
void convertForces<long long int>(const float x, const float y, const float z,
  long long int &Tx, long long int &Ty, long long int &Tz) {

  Tx = lliroundf(x*float_to_force);
  Ty = lliroundf(y*float_to_force);
  Tz = lliroundf(z*float_to_force);
}

template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij,
  const float dx, const float dy, const float dz,
  T &fxij, T &fyij, T &fzij) {

  fxij = (T)(fij*dx);
  fyij = (T)(fij*dy);
  fzij = (T)(fij*dz);

}

template <>
__forceinline__ __device__
void calcComponentForces<long long int>(
  float fij,
  const float dx, const float dy, const float dz,
  long long int &fxij,
  long long int &fyij,
  long long int &fzij) {

  fij *= float_to_force;
  fxij = lliroundf(fij*dx);
  fyij = lliroundf(fij*dy);
  fzij = lliroundf(fij*dz);

}

template <typename T>
__forceinline__ __device__
void calcComponentForces(
  float fij1,
  const float dx1, const float dy1, const float dz1,
  float fij2,
  const float dx2, const float dy2, const float dz2,
  T &fxij, T &fyij, T &fzij) {

  fxij = (T)(fij1*dx1 + fij2*dx2);
  fyij = (T)(fij1*dy1 + fij2*dy2);
  fzij = (T)(fij1*dz1 + fij2*dz2);
}

template <>
__forceinline__ __device__
void calcComponentForces<long long int>(
  float fij1,
  const float dx1,
  const float dy1,
  const float dz1,
  float fij2,
  const float dx2,
  const float dy2,
  const float dz2,
  long long int &fxij,
  long long int &fyij,
  long long int &fzij) {

  fij1 *= float_to_force;
  fij2 *= float_to_force;
  fxij = lliroundf(fij1*dx1 + fij2*dx2);
  fyij = lliroundf(fij1*dy1 + fij2*dy2);
  fzij = lliroundf(fij1*dz1 + fij2*dz2);
}

template <typename T>
__forceinline__ __device__
void storeForces(const T fx, const T fy, const T fz,
     const int ind, const int stride,
     T* force) {
  // The generic version is a stub and must never be called;
  // only the long long int specialization below is used.
}

// Template specialization for 64bit integer = "long long int"
template <>
__forceinline__ __device__
void storeForces <long long int> (const long long int fx,
          const long long int fy,
          const long long int fz,
          const int ind, const int stride,
          long long int* force) {
  atomicAdd((unsigned long long int *)&force[ind           ], llitoulli(fx));
  atomicAdd((unsigned long long int *)&force[ind + stride  ], llitoulli(fy));
  atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
}
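
// Forces are kept in structure-of-arrays layout: the x, y, and z
// components of atom ind live at force[ind], force[ind + stride], and
// force[ind + stride*2]. Accumulation goes through 64-bit integer
// atomicAdd via llitoulli(), so the sum is exact for any thread order.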

//
// Calculates bonds
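// Harmonic bond: V = k*scale*(r - x0)^2. Entries with x0 == 0 are
// skipped; this appears to be how placeholder bonds are disabled.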
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void bondForce(
  const int index,
  const CudaBond* __restrict__ bondList,
  const CudaBondValue* __restrict__ bondValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
#endif
  ) {

  CudaBond bl = bondList[index];
  CudaBondValue bondValue = bondValues[bl.itype];
  if (bondValue.x0 == 0.0f) return;

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r = sqrtf(xij*xij + yij*yij + zij*zij);

  float db = r - bondValue.x0;
  float fij = db * bondValue.k * bl.scale;
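  // fij currently holds k*scale*db, so fij*db below is the harmonic
  // energy; rescaling by -2/r then turns fij into the force magnitude
  // divided by r, ready to multiply the displacement components.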

  if (doEnergy) {
    energy += (double)(fij*db);
  }
  fij *= -2.0f/r;

  T T_fxij, T_fyij, T_fzij;
  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);

  // Store forces
  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force);
  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxij = fij*xij;
    float fyij = fij*yij;
    float fzij = fij*zij;
    virial.xx = (fxij*xij);
    virial.xy = (fxij*yij);
    virial.xz = (fxij*zij);
    virial.yx = (fyij*xij);
    virial.yy = (fyij*yij);
    virial.yz = (fyij*zij);
    virial.zx = (fzij*xij);
    virial.zy = (fzij*yij);
    virial.zz = (fzij*zij);
#endif
  }
}

//
// Calculates modified exclusions
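// (1-4 pairs): Lennard-Jones with the special 1-4 parameters plus
// electrostatics scaled by one_scale14 = 1 - scale14. Force and energy
// factors come from texture tables indexed by 1/r.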
//
template <typename T, bool doEnergy, bool doVirial, bool doElect>
__device__ void modifiedExclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const bool doSlow,
  const float one_scale14,                // 1 - scale14
  const int vdwCoefTableWidth,
#if __CUDA_ARCH__ >= 350
  const float2* __restrict__ vdwCoefTable,
#else
  cudaTextureObject_t vdwCoefTableTex,
#endif
  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  double &energyVdw,
  T* __restrict__ forceNbond, double &energyNbond,
  T* __restrict__ forceSlow, double &energySlow,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
#endif
  ) {

  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {

    float rinv = rsqrtf(r2);

    float qq;
    if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;

    int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
#if __CUDA_ARCH__ >= 350
    float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
#else
    float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex);
#endif

    float4 fi = tex1D<float4>(forceTableTex, rinv);
    float4 ei;
    if (doEnergy) {
      ei = tex1D<float4>(energyTableTex, rinv);
      energyVdw   += (double)(ljab.x * ei.z + ljab.y * ei.y);
      if (doElect) {
        energyNbond += qq * ei.x;
        if (doSlow) energySlow  += qq * ei.w;
      }
    }

    float fNbond;
    if (doElect) {
      fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
    } else {
      fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
    }
    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond);

    float fSlow;
    if (doSlow && doElect) {
      fSlow = -qq * fi.w;
      T T_fxij, T_fyij, T_fzij;
      calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
      storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
      storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);
    }

    // Store nonbonded virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fNbond*xij;
      float fyij = fNbond*yij;
      float fzij = fNbond*zij;
      virialNbond.xx = (fxij*xij);
      virialNbond.xy = (fxij*yij);
      virialNbond.xz = (fxij*zij);
      virialNbond.yx = (fyij*xij);
      virialNbond.yy = (fyij*yij);
      virialNbond.yz = (fyij*zij);
      virialNbond.zx = (fzij*xij);
      virialNbond.zy = (fzij*yij);
      virialNbond.zz = (fzij*zij);
#endif
    }

    // Store slow virial
    if (doVirial && doSlow && doElect) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }

  }
}

//
// Calculates exclusions. Here doSlow = true
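// (full exclusions): applies the electrostatic correction for fully
// excluded pairs to the slow force, using coefficients tabulated in
// exclusionTable.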
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void exclusionForce(
  const int index,
  const CudaExclusion* __restrict__ exclusionList,
  const float r2_delta, const int r2_delta_expc,
#if __CUDA_ARCH__ >= 350
  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
#else
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,
#endif
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  const float cutoff2,
  T* __restrict__ forceSlow, double &energySlow,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
#endif
  ) {

  CudaExclusion bl = exclusionList[index];

  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;

  float4 xyzqi = xyzq[bl.i];
  float4 xyzqj = xyzq[bl.j];

  float xij = xyzqi.x + shx - xyzqj.x;
  float yij = xyzqi.y + shy - xyzqj.y;
  float zij = xyzqi.z + shz - xyzqj.z;

  float r2 = xij*xij + yij*yij + zij*zij;
  if (r2 < cutoff2) {
    r2 += r2_delta;

    union { float f; int i; } r2i;
    r2i.f = r2;
    int table_i = (r2i.i >> 17) + r2_delta_expc;  // table_i >= 0
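    // Reinterpreting r2 as its IEEE bit pattern and shifting right by 17
    // keeps the exponent plus the top mantissa bits, giving an index into
    // a table that is roughly logarithmic in r2; r2_delta_expc offsets
    // the range so table_i starts at zero.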

#if __CUDA_ARCH__ >= 350
    float r2_table_val = __ldg(&r2_table[table_i]);
#else
    float r2_table_val = tex1D<float>(r2_table_tex, table_i);
#endif
    float diffa = r2 - r2_table_val;
    float qq = xyzqi.w * xyzqj.w;

#if __CUDA_ARCH__ >= 350
    float4 slow = __ldg(&exclusionTable[table_i]);
#else
    float4 slow = tex1D<float4>(exclusionTableTex, table_i);
#endif

    if (doEnergy) {
      energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
    }

    float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);

    T T_fxij, T_fyij, T_fzij;
    calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
    storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow);
    storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow);

    // Store virial
    if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
      float fxij = fSlow*xij;
      float fyij = fSlow*yij;
      float fzij = fSlow*zij;
      virialSlow.xx = (fxij*xij);
      virialSlow.xy = (fxij*yij);
      virialSlow.xz = (fxij*zij);
      virialSlow.yx = (fyij*xij);
      virialSlow.yy = (fyij*yij);
      virialSlow.yz = (fyij*zij);
      virialSlow.zx = (fzij*xij);
      virialSlow.zy = (fzij*yij);
      virialSlow.zz = (fzij*zij);
#endif
    }
  }
}
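
//
// Calculates angles (with an optional Urey-Bradley 1-3 term)
//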

template <typename T, bool doEnergy, bool doVirial>
__device__ void angleForce(const int index,
  const CudaAngle* __restrict__ angleList,
  const CudaAngleValue* __restrict__ angleValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
#endif
  ) {

  CudaAngle al = angleList[index];

  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;

  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;

  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;

  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;

  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);

  float xijr = xij*rij_inv;
  float yijr = yij*rij_inv;
  float zijr = zij*rij_inv;
  float xkjr = xkj*rkj_inv;
  float ykjr = ykj*rkj_inv;
  float zkjr = zkj*rkj_inv;
  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;

  CudaAngleValue angleValue = angleValues[al.itype];
  angleValue.k *= al.scale;

  float diff;
  if (angleValue.normal == 1) {
    // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
    cos_theta = min(0.999f, max(-0.999f, cos_theta));
    float theta = acosf(cos_theta);
    diff = theta - angleValue.theta0;
  } else {
    diff = cos_theta - angleValue.theta0;
  }

  if (doEnergy) {
    energy += (double)(angleValue.k * diff * diff);
  }
  if (angleValue.normal == 1) {
    float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
    if (inv_sin_theta > 1.0e6) {
      diff = (diff < 0.0f) ? 1.0f : -1.0f;
    } else {
      diff *= -inv_sin_theta;
    }
  }
  diff *= -2.0f*angleValue.k;

  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);

  T T_dtxi, T_dtyi, T_dtzi;
  T T_dtxj, T_dtyj, T_dtzj;
  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
  T T_dtxk = -T_dtxi - T_dtxj;
  T T_dtyk = -T_dtyi - T_dtyj;
  T T_dtzk = -T_dtzi - T_dtzj;
  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force);

  if (angleValue.k_ub) {
    float xik = xij - xkj;
    float yik = yij - ykj;
    float zik = zij - zkj;
    float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
    float rik = 1.0f/rik_inv;
    float diff_ub = rik - angleValue.r_ub;
    if (doEnergy) {
      energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
    }
    diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
    T T_dxik, T_dyik, T_dzik;
    calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
    T_dtxi += T_dxik;
    T_dtyi += T_dyik;
    T_dtzi += T_dzik;
    T_dtxj -= T_dxik;
    T_dtyj -= T_dyik;
    T_dtzj -= T_dzik;
  }

  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force);
  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fxi = ((float)T_dtxi)*force_to_float;
    float fyi = ((float)T_dtyi)*force_to_float;
    float fzi = ((float)T_dtzi)*force_to_float;
    float fxk = ((float)T_dtxj)*force_to_float;
    float fyk = ((float)T_dtyj)*force_to_float;
    float fzk = ((float)T_dtzj)*force_to_float;
    virial.xx = (fxi*xij) + (fxk*xkj);
    virial.xy = (fxi*yij) + (fxk*ykj);
    virial.xz = (fxi*zij) + (fxk*zkj);
    virial.yx = (fyi*xij) + (fyk*xkj);
    virial.yy = (fyi*yij) + (fyk*ykj);
    virial.yz = (fyi*zij) + (fyk*zkj);
    virial.zx = (fzi*xij) + (fzk*xkj);
    virial.zy = (fzi*yij) + (fzk*ykj);
    virial.zz = (fzi*zij) + (fzk*zkj);
#endif
  }
}

//
// Dihedral computation
//
// Out: df, e
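//
// A dihedral entry can expand to several terms: the low bit of n flags
// that another term follows (n is stored shifted left by one). A nonzero
// multiplicity n gives the periodic form k*(1 + cos(n*phi - delta)); a
// zero multiplicity gives the harmonic form k*(phi - delta)^2 with the
// difference wrapped into [-pi, pi].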
//
template <bool doEnergy>
__forceinline__ __device__
void diheComp(const CudaDihedralValue* dihedralValues, int ic,
  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {

  df = 0.0f;
  if (doEnergy) e = 0.0;

  float phi = atan2f(sin_phi, cos_phi);

  bool lrep = true;
  while (lrep) {
    CudaDihedralValue dihedralValue = dihedralValues[ic];
    dihedralValue.k *= scale;

    // Last dihedral has n low bit set to 0
    lrep = (dihedralValue.n & 1);
    dihedralValue.n >>= 1;
    if (dihedralValue.n) {
      float nf = dihedralValue.n;
      float x = nf*phi - dihedralValue.delta;
      if (doEnergy) {
        float sin_x, cos_x;
        sincosf(x, &sin_x, &cos_x);
        e += (double)(dihedralValue.k*(1.0f + cos_x));
        df += (double)(nf*dihedralValue.k*sin_x);
      } else {
        float sin_x = sinf(x);
        df += (double)(nf*dihedralValue.k*sin_x);
      }
    } else {
      float diff = phi - dihedralValue.delta;
      if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
      if (diff >  (float)(M_PI)) diff -= (float)(2.0*M_PI);
      if (doEnergy) {
        e += (double)(dihedralValue.k*diff*diff);
      }
      df -= 2.0f*dihedralValue.k*diff;
    }
    ic++;
  }

}

template <typename T, bool doEnergy, bool doVirial>
__device__ void diheForce(const int index,
  const CudaDihedral* __restrict__ diheList,
  const CudaDihedralValue* __restrict__ dihedralValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
#endif
  ) {

  CudaDihedral dl = diheList[index];

  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;

  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;

  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;

  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;

  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;

  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;

  // A=F^G, B=H^G.
  float ax = yij*zjk - zij*yjk;
  float ay = zij*xjk - xij*zjk;
  float az = xij*yjk - yij*xjk;
  float bx = ylk*zjk - zlk*yjk;
  float by = zlk*xjk - xlk*zjk;
  float bz = xlk*yjk - ylk*xjk;

  float ra2 = ax*ax + ay*ay + az*az;
  float rb2 = bx*bx + by*by + bz*bz;
  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);

  //    if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
  //          nlinear = nlinear + 1
  //       endif

  float rgr = 1.0f / rg;
  float ra2r = 1.0f / ra2;
  float rb2r = 1.0f / rb2;
  float rabr = sqrtf(ra2r*rb2r);

  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
  //
  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
  //
  //     Energy and derivative contributions.

  float df;
  double e;
  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);

  if (doEnergy) energy += e;

  //
  //     Compute derivatives wrt Cartesian coordinates.
  //
  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
  //  FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)

  float fg = xij*xjk + yij*yjk + zij*zjk;
  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
  ra2r *= df;
  rb2r *= df;
  float fga = fg*ra2r*rgr;
  float hgb = hg*rb2r*rgr;
  float gaa = ra2r*rg;
  float gbb = rb2r*rg;
  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.

  // Store forces
  T T_fix, T_fiy, T_fiz;
  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force);

  T dgx, dgy, dgz;
  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
  T T_fjx = dgx - T_fix;
  T T_fjy = dgy - T_fiy;
  T T_fjz = dgz - T_fiz;
  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force);

  T dhx, dhy, dhz;
  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
  T T_fkx = -dhx - dgx;
  T T_fky = -dhy - dgy;
  T T_fkz = -dhz - dgz;
  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force);
  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float fix = ((float)T_fix)*force_to_float;
    float fiy = ((float)T_fiy)*force_to_float;
    float fiz = ((float)T_fiz)*force_to_float;
    float fjx = ((float)dgx)*force_to_float;
    float fjy = ((float)dgy)*force_to_float;
    float fjz = ((float)dgz)*force_to_float;
    float flx = ((float)dhx)*force_to_float;
    float fly = ((float)dhy)*force_to_float;
    float flz = ((float)dhz)*force_to_float;
    virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
    virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
    virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
    virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
    virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
    virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
    virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
    virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
    virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
#endif
  }

}

__device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
  return make_float3(
    v1.y*v2.z-v2.y*v1.z,
    v2.x*v1.z-v1.x*v2.z,
    v1.x*v2.y-v2.x*v1.y
    );
}

__device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
}

//
// Calculates crossterms
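// (CMAP correction terms): the energy is a bicubic spline in the two
// dihedral angles phi (atoms 1-2-3-4) and psi (atoms 5-6-7-8), using a
// dim x dim grid with 16 spline coefficients per grid cell.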
//
template <typename T, bool doEnergy, bool doVirial>
__device__ void crosstermForce(
  const int index,
  const CudaCrossterm* __restrict__ crosstermList,
  const CudaCrosstermValue* __restrict__ crosstermValues,
  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force, double &energy,
#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double>& virial
#else
  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
#endif
  ) {

  CudaCrossterm cl = crosstermList[index];

  // ----------------------------------------------------------------------------
  // Angle between 1 - 2 - 3 - 4
  //
  // 1 - 2
  float3 sh12 = make_float3(
    cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
    cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
    cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);

  float4 xyzq1 = xyzq[cl.i1];
  float4 xyzq2 = xyzq[cl.i2];

  float3 r12 = make_float3(
    xyzq1.x + sh12.x - xyzq2.x,
    xyzq1.y + sh12.y - xyzq2.y,
    xyzq1.z + sh12.z - xyzq2.z);

  // 2 - 3
  float3 sh23 = make_float3(
    cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
    cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
    cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);

  float4 xyzq3 = xyzq[cl.i3];

  float3 r23 = make_float3(
    xyzq2.x + sh23.x - xyzq3.x,
    xyzq2.y + sh23.y - xyzq3.y,
    xyzq2.z + sh23.z - xyzq3.z);

  // 3 - 4
  float3 sh34 = make_float3(
    cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
    cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
    cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);

  float4 xyzq4 = xyzq[cl.i4];

  float3 r34 = make_float3(
    xyzq3.x + sh34.x - xyzq4.x,
    xyzq3.y + sh34.y - xyzq4.y,
    xyzq3.z + sh34.z - xyzq4.z);

  // Calculate the cross products
  float3 A = cross(r12, r23);
  float3 B = cross(r23, r34);
  float3 C = cross(r23, A);

  // Calculate the inverse distances
  float inv_rA = rsqrtf(dot(A, A));
  float inv_rB = rsqrtf(dot(B, B));
  float inv_rC = rsqrtf(dot(C, C));

  //  Calculate the sin and cos
  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
  float sin_phi = dot(C, B)*(inv_rC*inv_rB);

  float phi = -atan2f(sin_phi,cos_phi);

  // ----------------------------------------------------------------------------
  // Angle between 5 - 6 - 7 - 8
  //

  // 5 - 6
  float3 sh56 = make_float3(
    cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
    cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
    cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);

  float4 xyzq5 = xyzq[cl.i5];
  float4 xyzq6 = xyzq[cl.i6];

  float3 r56 = make_float3(
    xyzq5.x + sh56.x - xyzq6.x,
    xyzq5.y + sh56.y - xyzq6.y,
    xyzq5.z + sh56.z - xyzq6.z);

  // 6 - 7
  float3 sh67 = make_float3(
    cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
    cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
    cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);

  float4 xyzq7 = xyzq[cl.i7];

  float3 r67 = make_float3(
    xyzq6.x + sh67.x - xyzq7.x,
    xyzq6.y + sh67.y - xyzq7.y,
    xyzq6.z + sh67.z - xyzq7.z);

  // 7 - 8
  float3 sh78 = make_float3(
    cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
    cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
    cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);

  float4 xyzq8 = xyzq[cl.i8];

  float3 r78 = make_float3(
    xyzq7.x + sh78.x - xyzq8.x,
    xyzq7.y + sh78.y - xyzq8.y,
    xyzq7.z + sh78.z - xyzq8.z);

  // Calculate the cross products
  float3 D = cross(r56, r67);
  float3 E = cross(r67, r78);
  float3 F = cross(r67, D);

  // Calculate the inverse distances
  float inv_rD = rsqrtf(dot(D, D));
  float inv_rE = rsqrtf(dot(E, E));
  float inv_rF = rsqrtf(dot(F, F));

  //  Calculate the sin and cos
  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
  float sin_psi = dot(F, E)*(inv_rF*inv_rE);

  float psi = -atan2f(sin_psi,cos_psi);

  // ----------------------------------------------------------------------------
  // Calculate the energy

  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );

  // Shift angles
  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);

  // distance measured in grid points between angle and smallest value
  float phi_h = phi * inv_h;
  float psi_h = psi * inv_h;

  // find smallest numbered grid point in stencil
  int iphi = (int)floor(phi_h);
  int ipsi = (int)floor(psi_h);

  const CudaCrosstermValue& cp = crosstermValues[cl.itype];

  // Load coefficients
  float4 c[4];
  c[0] = cp.c[iphi][ipsi][0];
  c[1] = cp.c[iphi][ipsi][1];
  c[2] = cp.c[iphi][ipsi][2];
  c[3] = cp.c[iphi][ipsi][3];

  float dphi = phi_h - (float)iphi;
  float dpsi = psi_h - (float)ipsi;

  if (doEnergy) {
    float energyf =          c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
    energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
    energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
    energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
    energy += energyf*cl.scale;
  }

  float dEdphi =         3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
  dEdphi = dEdphi*dphi +      (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) ));  // 13 muls
  dEdphi *= cl.scale*inv_h;

  float dEdpsi =         3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
  dEdpsi = dEdpsi*dpsi +      (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) ));  // 13 muls
  dEdpsi *= cl.scale*inv_h;

  // float normCross1 = dot(A, A);
  float square_r23 = dot(r23, r23);
  float norm_r23 = sqrtf(square_r23);
  float inv_square_r23 = 1.0f/square_r23;
  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
  float ff2 = -dot(r12, r23)*inv_square_r23;
  float ff3 = -dot(r34, r23)*inv_square_r23;
  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;

  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
  float3 f2 = make_float3(  t1.x - f1.x,  t1.y - f1.y,  t1.z - f1.z);
  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);

  T T_f1x, T_f1y, T_f1z;
  T T_f2x, T_f2y, T_f2z;
  T T_f3x, T_f3y, T_f3z;
  T T_f4x, T_f4y, T_f4z;
  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force);
  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force);
  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force);
  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force);

  float square_r67 = dot(r67, r67);
  float norm_r67 = sqrtf(square_r67);
  float inv_square_r67 = 1.0f/(square_r67);
  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
  ff2 = -dot(r56, r67)*inv_square_r67;
  ff3 = -dot(r78, r67)*inv_square_r67;
  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;

  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
  float3 f6 = make_float3( t2.x - f5.x,  t2.y - f5.y,  t2.z - f5.z );
  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );

  T T_f5x, T_f5y, T_f5z;
  T T_f6x, T_f6y, T_f6z;
  T T_f7x, T_f7y, T_f7z;
  T T_f8x, T_f8y, T_f8z;
  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force);
  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force);
  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force);
  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force);

  // Store virial
  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
    float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
    float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
    virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
    virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
    virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
    virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
    virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
    virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
    virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
    virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
    virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
#endif
  }

}

#define BONDEDFORCESKERNEL_NUM_WARP 4
//
// Calculates all forces in a single kernel call
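// Thread blocks are laid out over the concatenated ranges of bonds,
// angles, dihedrals, impropers, exclusions, and crossterms, in that
// order; each block handles tuples of a single type (one per thread)
// and then jumps to the shared energy/virial reduction at the end.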
//
template <typename T, bool doEnergy, bool doVirial>
__global__ void bondedForcesKernel(
  const int start,

  const int numBonds,
  const CudaBond* __restrict__ bonds,
  const CudaBondValue* __restrict__ bondValues,

  const int numAngles,
  const CudaAngle* __restrict__ angles,
  const CudaAngleValue* __restrict__ angleValues,

  const int numDihedrals,
  const CudaDihedral* __restrict__ dihedrals,
  const CudaDihedralValue* __restrict__ dihedralValues,

  const int numImpropers,
  const CudaDihedral* __restrict__ impropers,
  const CudaDihedralValue* __restrict__ improperValues,

  const int numExclusions,
  const CudaExclusion* __restrict__ exclusions,

  const int numCrossterms,
  const CudaCrossterm* __restrict__ crossterms,
  const CudaCrosstermValue* __restrict__ crosstermValues,

  const float cutoff2,
  const float r2_delta, const int r2_delta_expc,

  const float* __restrict__ r2_table,
  const float4* __restrict__ exclusionTable,
  cudaTextureObject_t r2_table_tex,
  cudaTextureObject_t exclusionTableTex,

  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ force,
  T* __restrict__ forceSlow,
  double* __restrict__ energies_virials) {

  // Thread-block index
  int indexTB = start + blockIdx.x;

  const int numBondsTB     = (numBonds + blockDim.x - 1)/blockDim.x;
  const int numAnglesTB    = (numAngles + blockDim.x - 1)/blockDim.x;
  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;

  // Each thread computes a single bonded interaction.
  // Each thread block computes a single bonded type.
  double energy;
  int energyIndex;

  if (doEnergy) {
    energy = 0.0;
    energyIndex = -1;
  }

#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double> virial;
  int virialIndex;
  if (doVirial) {
    virial.xx = 0.0;
    virial.xy = 0.0;
    virial.xz = 0.0;
    virial.yx = 0.0;
    virial.yy = 0.0;
    virial.yz = 0.0;
    virial.zx = 0.0;
    virial.zy = 0.0;
    virial.zz = 0.0;
    virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
  }
#endif

  if (indexTB < numBondsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) {
      energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
    }
    if (i < numBonds) {
      bondForce<T, doEnergy, doVirial>
      (i, bonds, bondValues, xyzq,
        stride, lata, latb, latc,
        force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numBondsTB;

  if (indexTB < numAnglesTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) {
      energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
    }
    if (i < numAngles) {
      angleForce<T, doEnergy, doVirial>
      (i, angles, angleValues, xyzq, stride,
        lata, latb, latc,
        force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numAnglesTB;

  if (indexTB < numDihedralsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) {
      energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
    }
    if (doVirial) {
      virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
    }
    if (i < numDihedrals) {
      diheForce<T, doEnergy, doVirial>
      (i, dihedrals, dihedralValues, xyzq, stride,
        lata, latb, latc,
        force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numDihedralsTB;

  if (indexTB < numImpropersTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) {
      energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
    }
    if (i < numImpropers) {
      diheForce<T, doEnergy, doVirial>
      (i, impropers, improperValues, xyzq, stride,
        lata, latb, latc,
        force, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numImpropersTB;

  if (indexTB < numExclusionsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) {
      energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
    }
    if (doVirial) {
      virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
    }
    if (i < numExclusions) {
      exclusionForce<T, doEnergy, doVirial>
      (i, exclusions, r2_delta, r2_delta_expc,
#if __CUDA_ARCH__ >= 350
        r2_table, exclusionTable,
#else
        r2_table_tex, exclusionTableTex,
#endif
        xyzq, stride, lata, latb, latc, cutoff2,
        forceSlow, energy, virial);
    }
    goto reduce;
  }
  indexTB -= numExclusionsTB;

  if (indexTB < numCrosstermsTB) {
    int i = threadIdx.x + indexTB*blockDim.x;
    if (doEnergy) {
      energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
    }
    if (doVirial) {
      virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
    }
    if (i < numCrossterms) {
      crosstermForce<T, doEnergy, doVirial>
      (i, crossterms, crosstermValues,
        xyzq, stride, lata, latb, latc,
        force, energy, virial);
    }
    goto reduce;
  }
  // indexTB -= numCrosstermsTB;

  reduce:
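
  // Two-stage reduction: a butterfly WARP_SHUFFLE_XOR sums within each
  // warp, lane 0 of each warp parks its partial result in shared memory,
  // and warp 0 combines the per-warp partials before a single atomicAdd
  // per block into the global energies_virials array.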

  // Write energies to global memory
  if (doEnergy) {
    // energyIndex is constant within thread-block
    __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shEnergy[warpID] = energy;
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, 32);
      }
      if (laneID == 0) {
        atomicAdd(&energies_virials[energyIndex], energy);
      }
    }
  }

  // Write virials to global memory
#ifdef WRITE_FULL_VIRIALS
  if (doVirial) {
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
      virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
      virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
      virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
      virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
      virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
      virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
      virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
      virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
    }
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shVirial[warpID] = virial;
    }
    BLOCK_SYNC;

    if (warpID == 0) {
      virial.xx = 0.0;
      virial.xy = 0.0;
      virial.xz = 0.0;
      virial.yx = 0.0;
      virial.yy = 0.0;
      virial.yz = 0.0;
      virial.zx = 0.0;
      virial.zy = 0.0;
      virial.zz = 0.0;
      if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, 32);
        virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, 32);
        virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, 32);
        virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, 32);
        virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, 32);
        virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, 32);
        virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, 32);
        virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, 32);
        virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, 32);
      }

      if (laneID == 0) {
#ifdef USE_FP_VIRIAL
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
        atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
#else
        atomicAdd(&energies_virials[virialIndex + 0], virial.xx);
        atomicAdd(&energies_virials[virialIndex + 1], virial.xy);
        atomicAdd(&energies_virials[virialIndex + 2], virial.xz);
        atomicAdd(&energies_virials[virialIndex + 3], virial.yx);
        atomicAdd(&energies_virials[virialIndex + 4], virial.yy);
        atomicAdd(&energies_virials[virialIndex + 5], virial.yz);
        atomicAdd(&energies_virials[virialIndex + 6], virial.zx);
        atomicAdd(&energies_virials[virialIndex + 7], virial.zy);
        atomicAdd(&energies_virials[virialIndex + 8], virial.zz);
#endif
      }
    }
  }
#endif

}
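
//
// Calculates modified (1-4) exclusions; one thread per exclusion
//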

template <typename T, bool doEnergy, bool doVirial, bool doElect>
__global__ void modifiedExclusionForcesKernel(
  const int start,

  const int numModifiedExclusions,
  const CudaExclusion* __restrict__ modifiedExclusions,

  const bool doSlow,
  const float one_scale14,                // 1 - scale14
  const float cutoff2,
  const int vdwCoefTableWidth,
  const float2* __restrict__ vdwCoefTable,
  cudaTextureObject_t vdwCoefTableTex,
  cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex,

  const float4* __restrict__ xyzq,
  const int stride,
  const float3 lata, const float3 latb, const float3 latc,
  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
  double* __restrict__ energies_virials
  ) {

  // Global tuple index for this thread
  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;

  double energyVdw, energyNbond, energySlow;
  if (doEnergy) {
    energyVdw = 0.0;
    if (doElect) {
      energyNbond = 0.0;
      energySlow = 0.0;
    }
  }

#ifdef WRITE_FULL_VIRIALS
  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
  if (doVirial) {
    virialNbond.xx = 0.0;
    virialNbond.xy = 0.0;
    virialNbond.xz = 0.0;
    virialNbond.yx = 0.0;
    virialNbond.yy = 0.0;
    virialNbond.yz = 0.0;
    virialNbond.zx = 0.0;
    virialNbond.zy = 0.0;
    virialNbond.zz = 0.0;
    if (doElect) {
      virialSlow.xx = 0.0;
      virialSlow.xy = 0.0;
      virialSlow.xz = 0.0;
      virialSlow.yx = 0.0;
      virialSlow.yy = 0.0;
      virialSlow.yz = 0.0;
      virialSlow.zx = 0.0;
      virialSlow.zy = 0.0;
      virialSlow.zz = 0.0;
    }
  }
#endif

  if (i < numModifiedExclusions)
  {
    modifiedExclusionForce<T, doEnergy, doVirial, doElect>
    (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
#if __CUDA_ARCH__ >= 350
      vdwCoefTable,
#else
      vdwCoefTableTex,
#endif
      modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
      xyzq, stride, lata, latb, latc, cutoff2,
      energyVdw, forceNbond, energyNbond,
      forceSlow, energySlow, virialNbond, virialSlow);
  }

  // Write energies to global memory
  if (doEnergy) {
    __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
    __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      energyVdw   += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
      if (doElect) {
        energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
        energySlow  += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
      }
    }
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
    if (laneID == 0) {
      shEnergyVdw[warpID]   = energyVdw;
      if (doElect) {
        shEnergyNbond[warpID] = energyNbond;
        shEnergySlow[warpID]  = energySlow;
      }
    }
    BLOCK_SYNC;
    if (warpID == 0) {
      energyVdw   = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
      if (doElect) {
        energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
        energySlow  = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
      }
#pragma unroll
      for (int i=16;i >= 1;i/=2) {
        energyVdw   += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
        if (doElect) {
          energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, 32);
          energySlow  += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
        }
      }
      if (laneID == 0) {
        atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ],         energyVdw);
        if (doElect) {
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT],      energyNbond);
          atomicAdd(&energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
        }
      }
    }
  }

  // Write virials to global memory
#ifdef WRITE_FULL_VIRIALS
  if (doVirial) {
#pragma unroll
    for (int i=16;i >= 1;i/=2) {
      virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
      virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
      virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
      virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
      virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
      virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
      virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
      virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
      virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
      if (doElect && doSlow) {
        virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
        virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
        virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
        virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
        virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
        virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
        virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
        virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
        virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
      }
    }
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
    __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
    int laneID = (threadIdx.x & (WARPSIZE - 1));
    int warpID = threadIdx.x / WARPSIZE;
01467     if (laneID == 0) {
01468       shVirialNbond[warpID] = virialNbond;
01469       if (doElect) shVirialSlow[warpID] = virialSlow;  // guard: shVirialSlow has only one element when doElect is false
01470     }
01471     BLOCK_SYNC;
01472 
01473     virialNbond.xx = 0.0;
01474     virialNbond.xy = 0.0;
01475     virialNbond.xz = 0.0;
01476     virialNbond.yx = 0.0;
01477     virialNbond.yy = 0.0;
01478     virialNbond.yz = 0.0;
01479     virialNbond.zx = 0.0;
01480     virialNbond.zy = 0.0;
01481     virialNbond.zz = 0.0;
01482     if (doElect && doSlow) {
01483       virialSlow.xx = 0.0;
01484       virialSlow.xy = 0.0;
01485       virialSlow.xz = 0.0;
01486       virialSlow.yx = 0.0;
01487       virialSlow.yy = 0.0;
01488       virialSlow.yz = 0.0;
01489       virialSlow.zx = 0.0;
01490       virialSlow.zy = 0.0;
01491       virialSlow.zz = 0.0;
01492     }
01493 
01494     if (warpID == 0) {
01495       if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
01496         virialNbond = shVirialNbond[laneID];
01497         if (doElect) virialSlow = shVirialSlow[laneID];  // matching guard on the load
01498       }
01499 #pragma unroll
01500       for (int i=16;i >= 1;i/=2) {
01501         virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, 32);
01502         virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, 32);
01503         virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, 32);
01504         virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, 32);
01505         virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, 32);
01506         virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, 32);
01507         virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, 32);
01508         virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, 32);
01509         virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, 32);
01510         if (doElect && doSlow) {
01511           virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, 32);
01512           virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, 32);
01513           virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, 32);
01514           virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, 32);
01515           virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, 32);
01516           virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, 32);
01517           virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, 32);
01518           virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, 32);
01519           virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, 32);
01520         }
01521       }
01522 
01523       if (laneID == 0)
01524       {
01525 #ifdef USE_FP_VIRIAL
01526         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], llitoulli(virialNbond.xx*double_to_virial));
01527         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], llitoulli(virialNbond.xy*double_to_virial));
01528         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], llitoulli(virialNbond.xz*double_to_virial));
01529         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], llitoulli(virialNbond.yx*double_to_virial));
01530         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], llitoulli(virialNbond.yy*double_to_virial));
01531         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], llitoulli(virialNbond.yz*double_to_virial));
01532         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], llitoulli(virialNbond.zx*double_to_virial));
01533         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], llitoulli(virialNbond.zy*double_to_virial));
01534         atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], llitoulli(virialNbond.zz*double_to_virial));
01535         if (doElect && doSlow) {
01536           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], llitoulli(virialSlow.xx*double_to_virial));
01537           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], llitoulli(virialSlow.xy*double_to_virial));
01538           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], llitoulli(virialSlow.xz*double_to_virial));
01539           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], llitoulli(virialSlow.yx*double_to_virial));
01540           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], llitoulli(virialSlow.yy*double_to_virial));
01541           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], llitoulli(virialSlow.yz*double_to_virial));
01542           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], llitoulli(virialSlow.zx*double_to_virial));
01543           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], llitoulli(virialSlow.zy*double_to_virial));
01544           atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], llitoulli(virialSlow.zz*double_to_virial));
01545         }
01546 #else
01547         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
01548         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
01549         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
01550         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
01551         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
01552         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
01553         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
01554         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
01555         atomicAdd(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
01556         if (doElect && doSlow) {
01557           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
01558           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
01559           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
01560           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
01561           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
01562           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
01563           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
01564           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
01565           atomicAdd(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
01566         }
01567 #endif
01568       }
01569     }
01570   }
01571 #endif
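// Note on the two accumulation paths above: with USE_FP_VIRIAL each virial
// component is scaled by double_to_virial, converted to a 64-bit integer and
// bit-cast to unsigned for an integer atomicAdd. Integer addition is
// associative, so the accumulated virial is bit-reproducible regardless of
// block scheduling; the plain double atomicAdd path avoids the conversion
// but its rounding depends on accumulation order.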
01572 
01573 }
01574 
01575 // ##############################################################################################
01576 // ##############################################################################################
01577 // ##############################################################################################
01578 
01579 //
01580 // Class constructor
01581 //
01582 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
01583 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
01584 
01585   cudaCheck(cudaSetDevice(deviceID));
01586 
01587   tupleData = NULL;
01588   tupleDataSize = 0;
01589 
01590   numBonds = 0;
01591   numAngles = 0;
01592   numDihedrals = 0;
01593   numImpropers = 0;
01594   numModifiedExclusions = 0;
01595   numExclusions = 0;
01596   numCrossterms = 0;
01597 
01598   bondValues = NULL;
01599   angleValues = NULL;
01600   dihedralValues = NULL;
01601   improperValues = NULL;
01602   crosstermValues = NULL;
01603 
01604   xyzq = NULL;
01605   xyzqSize = 0;
01606 
01607   forces = NULL;
01608   forcesSize = 0;
01609 
01610   allocate_device<double>(&energies_virials, energies_virials_SIZE);
01611 }
01612 
01613 //
01614 // Class destructor
01615 //
01616 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
01617   cudaCheck(cudaSetDevice(deviceID));
01618 
01619   deallocate_device<double>(&energies_virials);
01620   // deallocate_device<BondedVirial>(&virial);
01621 
01622   if (tupleData != NULL) deallocate_device<char>(&tupleData);
01623   if (xyzq != NULL) deallocate_device<float4>(&xyzq);
01624   if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
01625 
01626   if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
01627   if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
01628   if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
01629   if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
01630   if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
01631 }
01632 
01633 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
01634   allocate_device<CudaBondValue>(&bondValues, numBondValues);
01635   copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
01636 }
01637 
01638 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
01639   allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
01640   copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
01641 }
01642 
01643 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
01644   allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
01645   copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
01646 }
01647 
01648 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
01649   allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
01650   copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
01651 }
01652 
01653 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
01654   allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
01655   copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
01656 }
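// Host-side setup sketch (illustrative only; h_bondValues and friends are
// hypothetical host arrays filled from the force-field parameter tables):
//
//   ComputeBondedCUDAKernel bondedKernel(deviceID, cudaNonbondedTables);
//   bondedKernel.setupBondValues(numBondValues, h_bondValues);
//   bondedKernel.setupAngleValues(numAngleValues, h_angleValues);
//   bondedKernel.setupDihedralValues(numDihedralValues, h_dihedralValues);
//
// Each setup*Values() call allocates the device array and copies the
// parameters synchronously; they are typically invoked once per simulation.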
01657 
01658 //
01659 // Update bonded lists
01660 //
01661 void ComputeBondedCUDAKernel::update(
01662   const int numBondsIn,
01663   const int numAnglesIn,
01664   const int numDihedralsIn,
01665   const int numImpropersIn,
01666   const int numModifiedExclusionsIn,
01667   const int numExclusionsIn,
01668   const int numCrosstermsIn,
01669   const char* h_tupleData,
01670   cudaStream_t stream) {
01671 
01672   numBonds              = numBondsIn;
01673   numAngles             = numAnglesIn;
01674   numDihedrals          = numDihedralsIn;
01675   numImpropers          = numImpropersIn;
01676   numModifiedExclusions = numModifiedExclusionsIn;
01677   numExclusions         = numExclusionsIn;
01678   numCrossterms         = numCrosstermsIn;
01679 
01680   int numBondsWA     = warpAlign(numBonds);
01681   int numAnglesWA    = warpAlign(numAngles);
01682   int numDihedralsWA = warpAlign(numDihedrals);
01683   int numImpropersWA = warpAlign(numImpropers);
01684   int numModifiedExclusionsWA = warpAlign(numModifiedExclusions);
01685   int numExclusionsWA         = warpAlign(numExclusions);
01686   int numCrosstermsWA         = warpAlign(numCrossterms);
01687 
01688   int sizeTot = numBondsWA*sizeof(CudaBond) + numAnglesWA*sizeof(CudaAngle) + 
01689   numDihedralsWA*sizeof(CudaDihedral) + numImpropersWA*sizeof(CudaDihedral) +
01690   numModifiedExclusionsWA*sizeof(CudaExclusion) + numExclusionsWA*sizeof(CudaExclusion) + 
01691   numCrosstermsWA*sizeof(CudaCrossterm);
01692 
01693   reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, 1.4f);
01694   copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
01695 
01696   // Setup pointers
01697   int pos = 0;
01698   bonds = (CudaBond *)&tupleData[pos];
01699   pos += numBondsWA*sizeof(CudaBond);
01700 
01701   angles = (CudaAngle* )&tupleData[pos];
01702   pos += numAnglesWA*sizeof(CudaAngle);
01703 
01704   dihedrals = (CudaDihedral* )&tupleData[pos];
01705   pos += numDihedralsWA*sizeof(CudaDihedral);
01706 
01707   impropers = (CudaDihedral* )&tupleData[pos];
01708   pos += numImpropersWA*sizeof(CudaDihedral);
01709 
01710   modifiedExclusions = (CudaExclusion* )&tupleData[pos];
01711   pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
01712 
01713   exclusions = (CudaExclusion* )&tupleData[pos];
01714   pos += numExclusionsWA*sizeof(CudaExclusion);
01715 
01716   crossterms = (CudaCrossterm* )&tupleData[pos];
01717   pos += numCrosstermsWA*sizeof(CudaCrossterm);
01718 }
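// Resulting layout of the packed tupleData buffer (each segment padded to a
// warp-aligned tuple count so every list starts on a warp boundary):
//
//   [bonds][angles][dihedrals][impropers][modifiedExclusions][exclusions][crossterms]
//
// Packing all tuple types into one buffer costs a single reallocation check
// and a single host-to-device copy per tuple update.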
01719 
01720 //
01721 // Return stride for force array
01722 //
01723 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
01724 #ifdef USE_STRIDED_FORCE
01725   // Align stride to 256 bytes
01726   return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
01727 #else
01728   return 1;
01729 #endif
01730 }
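// Worked example of the 256-byte alignment (assuming FORCE_TYPE is
// long long int, i.e. sizeof(FORCE_TYPE) == 8):
//   atomStorageSize = 1000 -> 1000*8 = 8000 bytes
//   -> ((8000-1)/256 + 1)*256 = 8192 bytes -> stride = 8192/8 = 1024,
// so the x, y and z force components each start on a 256-byte boundary.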
01731 
01732 //
01733 // Return size of single force array
01734 //
01735 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
01736 #ifdef USE_STRIDED_FORCE
01737   return (3*getForceStride(atomStorageSize));
01738 #else
01739   return (3*atomStorageSize);
01740 #endif
01741 }
01742 
01743 //
01744 // Return total size of all force arrays
01745 //
01746 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
01747 
01748   int forceSize = getForceSize(atomStorageSize);
01749 
01750   if (numModifiedExclusions > 0 || numExclusions > 0) {
01751     if (doSlow) {
01752       // All three force arrays [normal, nbond, slow]
01753       forceSize *= 3;
01754     } else {
01755       // Two force arrays [normal, nbond]
01756       forceSize *= 2;
01757     }
01758   }
01759 
01760   return forceSize;
01761 }
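// Example: with exclusions present and doSlow true, the force buffer holds
// three consecutive arrays of getForceSize() elements; bondedForce() below
// indexes them at 0, startNbond = forceSize and startSlow = 2*forceSize.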
01762 
01763 //
01764 // Compute bonded forces
01765 //
01766 void ComputeBondedCUDAKernel::bondedForce(
01767   const double scale14, const int atomStorageSize,
01768   const bool doEnergy, const bool doVirial, const bool doSlow,
01769   const float3 lata, const float3 latb, const float3 latc,
01770   const float cutoff2, const float r2_delta, const int r2_delta_expc,
01771   const float4* h_xyzq, FORCE_TYPE* h_forces,
01772   double *h_energies_virials,
01773   cudaStream_t stream) {
01774 
01775   int forceStorageSize = getAllForceSize(atomStorageSize, true);
01776   int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
01777   int forceStride = getForceStride(atomStorageSize);
01778 
01779   int forceSize = getForceSize(atomStorageSize);
01780   int startNbond = forceSize;
01781   int startSlow = 2*forceSize;
01782 
01783   // Re-allocate coordinate and force arrays if necessary
01784   reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
01785   reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
01786 
01787   // Copy coordinates to device
01788   copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
01789 
01790   // Clear force array
01791   clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
01792   if (doEnergy || doVirial) {
01793     clear_device_array<double>(energies_virials, energies_virials_SIZE, stream);
01794   }
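// The allocation above uses the doSlow=true size, so toggling doSlow between
// time steps never triggers a reallocation; the clear here and the copy-back
// at the end only touch forceCopySize, the portion actually written this step.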
01795 
01796   float one_scale14 = (float)(1.0 - scale14);
01797 
01798   // If doSlow is false, the full (slow) electrostatic exclusions are skipped
01799   int numExclusionsDoSlow = doSlow ? numExclusions : 0;
01800 
01801   int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
01802 
01803   int numBondsTB     = (numBonds + nthread - 1)/nthread;
01804   int numAnglesTB    = (numAngles + nthread - 1)/nthread;
01805   int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
01806   int numImpropersTB = (numImpropers + nthread - 1)/nthread;
01807   int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
01808   int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
01809 
01810   int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB + 
01811   numExclusionsTB + numCrosstermsTB;
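// One grid covers all bonded tuple types: bonds get the first numBondsTB
// blocks, angles the next numAnglesTB, and so on; the kernel recovers which
// list a block works on from its global block index (start + blockIdx.x).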
01812   int shmem_size = 0;
01813 
01814   // printf("%d %d %d %d %d %d nblock %d\n",
01815   //   numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
01816 
01817   int max_nblock = deviceCUDA->getMaxNumBlocks();
01818 
01819   int start = 0;
01820   while (start < nblock)
01821   {
01822     int nleft = nblock - start;
01823     int nblock_use = min(max_nblock, nleft);
01824 
01825 #define CALL(DOENERGY, DOVIRIAL) \
01826   bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> \
01827   <<< nblock_use, nthread, shmem_size, stream >>> \
01828   (start, numBonds, bonds, bondValues, \
01829     numAngles, angles, angleValues, \
01830     numDihedrals, dihedrals, dihedralValues, \
01831     numImpropers, impropers, improperValues, \
01832     numExclusionsDoSlow, exclusions, \
01833     numCrossterms, crossterms, crosstermValues, \
01834     cutoff2, \
01835     r2_delta, r2_delta_expc, \
01836     cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
01837     cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
01838     xyzq, forceStride, \
01839     lata, latb, latc, \
01840     forces, &forces[startSlow], energies_virials);
01841 
01842     if (!doEnergy && !doVirial) CALL(0, 0);
01843     if (!doEnergy && doVirial)  CALL(0, 1);
01844     if (doEnergy && !doVirial)  CALL(1, 0);
01845     if (doEnergy && doVirial)   CALL(1, 1);
01846 
01847 #undef CALL
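// The runtime flags select among compile-time template instantiations, so
// the energy/virial branches cost nothing in the common force-only step.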
01848     cudaCheck(cudaGetLastError());
01849 
01850     start += nblock_use;
01851   }
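// The launch loop is chunked: when the grid would exceed the device block
// limit from getMaxNumBlocks(), the kernel is re-launched with the running
// offset 'start', so every logical block still runs exactly once.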
01852 
01853   nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
01854   nblock = (numModifiedExclusions + nthread - 1)/nthread;
01855 
01856   bool doElect = (one_scale14 != 0.0f);
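// When scale14 == 1 the scaled 1-4 electrostatic correction vanishes
// (one_scale14 == 0), so the DOELECT=0 instantiations below skip the
// electrostatic work and compute only the van der Waals part.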
01857 
01858   start = 0;
01859   while (start < nblock)
01860   {
01861     int nleft = nblock - start;
01862     int nblock_use = min(max_nblock, nleft);
01863 
01864 #define CALL(DOENERGY, DOVIRIAL, DOELECT) \
01865   modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
01866   <<< nblock_use, nthread, shmem_size, stream >>> \
01867   (start, numModifiedExclusions, modifiedExclusions, \
01868     doSlow, one_scale14, cutoff2, \
01869     cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
01870     cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
01871     cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
01872     xyzq, forceStride, lata, latb, latc, \
01873     &forces[startNbond], &forces[startSlow], energies_virials);
01874 
01875     if (!doEnergy && !doVirial && !doElect) CALL(0, 0, 0);
01876     if (!doEnergy && doVirial && !doElect)  CALL(0, 1, 0);
01877     if (doEnergy && !doVirial && !doElect)  CALL(1, 0, 0);
01878     if (doEnergy && doVirial && !doElect)   CALL(1, 1, 0);
01879 
01880     if (!doEnergy && !doVirial && doElect)  CALL(0, 0, 1);
01881     if (!doEnergy && doVirial && doElect)   CALL(0, 1, 1);
01882     if (doEnergy && !doVirial && doElect)   CALL(1, 0, 1);
01883     if (doEnergy && doVirial && doElect)    CALL(1, 1, 1);
01884 
01885 #undef CALL
01886     cudaCheck(cudaGetLastError());
01887 
01888     start += nblock_use;
01889   }
01890 
01891   copy_DtoH<FORCE_TYPE>(forces, h_forces, forceCopySize, stream);
01892   if (doEnergy || doVirial) {
01893     copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
01894   }
01895 
01896 }
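// End-to-end usage sketch (illustrative; the argument names are hypothetical
// stand-ins for what the calling NAMD code actually passes):
//
//   bondedKernel.update(numBonds, numAngles, numDihedrals, numImpropers,
//                       numModifiedExclusions, numExclusions, numCrossterms,
//                       h_tupleData, stream);
//   bondedKernel.bondedForce(scale14, atomStorageSize,
//                            true /*doEnergy*/, true /*doVirial*/, true /*doSlow*/,
//                            lata, latb, latc, cutoff2, r2_delta, r2_delta_expc,
//                            h_xyzq, h_forces, h_energies_virials, stream);
//   cudaCheck(cudaStreamSynchronize(stream));  // host buffers valid after sync
//
// All copies in bondedForce() are asynchronous on 'stream'; h_xyzq, h_forces
// and h_energies_virials must remain valid until the stream synchronizes.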
