CudaComputeGBISKernel.cu

Go to the documentation of this file.
00001 #include <cuda.h>
00002 #include <cub/cub.cuh>
00003 #include "CudaUtils.h"
00004 #include "CudaTileListKernel.h"
00005 #include "CudaComputeGBISKernel.h"
00006 #define GBIS_CUDA
00007 #include "ComputeGBIS.inl"
00008 #include "DeviceCUDA.h"
00009 #ifdef WIN32
00010 #define __thread __declspec(thread)
00011 #endif
00012 extern __thread DeviceCUDA *deviceCUDA;
00013 
00014 #define LARGE_FLOAT (float)(1.0e10)
00015 
00016 #define GBISKERNEL_NUM_WARP 4
00017 
//
// Rotate a per-lane float one lane "forward" within the warp: after the
// call, lane k holds the value lane (k+1) mod WARPSIZE held before.
//
__device__ __forceinline__
void shuffleNext(float& val) {
  const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
  val = WARP_SHUFFLE(WARP_FULL_MASK, val, srcLane, WARPSIZE);
}
00022 
// Generic primary templates; each GBIS phase (1, 2, 3) provides a full
// specialization below. The unspecialized forms are intentionally empty.
template <int phase> struct GBISParam {};
template <int phase> struct GBISInput {};
template <int phase> struct GBISResults {};
00027 
00028 // ------------------------------------------------------------------------------------------------
00029 // Phase 1 specialization
00030 
// Phase 1 parameters: a_cut is the integration cutoff passed to CalcH.
template <> struct GBISParam<1> {
  float a_cut;
};
00034 
// Phase 1 per-atom inputs. The generic field names are reused across
// phases; here inp1 = intRad0 and inp2 = intRadS, so "qi"/"qj" actually
// carry integration radii, not charges:
//   qi = intRad0[i], intRadSi = intRadS[i]   (i-atom)
//   qj = intRadS[j], intRad0j = intRad0[j]   (j-atom)
template <> struct GBISInput<1> {
  float qi, qj;
  float intRad0j, intRadSi;
  // inp1 = intRad0
  // inp2 = intRadS
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    intRadSi = inp2[i];
    qi       = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    intRad0j = inp1[i];
    qj       = inp2[i];
  }
  // Charges are not needed in phase 1; these are intentional no-ops.
  __device__ __forceinline__ void initQi(const GBISParam<1> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  // Rotate the j-atom values one lane forward within the warp.
  __device__ __forceinline__ void shuffleNext() {
    const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
    qj       = WARP_SHUFFLE(WARP_FULL_MASK, qj,       srcLane, WARPSIZE);
    intRad0j = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j, srcLane, WARPSIZE);
  }
};
00055 
// Phase 1 per-atom accumulator: running sum of H terms (psi).
template <> struct GBISResults<1> {
  float psiSum;
  __device__ __forceinline__ void init() { psiSum = 0.0f; }
  // Rotate the accumulator one lane forward within the warp.
  __device__ __forceinline__ void shuffleNext() {
    const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
    psiSum = WARP_SHUFFLE(WARP_FULL_MASK, psiSum, srcLane, WARPSIZE);
  }
};
00063 
//
// Phase 1 pair interaction: accumulate H(i,j) into atom i's psiSum and
// H(j,i) into atom j's psiSum. The r2 > 0.01f test skips zero/near-zero
// separations (the same-atom entry of diagonal tiles).
// energy is unused in this phase.
//
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<1> param, const GBISInput<1> inp,
  GBISResults<1>& resI, GBISResults<1>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    const float rinv = rsqrtf(r2);
    const float r    = r2 * rinv;
    // H(i,j): i's intRad0 against j's intRadS (held in inp.qi / inp.qj)
    float hij;
    int dij;
    CalcH(r, r2, rinv, param.a_cut, inp.qi, inp.qj, hij, dij);
    resI.psiSum += hij;
    // H(j,i): j's intRad0 against i's intRadS
    float hji;
    int dji;
    CalcH(r, r2, rinv, param.a_cut, inp.intRad0j, inp.intRadSi, hji, dji);
    resJ.psiSum += hji;
  }
}
00084 
// Phase 1 output: atomically accumulate this lane's psiSum for atom i
// into the global psiSum array ("out"). "forces" is unused in phase 1.
__device__ __forceinline__ void writeResult(const int i, const GBISResults<1> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.psiSum);
}
00088 
00089 // ------------------------------------------------------------------------------------------------
00090 // Phase 2
00091 
// Phase 2 parameters (filled in on the host by GBISphase2).
template <> struct GBISParam<2> {
  float kappa;                    // screening parameter; used as exp(-kappa*fij)
  float epsilon_p_i, epsilon_s_i; // inverse protein / solvent dielectric constants
  float smoothDist;               // > 0 enables the NAMD smoothing function
  float r_cut2, r_cut_2, r_cut_4; // r_cut^2, 1/r_cut^2, 4/r_cut^4
  float scaling;                  // charge pre-scaling factor (qi = -q*scaling)
};
00099 
// Phase 2 per-atom inputs: charges and Born radii (inp1 = bornRad).
template <> struct GBISInput<2> {
  float qi, qj;
  float bornRadI, bornRadJ;
  // inp1 = bornRad
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadI = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    bornRadJ = inp1[i];
  }
  // The i-charge is pre-scaled: qi = -q*scaling. The sign/scale is undone
  // again inside the phase 2 self-energy term.
  __device__ __forceinline__ void initQi(const GBISParam<2> param, const float q) {
    qi = -q*param.scaling;
  }
  __device__ __forceinline__ void initQj(const float q) {
    qj = q;
  }
  // Rotate the j-atom values one lane forward within the warp.
  __device__ __forceinline__ void shuffleNext() {
    const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
    qj       = WARP_SHUFFLE(WARP_FULL_MASK, qj,       srcLane, WARPSIZE);
    bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, srcLane, WARPSIZE);
  }
};
00121 
// Phase 2 per-atom accumulators: net GB force plus the dE/da sum.
template <> struct GBISResults<2> {
  float3 force;
  float dEdaSum;
  __device__ __forceinline__ void init() {
    force = make_float3(0.0f, 0.0f, 0.0f);
    dEdaSum = 0.0f;
  }
  // Rotate the accumulators one lane forward within the warp.
  __device__ __forceinline__ void shuffleNext() {
    const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, srcLane, WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, srcLane, WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, srcLane, WARPSIZE);
    dEdaSum = WARP_SHUFFLE(WARP_FULL_MASK, dEdaSum, srcLane, WARPSIZE);
  }
};
00133 
//
// Phase 2 pair interaction: Generalized Born pair energy, its force
// contribution, and — when doSlow — the dE/da Born-radius derivative
// sums. Also adds the GB self-energy for the same-atom entry
// (r2 < 0.01f) when doEnergy.
//
// Fix: use explicit single-precision math (expf/sqrtf, float literals)
// instead of exp/sqrt and int literals; CUDA's float overloads make this
// numerically identical, but it removes any risk of silent promotion to
// double in a float-only kernel.
//
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<2> param, const GBISInput<2> inp,
  GBISResults<2>& resI, GBISResults<2>& resJ, float& energyT) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    float r_i = rsqrtf(r2);
    float r  = r2 * r_i;

    // GB pair energy: gbEij = qi*qj * Dij / fij with
    // fij = sqrt(r2 + ai*aj*exp(-r2/(4*ai*aj)))
    float qiqj = inp.qi*inp.qj;
    float aiaj = inp.bornRadI*inp.bornRadJ;
    float aiaj4 = 4.0f*aiaj;
    float expr2aiaj4 = expf(-r2/aiaj4);
    float fij = sqrtf(r2 + aiaj*expr2aiaj4);
    float f_i = 1.0f/fij;
    float expkappa = expf(-param.kappa*fij);
    float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
    float gbEij = qiqj*Dij*f_i;

    // Derivatives of the pair energy with respect to r
    float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
    float ddrf_i = -ddrfij*f_i*f_i;
    float ddrDij = param.kappa*expkappa*ddrfij*param.epsilon_s_i;
    float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);

    // NAMD smoothing function: taper energy and force to zero at r_cut
    float scale = 1.f;
    float ddrScale = 0.f;
    float forcedEdr;
    if (param.smoothDist > 0.f) {
      scale = r2 * param.r_cut_2 - 1.f;
      scale *= scale;
      ddrScale = r*(r2-param.r_cut2)*param.r_cut_4;
      if (doEnergy) energyT += gbEij * scale;
      forcedEdr = (ddrGbEij)*scale + (gbEij)*ddrScale;
    } else {
      if (doEnergy) energyT += gbEij;
      forcedEdr = ddrGbEij;
    }

    // dE/da sums consumed later via dHdrPrefix (slow forces only)
    if (doSlow) {
      float dEdai = 0.5f*qiqj*f_i*f_i
                *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
                *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadI*scale;
      resI.dEdaSum += dEdai;
      float dEdaj = 0.5f*qiqj*f_i*f_i
                *(param.kappa*param.epsilon_s_i*expkappa-Dij*f_i)
                *(aiaj+0.25f*r2)*expr2aiaj4/inp.bornRadJ*scale;
      resJ.dEdaSum += dEdaj;
    }

    // Equal and opposite force contributions on i and j
    forcedEdr *= r_i;
    float tmpx = dx*forcedEdr;
    float tmpy = dy*forcedEdr;
    float tmpz = dz*forcedEdr;
    resI.force.x += tmpx;
    resI.force.y += tmpy;
    resI.force.z += tmpz;
    resJ.force.x -= tmpx;
    resJ.force.y -= tmpy;
    resJ.force.z -= tmpz;
  }

  // GB self energy: the same-atom entry uses bornRadI in place of fij.
  if (doEnergy) {
    if (r2 < 0.01f) {
      float fij = inp.bornRadI;//inf
      float expkappa = expf(-param.kappa*fij);//0
      float Dij = param.epsilon_p_i - expkappa*param.epsilon_s_i;
      // initQi stored qi = -q*scaling, so qi*(qi/(-scaling)) recovers q*qi
      float gbEij = inp.qi*(inp.qi / (-param.scaling) )*Dij/fij;
      energyT += 0.5f*gbEij;
    } //same atom or within cutoff
  }
}
00211 
// Phase 2 output: atomically accumulate atom i's dE/da sum into "out"
// (the dEdaSum array in this phase) and its GB force into the force
// array. atomicAdd is used because contributions to the same atom may be
// written concurrently.
__device__ __forceinline__ void writeResult(const int i, const GBISResults<2> res, float* out, float4* forces) {
  atomicAdd(&out[i], res.dEdaSum);
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
00218 
00219 // ------------------------------------------------------------------------------------------------
00220 // Phase 3
// Phase 3 parameters: a_cut is the integration cutoff passed to CalcDH.
template <> struct GBISParam<3> {
  float a_cut;
};
00224 
// Phase 3 per-atom inputs: integration radii plus the dHdrPrefix factors.
// Field-name note: inp1 = intRad0, inp2 = intRadS, inp3 = dHdrPrefix, so
// "qi" actually carries intRad0[i] in this phase.
template <> struct GBISInput<3> {
  float qi;
  float intRadSJ, intRadJ0, intRadIS;
  float dHdrPrefixI, dHdrPrefixJ;
  // inp1 = intRad0
  // inp2 = intRadS
  // inp3 = dHdrPrefix
  __device__ __forceinline__ void loadI(const int i, const float* inp1, const float* inp2, const float* inp3) {
    dHdrPrefixI = inp3[i];
    intRadIS    = inp2[i];
    qi          = inp1[i];
  }
  __device__ __forceinline__ void loadJ(const int i, const float* inp1, const float* inp2, const float* inp3) {
    dHdrPrefixJ = inp3[i];
    intRadSJ    = inp2[i];
    intRadJ0    = inp1[i];
  }
  // Charges are not needed in phase 3; these are intentional no-ops.
  __device__ __forceinline__ void initQi(const GBISParam<3> param, const float q) {}
  __device__ __forceinline__ void initQj(const float q) {}
  // Rotate the j-atom values one lane forward within the warp.
  __device__ __forceinline__ void shuffleNext() {
    const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
    intRadSJ    = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ,    srcLane, WARPSIZE);
    intRadJ0    = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0,    srcLane, WARPSIZE);
    dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, srcLane, WARPSIZE);
  }
};
00250 
// Phase 3 per-atom accumulator: net force only (no energy in this phase).
template <> struct GBISResults<3> {
  float3 force;
  __device__ __forceinline__ void init() {
    force = make_float3(0.0f, 0.0f, 0.0f);
  }
  // Rotate the accumulator one lane forward within the warp.
  __device__ __forceinline__ void shuffleNext() {
    const int srcLane = (threadIdx.x + 1) & (WARPSIZE - 1);
    force.x = WARP_SHUFFLE(WARP_FULL_MASK, force.x, srcLane, WARPSIZE);
    force.y = WARP_SHUFFLE(WARP_FULL_MASK, force.y, srcLane, WARPSIZE);
    force.z = WARP_SHUFFLE(WARP_FULL_MASK, force.z, srcLane, WARPSIZE);
  }
};
00260 
//
// Phase 3 pair interaction: dH/dr force contribution weighted by the
// dHdrPrefix factors. energy is unused in this phase.
//
template <bool doEnergy, bool doSlow>
__device__ __forceinline__ void calcGBISPhase(const float r2, const float dx, const float dy, const float dz,
  const float cutoff2, const GBISParam<3> param, const GBISInput<3> inp,
  GBISResults<3>& resI, GBISResults<3>& resJ, float& energy) {

  if (r2 < cutoff2 && r2 > 0.01f) {
    const float rinv = rsqrtf(r2);
    const float r    = r2 * rinv;

    // dH(i,j)/dr and dH(j,i)/dr (inp.qi holds intRad0[i] in this phase)
    float dhij, dhji;
    int dij, dji;
    CalcDH(r, r2, rinv, param.a_cut, inp.qi,       inp.intRadSJ, dhij, dij);
    CalcDH(r, r2, rinv, param.a_cut, inp.intRadJ0, inp.intRadIS, dhji, dji);

    // Weight by dHdrPrefix and apply equal/opposite forces on i and j.
    const float forceAlpha = rinv*(inp.dHdrPrefixI*dhij + inp.dHdrPrefixJ*dhji);
    const float fx = dx * forceAlpha;
    const float fy = dy * forceAlpha;
    const float fz = dz * forceAlpha;
    resI.force.x += fx;
    resI.force.y += fy;
    resI.force.z += fz;
    resJ.force.x -= fx;
    resJ.force.y -= fy;
    resJ.force.z -= fz;
  }
}
00287 
// Phase 3 output: atomically accumulate atom i's force. "out" is unused
// in phase 3 (the host passes NULL for it).
__device__ __forceinline__ void writeResult(const int i, const GBISResults<3> res, float* out, float4* forces) {
  atomicAdd(&forces[i].x, res.force.x);
  atomicAdd(&forces[i].y, res.force.y);
  atomicAdd(&forces[i].z, res.force.z);
}
00293 
00294 // ------------------------------------------------------------------------------------------------
00295 
//
// GBIS kernel: common driver for all three Generalized Born phases,
// selected by the phase template parameter.
//
// Work layout: each warp walks one tile list (grid-stride loop over tile
// lists). Each lane holds one i-atom; j-atom data is rotated around the
// warp with shuffles so every (i,j) pair of a tile is evaluated once.
//
// inp1/inp2/inp3 and out are phase-dependent (see the GBISInput and
// writeResult specializations above); the host passes NULL for unused
// ones. __launch_bounds__: at most 32*GBISKERNEL_NUM_WARP threads per
// block, requesting at least 6 resident blocks per SM (note the literal
// 32 here assumes WARPSIZE == 32 — TODO confirm for non-32 warp builds).
//
template <bool doEnergy, bool doSlow, int phase>
__global__ void
__launch_bounds__(32*GBISKERNEL_NUM_WARP, 6)
GBIS_Kernel(const int numTileLists,
  const TileList* __restrict__ tileLists,
  const int* __restrict__ tileJatomStart,
  const PatchPairRecord* __restrict__ patchPairs,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* __restrict__ xyzq,
  const float cutoff2,
  const GBISParam<phase> param,
  const float* __restrict__ inp1,
  const float* __restrict__ inp2,
  const float* __restrict__ inp3,
  float* __restrict__ out,
  float4* __restrict__ forces,
  TileListVirialEnergy* __restrict__ virialEnergy) {

  // Single warp takes care of one list of tiles
  for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
  {
    TileList tmp = tileLists[itileList];
    int iatomStart = tmp.iatomStart;
    int jtileStart = tmp.jtileStart;
    int jtileEnd   = tmp.jtileEnd;
    PatchPairRecord PPStmp = patchPairs[itileList];
    int iatomSize     = PPStmp.iatomSize;
    int jatomSize     = PPStmp.jatomSize;

    // Periodic image shift for this tile list, built from the lattice
    // vectors lata/latb/latc and the tile-list offset.
    float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
    float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
    float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;

    // Warp index (0...warpsize-1)
    const int wid = threadIdx.x % WARPSIZE;

    // Load i-atom data (and shift coordinates)
    float4 xyzq_i = xyzq[iatomStart + wid];
    xyzq_i.x += shx;
    xyzq_i.y += shy;
    xyzq_i.z += shz;

    GBISInput<phase> inp;
    inp.loadI(iatomStart + wid, inp1, inp2, inp3);
    if (phase == 2) inp.initQi(param, xyzq_i.w);

    // Number of valid i-atoms in this tile (tail tiles may be partial)
    int nloopi = min(iatomSize - iatomStart, WARPSIZE);

    GBISResults<phase> resI;
    resI.init();
    float energyT;
    if (doEnergy) energyT = 0.0f;

    for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {

      // Load j-atom starting index
      int jatomStart = tileJatomStart[jtile];

      // Load coordinates and charge
      float4 xyzq_j  = xyzq[jatomStart + wid];

      inp.loadJ(jatomStart + wid, inp1, inp2, inp3);
      if (phase == 2) inp.initQj(xyzq_j.w);

      // Number of valid j-atoms in this tile
      int nloopj = min(jatomSize - jatomStart, WARPSIZE);

      // Diagonal (self) tiles: in phases 1 and 3 each unordered pair must
      // be evaluated only once, so start at t=1 and widen the wrap mask to
      // 2*WARPSIZE-1 — out-of-range j values then fail the j < nloopj test
      // below. Phase 2 keeps t=0 so the same-atom entry reaches the GB
      // self-energy term in calcGBISPhase<2>.
      const bool diag_tile = (iatomStart == jatomStart);
      const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
      int t = (phase != 2 && diag_tile) ? 1 : 0;
      if (phase != 2 && diag_tile) {
        inp.shuffleNext();
      }

      GBISResults<phase> resJ;
      resJ.init();

      // Rotate j-atom data one lane per iteration so each lane visits
      // every j-atom of the tile exactly once.
      for (;t < WARPSIZE;t++) {
        int j = (t + wid) & modval;

        float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
        float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
        float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;

        float r2 = dx*dx + dy*dy + dz*dz;

        // Guard against partially filled tiles at the patch tail
        if (j < nloopj && wid < nloopi) {
          calcGBISPhase<doEnergy, doSlow>(r2, dx, dy, dz, cutoff2, param, inp, resI, resJ, energyT);
        }

        inp.shuffleNext();
        resJ.shuffleNext();
      } // t

      // Write j
      writeResult(jatomStart + wid, resJ, out, forces);

    } // jtile

    // Write i
    writeResult(iatomStart + wid, resI, out, forces);

    // Reduce energy across the warp in double precision; lane 0 stores
    // the per-tile-list GBIS energy.
    if (doEnergy) {
      typedef cub::WarpReduce<double> WarpReduceDouble;
      __shared__ typename WarpReduceDouble::TempStorage tempStorage[GBISKERNEL_NUM_WARP];
      int warpId = threadIdx.x / WARPSIZE;
      volatile double energyTot = WarpReduceDouble(tempStorage[warpId]).Sum((double)energyT);
      if (wid == 0) virialEnergy[itileList].energyGBIS = energyTot;
    }

  }
}
00413 
00414 #undef GBIS_CUDA
00415 
00416 // ##############################################################################################
00417 // ##############################################################################################
00418 // ##############################################################################################
00419 
//
// Class constructor: binds to the given device and marks all device
// buffers as unallocated (they are grown lazily by reallocate_device).
//
CudaComputeGBISKernel::CudaComputeGBISKernel(int deviceID) : deviceID(deviceID) {
  cudaCheck(cudaSetDevice(deviceID));

  intRad0    = NULL;  intRad0Size    = 0;
  intRadS    = NULL;  intRadSSize    = 0;
  psiSum     = NULL;  psiSumSize     = 0;
  bornRad    = NULL;  bornRadSize    = 0;
  dEdaSum    = NULL;  dEdaSumSize    = 0;
  dHdrPrefix = NULL;  dHdrPrefixSize = 0;
}
00445 
//
// Class destructor: releases every device buffer that was allocated.
// Each buffer may never have been allocated, hence the NULL checks.
//
CudaComputeGBISKernel::~CudaComputeGBISKernel() {
  cudaCheck(cudaSetDevice(deviceID));
  if (dHdrPrefix != NULL) deallocate_device<float>(&dHdrPrefix);
  if (dEdaSum != NULL) deallocate_device<float>(&dEdaSum);
  if (bornRad != NULL) deallocate_device<float>(&bornRad);
  if (psiSum != NULL) deallocate_device<float>(&psiSum);
  if (intRadS != NULL) deallocate_device<float>(&intRadS);
  if (intRad0 != NULL) deallocate_device<float>(&intRad0);
}
00458 
//
// Update (copy Host->Device) intRad0, intRadS.
//
void CudaComputeGBISKernel::updateIntRad(const int atomStorageSize, float* intRad0H, float* intRadSH,
  cudaStream_t stream) {

  // Grow each device buffer if needed (1.2x overallocation factor), then
  // push the host data asynchronously on the given stream.
  reallocate_device<float>(&intRad0, &intRad0Size, atomStorageSize, 1.2f);
  copy_HtoD<float>(intRad0H, intRad0, atomStorageSize, stream);

  reallocate_device<float>(&intRadS, &intRadSSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(intRadSH, intRadS, atomStorageSize, stream);
}
00471 
//
// Update (copy Host->Device) bornRad
//
void CudaComputeGBISKernel::updateBornRad(const int atomStorageSize, float* bornRadH, cudaStream_t stream) {
  // Grow the device buffer if needed (1.2x overallocation), then copy
  // asynchronously on the given stream.
  reallocate_device<float>(&bornRad, &bornRadSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(bornRadH, bornRad, atomStorageSize, stream);
}
00479 
//
// Update (copy Host->Device) dHdrPrefix
//
void CudaComputeGBISKernel::update_dHdrPrefix(const int atomStorageSize, float* dHdrPrefixH, cudaStream_t stream) {
  // Grow the device buffer if needed (1.2x overallocation), then copy
  // asynchronously on the given stream.
  reallocate_device<float>(&dHdrPrefix, &dHdrPrefixSize, atomStorageSize, 1.2f);
  copy_HtoD<float>(dHdrPrefixH, dHdrPrefix, atomStorageSize, stream);
}
00487 
//
// Phase 1: launch the psiSum accumulation kernel and copy the result
// back to the host.
//
void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum,
  cudaStream_t stream) {

  // The kernel accumulates with atomicAdd, so the buffer must be zeroed.
  reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(psiSum, atomStorageSize, stream);

  GBISParam<1> param;
  param.a_cut = a_cut;

  // Pair cutoff includes the FS_MAX margin on top of a_cut.
  const float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  // One warp per tile list.
  const int numWarps   = GBISKERNEL_NUM_WARP;
  const int numThreads = WARPSIZE*numWarps;
  const int numBlocks  = min(deviceCUDA->getMaxNumBlocks(),
                             (tlKernel.getNumTileListsGBIS()-1)/numWarps+1);

  GBIS_Kernel<false, false, 1> <<< numBlocks, numThreads, 0, stream >>>
  (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, NULL, psiSum, NULL, NULL);

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(psiSum, h_psiSum, atomStorageSize, stream);
}
00516 
//
// Phase 2: launch the GB energy/force kernel and copy the dE/da sums
// back to the host.
//
// Fix: the dispatch helper macro CALL was never #undef'd and leaked into
// the rest of the translation unit; it is now undefined after use.
//
void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const bool doEnergy, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float r_cut, const float scaling, const float kappa, const float smoothDist,
  const float epsilon_p, const float epsilon_s,
  float4* d_forces, float* h_dEdaSum, cudaStream_t stream) {

  // dEdaSum is accumulated with atomicAdd in the kernel; zero it first.
  reallocate_device<float>(&dEdaSum, &dEdaSumSize, atomStorageSize, 1.2f);
  clear_device_array<float>(dEdaSum, atomStorageSize, stream);

  if (doEnergy) {
    tlKernel.setTileListVirialEnergyGBISLength(tlKernel.getNumTileListsGBIS());
  }

  // One warp per tile list.
  int nwarp = GBISKERNEL_NUM_WARP;
  int nthread = WARPSIZE*nwarp;

  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getNumTileListsGBIS()-1)/nwarp+1);

  // Kernel parameters: inverse dielectrics and precomputed cutoff powers
  // used by the smoothing function.
  GBISParam<2> param;
  param.r_cut2      = r_cut*r_cut;
  param.r_cut_2     = 1.f / param.r_cut2;
  param.r_cut_4     = 4.f*param.r_cut_2*param.r_cut_2;
  param.epsilon_s_i = 1.f / epsilon_s;
  param.epsilon_p_i = 1.f / epsilon_p;
  param.scaling     = scaling;
  param.kappa       = kappa;
  param.smoothDist  = smoothDist;

  // Dispatch over the two runtime booleans to the four template instances.
#define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \
     <<< nblock, nthread, 0, stream >>> \
    (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \
      tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \
      param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy())

  if (!doEnergy && !doSlow) CALL(false, false);
  if (!doEnergy &&  doSlow) CALL(false, true);
  if ( doEnergy && !doSlow) CALL(true,  false);
  if ( doEnergy &&  doSlow) CALL(true,  true);

#undef CALL

  cudaCheck(cudaGetLastError());

  copy_DtoH<float>(dEdaSum, h_dEdaSum, atomStorageSize, stream);
}
00564 
//
// Phase 3: launch the force kernel that consumes the dHdrPrefix values.
//
void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize,
  const float3 lata, const float3 latb, const float3 latc, const float a_cut,
  float4* d_forces, cudaStream_t stream) {

  GBISParam<3> param;
  param.a_cut = a_cut;

  // Pair cutoff includes the FS_MAX margin on top of a_cut, as in phase 1.
  const float cutoff2 = (a_cut + FS_MAX)*(a_cut + FS_MAX);

  // One warp per tile list.
  const int numWarps   = GBISKERNEL_NUM_WARP;
  const int numThreads = WARPSIZE*numWarps;
  const int numBlocks  = min(deviceCUDA->getMaxNumBlocks(),
                             (tlKernel.getNumTileListsGBIS()-1)/numWarps+1);

  GBIS_Kernel<false, false, 3> <<< numBlocks, numThreads, 0, stream >>>
  (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(),
    tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2,
    param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL);

  cudaCheck(cudaGetLastError());
}

Generated on Sat Sep 23 01:17:13 2017 for NAMD by  doxygen 1.4.7