ComputeGBISCUDAKernel.h

#include <stdio.h>
#define GBIS_CUDA
#include "ComputeGBIS.inl"

#undef KEPLER_SHUFFLE
#ifdef __CUDA_ARCH__
#define KEPLER_SHUFFLE
#if __CUDA_ARCH__ < 300
#undef KEPLER_SHUFFLE
#endif
#endif
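
// KEPLER_SHUFFLE selects between two ways of passing j-atom data
// around a warp: on sm_30 (Kepler) and newer the kernels use warp
// shuffle instructions, while older architectures fall back to
// staging the same data through shared memory.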

//1111111111111111111111111111111111111111111111111111111111
//
// GBIS Phase 1 CUDA Kernel
//
//1111111111111111111111111111111111111111111111111111111111
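//
// Phase 1 accumulates, for each atom, the pairwise screening sum
// psi_i = sum_j H(i,j) over neighbors within a_cut + FS_MAX; the psi
// sums are later converted into Born radii before Phase 2 runs.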
__global__ static void GBIS_P1_Kernel (
  const patch_pair *patch_pairs, // atoms pointers and such
  const atom *atoms,             // position & charge
  const float *intRad0,          // read in intrinsic radius
  const float *intRadS,          // read in intrinsic radius
  GBReal *tmp_psiSum,            // temporary device memory
  GBReal *psiSum,                // host-mapped memory
  const float a_cut,             // P1 interaction cutoff
  const float rho_0,             // H(i,j) parameter
  float3 lata,
  float3 latb,
  float3 latc,
  unsigned int *P1_counters
) {

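  // One thread block processes one patch pair: NUM_WARP warps of
  // WARPSIZE threads, with threadIdx.y selecting the warp and
  // threadIdx.x the lane within it.
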
  // shared memory
  __shared__ GBReal sh_psiSumJ_2d[NUM_WARP][WARPSIZE];
  __shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_intRad0j_2d[NUM_WARP][WARPSIZE];
#endif

  volatile GBReal* sh_psiSumJ = sh_psiSumJ_2d[threadIdx.y];
#ifndef KEPLER_SHUFFLE
  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
  volatile float* sh_intRad0j = sh_intRad0j_2d[threadIdx.y];
#endif

  // Load data into shared memory
  {
    const int t = threadIdx.x + threadIdx.y*WARPSIZE;
    if (t < PATCH_PAIR_SIZE) {
      int* src = (int *)&patch_pairs[blockIdx.x];
      int* dst = (int *)&sh_patch_pair;
      dst[t] = src[t];
    }
    BLOCK_SYNC;

    // convert scaled offset with current lattice and write into shared memory
    if (t == 0) {
      float offx = sh_patch_pair.offset.x * lata.x
                  + sh_patch_pair.offset.y * latb.x
                  + sh_patch_pair.offset.z * latc.x;
      float offy = sh_patch_pair.offset.x * lata.y
                  + sh_patch_pair.offset.y * latb.y
                  + sh_patch_pair.offset.z * latc.y;
      float offz = sh_patch_pair.offset.x * lata.z
                  + sh_patch_pair.offset.y * latb.z
                  + sh_patch_pair.offset.z * latc.z;
      sh_patch_pair.offset.x = offx;
      sh_patch_pair.offset.y = offy;
      sh_patch_pair.offset.z = offz;
    }
    BLOCK_SYNC;
  }
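
  // The offset stored in the patch pair is in scaled (fractional
  // lattice) coordinates; the products above convert it into a
  // Cartesian shift along lata/latb/latc, which is then added to the
  // Patch 1 positions so periodic images fall out naturally.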

  //iterate over chunks of atoms within Patch 1
  for (int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*WARPSIZE) {

    int nloopi = sh_patch_pair.patch1_size - blocki;
    nloopi = min(nloopi, WARPSIZE);

    //this thread accumulates only the psi sum for atomi; iterating over js
    atom atomi;
    float intRadSi;
    int i;

    // load BLOCK of Patch i atoms
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
      i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[i];
      atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
      atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
      atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
      atomi.charge = intRad0[i]; // overwrite charge with radius
      intRadSi = intRadS[i];
    } // load patch 1

    //init intermediate variables
    GBReal psiSumI = 0.f; // each thread accumulating single psi

    const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
        (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
    int blockj = (diag_patch_pair) ? blocki : 0;
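
    // For a self pair (patch paired with itself at zero offset) the
    // j loop starts at blocki instead of 0, and the diagonal tile is
    // walked with modval = 2*WARPSIZE starting at t = 1, so every
    // i,j pair is visited exactly once and self terms are skipped.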

    //iterate over chunks of atoms within Patch 2
    for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE ) {

      int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);

#ifdef KEPLER_SHUFFLE
      float xj;
      float yj;
      float zj;
      float chargej;
      float intRad0j_val;
#endif

      //load smaller chunk of j atoms: registers on the shuffle path, shared memory otherwise
      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
        xj = tmpa.x;
        yj = tmpa.y;
        zj = tmpa.z;
        chargej = intRadS[j];
        intRad0j_val = intRad0[j];
#else
        sh_jpq[threadIdx.x].position.x = tmpa.x;
        sh_jpq[threadIdx.x].position.y = tmpa.y;
        sh_jpq[threadIdx.x].position.z = tmpa.z;
        sh_jpq[threadIdx.x].charge = intRadS[j];
        sh_intRad0j[threadIdx.x] = intRad0[j];
#endif
      }

      const bool diag_tile = diag_patch_pair && (blocki == blockj);
      const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;

      sh_psiSumJ[threadIdx.x] = 0.f;

      //each thread loops over shared atoms
      int t = diag_tile ? 1 : 0;
#ifdef KEPLER_SHUFFLE
      if (diag_tile) {
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        intRad0j_val = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j_val, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
      }
#endif

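      // On the shuffle path each loop iteration below rotates the
      // j-atom values one lane over, so after WARPSIZE iterations
      // every thread has seen every j atom without shared memory.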
      for (; t < WARPSIZE; ++t ) {
        int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
        float xj = sh_jpq[j].position.x;
        float yj = sh_jpq[j].position.y;
        float zj = sh_jpq[j].position.z;
        float chargej = sh_jpq[j].charge;
        float intRad0j_val = sh_intRad0j[j];
#endif
        if (j < nloopj && threadIdx.x < nloopi)
        {
          float dx = atomi.position.x - xj;
          float dy = atomi.position.y - yj;
          float dz = atomi.position.z - zj;
          float r2 = dx*dx + dy*dy + dz*dz;

          // within cutoff and not the same atom
          if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
            // calculate H(i,j) [and not H(j,i)]
            float r_i = 1.f / sqrt(r2);
            float r  = r2 * r_i;
            float hij;
            int dij;
            CalcH(r,r2,r_i,a_cut,atomi.charge,chargej,hij,dij);
            psiSumI += hij;
            float hji;
            int dji;
            CalcH(r,r2,r_i,a_cut,intRad0j_val,intRadSi,hji,dji);
            sh_psiSumJ[j] += hji;
          } // cutoff
        } // if (j < nloopj)
#ifdef KEPLER_SHUFFLE
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        intRad0j_val = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j_val, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
#endif
      } // for t

      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        atomicAdd(&tmp_psiSum[i_out], sh_psiSumJ[threadIdx.x]);
      }

    } // for block j
    //psiSumI now contains contributions from all j in 2nd patch

    // write psiSum to global memory buffer; to be accumulated later
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
      int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      atomicAdd(&tmp_psiSum[i_out], psiSumI);
    }
  } // for block i

  { // start of psi sum
    // make sure psiSums are visible in global memory
    __threadfence();
    BLOCK_SYNC;

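    // atomicInc(ctr, N-1) wraps to 0 after N increments, so the block
    // whose increment returns N-1 is the last one to touch that patch
    // and takes responsibility for copying its results out below.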
    // Mark patch pair (patch1_ind, patch2_ind) as "done"
    int patch1_ind = sh_patch_pair.patch1_ind;
    int patch2_ind = sh_patch_pair.patch2_ind;

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      sh_patch_pair.patch_done[0] = false;
      sh_patch_pair.patch_done[1] = false;

      unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
      int patch1_old = atomicInc(&P1_counters[patch1_ind], patch1_num_pairs-1);
      if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
      if (patch1_ind != patch2_ind) {
        unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
        int patch2_old = atomicInc(&P1_counters[patch2_ind], patch2_num_pairs-1);
        if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
      }
    }
    // sync threads so that patch_done[0] and patch_done[1] are visible to all threads
    BLOCK_SYNC;

    if (sh_patch_pair.patch_done[0]) {
      const int start = sh_patch_pair.patch1_start;
      for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
        psiSum[start+i] = tmp_psiSum[start+i];
      }
    }

    if (sh_patch_pair.patch_done[1]) {
      const int start = sh_patch_pair.patch2_start;
      for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
        psiSum[start+i] = tmp_psiSum[start+i];
      }
    }

    if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
      // Make sure page-locked host psiSum buffer is up-to-date
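      // (__threadfence_system(), available from compute capability
      // 2.0, orders these writes so the host sees them through the
      // mapped allocation; older devices only have __threadfence())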
#if __CUDA_ARCH__ < 200
      __threadfence();
#else
      __threadfence_system();
#endif
    }

  } // end of psi sum
} //GBIS_P1

//2222222222222222222222222222222222222222222222222222222222
//
// GBIS Phase 2 CUDA Kernel
//
//2222222222222222222222222222222222222222222222222222222222
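//
// Phase 2 consumes the Born radii: it accumulates the pairwise GB
// energies and forces and, if doFullElec is set, the dEda sums that
// feed the Phase 3 chain-rule force term.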
__global__ static void GBIS_P2_Kernel (
  const patch_pair *patch_pairs,// atoms pointers and such
  const atom *atoms,            // position & charge
  const float *bornRad,         // read in Born radius
  GBReal *tmp_dEdaSum,          // temporary device memory
  GBReal *dEdaSum,              // host-mapped memory
  const float a_cut,            // P1 interaction cutoff
  const float r_cut,            // P2 interaction cutoff (GB pair cutoff)
  const float scaling,          // scale nonbonded
  const float kappa,
  const float smoothDist,       // cutoff smoothing distance; > 0 enables smoothing
  const float epsilon_p,        // protein dielectric
  const float epsilon_s,        // solvent dielectric
  float3 lata,
  float3 latb,
  float3 latc,
  const int doEnergy,           // calculate energy too?
  const int doFullElec,         // calc dEdaSum for P3 full electrostatics
  float4 *tmp_forces,           // temporary device memory
  float4 *forces,               // host-mapped memory
  float *tmp_energy,            // temporary device memory
  float *energy,                // host-mapped memory
  unsigned int *P2_counters
) {

  // Shared memory
  __shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_jBornRad_2d[NUM_WARP][WARPSIZE];
#endif
  __shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_dEdaSumJ_2d[NUM_WARP][WARPSIZE];

#ifndef KEPLER_SHUFFLE
  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
  volatile float* sh_jBornRad = sh_jBornRad_2d[threadIdx.y];
#endif
  volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
  volatile float* sh_dEdaSumJ = sh_dEdaSumJ_2d[threadIdx.y];

  // Load data into shared memory
  {
    const int t = threadIdx.x + threadIdx.y*WARPSIZE;
    if (t < PATCH_PAIR_SIZE) {
      int* src = (int *)&patch_pairs[blockIdx.x];
      int* dst = (int *)&sh_patch_pair;
      dst[t] = src[t];
    }
    BLOCK_SYNC;

    // convert scaled offset with current lattice and write into shared memory
    if (t == 0) {
      float offx = sh_patch_pair.offset.x * lata.x
                  + sh_patch_pair.offset.y * latb.x
                  + sh_patch_pair.offset.z * latc.x;
      float offy = sh_patch_pair.offset.x * lata.y
                  + sh_patch_pair.offset.y * latb.y
                  + sh_patch_pair.offset.z * latc.y;
      float offz = sh_patch_pair.offset.x * lata.z
                  + sh_patch_pair.offset.y * latb.z
                  + sh_patch_pair.offset.z * latc.z;
      sh_patch_pair.offset.x = offx;
      sh_patch_pair.offset.y = offy;
      sh_patch_pair.offset.z = offz;
    }
    BLOCK_SYNC;
  }

  float energyT = 0.f; // total energy for this thread; to be reduced

  //values used in loop
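  // naming note: r_cut_2 holds 1/r_cut^2 and r_cut_4 holds 4/r_cut^4,
  // the prefactor of the smoothing-function derivative used below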
  float r_cut2 = r_cut*r_cut;
  float r_cut_2 = 1.f / r_cut2;
  float r_cut_4 = 4.f*r_cut_2*r_cut_2;
  float epsilon_s_i = 1.f / epsilon_s;
  float epsilon_p_i = 1.f / epsilon_p;

  //iterate over chunks of atoms within Patch 1
  for ( int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += BLOCK_SIZE ) {

    int nloopi = sh_patch_pair.patch1_size - blocki;
    nloopi = min(nloopi, WARPSIZE);

    //this thread calculates only the force on atomi; iterating over js
    atom atomi;
    float bornRadI;
    int i;

    // load BLOCK of Patch i atoms
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
      i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[i];
      atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
      atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
      atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
      atomi.charge = - tmpa.w * scaling;
      bornRadI = bornRad[i];
    } // load patch 1

    //init intermediate variables
    GBReal dEdaSumI = 0.f; // each thread accumulating a single dEda sum
    float3 forceI;
    forceI.x = 0.f;
    forceI.y = 0.f;
    forceI.z = 0.f;

    const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
        (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
    int blockj = (diag_patch_pair) ? blocki : 0;
    //iterate over chunks of atoms within Patch 2
    for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE) {

      int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);

#ifdef KEPLER_SHUFFLE
      float xj;
      float yj;
      float zj;
      float chargej;
      float bornRadJ;
#endif

      //load smaller chunk of j atoms: registers on the shuffle path, shared memory otherwise
      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
        xj = tmpa.x;
        yj = tmpa.y;
        zj = tmpa.z;
        chargej = tmpa.w;
        bornRadJ = bornRad[j];
#else
        sh_jpq[threadIdx.x].position.x = tmpa.x;
        sh_jpq[threadIdx.x].position.y = tmpa.y;
        sh_jpq[threadIdx.x].position.z = tmpa.z;
        sh_jpq[threadIdx.x].charge = tmpa.w;
        sh_jBornRad[threadIdx.x] = bornRad[j];
#endif
      }

      sh_forceJ[threadIdx.x].x = 0.f;
      sh_forceJ[threadIdx.x].y = 0.f;
      sh_forceJ[threadIdx.x].z = 0.f;
      sh_dEdaSumJ[threadIdx.x] = 0.f;

      const bool diag_tile = diag_patch_pair && (blocki == blockj);
      const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
      for (int t=0; t < WARPSIZE; ++t ) {
        int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
        float xj = sh_jpq[j].position.x;
        float yj = sh_jpq[j].position.y;
        float zj = sh_jpq[j].position.z;
        float chargej = sh_jpq[j].charge;
        float bornRadJ = sh_jBornRad[j];
#endif
        if (j < nloopj && threadIdx.x < nloopi)
        {
          float dx = atomi.position.x - xj;
          float dy = atomi.position.y - yj;
          float dz = atomi.position.z - zj;
          float r2 = dx*dx + dy*dy + dz*dz;

          // within cutoff and not the same atom
          if (r2 < r_cut2 && r2 > 0.01f) {

            float r_i = 1.f / sqrt(r2);
            float r  = r2 * r_i;

            //calculate GB energy
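            // standard generalized Born pair term: E_ij = q_i q_j D_ij / f_ij
            // with f_ij = sqrt(r^2 + a_i a_j exp(-r^2/(4 a_i a_j)))
            // and  D_ij = 1/eps_p - exp(-kappa f_ij)/eps_s,
            // where a_i, a_j are the Born radii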
            float qiqj = atomi.charge*chargej;
            float aiaj = bornRadI*bornRadJ;
            float aiaj4 = 4*aiaj;
            float expr2aiaj4 = exp(-r2/aiaj4);
            float fij = sqrt(r2+aiaj*expr2aiaj4);
            float f_i = 1/fij;
            float expkappa = exp(-kappa*fij);
            float Dij = epsilon_p_i - expkappa*epsilon_s_i;
            float gbEij = qiqj*Dij*f_i;

            //calculate energy derivatives
            float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
            float ddrf_i = -ddrfij*f_i*f_i;
            float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
            float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);

            //NAMD smoothing function
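            // scale = (r^2/r_cut^2 - 1)^2 tapers the interaction to
            // zero at r_cut; its radial derivative is
            // 4 r (r^2 - r_cut^2)/r_cut^4, computed below as
            // r*(r2-r_cut2)*r_cut_4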
            float scale = 1.f;
            float ddrScale = 0.f;
            float forcedEdr;
            if (smoothDist > 0.f) {
              scale = r2 * r_cut_2 - 1.f;
              scale *= scale;
              ddrScale = r*(r2-r_cut2)*r_cut_4;
              energyT += gbEij * scale;
              forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
            } else {
              energyT += gbEij;
              forcedEdr = -ddrGbEij;
            }

            //add dEda
            if (doFullElec) {
              float dEdai = 0.5f*qiqj*f_i*f_i
                        *(kappa*epsilon_s_i*expkappa-Dij*f_i)
                        *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;
              dEdaSumI += dEdai;
              float dEdaj = 0.5f*qiqj*f_i*f_i
                        *(kappa*epsilon_s_i*expkappa-Dij*f_i)
                        *(aiaj+0.25f*r2)*expr2aiaj4/bornRadJ*scale;
              sh_dEdaSumJ[j] += dEdaj;
            }

            forcedEdr *= r_i;
            float tmpx = dx*forcedEdr;
            float tmpy = dy*forcedEdr;
            float tmpz = dz*forcedEdr;
            forceI.x += tmpx;
            forceI.y += tmpy;
            forceI.z += tmpz;
            sh_forceJ[j].x -= tmpx;
            sh_forceJ[j].y -= tmpy;
            sh_forceJ[j].z -= tmpz;
          } // within cutoff
          if (r2 < 0.01f) {
            // GB Self Energy
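            // for the self term the pair function reduces to
            // f_ii = bornRadI; dividing one charge factor by -scaling
            // undoes the scaling applied at load, keeping the same
            // overall prefactor as the pair terms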
            if (doEnergy) {
              float fij = bornRadI;
              float expkappa = exp(-kappa*fij);
              float Dij = epsilon_p_i - expkappa*epsilon_s_i;
              float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
              energyT += 0.5f*gbEij;
            }
          } // same atom (self energy)
        } // if (j < nloopj)
#ifdef KEPLER_SHUFFLE
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
#endif
      } // for t
      if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        atomicAdd(&tmp_dEdaSum[i_out], sh_dEdaSumJ[threadIdx.x]);
        atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
        atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
        atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
      }
    } // for block j
    //dEdaSumI and forceI now contain contributions from all j in 2nd patch

    // write dEdaSum and force to global memory buffers; to be accumulated later
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
      int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      atomicAdd(&tmp_dEdaSum[i_out], dEdaSumI);
      atomicAdd(&tmp_forces[i_out].x, forceI.x);
      atomicAdd(&tmp_forces[i_out].y, forceI.y);
      atomicAdd(&tmp_forces[i_out].z, forceI.z);
    }
  } // for block i

  //Energy Reduction
  if (doEnergy) {
    // Do not have to sync here because each warp writes to the same
    // portion of sh_forceJ_2d as in the above force computation loop
    volatile float* sh_energy = (float *)&sh_forceJ_2d[threadIdx.y][0].x;
    // Reduce within warps
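    // warp-synchronous tree reduction: at each step a lane adds the
    // partial sum d lanes to its right, so after log2(WARPSIZE)
    // doublings lane 0 holds the warp total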
    sh_energy[threadIdx.x] = (float)energyT;
    for (int d=1;d < WARPSIZE;d*=2) {
      int pos = threadIdx.x + d;
      float val = (pos < WARPSIZE) ? sh_energy[pos] : 0.0f;
      sh_energy[threadIdx.x] += val;
    }
    BLOCK_SYNC;
    // Reduce among warps
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      float tot_energy = 0.0f;
#pragma unroll
      for (int i=0;i < NUM_WARP;++i) {
        tot_energy += ((float *)&sh_forceJ_2d[i][0].x)[0];
      }
      int patch1_ind = sh_patch_pair.patch1_ind;
      atomicAdd(&tmp_energy[patch1_ind], (float)tot_energy);
    }
  } //end Energy Reduction

  { // start of reduction
    // make sure tmp_forces and tmp_dEdaSum are visible in global memory
    __threadfence();
    BLOCK_SYNC;

    // Mark patch pair (patch1_ind, patch2_ind) as "done"
    int patch1_ind = sh_patch_pair.patch1_ind;
    int patch2_ind = sh_patch_pair.patch2_ind;

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      sh_patch_pair.patch_done[0] = false;
      sh_patch_pair.patch_done[1] = false;

      unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
      int patch1_old = atomicInc(&P2_counters[patch1_ind], patch1_num_pairs-1);
      if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
      if (patch1_ind != patch2_ind) {
        unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
        int patch2_old = atomicInc(&P2_counters[patch2_ind], patch2_num_pairs-1);
        if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
      }
    }
    // sync threads so that patch_done[0] and patch_done[1] are visible to all threads
    BLOCK_SYNC;

    if (sh_patch_pair.patch_done[0]) {
      const int start = sh_patch_pair.patch1_start;
      for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
        forces[start+i] = tmp_forces[start+i];
        dEdaSum[start+i] = tmp_dEdaSum[start+i];
      }
      energy[patch1_ind] = tmp_energy[patch1_ind];
    }

    if (sh_patch_pair.patch_done[1]) {
      const int start = sh_patch_pair.patch2_start;
      for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
        forces[start+i] = tmp_forces[start+i];
        dEdaSum[start+i] = tmp_dEdaSum[start+i];
      }
      energy[patch2_ind] = tmp_energy[patch2_ind];
    }

    if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
      // Make sure page-locked host arrays are up-to-date
#if __CUDA_ARCH__ < 200
      __threadfence();
#else
      __threadfence_system();
#endif
    }

  } // end of reduction
} //GBIS_P2

//3333333333333333333333333333333333333333333333333333333333
//
// GBIS Phase 3 CUDA Kernel
//
//3333333333333333333333333333333333333333333333333333333333
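//
// Phase 3 applies the chain rule: the per-atom dHdrPrefix factors
// (derived from the Phase 2 dEda sums) multiply the dH/dr values
// from CalcDH to give the forces due to Born-radius derivatives.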
__global__ static void GBIS_P3_Kernel (
  const patch_pair *patch_pairs,  // atoms pointers and such
  const atom *atoms,              // position & charge
  const float *intRad0,           // read in intrinsic radius
  const float *intRadS,           // read in intrinsic radius
  const float *dHdrPrefix,        // read in prefix
  const float a_cut,              // P1 interaction cutoff
  const float rho_0,              // H(i,j) parameter
  const float scaling,            // scale nonbonded
  float3 lata,
  float3 latb,
  float3 latc,
  float4 *tmp_forces,             // temporary device memory
  float4 *forces,                 // host-mapped memory
  unsigned int *P3_counters
) {

  // Shared memory
  __shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_intRadJ0_2d[NUM_WARP][WARPSIZE];
  __shared__ float sh_jDHdrPrefix_2d[NUM_WARP][WARPSIZE];
#endif
  __shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];

#ifndef KEPLER_SHUFFLE
  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
  volatile float* sh_intRadJ0 = sh_intRadJ0_2d[threadIdx.y];
  volatile float* sh_jDHdrPrefix = sh_jDHdrPrefix_2d[threadIdx.y];
#endif
  volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];

  // Load data into shared memory
  {
    const int t = threadIdx.x + threadIdx.y*WARPSIZE;
    if (t < PATCH_PAIR_SIZE) {
      int* src = (int *)&patch_pairs[blockIdx.x];
      int* dst = (int *)&sh_patch_pair;
      dst[t] = src[t];
    }
    BLOCK_SYNC;

    // convert scaled offset with current lattice and write into shared memory
    if (t == 0) {
      float offx = sh_patch_pair.offset.x * lata.x
                  + sh_patch_pair.offset.y * latb.x
                  + sh_patch_pair.offset.z * latc.x;
      float offy = sh_patch_pair.offset.x * lata.y
                  + sh_patch_pair.offset.y * latb.y
                  + sh_patch_pair.offset.z * latc.y;
      float offz = sh_patch_pair.offset.x * lata.z
                  + sh_patch_pair.offset.y * latb.z
                  + sh_patch_pair.offset.z * latc.z;
      sh_patch_pair.offset.x = offx;
      sh_patch_pair.offset.y = offy;
      sh_patch_pair.offset.z = offz;
    }
    BLOCK_SYNC;
  }

  //iterate over chunks of atoms within Patch 1
  for ( int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*WARPSIZE ) {

    int nloopi = sh_patch_pair.patch1_size - blocki;
    nloopi = min(nloopi, WARPSIZE);

    //this thread calculates only the force on atomi; iterating over js
    atom atomi;
    float intRadIS;
    int i;
    float dHdrPrefixI;

    // load BLOCK of Patch i atoms
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
      i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[i];
      atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
      atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
      atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
      atomi.charge = intRad0[i]; // overwrite charge with radius
      intRadIS = intRadS[i];
      dHdrPrefixI = dHdrPrefix[i];
    } // load patch 1

    //init intermediate variables
    float3 forceI;
    forceI.x = 0.f;
    forceI.y = 0.f;
    forceI.z = 0.f;

    const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
        (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
    int blockj = (diag_patch_pair) ? blocki : 0;
    //iterate over chunks of atoms within Patch 2
    for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE ) {

      int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);

#ifdef KEPLER_SHUFFLE
      float xj;
      float yj;
      float zj;
      float intRadSJ;
      float dHdrPrefixJ;
      float intRadJ0;
#endif

      //load smaller chunk of j atoms: registers on the shuffle path, shared memory otherwise
      if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
        xj = tmpa.x;
        yj = tmpa.y;
        zj = tmpa.z;
        intRadSJ = intRadS[j];
        dHdrPrefixJ = dHdrPrefix[j];
        intRadJ0 = intRad0[j];
#else
        sh_jpq[threadIdx.x].position.x = tmpa.x;
        sh_jpq[threadIdx.x].position.y = tmpa.y;
        sh_jpq[threadIdx.x].position.z = tmpa.z;
        sh_jpq[threadIdx.x].charge = intRadS[j];
        sh_jDHdrPrefix[threadIdx.x] = dHdrPrefix[j]; // load dHdrPrefix into shared
        sh_intRadJ0[threadIdx.x] = intRad0[j];
#endif
      }

      sh_forceJ[threadIdx.x].x = 0.f;
      sh_forceJ[threadIdx.x].y = 0.f;
      sh_forceJ[threadIdx.x].z = 0.f;

      const bool diag_tile = diag_patch_pair && (blocki == blockj);
      const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
#ifdef KEPLER_SHUFFLE
      if (diag_tile) {
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
      }
#endif
      int t = diag_tile ? 1 : 0;
      for (; t < WARPSIZE; ++t ) {
        int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
        float xj = sh_jpq[j].position.x;
        float yj = sh_jpq[j].position.y;
        float zj = sh_jpq[j].position.z;
        float intRadSJ = sh_jpq[j].charge;
        float dHdrPrefixJ = sh_jDHdrPrefix[j];
        float intRadJ0 = sh_intRadJ0[j];
#endif
        if (j < nloopj && threadIdx.x < nloopi)
        {
          float dx = atomi.position.x - xj;
          float dy = atomi.position.y - yj;
          float dz = atomi.position.z - zj;
          float r2 = dx*dx + dy*dy + dz*dz;

          // within cutoff and not the same atom
          if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {

            float r_i = 1.f / sqrt(r2);
            float r  = r2 * r_i;
            float dhij, dhji;
            int dij, dji;
            CalcDH(r,r2,r_i,a_cut,atomi.charge,intRadSJ,dhij,dij);
            CalcDH(r,r2,r_i,a_cut,intRadJ0,intRadIS,dhji,dji);

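            // dE/dr for this pair is dHdrPrefixI*dhij + dHdrPrefixJ*dhji;
            // the -1/r factor converts -dE/dr into the vector force
            // components applied along (dx,dy,dz) below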
            float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
            float tmpx = dx * forceAlpha;
            float tmpy = dy * forceAlpha;
            float tmpz = dz * forceAlpha;
            forceI.x += tmpx;
            forceI.y += tmpy;
            forceI.z += tmpz;
            sh_forceJ[j].x -= tmpx;
            sh_forceJ[j].y -= tmpy;
            sh_forceJ[j].z -= tmpz;
          } // cutoff
        } // if (j < nloopj...)
#ifdef KEPLER_SHUFFLE
        xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
        intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
#endif
      } // for t
      if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
        int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
        atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
        atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
        atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
      }
    } // for block j

    // write forces to global memory buffer; to be accumulated later
    if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
      int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
      atomicAdd(&tmp_forces[i_out].x, forceI.x);
      atomicAdd(&tmp_forces[i_out].y, forceI.y);
      atomicAdd(&tmp_forces[i_out].z, forceI.z);
    }
  } // for block i

  { // start of force sum
    // make sure forces are visible in global memory
    __threadfence();
    BLOCK_SYNC;

    // Mark patch pair (patch1_ind, patch2_ind) as "done"
    int patch1_ind = sh_patch_pair.patch1_ind;
    int patch2_ind = sh_patch_pair.patch2_ind;

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      sh_patch_pair.patch_done[0] = false;
      sh_patch_pair.patch_done[1] = false;

      unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
      int patch1_old = atomicInc(&P3_counters[patch1_ind], patch1_num_pairs-1);
      if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
      if (patch1_ind != patch2_ind) {
        unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
        int patch2_old = atomicInc(&P3_counters[patch2_ind], patch2_num_pairs-1);
        if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
      }
    }
    // sync threads so that patch_done[0] and patch_done[1] are visible to all threads
    BLOCK_SYNC;

    if (sh_patch_pair.patch_done[0]) {
      const int start = sh_patch_pair.patch1_start;
      for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
        forces[start+i] = tmp_forces[start+i];
      }
    }

    if (sh_patch_pair.patch_done[1]) {
      const int start = sh_patch_pair.patch2_start;
      for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
        forces[start+i] = tmp_forces[start+i];
      }
    }

    if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
      // Make sure page-locked host arrays are up-to-date
#if __CUDA_ARCH__ < 200
      __threadfence();
#else
      __threadfence_system();
#endif
    }

  } // end of force sum

} //GBIS_P3

