Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

ComputeGBISCUDAKernel.h

Go to the documentation of this file.
00001 #include <stdio.h>
00002 #define GBIS_CUDA
00003 #include "ComputeGBIS.inl"
00004 
00005 #define DORED 1
00006 
00007 /*******************************************************************************
00008   These GBIS kernel was patterned after Jim's nonbonded kernel
00009 *******************************************************************************/
00010 
00011 /*
00012   TODO
00013 
00014   using the namd force indices to sum psi and dEda require
00015   every atom to have a force, i.e., fixed atoms won't work
00016   consider using numFree/numFixed atoms to correct this
00017 */
00018 
00019 //00000000000000000000000000000000000000000000000000000000000
00020 //
00021 // GBIS Psi / dEda Reduction CUDA Kernal
00022 //
00023 //00000000000000000000000000000000000000000000000000000000000
00024 
00025 // Reduce Energy
00026 __device__ __forceinline__ static void GBIS_Energy_Reduction_Kernel (
00027   const int list_index,
00028   const force_list *lists,
00029   const float *sumBuffers,
00030   float *sum
00031 ) {
00032 //one thread block (128 threads) is doing this single patch
00033   int tx = threadIdx.x;
00034 
00035   #define myList fl.fl
00036   __shared__ union {
00037     force_list fl;
00038     unsigned int i[FORCE_LIST_SIZE];
00039   } fl;
00040 
00041   if ( tx < FORCE_LIST_USED ) {
00042     unsigned int tmp = ((unsigned int*)lists)[
00043                         FORCE_LIST_SIZE*list_index+tx];
00044     fl.i[tx] = tmp;
00045   }
00046   __syncthreads();
00047 
00048   __shared__ float energySh[BLOCK_SIZE];
00049 
00050   if (tx < myList.force_list_size) {
00051     energySh[tx] = sumBuffers[myList.virial_list_start/16 + tx];
00052   }
00053   __syncthreads();
00054 
00055   float nf = 1.f * myList.force_list_size; // exact number to sum
00056   float p2f = log2(nf);
00057   int p2i = ceil(p2f);
00058   int n2i = 1<<p2i; // next power of 2 over number to sum
00059   int b = n2i / 2; // first midpoint
00060   if (tx + b < myList.force_list_size) { // handle non power of 2
00061     energySh[tx] += energySh[tx+b];
00062   }
00063   __syncthreads();
00064   b /= 2;
00065 
00066   for ( ; b > 0; b /= 2) { // handle remaining powers of 2
00067     if (tx < b) {
00068       energySh[tx] += energySh[tx+b];
00069     }
00070     __syncthreads();
00071   }
00072 
00073     if (threadIdx.x == 0) {
00074       const int dest = myList.virial_output_start/16;
00075       sum[dest] = energySh[0];
00076     }
00077 } // GBIS_Energy_Reduction
00078 
00079 // Reduce Forces
00080 __device__ __forceinline__ static void GBIS_Force_Reduction_Kernel (
00081   const int list_index,
00082   const force_list *lists,
00083   const float4 *sumBuffers,
00084   float4 *sum
00085 ) {
00086   #define myList fl.fl
00087   __shared__ union {
00088     force_list fl;
00089     unsigned int i[FORCE_LIST_SIZE];
00090   } fl;
00091 
00092   if ( threadIdx.x < FORCE_LIST_USED ) {
00093     unsigned int tmp = ((unsigned int*)lists)[
00094                         FORCE_LIST_SIZE*list_index+threadIdx.x];
00095     fl.i[threadIdx.x] = tmp;
00096   }
00097   __syncthreads();
00098   for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00099     const float4 *sumBuf = sumBuffers + myList.force_list_start + j;
00100     float4 sumOut; sumOut.x = 0.f; sumOut.y = 0.f; sumOut.z = 0.f;
00101     for ( int i = 0; i < myList.force_list_size; ++i ) {
00102       float4 sumVal = *sumBuf;
00103       sumOut.x += sumVal.x;
00104       sumOut.y += sumVal.y;
00105       sumOut.z += sumVal.z;
00106       sumBuf += myList.patch_stride;
00107     } // i
00108     const int dest = myList.force_output_start + j;
00109     sum[dest].x += sumOut.x; // adding onto NAMD nonbonded, sync?
00110     sum[dest].y += sumOut.y;
00111     sum[dest].z += sumOut.z;
00112   } // j
00113 } // GBIS_Force_Reduction
00114 
00115 // Reduce psi / dEda
00116 __device__ __forceinline__ static void GBIS_Atom_Reduction_Kernel (
00117   const int list_index,
00118   const force_list *lists,
00119   const GBReal *sumBuffers,
00120   GBReal *sum
00121 ) {
00122   #define myList fl.fl
00123   __shared__ union {
00124     force_list fl;
00125     unsigned int i[FORCE_LIST_SIZE];
00126   } fl;
00127 
00128   if ( threadIdx.x < FORCE_LIST_USED ) {
00129     unsigned int tmp = ((unsigned int*)lists)[
00130                         FORCE_LIST_SIZE*list_index+threadIdx.x];
00131     fl.i[threadIdx.x] = tmp;
00132   }
00133   __syncthreads();
00134   for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00135     const GBReal *sumBuf = sumBuffers + myList.force_list_start + j;
00136     GBReal sumOut = 0.f;
00137     for ( int i = 0; i < myList.force_list_size; ++i ) {
00138       GBReal sumVal = *sumBuf;
00139       sumOut += sumVal;
00140       sumBuf += myList.patch_stride;
00141     } // i
00142     const int dest = myList.force_output_start + j;
00143     sum[dest] = sumOut;
00144   } // j
00145 } // GBIS_Atom_Reduction
00146 
00147 //1111111111111111111111111111111111111111111111111111111111
00148 //
00149 // GBIS Phase 1 CUDA Kernal
00150 //
00151 //1111111111111111111111111111111111111111111111111111111111
00152 __global__ static void GBIS_P1_Kernel (
00153   const patch_pair *patch_pairs, // atoms pointers and such
00154   const atom *atoms,             // position & charge
00155   const atom_param *atom_params,
00156   const float *intRad0,          // read in intrinsic radius
00157   const float *intRadS,          // read in intrinsic radius
00158   GBReal *psiSumBuffers,         // write out psi summation
00159   GBReal *psiSum,
00160   const float a_cut,             // P1 interaction cutoff
00161   const float rho_0,             // H(i,j) parameter
00162   float3 lata,
00163   float3 latb,
00164   float3 latc,
00165   const force_list *force_lists,
00166   unsigned int *P1_counters
00167 ) {
00168   int tx = threadIdx.x;
00169   int bx = blockIdx.x;
00170 #ifdef __DEVICE_EMULATION__
00171 //  printf("GBIS_P1: Blk(%3i) Thd(%3i)\n",bx,tx);
00172 #endif
00173 // one block per patch_pair
00174 // BLOCK_SIZE 128 threads per block
00175 // SHARED_SIZE 32
00176 // PATCH_PAIR_SIZE 16
00177 // PATCH_PAIR_USED 15
00178 
00179 //macro for patch pair
00180 #ifdef __DEVICE_EMULATION__
00181   #define myPatchPair (*(patch_pair*)(&pp.i))
00182 #else
00183   #define myPatchPair pp.pp
00184 #endif
00185   __shared__ union {
00186 #ifndef __DEVICE_EMULATION__
00187     patch_pair pp;
00188 #endif
00189     unsigned int i[PATCH_PAIR_SIZE];
00190   } pp;
00191   if ( tx < PATCH_PAIR_USED ) {
00192     unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00193     pp.i[tx] = tmp;
00194   } // ?
00195 
00196 
00197 //macro for list of j atoms
00198 #ifdef __DEVICE_EMULATION__
00199   #define jatoms ((atom*)(jpqu.i))
00200 #else
00201   #define jatoms jpqu.d
00202 #endif
00203   __shared__ union {
00204 #ifndef __DEVICE_EMULATION__
00205     atom d[SHARED_SIZE];
00206 #endif
00207     int i[4*SHARED_SIZE];
00208     float f[4*SHARED_SIZE];
00209   } jpqu;
00210 
00211   // convert scaled offset with current lattice
00212   if ( tx == 0 ) {
00213     float offx = myPatchPair.offset.x * lata.x
00214                + myPatchPair.offset.y * latb.x
00215                + myPatchPair.offset.z * latc.x;
00216     float offy = myPatchPair.offset.x * lata.y
00217                + myPatchPair.offset.y * latb.y
00218                + myPatchPair.offset.z * latc.y;
00219     float offz = myPatchPair.offset.x * lata.z
00220                + myPatchPair.offset.y * latb.z
00221                + myPatchPair.offset.z * latc.z;
00222     myPatchPair.offset.x = offx;
00223     myPatchPair.offset.y = offy;
00224     myPatchPair.offset.z = offz;
00225   }
00226   __syncthreads();
00227 
00228   int patch1size = myPatchPair.patch1_size;
00229   int patch2size = myPatchPair.patch2_size;
00230 
00231   //iterate over chunks of atoms within Patch 1
00232   for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00233 
00234     //this thread calculates only the force on atomi; iterating over js
00235     atom atomi;
00236     //int iindex;
00237     int i;
00238 
00239     // load BLOCK of Patch i atoms
00240     if ( blocki + tx < patch1size ) {
00241       i = myPatchPair.patch1_atom_start + blocki + tx;
00242       float4 tmpa = ((float4*)atoms)[i];
00243       atomi.position.x = tmpa.x + myPatchPair.offset.x;
00244       atomi.position.y = tmpa.y + myPatchPair.offset.y;
00245       atomi.position.z = tmpa.z + myPatchPair.offset.z;
00246       atomi.charge = intRad0[i]; // overwrite charge with radius
00247       //iindex =  ((uint4 *)(atom_params))[i].y; // store index
00248     } // load patch 1
00249 
00250     //init intermediate variables
00251     GBReal psiSumI = 0.f; // each thread accumulating single psi
00252 
00253     //iterate over chunks of atoms within Patch 2
00254     for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00255 
00256       int shared_size = patch2size - blockj;
00257       if ( shared_size > SHARED_SIZE )
00258         shared_size = SHARED_SIZE;
00259       __syncthreads();
00260 
00261       //load smaller chunk of j atoms into shared memory: coordinates
00262 
00263       int j = myPatchPair.patch2_atom_start + blockj;
00264       if ( tx < 4 * shared_size ) {
00265         jpqu.i[tx] = ((unsigned int *)(atoms       + j))[tx];
00266       }
00267       __syncthreads();
00268 
00269       //load smaller chunk of j atoms into shared memory: radii
00270       if ( tx < shared_size ) { // every 4th thread
00271         jatoms[tx].charge = intRadS[j+tx]; // load radius into charge float
00272         //printf("jatom2 = %7.3f\n", jatoms[tx].charge);
00273       }
00274       __syncthreads();
00275 
00276       if ( blocki + tx < patch1size ) {
00277         //each thread loop over shared atoms
00278         for ( int j = 0; j < shared_size; ++j ) {
00279           float dx = atomi.position.x - jatoms[j].position.x;
00280           float dy = atomi.position.y - jatoms[j].position.y;
00281           float dz = atomi.position.z - jatoms[j].position.z;
00282           float r2 = dx*dx + dy*dy + dz*dz;
00283 
00284           // within cutoff        different atoms
00285           if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00286             //int jj = myPatchPair.patch2_atom_start + blockj + j;
00287             //int jindex =  ((uint4 *)(atom_params))[jj].y; // store index
00288 
00289             // calculate H(i,j) [and not H(j,i)]
00290             float r_i = 1.f / sqrt(r2);
00291             float r  = r2 * r_i;
00292             float hij;
00293             int dij;
00294             CalcH(r,r2,r_i,a_cut,atomi.charge,jatoms[j].charge,hij,dij);
00295 
00296           psiSumI += hij;
00297 
00298           } // cutoff
00299         } // for j
00300       } // if thread in bounds
00301     } // for block j
00302     //psiSumI now contains contributions from all j in 2nd patch
00303 
00304     // write psiSum to global memory buffer; to be accumulated later
00305     if ( blocki + tx < myPatchPair.patch1_force_size) {
00306       int i_out = myPatchPair.patch1_force_start + blocki + tx;
00307       psiSumBuffers[i_out] = psiSumI;
00308     }
00309   } // for block i
00310 
00311  { // start of force sum
00312   // make sure psiSums are visible in global memory
00313   __threadfence();
00314   __syncthreads();
00315   __shared__ bool doSum;
00316 
00317   if (threadIdx.x == 0) {
00318     int fli = myPatchPair.patch1_force_list_index;
00319     int fls = myPatchPair.patch1_force_list_size;
00320     int old = atomicInc(P1_counters+fli,fls-1);
00321     doSum = ( old == fls - 1 );
00322   }
00323 
00324   __syncthreads();
00325 
00326   if ( doSum ) {
00327     GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00328        force_lists, psiSumBuffers, psiSum);
00329   }
00330  } // end of force sum
00331 } //GBIS_P1
00332 
00333 //2222222222222222222222222222222222222222222222222222222222
00334 //
00335 // GBIS Phase 2 CUDA Kernal
00336 //
00337 //2222222222222222222222222222222222222222222222222222222222
00338 __global__ static void GBIS_P2_Kernel (
00339   const patch_pair *patch_pairs,// atoms pointers and such
00340   const atom *atoms,            // position & charge
00341   const atom_param *atom_params,
00342   const float *bornRad,         // read in Born radius
00343   GBReal *dEdaSumBuffers,       // write out dEda summation
00344   GBReal *dEdaSum,
00345   const float a_cut,            // P1 interaction cutoff
00346   const float r_cut,            // P1 interaction cutoff
00347   const float scaling,          // scale nonbonded
00348   const float kappa,
00349   const float smoothDist,       // use interaction cutoff smoothing?
00350   const float epsilon_p,        // protein dielectric
00351   const float epsilon_s,        // solvent dielectric
00352   float3 lata,
00353   float3 latb,
00354   float3 latc,
00355   const int doEnergy,           // calculate energy too?
00356   const int doFullElec,         // calc dEdaSum for P3 full electrostatics
00357   const force_list *force_lists,
00358   float4 *forceBuffers,
00359   float4 *forces,
00360   float *energyBuffers,
00361   float *energy,
00362   unsigned int *P2_counters
00363 ) {
00364   int tx = threadIdx.x;
00365   int bx = blockIdx.x;
00366 #ifdef __DEVICE_EMULATION__
00367 //  printf("GBIS_P2: Blk(%3i) Thd(%3i)\n",bx,tx);
00368 #endif
00369 // one block per patch_pair
00370 // BLOCK_SIZE 128 threads per block
00371 // SHARED_SIZE 32
00372 // PATCH_PAIR_SIZE 16
00373 // PATCH_PAIR_USED 15
00374 
00375 //macro for patch pair
00376 #ifdef __DEVICE_EMULATION__
00377   #define myPatchPair (*(patch_pair*)(&pp.i))
00378 #else
00379   #define myPatchPair pp.pp
00380 #endif
00381   __shared__ union {
00382 #ifndef __DEVICE_EMULATION__
00383     patch_pair pp;
00384 #endif
00385     unsigned int i[PATCH_PAIR_SIZE];
00386   } pp;
00387   if ( tx < PATCH_PAIR_USED ) {
00388     unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00389     pp.i[tx] = tmp;
00390   } // ?
00391 
00392 
00393 //macro for list of j atoms
00394 #ifdef __DEVICE_EMULATION__
00395   #define jatoms ((atom*)(jpqu.i))
00396 #else
00397   #define jatoms jpqu.d
00398 #endif
00399   __shared__ union {
00400 #ifndef __DEVICE_EMULATION__
00401     atom d[SHARED_SIZE];
00402 #endif
00403     int i[4*SHARED_SIZE];
00404     float f[4*SHARED_SIZE];
00405   } jpqu;
00406 
00407   __shared__ float jBornRad[SHARED_SIZE];
00408   float energyT = 0.f; // total energy for this thread; to be reduced
00409 
00410   // convert scaled offset with current lattice
00411   if ( tx == 0 ) {
00412     float offx = myPatchPair.offset.x * lata.x
00413                + myPatchPair.offset.y * latb.x
00414                + myPatchPair.offset.z * latc.x;
00415     float offy = myPatchPair.offset.x * lata.y
00416                + myPatchPair.offset.y * latb.y
00417                + myPatchPair.offset.z * latc.y;
00418     float offz = myPatchPair.offset.x * lata.z
00419                + myPatchPair.offset.y * latb.z
00420                + myPatchPair.offset.z * latc.z;
00421     myPatchPair.offset.x = offx;
00422     myPatchPair.offset.y = offy;
00423     myPatchPair.offset.z = offz;
00424   }
00425   __syncthreads();
00426 
00427   int patch1size = myPatchPair.patch1_size;
00428   int patch2size = myPatchPair.patch2_size;
00429 
00430   //values used in loop
00431   float r_cut2 = r_cut*r_cut;
00432   float r_cut_2 = 1.f / r_cut2;
00433   float r_cut_4 = 4.f*r_cut_2*r_cut_2;
00434   float epsilon_s_i = 1.f / epsilon_s;
00435   float epsilon_p_i = 1.f / epsilon_p;
00436 
00437   //iterate over chunks of atoms within Patch 1
00438   for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00439 
00440     //this thread calculates only the force on atomi; iterating over js
00441     atom atomi;
00442     float bornRadI;
00443     //int iindex;
00444     int i;
00445 
00446     // load BLOCK of Patch i atoms
00447     if ( blocki + tx < patch1size ) {
00448       i = myPatchPair.patch1_atom_start + blocki + tx;
00449       float4 tmpa = ((float4*)atoms)[i];
00450       atomi.position.x = tmpa.x + myPatchPair.offset.x;
00451       atomi.position.y = tmpa.y + myPatchPair.offset.y;
00452       atomi.position.z = tmpa.z + myPatchPair.offset.z;
00453       atomi.charge = - tmpa.w * scaling;
00454       bornRadI = bornRad[i];
00455       //iindex =  ((uint4 *)(atom_params))[i].y; // store index
00456     } // load patch 1
00457 
00458     //init intermediate variables
00459     GBReal dEdaSumI = 0.f; // each thread accumulating single psi
00460     float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00461 
00462     //iterate over chunks of atoms within Patch 2
00463     for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00464 
00465       int shared_size = patch2size - blockj;
00466       if ( shared_size > SHARED_SIZE )
00467         shared_size = SHARED_SIZE;
00468       __syncthreads();
00469 
00470       //load smaller chunk of j atoms into shared memory: coordinates
00471       int j = myPatchPair.patch2_atom_start + blockj;
00472       if ( tx < 4 * shared_size ) {
00473         jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00474       }
00475       __syncthreads();
00476 
00477       //load smaller chunk of j atoms into shared memory: radii
00478       if ( tx < shared_size ) {
00479         jBornRad[tx] = bornRad[j+tx]; // load Born radius into shared
00480         //printf("BornRadius = %7.3f\n", jBornRadii[tx]);
00481       }
00482       __syncthreads();
00483 
00484       if ( blocki + tx < patch1size ) {
00485         //each thread loop over shared atoms
00486         for ( int j = 0; j < shared_size; ++j ) {
00487           float dx = atomi.position.x - jatoms[j].position.x;
00488           float dy = atomi.position.y - jatoms[j].position.y;
00489           float dz = atomi.position.z - jatoms[j].position.z;
00490           float r2 = dx*dx + dy*dy + dz*dz;
00491 
00492           // within cutoff different atoms
00493           if (r2 < r_cut*r_cut && r2 > 0.01f) {
00494             //int jj = myPatchPair.patch2_atom_start + blockj + j;
00495             //int jindex =  ((uint4 *)(atom_params))[jj].y; // store index
00496 
00497             float r_i = 1.f / sqrt(r2);
00498             float r  = r2 * r_i;
00499             float bornRadJ = jBornRad[j];
00500 
00501             //calculate GB energy
00502             float qiqj = atomi.charge*jatoms[j].charge;
00503             float aiaj = bornRadI*bornRadJ;
00504             float aiaj4 = 4*aiaj;
00505             float expr2aiaj4 = exp(-r2/aiaj4);
00506             float fij = sqrt(r2+aiaj*expr2aiaj4);
00507             float f_i = 1/fij;
00508             float expkappa = exp(-kappa*fij);
00509             float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00510             float gbEij = qiqj*Dij*f_i;
00511 
00512             //calculate energy derivatives
00513             float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
00514             float ddrf_i = -ddrfij*f_i*f_i;
00515             float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
00516             float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
00517 
00518             //NAMD smoothing function
00519             float scale = 1.f;
00520             float ddrScale = 0.f;
00521             float forcedEdr;
00522             if (smoothDist > 0.f) {
00523               scale = r2 * r_cut_2 - 1.f;
00524               scale *= scale;
00525               ddrScale = r*(r2-r_cut2)*r_cut_4;
00526               energyT += gbEij * scale * 0.5f;
00527               //printf("PAIRENERGY = %15.7e\n", gbEij*scale*0.5);
00528               forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
00529             } else {
00530               energyT += gbEij * 0.5f;
00531               //printf("PAIRENERGY = %15.7e\n", gbEij*0.5);
00532               forcedEdr = -ddrGbEij;
00533             }
00534 
00535             //add dEda
00536             float dEdai = 0.f;
00537             if (doFullElec) {
00538               //gbisParams->dEdaSum[0][i] += dEdai*scale;
00539               dEdai = 0.5f*qiqj*f_i*f_i
00540                       *(kappa*epsilon_s_i*expkappa-Dij*f_i)
00541                       *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;//0
00542               dEdaSumI += dEdai;
00543             }
00544 
00545             forcedEdr *= r_i;
00546             forceI.x += dx*forcedEdr;
00547             forceI.y += dy*forcedEdr;
00548             forceI.z += dz*forcedEdr;
00549 
00550 
00551           } // within cutoff
00552           if (r2 < 0.01f) {
00553             // GB Self Energy
00554             if (doEnergy) {
00555               float fij = bornRadI;//inf
00556               float expkappa = exp(-kappa*fij);//0
00557               float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00558               float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
00559               energyT += 0.5f*gbEij;
00560             }
00561           } //same atom or within cutoff
00562         } // for j
00563       } // if thread in bounds
00564     } // for block j
00565     //psiSumI now contains contributions from all j in 2nd patch
00566 
00567     // write psiSum to global memory buffer; to be accumulated later
00568     if ( blocki + tx < myPatchPair.patch1_force_size) {
00569       int i_out = myPatchPair.patch1_force_start + blocki + tx;
00570       dEdaSumBuffers[i_out] = dEdaSumI;
00571       forceBuffers[i_out] = forceI;
00572     }
00573   } // for block i
00574   //printf("energyT[%03i,%03i] = %15.7e\n", blockIdx.x, tx, energyT);
00575 
00576 //Energy Reduction
00577   if (doEnergy) {
00578     const int i_out = myPatchPair.virial_start / 16;
00579     __shared__ float energySh[BLOCK_SIZE];
00580     energySh[tx] = energyT;
00581     __syncthreads();
00582 
00583     for ( unsigned int b = blockDim.x/2; b > 0 ; b >>= 1 ) {
00584       if ( tx < b ) {
00585         //printf("RED eSh[%i] += eSh[%i]\n", tx, tx+b);
00586         energySh[tx] += energySh[tx+b];
00587       }
00588       __syncthreads();
00589     }
00590     //const int i_out = myPatchPair.virial_start + tx;
00591     if (tx == 0) {
00592       energyBuffers[i_out] = energySh[0]; // TODO
00593       //printf("eB[%i] = %15.7e\n", i_out, energySh[0]);
00594     }
00595   } //end Energy Reduction
00596 
00597  { // start of reduction
00598   // make sure psiSums are visible in global memory
00599   __threadfence();
00600   __syncthreads();
00601 
00602   __shared__ bool doSum;
00603 
00604   if (threadIdx.x == 0) {
00605     int fli = myPatchPair.patch1_force_list_index;
00606     int fls = myPatchPair.patch1_force_list_size;
00607     int old = atomicInc(P2_counters+fli,fls-1);
00608     doSum = ( old == fls - 1 );
00609   }
00610 
00611   __syncthreads();
00612 
00613   if ( doSum ) {
00614     GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00615        force_lists, dEdaSumBuffers, dEdaSum);
00616     GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00617        force_lists, forceBuffers, forces);
00618     if ( doEnergy ) {
00619       GBIS_Energy_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00620         force_lists, energyBuffers, energy);
00621     }
00622   }
00623  } // end of sum
00624 } //GBIS_P2
00625 
00626 
00627 //3333333333333333333333333333333333333333333333333333333333
00628 //
00629 // GBIS Phase 3 CUDA Kernal
00630 //
00631 //3333333333333333333333333333333333333333333333333333333333
00632 __global__ static void GBIS_P3_Kernel (
00633   const patch_pair *patch_pairs,  // atoms pointers and such
00634   const atom *atoms,              // position & charge
00635   const atom_param *atom_params,
00636   const float *intRad0,           // read in intrinsic radius
00637   const float *intRadS,           // read in intrinsic radius
00638   const float *dHdrPrefix,        // read in prefix
00639   const float a_cut,              // P1 interaction cutoff
00640   const float rho_0,              // H(i,j) parameter
00641   const float scaling,            // scale nonbonded
00642   float3 lata,
00643   float3 latb,
00644   float3 latc,
00645   const force_list *force_lists,
00646   float4 *forceBuffers,
00647   float4 *forces,
00648   unsigned int *P3_counters
00649 ) {
00650   int tx = threadIdx.x;
00651   int bx = blockIdx.x;
00652 #ifdef __DEVICE_EMULATION__
00653 //  printf("GBIS_P3: Blk(%3i) Thd(%3i)\n",bx,tx);
00654 #endif
00655 
00656 #ifdef __DEVICE_EMULATION__
00657   #define myPatchPair (*(patch_pair*)(&pp.i))
00658 #else
00659   #define myPatchPair pp.pp
00660 #endif
00661   __shared__ union {
00662 #ifndef __DEVICE_EMULATION__
00663     patch_pair pp;
00664 #endif
00665     unsigned int i[PATCH_PAIR_SIZE];
00666   } pp;
00667   if ( tx < PATCH_PAIR_USED ) {
00668     unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00669     pp.i[tx] = tmp;
00670   } // all threads load single shared patch pair
00671 
00672 //macro for list of j atoms
00673 #ifdef __DEVICE_EMULATION__
00674   #define jatoms ((atom*)(jpqu.i))
00675 #else
00676   #define jatoms jpqu.d
00677 #endif
00678   __shared__ union {
00679 #ifndef __DEVICE_EMULATION__
00680     atom d[SHARED_SIZE];
00681 #endif
00682     int i[4*SHARED_SIZE];
00683     float f[4*SHARED_SIZE];
00684   } jpqu;
00685 
00686   __shared__ float jDHdrPrefix[SHARED_SIZE];
00687   __shared__ float intRadJ0[SHARED_SIZE];
00688 
00689   // convert scaled offset with current lattice
00690   if ( tx == 0 ) {
00691     float offx = myPatchPair.offset.x * lata.x
00692                + myPatchPair.offset.y * latb.x
00693                + myPatchPair.offset.z * latc.x;
00694     float offy = myPatchPair.offset.x * lata.y
00695                + myPatchPair.offset.y * latb.y
00696                + myPatchPair.offset.z * latc.y;
00697     float offz = myPatchPair.offset.x * lata.z
00698                + myPatchPair.offset.y * latb.z
00699                + myPatchPair.offset.z * latc.z;
00700     myPatchPair.offset.x = offx;
00701     myPatchPair.offset.y = offy;
00702     myPatchPair.offset.z = offz;
00703   }
00704   __syncthreads();
00705 
00706   //iterate over chunks of atoms within Patch 1
00707   for ( int blocki = 0; blocki < myPatchPair.patch1_size; blocki += BLOCK_SIZE ) {
00708     //this thread calculates only the force on atomi; iterating over js
00709     atom atomi;
00710     float intRadIS;
00711     //int iindex;
00712     int i;
00713     float dHdrPrefixI;
00714 
00715     // load BLOCK of Patch i atoms
00716     if ( blocki + tx < myPatchPair.patch1_size ) {
00717       i = myPatchPair.patch1_atom_start + blocki + tx;
00718       float4 tmpa = ((float4*)atoms)[i];
00719       atomi.position.x = tmpa.x + myPatchPair.offset.x;
00720       atomi.position.y = tmpa.y + myPatchPair.offset.y;
00721       atomi.position.z = tmpa.z + myPatchPair.offset.z;
00722       atomi.charge = intRad0[i]; // overwrite charge with radius
00723       intRadIS = intRadS[i];
00724       //iindex =  ((uint4 *)(atom_params))[i].y; // store index
00725       dHdrPrefixI = dHdrPrefix[i];
00726     } // load patch 1
00727 
00728     //init intermediate variables
00729     float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00730 
00731     //iterate over chunks of atoms within Patch 2
00732     for ( int blockj = 0; blockj < myPatchPair.patch2_size; blockj += SHARED_SIZE ) {
00733 
00734       int shared_size = myPatchPair.patch2_size - blockj;
00735       if ( shared_size > SHARED_SIZE )
00736         shared_size = SHARED_SIZE;
00737       __syncthreads();
00738 
00739       //load smaller chunk of j atoms into shared memory: coordinates
00740 
00741       int j = myPatchPair.patch2_atom_start + blockj;
00742       if ( tx < 4 * shared_size ) {
00743         jpqu.i[tx] = ((unsigned int *)(atoms       + j))[tx];
00744       }
00745       __syncthreads();
00746 
00747       //load smaller chunk of j atoms into shared memory: radii
00748       if ( tx < shared_size ) { // every 4th thread
00749         jatoms[tx].charge = intRadS[j+tx]; // load radius into charge float
00750         jDHdrPrefix[tx] = dHdrPrefix[j+tx]; // load dHdrPrefix into shared
00751         intRadJ0[tx] = intRad0[j+tx];
00752       }
00753       __syncthreads();
00754 
00755       if ( blocki + tx < myPatchPair.patch1_size ) {
00756         //each thread loop over shared atoms
00757         for ( int j = 0; j < shared_size; ++j ) {
00758           float dx = atomi.position.x - jatoms[j].position.x;
00759           float dy = atomi.position.y - jatoms[j].position.y;
00760           float dz = atomi.position.z - jatoms[j].position.z;
00761           float r2 = dx*dx + dy*dy + dz*dz;
00762 
00763           // within cutoff        different atoms
00764           if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00765             //int jj = myPatchPair.patch2_atom_start + blockj + j;
00766             //int jindex =  ((uint4 *)(atom_params))[jj].y; // store index
00767 
00768             float r_i = 1.f / sqrt(r2);
00769             float r  = r2 * r_i;
00770             float dHdrPrefixJ = jDHdrPrefix[j];
00771             float dhij, dhji;
00772             int dij, dji;
00773             CalcDH(r,r2,r_i,a_cut,atomi.charge,
00774                     jatoms[j].charge,dhij,dij);
00775             CalcDH(r,r2,r_i,a_cut,intRadJ0[j],
00776                     intRadIS,dhji,dji);
00777 
00778             float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
00779             forceI.x += dx * forceAlpha;
00780             forceI.y += dy * forceAlpha;
00781             forceI.z += dz * forceAlpha;
00782           } // cutoff
00783         } // for j
00784       } // if thread in bounds
00785     } // for block j
00786 
00787     // write psiSum to global memory buffer; to be accumulated later
00788     if ( blocki + tx < myPatchPair.patch1_force_size) {
00789       int i_out = myPatchPair.patch1_force_start + blocki + tx;
00790       forceBuffers[i_out] = forceI;
00791     }
00792   } // for block i
00793 
00794  { // start of force sum
00795   // make sure forces are visible in global memory
00796   __threadfence();
00797   __syncthreads();
00798   __shared__ bool doSum;
00799 
00800   if (threadIdx.x == 0) {
00801     int fli = myPatchPair.patch1_force_list_index;
00802     int fls = myPatchPair.patch1_force_list_size;
00803     int old = atomicInc(P3_counters+fli,fls-1);
00804     doSum = ( old == fls - 1 );
00805   }
00806   __syncthreads();
00807 
00808   if ( doSum ) {
00809     GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00810        force_lists, forceBuffers, forces);
00811   }
00812  } // end of force sum
00813 } //GBIS_P3

Generated on Tue Jun 18 04:07:45 2013 for NAMD by  doxygen 1.3.9.1