Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

ComputeGBISCUDAKernel.h File Reference

#include <stdio.h>
#include "ComputeGBIS.inl"

Go to the source code of this file.

Defines

#define GBIS_CUDA
#define DORED   1
#define myList   fl.fl
#define myList   fl.fl
#define myList   fl.fl
#define myPatchPair   pp.pp
#define jatoms   jpqu.d
#define myPatchPair   pp.pp
#define jatoms   jpqu.d
#define myPatchPair   pp.pp
#define jatoms   jpqu.d

Functions

__device__ __forceinline__
void 
GBIS_Energy_Reduction_Kernel (const int list_index, const force_list *lists, const float *sumBuffers, float *sum)
__device__ __forceinline__
void 
GBIS_Force_Reduction_Kernel (const int list_index, const force_list *lists, const float4 *sumBuffers, float4 *sum)
__device__ __forceinline__
void 
GBIS_Atom_Reduction_Kernel (const int list_index, const force_list *lists, const GBReal *sumBuffers, GBReal *sum)
__global__ void GBIS_P1_Kernel (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const float *intRad0, const float *intRadS, GBReal *psiSumBuffers, GBReal *psiSum, const float a_cut, const float rho_0, float3 lata, float3 latb, float3 latc, const force_list *force_lists, unsigned int *P1_counters)
__global__ void GBIS_P2_Kernel (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const float *bornRad, GBReal *dEdaSumBuffers, GBReal *dEdaSum, const float a_cut, const float r_cut, const float scaling, const float kappa, const float smoothDist, const float epsilon_p, const float epsilon_s, float3 lata, float3 latb, float3 latc, const int doEnergy, const int doFullElec, const force_list *force_lists, float4 *forceBuffers, float4 *forces, float *energyBuffers, float *energy, unsigned int *P2_counters)
__global__ void GBIS_P3_Kernel (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const float *intRad0, const float *intRadS, const float *dHdrPrefix, const float a_cut, const float rho_0, const float scaling, float3 lata, float3 latb, float3 latc, const force_list *force_lists, float4 *forceBuffers, float4 *forces, unsigned int *P3_counters)


Define Documentation

#define DORED   1
 

Definition at line 5 of file ComputeGBISCUDAKernel.h.

#define GBIS_CUDA
 

Definition at line 2 of file ComputeGBISCUDAKernel.h.

#define jatoms   jpqu.d
 

#define jatoms   jpqu.d
 

#define jatoms   jpqu.d
 

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel().

#define myList   fl.fl
 

#define myList   fl.fl
 

#define myList   fl.fl
 

Referenced by GBIS_Atom_Reduction_Kernel(), GBIS_Energy_Reduction_Kernel(), and GBIS_Force_Reduction_Kernel().

#define myPatchPair   pp.pp
 

#define myPatchPair   pp.pp
 

#define myPatchPair   pp.pp
 

Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel().


Function Documentation

__device__ __forceinline__ void GBIS_Atom_Reduction_Kernel const int  list_index,
const force_list *  lists,
const GBReal sumBuffers,
GBReal sum
[static]
 

Definition at line 116 of file ComputeGBISCUDAKernel.h.

References FORCE_LIST_SIZE, GBReal, j, and myList.

Referenced by GBIS_P1_Kernel(), and GBIS_P2_Kernel().

00121   {
00122   #define myList fl.fl
00123   __shared__ union {
00124     force_list fl;
00125     unsigned int i[FORCE_LIST_SIZE];
00126   } fl;
00127 
00128   if ( threadIdx.x < FORCE_LIST_USED ) {
00129     unsigned int tmp = ((unsigned int*)lists)[
00130                         FORCE_LIST_SIZE*list_index+threadIdx.x];
00131     fl.i[threadIdx.x] = tmp;
00132   }
00133   __syncthreads();
00134   for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00135     const GBReal *sumBuf = sumBuffers + myList.force_list_start + j;
00136     GBReal sumOut = 0.f;
00137     for ( int i = 0; i < myList.force_list_size; ++i ) {
00138       GBReal sumVal = *sumBuf;
00139       sumOut += sumVal;
00140       sumBuf += myList.patch_stride;
00141     } // i
00142     const int dest = myList.force_output_start + j;
00143     sum[dest] = sumOut;
00144   } // j
00145 } // GBIS_Atom_Reduction

__device__ __forceinline__ void GBIS_Energy_Reduction_Kernel const int  list_index,
const force_list *  lists,
const float *  sumBuffers,
float *  sum
[static]
 

Definition at line 26 of file ComputeGBISCUDAKernel.h.

References FORCE_LIST_SIZE, and myList.

Referenced by GBIS_P2_Kernel().

00031   {
00032 //one thread block (128 threads) is doing this single patch
00033   int tx = threadIdx.x;
00034 
00035   #define myList fl.fl
00036   __shared__ union {
00037     force_list fl;
00038     unsigned int i[FORCE_LIST_SIZE];
00039   } fl;
00040 
00041   if ( tx < FORCE_LIST_USED ) {
00042     unsigned int tmp = ((unsigned int*)lists)[
00043                         FORCE_LIST_SIZE*list_index+tx];
00044     fl.i[tx] = tmp;
00045   }
00046   __syncthreads();
00047 
00048   __shared__ float energySh[BLOCK_SIZE];
00049 
00050   if (tx < myList.force_list_size) {
00051     energySh[tx] = sumBuffers[myList.virial_list_start/16 + tx];
00052   }
00053   __syncthreads();
00054 
00055   float nf = 1.f * myList.force_list_size; // exact number to sum
00056   float p2f = log2(nf);
00057   int p2i = ceil(p2f);
00058   int n2i = 1<<p2i; // next power of 2 over number to sum
00059   int b = n2i / 2; // first midpoint
00060   if (tx + b < myList.force_list_size) { // handle non power of 2
00061     energySh[tx] += energySh[tx+b];
00062   }
00063   __syncthreads();
00064   b /= 2;
00065 
00066   for ( ; b > 0; b /= 2) { // handle remaining powers of 2
00067     if (tx < b) {
00068       energySh[tx] += energySh[tx+b];
00069     }
00070     __syncthreads();
00071   }
00072 
00073     if (threadIdx.x == 0) {
00074       const int dest = myList.virial_output_start/16;
00075       sum[dest] = energySh[0];
00076     }
00077 } // GBIS_Energy_Reduction

__device__ __forceinline__ void GBIS_Force_Reduction_Kernel const int  list_index,
const force_list *  lists,
const float4 *  sumBuffers,
float4 *  sum
[static]
 

Definition at line 80 of file ComputeGBISCUDAKernel.h.

References FORCE_LIST_SIZE, j, and myList.

Referenced by GBIS_P2_Kernel(), and GBIS_P3_Kernel().

00085   {
00086   #define myList fl.fl
00087   __shared__ union {
00088     force_list fl;
00089     unsigned int i[FORCE_LIST_SIZE];
00090   } fl;
00091 
00092   if ( threadIdx.x < FORCE_LIST_USED ) {
00093     unsigned int tmp = ((unsigned int*)lists)[
00094                         FORCE_LIST_SIZE*list_index+threadIdx.x];
00095     fl.i[threadIdx.x] = tmp;
00096   }
00097   __syncthreads();
00098   for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00099     const float4 *sumBuf = sumBuffers + myList.force_list_start + j;
00100     float4 sumOut; sumOut.x = 0.f; sumOut.y = 0.f; sumOut.z = 0.f;
00101     for ( int i = 0; i < myList.force_list_size; ++i ) {
00102       float4 sumVal = *sumBuf;
00103       sumOut.x += sumVal.x;
00104       sumOut.y += sumVal.y;
00105       sumOut.z += sumVal.z;
00106       sumBuf += myList.patch_stride;
00107     } // i
00108     const int dest = myList.force_output_start + j;
00109     sum[dest].x += sumOut.x; // adding onto NAMD nonbonded, sync?
00110     sum[dest].y += sumOut.y;
00111     sum[dest].z += sumOut.z;
00112   } // j
00113 } // GBIS_Force_Reduction

__global__ void GBIS_P1_Kernel const patch_pair *  patch_pairs,
const atom *  atoms,
const atom_param *  atom_params,
const float *  intRad0,
const float *  intRadS,
GBReal psiSumBuffers,
GBReal psiSum,
const float  a_cut,
const float  rho_0,
float3  lata,
float3  latb,
float3  latc,
const force_list *  force_lists,
unsigned int *  P1_counters
[static]
 

Definition at line 152 of file ComputeGBISCUDAKernel.h.

References CalcH(), GBIS_Atom_Reduction_Kernel(), GBReal, j, jatoms, myPatchPair, and PATCH_PAIR_SIZE.

00167   {
00168   int tx = threadIdx.x;
00169   int bx = blockIdx.x;
00170 #ifdef __DEVICE_EMULATION__
00171 //  printf("GBIS_P1: Blk(%3i) Thd(%3i)\n",bx,tx);
00172 #endif
00173 // one block per patch_pair
00174 // BLOCK_SIZE 128 threads per block
00175 // SHARED_SIZE 32
00176 // PATCH_PAIR_SIZE 16
00177 // PATCH_PAIR_USED 15
00178 
00179 //macro for patch pair
00180 #ifdef __DEVICE_EMULATION__
00181   #define myPatchPair (*(patch_pair*)(&pp.i))
00182 #else
00183   #define myPatchPair pp.pp
00184 #endif
00185   __shared__ union {
00186 #ifndef __DEVICE_EMULATION__
00187     patch_pair pp;
00188 #endif
00189     unsigned int i[PATCH_PAIR_SIZE];
00190   } pp;
00191   if ( tx < PATCH_PAIR_USED ) {
00192     unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00193     pp.i[tx] = tmp;
00194   } // ?
00195 
00196 
00197 //macro for list of j atoms
00198 #ifdef __DEVICE_EMULATION__
00199   #define jatoms ((atom*)(jpqu.i))
00200 #else
00201   #define jatoms jpqu.d
00202 #endif
00203   __shared__ union {
00204 #ifndef __DEVICE_EMULATION__
00205     atom d[SHARED_SIZE];
00206 #endif
00207     int i[4*SHARED_SIZE];
00208     float f[4*SHARED_SIZE];
00209   } jpqu;
00210 
00211   // convert scaled offset with current lattice
00212   if ( tx == 0 ) {
00213     float offx = myPatchPair.offset.x * lata.x
00214                + myPatchPair.offset.y * latb.x
00215                + myPatchPair.offset.z * latc.x;
00216     float offy = myPatchPair.offset.x * lata.y
00217                + myPatchPair.offset.y * latb.y
00218                + myPatchPair.offset.z * latc.y;
00219     float offz = myPatchPair.offset.x * lata.z
00220                + myPatchPair.offset.y * latb.z
00221                + myPatchPair.offset.z * latc.z;
00222     myPatchPair.offset.x = offx;
00223     myPatchPair.offset.y = offy;
00224     myPatchPair.offset.z = offz;
00225   }
00226   __syncthreads();
00227 
00228   int patch1size = myPatchPair.patch1_size;
00229   int patch2size = myPatchPair.patch2_size;
00230 
00231   //iterate over chunks of atoms within Patch 1
00232   for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00233 
00234     //this thread calculates only the force on atomi; iterating over js
00235     atom atomi;
00236     //int iindex;
00237     int i;
00238 
00239     // load BLOCK of Patch i atoms
00240     if ( blocki + tx < patch1size ) {
00241       i = myPatchPair.patch1_atom_start + blocki + tx;
00242       float4 tmpa = ((float4*)atoms)[i];
00243       atomi.position.x = tmpa.x + myPatchPair.offset.x;
00244       atomi.position.y = tmpa.y + myPatchPair.offset.y;
00245       atomi.position.z = tmpa.z + myPatchPair.offset.z;
00246       atomi.charge = intRad0[i]; // overwrite charge with radius
00247       //iindex =  ((uint4 *)(atom_params))[i].y; // store index
00248     } // load patch 1
00249 
00250     //init intermediate variables
00251     GBReal psiSumI = 0.f; // each thread accumulating single psi
00252 
00253     //iterate over chunks of atoms within Patch 2
00254     for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00255 
00256       int shared_size = patch2size - blockj;
00257       if ( shared_size > SHARED_SIZE )
00258         shared_size = SHARED_SIZE;
00259       __syncthreads();
00260 
00261       //load smaller chunk of j atoms into shared memory: coordinates
00262 
00263       int j = myPatchPair.patch2_atom_start + blockj;
00264       if ( tx < 4 * shared_size ) {
00265         jpqu.i[tx] = ((unsigned int *)(atoms       + j))[tx];
00266       }
00267       __syncthreads();
00268 
00269       //load smaller chunk of j atoms into shared memory: radii
00270       if ( tx < shared_size ) { // every 4th thread
00271         jatoms[tx].charge = intRadS[j+tx]; // load radius into charge float
00272         //printf("jatom2 = %7.3f\n", jatoms[tx].charge);
00273       }
00274       __syncthreads();
00275 
00276       if ( blocki + tx < patch1size ) {
00277         //each thread loop over shared atoms
00278         for ( int j = 0; j < shared_size; ++j ) {
00279           float dx = atomi.position.x - jatoms[j].position.x;
00280           float dy = atomi.position.y - jatoms[j].position.y;
00281           float dz = atomi.position.z - jatoms[j].position.z;
00282           float r2 = dx*dx + dy*dy + dz*dz;
00283 
00284           // within cutoff        different atoms
00285           if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00286             //int jj = myPatchPair.patch2_atom_start + blockj + j;
00287             //int jindex =  ((uint4 *)(atom_params))[jj].y; // store index
00288 
00289             // calculate H(i,j) [and not H(j,i)]
00290             float r_i = 1.f / sqrt(r2);
00291             float r  = r2 * r_i;
00292             float hij;
00293             int dij;
00294             CalcH(r,r2,r_i,a_cut,atomi.charge,jatoms[j].charge,hij,dij);
00295 
00296           psiSumI += hij;
00297 
00298           } // cutoff
00299         } // for j
00300       } // if thread in bounds
00301     } // for block j
00302     //psiSumI now contains contributions from all j in 2nd patch
00303 
00304     // write psiSum to global memory buffer; to be accumulated later
00305     if ( blocki + tx < myPatchPair.patch1_force_size) {
00306       int i_out = myPatchPair.patch1_force_start + blocki + tx;
00307       psiSumBuffers[i_out] = psiSumI;
00308     }
00309   } // for block i
00310 
00311  { // start of force sum
00312   // make sure psiSums are visible in global memory
00313   __threadfence();
00314   __syncthreads();
00315   __shared__ bool doSum;
00316 
00317   if (threadIdx.x == 0) {
00318     int fli = myPatchPair.patch1_force_list_index;
00319     int fls = myPatchPair.patch1_force_list_size;
00320     int old = atomicInc(P1_counters+fli,fls-1);
00321     doSum = ( old == fls - 1 );
00322   }
00323 
00324   __syncthreads();
00325 
00326   if ( doSum ) {
00327     GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00328        force_lists, psiSumBuffers, psiSum);
00329   }
00330  } // end of force sum
00331 } //GBIS_P1

__global__ void GBIS_P2_Kernel const patch_pair *  patch_pairs,
const atom *  atoms,
const atom_param *  atom_params,
const float *  bornRad,
GBReal dEdaSumBuffers,
GBReal dEdaSum,
const float  a_cut,
const float  r_cut,
const float  scaling,
const float  kappa,
const float  smoothDist,
const float  epsilon_p,
const float  epsilon_s,
float3  lata,
float3  latb,
float3  latc,
const int  doEnergy,
const int  doFullElec,
const force_list *  force_lists,
float4 *  forceBuffers,
float4 *  forces,
float *  energyBuffers,
float *  energy,
unsigned int *  P2_counters
[static]
 

Definition at line 338 of file ComputeGBISCUDAKernel.h.

References GBIS_Atom_Reduction_Kernel(), GBIS_Energy_Reduction_Kernel(), GBIS_Force_Reduction_Kernel(), GBReal, j, jatoms, myPatchPair, and PATCH_PAIR_SIZE.

00363   {
00364   int tx = threadIdx.x;
00365   int bx = blockIdx.x;
00366 #ifdef __DEVICE_EMULATION__
00367 //  printf("GBIS_P2: Blk(%3i) Thd(%3i)\n",bx,tx);
00368 #endif
00369 // one block per patch_pair
00370 // BLOCK_SIZE 128 threads per block
00371 // SHARED_SIZE 32
00372 // PATCH_PAIR_SIZE 16
00373 // PATCH_PAIR_USED 15
00374 
00375 //macro for patch pair
00376 #ifdef __DEVICE_EMULATION__
00377   #define myPatchPair (*(patch_pair*)(&pp.i))
00378 #else
00379   #define myPatchPair pp.pp
00380 #endif
00381   __shared__ union {
00382 #ifndef __DEVICE_EMULATION__
00383     patch_pair pp;
00384 #endif
00385     unsigned int i[PATCH_PAIR_SIZE];
00386   } pp;
00387   if ( tx < PATCH_PAIR_USED ) {
00388     unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00389     pp.i[tx] = tmp;
00390   } // ?
00391 
00392 
00393 //macro for list of j atoms
00394 #ifdef __DEVICE_EMULATION__
00395   #define jatoms ((atom*)(jpqu.i))
00396 #else
00397   #define jatoms jpqu.d
00398 #endif
00399   __shared__ union {
00400 #ifndef __DEVICE_EMULATION__
00401     atom d[SHARED_SIZE];
00402 #endif
00403     int i[4*SHARED_SIZE];
00404     float f[4*SHARED_SIZE];
00405   } jpqu;
00406 
00407   __shared__ float jBornRad[SHARED_SIZE];
00408   float energyT = 0.f; // total energy for this thread; to be reduced
00409 
00410   // convert scaled offset with current lattice
00411   if ( tx == 0 ) {
00412     float offx = myPatchPair.offset.x * lata.x
00413                + myPatchPair.offset.y * latb.x
00414                + myPatchPair.offset.z * latc.x;
00415     float offy = myPatchPair.offset.x * lata.y
00416                + myPatchPair.offset.y * latb.y
00417                + myPatchPair.offset.z * latc.y;
00418     float offz = myPatchPair.offset.x * lata.z
00419                + myPatchPair.offset.y * latb.z
00420                + myPatchPair.offset.z * latc.z;
00421     myPatchPair.offset.x = offx;
00422     myPatchPair.offset.y = offy;
00423     myPatchPair.offset.z = offz;
00424   }
00425   __syncthreads();
00426 
00427   int patch1size = myPatchPair.patch1_size;
00428   int patch2size = myPatchPair.patch2_size;
00429 
00430   //values used in loop
00431   float r_cut2 = r_cut*r_cut;
00432   float r_cut_2 = 1.f / r_cut2;
00433   float r_cut_4 = 4.f*r_cut_2*r_cut_2;
00434   float epsilon_s_i = 1.f / epsilon_s;
00435   float epsilon_p_i = 1.f / epsilon_p;
00436 
00437   //iterate over chunks of atoms within Patch 1
00438   for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00439 
00440     //this thread calculates only the force on atomi; iterating over js
00441     atom atomi;
00442     float bornRadI;
00443     //int iindex;
00444     int i;
00445 
00446     // load BLOCK of Patch i atoms
00447     if ( blocki + tx < patch1size ) {
00448       i = myPatchPair.patch1_atom_start + blocki + tx;
00449       float4 tmpa = ((float4*)atoms)[i];
00450       atomi.position.x = tmpa.x + myPatchPair.offset.x;
00451       atomi.position.y = tmpa.y + myPatchPair.offset.y;
00452       atomi.position.z = tmpa.z + myPatchPair.offset.z;
00453       atomi.charge = - tmpa.w * scaling;
00454       bornRadI = bornRad[i];
00455       //iindex =  ((uint4 *)(atom_params))[i].y; // store index
00456     } // load patch 1
00457 
00458     //init intermediate variables
00459     GBReal dEdaSumI = 0.f; // each thread accumulating single psi
00460     float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00461 
00462     //iterate over chunks of atoms within Patch 2
00463     for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00464 
00465       int shared_size = patch2size - blockj;
00466       if ( shared_size > SHARED_SIZE )
00467         shared_size = SHARED_SIZE;
00468       __syncthreads();
00469 
00470       //load smaller chunk of j atoms into shared memory: coordinates
00471       int j = myPatchPair.patch2_atom_start + blockj;
00472       if ( tx < 4 * shared_size ) {
00473         jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00474       }
00475       __syncthreads();
00476 
00477       //load smaller chunk of j atoms into shared memory: radii
00478       if ( tx < shared_size ) {
00479         jBornRad[tx] = bornRad[j+tx]; // load Born radius into shared
00480         //printf("BornRadius = %7.3f\n", jBornRadii[tx]);
00481       }
00482       __syncthreads();
00483 
00484       if ( blocki + tx < patch1size ) {
00485         //each thread loop over shared atoms
00486         for ( int j = 0; j < shared_size; ++j ) {
00487           float dx = atomi.position.x - jatoms[j].position.x;
00488           float dy = atomi.position.y - jatoms[j].position.y;
00489           float dz = atomi.position.z - jatoms[j].position.z;
00490           float r2 = dx*dx + dy*dy + dz*dz;
00491 
00492           // within cutoff different atoms
00493           if (r2 < r_cut*r_cut && r2 > 0.01f) {
00494             //int jj = myPatchPair.patch2_atom_start + blockj + j;
00495             //int jindex =  ((uint4 *)(atom_params))[jj].y; // store index
00496 
00497             float r_i = 1.f / sqrt(r2);
00498             float r  = r2 * r_i;
00499             float bornRadJ = jBornRad[j];
00500 
00501             //calculate GB energy
00502             float qiqj = atomi.charge*jatoms[j].charge;
00503             float aiaj = bornRadI*bornRadJ;
00504             float aiaj4 = 4*aiaj;
00505             float expr2aiaj4 = exp(-r2/aiaj4);
00506             float fij = sqrt(r2+aiaj*expr2aiaj4);
00507             float f_i = 1/fij;
00508             float expkappa = exp(-kappa*fij);
00509             float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00510             float gbEij = qiqj*Dij*f_i;
00511 
00512             //calculate energy derivatives
00513             float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
00514             float ddrf_i = -ddrfij*f_i*f_i;
00515             float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
00516             float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
00517 
00518             //NAMD smoothing function
00519             float scale = 1.f;
00520             float ddrScale = 0.f;
00521             float forcedEdr;
00522             if (smoothDist > 0.f) {
00523               scale = r2 * r_cut_2 - 1.f;
00524               scale *= scale;
00525               ddrScale = r*(r2-r_cut2)*r_cut_4;
00526               energyT += gbEij * scale * 0.5f;
00527               //printf("PAIRENERGY = %15.7e\n", gbEij*scale*0.5);
00528               forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
00529             } else {
00530               energyT += gbEij * 0.5f;
00531               //printf("PAIRENERGY = %15.7e\n", gbEij*0.5);
00532               forcedEdr = -ddrGbEij;
00533             }
00534 
00535             //add dEda
00536             float dEdai = 0.f;
00537             if (doFullElec) {
00538               //gbisParams->dEdaSum[0][i] += dEdai*scale;
00539               dEdai = 0.5f*qiqj*f_i*f_i
00540                       *(kappa*epsilon_s_i*expkappa-Dij*f_i)
00541                       *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;//0
00542               dEdaSumI += dEdai;
00543             }
00544 
00545             forcedEdr *= r_i;
00546             forceI.x += dx*forcedEdr;
00547             forceI.y += dy*forcedEdr;
00548             forceI.z += dz*forcedEdr;
00549 
00550 
00551           } // within cutoff
00552           if (r2 < 0.01f) {
00553             // GB Self Energy
00554             if (doEnergy) {
00555               float fij = bornRadI;//inf
00556               float expkappa = exp(-kappa*fij);//0
00557               float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00558               float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
00559               energyT += 0.5f*gbEij;
00560             }
00561           } //same atom or within cutoff
00562         } // for j
00563       } // if thread in bounds
00564     } // for block j
00565     //psiSumI now contains contributions from all j in 2nd patch
00566 
00567     // write psiSum to global memory buffer; to be accumulated later
00568     if ( blocki + tx < myPatchPair.patch1_force_size) {
00569       int i_out = myPatchPair.patch1_force_start + blocki + tx;
00570       dEdaSumBuffers[i_out] = dEdaSumI;
00571       forceBuffers[i_out] = forceI;
00572     }
00573   } // for block i
00574   //printf("energyT[%03i,%03i] = %15.7e\n", blockIdx.x, tx, energyT);
00575 
00576 //Energy Reduction
00577   if (doEnergy) {
00578     const int i_out = myPatchPair.virial_start / 16;
00579     __shared__ float energySh[BLOCK_SIZE];
00580     energySh[tx] = energyT;
00581     __syncthreads();
00582 
00583     for ( unsigned int b = blockDim.x/2; b > 0 ; b >>= 1 ) {
00584       if ( tx < b ) {
00585         //printf("RED eSh[%i] += eSh[%i]\n", tx, tx+b);
00586         energySh[tx] += energySh[tx+b];
00587       }
00588       __syncthreads();
00589     }
00590     //const int i_out = myPatchPair.virial_start + tx;
00591     if (tx == 0) {
00592       energyBuffers[i_out] = energySh[0]; // TODO
00593       //printf("eB[%i] = %15.7e\n", i_out, energySh[0]);
00594     }
00595   } //end Energy Reduction
00596 
00597  { // start of reduction
00598   // make sure psiSums are visible in global memory
00599   __threadfence();
00600   __syncthreads();
00601 
00602   __shared__ bool doSum;
00603 
00604   if (threadIdx.x == 0) {
00605     int fli = myPatchPair.patch1_force_list_index;
00606     int fls = myPatchPair.patch1_force_list_size;
00607     int old = atomicInc(P2_counters+fli,fls-1);
00608     doSum = ( old == fls - 1 );
00609   }
00610 
00611   __syncthreads();
00612 
00613   if ( doSum ) {
00614     GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00615        force_lists, dEdaSumBuffers, dEdaSum);
00616     GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00617        force_lists, forceBuffers, forces);
00618     if ( doEnergy ) {
00619       GBIS_Energy_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00620         force_lists, energyBuffers, energy);
00621     }
00622   }
00623  } // end of sum
00624 } //GBIS_P2

__global__ void GBIS_P3_Kernel const patch_pair *  patch_pairs,
const atom *  atoms,
const atom_param *  atom_params,
const float *  intRad0,
const float *  intRadS,
const float *  dHdrPrefix,
const float  a_cut,
const float  rho_0,
const float  scaling,
float3  lata,
float3  latb,
float3  latc,
const force_list *  force_lists,
float4 *  forceBuffers,
float4 *  forces,
unsigned int *  P3_counters
[static]
 

Definition at line 632 of file ComputeGBISCUDAKernel.h.

References CalcDH(), GBIS_Force_Reduction_Kernel(), j, jatoms, myPatchPair, and PATCH_PAIR_SIZE.

00649   {
00650   int tx = threadIdx.x;
00651   int bx = blockIdx.x;
00652 #ifdef __DEVICE_EMULATION__
00653 //  printf("GBIS_P3: Blk(%3i) Thd(%3i)\n",bx,tx);
00654 #endif
00655 
00656 #ifdef __DEVICE_EMULATION__
00657   #define myPatchPair (*(patch_pair*)(&pp.i))
00658 #else
00659   #define myPatchPair pp.pp
00660 #endif
00661   __shared__ union {
00662 #ifndef __DEVICE_EMULATION__
00663     patch_pair pp;
00664 #endif
00665     unsigned int i[PATCH_PAIR_SIZE];
00666   } pp;
00667   if ( tx < PATCH_PAIR_USED ) {
00668     unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00669     pp.i[tx] = tmp;
00670   } // all threads load single shared patch pair
00671 
00672 //macro for list of j atoms
00673 #ifdef __DEVICE_EMULATION__
00674   #define jatoms ((atom*)(jpqu.i))
00675 #else
00676   #define jatoms jpqu.d
00677 #endif
00678   __shared__ union {
00679 #ifndef __DEVICE_EMULATION__
00680     atom d[SHARED_SIZE];
00681 #endif
00682     int i[4*SHARED_SIZE];
00683     float f[4*SHARED_SIZE];
00684   } jpqu;
00685 
00686   __shared__ float jDHdrPrefix[SHARED_SIZE];
00687   __shared__ float intRadJ0[SHARED_SIZE];
00688 
00689   // convert scaled offset with current lattice
00690   if ( tx == 0 ) {
00691     float offx = myPatchPair.offset.x * lata.x
00692                + myPatchPair.offset.y * latb.x
00693                + myPatchPair.offset.z * latc.x;
00694     float offy = myPatchPair.offset.x * lata.y
00695                + myPatchPair.offset.y * latb.y
00696                + myPatchPair.offset.z * latc.y;
00697     float offz = myPatchPair.offset.x * lata.z
00698                + myPatchPair.offset.y * latb.z
00699                + myPatchPair.offset.z * latc.z;
00700     myPatchPair.offset.x = offx;
00701     myPatchPair.offset.y = offy;
00702     myPatchPair.offset.z = offz;
00703   }
00704   __syncthreads();
00705 
00706   //iterate over chunks of atoms within Patch 1
00707   for ( int blocki = 0; blocki < myPatchPair.patch1_size; blocki += BLOCK_SIZE ) {
00708     //this thread calculates only the force on atomi; iterating over js
00709     atom atomi;
00710     float intRadIS;
00711     //int iindex;
00712     int i;
00713     float dHdrPrefixI;
00714 
00715     // load BLOCK of Patch i atoms
00716     if ( blocki + tx < myPatchPair.patch1_size ) {
00717       i = myPatchPair.patch1_atom_start + blocki + tx;
00718       float4 tmpa = ((float4*)atoms)[i];
00719       atomi.position.x = tmpa.x + myPatchPair.offset.x;
00720       atomi.position.y = tmpa.y + myPatchPair.offset.y;
00721       atomi.position.z = tmpa.z + myPatchPair.offset.z;
00722       atomi.charge = intRad0[i]; // overwrite charge with radius
00723       intRadIS = intRadS[i];
00724       //iindex =  ((uint4 *)(atom_params))[i].y; // store index
00725       dHdrPrefixI = dHdrPrefix[i];
00726     } // load patch 1
00727 
00728     //init intermediate variables
00729     float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00730 
00731     //iterate over chunks of atoms within Patch 2
00732     for ( int blockj = 0; blockj < myPatchPair.patch2_size; blockj += SHARED_SIZE ) {
00733 
00734       int shared_size = myPatchPair.patch2_size - blockj;
00735       if ( shared_size > SHARED_SIZE )
00736         shared_size = SHARED_SIZE;
00737       __syncthreads();
00738 
00739       //load smaller chunk of j atoms into shared memory: coordinates
00740 
00741       int j = myPatchPair.patch2_atom_start + blockj;
00742       if ( tx < 4 * shared_size ) {
00743         jpqu.i[tx] = ((unsigned int *)(atoms       + j))[tx];
00744       }
00745       __syncthreads();
00746 
00747       //load smaller chunk of j atoms into shared memory: radii
00748       if ( tx < shared_size ) { // every 4th thread
00749         jatoms[tx].charge = intRadS[j+tx]; // load radius into charge float
00750         jDHdrPrefix[tx] = dHdrPrefix[j+tx]; // load dHdrPrefix into shared
00751         intRadJ0[tx] = intRad0[j+tx];
00752       }
00753       __syncthreads();
00754 
00755       if ( blocki + tx < myPatchPair.patch1_size ) {
00756         //each thread loop over shared atoms
00757         for ( int j = 0; j < shared_size; ++j ) {
00758           float dx = atomi.position.x - jatoms[j].position.x;
00759           float dy = atomi.position.y - jatoms[j].position.y;
00760           float dz = atomi.position.z - jatoms[j].position.z;
00761           float r2 = dx*dx + dy*dy + dz*dz;
00762 
00763           // within cutoff        different atoms
00764           if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00765             //int jj = myPatchPair.patch2_atom_start + blockj + j;
00766             //int jindex =  ((uint4 *)(atom_params))[jj].y; // store index
00767 
00768             float r_i = 1.f / sqrt(r2);
00769             float r  = r2 * r_i;
00770             float dHdrPrefixJ = jDHdrPrefix[j];
00771             float dhij, dhji;
00772             int dij, dji;
00773             CalcDH(r,r2,r_i,a_cut,atomi.charge,
00774                     jatoms[j].charge,dhij,dij);
00775             CalcDH(r,r2,r_i,a_cut,intRadJ0[j],
00776                     intRadIS,dhji,dji);
00777 
00778             float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
00779             forceI.x += dx * forceAlpha;
00780             forceI.y += dy * forceAlpha;
00781             forceI.z += dz * forceAlpha;
00782           } // cutoff
00783         } // for j
00784       } // if thread in bounds
00785     } // for block j
00786 
00787     // write psiSum to global memory buffer; to be accumulated later
00788     if ( blocki + tx < myPatchPair.patch1_force_size) {
00789       int i_out = myPatchPair.patch1_force_start + blocki + tx;
00790       forceBuffers[i_out] = forceI;
00791     }
00792   } // for block i
00793 
00794  { // start of force sum
00795   // make sure forces are visible in global memory
00796   __threadfence();
00797   __syncthreads();
00798   __shared__ bool doSum;
00799 
00800   if (threadIdx.x == 0) {
00801     int fli = myPatchPair.patch1_force_list_index;
00802     int fls = myPatchPair.patch1_force_list_size;
00803     int old = atomicInc(P3_counters+fli,fls-1);
00804     doSum = ( old == fls - 1 );
00805   }
00806   __syncthreads();
00807 
00808   if ( doSum ) {
00809     GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00810        force_lists, forceBuffers, forces);
00811   }
00812  } // end of force sum
00813 } //GBIS_P3


Generated on Tue May 21 04:07:21 2013 for NAMD by  doxygen 1.3.9.1