#include <stdio.h>#include "ComputeGBIS.inl"Go to the source code of this file.
Defines | |
| #define | GBIS_CUDA |
| #define | DORED 1 |
| #define | myList fl.fl |
| #define | myList fl.fl |
| #define | myList fl.fl |
| #define | myPatchPair pp.pp |
| #define | jatoms jpqu.d |
| #define | myPatchPair pp.pp |
| #define | jatoms jpqu.d |
| #define | myPatchPair pp.pp |
| #define | jatoms jpqu.d |
Functions | |
| __device__ __forceinline__ void | GBIS_Energy_Reduction_Kernel (const int list_index, const force_list *lists, const float *sumBuffers, float *sum) |
| __device__ __forceinline__ void | GBIS_Force_Reduction_Kernel (const int list_index, const force_list *lists, const float4 *sumBuffers, float4 *sum) |
| __device__ __forceinline__ void | GBIS_Atom_Reduction_Kernel (const int list_index, const force_list *lists, const GBReal *sumBuffers, GBReal *sum) |
| __global__ void | GBIS_P1_Kernel (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const float *intRad0, const float *intRadS, GBReal *psiSumBuffers, GBReal *psiSum, const float a_cut, const float rho_0, float3 lata, float3 latb, float3 latc, const force_list *force_lists, unsigned int *P1_counters) |
| __global__ void | GBIS_P2_Kernel (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const float *bornRad, GBReal *dEdaSumBuffers, GBReal *dEdaSum, const float a_cut, const float r_cut, const float scaling, const float kappa, const float smoothDist, const float epsilon_p, const float epsilon_s, float3 lata, float3 latb, float3 latc, const int doEnergy, const int doFullElec, const force_list *force_lists, float4 *forceBuffers, float4 *forces, float *energyBuffers, float *energy, unsigned int *P2_counters) |
| __global__ void | GBIS_P3_Kernel (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const float *intRad0, const float *intRadS, const float *dHdrPrefix, const float a_cut, const float rho_0, const float scaling, float3 lata, float3 latb, float3 latc, const force_list *force_lists, float4 *forceBuffers, float4 *forces, unsigned int *P3_counters) |
|
|
Definition at line 5 of file ComputeGBISCUDAKernel.h. |
|
|
Definition at line 2 of file ComputeGBISCUDAKernel.h. |
|
|
|
|
|
|
|
|
Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel(). |
|
|
|
|
|
|
|
|
Referenced by GBIS_Atom_Reduction_Kernel(), GBIS_Energy_Reduction_Kernel(), and GBIS_Force_Reduction_Kernel(). |
|
|
|
|
|
|
|
|
Referenced by GBIS_P1_Kernel(), GBIS_P2_Kernel(), and GBIS_P3_Kernel(). |
|
||||||||||||||||||||
|
Definition at line 116 of file ComputeGBISCUDAKernel.h. References FORCE_LIST_SIZE, GBReal, j, and myList. Referenced by GBIS_P1_Kernel(), and GBIS_P2_Kernel(). 00121 {
00122 #define myList fl.fl
00123 __shared__ union {
00124 force_list fl;
00125 unsigned int i[FORCE_LIST_SIZE];
00126 } fl;
00127
00128 if ( threadIdx.x < FORCE_LIST_USED ) {
00129 unsigned int tmp = ((unsigned int*)lists)[
00130 FORCE_LIST_SIZE*list_index+threadIdx.x];
00131 fl.i[threadIdx.x] = tmp;
00132 }
00133 __syncthreads();
00134 for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00135 const GBReal *sumBuf = sumBuffers + myList.force_list_start + j;
00136 GBReal sumOut = 0.f;
00137 for ( int i = 0; i < myList.force_list_size; ++i ) {
00138 GBReal sumVal = *sumBuf;
00139 sumOut += sumVal;
00140 sumBuf += myList.patch_stride;
00141 } // i
00142 const int dest = myList.force_output_start + j;
00143 sum[dest] = sumOut;
00144 } // j
00145 } // GBIS_Atom_Reduction
|
|
||||||||||||||||||||
|
Definition at line 26 of file ComputeGBISCUDAKernel.h. References FORCE_LIST_SIZE, and myList. Referenced by GBIS_P2_Kernel(). 00031 {
00032 //one thread block (128 threads) is doing this single patch
00033 int tx = threadIdx.x;
00034
00035 #define myList fl.fl
00036 __shared__ union {
00037 force_list fl;
00038 unsigned int i[FORCE_LIST_SIZE];
00039 } fl;
00040
00041 if ( tx < FORCE_LIST_USED ) {
00042 unsigned int tmp = ((unsigned int*)lists)[
00043 FORCE_LIST_SIZE*list_index+tx];
00044 fl.i[tx] = tmp;
00045 }
00046 __syncthreads();
00047
00048 __shared__ float energySh[BLOCK_SIZE];
00049
00050 if (tx < myList.force_list_size) {
00051 energySh[tx] = sumBuffers[myList.virial_list_start/16 + tx];
00052 }
00053 __syncthreads();
00054
00055 float nf = 1.f * myList.force_list_size; // exact number to sum
00056 float p2f = log2(nf);
00057 int p2i = ceil(p2f);
00058 int n2i = 1<<p2i; // next power of 2 over number to sum
00059 int b = n2i / 2; // first midpoint
00060 if (tx + b < myList.force_list_size) { // handle non power of 2
00061 energySh[tx] += energySh[tx+b];
00062 }
00063 __syncthreads();
00064 b /= 2;
00065
00066 for ( ; b > 0; b /= 2) { // handle remaining powers of 2
00067 if (tx < b) {
00068 energySh[tx] += energySh[tx+b];
00069 }
00070 __syncthreads();
00071 }
00072
00073 if (threadIdx.x == 0) {
00074 const int dest = myList.virial_output_start/16;
00075 sum[dest] = energySh[0];
00076 }
00077 } // GBIS_Energy_Reduction
|
|
||||||||||||||||||||
|
Definition at line 80 of file ComputeGBISCUDAKernel.h. References FORCE_LIST_SIZE, j, and myList. Referenced by GBIS_P2_Kernel(), and GBIS_P3_Kernel(). 00085 {
00086 #define myList fl.fl
00087 __shared__ union {
00088 force_list fl;
00089 unsigned int i[FORCE_LIST_SIZE];
00090 } fl;
00091
00092 if ( threadIdx.x < FORCE_LIST_USED ) {
00093 unsigned int tmp = ((unsigned int*)lists)[
00094 FORCE_LIST_SIZE*list_index+threadIdx.x];
00095 fl.i[threadIdx.x] = tmp;
00096 }
00097 __syncthreads();
00098 for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00099 const float4 *sumBuf = sumBuffers + myList.force_list_start + j;
00100 float4 sumOut; sumOut.x = 0.f; sumOut.y = 0.f; sumOut.z = 0.f;
00101 for ( int i = 0; i < myList.force_list_size; ++i ) {
00102 float4 sumVal = *sumBuf;
00103 sumOut.x += sumVal.x;
00104 sumOut.y += sumVal.y;
00105 sumOut.z += sumVal.z;
00106 sumBuf += myList.patch_stride;
00107 } // i
00108 const int dest = myList.force_output_start + j;
00109 sum[dest].x += sumOut.x; // adding onto NAMD nonbonded, sync?
00110 sum[dest].y += sumOut.y;
00111 sum[dest].z += sumOut.z;
00112 } // j
00113 } // GBIS_Force_Reduction
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Definition at line 152 of file ComputeGBISCUDAKernel.h. References CalcH(), GBIS_Atom_Reduction_Kernel(), GBReal, j, jatoms, myPatchPair, and PATCH_PAIR_SIZE. 00167 {
00168 int tx = threadIdx.x;
00169 int bx = blockIdx.x;
00170 #ifdef __DEVICE_EMULATION__
00171 // printf("GBIS_P1: Blk(%3i) Thd(%3i)\n",bx,tx);
00172 #endif
00173 // one block per patch_pair
00174 // BLOCK_SIZE 128 threads per block
00175 // SHARED_SIZE 32
00176 // PATCH_PAIR_SIZE 16
00177 // PATCH_PAIR_USED 15
00178
00179 //macro for patch pair
00180 #ifdef __DEVICE_EMULATION__
00181 #define myPatchPair (*(patch_pair*)(&pp.i))
00182 #else
00183 #define myPatchPair pp.pp
00184 #endif
00185 __shared__ union {
00186 #ifndef __DEVICE_EMULATION__
00187 patch_pair pp;
00188 #endif
00189 unsigned int i[PATCH_PAIR_SIZE];
00190 } pp;
00191 if ( tx < PATCH_PAIR_USED ) {
00192 unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00193 pp.i[tx] = tmp;
00194 } // ?
00195
00196
00197 //macro for list of j atoms
00198 #ifdef __DEVICE_EMULATION__
00199 #define jatoms ((atom*)(jpqu.i))
00200 #else
00201 #define jatoms jpqu.d
00202 #endif
00203 __shared__ union {
00204 #ifndef __DEVICE_EMULATION__
00205 atom d[SHARED_SIZE];
00206 #endif
00207 int i[4*SHARED_SIZE];
00208 float f[4*SHARED_SIZE];
00209 } jpqu;
00210
00211 // convert scaled offset with current lattice
00212 if ( tx == 0 ) {
00213 float offx = myPatchPair.offset.x * lata.x
00214 + myPatchPair.offset.y * latb.x
00215 + myPatchPair.offset.z * latc.x;
00216 float offy = myPatchPair.offset.x * lata.y
00217 + myPatchPair.offset.y * latb.y
00218 + myPatchPair.offset.z * latc.y;
00219 float offz = myPatchPair.offset.x * lata.z
00220 + myPatchPair.offset.y * latb.z
00221 + myPatchPair.offset.z * latc.z;
00222 myPatchPair.offset.x = offx;
00223 myPatchPair.offset.y = offy;
00224 myPatchPair.offset.z = offz;
00225 }
00226 __syncthreads();
00227
00228 int patch1size = myPatchPair.patch1_size;
00229 int patch2size = myPatchPair.patch2_size;
00230
00231 //iterate over chunks of atoms within Patch 1
00232 for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00233
00234 //this thread calculates only the force on atomi; iterating over js
00235 atom atomi;
00236 //int iindex;
00237 int i;
00238
00239 // load BLOCK of Patch i atoms
00240 if ( blocki + tx < patch1size ) {
00241 i = myPatchPair.patch1_atom_start + blocki + tx;
00242 float4 tmpa = ((float4*)atoms)[i];
00243 atomi.position.x = tmpa.x + myPatchPair.offset.x;
00244 atomi.position.y = tmpa.y + myPatchPair.offset.y;
00245 atomi.position.z = tmpa.z + myPatchPair.offset.z;
00246 atomi.charge = intRad0[i]; // overwrite charge with radius
00247 //iindex = ((uint4 *)(atom_params))[i].y; // store index
00248 } // load patch 1
00249
00250 //init intermediate variables
00251 GBReal psiSumI = 0.f; // each thread accumulating single psi
00252
00253 //iterate over chunks of atoms within Patch 2
00254 for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00255
00256 int shared_size = patch2size - blockj;
00257 if ( shared_size > SHARED_SIZE )
00258 shared_size = SHARED_SIZE;
00259 __syncthreads();
00260
00261 //load smaller chunk of j atoms into shared memory: coordinates
00262
00263 int j = myPatchPair.patch2_atom_start + blockj;
00264 if ( tx < 4 * shared_size ) {
00265 jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00266 }
00267 __syncthreads();
00268
00269 //load smaller chunk of j atoms into shared memory: radii
00270 if ( tx < shared_size ) { // every 4th thread
00271 jatoms[tx].charge = intRadS[j+tx]; // load radius into charge float
00272 //printf("jatom2 = %7.3f\n", jatoms[tx].charge);
00273 }
00274 __syncthreads();
00275
00276 if ( blocki + tx < patch1size ) {
00277 //each thread loop over shared atoms
00278 for ( int j = 0; j < shared_size; ++j ) {
00279 float dx = atomi.position.x - jatoms[j].position.x;
00280 float dy = atomi.position.y - jatoms[j].position.y;
00281 float dz = atomi.position.z - jatoms[j].position.z;
00282 float r2 = dx*dx + dy*dy + dz*dz;
00283
00284 // within cutoff different atoms
00285 if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00286 //int jj = myPatchPair.patch2_atom_start + blockj + j;
00287 //int jindex = ((uint4 *)(atom_params))[jj].y; // store index
00288
00289 // calculate H(i,j) [and not H(j,i)]
00290 float r_i = 1.f / sqrt(r2);
00291 float r = r2 * r_i;
00292 float hij;
00293 int dij;
00294 CalcH(r,r2,r_i,a_cut,atomi.charge,jatoms[j].charge,hij,dij);
00295
00296 psiSumI += hij;
00297
00298 } // cutoff
00299 } // for j
00300 } // if thread in bounds
00301 } // for block j
00302 //psiSumI now contains contributions from all j in 2nd patch
00303
00304 // write psiSum to global memory buffer; to be accumulated later
00305 if ( blocki + tx < myPatchPair.patch1_force_size) {
00306 int i_out = myPatchPair.patch1_force_start + blocki + tx;
00307 psiSumBuffers[i_out] = psiSumI;
00308 }
00309 } // for block i
00310
00311 { // start of force sum
00312 // make sure psiSums are visible in global memory
00313 __threadfence();
00314 __syncthreads();
00315 __shared__ bool doSum;
00316
00317 if (threadIdx.x == 0) {
00318 int fli = myPatchPair.patch1_force_list_index;
00319 int fls = myPatchPair.patch1_force_list_size;
00320 int old = atomicInc(P1_counters+fli,fls-1);
00321 doSum = ( old == fls - 1 );
00322 }
00323
00324 __syncthreads();
00325
00326 if ( doSum ) {
00327 GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00328 force_lists, psiSumBuffers, psiSum);
00329 }
00330 } // end of force sum
00331 } //GBIS_P1
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Definition at line 338 of file ComputeGBISCUDAKernel.h. References GBIS_Atom_Reduction_Kernel(), GBIS_Energy_Reduction_Kernel(), GBIS_Force_Reduction_Kernel(), GBReal, j, jatoms, myPatchPair, and PATCH_PAIR_SIZE. 00363 {
00364 int tx = threadIdx.x;
00365 int bx = blockIdx.x;
00366 #ifdef __DEVICE_EMULATION__
00367 // printf("GBIS_P2: Blk(%3i) Thd(%3i)\n",bx,tx);
00368 #endif
00369 // one block per patch_pair
00370 // BLOCK_SIZE 128 threads per block
00371 // SHARED_SIZE 32
00372 // PATCH_PAIR_SIZE 16
00373 // PATCH_PAIR_USED 15
00374
00375 //macro for patch pair
00376 #ifdef __DEVICE_EMULATION__
00377 #define myPatchPair (*(patch_pair*)(&pp.i))
00378 #else
00379 #define myPatchPair pp.pp
00380 #endif
00381 __shared__ union {
00382 #ifndef __DEVICE_EMULATION__
00383 patch_pair pp;
00384 #endif
00385 unsigned int i[PATCH_PAIR_SIZE];
00386 } pp;
00387 if ( tx < PATCH_PAIR_USED ) {
00388 unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00389 pp.i[tx] = tmp;
00390 } // ?
00391
00392
00393 //macro for list of j atoms
00394 #ifdef __DEVICE_EMULATION__
00395 #define jatoms ((atom*)(jpqu.i))
00396 #else
00397 #define jatoms jpqu.d
00398 #endif
00399 __shared__ union {
00400 #ifndef __DEVICE_EMULATION__
00401 atom d[SHARED_SIZE];
00402 #endif
00403 int i[4*SHARED_SIZE];
00404 float f[4*SHARED_SIZE];
00405 } jpqu;
00406
00407 __shared__ float jBornRad[SHARED_SIZE];
00408 float energyT = 0.f; // total energy for this thread; to be reduced
00409
00410 // convert scaled offset with current lattice
00411 if ( tx == 0 ) {
00412 float offx = myPatchPair.offset.x * lata.x
00413 + myPatchPair.offset.y * latb.x
00414 + myPatchPair.offset.z * latc.x;
00415 float offy = myPatchPair.offset.x * lata.y
00416 + myPatchPair.offset.y * latb.y
00417 + myPatchPair.offset.z * latc.y;
00418 float offz = myPatchPair.offset.x * lata.z
00419 + myPatchPair.offset.y * latb.z
00420 + myPatchPair.offset.z * latc.z;
00421 myPatchPair.offset.x = offx;
00422 myPatchPair.offset.y = offy;
00423 myPatchPair.offset.z = offz;
00424 }
00425 __syncthreads();
00426
00427 int patch1size = myPatchPair.patch1_size;
00428 int patch2size = myPatchPair.patch2_size;
00429
00430 //values used in loop
00431 float r_cut2 = r_cut*r_cut;
00432 float r_cut_2 = 1.f / r_cut2;
00433 float r_cut_4 = 4.f*r_cut_2*r_cut_2;
00434 float epsilon_s_i = 1.f / epsilon_s;
00435 float epsilon_p_i = 1.f / epsilon_p;
00436
00437 //iterate over chunks of atoms within Patch 1
00438 for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00439
00440 //this thread calculates only the force on atomi; iterating over js
00441 atom atomi;
00442 float bornRadI;
00443 //int iindex;
00444 int i;
00445
00446 // load BLOCK of Patch i atoms
00447 if ( blocki + tx < patch1size ) {
00448 i = myPatchPair.patch1_atom_start + blocki + tx;
00449 float4 tmpa = ((float4*)atoms)[i];
00450 atomi.position.x = tmpa.x + myPatchPair.offset.x;
00451 atomi.position.y = tmpa.y + myPatchPair.offset.y;
00452 atomi.position.z = tmpa.z + myPatchPair.offset.z;
00453 atomi.charge = - tmpa.w * scaling;
00454 bornRadI = bornRad[i];
00455 //iindex = ((uint4 *)(atom_params))[i].y; // store index
00456 } // load patch 1
00457
00458 //init intermediate variables
00459 GBReal dEdaSumI = 0.f; // each thread accumulating single psi
00460 float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00461
00462 //iterate over chunks of atoms within Patch 2
00463 for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00464
00465 int shared_size = patch2size - blockj;
00466 if ( shared_size > SHARED_SIZE )
00467 shared_size = SHARED_SIZE;
00468 __syncthreads();
00469
00470 //load smaller chunk of j atoms into shared memory: coordinates
00471 int j = myPatchPair.patch2_atom_start + blockj;
00472 if ( tx < 4 * shared_size ) {
00473 jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00474 }
00475 __syncthreads();
00476
00477 //load smaller chunk of j atoms into shared memory: radii
00478 if ( tx < shared_size ) {
00479 jBornRad[tx] = bornRad[j+tx]; // load Born radius into shared
00480 //printf("BornRadius = %7.3f\n", jBornRadii[tx]);
00481 }
00482 __syncthreads();
00483
00484 if ( blocki + tx < patch1size ) {
00485 //each thread loop over shared atoms
00486 for ( int j = 0; j < shared_size; ++j ) {
00487 float dx = atomi.position.x - jatoms[j].position.x;
00488 float dy = atomi.position.y - jatoms[j].position.y;
00489 float dz = atomi.position.z - jatoms[j].position.z;
00490 float r2 = dx*dx + dy*dy + dz*dz;
00491
00492 // within cutoff different atoms
00493 if (r2 < r_cut*r_cut && r2 > 0.01f) {
00494 //int jj = myPatchPair.patch2_atom_start + blockj + j;
00495 //int jindex = ((uint4 *)(atom_params))[jj].y; // store index
00496
00497 float r_i = 1.f / sqrt(r2);
00498 float r = r2 * r_i;
00499 float bornRadJ = jBornRad[j];
00500
00501 //calculate GB energy
00502 float qiqj = atomi.charge*jatoms[j].charge;
00503 float aiaj = bornRadI*bornRadJ;
00504 float aiaj4 = 4*aiaj;
00505 float expr2aiaj4 = exp(-r2/aiaj4);
00506 float fij = sqrt(r2+aiaj*expr2aiaj4);
00507 float f_i = 1/fij;
00508 float expkappa = exp(-kappa*fij);
00509 float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00510 float gbEij = qiqj*Dij*f_i;
00511
00512 //calculate energy derivatives
00513 float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
00514 float ddrf_i = -ddrfij*f_i*f_i;
00515 float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
00516 float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
00517
00518 //NAMD smoothing function
00519 float scale = 1.f;
00520 float ddrScale = 0.f;
00521 float forcedEdr;
00522 if (smoothDist > 0.f) {
00523 scale = r2 * r_cut_2 - 1.f;
00524 scale *= scale;
00525 ddrScale = r*(r2-r_cut2)*r_cut_4;
00526 energyT += gbEij * scale * 0.5f;
00527 //printf("PAIRENERGY = %15.7e\n", gbEij*scale*0.5);
00528 forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
00529 } else {
00530 energyT += gbEij * 0.5f;
00531 //printf("PAIRENERGY = %15.7e\n", gbEij*0.5);
00532 forcedEdr = -ddrGbEij;
00533 }
00534
00535 //add dEda
00536 float dEdai = 0.f;
00537 if (doFullElec) {
00538 //gbisParams->dEdaSum[0][i] += dEdai*scale;
00539 dEdai = 0.5f*qiqj*f_i*f_i
00540 *(kappa*epsilon_s_i*expkappa-Dij*f_i)
00541 *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;//0
00542 dEdaSumI += dEdai;
00543 }
00544
00545 forcedEdr *= r_i;
00546 forceI.x += dx*forcedEdr;
00547 forceI.y += dy*forcedEdr;
00548 forceI.z += dz*forcedEdr;
00549
00550
00551 } // within cutoff
00552 if (r2 < 0.01f) {
00553 // GB Self Energy
00554 if (doEnergy) {
00555 float fij = bornRadI;//inf
00556 float expkappa = exp(-kappa*fij);//0
00557 float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00558 float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
00559 energyT += 0.5f*gbEij;
00560 }
00561 } //same atom or within cutoff
00562 } // for j
00563 } // if thread in bounds
00564 } // for block j
00565 //psiSumI now contains contributions from all j in 2nd patch
00566
00567 // write psiSum to global memory buffer; to be accumulated later
00568 if ( blocki + tx < myPatchPair.patch1_force_size) {
00569 int i_out = myPatchPair.patch1_force_start + blocki + tx;
00570 dEdaSumBuffers[i_out] = dEdaSumI;
00571 forceBuffers[i_out] = forceI;
00572 }
00573 } // for block i
00574 //printf("energyT[%03i,%03i] = %15.7e\n", blockIdx.x, tx, energyT);
00575
00576 //Energy Reduction
00577 if (doEnergy) {
00578 const int i_out = myPatchPair.virial_start / 16;
00579 __shared__ float energySh[BLOCK_SIZE];
00580 energySh[tx] = energyT;
00581 __syncthreads();
00582
00583 for ( unsigned int b = blockDim.x/2; b > 0 ; b >>= 1 ) {
00584 if ( tx < b ) {
00585 //printf("RED eSh[%i] += eSh[%i]\n", tx, tx+b);
00586 energySh[tx] += energySh[tx+b];
00587 }
00588 __syncthreads();
00589 }
00590 //const int i_out = myPatchPair.virial_start + tx;
00591 if (tx == 0) {
00592 energyBuffers[i_out] = energySh[0]; // TODO
00593 //printf("eB[%i] = %15.7e\n", i_out, energySh[0]);
00594 }
00595 } //end Energy Reduction
00596
00597 { // start of reduction
00598 // make sure psiSums are visible in global memory
00599 __threadfence();
00600 __syncthreads();
00601
00602 __shared__ bool doSum;
00603
00604 if (threadIdx.x == 0) {
00605 int fli = myPatchPair.patch1_force_list_index;
00606 int fls = myPatchPair.patch1_force_list_size;
00607 int old = atomicInc(P2_counters+fli,fls-1);
00608 doSum = ( old == fls - 1 );
00609 }
00610
00611 __syncthreads();
00612
00613 if ( doSum ) {
00614 GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00615 force_lists, dEdaSumBuffers, dEdaSum);
00616 GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00617 force_lists, forceBuffers, forces);
00618 if ( doEnergy ) {
00619 GBIS_Energy_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00620 force_lists, energyBuffers, energy);
00621 }
00622 }
00623 } // end of sum
00624 } //GBIS_P2
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Definition at line 632 of file ComputeGBISCUDAKernel.h. References CalcDH(), GBIS_Force_Reduction_Kernel(), j, jatoms, myPatchPair, and PATCH_PAIR_SIZE. 00649 {
00650 int tx = threadIdx.x;
00651 int bx = blockIdx.x;
00652 #ifdef __DEVICE_EMULATION__
00653 // printf("GBIS_P3: Blk(%3i) Thd(%3i)\n",bx,tx);
00654 #endif
00655
00656 #ifdef __DEVICE_EMULATION__
00657 #define myPatchPair (*(patch_pair*)(&pp.i))
00658 #else
00659 #define myPatchPair pp.pp
00660 #endif
00661 __shared__ union {
00662 #ifndef __DEVICE_EMULATION__
00663 patch_pair pp;
00664 #endif
00665 unsigned int i[PATCH_PAIR_SIZE];
00666 } pp;
00667 if ( tx < PATCH_PAIR_USED ) {
00668 unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00669 pp.i[tx] = tmp;
00670 } // all threads load single shared patch pair
00671
00672 //macro for list of j atoms
00673 #ifdef __DEVICE_EMULATION__
00674 #define jatoms ((atom*)(jpqu.i))
00675 #else
00676 #define jatoms jpqu.d
00677 #endif
00678 __shared__ union {
00679 #ifndef __DEVICE_EMULATION__
00680 atom d[SHARED_SIZE];
00681 #endif
00682 int i[4*SHARED_SIZE];
00683 float f[4*SHARED_SIZE];
00684 } jpqu;
00685
00686 __shared__ float jDHdrPrefix[SHARED_SIZE];
00687 __shared__ float intRadJ0[SHARED_SIZE];
00688
00689 // convert scaled offset with current lattice
00690 if ( tx == 0 ) {
00691 float offx = myPatchPair.offset.x * lata.x
00692 + myPatchPair.offset.y * latb.x
00693 + myPatchPair.offset.z * latc.x;
00694 float offy = myPatchPair.offset.x * lata.y
00695 + myPatchPair.offset.y * latb.y
00696 + myPatchPair.offset.z * latc.y;
00697 float offz = myPatchPair.offset.x * lata.z
00698 + myPatchPair.offset.y * latb.z
00699 + myPatchPair.offset.z * latc.z;
00700 myPatchPair.offset.x = offx;
00701 myPatchPair.offset.y = offy;
00702 myPatchPair.offset.z = offz;
00703 }
00704 __syncthreads();
00705
00706 //iterate over chunks of atoms within Patch 1
00707 for ( int blocki = 0; blocki < myPatchPair.patch1_size; blocki += BLOCK_SIZE ) {
00708 //this thread calculates only the force on atomi; iterating over js
00709 atom atomi;
00710 float intRadIS;
00711 //int iindex;
00712 int i;
00713 float dHdrPrefixI;
00714
00715 // load BLOCK of Patch i atoms
00716 if ( blocki + tx < myPatchPair.patch1_size ) {
00717 i = myPatchPair.patch1_atom_start + blocki + tx;
00718 float4 tmpa = ((float4*)atoms)[i];
00719 atomi.position.x = tmpa.x + myPatchPair.offset.x;
00720 atomi.position.y = tmpa.y + myPatchPair.offset.y;
00721 atomi.position.z = tmpa.z + myPatchPair.offset.z;
00722 atomi.charge = intRad0[i]; // overwrite charge with radius
00723 intRadIS = intRadS[i];
00724 //iindex = ((uint4 *)(atom_params))[i].y; // store index
00725 dHdrPrefixI = dHdrPrefix[i];
00726 } // load patch 1
00727
00728 //init intermediate variables
00729 float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00730
00731 //iterate over chunks of atoms within Patch 2
00732 for ( int blockj = 0; blockj < myPatchPair.patch2_size; blockj += SHARED_SIZE ) {
00733
00734 int shared_size = myPatchPair.patch2_size - blockj;
00735 if ( shared_size > SHARED_SIZE )
00736 shared_size = SHARED_SIZE;
00737 __syncthreads();
00738
00739 //load smaller chunk of j atoms into shared memory: coordinates
00740
00741 int j = myPatchPair.patch2_atom_start + blockj;
00742 if ( tx < 4 * shared_size ) {
00743 jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00744 }
00745 __syncthreads();
00746
00747 //load smaller chunk of j atoms into shared memory: radii
00748 if ( tx < shared_size ) { // every 4th thread
00749 jatoms[tx].charge = intRadS[j+tx]; // load radius into charge float
00750 jDHdrPrefix[tx] = dHdrPrefix[j+tx]; // load dHdrPrefix into shared
00751 intRadJ0[tx] = intRad0[j+tx];
00752 }
00753 __syncthreads();
00754
00755 if ( blocki + tx < myPatchPair.patch1_size ) {
00756 //each thread loop over shared atoms
00757 for ( int j = 0; j < shared_size; ++j ) {
00758 float dx = atomi.position.x - jatoms[j].position.x;
00759 float dy = atomi.position.y - jatoms[j].position.y;
00760 float dz = atomi.position.z - jatoms[j].position.z;
00761 float r2 = dx*dx + dy*dy + dz*dz;
00762
00763 // within cutoff different atoms
00764 if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00765 //int jj = myPatchPair.patch2_atom_start + blockj + j;
00766 //int jindex = ((uint4 *)(atom_params))[jj].y; // store index
00767
00768 float r_i = 1.f / sqrt(r2);
00769 float r = r2 * r_i;
00770 float dHdrPrefixJ = jDHdrPrefix[j];
00771 float dhij, dhji;
00772 int dij, dji;
00773 CalcDH(r,r2,r_i,a_cut,atomi.charge,
00774 jatoms[j].charge,dhij,dij);
00775 CalcDH(r,r2,r_i,a_cut,intRadJ0[j],
00776 intRadIS,dhji,dji);
00777
00778 float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
00779 forceI.x += dx * forceAlpha;
00780 forceI.y += dy * forceAlpha;
00781 forceI.z += dz * forceAlpha;
00782 } // cutoff
00783 } // for j
00784 } // if thread in bounds
00785 } // for block j
00786
00787 // write psiSum to global memory buffer; to be accumulated later
00788 if ( blocki + tx < myPatchPair.patch1_force_size) {
00789 int i_out = myPatchPair.patch1_force_start + blocki + tx;
00790 forceBuffers[i_out] = forceI;
00791 }
00792 } // for block i
00793
00794 { // start of force sum
00795 // make sure forces are visible in global memory
00796 __threadfence();
00797 __syncthreads();
00798 __shared__ bool doSum;
00799
00800 if (threadIdx.x == 0) {
00801 int fli = myPatchPair.patch1_force_list_index;
00802 int fls = myPatchPair.patch1_force_list_size;
00803 int old = atomicInc(P3_counters+fli,fls-1);
00804 doSum = ( old == fls - 1 );
00805 }
00806 __syncthreads();
00807
00808 if ( doSum ) {
00809 GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00810 force_lists, forceBuffers, forces);
00811 }
00812 } // end of force sum
00813 } //GBIS_P3
|
1.3.9.1