00001 #include <stdio.h>
00002 #define GBIS_CUDA
00003 #include "ComputeGBIS.inl"
00004
00005 #define DORED 1
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 __device__ __forceinline__ static void GBIS_Energy_Reduction_Kernel (
00027 const int list_index,
00028 const force_list *lists,
00029 const float *sumBuffers,
00030 float *sum
00031 ) {
00032
00033 int tx = threadIdx.x;
00034
00035 #define myList fl.fl
00036 __shared__ union {
00037 force_list fl;
00038 unsigned int i[FORCE_LIST_SIZE];
00039 } fl;
00040
00041 if ( tx < FORCE_LIST_USED ) {
00042 unsigned int tmp = ((unsigned int*)lists)[
00043 FORCE_LIST_SIZE*list_index+tx];
00044 fl.i[tx] = tmp;
00045 }
00046 __syncthreads();
00047
00048 __shared__ float energySh[BLOCK_SIZE];
00049
00050 if (tx < myList.force_list_size) {
00051 energySh[tx] = sumBuffers[myList.virial_list_start/16 + tx];
00052 }
00053 __syncthreads();
00054
00055 float nf = 1.f * myList.force_list_size;
00056 float p2f = log2(nf);
00057 int p2i = ceil(p2f);
00058 int n2i = 1<<p2i;
00059 int b = n2i / 2;
00060 if (tx + b < myList.force_list_size) {
00061 energySh[tx] += energySh[tx+b];
00062 }
00063 __syncthreads();
00064 b /= 2;
00065
00066 for ( ; b > 0; b /= 2) {
00067 if (tx < b) {
00068 energySh[tx] += energySh[tx+b];
00069 }
00070 __syncthreads();
00071 }
00072
00073 if (threadIdx.x == 0) {
00074 const int dest = myList.virial_output_start/16;
00075 sum[dest] = energySh[0];
00076 }
00077 }
00078
00079
00080 __device__ __forceinline__ static void GBIS_Force_Reduction_Kernel (
00081 const int list_index,
00082 const force_list *lists,
00083 const float4 *sumBuffers,
00084 float4 *sum
00085 ) {
00086 #define myList fl.fl
00087 __shared__ union {
00088 force_list fl;
00089 unsigned int i[FORCE_LIST_SIZE];
00090 } fl;
00091
00092 if ( threadIdx.x < FORCE_LIST_USED ) {
00093 unsigned int tmp = ((unsigned int*)lists)[
00094 FORCE_LIST_SIZE*list_index+threadIdx.x];
00095 fl.i[threadIdx.x] = tmp;
00096 }
00097 __syncthreads();
00098 for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00099 const float4 *sumBuf = sumBuffers + myList.force_list_start + j;
00100 float4 sumOut; sumOut.x = 0.f; sumOut.y = 0.f; sumOut.z = 0.f;
00101 for ( int i = 0; i < myList.force_list_size; ++i ) {
00102 float4 sumVal = *sumBuf;
00103 sumOut.x += sumVal.x;
00104 sumOut.y += sumVal.y;
00105 sumOut.z += sumVal.z;
00106 sumBuf += myList.patch_stride;
00107 }
00108 const int dest = myList.force_output_start + j;
00109 sum[dest].x += sumOut.x;
00110 sum[dest].y += sumOut.y;
00111 sum[dest].z += sumOut.z;
00112 }
00113 }
00114
00115
00116 __device__ __forceinline__ static void GBIS_Atom_Reduction_Kernel (
00117 const int list_index,
00118 const force_list *lists,
00119 const GBReal *sumBuffers,
00120 GBReal *sum
00121 ) {
00122 #define myList fl.fl
00123 __shared__ union {
00124 force_list fl;
00125 unsigned int i[FORCE_LIST_SIZE];
00126 } fl;
00127
00128 if ( threadIdx.x < FORCE_LIST_USED ) {
00129 unsigned int tmp = ((unsigned int*)lists)[
00130 FORCE_LIST_SIZE*list_index+threadIdx.x];
00131 fl.i[threadIdx.x] = tmp;
00132 }
00133 __syncthreads();
00134 for ( int j = threadIdx.x; j < myList.patch_size; j += BLOCK_SIZE ) {
00135 const GBReal *sumBuf = sumBuffers + myList.force_list_start + j;
00136 GBReal sumOut = 0.f;
00137 for ( int i = 0; i < myList.force_list_size; ++i ) {
00138 GBReal sumVal = *sumBuf;
00139 sumOut += sumVal;
00140 sumBuf += myList.patch_stride;
00141 }
00142 const int dest = myList.force_output_start + j;
00143 sum[dest] = sumOut;
00144 }
00145 }
00146
00147
00148
00149
00150
00151
00152 __global__ static void GBIS_P1_Kernel (
00153 const patch_pair *patch_pairs,
00154 const atom *atoms,
00155 const atom_param *atom_params,
00156 const float *intRad0,
00157 const float *intRadS,
00158 GBReal *psiSumBuffers,
00159 GBReal *psiSum,
00160 const float a_cut,
00161 const float rho_0,
00162 float3 lata,
00163 float3 latb,
00164 float3 latc,
00165 const force_list *force_lists,
00166 unsigned int *P1_counters
00167 ) {
00168 int tx = threadIdx.x;
00169 int bx = blockIdx.x;
00170 #ifdef __DEVICE_EMULATION__
00171
00172 #endif
00173
00174
00175
00176
00177
00178
00179
00180 #ifdef __DEVICE_EMULATION__
00181 #define myPatchPair (*(patch_pair*)(&pp.i))
00182 #else
00183 #define myPatchPair pp.pp
00184 #endif
00185 __shared__ union {
00186 #ifndef __DEVICE_EMULATION__
00187 patch_pair pp;
00188 #endif
00189 unsigned int i[PATCH_PAIR_SIZE];
00190 } pp;
00191 if ( tx < PATCH_PAIR_USED ) {
00192 unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00193 pp.i[tx] = tmp;
00194 }
00195
00196
00197
00198 #ifdef __DEVICE_EMULATION__
00199 #define jatoms ((atom*)(jpqu.i))
00200 #else
00201 #define jatoms jpqu.d
00202 #endif
00203 __shared__ union {
00204 #ifndef __DEVICE_EMULATION__
00205 atom d[SHARED_SIZE];
00206 #endif
00207 int i[4*SHARED_SIZE];
00208 float f[4*SHARED_SIZE];
00209 } jpqu;
00210
00211
00212 if ( tx == 0 ) {
00213 float offx = myPatchPair.offset.x * lata.x
00214 + myPatchPair.offset.y * latb.x
00215 + myPatchPair.offset.z * latc.x;
00216 float offy = myPatchPair.offset.x * lata.y
00217 + myPatchPair.offset.y * latb.y
00218 + myPatchPair.offset.z * latc.y;
00219 float offz = myPatchPair.offset.x * lata.z
00220 + myPatchPair.offset.y * latb.z
00221 + myPatchPair.offset.z * latc.z;
00222 myPatchPair.offset.x = offx;
00223 myPatchPair.offset.y = offy;
00224 myPatchPair.offset.z = offz;
00225 }
00226 __syncthreads();
00227
00228 int patch1size = myPatchPair.patch1_size;
00229 int patch2size = myPatchPair.patch2_size;
00230
00231
00232 for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00233
00234
00235 atom atomi;
00236
00237 int i;
00238
00239
00240 if ( blocki + tx < patch1size ) {
00241 i = myPatchPair.patch1_atom_start + blocki + tx;
00242 float4 tmpa = ((float4*)atoms)[i];
00243 atomi.position.x = tmpa.x + myPatchPair.offset.x;
00244 atomi.position.y = tmpa.y + myPatchPair.offset.y;
00245 atomi.position.z = tmpa.z + myPatchPair.offset.z;
00246 atomi.charge = intRad0[i];
00247
00248 }
00249
00250
00251 GBReal psiSumI = 0.f;
00252
00253
00254 for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00255
00256 int shared_size = patch2size - blockj;
00257 if ( shared_size > SHARED_SIZE )
00258 shared_size = SHARED_SIZE;
00259 __syncthreads();
00260
00261
00262
00263 int j = myPatchPair.patch2_atom_start + blockj;
00264 if ( tx < 4 * shared_size ) {
00265 jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00266 }
00267 __syncthreads();
00268
00269
00270 if ( tx < shared_size ) {
00271 jatoms[tx].charge = intRadS[j+tx];
00272
00273 }
00274 __syncthreads();
00275
00276 if ( blocki + tx < patch1size ) {
00277
00278 for ( int j = 0; j < shared_size; ++j ) {
00279 float dx = atomi.position.x - jatoms[j].position.x;
00280 float dy = atomi.position.y - jatoms[j].position.y;
00281 float dz = atomi.position.z - jatoms[j].position.z;
00282 float r2 = dx*dx + dy*dy + dz*dz;
00283
00284
00285 if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00286
00287
00288
00289
00290 float r_i = 1.f / sqrt(r2);
00291 float r = r2 * r_i;
00292 float hij;
00293 int dij;
00294 CalcH(r,r2,r_i,a_cut,atomi.charge,jatoms[j].charge,hij,dij);
00295
00296 psiSumI += hij;
00297
00298 }
00299 }
00300 }
00301 }
00302
00303
00304
00305 if ( blocki + tx < myPatchPair.patch1_force_size) {
00306 int i_out = myPatchPair.patch1_force_start + blocki + tx;
00307 psiSumBuffers[i_out] = psiSumI;
00308 }
00309 }
00310
00311 {
00312
00313 __threadfence();
00314 __syncthreads();
00315 __shared__ bool doSum;
00316
00317 if (threadIdx.x == 0) {
00318 int fli = myPatchPair.patch1_force_list_index;
00319 int fls = myPatchPair.patch1_force_list_size;
00320 int old = atomicInc(P1_counters+fli,fls-1);
00321 doSum = ( old == fls - 1 );
00322 }
00323
00324 __syncthreads();
00325
00326 if ( doSum ) {
00327 GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00328 force_lists, psiSumBuffers, psiSum);
00329 }
00330 }
00331 }
00332
00333
00334
00335
00336
00337
00338 __global__ static void GBIS_P2_Kernel (
00339 const patch_pair *patch_pairs,
00340 const atom *atoms,
00341 const atom_param *atom_params,
00342 const float *bornRad,
00343 GBReal *dEdaSumBuffers,
00344 GBReal *dEdaSum,
00345 const float a_cut,
00346 const float r_cut,
00347 const float scaling,
00348 const float kappa,
00349 const float smoothDist,
00350 const float epsilon_p,
00351 const float epsilon_s,
00352 float3 lata,
00353 float3 latb,
00354 float3 latc,
00355 const int doEnergy,
00356 const int doFullElec,
00357 const force_list *force_lists,
00358 float4 *forceBuffers,
00359 float4 *forces,
00360 float *energyBuffers,
00361 float *energy,
00362 unsigned int *P2_counters
00363 ) {
00364 int tx = threadIdx.x;
00365 int bx = blockIdx.x;
00366 #ifdef __DEVICE_EMULATION__
00367
00368 #endif
00369
00370
00371
00372
00373
00374
00375
00376 #ifdef __DEVICE_EMULATION__
00377 #define myPatchPair (*(patch_pair*)(&pp.i))
00378 #else
00379 #define myPatchPair pp.pp
00380 #endif
00381 __shared__ union {
00382 #ifndef __DEVICE_EMULATION__
00383 patch_pair pp;
00384 #endif
00385 unsigned int i[PATCH_PAIR_SIZE];
00386 } pp;
00387 if ( tx < PATCH_PAIR_USED ) {
00388 unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00389 pp.i[tx] = tmp;
00390 }
00391
00392
00393
00394 #ifdef __DEVICE_EMULATION__
00395 #define jatoms ((atom*)(jpqu.i))
00396 #else
00397 #define jatoms jpqu.d
00398 #endif
00399 __shared__ union {
00400 #ifndef __DEVICE_EMULATION__
00401 atom d[SHARED_SIZE];
00402 #endif
00403 int i[4*SHARED_SIZE];
00404 float f[4*SHARED_SIZE];
00405 } jpqu;
00406
00407 __shared__ float jBornRad[SHARED_SIZE];
00408 float energyT = 0.f;
00409
00410
00411 if ( tx == 0 ) {
00412 float offx = myPatchPair.offset.x * lata.x
00413 + myPatchPair.offset.y * latb.x
00414 + myPatchPair.offset.z * latc.x;
00415 float offy = myPatchPair.offset.x * lata.y
00416 + myPatchPair.offset.y * latb.y
00417 + myPatchPair.offset.z * latc.y;
00418 float offz = myPatchPair.offset.x * lata.z
00419 + myPatchPair.offset.y * latb.z
00420 + myPatchPair.offset.z * latc.z;
00421 myPatchPair.offset.x = offx;
00422 myPatchPair.offset.y = offy;
00423 myPatchPair.offset.z = offz;
00424 }
00425 __syncthreads();
00426
00427 int patch1size = myPatchPair.patch1_size;
00428 int patch2size = myPatchPair.patch2_size;
00429
00430
00431 float r_cut2 = r_cut*r_cut;
00432 float r_cut_2 = 1.f / r_cut2;
00433 float r_cut_4 = 4.f*r_cut_2*r_cut_2;
00434 float epsilon_s_i = 1.f / epsilon_s;
00435 float epsilon_p_i = 1.f / epsilon_p;
00436
00437
00438 for ( int blocki = 0; blocki < patch1size; blocki += BLOCK_SIZE ) {
00439
00440
00441 atom atomi;
00442 float bornRadI;
00443
00444 int i;
00445
00446
00447 if ( blocki + tx < patch1size ) {
00448 i = myPatchPair.patch1_atom_start + blocki + tx;
00449 float4 tmpa = ((float4*)atoms)[i];
00450 atomi.position.x = tmpa.x + myPatchPair.offset.x;
00451 atomi.position.y = tmpa.y + myPatchPair.offset.y;
00452 atomi.position.z = tmpa.z + myPatchPair.offset.z;
00453 atomi.charge = - tmpa.w * scaling;
00454 bornRadI = bornRad[i];
00455
00456 }
00457
00458
00459 GBReal dEdaSumI = 0.f;
00460 float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00461
00462
00463 for ( int blockj = 0; blockj < patch2size; blockj += SHARED_SIZE ) {
00464
00465 int shared_size = patch2size - blockj;
00466 if ( shared_size > SHARED_SIZE )
00467 shared_size = SHARED_SIZE;
00468 __syncthreads();
00469
00470
00471 int j = myPatchPair.patch2_atom_start + blockj;
00472 if ( tx < 4 * shared_size ) {
00473 jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00474 }
00475 __syncthreads();
00476
00477
00478 if ( tx < shared_size ) {
00479 jBornRad[tx] = bornRad[j+tx];
00480
00481 }
00482 __syncthreads();
00483
00484 if ( blocki + tx < patch1size ) {
00485
00486 for ( int j = 0; j < shared_size; ++j ) {
00487 float dx = atomi.position.x - jatoms[j].position.x;
00488 float dy = atomi.position.y - jatoms[j].position.y;
00489 float dz = atomi.position.z - jatoms[j].position.z;
00490 float r2 = dx*dx + dy*dy + dz*dz;
00491
00492
00493 if (r2 < r_cut*r_cut && r2 > 0.01f) {
00494
00495
00496
00497 float r_i = 1.f / sqrt(r2);
00498 float r = r2 * r_i;
00499 float bornRadJ = jBornRad[j];
00500
00501
00502 float qiqj = atomi.charge*jatoms[j].charge;
00503 float aiaj = bornRadI*bornRadJ;
00504 float aiaj4 = 4*aiaj;
00505 float expr2aiaj4 = exp(-r2/aiaj4);
00506 float fij = sqrt(r2+aiaj*expr2aiaj4);
00507 float f_i = 1/fij;
00508 float expkappa = exp(-kappa*fij);
00509 float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00510 float gbEij = qiqj*Dij*f_i;
00511
00512
00513 float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
00514 float ddrf_i = -ddrfij*f_i*f_i;
00515 float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
00516 float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
00517
00518
00519 float scale = 1.f;
00520 float ddrScale = 0.f;
00521 float forcedEdr;
00522 if (smoothDist > 0.f) {
00523 scale = r2 * r_cut_2 - 1.f;
00524 scale *= scale;
00525 ddrScale = r*(r2-r_cut2)*r_cut_4;
00526 energyT += gbEij * scale * 0.5f;
00527
00528 forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
00529 } else {
00530 energyT += gbEij * 0.5f;
00531
00532 forcedEdr = -ddrGbEij;
00533 }
00534
00535
00536 float dEdai = 0.f;
00537 if (doFullElec) {
00538
00539 dEdai = 0.5f*qiqj*f_i*f_i
00540 *(kappa*epsilon_s_i*expkappa-Dij*f_i)
00541 *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;
00542 dEdaSumI += dEdai;
00543 }
00544
00545 forcedEdr *= r_i;
00546 forceI.x += dx*forcedEdr;
00547 forceI.y += dy*forcedEdr;
00548 forceI.z += dz*forcedEdr;
00549
00550
00551 }
00552 if (r2 < 0.01f) {
00553
00554 if (doEnergy) {
00555 float fij = bornRadI;
00556 float expkappa = exp(-kappa*fij);
00557 float Dij = epsilon_p_i - expkappa*epsilon_s_i;
00558 float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
00559 energyT += 0.5f*gbEij;
00560 }
00561 }
00562 }
00563 }
00564 }
00565
00566
00567
00568 if ( blocki + tx < myPatchPair.patch1_force_size) {
00569 int i_out = myPatchPair.patch1_force_start + blocki + tx;
00570 dEdaSumBuffers[i_out] = dEdaSumI;
00571 forceBuffers[i_out] = forceI;
00572 }
00573 }
00574
00575
00576
00577 if (doEnergy) {
00578 const int i_out = myPatchPair.virial_start / 16;
00579 __shared__ float energySh[BLOCK_SIZE];
00580 energySh[tx] = energyT;
00581 __syncthreads();
00582
00583 for ( unsigned int b = blockDim.x/2; b > 0 ; b >>= 1 ) {
00584 if ( tx < b ) {
00585
00586 energySh[tx] += energySh[tx+b];
00587 }
00588 __syncthreads();
00589 }
00590
00591 if (tx == 0) {
00592 energyBuffers[i_out] = energySh[0];
00593
00594 }
00595 }
00596
00597 {
00598
00599 __threadfence();
00600 __syncthreads();
00601
00602 __shared__ bool doSum;
00603
00604 if (threadIdx.x == 0) {
00605 int fli = myPatchPair.patch1_force_list_index;
00606 int fls = myPatchPair.patch1_force_list_size;
00607 int old = atomicInc(P2_counters+fli,fls-1);
00608 doSum = ( old == fls - 1 );
00609 }
00610
00611 __syncthreads();
00612
00613 if ( doSum ) {
00614 GBIS_Atom_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00615 force_lists, dEdaSumBuffers, dEdaSum);
00616 GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00617 force_lists, forceBuffers, forces);
00618 if ( doEnergy ) {
00619 GBIS_Energy_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00620 force_lists, energyBuffers, energy);
00621 }
00622 }
00623 }
00624 }
00625
00626
00627
00628
00629
00630
00631
00632 __global__ static void GBIS_P3_Kernel (
00633 const patch_pair *patch_pairs,
00634 const atom *atoms,
00635 const atom_param *atom_params,
00636 const float *intRad0,
00637 const float *intRadS,
00638 const float *dHdrPrefix,
00639 const float a_cut,
00640 const float rho_0,
00641 const float scaling,
00642 float3 lata,
00643 float3 latb,
00644 float3 latc,
00645 const force_list *force_lists,
00646 float4 *forceBuffers,
00647 float4 *forces,
00648 unsigned int *P3_counters
00649 ) {
00650 int tx = threadIdx.x;
00651 int bx = blockIdx.x;
00652 #ifdef __DEVICE_EMULATION__
00653
00654 #endif
00655
00656 #ifdef __DEVICE_EMULATION__
00657 #define myPatchPair (*(patch_pair*)(&pp.i))
00658 #else
00659 #define myPatchPair pp.pp
00660 #endif
00661 __shared__ union {
00662 #ifndef __DEVICE_EMULATION__
00663 patch_pair pp;
00664 #endif
00665 unsigned int i[PATCH_PAIR_SIZE];
00666 } pp;
00667 if ( tx < PATCH_PAIR_USED ) {
00668 unsigned int tmp = ((unsigned int*)patch_pairs)[PATCH_PAIR_SIZE*bx+tx];
00669 pp.i[tx] = tmp;
00670 }
00671
00672
00673 #ifdef __DEVICE_EMULATION__
00674 #define jatoms ((atom*)(jpqu.i))
00675 #else
00676 #define jatoms jpqu.d
00677 #endif
00678 __shared__ union {
00679 #ifndef __DEVICE_EMULATION__
00680 atom d[SHARED_SIZE];
00681 #endif
00682 int i[4*SHARED_SIZE];
00683 float f[4*SHARED_SIZE];
00684 } jpqu;
00685
00686 __shared__ float jDHdrPrefix[SHARED_SIZE];
00687 __shared__ float intRadJ0[SHARED_SIZE];
00688
00689
00690 if ( tx == 0 ) {
00691 float offx = myPatchPair.offset.x * lata.x
00692 + myPatchPair.offset.y * latb.x
00693 + myPatchPair.offset.z * latc.x;
00694 float offy = myPatchPair.offset.x * lata.y
00695 + myPatchPair.offset.y * latb.y
00696 + myPatchPair.offset.z * latc.y;
00697 float offz = myPatchPair.offset.x * lata.z
00698 + myPatchPair.offset.y * latb.z
00699 + myPatchPair.offset.z * latc.z;
00700 myPatchPair.offset.x = offx;
00701 myPatchPair.offset.y = offy;
00702 myPatchPair.offset.z = offz;
00703 }
00704 __syncthreads();
00705
00706
00707 for ( int blocki = 0; blocki < myPatchPair.patch1_size; blocki += BLOCK_SIZE ) {
00708
00709 atom atomi;
00710 float intRadIS;
00711
00712 int i;
00713 float dHdrPrefixI;
00714
00715
00716 if ( blocki + tx < myPatchPair.patch1_size ) {
00717 i = myPatchPair.patch1_atom_start + blocki + tx;
00718 float4 tmpa = ((float4*)atoms)[i];
00719 atomi.position.x = tmpa.x + myPatchPair.offset.x;
00720 atomi.position.y = tmpa.y + myPatchPair.offset.y;
00721 atomi.position.z = tmpa.z + myPatchPair.offset.z;
00722 atomi.charge = intRad0[i];
00723 intRadIS = intRadS[i];
00724
00725 dHdrPrefixI = dHdrPrefix[i];
00726 }
00727
00728
00729 float4 forceI; forceI.x = forceI.y = forceI.z = forceI.w = 0.f;
00730
00731
00732 for ( int blockj = 0; blockj < myPatchPair.patch2_size; blockj += SHARED_SIZE ) {
00733
00734 int shared_size = myPatchPair.patch2_size - blockj;
00735 if ( shared_size > SHARED_SIZE )
00736 shared_size = SHARED_SIZE;
00737 __syncthreads();
00738
00739
00740
00741 int j = myPatchPair.patch2_atom_start + blockj;
00742 if ( tx < 4 * shared_size ) {
00743 jpqu.i[tx] = ((unsigned int *)(atoms + j))[tx];
00744 }
00745 __syncthreads();
00746
00747
00748 if ( tx < shared_size ) {
00749 jatoms[tx].charge = intRadS[j+tx];
00750 jDHdrPrefix[tx] = dHdrPrefix[j+tx];
00751 intRadJ0[tx] = intRad0[j+tx];
00752 }
00753 __syncthreads();
00754
00755 if ( blocki + tx < myPatchPair.patch1_size ) {
00756
00757 for ( int j = 0; j < shared_size; ++j ) {
00758 float dx = atomi.position.x - jatoms[j].position.x;
00759 float dy = atomi.position.y - jatoms[j].position.y;
00760 float dz = atomi.position.z - jatoms[j].position.z;
00761 float r2 = dx*dx + dy*dy + dz*dz;
00762
00763
00764 if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
00765
00766
00767
00768 float r_i = 1.f / sqrt(r2);
00769 float r = r2 * r_i;
00770 float dHdrPrefixJ = jDHdrPrefix[j];
00771 float dhij, dhji;
00772 int dij, dji;
00773 CalcDH(r,r2,r_i,a_cut,atomi.charge,
00774 jatoms[j].charge,dhij,dij);
00775 CalcDH(r,r2,r_i,a_cut,intRadJ0[j],
00776 intRadIS,dhji,dji);
00777
00778 float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
00779 forceI.x += dx * forceAlpha;
00780 forceI.y += dy * forceAlpha;
00781 forceI.z += dz * forceAlpha;
00782 }
00783 }
00784 }
00785 }
00786
00787
00788 if ( blocki + tx < myPatchPair.patch1_force_size) {
00789 int i_out = myPatchPair.patch1_force_start + blocki + tx;
00790 forceBuffers[i_out] = forceI;
00791 }
00792 }
00793
00794 {
00795
00796 __threadfence();
00797 __syncthreads();
00798 __shared__ bool doSum;
00799
00800 if (threadIdx.x == 0) {
00801 int fli = myPatchPair.patch1_force_list_index;
00802 int fls = myPatchPair.patch1_force_list_size;
00803 int old = atomicInc(P3_counters+fli,fls-1);
00804 doSum = ( old == fls - 1 );
00805 }
00806 __syncthreads();
00807
00808 if ( doSum ) {
00809 GBIS_Force_Reduction_Kernel(myPatchPair.patch1_force_list_index,
00810 force_lists, forceBuffers, forces);
00811 }
00812 }
00813 }