__shared__ patch_pair sh_patch_pair;
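// Per-warp staging buffers for j-atom data. The sh_jpq/sh_jBornRad arrays are
// only needed when the Kepler warp-shuffle path is not compiled in; the force
// and dEda accumulators for j-atoms always live in shared memory.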
#ifndef KEPLER_SHUFFLE
__shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
__shared__ float sh_jBornRad_2d[NUM_WARP][WARPSIZE];
#endif
__shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];
__shared__ float sh_dEdaSumJ_2d[NUM_WARP][WARPSIZE];
#ifndef KEPLER_SHUFFLE
volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
volatile float* sh_jBornRad = sh_jBornRad_2d[threadIdx.y];
#endif
volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
volatile float* sh_dEdaSumJ = sh_dEdaSumJ_2d[threadIdx.y];
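// Cooperatively copy this block's patch_pair record from global into shared
// memory, one int per thread (PATCH_PAIR_SIZE is the record size in ints).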
const int t = threadIdx.x + threadIdx.y*WARPSIZE;
if (t < PATCH_PAIR_SIZE) {
  int* src = (int *)&patch_pairs[blockIdx.x];
  int* dst = (int *)&sh_patch_pair;
  dst[t] = src[t];  // element-wise copy, restored from context
}
__syncthreads();    // restored: sh_patch_pair is read by all threads below
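// Convert the patch offset from scaled lattice coordinates to Cartesian
// coordinates using the lattice vectors lata, latb, latc.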
if (t == 0) {  // guard restored: one thread updates the shared offset
  float offx = sh_patch_pair.offset.x * lata.x
             + sh_patch_pair.offset.y * latb.x
             + sh_patch_pair.offset.z * latc.x;
  float offy = sh_patch_pair.offset.x * lata.y
             + sh_patch_pair.offset.y * latb.y
             + sh_patch_pair.offset.z * latc.y;
  float offz = sh_patch_pair.offset.x * lata.z
             + sh_patch_pair.offset.y * latb.z
             + sh_patch_pair.offset.z * latc.z;
  sh_patch_pair.offset.x = offx;
  sh_patch_pair.offset.y = offy;
  sh_patch_pair.offset.z = offz;
}
__syncthreads();  // restored: all threads read the converted offset below
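// Constants for the inner loop: r_cut_2 = 1/r_cut^2 and r_cut_4 = 4/r_cut^4
// feed the cutoff smoothing function; epsilon_s_i and epsilon_p_i are the
// inverse solvent and protein dielectric constants.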
float r_cut2 = r_cut*r_cut;
float r_cut_2 = 1.f / r_cut2;
float r_cut_4 = 4.f*r_cut_2*r_cut_2;
float epsilon_s_i = 1.f / epsilon_s;
float epsilon_p_i = 1.f / epsilon_p;
float energyT = 0.f;  // per-thread energy accumulator, restored from context
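// Outer loop over i-atoms of patch 1: each warp (threadIdx.y) takes a
// WARPSIZE-wide block and the whole thread block strides by BLOCK_SIZE.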
for ( int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += BLOCK_SIZE ) {

  int nloopi = sh_patch_pair.patch1_size - blocki;
  // i-atom registers (declarations restored from context)
  atom atomi;
  float bornRadI;
  int i;
  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
    i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
    float4 tmpa = ((float4*)atoms)[i];
    atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
    atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
    atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
    atomi.charge = - tmpa.w * scaling;
    bornRadI = bornRad[i];
  }
  // per-i accumulators, restored from context (flushed to global memory below)
  float3 forceI = make_float3(0.f, 0.f, 0.f);
  float dEdaSumI = 0.f;
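  // A patch paired with itself under zero image offset is "diagonal": the
  // j loop can then start at blocki so each tile of pairs is visited once.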
  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
    (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
  int blockj = (diag_patch_pair) ? blocki : 0;
  for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE) {

    int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
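    // Load one WARPSIZE-wide tile of j-atoms: into registers when the
    // warp-shuffle path is enabled, otherwise into the shared-memory buffers.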
#ifdef KEPLER_SHUFFLE
    float xj, yj, zj, chargej, bornRadJ;  // declarations restored from context
#endif
    if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
      int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
      float4 tmpa = ((float4*)atoms)[j];
#ifdef KEPLER_SHUFFLE
      xj = tmpa.x; yj = tmpa.y; zj = tmpa.z; chargej = tmpa.w;  // restored
      bornRadJ = bornRad[j];
#else
      sh_jpq[threadIdx.x].position.x = tmpa.x;
      sh_jpq[threadIdx.x].position.y = tmpa.y;
      sh_jpq[threadIdx.x].position.z = tmpa.z;
      sh_jpq[threadIdx.x].charge = tmpa.w;
      sh_jBornRad[threadIdx.x] = bornRad[j];
#endif
    }
    sh_forceJ[threadIdx.x].x = 0.f;
    sh_forceJ[threadIdx.x].y = 0.f;
    sh_forceJ[threadIdx.x].z = 0.f;
    sh_dEdaSumJ[threadIdx.x] = 0.f;
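    // Each lane walks the tile with a staggered index j = (t + lane) mod modval.
    // On a diagonal tile modval is doubled, so roughly half the visits fail the
    // j < nloopj test below and every unordered pair is computed exactly once.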
    const bool diag_tile = diag_patch_pair && (blocki == blockj);
    const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;  // restored from context
    for (int t = 0; t < WARPSIZE; ++t) {  // inner-loop header restored
      int j = (t + threadIdx.x) % modval;
#ifndef KEPLER_SHUFFLE
      float xj = sh_jpq[j].position.x;
      float yj = sh_jpq[j].position.y;
      float zj = sh_jpq[j].position.z;
      float chargej = sh_jpq[j].charge;
      float bornRadJ = sh_jBornRad[j];
#endif
      if (j < nloopj && threadIdx.x < nloopi)
      {
        float dx = atomi.position.x - xj;
        float dy = atomi.position.y - yj;
        float dz = atomi.position.z - zj;
        float r2 = dx*dx + dy*dy + dz*dz;
        // pair within cutoff and not coincident (r2 > 0.01f guards i == j)
        if (r2 < r_cut2 && r2 > 0.01f) {

          float r_i = 1.f / sqrt(r2);
          float r = r2 * r_i;  // restored: r feeds the dE/dr and smoothing terms
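          // Generalized Born pair energy with Still's interpolation:
          //   f_ij = sqrt(r^2 + a_i a_j exp(-r^2 / (4 a_i a_j)))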
          float qiqj = atomi.charge*chargej;
          float aiaj = bornRadI*bornRadJ;
          float aiaj4 = 4*aiaj;
          float expr2aiaj4 = exp(-r2/aiaj4);
          float fij = sqrt(r2+aiaj*expr2aiaj4);
          float f_i = 1.f/fij;  // restored: 1/f_ij is used in every term below
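          // Debye-screened dielectric term:
          //   D_ij = 1/eps_p - exp(-kappa f_ij)/eps_s,  E_ij = q_i q_j D_ij / f_ij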
          float expkappa = exp(-kappa*fij);
          float Dij = epsilon_p_i - expkappa*epsilon_s_i;
          float gbEij = qiqj*Dij*f_i;

          float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
          float ddrf_i = -ddrfij*f_i*f_i;
          float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
          float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
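          // NAMD smoothing function near the cutoff:
          //   scale = (r^2/r_cut^2 - 1)^2,  ddrScale = d(scale)/dr = 4 r (r^2 - r_cut^2)/r_cut^4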
          float scale = 1.f;   // scale and forcedEdr declarations restored
          float ddrScale = 0.f;
          float forcedEdr;
          if (smoothDist > 0.f) {
            scale = r2 * r_cut_2 - 1.f;
            scale *= scale;    // restored: the squaring is implied by ddrScale
            ddrScale = r*(r2-r_cut2)*r_cut_4;
            energyT += gbEij * scale;
            forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
          } else {
            energyT += gbEij;
            forcedEdr = -ddrGbEij;
          }
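          // Derivatives of the pair energy with respect to the Born radii,
          // accumulated into per-atom dE/da sums for the Born-radius force pass.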
          float dEdai = 0.5f*qiqj*f_i*f_i
            *(kappa*epsilon_s_i*expkappa-Dij*f_i)
            *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;
          dEdaSumI += dEdai;  // restored: the i-sum is flushed by atomicAdd below
          float dEdaj = 0.5f*qiqj*f_i*f_i
            *(kappa*epsilon_s_i*expkappa-Dij*f_i)
            *(aiaj+0.25f*r2)*expr2aiaj4/bornRadJ*scale;
          sh_dEdaSumJ[j] += dEdaj;
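          // Equal and opposite force on i (registers) and j (shared accumulator)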
          float tmpx = dx*forcedEdr;
          float tmpy = dy*forcedEdr;
          float tmpz = dz*forcedEdr;
          forceI.x += tmpx;  // i-side accumulation restored from context
          forceI.y += tmpy;
          forceI.z += tmpz;
          sh_forceJ[j].x -= tmpx;
          sh_forceJ[j].y -= tmpy;
          sh_forceJ[j].z -= tmpz;
        } // end if within cutoff
        // Self energy of atom i: on a diagonal tile t == 0 gives j == threadIdx.x (guard restored by inference)
        if (diag_tile && t == 0) {
          float fij = bornRadI; // f_ij reduces to the Born radius when i == j
          float expkappa = exp(-kappa*fij);
          float Dij = epsilon_p_i - expkappa*epsilon_s_i;
          float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij; // undo the charge pre-scaling
          energyT += 0.5f*gbEij;
        }
      } // end if j < nloopj
#ifdef KEPLER_SHUFFLE
      // Rotate the j-atom registers one lane per iteration (restored; the
      // threadIdx.x+1 source lane and WARP_FULL_MASK are assumptions)
      xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, threadIdx.x+1, WARPSIZE);
      yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, threadIdx.x+1, WARPSIZE);
      zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, threadIdx.x+1, WARPSIZE);
      chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, threadIdx.x+1, WARPSIZE);
      bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, threadIdx.x+1, WARPSIZE);
#endif
    } // end inner loop over t

    // Flush this tile's j-atom accumulators to the global scratch buffers
    if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
      int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
      atomicAdd(&tmp_dEdaSum[i_out], sh_dEdaSumJ[threadIdx.x]);
      atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
      atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
      atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
    }
  } // end loop over blockj
  // Flush the per-i accumulators for this warp's block of patch 1 atoms
  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
    int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
    atomicAdd(&tmp_dEdaSum[i_out], dEdaSumI);
    atomicAdd(&tmp_forces[i_out].x, forceI.x);
    atomicAdd(&tmp_forces[i_out].y, forceI.y);
    atomicAdd(&tmp_forces[i_out].z, forceI.z);
  }
} // end loop over blocki
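// Reduce the per-thread energies: first within each warp (reusing the
// sh_forceJ_2d storage), then across warps by thread (0,0).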
volatile float* sh_energy = (float *)&sh_forceJ_2d[threadIdx.y][0].x;
sh_energy[threadIdx.x] = (float)energyT;
for (int d = 1; d < WARPSIZE; d *= 2) {  // reduction loop header restored
  int pos = threadIdx.x + d;
  float val = (pos < WARPSIZE) ? sh_energy[pos] : 0.0f;
  sh_energy[threadIdx.x] += val;
}
__syncthreads();  // restored: expose each warp's partial sum to thread (0,0)
if (threadIdx.x == 0 && threadIdx.y == 0) {
  float tot_energy = 0.0f;
  for (int i=0;i < NUM_WARP;++i) {
    tot_energy += ((float *)&sh_forceJ_2d[i][0].x)[0];
  }
  int patch1_ind = sh_patch_pair.patch1_ind;
  atomicAdd(&tmp_energy[patch1_ind], (float)tot_energy);
}
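// Patch-pair bookkeeping: atomicInc-based counters detect the last block to
// finish each patch, so exactly one pass copies its results out of scratch.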
int patch1_ind = sh_patch_pair.patch1_ind;
int patch2_ind = sh_patch_pair.patch2_ind;

if (threadIdx.x == 0 && threadIdx.y == 0) {
  sh_patch_pair.patch_done[0] = false;
  sh_patch_pair.patch_done[1] = false;
  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
  int patch1_old = atomicInc(&P2_counters[patch1_ind], patch1_num_pairs-1);
  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;

  if (patch1_ind != patch2_ind) {
    unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
    int patch2_old = atomicInc(&P2_counters[patch2_ind], patch2_num_pairs-1);
    if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
  }
}
__syncthreads();  // restored: broadcast the patch_done flags to all threads
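// If this block finished a patch, promote its results from the scratch
// buffers (tmp_*) to the final output arrays.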
if (sh_patch_pair.patch_done[0]) {
  const int start = sh_patch_pair.patch1_start;
  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
    forces[start+i] = tmp_forces[start+i];
    dEdaSum[start+i] = tmp_dEdaSum[start+i];
  }
  energy[patch1_ind] = tmp_energy[patch1_ind];
}
if (sh_patch_pair.patch_done[1]) {
  const int start = sh_patch_pair.patch2_start;
  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
    forces[start+i] = tmp_forces[start+i];
    dEdaSum[start+i] = tmp_dEdaSum[start+i];
  }
  energy[patch2_ind] = tmp_energy[patch2_ind];
}
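// A memory fence makes the promoted results visible device- and system-wide
// before any dependent work observes the completion counters.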
if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
#if __CUDA_ARCH__ < 200
  __threadfence();         // restored: __threadfence_system() requires sm_20+
#else
  __threadfence_system();
#endif
}
// WARP_SHUFFLE maps onto the CUDA shuffle intrinsics; the completion below is
// a sketch assuming the CUDA 9+ *_sync variant when the runtime supports it.
#if CUDART_VERSION >= 9000
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) __shfl_sync(MASK, VAR, LANE, SIZE)
#else
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE) __shfl(VAR, LANE, SIZE)
#endif