__shared__ patch_pair sh_patch_pair;
#ifndef KEPLER_SHUFFLE
SLOW( float totales = 0.f; )
#ifndef KEPLER_SHUFFLE
__shared__ unsigned int sh_plist_val[NUM_WARP];);
const int t = threadIdx.x + threadIdx.y*WARPSIZE;
float *p = (float *)sh_iforcesum;
p[threadIdx.x] = 0.0f;
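// Cooperatively copy this block's patch_pair record from global memory into shared memory (as raw ints via src/dst below).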
int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
int* dst = (int *)&sh_patch_pair;
USEPAIRLIST( if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
if (t < sh_patch_pair.plist_size)
  plist[sh_patch_pair.plist_start + t] = 0;
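// Convert the patch-pair offset from lattice (fractional) coordinates to Cartesian using the lattice vectors lata, latb, latc.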
float offx = sh_patch_pair.offset.x * lata.x
           + sh_patch_pair.offset.y * latb.x
           + sh_patch_pair.offset.z * latc.x;
float offy = sh_patch_pair.offset.x * lata.y
           + sh_patch_pair.offset.y * latb.y
           + sh_patch_pair.offset.z * latc.y;
float offz = sh_patch_pair.offset.x * lata.z
           + sh_patch_pair.offset.y * latb.z
           + sh_patch_pair.offset.z * latc.z;
sh_patch_pair.offset.x = offx;
sh_patch_pair.offset.y = offy;
sh_patch_pair.offset.z = offz;
volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
SLOW( volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
#ifndef KEPLER_SHUFFLE
atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
USEPAIRLIST( int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
  int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
  float4 tmpa = ((float4*)atoms)[i];
  ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
  ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
  ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
  iap.index = tmpap.y;);
SLOW(float3 iforce_slow;
  iforce_slow.x = 0.0f;
  iforce_slow.y = 0.0f;
  iforce_slow.z = 0.0f;)
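// For a self (diagonal) patch pair, start the j loop at blocki so each atom pair is visited only once.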
const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
int blockj = (diag_patch_pair) ? blocki : 0;
for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
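  // USEPAIRLIST: each WARPSIZE x WARPSIZE tile owns one bit in plist; tiles whose bit is clear hold no interacting pairs and are skipped.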
  int plist_ind = pos/32;
  unsigned int plist_bit = 1 << (pos % 32);
  if (plist_ind != sh_plist_ind[threadIdx.y]) {
    sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
    sh_plist_ind[threadIdx.y] = plist_ind;
  if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
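  // Skip tiles in which both the i- and j-atoms lie beyond the "free" ranges (all atoms fixed), since they contribute no forces.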
#ifdef KEPLER_SHUFFLE
  if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
  int nfreej = sh_patch_pair.patch2_free_size - blockj;
  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
    int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
    float4 tmpa = ((float4*)atoms)[j];
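    // Load j-atom data: with KEPLER_SHUFFLE it stays in registers and is exchanged via warp shuffles, otherwise it is staged in shared memory (sh_jpq).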
#ifdef KEPLER_SHUFFLE
    jpq.position.x = tmpa.x;
    jpq.position.y = tmpa.y;
    jpq.position.z = tmpa.z;
    sh_jpq[threadIdx.x].position.x = tmpa.x;
    sh_jpq[threadIdx.x].position.y = tmpa.y;
    sh_jpq[threadIdx.x].position.z = tmpa.z;
    sh_jpq[threadIdx.x].charge = tmpa.w;
#ifdef KEPLER_SHUFFLE
  SLOW(float3 jforce_slow;
    jforce_slow.x = 0.0f;
    jforce_slow.y = 0.0f;
    jforce_slow.z = 0.0f;
  sh_jforce[threadIdx.x].x = 0.0f;
  sh_jforce[threadIdx.x].y = 0.0f;
  sh_jforce[threadIdx.x].z = 0.0f;
  SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
    sh_jforce_slow[threadIdx.x].y = 0.0f;
    sh_jforce_slow[threadIdx.x].z = 0.0f;)
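  // Per-tile j-force accumulators: kept in registers when REG_JFORCE is defined, otherwise in the shared-memory slots zeroed above.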
  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
  unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
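  // USEPAIRLIST: reload the exclusion mask that was stored for this tile when the pairlist was generated.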
  int nloopi = sh_patch_pair.patch1_size - blocki;
  int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
  const bool diag_tile = diag_patch_pair && (blocki == blockj);
  int t = (diag_tile) ? 1 : 0;
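  // On a diagonal tile start at t = 1 so an atom is not paired with itself; j = (t + threadIdx.x) & modval staggers j across the lanes of the warp.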
#ifdef KEPLER_SHUFFLE
  int j = (t + threadIdx.x) & modval;
#ifdef KEPLER_SHUFFLE
  int j_vdw_type = jap.vdw_type;
  int j_index = jap.index;
  int j_excl_maxdiff = jap.excl_maxdiff;
  int j_excl_index = jap.excl_index;
  float j_charge = jpq.charge;
  int j_vdw_type = jap_vdw_type;
  GENPAIRLIST( if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
#ifndef KEPLER_SHUFFLE
  float tmpx = sh_jpq[j].position.x - ipq.position.x;
  float tmpy = sh_jpq[j].position.y - ipq.position.y;
  float tmpz = sh_jpq[j].position.z - ipq.position.z;
  int j_vdw_type = sh_jap[j].vdw_type;
  int j_index = sh_jap[j].index;
  int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
  int j_excl_index = sh_jap[j].excl_index;
  float j_charge = sh_jpq[j].charge;
  int j_vdw_type = sh_jap_vdw_type[j];
  float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
  bool excluded = false;
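  // Exclusion test: the signed difference of atom indices selects a bit in the per-atom exclusion bitmap (via excl_index / excl_maxdiff).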
  int indexdiff = (int)(iap.index) - j_index;
  if ( abs(indexdiff) <= j_excl_maxdiff) {
    indexdiff += j_excl_index;
    int indexword = ((unsigned int) indexdiff) >> 5;
    excluded = ((indexword & (1<<(indexdiff&31))) != 0);
    if (excluded) nexcluded++;
    if (!excluded) excl |= 0x80000000;
  ENERGY( float rsqrtfr2; );
  float f_slow = ipq.charge * j_charge;
  float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
  float ev = ljab.x * ei.z + ljab.y * ei.y;
  float ee = f_slow * ei.x;
  SLOW( float es = f_slow * ei.w; )
  SLOW( f_slow *= fi.w; )
  SLOW( totales += es; )
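  // Accumulate the pair force along the i-j displacement: add to the i-atom accumulator and subtract from the j-atom accumulator (register or shared-memory variant).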
  sh_jforce[j].x -= fx;
  sh_jforce[j].y -= fy;
  sh_jforce[j].z -= fz;
  float fx_slow = tmpx * f_slow;
  float fy_slow = tmpy * f_slow;
  float fz_slow = tmpz * f_slow;
  iforce_slow.x += fx_slow;
  iforce_slow.y += fy_slow;
  iforce_slow.z += fz_slow;
  jforce_slow.x -= fx_slow;
  jforce_slow.y -= fy_slow;
  jforce_slow.z -= fz_slow;
  sh_jforce_slow[j].x -= fx_slow;
  sh_jforce_slow[j].y -= fy_slow;
  sh_jforce_slow[j].z -= fz_slow;
#ifdef KEPLER_SHUFFLE
  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
    int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
    atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
    atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
    atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
    atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
    atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
    atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
    atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
    atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
  exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
  if (threadIdx.x == 0) {
    int plist_ind = pos/32;
    unsigned int plist_bit = 1 << (pos % 32);
    atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
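  // GENPAIRLIST: save this tile's exclusion mask and mark its bit in the pairlist so later USEPAIRLIST passes can reuse or skip the tile.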
if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
  int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
  atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
  atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
  atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
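// Reduce the i-forces over the warp (shuffles on Kepler, shared memory otherwise) into sh_iforcesum, which feeds the virial computation below.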
#ifdef KEPLER_SHUFFLE
for (int i=WARPSIZE/2;i >= 1;i/=2) {
if (threadIdx.x == 0) {
  sh_iforcesum[threadIdx.y].x += iforce.x;
  sh_iforcesum[threadIdx.y].y += iforce.y;
  sh_iforcesum[threadIdx.y].z += iforce.z;
  sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
  sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
  sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
sh_jforce[threadIdx.x].x = iforce.x;
sh_jforce[threadIdx.x].y = iforce.y;
sh_jforce[threadIdx.x].z = iforce.z;
sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
int pos = threadIdx.x + d;
float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
sh_jforce[threadIdx.x].x += valx;
sh_jforce[threadIdx.x].y += valy;
sh_jforce[threadIdx.x].z += valz;
sh_jforce_slow[threadIdx.x].x += slow_valx;
sh_jforce_slow[threadIdx.x].y += slow_valy;
sh_jforce_slow[threadIdx.x].z += slow_valz;
if (threadIdx.x == 0) {
  sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
  sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
  sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
  sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
  sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
  sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
#define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
__shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
#else // ! REG_JFORCE
#define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
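// Virial contribution: the per-warp force sums are combined and multiplied by the patch-pair offset, then accumulated atomically into the per-patch virial buffers.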
if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
  float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
  float iforcesum = 0.0f;
  for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
  float vx = iforcesum*sh_patch_pair.offset.x;
  float vy = iforcesum*sh_patch_pair.offset.y;
  float vz = iforcesum*sh_patch_pair.offset.z;
  sh_iforcesum[threadIdx.x].x = vx;
  sh_iforcesum[threadIdx.x].y = vy;
  sh_iforcesum[threadIdx.x].z = vz;
if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
  float* sh_virials = (float *)sh_iforcesum;
  int patch1_ind = sh_patch_pair.patch1_ind;
  atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
int patch1_ind = sh_patch_pair.patch1_ind;
int patch2_ind = sh_patch_pair.patch2_ind;
if (threadIdx.x == 0 && threadIdx.y == 0) {
  sh_patch_pair.patch_done[0] = false;
  sh_patch_pair.patch_done[1] = false;
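// Each patch counts completed patch pairs with atomicInc; the block that handles a patch's last pair sets patch_done and becomes responsible for that patch's final reduction.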
unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
if (patch1_ind != patch2_ind) {
  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
  int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
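// For each finished patch, finish_forces_virials (declared at the end of this listing) folds the accumulated tmpforces and virials into the final output arrays.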
if (sh_patch_pair.patch_done[0]) {
#ifndef KEPLER_SHUFFLE
  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
  volatile float* sh_slow_buf = NULL;
  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
  patch1_ind, atoms, sh_buf,
#ifndef KEPLER_SHUFFLE
if (sh_patch_pair.patch_done[1]) {
#ifndef KEPLER_SHUFFLE
  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
  volatile float* sh_slow_buf = NULL;
  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
  patch2_ind, atoms, sh_buf,
#ifndef KEPLER_SHUFFLE
if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
#if __CUDA_ARCH__ < 200
  __threadfence_system();
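  // The fence makes the forces written above visible before this patch is reported as ready on force_ready_queue.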
if (threadIdx.x == 0 && threadIdx.y == 0) {
  if (sh_patch_pair.patch_done[0]) {
  if (sh_patch_pair.patch_done[1]) {
#if __CUDA_ARCH__ < 200
__threadfence_system();
if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
// File-scope declarations and macros referenced by the kernel above:
static __thread int *block_order;
__device__ static __forceinline__ void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind, const atom *atoms, volatile float *sh_buf, volatile float *sh_slow_buf, volatile float *sh_vcc, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials);
static __thread int lj_table_size;
static __thread atom *atoms;
static __thread float4 *forces;
static __thread unsigned int *plist;
static __thread unsigned int *overflow_exclusions;
static __thread float *slow_tmpvirials;
static __thread float4 *slow_forces;
static __thread float *slow_virials;
static __thread float *tmpvirials;
static __thread patch_pair *patch_pairs;
__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS];
texture<float2, 1, cudaReadModeElementType> lj_table;
static __thread float *virials;
#define MAX_CONST_EXCLUSIONS
__device__ __forceinline__ void NAME(reduceVariables)(volatile T *sh_buf, T *dst, T val1, T val2, T val3);
static __thread float4 *slow_tmpforces;
static __thread unsigned int *global_counters;
texture<float4, 1, cudaReadModeElementType> energy_table;
static __thread atom_param *atom_params;
texture<float4, 1, cudaReadModeElementType> force_table;
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)
static __thread int *vdw_types;
static __thread int *force_ready_queue;
#define WARP_ANY(MASK, P)
static __thread float4 *tmpforces;
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)