// NOTE(review): this file is a sampled/garbled fragment — the leading integer
// on each line looks like the original file's line number, and those numbers
// skip, so interior lines are missing everywhere. Code is left byte-identical;
// only review comments are added. Likely origin: a NAMD-style nonbonded CUDA
// kernel — TODO confirm against the original source.
2 #include <hip/hip_runtime.h>
// Compile this translation unit only for CUDA or HIP builds.
5 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// Kernel-name composition: NAME(x) expands through SLOWNAME -> ENERGYNAME ->
// PAIRLISTNAME, each stage optionally appending a feature suffix (_slow,
// _energy, _pairlist). The duplicate SLOWNAME/ENERGYNAME definitions below are
// presumably the two arms of #if/#else blocks whose conditional lines were
// lost in extraction — TODO confirm.
7 #define NAME(X) SLOWNAME( X )
13 #define SLOWNAME(X) ENERGYNAME( X ## _slow )
16 #define SLOWNAME(X) ENERGYNAME( X )
23 #define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )
26 #define ENERGYNAME(X) PAIRLISTNAME( X )
// Pairlist-generation variant: GENPAIRLIST() code compiles, USEPAIRLIST()
// compiles away, and "_pairlist" is appended to the kernel name.
33 #define GENPAIRLIST(X) X
34 #define USEPAIRLIST(X)
35 #define PAIRLISTNAME(X) LAST( X ## _pairlist )
// Pairlist-consumption variant (inverse selection, presumably the #else arm).
37 #define GENPAIRLIST(X)
38 #define USEPAIRLIST(X) X
39 #define PAIRLISTNAME(X) LAST( X )
// KEPLER_SHUFFLE selects warp-shuffle reductions; the guard below presumably
// #undef's it for pre-Kepler (__CUDA_ARCH__ < 300) devices, where __shfl is
// unavailable — the #undef line itself is not visible in this fragment.
46 #define KEPLER_SHUFFLE
47 #if __CUDA_ARCH__ < 300
// Truncated signatures of two device helpers. The first is NAME(shfl_reduction)
// (warp-shuffle based reduction); the second (from original line 76 on) takes
// shared-memory scratch buffers — per the symbol dump at the end of this
// fragment it is presumably NAME(reduceVariables) or a related reducer.
// Parameter lists and bodies are missing from this view.
60 __device__ __forceinline__
static void NAME(shfl_reduction)(
76 __device__ __forceinline__
79 volatile float* sh_buf,
// sh_slow_buf / sh_vcc are only needed on the non-shuffle (pre-Kepler) path,
// hence the #ifndef guard.
80 #ifndef KEPLER_SHUFFLE
81 volatile float* sh_slow_buf,
volatile float* sh_vcc,
// reduceVariables<T, n, sh_buf_size> (fragment): reduces up to n (n <= 3)
// per-thread values val1..val3 first within each warp, then across warps,
// finally folding the block total into global dst[] with atomicAdd. Two
// compile-time paths: warp shuffle (KEPLER_SHUFFLE) vs. a shared-memory tree.
// Interior lines are missing; comments below are hedged accordingly.
91 template<
typename T,
int n,
int sh_buf_size>
92 __device__ __forceinline__
// Shuffle path: halving loop over lane offsets (the __shfl calls themselves
// are not visible in this fragment).
100 for (
int i=
WARPSIZE/2;i >= 1;i/=2) {
// Lane 0 of each warp publishes its warp's partial sums to shared memory.
105 if (threadIdx.x == 0) {
106 if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
107 if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
108 if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
// Warp 0, lanes 0..n-1: sum the per-warp partials and commit atomically.
111 if (threadIdx.x < n && threadIdx.y == 0) {
115 finalval += sh_buf[i*n + threadIdx.x];
117 atomicAdd(&dst[threadIdx.x], finalval);
119 #else // ! KEPLER_SHUFFLE
// Non-shuffle path: each warp gets its own WARPSIZE*n shared-memory slab and
// does a tree reduction in it (the loop over d is partially missing here).
122 volatile T* sh_bufy = &sh_buf[threadIdx.y*n*
WARPSIZE];
123 if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
124 if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
125 if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
128 int pos = threadIdx.x + d;
129 T val1t, val2t, val3t;
// Out-of-range partners contribute zero so the tree stays correct at edges.
130 if (n >= 1) val1t = (pos <
WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
131 if (n >= 2) val2t = (pos <
WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
132 if (n >= 3) val3t = (pos <
WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
133 if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
134 if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
135 if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
// As above: warp 0 gathers the per-warp results and commits atomically.
138 if (threadIdx.x < n && threadIdx.y == 0) {
142 finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
144 atomicAdd(&dst[threadIdx.x], finalval);
146 #endif // KEPLER_SHUFFLE
// Main nonbonded force kernel (fragment; name macro-generated via NAME()).
// Visible parameters: block_begin/total_block_count/block_order for block
// scheduling, and lata/latb/latc lattice vectors for periodic offsets.
// One thread block processes patch_pairs[block_begin + blockIdx.x].
154 __global__
static void
168 const
int block_begin, const
int total_block_count,
int*
block_order,
170 const float3
lata, const float3
latb, const float3
latc,
// Per-block shared state: the patch pair descriptor plus per-warp j-force
// accumulators (slow/PME forces only when SLOW() is active).
180 __shared__ patch_pair sh_patch_pair;
183 SLOW(__shared__ float3 sh_jforce_slow_2d[
NUM_WARP][WARPSIZE];)
// Non-shuffle builds stage j atoms in shared memory instead of registers.
185 #ifndef KEPLER_SHUFFLE
186 __shared__ atom sh_jpq_2d[
NUM_WARP][WARPSIZE];
// Per-thread slow-energy accumulator (energy builds only).
193 SLOW(
float totales = 0.f; )
199 #ifndef KEPLER_SHUFFLE
// Cached pairlist word per warp (see sh_plist_ind below).
204 __shared__
unsigned int sh_plist_val[
NUM_WARP];);
// Flat thread id within the block: lane + warp*WARPSIZE.
208 const int t = threadIdx.x + threadIdx.y*
WARPSIZE;
// Zero sh_iforcesum (viewed as floats) — used later for virial sums.
211 float *p = (
float *)sh_iforcesum;
212 p[threadIdx.x] = 0.0f;
// Cooperative int-wise copy of this block's patch_pair into shared memory
// (the copy loop itself is missing from this fragment).
216 int* src = (
int *)&patch_pairs[block_begin + blockIdx.x];
217 int* dst = (
int *)&sh_patch_pair;
// Pairlist-consuming builds: invalidate the cached pairlist word index.
224 USEPAIRLIST(
if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
// Presumably pairlist-generating builds clear this pair's pairlist words here
// — the surrounding GENPAIRLIST(...) wrapper is not visible; TODO confirm.
228 if (t < sh_patch_pair.plist_size)
229 plist[sh_patch_pair.plist_start + t] = 0;
// Convert the patch-pair offset from lattice coordinates to Cartesian using
// lata/latb/latc, then overwrite offset in place with the Cartesian value.
234 float offx = sh_patch_pair.offset.x * lata.x
235 + sh_patch_pair.offset.y * latb.x
236 + sh_patch_pair.offset.z * latc.x;
237 float offy = sh_patch_pair.offset.x * lata.y
238 + sh_patch_pair.offset.y * latb.y
239 + sh_patch_pair.offset.z * latc.y;
240 float offz = sh_patch_pair.offset.x * lata.z
241 + sh_patch_pair.offset.y * latb.z
242 + sh_patch_pair.offset.z * latc.z;
243 sh_patch_pair.offset.x = offx;
244 sh_patch_pair.offset.y = offy;
245 sh_patch_pair.offset.z = offz;
// Per-warp views into the 2D shared accumulators (one row per warp).
253 volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
254 SLOW(
volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
257 #ifndef KEPLER_SHUFFLE
258 atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
259 GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
260 USEPAIRLIST(
int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
// Outer loop over patch-1 atoms: warps stride WARPSIZE*NUM_WARP apart so the
// whole block covers patch1_size.
263 for (
int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*
NUM_WARP) {
// Load this thread's i atom (position shifted by the Cartesian patch offset,
// computed above). tmpa.w is presumably the charge (goes with ipq) — TODO
// confirm; the assignment line is missing from this fragment.
269 if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
270 int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
271 float4 tmpa = ((float4*)atoms)[i];
272 ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
273 ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
274 ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
// Pairlist-generating builds also need the i atom's exclusion parameters;
// pairlist-consuming builds only need its LJ-table row offset.
276 GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
278 iap.index = tmpap.y;);
279 USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
// Zero the per-thread slow-force accumulator for this i atom.
287 SLOW(float3 iforce_slow;
288 iforce_slow.x = 0.0f;
289 iforce_slow.y = 0.0f;
290 iforce_slow.z = 0.0f;)
// Self-pair (same patch, zero offset): start the j loop at blocki so each
// tile pair is visited once.
292 const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
293 (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
294 int blockj = (diag_patch_pair) ? blocki : 0;
295 for (;blockj < sh_patch_pair.patch2_size;blockj +=
WARPSIZE) {
// Pairlist-consuming builds: each WARPSIZE x WARPSIZE tile has one bit in
// plist[]; if the bit is clear the tile had no interactions when the list was
// generated, so skip it. One 32-bit word is cached per warp (sh_plist_val /
// sh_plist_ind) to avoid re-reading global memory for consecutive tiles.
298 const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
299 int pos = (blockj/
WARPSIZE) + (blocki/WARPSIZE)*size2;
300 int plist_ind = pos/32;
301 unsigned int plist_bit = 1 << (pos % 32);
303 if (plist_ind != sh_plist_ind[threadIdx.y]) {
304 sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
305 sh_plist_ind[threadIdx.y] = plist_ind;
307 if ((sh_plist_val[threadIdx.y] & plist_bit) == 0)
continue;
311 #ifdef KEPLER_SHUFFLE
// Skip tiles where both sides are entirely fixed atoms (beyond free_size).
319 if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size)
continue;
320 int nfreej = sh_patch_pair.patch2_free_size - blockj;
321 int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
// Stage this tile's j atoms: registers (shuffle path) or shared memory.
325 if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
326 int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
327 float4 tmpa = ((float4*)atoms)[j];
328 #ifdef KEPLER_SHUFFLE
329 jpq.position.x = tmpa.x;
330 jpq.position.y = tmpa.y;
331 jpq.position.z = tmpa.z;
332 // (lines missing here; presumably the #else arm storing to sh_jpq follows)
334 sh_jpq[threadIdx.x].position.x = tmpa.x;
335 sh_jpq[threadIdx.x].position.y = tmpa.y;
336 sh_jpq[threadIdx.x].position.z = tmpa.z;
337 sh_jpq[threadIdx.x].charge = tmpa.w;
340 #ifdef KEPLER_SHUFFLE
345 USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
// Zero the per-tile j-force accumulators (registers or shared, by path).
355 SLOW(float3 jforce_slow;
356 jforce_slow.x = 0.0f;
357 jforce_slow.y = 0.0f;
358 jforce_slow.z = 0.0f;
361 sh_jforce[threadIdx.x].x = 0.0f;
362 sh_jforce[threadIdx.x].y = 0.0f;
363 sh_jforce[threadIdx.x].z = 0.0f;
364 SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
365 sh_jforce_slow[threadIdx.x].y = 0.0f;
366 sh_jforce_slow[threadIdx.x].z = 0.0f;)
// Pairlist-consuming builds read the precomputed per-lane exclusion mask for
// this tile (one 32-bit word per lane).
371 const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
372 const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
373 unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
// Tile extents on the i side, clamped to WARPSIZE; nfreei counts movable atoms.
376 int nloopi = sh_patch_pair.patch1_size - blocki;
377 if (nloopi > WARPSIZE) nloopi = WARPSIZE;
379 int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
// Diagonal tile of a self-pair: each i-j pair must be computed only once.
381 const bool diag_tile = diag_patch_pair && (blocki == blockj);
// Inner loop over the WARPSIZE j atoms of the tile. Each lane walks a rotated
// index j = (t + threadIdx.x) & modval; on a diagonal tile t starts at 1 and
// modval doubles so self-interactions are skipped. (The loop statement itself
// is missing from this fragment.)
385 const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
386 int t = (diag_tile) ? 1 : 0;
389 #ifdef KEPLER_SHUFFLE
404 int j = (t + threadIdx.x) & modval;
// Shuffle path: j-atom fields arrive in registers (jap/jpq, rotated via
// __shfl in lines not visible here).
405 #ifdef KEPLER_SHUFFLE
410 int j_vdw_type = jap.vdw_type;
411 int j_index = jap.index;
412 int j_excl_maxdiff = jap.excl_maxdiff;
413 int j_excl_index = jap.excl_index;
415 float j_charge = jpq.charge;
417 int j_vdw_type = jap_vdw_type;
// Pairlist generation: only consider lanes inside the tile with at least one
// movable atom in the pair.
420 GENPAIRLIST(
if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
// Non-shuffle path: read the j atom from the shared-memory staging arrays.
423 #ifndef KEPLER_SHUFFLE
424 float tmpx = sh_jpq[j].position.x - ipq.position.x;
425 float tmpy = sh_jpq[j].position.y - ipq.position.y;
426 float tmpz = sh_jpq[j].position.z - ipq.position.z;
428 int j_vdw_type = sh_jap[j].vdw_type;
429 int j_index = sh_jap[j].index;
430 int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
431 int j_excl_index = sh_jap[j].excl_index;
433 float j_charge = sh_jpq[j].charge;
435 int j_vdw_type = sh_jap_vdw_type[j];
// Squared i-j distance (the cutoff test itself is not visible here).
438 float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
// Exclusion test via a bit array indexed by atom-index difference: only pairs
// with |indexdiff| <= excl_maxdiff can be excluded; the bit is looked up in
// overflow_exclusions (a constant-memory fast path is presumably in the
// missing lines — TODO confirm).
443 bool excluded =
false;
444 int indexdiff = (int)(iap.index) - j_index;
445 if ( abs(indexdiff) <= j_excl_maxdiff) {
446 indexdiff += j_excl_index;
447 int indexword = ((
unsigned int) indexdiff) >> 5;
452 indexword = overflow_exclusions[indexword];
454 excluded = ((indexword & (1<<(indexdiff&31))) != 0);
455 if (excluded) nexcluded++;
// Pairlist generation records non-excluded pairs in the lane's excl word.
457 if (!excluded) excl |= 0x80000000;
461 ENERGY(
float rsqrtfr2; );
// Force magnitude from table lookups: fi = force table entry, ei = energy
// table entry, ljab = LJ A/B coefficients (lookups are in missing lines).
// f_slow starts as the charge product, later scaled by fi.w for the slow
// (long-range) part.
467 float f_slow = ipq.charge * j_charge;
468 float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
470 float ev = ljab.x * ei.z + ljab.y * ei.y;
471 float ee = f_slow * ei.x;
472 SLOW(
float es = f_slow * ei.w; )
474 SLOW( f_slow *= fi.w; )
478 SLOW( totales += es; )
// Newton's third law: subtract this pair's force from the j accumulator
// (the symmetric iforce += lines are among the missing lines).
491 sh_jforce[j].x -= fx;
492 sh_jforce[j].y -= fy;
493 sh_jforce[j].z -= fz;
// Same pattern for the slow (long-range) force component.
496 float fx_slow = tmpx * f_slow;
497 float fy_slow = tmpy * f_slow;
498 float fz_slow = tmpz * f_slow;
499 iforce_slow.x += fx_slow;
500 iforce_slow.y += fy_slow;
501 iforce_slow.z += fz_slow;
505 jforce_slow.x -= fx_slow;
506 jforce_slow.y -= fy_slow;
507 jforce_slow.z -= fz_slow;
511 sh_jforce_slow[j].
x -= fx_slow;
512 sh_jforce_slow[j].y -= fy_slow;
513 sh_jforce_slow[j].z -= fz_slow;
// End-of-tile write-back. j forces accumulated in registers (shuffle path) or
// shared memory (non-shuffle path) are committed to the global temporary force
// arrays with atomicAdd, since multiple blocks may touch the same atoms.
521 #ifdef KEPLER_SHUFFLE
544 if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
545 int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
547 atomicAdd(&tmpforces[jforce_pos].
x, jforce.x);
548 atomicAdd(&tmpforces[jforce_pos].
y, jforce.y);
549 atomicAdd(&tmpforces[jforce_pos].
z, jforce.z);
550 SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
551 atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
552 atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
// Non-shuffle arm: same commits but sourced from the shared accumulators.
554 atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
555 atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
556 atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
557 SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
558 atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
559 atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
// Pairlist generation (presumably inside GENPAIRLIST — the wrapper is not
// visible): store this tile's per-lane exclusion words and set the tile's
// bit in plist[] so consuming runs will visit it.
564 const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
565 int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
566 exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
567 if (threadIdx.x == 0) {
568 int plist_ind = pos/32;
569 unsigned int plist_bit = 1 << (pos % 32);
570 atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
// End of j-tile loop: commit this thread's accumulated i-atom forces.
578 if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
579 int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
580 atomicAdd(&tmpforces[iforce_pos].
x, iforce.x);
581 atomicAdd(&tmpforces[iforce_pos].
y, iforce.y);
582 atomicAdd(&tmpforces[iforce_pos].
z, iforce.z);
583 SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
584 atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
585 atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
// Reduce each thread's i-force (and slow i-force) into the per-warp totals
// sh_iforcesum[], which later feed the virial computation. Shuffle path: warp
// halving reduction (the __shfl calls are in missing lines), then lane 0 adds
// into shared. Non-shuffle path: shared-memory tree reduction via sh_jforce.
588 #ifdef KEPLER_SHUFFLE
589 for (
int i=WARPSIZE/2;i >= 1;i/=2) {
599 if (threadIdx.x == 0) {
600 sh_iforcesum[threadIdx.y].x += iforce.x;
601 sh_iforcesum[threadIdx.y].y += iforce.y;
602 sh_iforcesum[threadIdx.y].z += iforce.z;
// Slow totals live in the second half of sh_iforcesum (offset NUM_WARP).
604 sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
605 sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
606 sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
// Non-shuffle arm: reuse the sh_jforce buffers as reduction scratch.
610 sh_jforce[threadIdx.x].x = iforce.x;
611 sh_jforce[threadIdx.x].y = iforce.y;
612 sh_jforce[threadIdx.x].z = iforce.z;
614 sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
615 sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
616 sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
// Tree step: partner at distance d contributes zero when out of range.
619 int pos = threadIdx.x + d;
620 float valx = (pos <
WARPSIZE) ? sh_jforce[pos].
x : 0.0f;
621 float valy = (pos <
WARPSIZE) ? sh_jforce[pos].
y : 0.0f;
622 float valz = (pos <
WARPSIZE) ? sh_jforce[pos].
z : 0.0f;
624 float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].
x : 0.0f;
625 float slow_valy = (pos <
WARPSIZE) ? sh_jforce_slow[pos].
y : 0.0f;
626 float slow_valz = (pos <
WARPSIZE) ? sh_jforce_slow[pos].
z : 0.0f;
628 sh_jforce[threadIdx.x].x += valx;
629 sh_jforce[threadIdx.x].y += valy;
630 sh_jforce[threadIdx.x].z += valz;
632 sh_jforce_slow[threadIdx.x].x += slow_valx;
633 sh_jforce_slow[threadIdx.x].y += slow_valy;
634 sh_jforce_slow[threadIdx.x].z += slow_valz;
// Lane 0 folds the warp total into the per-warp accumulators.
637 if (threadIdx.x == 0) {
638 sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
639 sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
640 sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
642 sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
643 sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
644 sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
// Scratch buffer for energy/virial reduction: a dedicated shared array when
// REG_JFORCE keeps j-forces in registers, otherwise the sh_jforce_2d storage
// is reused (sized NUM_WARP*WARPSIZE*3 floats). SLOW(9) contributes 9 extra
// slots only in slow-force builds.
657 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
658 __shared__
float sh_buf[NUM_WARP*(
SLOW(9)+9)];
659 #else // ! REG_JFORCE
661 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
662 volatile float* sh_buf = (
float *)&sh_jforce_2d[0][0];
// Virial contribution of this patch pair: 3 components (x SLOW(3+)3 in slow
// builds). Each of the first 3(+3) lanes sums one force component across all
// warps from sh_iforcesum (viewed as floats), then multiplies by the patch
// offset to get the virial terms, written back in place.
678 if (threadIdx.x <
SLOW(3+)3 && threadIdx.y == 0) {
679 float* sh_virials = (
float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
680 float iforcesum = 0.0f;
682 for (
int i=0;i < 3*
NUM_WARP;i+=3) iforcesum += sh_virials[i];
683 float vx = iforcesum*sh_patch_pair.offset.x;
684 float vy = iforcesum*sh_patch_pair.offset.y;
685 float vz = iforcesum*sh_patch_pair.offset.z;
686 sh_iforcesum[threadIdx.x].x = vx;
687 sh_iforcesum[threadIdx.x].y = vy;
688 sh_iforcesum[threadIdx.x].z = vz;
// Commit the 9 (+9 slow) virial entries for patch 1 into the global per-patch
// accumulators; each patch owns a 16-float slot (9 virial + extras).
690 if (threadIdx.x <
SLOW(9+)9 && threadIdx.y == 0) {
692 float* sh_virials = (
float *)sh_iforcesum;
693 int patch1_ind = sh_patch_pair.patch1_ind;
694 float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
695 atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
// Kernel epilogue: detect when all pair-blocks touching a patch have finished
// (via atomicInc counters), then run the per-patch finish pass and publish
// completion to the host through force_ready_queue.
703 int patch1_ind = sh_patch_pair.patch1_ind;
704 int patch2_ind = sh_patch_pair.patch2_ind;
705 if (threadIdx.x == 0 && threadIdx.y == 0) {
706 sh_patch_pair.patch_done[0] =
false;
707 sh_patch_pair.patch_done[1] =
false;
// atomicInc wraps at num_pairs-1, so "old+1 == num_pairs" fires exactly once
// per patch per iteration: this block is the last contributor.
713 unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
714 int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
715 if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] =
true;
716 if (patch1_ind != patch2_ind) {
717 unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
718 int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
719 if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] =
true;
// Last block for patch 1: set up scratch aliases and call the finish pass
// (the call itself is split across missing lines; patch1_ind/atoms/sh_buf
// are visible arguments).
725 if (sh_patch_pair.patch_done[0]) {
730 #ifndef KEPLER_SHUFFLE
731 volatile float* sh_vcc = (
volatile float*)&sh_jpq_2d[0][0];
732 volatile float* sh_slow_buf = NULL;
733 SLOW(sh_slow_buf = (
volatile float*)&sh_jforce_slow_2d[0][0];)
736 patch1_ind,
atoms, sh_buf,
737 #ifndef KEPLER_SHUFFLE
// Same finish pass for patch 2 when this block completed it.
745 if (sh_patch_pair.patch_done[1]) {
749 #ifndef KEPLER_SHUFFLE
750 volatile float* sh_vcc = (
volatile float*)&sh_jpq_2d[0][0];
751 volatile float* sh_slow_buf = NULL;
752 SLOW(sh_slow_buf = (
volatile float*)&sh_jforce_slow_2d[0][0];)
755 patch2_ind,
atoms, sh_buf,
756 #ifndef KEPLER_SHUFFLE
// Publish completed patches to the host-polled ready queue. The fence makes
// the finished forces globally visible before the queue entry is written;
// sm_1x needs the stronger system-scope fence (zero-copy host memory).
763 if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
765 #if __CUDA_ARCH__ < 200
768 __threadfence_system();
772 if (threadIdx.x == 0 && threadIdx.y == 0) {
773 if (sh_patch_pair.patch_done[0]) {
774 int ind = atomicInc(&global_counters[0], npatches-1);
775 force_ready_queue[ind] = patch1_ind;
777 if (sh_patch_pair.patch_done[1]) {
778 int ind = atomicInc(&global_counters[0], npatches-1);
779 force_ready_queue[ind] = patch2_ind;
782 #if __CUDA_ARCH__ < 200
785 __threadfence_system();
// Record the order in which blocks retired (used for load-balancing /
// scheduling of the next run — TODO confirm against caller).
790 if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
791 int old = atomicInc(&global_counters[1], total_block_count-1);
792 block_order[old] = block_begin + blockIdx.x;
// finish_forces_virials (fragment; full prototype appears in the symbol dump
// at the end of this file): once a patch is complete, copy its accumulated
// temporary forces to the final arrays and compute the 3x3 virial tensor as
// sum over atoms of f ⊗ r. Only the slow-force half of the accumulation loop
// is visible here; the fast-force half is in missing lines.
802 __device__ __forceinline__
805 volatile float* sh_buf,
806 #ifndef KEPLER_SHUFFLE
807 volatile float* sh_slow_buf,
volatile float* sh_vcc,
// Per-thread accumulators for the slow-force virial tensor components.
823 float slow_vxx = 0.f;
824 float slow_vxy = 0.f;
825 float slow_vxz = 0.f;
826 float slow_vyx = 0.f;
827 float slow_vyy = 0.f;
828 float slow_vyz = 0.f;
829 float slow_vzx = 0.f;
830 float slow_vzy = 0.f;
831 float slow_vzz = 0.f;
// Block-stride loop over the patch's atoms [start, start+size).
833 for (
int i=threadIdx.x+threadIdx.y*WARPSIZE;i < size;i+=NUM_WARP*WARPSIZE) {
834 const int p = start+i;
835 float4 f = tmpforces[p];
837 float4 pos = ((float4*)atoms)[p];
// Copy slow forces to their final array and accumulate f_a * r_b terms.
848 float4 slow_f = slow_tmpforces[p];
849 slow_forces[p] = slow_f;
850 slow_vxx += slow_f.x * pos.x;
851 slow_vxy += slow_f.x * pos.y;
852 slow_vxz += slow_f.x * pos.z;
853 slow_vyx += slow_f.y * pos.x;
854 slow_vyy += slow_f.y * pos.y;
855 slow_vyz += slow_f.y * pos.z;
856 slow_vzx += slow_f.z * pos.x;
857 slow_vzy += slow_f.z * pos.y;
858 slow_vzz += slow_f.z * pos.z;
// Shuffle-path virial reduction: warp halving reduction (the __shfl calls are
// in missing lines), then lane 0 of each warp stores its 9 fast + 9 slow
// tensor components into sh_buf, one (SLOW(9)+9)-float row per warp.
861 #ifdef KEPLER_SHUFFLE
863 for (
int i=WARPSIZE/2;i >= 1;i/=2) {
887 if (threadIdx.x == 0) {
888 sh_buf[threadIdx.y*(
SLOW(9)+9) + 0] = vxx;
889 sh_buf[threadIdx.y*(
SLOW(9)+9) + 1] = vxy;
890 sh_buf[threadIdx.y*(
SLOW(9)+9) + 2] = vxz;
891 sh_buf[threadIdx.y*(
SLOW(9)+9) + 3] = vyx;
892 sh_buf[threadIdx.y*(
SLOW(9)+9) + 4] = vyy;
893 sh_buf[threadIdx.y*(
SLOW(9)+9) + 5] = vyz;
894 sh_buf[threadIdx.y*(
SLOW(9)+9) + 6] = vzx;
895 sh_buf[threadIdx.y*(
SLOW(9)+9) + 7] = vzy;
896 sh_buf[threadIdx.y*(
SLOW(9)+9) + 8] = vzz;
// Slow components occupy slots 9..17 of each warp's row.
898 sh_buf[threadIdx.y*(
SLOW(9)+9) + 9] = slow_vxx;
899 sh_buf[threadIdx.y*(
SLOW(9)+9) + 10] = slow_vxy;
900 sh_buf[threadIdx.y*(
SLOW(9)+9) + 11] = slow_vxz;
901 sh_buf[threadIdx.y*(
SLOW(9)+9) + 12] = slow_vyx;
902 sh_buf[threadIdx.y*(
SLOW(9)+9) + 13] = slow_vyy;
903 sh_buf[threadIdx.y*(
SLOW(9)+9) + 14] = slow_vyz;
904 sh_buf[threadIdx.y*(
SLOW(9)+9) + 15] = slow_vzx;
905 sh_buf[threadIdx.y*(
SLOW(9)+9) + 16] = slow_vzy;
906 sh_buf[threadIdx.y*(
SLOW(9)+9) + 17] = slow_vzz;
// Warp 0, one lane per tensor component: sum across warps and write the
// final value (plus the patch's pair-loop contribution from tmpvirials)
// into the 16-float-per-patch virial arrays.
911 if (threadIdx.x <
SLOW(9+)9 && threadIdx.y == 0) {
914 for (
int i=0;i <
NUM_WARP;i++) v += sh_buf[i*(
SLOW(9)+9) + threadIdx.x];
915 float* dst = (threadIdx.x < 9) ? virials :
slow_virials;
916 const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
917 int pos = patch_ind*16 + (threadIdx.x % 9);
918 dst[pos] = v + src[pos];
// Non-shuffle virial reduction: the 9 (+9 slow) tensor components are reduced
// three at a time (one tensor row per pass) in the shared buffers sh_v1..3 /
// sh_slow_v1..3, using a block-wide tree over NUM_WARP*WARPSIZE entries.
// Results land in sh_vcc[0..8] (fast) and sh_vcc[9..17] (slow).
920 #else // ! KEPLER_SHUFFLE
924 const int t = threadIdx.x + threadIdx.y*
WARPSIZE;
925 volatile float* sh_v1 = &sh_buf[0];
926 volatile float* sh_v2 = &sh_buf[NUM_WARP*
WARPSIZE];
927 volatile float* sh_v3 = &sh_buf[2*NUM_WARP*
WARPSIZE];
929 volatile float* sh_slow_v1 = &sh_slow_buf[0];
930 volatile float* sh_slow_v2 = &sh_slow_buf[NUM_WARP*
WARPSIZE];
931 volatile float* sh_slow_v3 = &sh_slow_buf[2*NUM_WARP*
WARPSIZE];
// Pass 1: row x of the tensor (vxx, vxy, vxz). The fast-component stores are
// in missing lines; only the slow stores are visible.
939 sh_slow_v1[t] = slow_vxx;
940 sh_slow_v2[t] = slow_vxy;
941 sh_slow_v3[t] = slow_vxz;
943 for (
int d=1;d < NUM_WARP*
WARPSIZE;d*=2) {
945 float v1 = (pos < NUM_WARP*
WARPSIZE) ? sh_v1[pos] : 0.0f;
946 float v2 = (pos < NUM_WARP*
WARPSIZE) ? sh_v2[pos] : 0.0f;
947 float v3 = (pos < NUM_WARP*
WARPSIZE) ? sh_v3[pos] : 0.0f;
949 float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
950 float slow_v2 = (pos < NUM_WARP*
WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
951 float slow_v3 = (pos < NUM_WARP*
WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
958 sh_slow_v1[t] += slow_v1;
959 sh_slow_v2[t] += slow_v2;
960 sh_slow_v3[t] += slow_v3;
// Thread (0,0) saves the reduced row-x components.
964 if (threadIdx.x == 0 && threadIdx.y == 0) {
965 sh_vcc[0] = sh_v1[0];
966 sh_vcc[1] = sh_v2[0];
967 sh_vcc[2] = sh_v3[0];
969 sh_vcc[9+0] = sh_slow_v1[0];
970 sh_vcc[9+1] = sh_slow_v2[0];
971 sh_vcc[9+2] = sh_slow_v3[0];
// Pass 2: row y (vyx, vyy, vyz) — identical structure.
979 sh_slow_v1[t] = slow_vyx;
980 sh_slow_v2[t] = slow_vyy;
981 sh_slow_v3[t] = slow_vyz;
983 for (
int d=1;d < NUM_WARP*
WARPSIZE;d*=2) {
985 float v1 = (pos < NUM_WARP*
WARPSIZE) ? sh_v1[pos] : 0.0f;
986 float v2 = (pos < NUM_WARP*
WARPSIZE) ? sh_v2[pos] : 0.0f;
987 float v3 = (pos < NUM_WARP*
WARPSIZE) ? sh_v3[pos] : 0.0f;
989 float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
990 float slow_v2 = (pos < NUM_WARP*
WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
991 float slow_v3 = (pos < NUM_WARP*
WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
998 sh_slow_v1[t] += slow_v1;
999 sh_slow_v2[t] += slow_v2;
1000 sh_slow_v3[t] += slow_v3;
1004 if (threadIdx.x == 0 && threadIdx.y == 0) {
1005 sh_vcc[3] = sh_v1[0];
1006 sh_vcc[4] = sh_v2[0];
1007 sh_vcc[5] = sh_v3[0];
1009 sh_vcc[9+3] = sh_slow_v1[0];
1010 sh_vcc[9+4] = sh_slow_v2[0];
1011 sh_vcc[9+5] = sh_slow_v3[0];
// Pass 3: row z (vzx, vzy, vzz) — identical structure.
1019 sh_slow_v1[t] = slow_vzx;
1020 sh_slow_v2[t] = slow_vzy;
1021 sh_slow_v3[t] = slow_vzz;
1023 for (
int d=1;d < NUM_WARP*
WARPSIZE;d*=2) {
1025 float v1 = (pos < NUM_WARP*
WARPSIZE) ? sh_v1[pos] : 0.0f;
1026 float v2 = (pos < NUM_WARP*
WARPSIZE) ? sh_v2[pos] : 0.0f;
1027 float v3 = (pos < NUM_WARP*
WARPSIZE) ? sh_v3[pos] : 0.0f;
1029 float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
1030 float slow_v2 = (pos < NUM_WARP*
WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
1031 float slow_v3 = (pos < NUM_WARP*
WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
1038 sh_slow_v1[t] += slow_v1;
1039 sh_slow_v2[t] += slow_v2;
1040 sh_slow_v3[t] += slow_v3;
1044 if (threadIdx.x == 0 && threadIdx.y == 0) {
1045 sh_vcc[6] = sh_v1[0];
1046 sh_vcc[7] = sh_v2[0];
1047 sh_vcc[8] = sh_v3[0];
1049 sh_vcc[9+6] = sh_slow_v1[0];
1050 sh_vcc[9+7] = sh_slow_v2[0];
1051 sh_vcc[9+8] = sh_slow_v3[0];
// Final store: add the pair-loop contribution (tmpvirials) and write to the
// 16-float-per-patch virial slot, fast vs slow selected by lane.
1055 if (threadIdx.x <
SLOW(9+)9 && threadIdx.y == 0) {
1056 float* dst = (threadIdx.x < 9) ? virials : slow_virials;
1057 const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
1058 int pos = patch_ind*16 + (threadIdx.x % 9);
1059 dst[pos] = sh_vcc[threadIdx.x] + src[pos];
1061 #endif // KEPLER_SHUFFLE
// Copy the remaining per-patch scalars (slots 9..12 of the 16-float record —
// presumably energies / exclusion counts; TODO confirm) straight through.
1064 if (threadIdx.x < 3 && threadIdx.y == 0) {
1065 int pos = patch_ind*16 + 9 + threadIdx.x;
1066 virials[pos] = tmpvirials[pos];
1070 if (threadIdx.x == 0 && threadIdx.y == 0) {
1071 int pos = patch_ind*16 + 12;
1072 virials[pos] = tmpvirials[pos];
// NOTE(review): the section below is not contiguous source code — it reads as
// an auto-extracted symbol dump (file-scope __thread pointer declarations,
// texture references, macro names, and flattened kernel/device prototypes,
// several of them syntactically invalid as written). It documents the file's
// globals: device-array pointers (atoms, forces, plist, exclmasks, virial
// buffers, global_counters, force_ready_queue), LJ/force/energy lookup
// textures, and the nonbondedForceKernel / finish_forces_virials /
// reduceVariables prototypes. Left untouched; do not treat as compilable.
static __thread int * block_order
__device__ static __forceinline__ void NAME() finish_forces_virials(const int start, const int size, const int patch_ind, const atom *atoms, volatile float *sh_buf, volatile float *sh_slow_buf, volatile float *sh_vcc, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread int lj_table_size
static __thread atom * atoms
static __thread float4 * forces
static __thread unsigned int * plist
static __thread unsigned int * overflow_exclusions
static __thread exclmask * exclmasks
#define cuda_static_assert(expr)
static __thread float * slow_tmpvirials
if(ComputeNonbondedUtil::goMethod==2)
static __thread float4 * slow_forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
static __thread float * slow_virials
static __thread float * tmpvirials
static __thread patch_pair * patch_pairs
__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS]
texture< float2, 1, cudaReadModeElementType > lj_table
static __thread float * virials
#define MAX_CONST_EXCLUSIONS
__device__ __forceinline__ void NAME() reduceVariables(volatile T *sh_buf, T *dst, T val1, T val2, T val3)
static __thread float4 * slow_tmpforces
static __thread unsigned int * global_counters
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
texture< float4, 1, cudaReadModeElementType > energy_table
static __thread atom_param * atom_params
texture< float4, 1, cudaReadModeElementType > force_table
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)
static __thread int * vdw_types
static __thread int * force_ready_queue
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
#define WARP_ANY(MASK, P)
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start
static __thread float4 * tmpforces
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)