NAMD
ComputeNonbondedCUDAKernelBase.h File Reference

Go to the source code of this file.

Macros

#define NAME(X)   SLOWNAME( X )
 
#define SLOW(X)
 
#define SLOWNAME(X)   ENERGYNAME( X )
 
#define ENERGY(X)
 
#define ENERGYNAME(X)   PAIRLISTNAME( X )
 
#define GENPAIRLIST(X)
 
#define USEPAIRLIST(X)   X
 
#define PAIRLISTNAME(X)   LAST( X )
 
#define LAST(X)   X
 
#define SH_BUF_SIZE   NUM_WARP*WARPSIZE*3*sizeof(float)
 

Functions

__device__ static __forceinline__ void NAME() finish_forces_virials (const int start, const int size, const int patch_ind, const atom *atoms, volatile float *sh_buf, volatile float *sh_slow_buf, volatile float *sh_vcc, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials)
 
template<typename T , int n, int sh_buf_size>
__device__ __forceinline__ void NAME() reduceVariables (volatile T *sh_buf, T *dst, T val1, T val2, T val3)
 
static __global__ void (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const int *vdw_types, unsigned int *plist, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials, unsigned int *global_counters, int *force_ready_queue, const unsigned int *overflow_exclusions, const int npatches, const int block_begin, const int total_block_count, int *block_order, exclmask *exclmasks, const int lj_table_size, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const float plcutoff2, const int doSlow)
 

Macro Definition Documentation

#define ENERGY(X)

Definition at line 25 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define ENERGYNAME(X)   PAIRLISTNAME( X )

Definition at line 26 of file ComputeNonbondedCUDAKernelBase.h.

#define GENPAIRLIST(X)

Definition at line 37 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define LAST(X)   X

Definition at line 42 of file ComputeNonbondedCUDAKernelBase.h.

#define NAME(X)   SLOWNAME( X )

Definition at line 7 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define PAIRLISTNAME(X)   LAST( X )

Definition at line 39 of file ComputeNonbondedCUDAKernelBase.h.

#define SH_BUF_SIZE   NUM_WARP*WARPSIZE*3*sizeof(float)

Referenced by void().

#define SLOW(X)

Definition at line 15 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define SLOWNAME(X)   ENERGYNAME( X )

Definition at line 16 of file ComputeNonbondedCUDAKernelBase.h.

#define USEPAIRLIST(X)   X

Definition at line 38 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().
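
Taken together, NAME, SLOWNAME, ENERGYNAME, PAIRLISTNAME and LAST form a name-composition chain: with the pass-through definitions documented above, NAME(X) expands to plain X, while the translation unit that includes this header can redefine individual stages (together with the feature macros SLOW, ENERGY, GENPAIRLIST and USEPAIRLIST) so that the same kernel body is compiled several times under different names and feature sets. The sketch below only illustrates the idiom; the kernel name dev_kernel, the _energy suffix and the redefinition site are assumptions, not the actual NAMD definitions.

// Hypothetical illustration of the name/feature macro idiom; not NAMD code.
// Pass-through configuration (as documented on this page): the feature macro
// drops its argument and NAME(X) expands to plain X.
#define ENERGY(X)
#define LAST(X)         X
#define PAIRLISTNAME(X) LAST( X )
#define ENERGYNAME(X)   PAIRLISTNAME( X )
#define SLOWNAME(X)     ENERGYNAME( X )
#define NAME(X)         SLOWNAME( X )

__global__ void NAME(dev_kernel)(float *out) {   // compiles as dev_kernel
  ENERGY(if (out) out[0] = 0.f;)                 // dropped in this build
}

// Assumed energy-enabled configuration: redefine one feature macro to keep
// its argument and the matching name macro to paste a suffix, then compile
// the same kernel body again.
#undef ENERGY
#undef ENERGYNAME
#define ENERGY(X)     X
#define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )

__global__ void NAME(dev_kernel)(float *out) {   // compiles as dev_kernel_energy
  ENERGY(if (out) out[0] = 0.f;)                 // kept in this build
}

This is why the kernel listing further down wraps feature-specific code in ENERGY(...), SLOW(...), GENPAIRLIST(...) and USEPAIRLIST(...) blocks.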

Function Documentation

__device__ static __forceinline__ void NAME() finish_forces_virials ( const int  start,
const int  size,
const int  patch_ind,
const atom *  atoms,
volatile float *  sh_buf,
volatile float *  sh_slow_buf,
volatile float *  sh_vcc,
float4 *  tmpforces,
float4 *  slow_tmpforces,
float4 *  forces,
float4 *  slow_forces,
float *  tmpvirials,
float *  slow_tmpvirials,
float *  virials,
float *  slow_virials 
)
static

Referenced by void().

template<typename T , int n, int sh_buf_size>
__device__ __forceinline__ void NAME() reduceVariables ( volatile T *  sh_buf,
T *  dst,
T  val1,
T  val2,
T  val3 
)

Definition at line 93 of file ComputeNonbondedCUDAKernelBase.h.

References BLOCK_SYNC, cuda_static_assert, NUM_WARP, WARP_FULL_MASK, WARP_SHUFFLE_XOR, and WARPSIZE.

Referenced by void().

93  {
94  // Sanity check
95  cuda_static_assert(n > 0 && n <= NUM_WARP);
96 #ifdef KEPLER_SHUFFLE
97  // Requires NUM_WARP*n*sizeof(float) shared memory
98  cuda_static_assert(sh_buf_size >= NUM_WARP*n*sizeof(T));
99  // Reduce within warp
100  for (int i=WARPSIZE/2;i >= 1;i/=2) {
101  if (n >= 1) val1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val1, i, WARPSIZE);
102  if (n >= 2) val2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val2, i, WARPSIZE);
103  if (n >= 3) val3 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val3, i, WARPSIZE);
104  }
105  if (threadIdx.x == 0) {
106  if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
107  if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
108  if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
109  }
110  BLOCK_SYNC;
111  if (threadIdx.x < n && threadIdx.y == 0) {
112  T finalval = (T)0;
113 #pragma unroll
114  for (int i=0;i < NUM_WARP;++i) {
115  finalval += sh_buf[i*n + threadIdx.x];
116  }
117  atomicAdd(&dst[threadIdx.x], finalval);
118  }
119 #else // ! KEPLER_SHUFFLE
120  // Requires NUM_WARP*n*WARPSIZE*sizeof(float) shared memory
121  cuda_static_assert(sh_buf_size >= NUM_WARP*n*WARPSIZE*sizeof(T));
122  volatile T* sh_bufy = &sh_buf[threadIdx.y*n*WARPSIZE];
123  if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
124  if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
125  if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
126  // Reduce within warp
127  for (int d=1;d < WARPSIZE;d*=2) {
128  int pos = threadIdx.x + d;
129  T val1t, val2t, val3t;
130  if (n >= 1) val1t = (pos < WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
131  if (n >= 2) val2t = (pos < WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
132  if (n >= 3) val3t = (pos < WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
133  if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
134  if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
135  if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
136  }
137  BLOCK_SYNC;
138  if (threadIdx.x < n && threadIdx.y == 0) {
139  T finalval = (T)0;
140 #pragma unroll
141  for (int i=0;i < NUM_WARP;++i) {
142  finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
143  }
144  atomicAdd(&dst[threadIdx.x], finalval);
145  }
146 #endif // KEPLER_SHUFFLE
147 }
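
reduceVariables combines a warp-level butterfly reduction (the XOR-shuffle loop above) with a cross-warp pass through shared memory and a final atomicAdd into the destination array. The standalone kernel below reproduces that pattern for a single float per thread; it is a minimal sketch for illustration, and the kernel name, buffers and launch shape are assumptions rather than NAMD code.

// Sketch of the shuffle-based block reduction used by reduceVariables.
#include <cstdio>
#include <cuda_runtime.h>

#define NUM_WARP 4
#define WARPSIZE 32

__global__ void blockSumKernel(const float *in, float *out) {
  __shared__ float sh_warp_sum[NUM_WARP];
  const int tid = threadIdx.y * WARPSIZE + threadIdx.x;
  float val = in[blockIdx.x * NUM_WARP * WARPSIZE + tid];

  // Reduce within the warp: after the butterfly every lane holds the warp sum.
  for (int i = WARPSIZE / 2; i >= 1; i /= 2)
    val += __shfl_xor_sync(0xffffffffu, val, i, WARPSIZE);

  // Lane 0 of each warp publishes its warp total to shared memory.
  if (threadIdx.x == 0) sh_warp_sum[threadIdx.y] = val;
  __syncthreads();

  // One thread combines the warp totals and accumulates into global memory.
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    float total = 0.0f;
    for (int i = 0; i < NUM_WARP; ++i) total += sh_warp_sum[i];
    atomicAdd(out, total);
  }
}

int main() {
  const int nblocks = 8;
  const int n = nblocks * NUM_WARP * WARPSIZE;
  float *h_in = new float[n];
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;   // expected sum == n

  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemset(d_out, 0, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  blockSumKernel<<<nblocks, dim3(WARPSIZE, NUM_WARP)>>>(d_in, d_out);

  float h_out = 0.0f;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %g (expected %d)\n", h_out, n);

  cudaFree(d_in); cudaFree(d_out); delete[] h_in;
  return 0;
}

The documented function generalizes this to up to three values per thread (val1..val3) and, in the non-shuffle branch, falls back to a purely shared-memory reduction whose capacity is checked against the sh_buf_size template parameter.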
static __global__ void ( const patch_pair *  patch_pairs,
const atom *  atoms,
const atom_param *  atom_params,
const int *  vdw_types,
unsigned int *  plist,
float4 *  tmpforces,
float4 *  slow_tmpforces,
float4 *  forces,
float4 *  slow_forces,
float *  tmpvirials,
float *  slow_tmpvirials,
float *  virials,
float *  slow_virials,
unsigned int *  global_counters,
int *  force_ready_queue,
const unsigned int *  overflow_exclusions,
const int  npatches,
const int  block_begin,
const int  total_block_count,
int *  block_order,
exclmask *  exclmasks,
const int  lj_table_size,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const float  plcutoff2,
const int  doSlow 
)
static

Definition at line 158 of file ComputeNonbondedCUDAKernelBase.h.

The kernel's identifier is generated by the NAME() macro chain, which is why the definition appears here without an explicit name and why cross-references on this page list it simply as void().

References atoms, BLOCK_SYNC, const_exclusions, ENERGY, energy_table, finish_forces_virials(), force_table, forces, GENPAIRLIST, if(), lj_table, lj_table_size, MAX_CONST_EXCLUSIONS, NAME, NUM_WARP, PATCH_PAIR_SIZE, reduceVariables(), SH_BUF_SIZE, SLOW, slow_forces, slow_tmpforces, slow_tmpvirials, slow_virials, tmpforces, tmpvirials, USEPAIRLIST, virials, WARP_ANY, WARP_FULL_MASK, WARP_SHUFFLE, WARP_SHUFFLE_XOR, WARPSIZE, x, y, and z.
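
In the listing below the pairlist is a packed bit array with one bit per WARPSIZE x WARPSIZE tile of the patch pair: the pairlist-generating pass (GENPAIRLIST) sets a tile's bit with atomicOr when any atom pair in the tile lies within plcutoff2, and the pairlist-using pass (USEPAIRLIST) skips tiles whose bit is clear. A minimal sketch of that index arithmetic, with hypothetical helper names:

// Sketch of the tile -> pairlist-bit mapping used in the listing below.
#include <cassert>

constexpr int WARPSIZE = 32;   // mirrors the WARPSIZE macro (tile edge in atoms)

// Linear tile index for tile (blocki, blockj), where size2 is the number of
// WARPSIZE-wide tiles covering patch 2.
inline int tilePos(int blocki, int blockj, int patch2_size) {
  const int size2 = (patch2_size - 1) / WARPSIZE + 1;
  return (blockj / WARPSIZE) + (blocki / WARPSIZE) * size2;
}

// Word index and bit mask of that tile inside the packed unsigned-int pairlist.
inline int          plistWord(int pos) { return pos / 32; }
inline unsigned int plistBit (int pos) { return 1u << (pos % 32); }

int main() {
  // Example: tile (blocki=64, blockj=32) of a 100-atom patch 2 (size2 = 4).
  int pos = tilePos(64, 32, 100);          // = 1 + 2*4 = 9
  assert(plistWord(pos) == 0 && plistBit(pos) == (1u << 9));
  // Generating pass: plist[plist_start + plistWord(pos)] |= plistBit(pos)  (atomicOr).
  // Using pass:      skip the tile if (plist_val & plistBit(pos)) == 0.
  return 0;
}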

171  {
172 
173  // Local structure definitions
174  GENPAIRLIST(struct vdw_index {
175  int vdw_type;
176  int index;
177  };)
178 
179  // Shared memory
180  __shared__ patch_pair sh_patch_pair;
181 #ifndef REG_JFORCE
182  __shared__ float3 sh_jforce_2d[NUM_WARP][WARPSIZE];
183  SLOW(__shared__ float3 sh_jforce_slow_2d[NUM_WARP][WARPSIZE];)
184 #endif
185 #ifndef KEPLER_SHUFFLE
186  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
187 #endif
188  __shared__ float3 sh_iforcesum[SLOW(NUM_WARP+) NUM_WARP];
189 
190  ENERGY(
191  float totalev = 0.f;
192  float totalee = 0.f;
193  SLOW( float totales = 0.f; )
194  )
195 
196  GENPAIRLIST(int nexcluded=0;);
197 
198  {
199 #ifndef KEPLER_SHUFFLE
200  GENPAIRLIST(__shared__ atom_param sh_jap_2d[NUM_WARP][WARPSIZE];)
201  USEPAIRLIST(__shared__ int sh_jap_vdw_type_2d[NUM_WARP][WARPSIZE];)
202 #endif
203  USEPAIRLIST(__shared__ int sh_plist_ind[NUM_WARP];
204  __shared__ unsigned int sh_plist_val[NUM_WARP];);
205 
206  // Load patch_pair -data into shared memory
207  {
208  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
209 
210  if (t < 3*(SLOW(NUM_WARP+) NUM_WARP)) {
211  float *p = (float *)sh_iforcesum;
212  p[threadIdx.x] = 0.0f;
213  }
214 
215  if (t < PATCH_PAIR_SIZE) {
216  int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
217  int* dst = (int *)&sh_patch_pair;
218  dst[t] = src[t];
219  }
220  // Need to sync here to make sure sh_patch_pair is ready
221  BLOCK_SYNC;
222 
223  // Initialize pairlist index to impossible value
224  USEPAIRLIST(if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
225 
226  // Initialize pair list to "no interactions"
227  GENPAIRLIST({
228  if (t < sh_patch_pair.plist_size)
229  plist[sh_patch_pair.plist_start + t] = 0;
230  })
231 
232  // convert scaled offset with current lattice and write into shared memory
233  if (t == 0) {
234  float offx = sh_patch_pair.offset.x * lata.x
235  + sh_patch_pair.offset.y * latb.x
236  + sh_patch_pair.offset.z * latc.x;
237  float offy = sh_patch_pair.offset.x * lata.y
238  + sh_patch_pair.offset.y * latb.y
239  + sh_patch_pair.offset.z * latc.y;
240  float offz = sh_patch_pair.offset.x * lata.z
241  + sh_patch_pair.offset.y * latb.z
242  + sh_patch_pair.offset.z * latc.z;
243  sh_patch_pair.offset.x = offx;
244  sh_patch_pair.offset.y = offy;
245  sh_patch_pair.offset.z = offz;
246  }
247 
248  BLOCK_SYNC;
249  }
250 
251  // Compute pointers to shared memory to avoid point computation later on
252 #ifndef REG_JFORCE
253  volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
254  SLOW(volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
255 #endif
256 
257 #ifndef KEPLER_SHUFFLE
258  atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
259  GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
260  USEPAIRLIST(int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
261 #endif
262 
263  for (int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*NUM_WARP) {
264 
265  atom ipq;
266  GENPAIRLIST(vdw_index iap;);
267  USEPAIRLIST(int iap_vdw_type;);
268  // Load i atom data
269  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
270  int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
271  float4 tmpa = ((float4*)atoms)[i];
272  ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
273  ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
274  ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
275  ipq.charge = tmpa.w;
276  GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
277  iap.vdw_type = tmpap.x*lj_table_size;
278  iap.index = tmpap.y;);
279  USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
280  }
281 
282  // i-forces in registers
283  float3 iforce;
284  iforce.x = 0.0f;
285  iforce.y = 0.0f;
286  iforce.z = 0.0f;
287  SLOW(float3 iforce_slow;
288  iforce_slow.x = 0.0f;
289  iforce_slow.y = 0.0f;
290  iforce_slow.z = 0.0f;)
291 
292  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
293  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
294  int blockj = (diag_patch_pair) ? blocki : 0;
295  for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
296 
297  USEPAIRLIST({
298  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
299  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
300  int plist_ind = pos/32;
301  unsigned int plist_bit = 1 << (pos % 32);
302  // Check if we need to load next entry in the pairlist
303  if (plist_ind != sh_plist_ind[threadIdx.y]) {
304  sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
305  sh_plist_ind[threadIdx.y] = plist_ind;
306  }
307  if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
308  })
309 
310  // Load j atom data
311 #ifdef KEPLER_SHUFFLE
312  atom jpq;
313  GENPAIRLIST(atom_param jap;);
314  USEPAIRLIST(int jap_vdw_type;);
315 #endif
316 
317  GENPAIRLIST(
318  // Avoid calculating pairs of blocks where all atoms on both blocks are fixed
319  if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
320  int nfreej = sh_patch_pair.patch2_free_size - blockj;
321  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
322  );
323 
324  //GENPAIRLIST(bool inside_plcutoff = false;)
325  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
326  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
327  float4 tmpa = ((float4*)atoms)[j];
328 #ifdef KEPLER_SHUFFLE
329  jpq.position.x = tmpa.x;
330  jpq.position.y = tmpa.y;
331  jpq.position.z = tmpa.z;
332  jpq.charge = tmpa.w;
333 #else
334  sh_jpq[threadIdx.x].position.x = tmpa.x;
335  sh_jpq[threadIdx.x].position.y = tmpa.y;
336  sh_jpq[threadIdx.x].position.z = tmpa.z;
337  sh_jpq[threadIdx.x].charge = tmpa.w;
338 #endif
339 
340 #ifdef KEPLER_SHUFFLE
341  GENPAIRLIST(jap = atom_params[j];)
342  USEPAIRLIST(jap_vdw_type = vdw_types[j];)
343 #else
344  GENPAIRLIST(sh_jap[threadIdx.x] = atom_params[j];)
345  USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
346 #endif
347  }
348 
349  // j-forces in shared memory
350 #ifdef REG_JFORCE
351  float3 jforce;
352  jforce.x = 0.0f;
353  jforce.y = 0.0f;
354  jforce.z = 0.0f;
355  SLOW(float3 jforce_slow;
356  jforce_slow.x = 0.0f;
357  jforce_slow.y = 0.0f;
358  jforce_slow.z = 0.0f;
359  );
360 #else
361  sh_jforce[threadIdx.x].x = 0.0f;
362  sh_jforce[threadIdx.x].y = 0.0f;
363  sh_jforce[threadIdx.x].z = 0.0f;
364  SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
365  sh_jforce_slow[threadIdx.x].y = 0.0f;
366  sh_jforce_slow[threadIdx.x].z = 0.0f;)
367 #endif
368 
369  GENPAIRLIST(unsigned int excl = 0;)
370  USEPAIRLIST(
371  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
372  const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
373  unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
374  );
375  GENPAIRLIST(
376  int nloopi = sh_patch_pair.patch1_size - blocki;
377  if (nloopi > WARPSIZE) nloopi = WARPSIZE;
378  // NOTE: We must truncate nfreei to be non-negative number since we're comparing to threadIdx.x (unsigned int) later on
379  int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
380  )
381  const bool diag_tile = diag_patch_pair && (blocki == blockj);
382  // Loop through tile diagonals. Local tile indices are:
383  // i = threadIdx.x % WARPSIZE = constant
384  // j = (t + threadIdx.x) % WARPSIZE
385  const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
386  int t = (diag_tile) ? 1 : 0;
387  if (diag_tile) {
388  USEPAIRLIST(excl >>= 1;);
389 #ifdef KEPLER_SHUFFLE
390  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
391  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
392  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
393  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
394  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
395  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
396  );
397 #endif
398  }
399 
400  for (; t < WARPSIZE; ++t) {
401  USEPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl & 1)))
402  {
403  GENPAIRLIST(excl >>= 1;);
404  int j = (t + threadIdx.x) & modval;
405 #ifdef KEPLER_SHUFFLE
406  float tmpx = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.x, j, WARPSIZE) - ipq.position.x;
407  float tmpy = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.y, j, WARPSIZE) - ipq.position.y;
408  float tmpz = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.z, j, WARPSIZE) - ipq.position.z;
409  GENPAIRLIST(
410  int j_vdw_type = jap.vdw_type;
411  int j_index = jap.index;
412  int j_excl_maxdiff = jap.excl_maxdiff;
413  int j_excl_index = jap.excl_index;
414  );
415  float j_charge = jpq.charge;
416  USEPAIRLIST(
417  int j_vdw_type = jap_vdw_type;
418  );
419 #endif
420  GENPAIRLIST(if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
421  {
422 
423 #ifndef KEPLER_SHUFFLE
424  float tmpx = sh_jpq[j].position.x - ipq.position.x;
425  float tmpy = sh_jpq[j].position.y - ipq.position.y;
426  float tmpz = sh_jpq[j].position.z - ipq.position.z;
427  GENPAIRLIST(
428  int j_vdw_type = sh_jap[j].vdw_type;
429  int j_index = sh_jap[j].index;
430  int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
431  int j_excl_index = sh_jap[j].excl_index;
432  );
433  float j_charge = sh_jpq[j].charge;
434  USEPAIRLIST(
435  int j_vdw_type = sh_jap_vdw_type[j];
436  );
437 #endif
438  float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
439  GENPAIRLIST(if (r2 < plcutoff2))
440  USEPAIRLIST(if ((excl & 1) && r2 < cutoff2))
441  {
442  GENPAIRLIST(
443  bool excluded = false;
444  int indexdiff = (int)(iap.index) - j_index;
445  if ( abs(indexdiff) <= j_excl_maxdiff) {
446  indexdiff += j_excl_index;
447  int indexword = ((unsigned int) indexdiff) >> 5;
448  //indexword = tex1Dfetch(tex_exclusions, indexword);
449  if ( indexword < MAX_CONST_EXCLUSIONS )
450  indexword = const_exclusions[indexword];
451  else {
452  indexword = overflow_exclusions[indexword];
453  }
454  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
455  if (excluded) nexcluded++;
456  }
457  if (!excluded) excl |= 0x80000000;
458  )
459  GENPAIRLIST(if ( ! excluded && r2 < cutoff2))
460  {
461  ENERGY( float rsqrtfr2; );
462  float4 fi = tex1D(force_table, ENERGY(rsqrtfr2 =) rsqrtf(r2));
463  ENERGY( float4 ei = tex1D(energy_table, rsqrtfr2); );
464  GENPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap.vdw_type););
465  USEPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap_vdw_type););
466 
467  float f_slow = ipq.charge * j_charge;
468  float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
469  ENERGY(
470  float ev = ljab.x * ei.z + ljab.y * ei.y;
471  float ee = f_slow * ei.x;
472  SLOW( float es = f_slow * ei.w; )
473  )
474  SLOW( f_slow *= fi.w; )
475  ENERGY(
476  totalev += ev;
477  totalee += ee;
478  SLOW( totales += es; )
479  )
480  float fx = tmpx * f;
481  float fy = tmpy * f;
482  float fz = tmpz * f;
483  iforce.x += fx;
484  iforce.y += fy;
485  iforce.z += fz;
486 #ifdef REG_JFORCE
487  jforce.x -= fx;
488  jforce.y -= fy;
489  jforce.z -= fz;
490 #else
491  sh_jforce[j].x -= fx;
492  sh_jforce[j].y -= fy;
493  sh_jforce[j].z -= fz;
494 #endif
495  SLOW(
496  float fx_slow = tmpx * f_slow;
497  float fy_slow = tmpy * f_slow;
498  float fz_slow = tmpz * f_slow;
499  iforce_slow.x += fx_slow;
500  iforce_slow.y += fy_slow;
501  iforce_slow.z += fz_slow;
502  )
503 #ifdef REG_JFORCE
504  SLOW(
505  jforce_slow.x -= fx_slow;
506  jforce_slow.y -= fy_slow;
507  jforce_slow.z -= fz_slow;
508  )
509 #else
510  SLOW(
511  sh_jforce_slow[j].x -= fx_slow;
512  sh_jforce_slow[j].y -= fy_slow;
513  sh_jforce_slow[j].z -= fz_slow;
514  )
515 #endif
516  }
517  } // cutoff
518  } // if (j < nloopj...)
519 }
520  USEPAIRLIST(excl >>= 1;);
521 #ifdef KEPLER_SHUFFLE
522  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
523  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
524  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
525  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
526  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
527  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
528  );
529 #ifdef REG_JFORCE
530  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
531  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
532  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
533  SLOW(
534  jforce_slow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
535  jforce_slow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
536  jforce_slow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
537  );
538 #endif
539 #endif
540  } // t
541 
542  // Write j-forces
543  GENPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl != 0))) {
544  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
545  int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
546 #ifdef REG_JFORCE
547  atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
548  atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
549  atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
550  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
551  atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
552  atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
553 #else
554  atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
555  atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
556  atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
557  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
558  atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
559  atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
560 #endif
561  }
562 
563  GENPAIRLIST(
564  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
565  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
566  exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
567  if (threadIdx.x == 0) {
568  int plist_ind = pos/32;
569  unsigned int plist_bit = 1 << (pos % 32);
570  atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
571  }
572  );
573  }
574 
575  } // for (blockj)
576 
577  // Write i-forces
578  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
579  int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
580  atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
581  atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
582  atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
583  SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
584  atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
585  atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
586  }
587  // Accumulate total forces for virial (warp synchronous)
588 #ifdef KEPLER_SHUFFLE
589  for (int i=WARPSIZE/2;i >= 1;i/=2) {
590  iforce.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.x, i, WARPSIZE);
591  iforce.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.y, i, WARPSIZE);
592  iforce.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.z, i, WARPSIZE);
593  SLOW(
594  iforce_slow.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.x, i, WARPSIZE);
595  iforce_slow.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.y, i, WARPSIZE);
596  iforce_slow.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.z, i, WARPSIZE);
597  );
598  }
599  if (threadIdx.x == 0) {
600  sh_iforcesum[threadIdx.y].x += iforce.x;
601  sh_iforcesum[threadIdx.y].y += iforce.y;
602  sh_iforcesum[threadIdx.y].z += iforce.z;
603  SLOW(
604  sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
605  sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
606  sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
607  );
608  }
609 #else
610  sh_jforce[threadIdx.x].x = iforce.x;
611  sh_jforce[threadIdx.x].y = iforce.y;
612  sh_jforce[threadIdx.x].z = iforce.z;
613  SLOW(
614  sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
615  sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
616  sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
617  );
618  for (int d=1;d < WARPSIZE;d*=2) {
619  int pos = threadIdx.x + d;
620  float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
621  float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
622  float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
623  SLOW(
624  float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
625  float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
626  float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
627  );
628  sh_jforce[threadIdx.x].x += valx;
629  sh_jforce[threadIdx.x].y += valy;
630  sh_jforce[threadIdx.x].z += valz;
631  SLOW(
632  sh_jforce_slow[threadIdx.x].x += slow_valx;
633  sh_jforce_slow[threadIdx.x].y += slow_valy;
634  sh_jforce_slow[threadIdx.x].z += slow_valz;
635  );
636  }
637  if (threadIdx.x == 0) {
638  sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
639  sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
640  sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
641  SLOW(
642  sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
643  sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
644  sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
645  );
646  }
647 #endif
648 
649  } // for (blocki)
650 
651  }
652 
653  {
654 
655 #ifdef REG_JFORCE
656 #undef SH_BUF_SIZE
657 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
658  __shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
659 #else // ! REG_JFORCE
660 #undef SH_BUF_SIZE
661 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
662  volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
663  // Sync here to make sure we can write into shared memory (sh_jforce_2d)
664  BLOCK_SYNC;
665 #endif
666 
667 #if ENERGY(1+)0
668  NAME(reduceVariables)<float, SLOW(1+)2, SH_BUF_SIZE>(sh_buf, &tmpvirials[sh_patch_pair.patch1_ind*16 + 9], totalev, totalee, SLOW(totales+)0.0f);
669 #endif
670 
671 #if GENPAIRLIST(1+)0
673  NAME(reduceVariables)<int, 1, SH_BUF_SIZE>((int *)sh_buf, (int *)&tmpvirials[sh_patch_pair.patch1_ind*16 + 12], nexcluded, 0, 0);
674 #endif
675 
676  // Virials
677  BLOCK_SYNC;
678  if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
679  float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
680  float iforcesum = 0.0f;
681 #pragma unroll
682  for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
683  float vx = iforcesum*sh_patch_pair.offset.x;
684  float vy = iforcesum*sh_patch_pair.offset.y;
685  float vz = iforcesum*sh_patch_pair.offset.z;
686  sh_iforcesum[threadIdx.x].x = vx;
687  sh_iforcesum[threadIdx.x].y = vy;
688  sh_iforcesum[threadIdx.x].z = vz;
689  }
690  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
691  // virials are in sh_virials[0...8] and slow virials in sh_virials[9...17]
692  float* sh_virials = (float *)sh_iforcesum;
693  int patch1_ind = sh_patch_pair.patch1_ind;
694  float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
695  atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
696  }
697 
698  // Make sure forces are up-to-date in device global memory
699  __threadfence();
700  BLOCK_SYNC;
701 
702  // Mark patch pair (patch1_ind, patch2_ind) as "done"
703  int patch1_ind = sh_patch_pair.patch1_ind;
704  int patch2_ind = sh_patch_pair.patch2_ind;
705  if (threadIdx.x == 0 && threadIdx.y == 0) {
706  sh_patch_pair.patch_done[0] = false;
707  sh_patch_pair.patch_done[1] = false;
708  //
709  // global_counters[0]: force_ready_queue
710  // global_counters[1]: block_order
711  // global_counters[2...npatches+1]: number of pairs finished for patch i+2
712  //
713  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
714  int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
715  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
716  if (patch1_ind != patch2_ind) {
717  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
718  int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
719  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
720  }
721  }
722  // sync threads so that patch1_done and patch2_done are visible to all threads
723  BLOCK_SYNC;
724 
725  if (sh_patch_pair.patch_done[0]) {
726 
727 // #ifndef REG_JFORCE
728 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
729 // #endif
730 #ifndef KEPLER_SHUFFLE
731  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
732  volatile float* sh_slow_buf = NULL;
733  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
734 #endif
735  NAME(finish_forces_virials)(sh_patch_pair.patch1_start, sh_patch_pair.patch1_size,
736  patch1_ind, atoms, sh_buf,
737 #ifndef KEPLER_SHUFFLE
738  sh_slow_buf, sh_vcc,
739 #endif
740  tmpforces, slow_tmpforces, forces, slow_forces,
741  tmpvirials, slow_tmpvirials, virials, slow_virials);
742 
743  }
744 
745  if (sh_patch_pair.patch_done[1]) {
746 // #ifndef REG_JFORCE
747 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
748 // #endif
749 #ifndef KEPLER_SHUFFLE
750  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
751  volatile float* sh_slow_buf = NULL;
752  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
753 #endif
754  NAME(finish_forces_virials)(sh_patch_pair.patch2_start, sh_patch_pair.patch2_size,
755  patch2_ind, atoms, sh_buf,
756 #ifndef KEPLER_SHUFFLE
757  sh_slow_buf, sh_vcc,
758 #endif
759  tmpforces, slow_tmpforces, forces, slow_forces,
760  tmpvirials, slow_tmpvirials, virials, slow_virials);
761  }
762 
763  if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
764  // Make sure page-locked host forces are up-to-date
765 #if __CUDA_ARCH__ < 200
766  __threadfence();
767 #else
768  __threadfence_system();
769 #endif
770  BLOCK_SYNC;
771  // Add patch into "force_ready_queue"
772  if (threadIdx.x == 0 && threadIdx.y == 0) {
773  if (sh_patch_pair.patch_done[0]) {
774  int ind = atomicInc(&global_counters[0], npatches-1);
775  force_ready_queue[ind] = patch1_ind;
776  }
777  if (sh_patch_pair.patch_done[1]) {
778  int ind = atomicInc(&global_counters[0], npatches-1);
779  force_ready_queue[ind] = patch2_ind;
780  }
781  // Make sure "force_ready_queue" is visible in page-locked host memory
782 #if __CUDA_ARCH__ < 200
783  __threadfence();
784 #else
785  __threadfence_system();
786 #endif
787  }
788  }
789 
790  if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
791  int old = atomicInc(&global_counters[1], total_block_count-1);
792  block_order[old] = block_begin + blockIdx.x;
793  }
794  }
795 
796 }
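
At the end of the kernel each patch pair counts itself finished with atomicInc(&global_counters[patch_ind+2], num_pairs-1): atomicInc returns the previous value and wraps the counter back to zero once it reaches its cap, so the block that observes old+1 == num_pairs is the last contributor; it marks the patch done and (via a second counter) pushes the patch index onto force_ready_queue. The sketch below isolates that last-contributor pattern; the kernel and variable names are illustrative, and the queue slot here is taken with atomicAdd instead of the wrapping atomicInc used above.

// Sketch of "last contributor marks completion" via atomicInc with wrap.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void contribute(unsigned int *counter, int *ready_queue,
                           unsigned int *queue_len, unsigned int num_parts,
                           int item) {
  // ... each block would do its share of the work for 'item' here ...
  if (threadIdx.x == 0) {
    // atomicInc returns the old value and wraps to 0 when old == num_parts-1,
    // so the counter is automatically reset for the next round.
    unsigned int old = atomicInc(counter, num_parts - 1);
    if (old + 1 == num_parts) {
      // This block finished the item last: publish it to the ready queue.
      unsigned int slot = atomicAdd(queue_len, 1u);
      ready_queue[slot] = item;
    }
  }
}

int main() {
  unsigned int *d_counter, *d_qlen, h_qlen = 0;
  int *d_queue, h_queue[1];
  cudaMalloc(&d_counter, sizeof(unsigned int));
  cudaMalloc(&d_qlen, sizeof(unsigned int));
  cudaMalloc(&d_queue, sizeof(int));
  cudaMemset(d_counter, 0, sizeof(unsigned int));
  cudaMemset(d_qlen, 0, sizeof(unsigned int));

  const unsigned int num_parts = 16;   // 16 blocks contribute to item 7
  contribute<<<num_parts, 32>>>(d_counter, d_queue, d_qlen, num_parts, 7);

  cudaMemcpy(&h_qlen, d_qlen, sizeof(unsigned int), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_queue, d_queue, sizeof(int), cudaMemcpyDeviceToHost);
  printf("queued %u item(s); first = %d (expect 1 and 7)\n", h_qlen, h_queue[0]);

  cudaFree(d_counter); cudaFree(d_qlen); cudaFree(d_queue);
  return 0;
}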