NAMD
ComputeNonbondedCUDAKernelBase.h File Reference

Go to the source code of this file.

Macros

#define NAME(X)   SLOWNAME( X )
 
#define SLOW(X)
 
#define SLOWNAME(X)   ENERGYNAME( X )
 
#define ENERGY(X)
 
#define ENERGYNAME(X)   PAIRLISTNAME( X )
 
#define GENPAIRLIST(X)
 
#define USEPAIRLIST(X)   X
 
#define PAIRLISTNAME(X)   LAST( X )
 
#define LAST(X)   X
 
#define SH_BUF_SIZE   NUM_WARP*WARPSIZE*3*sizeof(float)
 

Functions

__device__ static __forceinline__ void NAME() finish_forces_virials (const int start, const int size, const int patch_ind, const atom *atoms, volatile float *sh_buf, volatile float *sh_slow_buf, volatile float *sh_vcc, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials)
 
template<typename T , int n, int sh_buf_size>
__device__ __forceinline__ void NAME() reduceVariables (volatile T *sh_buf, T *dst, T val1, T val2, T val3)
 
static __global__ void (const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const int *vdw_types, unsigned int *plist, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials, unsigned int *global_counters, int *force_ready_queue, const unsigned int *overflow_exclusions, const int npatches, const int block_begin, const int total_block_count, int *block_order, exclmask *exclmasks, const int lj_table_size, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const float plcutoff2, const int doSlow)
 

Macro Definition Documentation

#define ENERGY(X)

Definition at line 25 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define ENERGYNAME(X)   PAIRLISTNAME( X )

Definition at line 26 of file ComputeNonbondedCUDAKernelBase.h.

#define GENPAIRLIST(X)

Definition at line 37 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define LAST(X)   X

Definition at line 42 of file ComputeNonbondedCUDAKernelBase.h.

#define NAME(X)   SLOWNAME( X )

Definition at line 7 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define PAIRLISTNAME(X)   LAST( X )

Definition at line 39 of file ComputeNonbondedCUDAKernelBase.h.

#define SH_BUF_SIZE   NUM_WARP*WARPSIZE*3*sizeof(float)

Referenced by void().

#define SLOW(X)

Definition at line 15 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().

#define SLOWNAME(X)   ENERGYNAME( X )

Definition at line 16 of file ComputeNonbondedCUDAKernelBase.h.

#define USEPAIRLIST(X)   X

Definition at line 38 of file ComputeNonbondedCUDAKernelBase.h.

Referenced by void().
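
Taken together, NAME, SLOWNAME, ENERGYNAME, PAIRLISTNAME and LAST form a name-composition chain: with the pass-through definitions documented above, NAME(X) expands to plain X, while the translation unit that includes this header can redefine individual stages (together with the feature macros SLOW, ENERGY, GENPAIRLIST and USEPAIRLIST) so that the same kernel body is compiled several times under different names and feature sets. The sketch below only illustrates the idiom; the kernel name dev_kernel, the _energy suffix and the redefinition site are assumptions, not the actual NAMD definitions.

// Hypothetical illustration of the name/feature macro idiom; not NAMD code.
// Pass-through configuration (as documented on this page): the feature macro
// drops its argument and NAME(X) expands to plain X.
#define ENERGY(X)
#define LAST(X)         X
#define PAIRLISTNAME(X) LAST( X )
#define ENERGYNAME(X)   PAIRLISTNAME( X )
#define SLOWNAME(X)     ENERGYNAME( X )
#define NAME(X)         SLOWNAME( X )

__global__ void NAME(dev_kernel)(float *out) {   // compiles as dev_kernel
  ENERGY(if (out) out[0] = 0.f;)                 // dropped in this build
}

// Assumed energy-enabled configuration: redefine one feature macro to keep
// its argument and the matching name macro to paste a suffix, then compile
// the same kernel body again.
#undef ENERGY
#undef ENERGYNAME
#define ENERGY(X)     X
#define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )

__global__ void NAME(dev_kernel)(float *out) {   // compiles as dev_kernel_energy
  ENERGY(if (out) out[0] = 0.f;)                 // kept in this build
}

This is why the kernel listing further down wraps feature-specific code in ENERGY(...), SLOW(...), GENPAIRLIST(...) and USEPAIRLIST(...) blocks.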

Function Documentation

__device__ static __forceinline__ void NAME() finish_forces_virials ( const int  start,
const int  size,
const int  patch_ind,
const atom *  atoms,
volatile float *  sh_buf,
volatile float *  sh_slow_buf,
volatile float *  sh_vcc,
float4 *  tmpforces,
float4 *  slow_tmpforces,
float4 *  forces,
float4 *  slow_forces,
float *  tmpvirials,
float *  slow_tmpvirials,
float *  virials,
float *  slow_virials 
)
static

Referenced by void().

template<typename T , int n, int sh_buf_size>
__device__ __forceinline__ void NAME() reduceVariables ( volatile T *  sh_buf,
T *  dst,
T  val1,
T  val2,
T  val3 
)

Definition at line 93 of file ComputeNonbondedCUDAKernelBase.h.

References BLOCK_SYNC, cuda_static_assert, NUM_WARP, WARP_FULL_MASK, WARP_SHUFFLE_XOR, and WARPSIZE.

Referenced by void().

93  {
94  // Sanity check
95  cuda_static_assert(n > 0 && n <= NUM_WARP);
96 #ifdef KEPLER_SHUFFLE
97  // Requires NUM_WARP*n*sizeof(float) shared memory
98  cuda_static_assert(sh_buf_size >= NUM_WARP*n*sizeof(T));
99  // Reduce within warp
100  for (int i=WARPSIZE/2;i >= 1;i/=2) {
101  if (n >= 1) val1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val1, i, WARPSIZE);
102  if (n >= 2) val2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val2, i, WARPSIZE);
103  if (n >= 3) val3 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val3, i, WARPSIZE);
104  }
105  if (threadIdx.x == 0) {
106  if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
107  if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
108  if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
109  }
110  BLOCK_SYNC;
111  if (threadIdx.x < n && threadIdx.y == 0) {
112  T finalval = (T)0;
113 #pragma unroll
114  for (int i=0;i < NUM_WARP;++i) {
115  finalval += sh_buf[i*n + threadIdx.x];
116  }
117  atomicAdd(&dst[threadIdx.x], finalval);
118  }
119 #else // ! KEPLER_SHUFFLE
120  // Requires NUM_WARP*n*WARPSIZE*sizeof(float) shared memory
121  cuda_static_assert(sh_buf_size >= NUM_WARP*n*WARPSIZE*sizeof(T));
122  volatile T* sh_bufy = &sh_buf[threadIdx.y*n*WARPSIZE];
123  if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
124  if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
125  if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
126  // Reduce within warp
127  for (int d=1;d < WARPSIZE;d*=2) {
128  int pos = threadIdx.x + d;
129  T val1t, val2t, val3t;
130  if (n >= 1) val1t = (pos < WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
131  if (n >= 2) val2t = (pos < WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
132  if (n >= 3) val3t = (pos < WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
133  if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
134  if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
135  if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
136  }
137  BLOCK_SYNC;
138  if (threadIdx.x < n && threadIdx.y == 0) {
139  T finalval = (T)0;
140 #pragma unroll
141  for (int i=0;i < NUM_WARP;++i) {
142  finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
143  }
144  atomicAdd(&dst[threadIdx.x], finalval);
145  }
146 #endif // KEPLER_SHUFFLE
147 }
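
reduceVariables combines a warp-level butterfly reduction (the XOR-shuffle loop above) with a cross-warp pass through shared memory and a final atomicAdd into the destination array. The standalone kernel below reproduces that pattern for a single float per thread; it is a minimal sketch for illustration, and the kernel name, buffers and launch shape are assumptions rather than NAMD code.

// Sketch of the shuffle-based block reduction used by reduceVariables.
#include <cstdio>
#include <cuda_runtime.h>

#define NUM_WARP 4
#define WARPSIZE 32

__global__ void blockSumKernel(const float *in, float *out) {
  __shared__ float sh_warp_sum[NUM_WARP];
  const int tid = threadIdx.y * WARPSIZE + threadIdx.x;
  float val = in[blockIdx.x * NUM_WARP * WARPSIZE + tid];

  // Reduce within the warp: after the butterfly every lane holds the warp sum.
  for (int i = WARPSIZE / 2; i >= 1; i /= 2)
    val += __shfl_xor_sync(0xffffffffu, val, i, WARPSIZE);

  // Lane 0 of each warp publishes its warp total to shared memory.
  if (threadIdx.x == 0) sh_warp_sum[threadIdx.y] = val;
  __syncthreads();

  // One thread combines the warp totals and accumulates into global memory.
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    float total = 0.0f;
    for (int i = 0; i < NUM_WARP; ++i) total += sh_warp_sum[i];
    atomicAdd(out, total);
  }
}

int main() {
  const int nblocks = 8;
  const int n = nblocks * NUM_WARP * WARPSIZE;
  float *h_in = new float[n];
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;   // expected sum == n

  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemset(d_out, 0, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  blockSumKernel<<<nblocks, dim3(WARPSIZE, NUM_WARP)>>>(d_in, d_out);

  float h_out = 0.0f;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %g (expected %d)\n", h_out, n);

  cudaFree(d_in); cudaFree(d_out); delete[] h_in;
  return 0;
}

The documented function generalizes this to up to three values per thread (val1..val3) and, in the non-shuffle branch, falls back to a purely shared-memory reduction whose capacity is checked against the sh_buf_size template parameter.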
static __global__ void ( const patch_pair *  patch_pairs,
const atom *  atoms,
const atom_param *  atom_params,
const int *  vdw_types,
unsigned int *  plist,
float4 *  tmpforces,
float4 *  slow_tmpforces,
float4 *  forces,
float4 *  slow_forces,
float *  tmpvirials,
float *  slow_tmpvirials,
float *  virials,
float *  slow_virials,
unsigned int *  global_counters,
int *  force_ready_queue,
const unsigned int *  overflow_exclusions,
const int  npatches,
const int  block_begin,
const int  total_block_count,
int *  block_order,
exclmask *  exclmasks,
const int  lj_table_size,
const float3  lata,
const float3  latb,
const float3  latc,
const float  cutoff2,
const float  plcutoff2,
const int  doSlow 
)
static

Definition at line 158 of file ComputeNonbondedCUDAKernelBase.h.

The kernel's identifier is generated by the NAME() macro chain, which is why the definition appears here without an explicit name and why cross-references on this page list it simply as void().

References atoms, BLOCK_SYNC, const_exclusions, ENERGY, energy_table, finish_forces_virials(), force_table, forces, GENPAIRLIST, if(), lj_table, lj_table_size, MAX_CONST_EXCLUSIONS, NAME, NUM_WARP, PATCH_PAIR_SIZE, reduceVariables(), SH_BUF_SIZE, SLOW, slow_forces, slow_tmpforces, slow_tmpvirials, slow_virials, tmpforces, tmpvirials, USEPAIRLIST, virials, WARP_ANY, WARP_FULL_MASK, WARP_SHUFFLE, WARP_SHUFFLE_XOR, WARPSIZE, x, y, and z.
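
In the listing below the pairlist is a packed bit array with one bit per WARPSIZE x WARPSIZE tile of the patch pair: the pairlist-generating pass (GENPAIRLIST) sets a tile's bit with atomicOr when any atom pair in the tile lies within plcutoff2, and the pairlist-using pass (USEPAIRLIST) skips tiles whose bit is clear. A minimal sketch of that index arithmetic, with hypothetical helper names:

// Sketch of the tile -> pairlist-bit mapping used in the listing below.
#include <cassert>

constexpr int WARPSIZE = 32;   // mirrors the WARPSIZE macro (tile edge in atoms)

// Linear tile index for tile (blocki, blockj), where size2 is the number of
// WARPSIZE-wide tiles covering patch 2.
inline int tilePos(int blocki, int blockj, int patch2_size) {
  const int size2 = (patch2_size - 1) / WARPSIZE + 1;
  return (blockj / WARPSIZE) + (blocki / WARPSIZE) * size2;
}

// Word index and bit mask of that tile inside the packed unsigned-int pairlist.
inline int          plistWord(int pos) { return pos / 32; }
inline unsigned int plistBit (int pos) { return 1u << (pos % 32); }

int main() {
  // Example: tile (blocki=64, blockj=32) of a 100-atom patch 2 (size2 = 4).
  int pos = tilePos(64, 32, 100);          // = 1 + 2*4 = 9
  assert(plistWord(pos) == 0 && plistBit(pos) == (1u << 9));
  // Generating pass: plist[plist_start + plistWord(pos)] |= plistBit(pos)  (atomicOr).
  // Using pass:      skip the tile if (plist_val & plistBit(pos)) == 0.
  return 0;
}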

171  {
172 
173  // Local structure definitions
174  GENPAIRLIST(struct vdw_index {
175  int vdw_type;
176  int index;
177  };)
178 
179  // Shared memory
180  __shared__ patch_pair sh_patch_pair;
181 #ifndef REG_JFORCE
182  __shared__ float3 sh_jforce_2d[NUM_WARP][WARPSIZE];
183  SLOW(__shared__ float3 sh_jforce_slow_2d[NUM_WARP][WARPSIZE];)
184 #endif
185 #ifndef KEPLER_SHUFFLE
186  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
187 #endif
188  __shared__ float3 sh_iforcesum[SLOW(NUM_WARP+) NUM_WARP];
189 
190  ENERGY(
191  float totalev = 0.f;
192  float totalee = 0.f;
193  SLOW( float totales = 0.f; )
194  )
195 
196  GENPAIRLIST(int nexcluded=0;);
197 
198  {
199 #ifndef KEPLER_SHUFFLE
200  GENPAIRLIST(__shared__ atom_param sh_jap_2d[NUM_WARP][WARPSIZE];)
201  USEPAIRLIST(__shared__ int sh_jap_vdw_type_2d[NUM_WARP][WARPSIZE];)
202 #endif
203  USEPAIRLIST(__shared__ int sh_plist_ind[NUM_WARP];
204  __shared__ unsigned int sh_plist_val[NUM_WARP];);
205 
206  // Load patch_pair -data into shared memory
207  {
208  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
209 
210  if (t < 3*(SLOW(NUM_WARP+) NUM_WARP)) {
211  float *p = (float *)sh_iforcesum;
212  p[threadIdx.x] = 0.0f;
213  }
214 
215  if (t < PATCH_PAIR_SIZE) {
216  int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
217  int* dst = (int *)&sh_patch_pair;
218  dst[t] = src[t];
219  }
220  // Need to sync here to make sure sh_patch_pair is ready
221  BLOCK_SYNC;
222 
223  // Initialize pairlist index to impossible value
224  USEPAIRLIST(if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
225 
226  // Initialize pair list to "no interactions"
227  GENPAIRLIST({
228  if (t < sh_patch_pair.plist_size)
229  plist[sh_patch_pair.plist_start + t] = 0;
230  })
231 
232  // convert scaled offset with current lattice and write into shared memory
233  if (t == 0) {
234  float offx = sh_patch_pair.offset.x * lata.x
235  + sh_patch_pair.offset.y * latb.x
236  + sh_patch_pair.offset.z * latc.x;
237  float offy = sh_patch_pair.offset.x * lata.y
238  + sh_patch_pair.offset.y * latb.y
239  + sh_patch_pair.offset.z * latc.y;
240  float offz = sh_patch_pair.offset.x * lata.z
241  + sh_patch_pair.offset.y * latb.z
242  + sh_patch_pair.offset.z * latc.z;
243  sh_patch_pair.offset.x = offx;
244  sh_patch_pair.offset.y = offy;
245  sh_patch_pair.offset.z = offz;
246  }
247 
248  BLOCK_SYNC;
249  }
250 
251  // Compute pointers to shared memory to avoid point computation later on
252 #ifndef REG_JFORCE
253  volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
254  SLOW(volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
255 #endif
256 
257 #ifndef KEPLER_SHUFFLE
258  atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
259  GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
260  USEPAIRLIST(int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
261 #endif
262 
263  for (int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*NUM_WARP) {
264 
265  atom ipq;
266  GENPAIRLIST(vdw_index iap;);
267  USEPAIRLIST(int iap_vdw_type;);
268  // Load i atom data
269  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
270  int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
271  float4 tmpa = ((float4*)atoms)[i];
272  ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
273  ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
274  ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
275  ipq.charge = tmpa.w;
276  GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
277  iap.vdw_type = tmpap.x*lj_table_size;
278  iap.index = tmpap.y;);
279  USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
280  }
281 
282  // i-forces in registers
283  float3 iforce;
284  iforce.x = 0.0f;
285  iforce.y = 0.0f;
286  iforce.z = 0.0f;
287  SLOW(float3 iforce_slow;
288  iforce_slow.x = 0.0f;
289  iforce_slow.y = 0.0f;
290  iforce_slow.z = 0.0f;)
291 
292  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
293  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
294  int blockj = (diag_patch_pair) ? blocki : 0;
295  for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
296 
297  USEPAIRLIST({
298  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
299  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
300  int plist_ind = pos/32;
301  unsigned int plist_bit = 1 << (pos % 32);
302  // Check if we need to load next entry in the pairlist
303  if (plist_ind != sh_plist_ind[threadIdx.y]) {
304  sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
305  sh_plist_ind[threadIdx.y] = plist_ind;
306  }
307  if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
308  })
309 
310  // Load j atom data
311 #ifdef KEPLER_SHUFFLE
312  atom jpq;
313  GENPAIRLIST(atom_param jap;);
314  USEPAIRLIST(int jap_vdw_type;);
315 #endif
316 
317  GENPAIRLIST(
318  // Avoid calculating pairs of blocks where all atoms on both blocks are fixed
319  if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
320  int nfreej = sh_patch_pair.patch2_free_size - blockj;
321  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
322  );
323 
324  //GENPAIRLIST(bool inside_plcutoff = false;)
325  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
326  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
327  float4 tmpa = ((float4*)atoms)[j];
328 #ifdef KEPLER_SHUFFLE
329  jpq.position.x = tmpa.x;
330  jpq.position.y = tmpa.y;
331  jpq.position.z = tmpa.z;
332  jpq.charge = tmpa.w;
333 #else
334  sh_jpq[threadIdx.x].position.x = tmpa.x;
335  sh_jpq[threadIdx.x].position.y = tmpa.y;
336  sh_jpq[threadIdx.x].position.z = tmpa.z;
337  sh_jpq[threadIdx.x].charge = tmpa.w;
338 #endif
339 
340 #ifdef KEPLER_SHUFFLE
341  GENPAIRLIST(jap = atom_params[j];)
342  USEPAIRLIST(jap_vdw_type = vdw_types[j];)
343 #else
344  GENPAIRLIST(sh_jap[threadIdx.x] = atom_params[j];)
345  USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
346 #endif
347  }
348 
349  // j-forces in shared memory
350 #ifdef REG_JFORCE
351  float3 jforce;
352  jforce.x = 0.0f;
353  jforce.y = 0.0f;
354  jforce.z = 0.0f;
355  SLOW(float3 jforce_slow;
356  jforce_slow.x = 0.0f;
357  jforce_slow.y = 0.0f;
358  jforce_slow.z = 0.0f;
359  );
360 #else
361  sh_jforce[threadIdx.x].x = 0.0f;
362  sh_jforce[threadIdx.x].y = 0.0f;
363  sh_jforce[threadIdx.x].z = 0.0f;
364  SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
365  sh_jforce_slow[threadIdx.x].y = 0.0f;
366  sh_jforce_slow[threadIdx.x].z = 0.0f;)
367 #endif
368 
369  GENPAIRLIST(unsigned int excl = 0;)
370  USEPAIRLIST(
371  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
372  const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
373  unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
374  );
375  GENPAIRLIST(
376  int nloopi = sh_patch_pair.patch1_size - blocki;
377  if (nloopi > WARPSIZE) nloopi = WARPSIZE;
378  // NOTE: We must truncate nfreei to be non-negative number since we're comparing to threadIdx.x (unsigned int) later on
379  int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
380  )
381  const bool diag_tile = diag_patch_pair && (blocki == blockj);
382  // Loop through tile diagonals. Local tile indices are:
383  // i = threadIdx.x % WARPSIZE = constant
384  // j = (t + threadIdx.x) % WARPSIZE
385  const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
386  int t = (diag_tile) ? 1 : 0;
387  if (diag_tile) {
388  USEPAIRLIST(excl >>= 1;);
389 #ifdef KEPLER_SHUFFLE
390  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
391  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
392  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
393  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
394  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
395  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
396  );
397 #endif
398  }
399 
400  for (; t < WARPSIZE; ++t) {
401  USEPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl & 1)))
402  {
403  GENPAIRLIST(excl >>= 1;);
404  int j = (t + threadIdx.x) & modval;
405 #ifdef KEPLER_SHUFFLE
406  float tmpx = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.x, j, WARPSIZE) - ipq.position.x;
407  float tmpy = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.y, j, WARPSIZE) - ipq.position.y;
408  float tmpz = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.z, j, WARPSIZE) - ipq.position.z;
409  GENPAIRLIST(
410  int j_vdw_type = jap.vdw_type;
411  int j_index = jap.index;
412  int j_excl_maxdiff = jap.excl_maxdiff;
413  int j_excl_index = jap.excl_index;
414  );
415  float j_charge = jpq.charge;
416  USEPAIRLIST(
417  int j_vdw_type = jap_vdw_type;
418  );
419 #endif
420  GENPAIRLIST(if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
421  {
422 
423 #ifndef KEPLER_SHUFFLE
424  float tmpx = sh_jpq[j].position.x - ipq.position.x;
425  float tmpy = sh_jpq[j].position.y - ipq.position.y;
426  float tmpz = sh_jpq[j].position.z - ipq.position.z;
427  GENPAIRLIST(
428  int j_vdw_type = sh_jap[j].vdw_type;
429  int j_index = sh_jap[j].index;
430  int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
431  int j_excl_index = sh_jap[j].excl_index;
432  );
433  float j_charge = sh_jpq[j].charge;
434  USEPAIRLIST(
435  int j_vdw_type = sh_jap_vdw_type[j];
436  );
437 #endif
438  float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
439  GENPAIRLIST(if (r2 < plcutoff2))
440  USEPAIRLIST(if ((excl & 1) && r2 < cutoff2))
441  {
442  GENPAIRLIST(
443  bool excluded = false;
444  int indexdiff = (int)(iap.index) - j_index;
445  if ( abs(indexdiff) <= j_excl_maxdiff) {
446  indexdiff += j_excl_index;
447  int indexword = ((unsigned int) indexdiff) >> 5;
448  //indexword = tex1Dfetch(tex_exclusions, indexword);
449  if ( indexword < MAX_CONST_EXCLUSIONS )
450  indexword = const_exclusions[indexword];
451  else {
452  indexword = overflow_exclusions[indexword];
453  }
454  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
455  if (excluded) nexcluded++;
456  }
457  if (!excluded) excl |= 0x80000000;
458  )
459  GENPAIRLIST(if ( ! excluded && r2 < cutoff2))
460  {
461  ENERGY( float rsqrtfr2; );
462  float4 fi = tex1D(force_table, ENERGY(rsqrtfr2 =) rsqrtf(r2));
463  ENERGY( float4 ei = tex1D(energy_table, rsqrtfr2); );
464  GENPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap.vdw_type););
465  USEPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap_vdw_type););
466 
467  float f_slow = ipq.charge * j_charge;
468  float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
469  ENERGY(
470  float ev = ljab.x * ei.z + ljab.y * ei.y;
471  float ee = f_slow * ei.x;
472  SLOW( float es = f_slow * ei.w; )
473  )
474  SLOW( f_slow *= fi.w; )
475  ENERGY(
476  totalev += ev;
477  totalee += ee;
478  SLOW( totales += es; )
479  )
480  float fx = tmpx * f;
481  float fy = tmpy * f;
482  float fz = tmpz * f;
483  iforce.x += fx;
484  iforce.y += fy;
485  iforce.z += fz;
486 #ifdef REG_JFORCE
487  jforce.x -= fx;
488  jforce.y -= fy;
489  jforce.z -= fz;
490 #else
491  sh_jforce[j].x -= fx;
492  sh_jforce[j].y -= fy;
493  sh_jforce[j].z -= fz;
494 #endif
495  SLOW(
496  float fx_slow = tmpx * f_slow;
497  float fy_slow = tmpy * f_slow;
498  float fz_slow = tmpz * f_slow;
499  iforce_slow.x += fx_slow;
500  iforce_slow.y += fy_slow;
501  iforce_slow.z += fz_slow;
502  )
503 #ifdef REG_JFORCE
504  SLOW(
505  jforce_slow.x -= fx_slow;
506  jforce_slow.y -= fy_slow;
507  jforce_slow.z -= fz_slow;
508  )
509 #else
510  SLOW(
511  sh_jforce_slow[j].x -= fx_slow;
512  sh_jforce_slow[j].y -= fy_slow;
513  sh_jforce_slow[j].z -= fz_slow;
514  )
515 #endif
516  }
517  } // cutoff
518  } // if (j < nloopj...)
519 }
520  USEPAIRLIST(excl >>= 1;);
521 #ifdef KEPLER_SHUFFLE
522  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
523  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
524  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
525  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
526  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
527  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
528  );
529 #ifdef REG_JFORCE
530  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
531  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
532  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
533  SLOW(
534  jforce_slow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
535  jforce_slow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
536  jforce_slow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
537  );
538 #endif
539 #endif
540  } // t
541 
542  // Write j-forces
543  GENPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl != 0))) {
544  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
545  int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
546 #ifdef REG_JFORCE
547  atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
548  atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
549  atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
550  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
551  atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
552  atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
553 #else
554  atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
555  atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
556  atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
557  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
558  atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
559  atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
560 #endif
561  }
562 
563  GENPAIRLIST(
564  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
565  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
566  exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
567  if (threadIdx.x == 0) {
568  int plist_ind = pos/32;
569  unsigned int plist_bit = 1 << (pos % 32);
570  atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
571  }
572  );
573  }
574 
575  } // for (blockj)
576 
577  // Write i-forces
578  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
579  int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
580  atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
581  atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
582  atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
583  SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
584  atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
585  atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
586  }
587  // Accumulate total forces for virial (warp synchronous)
588 #ifdef KEPLER_SHUFFLE
589  for (int i=WARPSIZE/2;i >= 1;i/=2) {
590  iforce.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.x, i, WARPSIZE);
591  iforce.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.y, i, WARPSIZE);
592  iforce.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.z, i, WARPSIZE);
593  SLOW(
594  iforce_slow.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.x, i, WARPSIZE);
595  iforce_slow.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.y, i, WARPSIZE);
596  iforce_slow.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.z, i, WARPSIZE);
597  );
598  }
599  if (threadIdx.x == 0) {
600  sh_iforcesum[threadIdx.y].x += iforce.x;
601  sh_iforcesum[threadIdx.y].y += iforce.y;
602  sh_iforcesum[threadIdx.y].z += iforce.z;
603  SLOW(
604  sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
605  sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
606  sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
607  );
608  }
609 #else
610  sh_jforce[threadIdx.x].x = iforce.x;
611  sh_jforce[threadIdx.x].y = iforce.y;
612  sh_jforce[threadIdx.x].z = iforce.z;
613  SLOW(
614  sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
615  sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
616  sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
617  );
618  for (int d=1;d < WARPSIZE;d*=2) {
619  int pos = threadIdx.x + d;
620  float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
621  float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
622  float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
623  SLOW(
624  float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
625  float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
626  float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
627  );
628  sh_jforce[threadIdx.x].x += valx;
629  sh_jforce[threadIdx.x].y += valy;
630  sh_jforce[threadIdx.x].z += valz;
631  SLOW(
632  sh_jforce_slow[threadIdx.x].x += slow_valx;
633  sh_jforce_slow[threadIdx.x].y += slow_valy;
634  sh_jforce_slow[threadIdx.x].z += slow_valz;
635  );
636  }
637  if (threadIdx.x == 0) {
638  sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
639  sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
640  sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
641  SLOW(
642  sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
643  sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
644  sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
645  );
646  }
647 #endif
648 
649  } // for (blocki)
650 
651  }
652 
653  {
654 
655 #ifdef REG_JFORCE
656 #undef SH_BUF_SIZE
657 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
658  __shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
659 #else // ! REG_JFORCE
660 #undef SH_BUF_SIZE
661 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
662  volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
663  // Sync here to make sure we can write into shared memory (sh_jforce_2d)
664  BLOCK_SYNC;
665 #endif
666 
667 #if ENERGY(1+)0
668  NAME(reduceVariables)<float, SLOW(1+)2, SH_BUF_SIZE>(sh_buf, &tmpvirials[sh_patch_pair.patch1_ind*16 + 9], totalev, totalee, SLOW(totales+)0.0f);
669 #endif
670 
671 #if GENPAIRLIST(1+)0
673  NAME(reduceVariables)<int, 1, SH_BUF_SIZE>((int *)sh_buf, (int *)&tmpvirials[sh_patch_pair.patch1_ind*16 + 12], nexcluded, 0, 0);
674 #endif
675 
676  // Virials
677  BLOCK_SYNC;
678  if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
679  float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
680  float iforcesum = 0.0f;
681 #pragma unroll
682  for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
683  float vx = iforcesum*sh_patch_pair.offset.x;
684  float vy = iforcesum*sh_patch_pair.offset.y;
685  float vz = iforcesum*sh_patch_pair.offset.z;
686  sh_iforcesum[threadIdx.x].x = vx;
687  sh_iforcesum[threadIdx.x].y = vy;
688  sh_iforcesum[threadIdx.x].z = vz;
689  }
690  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
691  // virials are in sh_virials[0...8] and slow virials in sh_virials[9...17]
692  float* sh_virials = (float *)sh_iforcesum;
693  int patch1_ind = sh_patch_pair.patch1_ind;
694  float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
695  atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
696  }
697 
698  // Make sure forces are up-to-date in device global memory
699  __threadfence();
700  BLOCK_SYNC;
701 
702  // Mark patch pair (patch1_ind, patch2_ind) as "done"
703  int patch1_ind = sh_patch_pair.patch1_ind;
704  int patch2_ind = sh_patch_pair.patch2_ind;
705  if (threadIdx.x == 0 && threadIdx.y == 0) {
706  sh_patch_pair.patch_done[0] = false;
707  sh_patch_pair.patch_done[1] = false;
708  //
709  // global_counters[0]: force_ready_queue
710  // global_counters[1]: block_order
711  // global_counters[2...npatches+1]: number of pairs finished for patch i+2
712  //
713  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
714  int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
715  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
716  if (patch1_ind != patch2_ind) {
717  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
718  int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
719  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
720  }
721  }
722  // sync threads so that patch1_done and patch2_done are visible to all threads
723  BLOCK_SYNC;
724 
725  if (sh_patch_pair.patch_done[0]) {
726 
727 // #ifndef REG_JFORCE
728 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
729 // #endif
730 #ifndef KEPLER_SHUFFLE
731  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
732  volatile float* sh_slow_buf = NULL;
733  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
734 #endif
735  NAME(finish_forces_virials)(sh_patch_pair.patch1_start, sh_patch_pair.patch1_size,
736  patch1_ind, atoms, sh_buf,
737 #ifndef KEPLER_SHUFFLE
738  sh_slow_buf, sh_vcc,
739 #endif
740  tmpforces, slow_tmpforces, forces, slow_forces,
741  tmpvirials, slow_tmpvirials, virials, slow_virials);
742 
743  }
744 
745  if (sh_patch_pair.patch_done[1]) {
746 // #ifndef REG_JFORCE
747 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
748 // #endif
749 #ifndef KEPLER_SHUFFLE
750  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
751  volatile float* sh_slow_buf = NULL;
752  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
753 #endif
754  NAME(finish_forces_virials)(sh_patch_pair.patch2_start, sh_patch_pair.patch2_size,
755  patch2_ind, atoms, sh_buf,
756 #ifndef KEPLER_SHUFFLE
757  sh_slow_buf, sh_vcc,
758 #endif
759  tmpforces, slow_tmpforces, forces, slow_forces,
760  tmpvirials, slow_tmpvirials, virials, slow_virials);
761  }
762 
763  if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
764  // Make sure page-locked host forces are up-to-date
765 #if __CUDA_ARCH__ < 200
766  __threadfence();
767 #else
768  __threadfence_system();
769 #endif
770  BLOCK_SYNC;
771  // Add patch into "force_ready_queue"
772  if (threadIdx.x == 0 && threadIdx.y == 0) {
773  if (sh_patch_pair.patch_done[0]) {
774  int ind = atomicInc(&global_counters[0], npatches-1);
775  force_ready_queue[ind] = patch1_ind;
776  }
777  if (sh_patch_pair.patch_done[1]) {
778  int ind = atomicInc(&global_counters[0], npatches-1);
779  force_ready_queue[ind] = patch2_ind;
780  }
781  // Make sure "force_ready_queue" is visible in page-locked host memory
782 #if __CUDA_ARCH__ < 200
783  __threadfence();
784 #else
785  __threadfence_system();
786 #endif
787  }
788  }
789 
790  if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
791  int old = atomicInc(&global_counters[1], total_block_count-1);
792  block_order[old] = block_begin + blockIdx.x;
793  }
794  }
795 
796 }
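
At the end of the kernel each patch pair counts itself finished with atomicInc(&global_counters[patch_ind+2], num_pairs-1): atomicInc returns the previous value and wraps the counter back to zero once it reaches its cap, so the block that observes old+1 == num_pairs is the last contributor; it marks the patch done and (via a second counter) pushes the patch index onto force_ready_queue. The sketch below isolates that last-contributor pattern; the kernel and variable names are illustrative, and the queue slot here is taken with atomicAdd instead of the wrapping atomicInc used above.

// Sketch of "last contributor marks completion" via atomicInc with wrap.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void contribute(unsigned int *counter, int *ready_queue,
                           unsigned int *queue_len, unsigned int num_parts,
                           int item) {
  // ... each block would do its share of the work for 'item' here ...
  if (threadIdx.x == 0) {
    // atomicInc returns the old value and wraps to 0 when old == num_parts-1,
    // so the counter is automatically reset for the next round.
    unsigned int old = atomicInc(counter, num_parts - 1);
    if (old + 1 == num_parts) {
      // This block finished the item last: publish it to the ready queue.
      unsigned int slot = atomicAdd(queue_len, 1u);
      ready_queue[slot] = item;
    }
  }
}

int main() {
  unsigned int *d_counter, *d_qlen, h_qlen = 0;
  int *d_queue, h_queue[1];
  cudaMalloc(&d_counter, sizeof(unsigned int));
  cudaMalloc(&d_qlen, sizeof(unsigned int));
  cudaMalloc(&d_queue, sizeof(int));
  cudaMemset(d_counter, 0, sizeof(unsigned int));
  cudaMemset(d_qlen, 0, sizeof(unsigned int));

  const unsigned int num_parts = 16;   // 16 blocks contribute to item 7
  contribute<<<num_parts, 32>>>(d_counter, d_queue, d_qlen, num_parts, 7);

  cudaMemcpy(&h_qlen, d_qlen, sizeof(unsigned int), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_queue, d_queue, sizeof(int), cudaMemcpyDeviceToHost);
  printf("queued %u item(s); first = %d (expect 1 and 7)\n", h_qlen, h_queue[0]);

  cudaFree(d_counter); cudaFree(d_qlen); cudaFree(d_queue);
  return 0;
}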