ComputeNonbondedCUDAKernelBase.h
1 #if defined(NAMD_HIP)
2 #include <hip/hip_runtime.h>
3 #endif
4 
5 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
6 
7 #define NAME(X) SLOWNAME( X )
8 
9 #undef SLOW
10 #undef SLOWNAME
11 #ifdef DO_SLOW
12 #define SLOW(X) X
13 #define SLOWNAME(X) ENERGYNAME( X ## _slow )
14 #else
15 #define SLOW(X)
16 #define SLOWNAME(X) ENERGYNAME( X )
17 #endif
18 
19 #undef ENERGY
20 #undef ENERGYNAME
21 #ifdef DO_ENERGY
22 #define ENERGY(X) X
23 #define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )
24 #else
25 #define ENERGY(X)
26 #define ENERGYNAME(X) PAIRLISTNAME( X )
27 #endif
28 
29 #undef GENPAIRLIST
30 #undef USEPAIRLIST
31 #undef PAIRLISTNAME
32 #ifdef MAKE_PAIRLIST
33 #define GENPAIRLIST(X) X
34 #define USEPAIRLIST(X)
35 #define PAIRLISTNAME(X) LAST( X ## _pairlist )
36 #else
37 #define GENPAIRLIST(X)
38 #define USEPAIRLIST(X) X
39 #define PAIRLISTNAME(X) LAST( X )
40 #endif
41 
42 #define LAST(X) X
43 
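// Illustrative note (editorial addition, not part of the original source): the macros
// above compose a distinct kernel name for every compile-time variant of this header.
// For example, with DO_SLOW, DO_ENERGY, and MAKE_PAIRLIST all defined,
//
//   NAME(dev_nonbonded)
//     -> SLOWNAME( dev_nonbonded )
//     -> ENERGYNAME( dev_nonbonded_slow )
//     -> PAIRLISTNAME( dev_nonbonded_slow_energy )
//     -> LAST( dev_nonbonded_slow_energy_pairlist )
//     -> dev_nonbonded_slow_energy_pairlist
//
// With none of the three flags defined, NAME(dev_nonbonded) is simply dev_nonbonded.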
44 #undef KEPLER_SHUFFLE
45 #ifdef __CUDA_ARCH__
46 #define KEPLER_SHUFFLE
47 #if __CUDA_ARCH__ < 300
48 #undef KEPLER_SHUFFLE
49 #endif
50 #endif
51 
52 #undef REG_JFORCE
53 #ifdef KEPLER_SHUFFLE
54 #ifndef MAKE_PAIRLIST
55 #define REG_JFORCE
56 #endif
57 #endif
58 
59 #ifdef KEPLER_SHUFFLE
60 __device__ __forceinline__ static void NAME(shfl_reduction)(
61  float *reg,
62  float *smem
63  )
64 {
65  for (int i=WARPSIZE/2;i >= 1;i/=2) {
66  *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, i, WARPSIZE);
67  }
68 
69  if ( threadIdx.x % WARPSIZE == 0 ) {
70  atomicAdd(smem,*reg);
71  }
72  return;
73 }
74 #endif /* KEPLER_SHUFFLE */
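// Editorial sketch (assumed standalone example, not from the original source) of the
// XOR-butterfly pattern used by NAME(shfl_reduction) above: with WARPSIZE = 32 the
// loop runs i = 16, 8, 4, 2, 1, and after the last step every lane holds the full
// warp sum, which is why only lane 0 needs to issue the atomicAdd.
//
//   __device__ __forceinline__ float warp_sum(float v) {
//     for (int i = WARPSIZE/2; i >= 1; i /= 2)
//       v += WARP_SHUFFLE_XOR(WARP_FULL_MASK, v, i, WARPSIZE);
//     return v;  // identical value in every lane of the warp
//   }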
75 
76 __device__ __forceinline__
77 static void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind,
78  const atom* atoms,
79  volatile float* sh_buf,
80 #ifndef KEPLER_SHUFFLE
81  volatile float* sh_slow_buf, volatile float* sh_vcc,
82 #endif
83  float4* tmpforces, float4* slow_tmpforces,
84  float4* forces, float4* slow_forces,
85  float* tmpvirials, float* slow_tmpvirials,
86  float* virials, float* slow_virials);
87 
88 //
89 // Reduces up to three variables into global memory locations dst[0], dst[1], dst[2]
90 //
91 template<typename T, int n, int sh_buf_size>
92 __device__ __forceinline__
93 void NAME(reduceVariables)(volatile T* sh_buf, T* dst, T val1, T val2, T val3) {
94  // Sanity check
95  cuda_static_assert(n > 0 && n <= NUM_WARP);
96 #ifdef KEPLER_SHUFFLE
97  // Requires NUM_WARP*n*sizeof(T) bytes of shared memory
98  cuda_static_assert(sh_buf_size >= NUM_WARP*n*sizeof(T));
99  // Reduce within warp
100  for (int i=WARPSIZE/2;i >= 1;i/=2) {
101  if (n >= 1) val1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val1, i, WARPSIZE);
102  if (n >= 2) val2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val2, i, WARPSIZE);
103  if (n >= 3) val3 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val3, i, WARPSIZE);
104  }
105  if (threadIdx.x == 0) {
106  if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
107  if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
108  if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
109  }
110  BLOCK_SYNC;
111  if (threadIdx.x < n && threadIdx.y == 0) {
112  T finalval = (T)0;
113 #pragma unroll
114  for (int i=0;i < NUM_WARP;++i) {
115  finalval += sh_buf[i*n + threadIdx.x];
116  }
117  atomicAdd(&dst[threadIdx.x], finalval);
118  }
119 #else // ! KEPLER_SHUFFLE
120  // Requires NUM_WARP*n*WARPSIZE*sizeof(float) shared memory
121  cuda_static_assert(sh_buf_size >= NUM_WARP*n*WARPSIZE*sizeof(T));
122  volatile T* sh_bufy = &sh_buf[threadIdx.y*n*WARPSIZE];
123  if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
124  if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
125  if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
126  // Reduce within warp
127  for (int d=1;d < WARPSIZE;d*=2) {
128  int pos = threadIdx.x + d;
129  T val1t, val2t, val3t;
130  if (n >= 1) val1t = (pos < WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
131  if (n >= 2) val2t = (pos < WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
132  if (n >= 3) val3t = (pos < WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
133  if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
134  if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
135  if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
136  }
137  BLOCK_SYNC;
138  if (threadIdx.x < n && threadIdx.y == 0) {
139  T finalval = (T)0;
140 #pragma unroll
141  for (int i=0;i < NUM_WARP;++i) {
142  finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
143  }
144  atomicAdd(&dst[threadIdx.x], finalval);
145  }
146 #endif // KEPLER_SHUFFLE
147 }
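// Editorial usage note (mirrors the call made further below in this file): when both
// DO_ENERGY and DO_SLOW are defined, the energy reduction expands to roughly
//
//   NAME(reduceVariables)<float, 3, SH_BUF_SIZE>(sh_buf,
//       &tmpvirials[sh_patch_pair.patch1_ind*16 + 9], totalev, totalee, totales);
//
// where the template parameter n selects how many of val1..val3 are actually reduced
// and atomically accumulated into dst[0..n-1].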
148 
149 //
150 // Called with 2d thread setting:
151 // x-threadblock = warpSize = 32
152 // y-threadblock = NUM_WARP = 4
153 //
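// Editorial launch sketch (assumed; the actual launch is issued by the host-side code
// that includes this header, not shown here). The kernel expects a 2D thread block of
// WARPSIZE x NUM_WARP threads and one thread block per patch pair in the launch:
//
//   dim3 nthread(WARPSIZE, NUM_WARP, 1);            // 32 x 4 = 128 threads per block
//   dim3 nblock(num_patch_pairs_in_launch, 1, 1);   // hypothetical count for this launch
//   NAME(dev_nonbonded)<<<nblock, nthread, 0, stream>>>(/* arguments listed below */);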
154 __global__ static void
156 USEPAIRLIST(__launch_bounds__(NUM_WARP*WARPSIZE, 12) )
157 NAME(dev_nonbonded)
158  (const patch_pair* patch_pairs,
159  const atom *atoms, const atom_param *atom_params,
160  const int* vdw_types, unsigned int* plist,
161  float4 *tmpforces, float4 *slow_tmpforces,
162  float4 *forces, float4 *slow_forces,
163  float* tmpvirials, float* slow_tmpvirials,
164  float* virials, float* slow_virials,
165  unsigned int* global_counters, int* force_ready_queue,
166  const unsigned int *overflow_exclusions,
167  const int npatches,
168  const int block_begin, const int total_block_count, int* block_order,
169  exclmask* exclmasks, const int lj_table_size,
170  const float3 lata, const float3 latb, const float3 latc,
171  const float cutoff2, const float plcutoff2, const int doSlow) {
172 
173  // Local structure definitions
174  GENPAIRLIST(struct vdw_index {
175  int vdw_type;
176  int index;
177  };)
178 
179  // Shared memory
180  __shared__ patch_pair sh_patch_pair;
181 #ifndef REG_JFORCE
182  __shared__ float3 sh_jforce_2d[NUM_WARP][WARPSIZE];
183  SLOW(__shared__ float3 sh_jforce_slow_2d[NUM_WARP][WARPSIZE];)
184 #endif
185 #ifndef KEPLER_SHUFFLE
186  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
187 #endif
188  __shared__ float3 sh_iforcesum[SLOW(NUM_WARP+) NUM_WARP];
189 
190  ENERGY(
191  float totalev = 0.f;
192  float totalee = 0.f;
193  SLOW( float totales = 0.f; )
194  )
195 
196  GENPAIRLIST(int nexcluded=0;);
197 
198  {
199 #ifndef KEPLER_SHUFFLE
200  GENPAIRLIST(__shared__ atom_param sh_jap_2d[NUM_WARP][WARPSIZE];)
201  USEPAIRLIST(__shared__ int sh_jap_vdw_type_2d[NUM_WARP][WARPSIZE];)
202 #endif
203  USEPAIRLIST(__shared__ int sh_plist_ind[NUM_WARP];
204  __shared__ unsigned int sh_plist_val[NUM_WARP];);
205 
206  // Load patch_pair data into shared memory
207  {
208  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
209 
210  if (t < 3*(SLOW(NUM_WARP+) NUM_WARP)) {
211  float *p = (float *)sh_iforcesum;
212  p[threadIdx.x] = 0.0f;
213  }
214 
215  if (t < PATCH_PAIR_SIZE) {
216  int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
217  int* dst = (int *)&sh_patch_pair;
218  dst[t] = src[t];
219  }
220  // Need to sync here to make sure sh_patch_pair is ready
221  BLOCK_SYNC;
222 
223  // Initialize the pairlist index to an impossible value
224  USEPAIRLIST(if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
225 
226  // Initialize pair list to "no interactions"
227  GENPAIRLIST({
228  if (t < sh_patch_pair.plist_size)
229  plist[sh_patch_pair.plist_start + t] = 0;
230  })
231 
232  // Convert the scaled (fractional) offset to Cartesian coordinates using the current lattice vectors and store it in shared memory
233  if (t == 0) {
234  float offx = sh_patch_pair.offset.x * lata.x
235  + sh_patch_pair.offset.y * latb.x
236  + sh_patch_pair.offset.z * latc.x;
237  float offy = sh_patch_pair.offset.x * lata.y
238  + sh_patch_pair.offset.y * latb.y
239  + sh_patch_pair.offset.z * latc.y;
240  float offz = sh_patch_pair.offset.x * lata.z
241  + sh_patch_pair.offset.y * latb.z
242  + sh_patch_pair.offset.z * latc.z;
243  sh_patch_pair.offset.x = offx;
244  sh_patch_pair.offset.y = offy;
245  sh_patch_pair.offset.z = offz;
246  }
247 
248  BLOCK_SYNC;
249  }
250 
251  // Compute pointers into shared memory here to avoid pointer arithmetic later on
252 #ifndef REG_JFORCE
253  volatile float3* sh_jforce = &sh_jforce_2d[threadIdx.y][0];
254  SLOW(volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
255 #endif
256 
257 #ifndef KEPLER_SHUFFLE
258  atom* sh_jpq = &sh_jpq_2d[threadIdx.y][0];
259  GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
260  USEPAIRLIST(int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
261 #endif
262 
263  for (int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*NUM_WARP) {
264 
265  atom ipq;
266  GENPAIRLIST(vdw_index iap;);
267  USEPAIRLIST(int iap_vdw_type;);
268  // Load i atom data
269  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
270  int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
271  float4 tmpa = ((float4*)atoms)[i];
272  ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
273  ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
274  ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
275  ipq.charge = tmpa.w;
276  GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
277  iap.vdw_type = tmpap.x*lj_table_size;
278  iap.index = tmpap.y;);
279  USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
280  }
281 
282  // i-forces in registers
283  float3 iforce;
284  iforce.x = 0.0f;
285  iforce.y = 0.0f;
286  iforce.z = 0.0f;
287  SLOW(float3 iforce_slow;
288  iforce_slow.x = 0.0f;
289  iforce_slow.y = 0.0f;
290  iforce_slow.z = 0.0f;)
291 
292  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
293  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
294  int blockj = (diag_patch_pair) ? blocki : 0;
295  for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
296 
297  USEPAIRLIST({
298  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
299  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
300  int plist_ind = pos/32;
301  unsigned int plist_bit = 1 << (pos % 32);
302  // Check if we need to load the next entry of the pairlist
303  if (plist_ind != sh_plist_ind[threadIdx.y]) {
304  sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
305  sh_plist_ind[threadIdx.y] = plist_ind;
306  }
307  if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
308  })
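// Editorial note on the pairlist layout: the tile pair (blocki, blockj) maps to the
// linear tile index pos = blockj/WARPSIZE + (blocki/WARPSIZE)*size2, and plist keeps
// one bit per tile (word pos/32, bit pos%32). A set bit means that, when the list was
// built, the 32x32 tile contained at least one non-excluded atom pair within plcutoff2
// and must be revisited; a clear bit lets the whole tile be skipped here.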
309 
310  // Load j atom data
311 #ifdef KEPLER_SHUFFLE
312  atom jpq;
313  GENPAIRLIST(atom_param jap;);
314  USEPAIRLIST(int jap_vdw_type;);
315 #endif
316 
317  GENPAIRLIST(
318  // Avoid calculating pairs of blocks in which all atoms of both blocks are fixed
319  if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
320  int nfreej = sh_patch_pair.patch2_free_size - blockj;
321  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
322  );
323 
324  //GENPAIRLIST(bool inside_plcutoff = false;)
325  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
326  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
327  float4 tmpa = ((float4*)atoms)[j];
328 #ifdef KEPLER_SHUFFLE
329  jpq.position.x = tmpa.x;
330  jpq.position.y = tmpa.y;
331  jpq.position.z = tmpa.z;
332  jpq.charge = tmpa.w;
333 #else
334  sh_jpq[threadIdx.x].position.x = tmpa.x;
335  sh_jpq[threadIdx.x].position.y = tmpa.y;
336  sh_jpq[threadIdx.x].position.z = tmpa.z;
337  sh_jpq[threadIdx.x].charge = tmpa.w;
338 #endif
339 
340 #ifdef KEPLER_SHUFFLE
341  GENPAIRLIST(jap = atom_params[j];)
342  USEPAIRLIST(jap_vdw_type = vdw_types[j];)
343 #else
344  GENPAIRLIST(sh_jap[threadIdx.x] = atom_params[j];)
345  USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
346 #endif
347  }
348 
349  // j-forces in shared memory
350 #ifdef REG_JFORCE
351  float3 jforce;
352  jforce.x = 0.0f;
353  jforce.y = 0.0f;
354  jforce.z = 0.0f;
355  SLOW(float3 jforce_slow;
356  jforce_slow.x = 0.0f;
357  jforce_slow.y = 0.0f;
358  jforce_slow.z = 0.0f;
359  );
360 #else
361  sh_jforce[threadIdx.x].x = 0.0f;
362  sh_jforce[threadIdx.x].y = 0.0f;
363  sh_jforce[threadIdx.x].z = 0.0f;
364  SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
365  sh_jforce_slow[threadIdx.x].y = 0.0f;
366  sh_jforce_slow[threadIdx.x].z = 0.0f;)
367 #endif
368 
369  GENPAIRLIST(unsigned int excl = 0;)
370  USEPAIRLIST(
371  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
372  const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
373  unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
374  );
375  GENPAIRLIST(
376  int nloopi = sh_patch_pair.patch1_size - blocki;
377  if (nloopi > WARPSIZE) nloopi = WARPSIZE;
378  // NOTE: nfreei must be clamped to a non-negative value because it is compared against threadIdx.x (unsigned int) below
379  int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
380  )
381  const bool diag_tile = diag_patch_pair && (blocki == blockj);
382  // Loop through tile diagonals. Local tile indices are:
383  // i = threadIdx.x % WARPSIZE = constant
384  // j = (t + threadIdx.x) % WARPSIZE
385  const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
386  int t = (diag_tile) ? 1 : 0;
387  if (diag_tile) {
388  USEPAIRLIST(excl >>= 1;);
389 #ifdef KEPLER_SHUFFLE
390  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
391  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
392  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
393  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
394  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
395  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
396  );
397 #endif
398  }
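// Editorial note: for a self-tile (diag_tile) the loop below starts at t = 1 and the
// j-atom data has already been rotated by one lane above, so the i == j
// self-interaction is never evaluated; modval = 2*WARPSIZE-1 keeps j = t + threadIdx.x
// from wrapping back onto lanes that were already paired, so each unordered atom pair
// in the tile is processed only once.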
399 
400  for (; t < WARPSIZE; ++t) {
401  USEPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl & 1)))
402  {
403  GENPAIRLIST(excl >>= 1;);
404  int j = (t + threadIdx.x) & modval;
405 #ifdef KEPLER_SHUFFLE
406  float tmpx = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.x, j, WARPSIZE) - ipq.position.x;
407  float tmpy = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.y, j, WARPSIZE) - ipq.position.y;
408  float tmpz = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.z, j, WARPSIZE) - ipq.position.z;
409  GENPAIRLIST(
410  int j_vdw_type = jap.vdw_type;
411  int j_index = jap.index;
412  int j_excl_maxdiff = jap.excl_maxdiff;
413  int j_excl_index = jap.excl_index;
414  );
415  float j_charge = jpq.charge;
416  USEPAIRLIST(
417  int j_vdw_type = jap_vdw_type;
418  );
419 #endif
420  GENPAIRLIST(if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
421  {
422 
423 #ifndef KEPLER_SHUFFLE
424  float tmpx = sh_jpq[j].position.x - ipq.position.x;
425  float tmpy = sh_jpq[j].position.y - ipq.position.y;
426  float tmpz = sh_jpq[j].position.z - ipq.position.z;
427  GENPAIRLIST(
428  int j_vdw_type = sh_jap[j].vdw_type;
429  int j_index = sh_jap[j].index;
430  int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
431  int j_excl_index = sh_jap[j].excl_index;
432  );
433  float j_charge = sh_jpq[j].charge;
434  USEPAIRLIST(
435  int j_vdw_type = sh_jap_vdw_type[j];
436  );
437 #endif
438  float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
439  GENPAIRLIST(if (r2 < plcutoff2))
440  USEPAIRLIST(if ((excl & 1) && r2 < cutoff2))
441  {
442  GENPAIRLIST(
443  bool excluded = false;
444  int indexdiff = (int)(iap.index) - j_index;
445  if ( abs(indexdiff) <= j_excl_maxdiff) {
446  indexdiff += j_excl_index;
447  int indexword = ((unsigned int) indexdiff) >> 5;
448  //indexword = tex1Dfetch(tex_exclusions, indexword);
449  if ( indexword < MAX_CONST_EXCLUSIONS )
450  indexword = const_exclusions[indexword];
451  else {
452  indexword = overflow_exclusions[indexword];
453  }
454  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
455  if (excluded) nexcluded++;
456  }
457  if (!excluded) excl |= 0x80000000;
458  )
459  GENPAIRLIST(if ( ! excluded && r2 < cutoff2))
460  {
461  ENERGY( float rsqrtfr2; );
462  float4 fi = tex1D(force_table, ENERGY(rsqrtfr2 =) rsqrtf(r2));
463  ENERGY( float4 ei = tex1D(energy_table, rsqrtfr2); );
464  GENPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap.vdw_type););
465  USEPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap_vdw_type););
466 
467  float f_slow = ipq.charge * j_charge;
468  float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
469  ENERGY(
470  float ev = ljab.x * ei.z + ljab.y * ei.y;
471  float ee = f_slow * ei.x;
472  SLOW( float es = f_slow * ei.w; )
473  )
474  SLOW( f_slow *= fi.w; )
475  ENERGY(
476  totalev += ev;
477  totalee += ee;
478  SLOW( totales += es; )
479  )
480  float fx = tmpx * f;
481  float fy = tmpy * f;
482  float fz = tmpz * f;
483  iforce.x += fx;
484  iforce.y += fy;
485  iforce.z += fz;
486 #ifdef REG_JFORCE
487  jforce.x -= fx;
488  jforce.y -= fy;
489  jforce.z -= fz;
490 #else
491  sh_jforce[j].x -= fx;
492  sh_jforce[j].y -= fy;
493  sh_jforce[j].z -= fz;
494 #endif
495  SLOW(
496  float fx_slow = tmpx * f_slow;
497  float fy_slow = tmpy * f_slow;
498  float fz_slow = tmpz * f_slow;
499  iforce_slow.x += fx_slow;
500  iforce_slow.y += fy_slow;
501  iforce_slow.z += fz_slow;
502  )
503 #ifdef REG_JFORCE
504  SLOW(
505  jforce_slow.x -= fx_slow;
506  jforce_slow.y -= fy_slow;
507  jforce_slow.z -= fz_slow;
508  )
509 #else
510  SLOW(
511  sh_jforce_slow[j].x -= fx_slow;
512  sh_jforce_slow[j].y -= fy_slow;
513  sh_jforce_slow[j].z -= fz_slow;
514  )
515 #endif
516  }
517  } // cutoff
518  } // if (j < nloopj...)
519 }
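// Editorial note: the force and energy tables above are indexed by rsqrtf(r2) = 1/r.
// In the interpolated value fi, the .x component multiplies the charge product, the
// .z and .y components multiply the Lennard-Jones coefficients ljab.x and ljab.y, and
// .w rescales f_slow into the slow (long-range correction) force; the energy table ei
// is laid out the same way, with .w supplying the slow energy term.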
520  USEPAIRLIST(excl >>= 1;);
521 #ifdef KEPLER_SHUFFLE
522  jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
523  USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
524  GENPAIRLIST(jap.vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
525  jap.index = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
526  jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
527  jap.excl_index = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
528  );
529 #ifdef REG_JFORCE
530  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
531  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
532  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
533  SLOW(
534  jforce_slow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
535  jforce_slow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
536  jforce_slow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
537  );
538 #endif
539 #endif
540  } // t
541 
542  // Write j-forces
543  GENPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl != 0))) {
544  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
545  int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
546 #ifdef REG_JFORCE
547  atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
548  atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
549  atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
550  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
551  atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
552  atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
553 #else
554  atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
555  atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
556  atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
557  SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
558  atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
559  atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
560 #endif
561  }
562 
563  GENPAIRLIST(
564  const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
565  int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
566  exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
567  if (threadIdx.x == 0) {
568  int plist_ind = pos/32;
569  unsigned int plist_bit = 1 << (pos % 32);
570  atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
571  }
572  );
573  }
574 
575  } // for (blockj)
576 
577  // Write i-forces
578  if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
579  int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
580  atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
581  atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
582  atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
583  SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
584  atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
585  atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
586  }
587  // Accumulate total forces for virial (warp synchronous)
588 #ifdef KEPLER_SHUFFLE
589  for (int i=WARPSIZE/2;i >= 1;i/=2) {
590  iforce.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.x, i, WARPSIZE);
591  iforce.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.y, i, WARPSIZE);
592  iforce.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.z, i, WARPSIZE);
593  SLOW(
594  iforce_slow.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.x, i, WARPSIZE);
595  iforce_slow.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.y, i, WARPSIZE);
596  iforce_slow.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.z, i, WARPSIZE);
597  );
598  }
599  if (threadIdx.x == 0) {
600  sh_iforcesum[threadIdx.y].x += iforce.x;
601  sh_iforcesum[threadIdx.y].y += iforce.y;
602  sh_iforcesum[threadIdx.y].z += iforce.z;
603  SLOW(
604  sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
605  sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
606  sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
607  );
608  }
609 #else
610  sh_jforce[threadIdx.x].x = iforce.x;
611  sh_jforce[threadIdx.x].y = iforce.y;
612  sh_jforce[threadIdx.x].z = iforce.z;
613  SLOW(
614  sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
615  sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
616  sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
617  );
618  for (int d=1;d < WARPSIZE;d*=2) {
619  int pos = threadIdx.x + d;
620  float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
621  float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
622  float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
623  SLOW(
624  float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
625  float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
626  float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
627  );
628  sh_jforce[threadIdx.x].x += valx;
629  sh_jforce[threadIdx.x].y += valy;
630  sh_jforce[threadIdx.x].z += valz;
631  SLOW(
632  sh_jforce_slow[threadIdx.x].x += slow_valx;
633  sh_jforce_slow[threadIdx.x].y += slow_valy;
634  sh_jforce_slow[threadIdx.x].z += slow_valz;
635  );
636  }
637  if (threadIdx.x == 0) {
638  sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
639  sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
640  sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
641  SLOW(
642  sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
643  sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
644  sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
645  );
646  }
647 #endif
648 
649  } // for (blocki)
650 
651  }
652 
653  {
654 
655 #ifdef REG_JFORCE
656 #undef SH_BUF_SIZE
657 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
658  __shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
659 #else // ! REG_JFORCE
660 #undef SH_BUF_SIZE
661 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
662  volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
663  // Sync here to make sure we can write into shared memory (sh_jforce_2d)
664  BLOCK_SYNC;
665 #endif
666 
667 #if ENERGY(1+)0
668  NAME(reduceVariables)<float, SLOW(1+)2, SH_BUF_SIZE>(sh_buf, &tmpvirials[sh_patch_pair.patch1_ind*16 + 9], totalev, totalee, SLOW(totales+)0.0f);
669 #endif
670 
671 #if GENPAIRLIST(1+)0
673  NAME(reduceVariables)<int, 1, SH_BUF_SIZE>((int *)sh_buf, (int *)&tmpvirials[sh_patch_pair.patch1_ind*16 + 12], nexcluded, 0, 0);
674 #endif
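// Editorial note on the per-patch scratch layout: tmpvirials and slow_tmpvirials
// reserve 16 floats per patch, with slots 0-8 holding the 3x3 virial tensor. In
// tmpvirials, slots 9-11 additionally hold the accumulated energies (vdW,
// electrostatic, and slow) and slot 12 the exclusion count gathered during pairlist
// generation (written through an int reinterpretation of the same buffer).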
675 
676  // Virials
677  BLOCK_SYNC;
678  if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
679  float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
680  float iforcesum = 0.0f;
681 #pragma unroll
682  for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
683  float vx = iforcesum*sh_patch_pair.offset.x;
684  float vy = iforcesum*sh_patch_pair.offset.y;
685  float vz = iforcesum*sh_patch_pair.offset.z;
686  sh_iforcesum[threadIdx.x].x = vx;
687  sh_iforcesum[threadIdx.x].y = vy;
688  sh_iforcesum[threadIdx.x].z = vz;
689  }
690  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
691  // virials are in sh_virials[0...8] and slow virials in sh_virials[9...17]
692  float* sh_virials = (float *)sh_iforcesum;
693  int patch1_ind = sh_patch_pair.patch1_ind;
694  float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
695  atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
696  }
697 
698  // Make sure forces are up-to-date in device global memory
699  __threadfence();
700  BLOCK_SYNC;
701 
702  // Mark patch pair (patch1_ind, patch2_ind) as "done"
703  int patch1_ind = sh_patch_pair.patch1_ind;
704  int patch2_ind = sh_patch_pair.patch2_ind;
705  if (threadIdx.x == 0 && threadIdx.y == 0) {
706  sh_patch_pair.patch_done[0] = false;
707  sh_patch_pair.patch_done[1] = false;
708  //
709  // global_counters[0]: force_ready_queue
710  // global_counters[1]: block_order
711  // global_counters[i+2] (i = 0 ... npatches-1): number of patch pairs finished for patch i
712  //
713  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
714  int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
715  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
716  if (patch1_ind != patch2_ind) {
717  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
718  int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
719  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
720  }
721  }
722  // Sync threads so that patch_done[0] and patch_done[1] are visible to all threads
723  BLOCK_SYNC;
724 
725  if (sh_patch_pair.patch_done[0]) {
726 
727 // #ifndef REG_JFORCE
728 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
729 // #endif
730 #ifndef KEPLER_SHUFFLE
731  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
732  volatile float* sh_slow_buf = NULL;
733  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
734 #endif
735  NAME(finish_forces_virials)(sh_patch_pair.patch1_start, sh_patch_pair.patch1_size,
736  patch1_ind, atoms, sh_buf,
737 #ifndef KEPLER_SHUFFLE
738  sh_slow_buf, sh_vcc,
739 #endif
740  tmpforces, slow_tmpforces, forces, slow_forces,
741  tmpvirials, slow_tmpvirials, virials, slow_virials);
742 
743  }
744 
745  if (sh_patch_pair.patch_done[1]) {
746 // #ifndef REG_JFORCE
747 // volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
748 // #endif
749 #ifndef KEPLER_SHUFFLE
750  volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
751  volatile float* sh_slow_buf = NULL;
752  SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
753 #endif
754  NAME(finish_forces_virials)(sh_patch_pair.patch2_start, sh_patch_pair.patch2_size,
755  patch2_ind, atoms, sh_buf,
756 #ifndef KEPLER_SHUFFLE
757  sh_slow_buf, sh_vcc,
758 #endif
759  tmpforces, slow_tmpforces, forces, slow_forces,
760  tmpvirials, slow_tmpvirials, virials, slow_virials);
761  }
762 
763  if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
764  // Make sure page-locked host forces are up-to-date
765 #if __CUDA_ARCH__ < 200
766  __threadfence();
767 #else
768  __threadfence_system();
769 #endif
770  BLOCK_SYNC;
771  // Add patch into "force_ready_queue"
772  if (threadIdx.x == 0 && threadIdx.y == 0) {
773  if (sh_patch_pair.patch_done[0]) {
774  int ind = atomicInc(&global_counters[0], npatches-1);
775  force_ready_queue[ind] = patch1_ind;
776  }
777  if (sh_patch_pair.patch_done[1]) {
778  int ind = atomicInc(&global_counters[0], npatches-1);
779  force_ready_queue[ind] = patch2_ind;
780  }
781  // Make sure "force_ready_queue" is visible in page-locked host memory
782 #if __CUDA_ARCH__ < 200
783  __threadfence();
784 #else
785  __threadfence_system();
786 #endif
787  }
788  }
789 
790  if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
791  int old = atomicInc(&global_counters[1], total_block_count-1);
792  block_order[old] = block_begin + blockIdx.x;
793  }
794  }
795 
796 }
797 
798 //
799 // Copy patch forces to their final resting place in page-locked host memory
800 // and reduce virials
801 //
802 __device__ __forceinline__
803 static void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind,
804  const atom* atoms,
805  volatile float* sh_buf,
806 #ifndef KEPLER_SHUFFLE
807  volatile float* sh_slow_buf, volatile float* sh_vcc,
808 #endif
809  float4* tmpforces, float4* slow_tmpforces,
810  float4* forces, float4* slow_forces,
811  float* tmpvirials, float* slow_tmpvirials,
812  float* virials, float* slow_virials) {
813  float vxx = 0.f;
814  float vxy = 0.f;
815  float vxz = 0.f;
816  float vyx = 0.f;
817  float vyy = 0.f;
818  float vyz = 0.f;
819  float vzx = 0.f;
820  float vzy = 0.f;
821  float vzz = 0.f;
822  SLOW(
823  float slow_vxx = 0.f;
824  float slow_vxy = 0.f;
825  float slow_vxz = 0.f;
826  float slow_vyx = 0.f;
827  float slow_vyy = 0.f;
828  float slow_vyz = 0.f;
829  float slow_vzx = 0.f;
830  float slow_vzy = 0.f;
831  float slow_vzz = 0.f;
832  )
833  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < size;i+=NUM_WARP*WARPSIZE) {
834  const int p = start+i;
835  float4 f = tmpforces[p];
836  forces[p] = f;
837  float4 pos = ((float4*)atoms)[p];
838  vxx += f.x * pos.x;
839  vxy += f.x * pos.y;
840  vxz += f.x * pos.z;
841  vyx += f.y * pos.x;
842  vyy += f.y * pos.y;
843  vyz += f.y * pos.z;
844  vzx += f.z * pos.x;
845  vzy += f.z * pos.y;
846  vzz += f.z * pos.z;
847  SLOW(
848  float4 slow_f = slow_tmpforces[p];
849  slow_forces[p] = slow_f;
850  slow_vxx += slow_f.x * pos.x;
851  slow_vxy += slow_f.x * pos.y;
852  slow_vxz += slow_f.x * pos.z;
853  slow_vyx += slow_f.y * pos.x;
854  slow_vyy += slow_f.y * pos.y;
855  slow_vyz += slow_f.y * pos.z;
856  slow_vzx += slow_f.z * pos.x;
857  slow_vzy += slow_f.z * pos.y;
858  slow_vzz += slow_f.z * pos.z;
859  )
860  }
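// Editorial note: the nine running sums above accumulate the per-patch virial tensor
// as the outer product of force and position, v_ab = sum_i f_a(i) * r_b(i), with a
// second set of nine sums for the slow (long-range) forces when DO_SLOW is defined.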
861 #ifdef KEPLER_SHUFFLE
862  // Reduce within warps
863  for (int i=WARPSIZE/2;i >= 1;i/=2) {
864  vxx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxx, i, WARPSIZE);
865  vxy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxy, i, WARPSIZE);
866  vxz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxz, i, WARPSIZE);
867  vyx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyx, i, WARPSIZE);
868  vyy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyy, i, WARPSIZE);
869  vyz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyz, i, WARPSIZE);
870  vzx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzx, i, WARPSIZE);
871  vzy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzy, i, WARPSIZE);
872  vzz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzz, i, WARPSIZE);
873  SLOW(
874  slow_vxx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxx, i, WARPSIZE);
875  slow_vxy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxy, i, WARPSIZE);
876  slow_vxz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxz, i, WARPSIZE);
877  slow_vyx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyx, i, WARPSIZE);
878  slow_vyy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyy, i, WARPSIZE);
879  slow_vyz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyz, i, WARPSIZE);
880  slow_vzx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzx, i, WARPSIZE);
881  slow_vzy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzy, i, WARPSIZE);
882  slow_vzz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzz, i, WARPSIZE);
883  )
884  }
885  // Reduce between warps
886  // Requires NUM_WARP*(SLOW(9)+9)*sizeof(float) bytes of shared memory
887  if (threadIdx.x == 0) {
888  sh_buf[threadIdx.y*(SLOW(9)+9) + 0] = vxx;
889  sh_buf[threadIdx.y*(SLOW(9)+9) + 1] = vxy;
890  sh_buf[threadIdx.y*(SLOW(9)+9) + 2] = vxz;
891  sh_buf[threadIdx.y*(SLOW(9)+9) + 3] = vyx;
892  sh_buf[threadIdx.y*(SLOW(9)+9) + 4] = vyy;
893  sh_buf[threadIdx.y*(SLOW(9)+9) + 5] = vyz;
894  sh_buf[threadIdx.y*(SLOW(9)+9) + 6] = vzx;
895  sh_buf[threadIdx.y*(SLOW(9)+9) + 7] = vzy;
896  sh_buf[threadIdx.y*(SLOW(9)+9) + 8] = vzz;
897  SLOW(
898  sh_buf[threadIdx.y*(SLOW(9)+9) + 9] = slow_vxx;
899  sh_buf[threadIdx.y*(SLOW(9)+9) + 10] = slow_vxy;
900  sh_buf[threadIdx.y*(SLOW(9)+9) + 11] = slow_vxz;
901  sh_buf[threadIdx.y*(SLOW(9)+9) + 12] = slow_vyx;
902  sh_buf[threadIdx.y*(SLOW(9)+9) + 13] = slow_vyy;
903  sh_buf[threadIdx.y*(SLOW(9)+9) + 14] = slow_vyz;
904  sh_buf[threadIdx.y*(SLOW(9)+9) + 15] = slow_vzx;
905  sh_buf[threadIdx.y*(SLOW(9)+9) + 16] = slow_vzy;
906  sh_buf[threadIdx.y*(SLOW(9)+9) + 17] = slow_vzz;
907  )
908  }
909  BLOCK_SYNC;
910  // Write final virials into global memory
911  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
912  float v = 0.0f;
913 #pragma unroll
914  for (int i=0;i < NUM_WARP;i++) v += sh_buf[i*(SLOW(9)+9) + threadIdx.x];
915  float* dst = (threadIdx.x < 9) ? virials : slow_virials;
916  const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
917  int pos = patch_ind*16 + (threadIdx.x % 9);
918  dst[pos] = v + src[pos];
919  }
920 #else // ! KEPLER_SHUFFLE
921  // We have a total of NUM_WARP*WARPSIZE*3 floats; reduce them in sets of three.
922  // (NOTE: more shared memory is available here, so this could be optimized further
923  // for pre-Kepler architectures.)
924  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
925  volatile float* sh_v1 = &sh_buf[0];
926  volatile float* sh_v2 = &sh_buf[NUM_WARP*WARPSIZE];
927  volatile float* sh_v3 = &sh_buf[2*NUM_WARP*WARPSIZE];
928  SLOW(
929  volatile float* sh_slow_v1 = &sh_slow_buf[0];
930  volatile float* sh_slow_v2 = &sh_slow_buf[NUM_WARP*WARPSIZE];
931  volatile float* sh_slow_v3 = &sh_slow_buf[2*NUM_WARP*WARPSIZE];
932  )
933 
934  // vxx, vxy, vxz
935  sh_v1[t] = vxx;
936  sh_v2[t] = vxy;
937  sh_v3[t] = vxz;
938  SLOW(
939  sh_slow_v1[t] = slow_vxx;
940  sh_slow_v2[t] = slow_vxy;
941  sh_slow_v3[t] = slow_vxz;
942  )
943  for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
944  int pos = t + d;
945  float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
946  float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
947  float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
948  SLOW(
949  float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
950  float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
951  float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
952  )
953  BLOCK_SYNC;
954  sh_v1[t] += v1;
955  sh_v2[t] += v2;
956  sh_v3[t] += v3;
957  SLOW(
958  sh_slow_v1[t] += slow_v1;
959  sh_slow_v2[t] += slow_v2;
960  sh_slow_v3[t] += slow_v3;
961  )
962  BLOCK_SYNC;
963  }
964  if (threadIdx.x == 0 && threadIdx.y == 0) {
965  sh_vcc[0] = sh_v1[0];
966  sh_vcc[1] = sh_v2[0];
967  sh_vcc[2] = sh_v3[0];
968  SLOW(
969  sh_vcc[9+0] = sh_slow_v1[0];
970  sh_vcc[9+1] = sh_slow_v2[0];
971  sh_vcc[9+2] = sh_slow_v3[0];
972  )
973  }
974  // vyx, vyy, vyz
975  sh_v1[t] = vyx;
976  sh_v2[t] = vyy;
977  sh_v3[t] = vyz;
978  SLOW(
979  sh_slow_v1[t] = slow_vyx;
980  sh_slow_v2[t] = slow_vyy;
981  sh_slow_v3[t] = slow_vyz;
982  )
983  for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
984  int pos = t + d;
985  float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
986  float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
987  float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
988  SLOW(
989  float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
990  float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
991  float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
992  )
993  BLOCK_SYNC;
994  sh_v1[t] += v1;
995  sh_v2[t] += v2;
996  sh_v3[t] += v3;
997  SLOW(
998  sh_slow_v1[t] += slow_v1;
999  sh_slow_v2[t] += slow_v2;
1000  sh_slow_v3[t] += slow_v3;
1001  )
1002  BLOCK_SYNC;
1003  }
1004  if (threadIdx.x == 0 && threadIdx.y == 0) {
1005  sh_vcc[3] = sh_v1[0];
1006  sh_vcc[4] = sh_v2[0];
1007  sh_vcc[5] = sh_v3[0];
1008  SLOW(
1009  sh_vcc[9+3] = sh_slow_v1[0];
1010  sh_vcc[9+4] = sh_slow_v2[0];
1011  sh_vcc[9+5] = sh_slow_v3[0];
1012  )
1013  }
1014  // vzx, vzy, vzz
1015  sh_v1[t] = vzx;
1016  sh_v2[t] = vzy;
1017  sh_v3[t] = vzz;
1018  SLOW(
1019  sh_slow_v1[t] = slow_vzx;
1020  sh_slow_v2[t] = slow_vzy;
1021  sh_slow_v3[t] = slow_vzz;
1022  )
1023  for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
1024  int pos = t + d;
1025  float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
1026  float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
1027  float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
1028  SLOW(
1029  float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
1030  float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
1031  float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
1032  )
1033  BLOCK_SYNC;
1034  sh_v1[t] += v1;
1035  sh_v2[t] += v2;
1036  sh_v3[t] += v3;
1037  SLOW(
1038  sh_slow_v1[t] += slow_v1;
1039  sh_slow_v2[t] += slow_v2;
1040  sh_slow_v3[t] += slow_v3;
1041  )
1042  BLOCK_SYNC;
1043  }
1044  if (threadIdx.x == 0 && threadIdx.y == 0) {
1045  sh_vcc[6] = sh_v1[0];
1046  sh_vcc[7] = sh_v2[0];
1047  sh_vcc[8] = sh_v3[0];
1048  SLOW(
1049  sh_vcc[9+6] = sh_slow_v1[0];
1050  sh_vcc[9+7] = sh_slow_v2[0];
1051  sh_vcc[9+8] = sh_slow_v3[0];
1052  )
1053  }
1054  // Write final virials and energies into global memory
1055  if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
1056  float* dst = (threadIdx.x < 9) ? virials : slow_virials;
1057  const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
1058  int pos = patch_ind*16 + (threadIdx.x % 9);
1059  dst[pos] = sh_vcc[threadIdx.x] + src[pos];
1060  }
1061 #endif // KEPLER_SHUFFLE
1062  ENERGY(
1063  // Write final energies into global memory
1064  if (threadIdx.x < 3 && threadIdx.y == 0) {
1065  int pos = patch_ind*16 + 9 + threadIdx.x;
1066  virials[pos] = tmpvirials[pos];
1067  }
1068  );
1069  GENPAIRLIST(
1070  if (threadIdx.x == 0 && threadIdx.y == 0) {
1071  int pos = patch_ind*16 + 12;
1072  virials[pos] = tmpvirials[pos];
1073  }
1074  );
1075 }
1076 
1077 #endif // NAMD_CUDA || NAMD_HIP
1078 
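// Editorial sketch (assumed; the actual instantiation lives in the .cu file that
// includes this header and is not shown here) of how the kernel variants are
// generated by re-including this header under different flag combinations:
//
//   #define MAKE_PAIRLIST
//   #define DO_ENERGY
//   #define DO_SLOW
//   #include "ComputeNonbondedCUDAKernelBase.h"  // defines dev_nonbonded_slow_energy_pairlist
//   #undef DO_SLOW
//   #include "ComputeNonbondedCUDAKernelBase.h"  // defines dev_nonbonded_energy_pairlist
//   #undef DO_ENERGY
//   // ...and so on for the remaining DO_SLOW / DO_ENERGY / MAKE_PAIRLIST combinations.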