// NAMD
// ComputeNonbondedMICKernelBase.h

#ifdef NAMD_MIC


#ifdef ARCH_POWERPC
  #include <builtins.h>
#endif

#if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)
  #include <emmintrin.h>  // We're using SSE2 intrinsics
  #if defined(__INTEL_COMPILER)
    #define __align(X) __declspec(align(X) )
  #elif defined(__GNUC__) || defined(__PGI)
    #define __align(X) __attribute__((aligned(X) ))
  #else
    #define __align(X) __declspec(align(X) )
  #endif
#endif

// DMK - NOTE : Removing the includes for now, since we are using the "raw data"
//   from the host data structures (i.e. packing them up at startup and passing
//   that raw data to the card).  As data structures are included on the device
//   as well (if that happens), then some of these could be included.
//#ifdef DEFINITION // (
//  #include "LJTable.h"
//  #include "Molecule.h"
//  #include "ComputeNonbondedUtil.h"
//#endif // )
//#include "Parameters.h"
//#if NAMD_ComputeNonbonded_SortAtoms != 0
//  #include "PatchMap.h"
//#endif

// determining class name
#undef NAME
#undef CLASS
#undef CLASSNAME
#define NAME CLASSNAME(calc)

#undef PAIR
#if NBTYPE == NBPAIR
  #define PAIR(X) X
  #define CLASS ComputeNonbondedPair
  #define CLASSNAME(X) ENERGYNAME( X ## _pair )
#else
  #define PAIR(X)
#endif

#undef SELF
#if NBTYPE == NBSELF
  #define SELF(X) X
  #define CLASS ComputeNonbondedSelf
  #define CLASSNAME(X) ENERGYNAME( X ## _self )
#else
  #define SELF(X)
#endif

#undef ENERGYNAME
#undef ENERGY
#undef NOENERGY
#ifdef CALCENERGY
  #define ENERGY(X) X
  #define NOENERGY(X)
  #define ENERGYNAME(X) SLOWONLYNAME( X ## _energy )
#else
  #define ENERGY(X)
  #define NOENERGY(X) X
  #define ENERGYNAME(X) SLOWONLYNAME( X )
#endif

#undef SLOWONLYNAME
#undef FAST
#ifdef SLOWONLY
  #define FAST(X)
  #define SLOWONLYNAME(X) MERGEELECTNAME( X ## _slow )
#else
  #define FAST(X) X
  #define SLOWONLYNAME(X) MERGEELECTNAME( X )
#endif

#undef MERGEELECTNAME
#undef SHORT
#undef NOSHORT
#ifdef MERGEELECT
  #define SHORT(X)
  #define NOSHORT(X) X
  #define MERGEELECTNAME(X) FULLELECTNAME( X ## _merge )
#else
  #define SHORT(X) X
  #define NOSHORT(X)
  #define MERGEELECTNAME(X) FULLELECTNAME( X )
#endif

#undef FULLELECTNAME
#undef FULL
#undef NOFULL
#ifdef FULLELECT
  #define FULLELECTNAME(X) TABENERGYNAME( X ## _fullelect )
  #define FULL(X) X
  #define NOFULL(X)
#else
  #define FULLELECTNAME(X) TABENERGYNAME( X )
  #define FULL(X)
  #define NOFULL(X) X
#endif

#undef TABENERGYNAME
#undef TABENERGY
#undef NOTABENERGY
#ifdef TABENERGYFLAG
  #define TABENERGYNAME(X) FEPNAME( X ## _tabener )
  #define TABENERGY(X) X
  #define NOTABENERGY(X)
#else
  #define TABENERGYNAME(X) FEPNAME( X )
  #define TABENERGY(X)
  #define NOTABENERGY(X) X
#endif

// Here are what these macros stand for:
//   FEP/NOT_FEP: FEP free energy perturbation is active/inactive
//     (does NOT use LAM)
//   LES: locally-enhanced sampling is active
//   LAM: scale the potential by a factor lambda (except FEP)
//   INT: measure interaction energies
//   PPROF: pressure profiling

#undef FEPNAME
#undef FEP
#undef LES
#undef INT
#undef PPROF
#undef LAM
#undef CUDA
#undef MIC
#undef ALCH
#undef TI
#undef GO
#define FEPNAME(X) LAST( X )
#define FEP(X)
#define ALCHPAIR(X)
#define NOT_ALCHPAIR(X) X
#define LES(X)
#define INT(X)
#define PPROF(X)
#define LAM(X)
#define CUDA(X)
#define MIC(X)
#define ALCH(X)
#define TI(X)
#define GO(X)
#ifdef FEPFLAG
  #undef FEPNAME
  #undef FEP
  #undef ALCH
  #define FEPNAME(X) LAST( X ## _fep )
  #define FEP(X) X
  #define ALCH(X) X
#endif
#ifdef TIFLAG
  #undef FEPNAME
  #undef TI
  #undef ALCH
  #define FEPNAME(X) LAST( X ## _ti )
  #define TI(X) X
  #define ALCH(X) X
#endif
#ifdef LESFLAG
  #undef FEPNAME
  #undef LES
  #undef LAM
  #define FEPNAME(X) LAST( X ## _les )
  #define LES(X) X
  #define LAM(X) X
#endif
#ifdef INTFLAG
  #undef FEPNAME
  #undef INT
  #define FEPNAME(X) LAST( X ## _int )
  #define INT(X) X
#endif
#ifdef PPROFFLAG
  #undef FEPNAME
  #undef INT
  #undef PPROF
  #define FEPNAME(X) LAST( X ## _pprof )
  #define INT(X) X
  #define PPROF(X) X
#endif
#ifdef GOFORCES
  #undef FEPNAME
  #undef GO
  #define FEPNAME(X) LAST( X ## _go )
  #define GO(X) X
#endif
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  #undef CUDA
  #define CUDA(X) X
#endif
#ifdef NAMD_MIC
  #undef MIC
  #define MIC(X) X
#endif

#define LAST(X) X
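
// Example of the name mangling above: with NBTYPE == NBPAIR and with CALCENERGY
//   and FULLELECT defined (all other flags off), NAME expands step by step as
//     CLASSNAME(calc) -> ENERGYNAME(calc_pair) -> SLOWONLYNAME(calc_pair_energy)
//       -> MERGEELECTNAME(calc_pair_energy) -> FULLELECTNAME(calc_pair_energy)
//       -> TABENERGYNAME(calc_pair_energy_fullelect) -> FEPNAME(...) -> LAST(...)
//   yielding calc_pair_energy_fullelect, so each flag combination produces a
//   distinctly named compute function from this one source file.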

// see if things are really messed up
SELF( PAIR( foo bar ) )
LES( FEP( foo bar ) )
LES( INT( foo bar ) )
FEP( INT( foo bar ) )
LAM( INT( foo bar ) )
FEP( NOENERGY( foo bar ) )
ENERGY( NOENERGY( foo bar ) )
TABENERGY( NOTABENERGY( foo bar ) )

#define ALLOCA_ALIGNED(p, s, a) { \
  char *ptr = alloca((s) + (a)); \
  int64 a1 = (a) - 1; \
  int64 ptr64 = ((int64)ptr) + a1; \
  (p) = (void*)(ptr64 - (ptr64 & a1)); \
}
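
// A worked example of the alignment arithmetic above: with a = 64, a1 = 63,
//   (ptr64 - (ptr64 & a1)) rounds (ptr + 63) down to a multiple of 64, which is
//   the same as rounding ptr itself up to the next 64-byte boundary.  The extra
//   (a) bytes requested from alloca() guarantee that the rounded pointer still
//   has (s) usable bytes behind it.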

#define __MIC_PAD_PLGEN_CTRL (MIC_PAD_PLGEN)



// Declare the compute function

__attribute__((target(mic))) void NAME (mic_params &params) {

  #if MIC_HANDCODE_FORCE_SINGLE != 0
    #define CALC_TYPE float
  #else
    #define CALC_TYPE double
  #endif

  // Setup nonbonded table pointers (using the main table pointer, create pointers
  //   for each of the non-overlapping sub-tables)
  const int table_four_n_16 = params.table_four_n_16;
  __ASSUME(table_four_n_16 % 16 == 0);
  __ASSERT(params.table_four_base_ptr != NULL);
  const CALC_TYPE * RESTRICT const table_noshort = (CALC_TYPE*)(params.table_four_base_ptr);                        __ASSUME_ALIGNED(table_noshort);
  const CALC_TYPE * RESTRICT const table_short   = (CALC_TYPE*)(params.table_four_base_ptr) + 16 * table_four_n_16; __ASSUME_ALIGNED(table_short);
  const CALC_TYPE * RESTRICT const slow_table    = (CALC_TYPE*)(params.table_four_base_ptr) + 32 * table_four_n_16; __ASSUME_ALIGNED(slow_table);
  //const CALC_TYPE * RESTRICT const fast_table = (CALC_TYPE*)(params.table_four_base_ptr) + 36 * table_four_n_16; __ASSUME_ALIGNED(fast_table);
  //const CALC_TYPE * RESTRICT const scor_table = (CALC_TYPE*)(params.table_four_base_ptr) + 40 * table_four_n_16; __ASSUME_ALIGNED(scor_table);
  //const CALC_TYPE * RESTRICT const corr_table = (CALC_TYPE*)(params.table_four_base_ptr) + 44 * table_four_n_16; __ASSUME_ALIGNED(corr_table);
  //const CALC_TYPE * RESTRICT const full_table = (CALC_TYPE*)(params.table_four_base_ptr) + 48 * table_four_n_16; __ASSUME_ALIGNED(full_table);
  //const CALC_TYPE * RESTRICT const vdwa_table = (CALC_TYPE*)(params.table_four_base_ptr) + 52 * table_four_n_16; __ASSUME_ALIGNED(vdwa_table);
  //const CALC_TYPE * RESTRICT const vdwb_table = (CALC_TYPE*)(params.table_four_base_ptr) + 56 * table_four_n_16; __ASSUME_ALIGNED(vdwb_table);
  //const CALC_TYPE * RESTRICT const r2_table   = (CALC_TYPE*)(params.table_four_base_ptr) + 60 * table_four_n_16; __ASSUME_ALIGNED(r2_table);
  // DMK - NOTE : Other table pointers will be useful as this code grows in scope, so including them all now
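  // NOTE (layout assumption, inferred from the offsets above): all sub-tables
  //   share the one packed allocation, with each multiplier (16, 32, 36, 40, ...)
  //   giving a sub-table's starting offset in units of table_four_n_16 CALC_TYPE
  //   elements; the two 16-value-per-entry interleaved tables apparently come
  //   first, followed by the 4-value-per-entry tables.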

  // Setup LJ table pointers
  const CALC_TYPE * RESTRICT const lj_table_base_ptr = (CALC_TYPE*)(params.lj_table_base_ptr);
  __ASSERT(lj_table_base_ptr != NULL);
  const int lj_table_dim = params.lj_table_dim;
  __ASSUME_ALIGNED(lj_table_base_ptr);

  // Constants
  const CALC_TYPE r2_delta = params.constants->r2_delta;
  const int r2_delta_exp = params.constants->r2_delta_exp;
  const int r2_delta_expc = params.constants->r2_delta_expc;
  const CALC_TYPE scaling = params.constants->scaling;
  const CALC_TYPE modf_mod = params.constants->modf_mod;

  // Cutoff values
  const CALC_TYPE plcutoff = params.pp->plcutoff;
  const CALC_TYPE plcutoff2 = plcutoff * plcutoff;
  const CALC_TYPE plcutoff2_delta = plcutoff2 + r2_delta;
  const CALC_TYPE cutoff2 = params.constants->cutoff2;
  const CALC_TYPE cutoff2_delta = cutoff2 + r2_delta;

  // Number of atoms in each list of atoms
  const int i_upper = params.numAtoms[0];
  const int j_upper = params.numAtoms[1];
  const int i_upper_16 = params.numAtoms_16[0];
  const int j_upper_16 = params.numAtoms_16[1];
  __ASSERT(i_upper >= 0);  __ASSERT(j_upper >= 0);
  __ASSERT(i_upper_16 >= 0);  __ASSERT(j_upper_16 >= 0);
  __ASSUME(i_upper_16 % 16 == 0);
  __ASSUME(j_upper_16 % 16 == 0);
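  // NOTE (assumption): the *_16 counts are presumably the atom list lengths
  //   padded up to a multiple of 16 (one full 512-bit vector of lanes), which
  //   is what makes the aligned vector loads and the *_16-sized memsets below
  //   safe.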

  // Setup pointers to atom input arrays
  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
    atom * RESTRICT p_0 = params.p[0];  __ASSERT(p_0 != NULL);  __ASSUME_ALIGNED(p_0);
    atom_param * RESTRICT pExt_0 = params.pExt[0];  __ASSERT(pExt_0 != NULL);  __ASSUME_ALIGNED(pExt_0);
    #if (0 SELF(+1))
      #define p_1 p_0
      #define pExt_1 pExt_0
    #else
      atom * RESTRICT p_1 = params.p[1];  __ASSERT(p_1 != NULL);  __ASSUME_ALIGNED(p_1);
      atom_param * RESTRICT pExt_1 = params.pExt[1];  __ASSERT(pExt_1 != NULL);  __ASSUME_ALIGNED(pExt_1);
    #endif
  #else
    mic_position_t * RESTRICT p_0_x = params.p_x[0];
    mic_position_t * RESTRICT p_0_y = params.p_y[0];
    mic_position_t * RESTRICT p_0_z = params.p_z[0];
    mic_position_t * RESTRICT p_0_q = params.p_q[0];
    int * RESTRICT pExt_0_vdwType = params.pExt_vdwType[0];
    int * RESTRICT pExt_0_index = params.pExt_index[0];
    int * RESTRICT pExt_0_exclIndex = params.pExt_exclIndex[0];
    int * RESTRICT pExt_0_exclMaxDiff = params.pExt_exclMaxDiff[0];
    #if (0 SELF(+1))
      #define p_1_x p_0_x
      #define p_1_y p_0_y
      #define p_1_z p_0_z
      #define p_1_q p_0_q
      #define pExt_1_vdwType pExt_0_vdwType
      #define pExt_1_index pExt_0_index
      #define pExt_1_exclIndex pExt_0_exclIndex
      #define pExt_1_exclMaxDiff pExt_0_exclMaxDiff
    #else
      mic_position_t * RESTRICT p_1_x = params.p_x[1];
      mic_position_t * RESTRICT p_1_y = params.p_y[1];
      mic_position_t * RESTRICT p_1_z = params.p_z[1];
      mic_position_t * RESTRICT p_1_q = params.p_q[1];
      int * RESTRICT pExt_1_vdwType = params.pExt_vdwType[1];
      int * RESTRICT pExt_1_index = params.pExt_index[1];
      int * RESTRICT pExt_1_exclIndex = params.pExt_exclIndex[1];
      int * RESTRICT pExt_1_exclMaxDiff = params.pExt_exclMaxDiff[1];
    #endif
    __ASSERT(p_0_x != NULL);  __ASSERT(p_0_y != NULL);  __ASSERT(p_0_z != NULL);  __ASSERT(p_0_q != NULL);
    __ASSERT(p_1_x != NULL);  __ASSERT(p_1_y != NULL);  __ASSERT(p_1_z != NULL);  __ASSERT(p_1_q != NULL);
    __ASSERT(pExt_0_vdwType != NULL);  __ASSERT(pExt_0_index != NULL);
    __ASSERT(pExt_1_vdwType != NULL);  __ASSERT(pExt_1_index != NULL);
    __ASSERT(pExt_0_exclIndex != NULL);  __ASSERT(pExt_0_exclMaxDiff != NULL);
    __ASSERT(pExt_1_exclIndex != NULL);  __ASSERT(pExt_1_exclMaxDiff != NULL);
    __ASSUME_ALIGNED(p_0_x);  __ASSUME_ALIGNED(p_1_x);
    __ASSUME_ALIGNED(p_0_y);  __ASSUME_ALIGNED(p_1_y);
    __ASSUME_ALIGNED(p_0_z);  __ASSUME_ALIGNED(p_1_z);
    __ASSUME_ALIGNED(p_0_q);  __ASSUME_ALIGNED(p_1_q);
    __ASSUME_ALIGNED(pExt_0_vdwType);  __ASSUME_ALIGNED(pExt_0_index);
    __ASSUME_ALIGNED(pExt_1_vdwType);  __ASSUME_ALIGNED(pExt_1_index);
    __ASSUME_ALIGNED(pExt_0_exclIndex);  __ASSUME_ALIGNED(pExt_0_exclMaxDiff);
    __ASSUME_ALIGNED(pExt_1_exclIndex);  __ASSUME_ALIGNED(pExt_1_exclMaxDiff);
  #endif

  // Setup pointers to force output arrays and clear those arrays (init to zero)
  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
    double4 * RESTRICT f_0 = params.ff[0];
    #if (0 SELF(+1))
      #define f_1 f_0
    #else
      double4 * RESTRICT f_1 = params.ff[1];
    #endif
    __ASSERT(f_0 != NULL);  __ASSUME_ALIGNED(f_0);
    __ASSERT(f_1 != NULL);  __ASSUME_ALIGNED(f_1);
    memset(f_0, 0, sizeof(double4) * i_upper_16);
    PAIR( memset(f_1, 0, sizeof(double4) * j_upper_16); )
    #if (0 FULL(+1))
      double4 * RESTRICT fullf_0 = params.fullf[0];
      #if (0 SELF(+1))
        #define fullf_1 fullf_0
      #else
        double4 * RESTRICT fullf_1 = params.fullf[1];
      #endif
      __ASSERT(fullf_0 != NULL);  __ASSUME_ALIGNED(fullf_0);
      __ASSERT(fullf_1 != NULL);  __ASSUME_ALIGNED(fullf_1);
      memset(fullf_0, 0, sizeof(double4) * i_upper_16);
      PAIR( memset(fullf_1, 0, sizeof(double4) * j_upper_16); )
    #endif
  #else
    double * RESTRICT f_0_x = params.ff_x[0];
    double * RESTRICT f_0_y = params.ff_y[0];
    double * RESTRICT f_0_z = params.ff_z[0];
    double * RESTRICT f_0_w = params.ff_w[0];
    #if (0 SELF(+1))
      #define f_1_x f_0_x
      #define f_1_y f_0_y
      #define f_1_z f_0_z
      #define f_1_w f_0_w
    #else
      double * RESTRICT f_1_x = params.ff_x[1];
      double * RESTRICT f_1_y = params.ff_y[1];
      double * RESTRICT f_1_z = params.ff_z[1];
      double * RESTRICT f_1_w = params.ff_w[1];
    #endif
    __ASSERT(f_0_x != NULL);  __ASSERT(f_0_y != NULL);  __ASSERT(f_0_z != NULL);  __ASSERT(f_0_w != NULL);
    __ASSERT(f_1_x != NULL);  __ASSERT(f_1_y != NULL);  __ASSERT(f_1_z != NULL);  __ASSERT(f_1_w != NULL);
    __ASSUME_ALIGNED(f_0_x);  __ASSUME_ALIGNED(f_1_x);
    __ASSUME_ALIGNED(f_0_y);  __ASSUME_ALIGNED(f_1_y);
    __ASSUME_ALIGNED(f_0_z);  __ASSUME_ALIGNED(f_1_z);
    __ASSUME_ALIGNED(f_0_w);  __ASSUME_ALIGNED(f_1_w);
    memset(f_0_x, 0, 4 * sizeof(double) * i_upper_16);
    PAIR( memset(f_1_x, 0, 4 * sizeof(double) * j_upper_16); )
    #if (0 FULL(+1))
      double * RESTRICT fullf_0_x = params.fullf_x[0];
      double * RESTRICT fullf_0_y = params.fullf_y[0];
      double * RESTRICT fullf_0_z = params.fullf_z[0];
      double * RESTRICT fullf_0_w = params.fullf_w[0];
      #if (0 SELF(+1))
        #define fullf_1_x fullf_0_x
        #define fullf_1_y fullf_0_y
        #define fullf_1_z fullf_0_z
        #define fullf_1_w fullf_0_w
      #else
        double * RESTRICT fullf_1_x = params.fullf_x[1];
        double * RESTRICT fullf_1_y = params.fullf_y[1];
        double * RESTRICT fullf_1_z = params.fullf_z[1];
        double * RESTRICT fullf_1_w = params.fullf_w[1];
      #endif
      __ASSERT(fullf_0_x != NULL);  __ASSERT(fullf_0_y != NULL);  __ASSERT(fullf_0_z != NULL);  __ASSERT(fullf_0_w != NULL);
      __ASSERT(fullf_1_x != NULL);  __ASSERT(fullf_1_y != NULL);  __ASSERT(fullf_1_z != NULL);  __ASSERT(fullf_1_w != NULL);
      __ASSUME_ALIGNED(fullf_0_x);  __ASSUME_ALIGNED(fullf_1_x);
      __ASSUME_ALIGNED(fullf_0_y);  __ASSUME_ALIGNED(fullf_1_y);
      __ASSUME_ALIGNED(fullf_0_z);  __ASSUME_ALIGNED(fullf_1_z);
      __ASSUME_ALIGNED(fullf_0_w);  __ASSUME_ALIGNED(fullf_1_w);
      memset(fullf_0_x, 0, 4 * sizeof(double) * i_upper_16);
      PAIR( memset(fullf_1_x, 0, 4 * sizeof(double) * j_upper_16); )
    #endif
  #endif

  // If the commOnly flag has been set, return now (after having zero'd the output forces)
  if (params.constants->commOnly) { return; }

  // If either atom list is size zero, return now (after having zero'd the output forces)
  if (i_upper <= 0 || j_upper <= 0) { return; }

  CALC_TYPE offset_x = (CALC_TYPE)(params.offset.x);
  CALC_TYPE offset_y = (CALC_TYPE)(params.offset.y);
  CALC_TYPE offset_z = (CALC_TYPE)(params.offset.z);


  // Grab the pairlist pointers for the various pairlists that will be used
  int * RESTRICT pairlist_norm = params.pairlists_ptr[PL_NORM_INDEX];
  int * RESTRICT pairlist_mod  = params.pairlists_ptr[PL_MOD_INDEX];
  int * RESTRICT pairlist_excl = params.pairlists_ptr[PL_EXCL_INDEX];
  #if MIC_CONDITION_NORMAL != 0
    int * RESTRICT pairlist_normhi = params.pairlists_ptr[PL_NORMHI_INDEX];
  #endif

  // NOTE : The first 2 integers are used for sizing information.  The actual
  //   pairlist entries (i.e. what we want to be 64 byte aligned) start at
  //   element 2 (thus the +/- 14 shifts in the macros below).
  #define PAIRLIST_ALLOC_CHECK(pl, init_size) { \
    if ((pl) == NULL) { \
      const int plInitSize = (init_size) + (16 * (MIC_HANDCODE_FORCE_PFDIST + 1)); \
      if (device__timestep == 0) { /* NOTE: Avoid all the prints on timestep 0 */ \
        (pl) = (int*)(_mm_malloc(plInitSize * sizeof(int) + 64, 64)); \
      } else { \
        (pl) = (int*)_MM_MALLOC_WRAPPER(plInitSize * sizeof(int) + 64, 64, "pairlist_alloc_check"); \
      } \
      __ASSERT((pl) != NULL); \
      (pl) += 14; \
      (pl)[0] = plInitSize; \
      (pl)[1] = 2; \
    } \
  }

  #define PAIRLIST_GROW_CHECK(pl, offset, mul) { \
    if ((offset) + j_upper + 8 + MIC_PREFETCH_DISTANCE + (16 * (MIC_HANDCODE_FORCE_PFDIST + 1)) > (pl)[0]) { \
      int newPlAllocSize = (int)(((pl)[0]) * (mul) + j_upper + 64); /* NOTE: add at least j_upper (max that can be added before next check) in case pl[0] is small to begin with */ \
      newPlAllocSize = (~2047) & (newPlAllocSize + 2047); /* NOTE: round up to a multiple of 2048 */ \
      int* RESTRICT newPl = (int*)_MM_MALLOC_WRAPPER(newPlAllocSize * sizeof(int), 64, "pairlist_grow_check"); \
      __ASSERT((newPl) != NULL); \
      newPl += 14; \
      memcpy(newPl, (pl), (offset) * sizeof(int)); \
      _MM_FREE_WRAPPER((pl) - 14); \
      (pl) = newPl; \
      (pl)[0] = newPlAllocSize; \
    } \
  }
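
  // Layout note for the two macros above: _mm_malloc returns a 64-byte aligned
  //   buffer, and advancing the returned pointer by 14 ints (56 bytes) places
  //   the two header ints (alloc size at [0], list length at [1]) in bytes
  //   56..63, so the actual pairlist entries starting at element 2 land exactly
  //   on the next 64-byte boundary.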

  DEVICE_FPRINTF("P");

  // Regenerate the pairlists, if need be
  if ((!(params.usePairlists)) || (params.savePairlists)) {

    // For the sake of getting a good estimate of the virtual memory required for
    //   pairlists, do an actual count of the interactions within this compute (part).
    //   NOTE: This will only occur during timestep 0.  Later allocations grow the
    //   buffer by a fixed factor.

    if (pairlist_norm == NULL) { // NOTE: Only checking one pointer, but all will be allocated together
      // NOTE: This only occurs once when the compute is run for the first time

      int plCount_norm = 16;
      int plCount_mod = 16;
      int plCount_excl = 16;
      #if MIC_CONDITION_NORMAL != 0
        int plCount_normhi = 16;
        // NOTE: This must match the normhi_split value declared after this block;
        //   it is needed here because the counting loop below uses it.
        const double normhi_split = (0.25 * cutoff2) + (0.75 * plcutoff2);
      #endif

      int numParts = params.pp->numParts;
      int part = params.pp->part;
      for (int i = part; i < i_upper; i += numParts) {

        // Load position information for the current "i" atom
        #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
          CALC_TYPE p_i_x = p_0[i].x + offset_x;
          CALC_TYPE p_i_y = p_0[i].y + offset_y;
          CALC_TYPE p_i_z = p_0[i].z + offset_z;
        #else
          CALC_TYPE p_i_x = p_0_x[i] + offset_x;
          CALC_TYPE p_i_y = p_0_y[i] + offset_y;
          CALC_TYPE p_i_z = p_0_z[i] + offset_z;
        #endif

        for (int j = 0 SELF(+i+1); j < j_upper; j++) {

          // Load the "j" atom's position, calculate/check the distance (squared)
          //   between the "i" and "j" atoms
          #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
            CALC_TYPE p_d_x = p_i_x - p_1[j].x;
            CALC_TYPE p_d_y = p_i_y - p_1[j].y;
            CALC_TYPE p_d_z = p_i_z - p_1[j].z;
          #else
            CALC_TYPE p_d_x = p_i_x - p_1_x[j];
            CALC_TYPE p_d_y = p_i_y - p_1_y[j];
            CALC_TYPE p_d_z = p_i_z - p_1_z[j];
          #endif
          CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
          if (r2 <= plcutoff2) {

            // Check the exclusion bits to set the modified and excluded flags
            //   NOTE: This code MUST match the exclusion list generation code
            //   in ComputeNonbondedMIC.C, which generates the flags.  In
            //   particular, the order of the 2 bits must be correct, per the
            //   pairlist format listed below (i.e. isModified -> MSB).
            int exclFlags = 0x00;
            #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
              const int indexDiff = pExt_0[i].index - pExt_1[j].index;
              const int maxDiff = pExt_1[j].excl_maxdiff;
            #else
              const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
              const int maxDiff = pExt_1_exclMaxDiff[j];
            #endif
            if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
              #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
                const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
              #else
                const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
              #endif
              const int offset_major = offset / (sizeof(unsigned int) * 8);
              const int offset_minor = offset % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
              exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
            }
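
            // e.g. offset = 70 selects bits 6..7 of params.exclusion_bits[2]
            //   (70 / 32 = 2, 70 % 32 = 6), yielding 0 (normal), 1 (excluded),
            //   or 2 (modified) for this pair.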

            // Count this would-be pairlist entry toward the appropriate pairlist,
            //   based on the type of interaction (exclFlags value)
            if (exclFlags == 0) {
              #if MIC_CONDITION_NORMAL != 0
                if (r2 <= normhi_split) { plCount_norm++; }
                else { plCount_normhi++; }
              #else
                plCount_norm++;
              #endif
            } else if (exclFlags == 1) {
              plCount_excl++;
            } else if (exclFlags == 2) {
              plCount_mod++;
            }

          } // end if (r2 <= plcutoff2)
        } // end for (j < j_upper)
      } // end for (i < i_upper)

      // If padding the pairlists, add some extra room for padding ('vector width * num sub-lists' total)
      #if __MIC_PAD_PLGEN_CTRL != 0
        plCount_norm += (16 * i_upper);
        plCount_mod += (16 * i_upper);
        plCount_excl += (16 * i_upper);
        #if MIC_CONDITION_NORMAL != 0
          plCount_normhi += (16 * i_upper);
        #endif
      #endif

      // Add a constant percentage and round up to a multiple of 2048
      const float plPercent = 1.4;
      const int plRound = 2048;
      plCount_norm = (int)(plCount_norm * plPercent);  plCount_norm = (~(plRound-1)) & (plCount_norm + (plRound-1));
      plCount_mod  = (int)(plCount_mod  * plPercent);  plCount_mod  = (~(plRound-1)) & (plCount_mod  + (plRound-1));
      plCount_excl = (int)(plCount_excl * plPercent);  plCount_excl = (~(plRound-1)) & (plCount_excl + (plRound-1));
      #if MIC_CONDITION_NORMAL != 0
        plCount_normhi = (int)(plCount_normhi * plPercent);  plCount_normhi = (~(plRound-1)) & (plCount_normhi + (plRound-1));
        plCount_norm += plCount_normhi;
      #endif
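
      // e.g. the rounding above maps plCount_norm = 5000 to
      //   (~2047) & (5000 + 2047) = 6144, the next multiple of 2048.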

      // DMK - DEBUG - printing allocations, but reduce output by skipping initial pairlist allocations in timestep 0 (lots of them)
      //   NOTE: This check is temporary for debugging, remove (keep wrapper version) when no longer needed
      if (device__timestep == 0) { /* NOTE: Avoid all the allocation prints during timestep 0 */
        pairlist_norm = (int*)(_mm_malloc(plCount_norm * sizeof(int), 64));  __ASSERT(pairlist_norm != NULL);
        pairlist_mod  = (int*)(_mm_malloc(plCount_mod  * sizeof(int), 64));  __ASSERT(pairlist_mod  != NULL);
        pairlist_excl = (int*)(_mm_malloc(plCount_excl * sizeof(int), 64));  __ASSERT(pairlist_excl != NULL);
        #if MIC_CONDITION_NORMAL != 0
          pairlist_normhi = (int*)(_mm_malloc(plCount_normhi * sizeof(int), 64));  __ASSERT(pairlist_normhi != NULL);
        #endif
      } else {
        pairlist_norm = (int*)(_MM_MALLOC_WRAPPER(plCount_norm * sizeof(int), 64, "pairlist_norm"));  __ASSERT(pairlist_norm != NULL);
        pairlist_mod  = (int*)(_MM_MALLOC_WRAPPER(plCount_mod  * sizeof(int), 64, "pairlist_mod"));  __ASSERT(pairlist_mod  != NULL);
        pairlist_excl = (int*)(_MM_MALLOC_WRAPPER(plCount_excl * sizeof(int), 64, "pairlist_excl"));  __ASSERT(pairlist_excl != NULL);
        #if MIC_CONDITION_NORMAL != 0
          pairlist_normhi = (int*)(_MM_MALLOC_WRAPPER(plCount_normhi * sizeof(int), 64, "pairlist_normhi"));  __ASSERT(pairlist_normhi != NULL);
        #endif
      }

      pairlist_norm += 14;  pairlist_norm[0] = plCount_norm;  pairlist_norm[1] = 2;
      pairlist_mod  += 14;  pairlist_mod[0]  = plCount_mod;   pairlist_mod[1]  = 2;
      pairlist_excl += 14;  pairlist_excl[0] = plCount_excl;  pairlist_excl[1] = 2;
      #if MIC_CONDITION_NORMAL != 0
        pairlist_normhi += 14;  pairlist_normhi[0] = plCount_normhi;  pairlist_normhi[1] = 2;
      #endif

      // DMK - DEBUG - Pairlist memory stats
      #if MIC_TRACK_DEVICE_MEM_USAGE != 0
        pairlist_norm[-1] = 0;
        pairlist_mod[-1] = 0;
        pairlist_excl[-1] = 0;
        #if MIC_CONDITION_NORMAL != 0
          pairlist_normhi[-1] = 0;
        #endif
      #endif

    } // end if (pairlist_norm == NULL)

    // Create the pairlists from scratch.  The first two elements in each pairlist
    //   are special values (element 0 is the allocated size of the memory buffer
    //   and element 1 is the length of the pairlist itself, including these
    //   first two elements).
    int plOffset_norm = 2; // Current length of the normal pairlist
    int plOffset_mod  = 2; // Current length of the modified pairlist
    int plOffset_excl = 2; // Current length of the excluded pairlist
    #if MIC_CONDITION_NORMAL != 0
      double normhi_split = (0.25 * cutoff2) + (0.75 * plcutoff2); // Weighted average of cutoff2 and plcutoff2
      int plOffset_normhi = 2; // Current length of the "normal high" pairlist (entries
                               //   that belong in the normal pairlist, but that are
                               //   further than the normhi_split distance, such that
                               //   cutoff2 <= normhi_split <= plcutoff2).
    #endif
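
    // NOTE (assumption): separating the near (r2 <= normhi_split) and far
    //   normal entries presumably lets the far entries, which are more likely
    //   to fall outside cutoff2 at force time, be appended after the near ones
    //   rather than interleaved with them.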

    // If tiling of the pairlist is enabled...
    #if MIC_TILE_PLGEN != 0

      // Calculate the loop tiling size
      int plTileSize = (i_upper > j_upper) ? (i_upper) : (j_upper); // Max of the list sizes ...
      //int plTileSize = (i_upper + j_upper) / 2;                   // ... (or their average) ...
      plTileSize /= MIC_TILE_PLGEN;           // ... divided by MIC_TILE_PLGEN and ...
      plTileSize = (plTileSize + 15) & (~15); // ... rounded up to a multiple of 16
      __ASSUME(plTileSize % 16 == 0);
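
      // e.g. i_upper = 1000, j_upper = 600, MIC_TILE_PLGEN = 4 gives
      //   plTileSize = ((1000 / 4) + 15) & ~15 = 256.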

      // The "i" and "j" tile loops
      for (int _i = 0; _i < i_upper SELF(-1); _i += plTileSize) {
        for (int _j = 0 SELF(+ _i); _j < j_upper; _j += plTileSize) {

          // Calculate the "i" loop bounds for the current tile
          int i_lo = _i;
          int i_hi = _i + plTileSize;
          i_hi = (i_hi > i_upper SELF(-1)) ? (i_upper SELF(-1)) : (i_hi);

          // The "i" loop, iterating over the first list of atoms
          for (int i = i_lo; i < i_hi; i++) {

            // If the pairlist is not long enough, grow the pairlist
            PAIRLIST_GROW_CHECK(pairlist_norm, plOffset_norm, 1.4);
            PAIRLIST_GROW_CHECK(pairlist_mod , plOffset_mod,  1.2);
            PAIRLIST_GROW_CHECK(pairlist_excl, plOffset_excl, 1.2);
            #if MIC_CONDITION_NORMAL != 0
              PAIRLIST_GROW_CHECK(pairlist_normhi, plOffset_normhi, 1.3);
            #endif

            // Load position information for the current "i" atom
            #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
              CALC_TYPE p_i_x = p_0[i].x + offset_x;
              CALC_TYPE p_i_y = p_0[i].y + offset_y;
              CALC_TYPE p_i_z = p_0[i].z + offset_z;
            #else
              CALC_TYPE p_i_x = p_0_x[i] + offset_x;
              CALC_TYPE p_i_y = p_0_y[i] + offset_y;
              CALC_TYPE p_i_z = p_0_z[i] + offset_z;
            #endif

            // Calculate the "j" loop bounds for the current tile
            int j_lo = PAIR(_j) SELF((_i == _j) ? (i+1) : (_j));
            int j_hi = _j + plTileSize;
            j_hi = (j_hi > j_upper) ? (j_upper) : (j_hi);

            #if MIC_HANDCODE_PLGEN != 0

              __m512 p_i_x_vec = _mm512_set_1to16_ps(p_i_x);
              __m512 p_i_y_vec = _mm512_set_1to16_ps(p_i_y);
              __m512 p_i_z_vec = _mm512_set_1to16_ps(p_i_z);
              __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0_index[i]);

              __m512i i_shifted_vec = _mm512_slli_epi32(_mm512_set_1to16_epi32(i), 16);
              __m512i j_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                                       7,  6,  5,  4,  3,  2,  1,  0);
              // For pair computes, start j_vec at the tile's lower bound.  For self
              //   computes, round j_lo down to the nearest multiple of 16 and add
              //   that value to j_vec (skipping iterations where all j values
              //   are < j_lo)
              PAIR( j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(j_lo)); )
              #if (0 SELF(+1))
                const int j_lo_16 = (j_lo) & (~0x0F);
                j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(j_lo_16));
                const int j_lo_m1 = j_lo - 1;
              #endif

              // The "j" loop, iterating over the second list of atoms
              #pragma novector
              #pragma loop count(35)
              for (int j = PAIR(j_lo) SELF(j_lo_16); j < j_hi; j += 16) {

                // Create the active_mask
                __mmask16 active_mask = _mm512_cmplt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_hi));
                #if (0 SELF(+1))
                  active_mask = _mm512_kand(active_mask, _mm512_cmpgt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_lo_m1)));
                #endif

                // Create the pairlist entry values for these would-be interactions before advancing
                //   the j_vec values (i in upper 16 bits and j in lower 16 bits)
                __m512i plEntry_vec = _mm512_or_epi32(i_shifted_vec, j_vec);

                // Advance j_vec counter
                j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(16));

                // Load the positions for the "j" atoms
                #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
                  __m512i p_j_tmp0a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j     )); // Load first set of 4 atoms
                  __m512i p_j_tmp1a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  4)); // Load second set of 4 atoms
                  __m512i p_j_tmp2a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  8)); // Load third set of 4 atoms
                  __m512i p_j_tmp3a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 12)); // Load fourth set of 4 atoms
                  __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
                  __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
                  __m512i p_j_tmp0b_vec = _mm512_mask_swizzle_epi32(p_j_tmp0a_vec, k_2x2_0, p_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i p_j_tmp1b_vec = _mm512_mask_swizzle_epi32(p_j_tmp1a_vec, k_2x2_1, p_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i p_j_tmp2b_vec = _mm512_mask_swizzle_epi32(p_j_tmp2a_vec, k_2x2_0, p_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i p_j_tmp3b_vec = _mm512_mask_swizzle_epi32(p_j_tmp3a_vec, k_2x2_1, p_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
                  __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
                  __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
                  __m512i p_j_tmp0c_vec = _mm512_mask_swizzle_epi32(p_j_tmp0b_vec, k_4x4_0, p_j_tmp2b_vec, _MM_SWIZ_REG_BADC);
                  __m512i p_j_tmp1c_vec = _mm512_mask_swizzle_epi32(p_j_tmp1b_vec, k_4x4_0, p_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
                  __m512i p_j_tmp2c_vec = _mm512_mask_swizzle_epi32(p_j_tmp2b_vec, k_4x4_1, p_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
                  __m512i p_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
                  __m512 p_j_x_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp0c_vec));
                  __m512 p_j_y_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp1c_vec));
                  __m512 p_j_z_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp2c_vec));
                #else
                  __m512 p_j_x_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_x + j);
                  __m512 p_j_y_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_y + j);
                  __m512 p_j_z_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_z + j);
                #endif
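
                // NOTE: The swizzle/permute sequence in the AoS branch above is a
                //   16x4 transpose: the masked 2x2 (CDAB) and 4x4 (BADC) swizzles
                //   interleave the four 16-float loads (16 atoms of {x,y,z,q}),
                //   and the final permutes gather x, y, and z each into one
                //   contiguous 16-wide vector (q is not needed here).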

                // Calculate the distance between the "i" atom and these "j" atoms (in single precision)
                __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
                __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
                __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
                __m512 r2_vec = _mm512_mul_ps(p_ij_x_vec, p_ij_x_vec);
                r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_y_vec, p_ij_y_vec));
                r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_z_vec, p_ij_z_vec));

                // Do cutoff distance check.  If there are no particles within cutoff, move on
                //   to the next iteration.
                __mmask16 cutoff_mask = _mm512_mask_cmple_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps((float)(plcutoff2)));

                if (_mm512_kortestz(cutoff_mask, cutoff_mask)) { continue; }

                #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
                  __m512i pExt_j_tmp0a_vec = _mm512_load_epi32(pExt_1 + j     ); // Load first set of 4 atoms
                  __m512i pExt_j_tmp1a_vec = _mm512_load_epi32(pExt_1 + j +  4); // Load second set of 4 atoms
                  __m512i pExt_j_tmp2a_vec = _mm512_load_epi32(pExt_1 + j +  8); // Load third set of 4 atoms
                  __m512i pExt_j_tmp3a_vec = _mm512_load_epi32(pExt_1 + j + 12); // Load fourth set of 4 atoms
                  __m512i pExt_j_tmp0b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp0a_vec, k_2x2_0, pExt_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i pExt_j_tmp1b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1a_vec, k_2x2_1, pExt_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i pExt_j_tmp2b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2a_vec, k_2x2_0, pExt_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i pExt_j_tmp3b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3a_vec, k_2x2_1, pExt_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
                  __m512i pExt_j_tmp1c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1b_vec, k_4x4_0, pExt_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
                  __m512i pExt_j_tmp2c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2b_vec, k_4x4_1, pExt_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
                  __m512i pExt_j_tmp3c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3b_vec, k_4x4_1, pExt_j_tmp1b_vec, _MM_SWIZ_REG_BADC);
                  __m512i pExt_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
                  __m512i pExt_j_index_vec     = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp1c_vec);
                  __m512i pExt_j_exclIndex_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp2c_vec);
                  __m512i pExt_j_maxDiff_vec   = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp3c_vec);
                #else
                  __m512i pExt_j_index_vec     = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_index + j);
                  __m512i pExt_j_maxDiff_vec   = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclMaxDiff + j);
                  __m512i pExt_j_exclIndex_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclIndex + j);
                #endif

                // Check the index difference vs the maxDiff value to see whether the exclusion
                //   bits for this particular pair of atoms need to be loaded.
                __m512i indexDiff_vec = _mm512_mask_sub_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_i_index_vec, pExt_j_index_vec);
                __mmask16 indexRange_min_mask = _mm512_mask_cmpge_epi32_mask(cutoff_mask, indexDiff_vec, _mm512_sub_epi32(_mm512_setzero_epi32(), pExt_j_maxDiff_vec)); // NOTE: indexDiff >= -1 * maxDiff
                __mmask16 indexRange_max_mask = _mm512_mask_cmple_epi32_mask(cutoff_mask, indexDiff_vec, pExt_j_maxDiff_vec);
                __mmask16 indexRange_mask = _mm512_kand(indexRange_min_mask, indexRange_max_mask);

                // Calculate offsets into the exclusion flags
                __m512i offset_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), indexRange_mask, _mm512_slli_epi32(indexDiff_vec, 1), pExt_j_exclIndex_vec);
                __m512i offset_major_vec = _mm512_srli_epi32(offset_vec, 5); // NOTE : offset / 32
                __m512i offset_minor_vec = _mm512_and_epi32(_mm512_set_1to16_epi32(0x001f), offset_vec); // NOTE : offset % 32

                // Gather exclFlags using offset_major values and then extract 2-bit fields using offset_minor.
                #if 0
                  int offset_major[16] __attribute__((aligned(64)));
                  int exclFlags[16] __attribute__((aligned(64)));
                  _mm512_store_epi32(offset_major, offset_major_vec);
                  exclFlags[ 0] = params.exclusion_bits[offset_major[ 0]];
                  exclFlags[ 1] = params.exclusion_bits[offset_major[ 1]];
                  exclFlags[ 2] = params.exclusion_bits[offset_major[ 2]];
                  exclFlags[ 3] = params.exclusion_bits[offset_major[ 3]];
                  exclFlags[ 4] = params.exclusion_bits[offset_major[ 4]];
                  exclFlags[ 5] = params.exclusion_bits[offset_major[ 5]];
                  exclFlags[ 6] = params.exclusion_bits[offset_major[ 6]];
                  exclFlags[ 7] = params.exclusion_bits[offset_major[ 7]];
                  exclFlags[ 8] = params.exclusion_bits[offset_major[ 8]];
                  exclFlags[ 9] = params.exclusion_bits[offset_major[ 9]];
                  exclFlags[10] = params.exclusion_bits[offset_major[10]];
                  exclFlags[11] = params.exclusion_bits[offset_major[11]];
                  exclFlags[12] = params.exclusion_bits[offset_major[12]];
                  exclFlags[13] = params.exclusion_bits[offset_major[13]];
                  exclFlags[14] = params.exclusion_bits[offset_major[14]];
                  exclFlags[15] = params.exclusion_bits[offset_major[15]];
                  __m512i exclFlags_vec = _mm512_load_epi32(exclFlags);
                #else
                  __m512i exclFlags_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, offset_major_vec, params.exclusion_bits, _MM_SCALE_4);
                #endif
                exclFlags_vec = _mm512_mask_srlv_epi32(_mm512_setzero_epi32(), indexRange_mask, exclFlags_vec, offset_minor_vec);
                exclFlags_vec = _mm512_and_epi32(exclFlags_vec, _mm512_set_1to16_epi32(0x03)); // NOTE : Mask out all but 2 LSBs

                // Create masks for each type of interaction (normal, modified, excluded) and
                //   store the generated pairlist entry values into the pairlists
                __mmask16 norm_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_setzero_epi32());
                #if MIC_CONDITION_NORMAL != 0
                  __mmask16 normhi_mask = _mm512_mask_cmplt_ps_mask(norm_mask, _mm512_set_1to16_ps(normhi_split), r2_vec);
                  norm_mask = _mm512_kxor(norm_mask, normhi_mask); // Unset any bits that were set in normhi
                #endif
                __mmask16 excl_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(1));
                __mmask16 mod_mask  = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(2));
                _mm512_mask_packstorelo_epi32(pairlist_norm + plOffset_norm     , norm_mask, plEntry_vec);
                _mm512_mask_packstorehi_epi32(pairlist_norm + plOffset_norm + 16, norm_mask, plEntry_vec);
                #if MIC_CONDITION_NORMAL != 0
                  _mm512_mask_packstorelo_epi32(pairlist_normhi + plOffset_normhi     , normhi_mask, plEntry_vec);
                  _mm512_mask_packstorehi_epi32(pairlist_normhi + plOffset_normhi + 16, normhi_mask, plEntry_vec);
                #endif
                _mm512_mask_packstorelo_epi32(pairlist_excl + plOffset_excl     , excl_mask, plEntry_vec);
                _mm512_mask_packstorehi_epi32(pairlist_excl + plOffset_excl + 16, excl_mask, plEntry_vec);
                _mm512_mask_packstorelo_epi32(pairlist_mod + plOffset_mod     , mod_mask, plEntry_vec);
                _mm512_mask_packstorehi_epi32(pairlist_mod + plOffset_mod + 16, mod_mask, plEntry_vec);
                __m512i one_vec = _mm512_set_1to16_epi32(1);
                plOffset_norm += _mm512_mask_reduce_add_epi32(norm_mask, one_vec);
                #if MIC_CONDITION_NORMAL != 0
                  plOffset_normhi += _mm512_mask_reduce_add_epi32(normhi_mask, one_vec);
                #endif
                plOffset_excl += _mm512_mask_reduce_add_epi32(excl_mask, one_vec);
                plOffset_mod  += _mm512_mask_reduce_add_epi32( mod_mask, one_vec);
              }

            #else

              // The "j" loop, iterating over the second list of atoms
              for (int j = j_lo; j < j_hi; j++) {

                // Load the "j" atom's position, calculate/check the distance (squared)
                //   between the "i" and "j" atoms
                #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
                  CALC_TYPE p_d_x = p_i_x - p_1[j].x;
                  CALC_TYPE p_d_y = p_i_y - p_1[j].y;
                  CALC_TYPE p_d_z = p_i_z - p_1[j].z;
                #else
                  CALC_TYPE p_d_x = p_i_x - p_1_x[j];
                  CALC_TYPE p_d_y = p_i_y - p_1_y[j];
                  CALC_TYPE p_d_z = p_i_z - p_1_z[j];
                #endif
                CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
                if (r2 <= plcutoff2) {

                  // Check the exclusion bits to set the modified and excluded flags
                  //   NOTE: This code MUST match the exclusion list generation code
                  //   in ComputeNonbondedMIC.C, which generates the flags.  In
                  //   particular, the order of the 2 bits must be correct, per the
                  //   pairlist format listed below (i.e. isModified -> MSB).
                  int exclFlags = 0x00;
                  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
                    const int indexDiff = pExt_0[i].index - pExt_1[j].index;
                    const int maxDiff = pExt_1[j].excl_maxdiff;
                  #else
                    const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
                    const int maxDiff = pExt_1_exclMaxDiff[j];
                  #endif
                  if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
                    #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
                      const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
                    #else
                      const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
                    #endif
                    const int offset_major = offset / (sizeof(unsigned int) * 8);
                    const int offset_minor = offset % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
                    exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
                  }

                  // Create the pairlist entry value and store it
                  const int plEntry = ((i & 0xFFFF) << 16) | (j & 0xFFFF);
                  if (exclFlags == 0) {
                    #if MIC_CONDITION_NORMAL != 0
                      if (r2 <= normhi_split) {
                        pairlist_norm[plOffset_norm++] = plEntry;
                      } else {
                        pairlist_normhi[plOffset_normhi++] = plEntry;
                      }
                    #else
                      pairlist_norm[plOffset_norm++] = plEntry;
                    #endif
                  } else if (exclFlags == 1) {
                    pairlist_excl[plOffset_excl++] = plEntry;
                  } else if (exclFlags == 2) {
                    pairlist_mod[plOffset_mod++] = plEntry;
                  }

                } // end if (r2 <= plcutoff2)
              } // end for (j < j_hi)

            #endif

            #if __MIC_PAD_PLGEN_CTRL != 0
              #if MIC_HANDCODE_FORCE_SINGLE != 0
                const int padLen = 16;
              #else
                const int padLen = 8;
              #endif
              const int padValue = (i << 16) | 0xFFFF;
              // NOTE: xxx % padLen != 2 because the pairlist offset includes the first two ints (alloc size and list length)
              while (plOffset_norm % padLen != 2) { pairlist_norm[plOffset_norm++] = padValue; }
              while (plOffset_mod  % padLen != 2) { pairlist_mod [plOffset_mod++ ] = padValue; }
              while (plOffset_excl % padLen != 2) { pairlist_excl[plOffset_excl++] = padValue; }
              #if MIC_CONDITION_NORMAL != 0
                while (plOffset_normhi % padLen != 2) { pairlist_normhi[plOffset_normhi++] = padValue; }
              #endif
            #endif

          } // end for (i < i_hi)
        } // end (_j < j_upper; _j += plTileSize)
      } // end (_i < i_upper; _i += plTileSize)

    #else // MIC_TILE_PLGEN

      // Do the distance checks for all possible pairs of atoms between the
      //   two input atom arrays
      // The "i" loop, iterating over the first list of atoms

      int numParts = params.pp->numParts;
      int part = params.pp->part;

      // Generate plGenILo and plGenIHi values (the portion of the "i" atoms that will be
      //   processed by this "part").  Note that this is done differently for self computes,
      //   because of the unequal work per "i" atom.  For pair computes, the "i" iteration
      //   space is just divided up evenly.
      #if 0
      int plGenILo = 0;
      int plGenIHi = i_upper SELF(-1);
      if (numParts > 1) {

        #if (0 SELF(+1))

          // For self computes where the iteration space forms a triangle, divide the rows in the
          //   iteration space so more "shorter" rows are grouped together while fewer "longer" rows
          //   are grouped together.
          float totalArea = ((i_upper) * (i_upper - 1)) / 2.0f;
          float areaPerPart = totalArea / numParts;

          // NOTE : We want to divide the triangular iteration space (0 <= i < i_upper - 1, i < j < i_upper).
          //   Since we know the area per part and the area of the iteration space "before" some arbitrary
          //   index X (i.e. 0 <= i < X), we can calculate indexes for each part.
          //     A = X(X-1)/2 where A = area "before" X
          //   Solving for X we get X^2 - X - 2A = 0, or via the quadratic equation, X = (1 +- sqrt(1+8A))/2.
          //   NOTE: This calculation might be a little messy, so double check the border cases.
          plGenILo = ((part==0)?(0) : ((int)((1.0f+sqrtf(1+8*(areaPerPart*part)))/2.0f)) );
          plGenIHi = ((part==numParts-1)?(i_upper-1) : ((int)((1.0f+sqrtf(1+8*(areaPerPart*(part+1))))/2.0f)) );
          // Reverse the indexes since this calculation assumes i=0 has little work and
          //   i=i_upper-1 has lots of work (i.e. upper triangular versus lower triangular)
          int plGenTmp = plGenILo;
          plGenILo = (i_upper - 1) - plGenIHi;
          plGenIHi = (i_upper - 1) - plGenTmp;
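
          // Worked check of the quadratic above: i_upper = 5 gives totalArea = 10;
          //   with numParts = 2, areaPerPart = 5, and the interior boundary is
          //   X = (1 + sqrtf(1 + 40)) / 2 ~= 3.7 -> 3, i.e. rows [0, 3) form the
          //   first part before the reversal flips the split onto the
          //   lower-triangular orientation used below.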

        #else // SELF

          // For pair computes where the iteration space forms a square, divide the rows in the
          //   iteration space evenly since they all have the same area
          plGenILo = (int)(((float)(i_upper)) * ((float)(part    )) / ((float)(numParts)));
          plGenIHi = (int)(((float)(i_upper)) * ((float)(part + 1)) / ((float)(numParts)));

        #endif // SELF

        // Round up plGenILo and plGenIHi to a multiple of 16 so there is no false sharing
        //   on force output cachelines
        plGenILo = (plGenILo + 15) & (~15);
        plGenIHi = (plGenIHi + 15) & (~15);
        if (plGenIHi > i_upper SELF(-1)) { plGenIHi = i_upper SELF(-1); }
      }

      #pragma loop_count (300)
      for (int i = plGenILo; i < plGenIHi; i++) {

      #else

      #pragma loop_count (300)
      for (int i = part; i < i_upper SELF(-1); i += numParts) {

      #endif

        // If the pairlist is not long enough, grow the pairlist
        PAIRLIST_GROW_CHECK(pairlist_norm, plOffset_norm, 50);
        PAIRLIST_GROW_CHECK(pairlist_mod , plOffset_mod,   8);
        PAIRLIST_GROW_CHECK(pairlist_excl, plOffset_excl,  8);
        #if MIC_CONDITION_NORMAL != 0
          PAIRLIST_GROW_CHECK(pairlist_normhi, plOffset_normhi, 15);
        #endif

        // Load position information for the current "i" atom
        #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
          CALC_TYPE p_i_x = p_0[i].x + offset_x;
          CALC_TYPE p_i_y = p_0[i].y + offset_y;
          CALC_TYPE p_i_z = p_0[i].z + offset_z;
        #else
          CALC_TYPE p_i_x = p_0_x[i] + offset_x;
          CALC_TYPE p_i_y = p_0_y[i] + offset_y;
          CALC_TYPE p_i_z = p_0_z[i] + offset_z;
        #endif

        #if MIC_HANDCODE_PLGEN != 0

          // Setup loop constants ("i" atom values) and iterator vector.
          __m512 p_i_x_vec = _mm512_set_1to16_ps(p_i_x);
          __m512 p_i_y_vec = _mm512_set_1to16_ps(p_i_y);
          __m512 p_i_z_vec = _mm512_set_1to16_ps(p_i_z);
          #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
            __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0[i].index);
          #else
            __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0_index[i]);
          #endif

          __m512i i_shifted_vec = _mm512_slli_epi32(_mm512_set_1to16_epi32(i), 16);
          __m512i j_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                                   7,  6,  5,  4,  3,  2,  1,  0);

          // For self computes, round (i+1) down to the nearest multiple of 16 and then
          //   add that value to j_vec (skipping iterations where all j values
          //   are < i+1)
          #if (0 SELF(+1))
            const int ip1_16 = (i+1) & (~0x0F);
            j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(ip1_16));
          #endif

          // The "j" loop, iterating over the second list of atoms
          #pragma novector
          #pragma loop count(35)
          for (int j = 0 SELF(+ ip1_16); j < j_upper; j += 16) {

            // Create the active_mask
            __mmask16 active_mask = _mm512_cmplt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_upper));
            #if (0 SELF(+1))
              active_mask = _mm512_kand(active_mask, _mm512_cmpgt_epi32_mask(j_vec, _mm512_set_1to16_epi32(i)));
            #endif

            // Create the pairlist entry values for these would-be interactions before advancing
            //   the j_vec values (i in upper 16 bits and j in lower 16 bits)
            __m512i plEntry_vec = _mm512_or_epi32(i_shifted_vec, j_vec);

            // Advance j_vec counter
            j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(16));

            // Load the positions for the "j" atoms
            #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
              __m512i p_j_tmp0a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j     )); // Load first set of 4 atoms
              __m512i p_j_tmp1a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  4)); // Load second set of 4 atoms
              __m512i p_j_tmp2a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  8)); // Load third set of 4 atoms
              __m512i p_j_tmp3a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 12)); // Load fourth set of 4 atoms
              __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
              __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
              __m512i p_j_tmp0b_vec = _mm512_mask_swizzle_epi32(p_j_tmp0a_vec, k_2x2_0, p_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
              __m512i p_j_tmp1b_vec = _mm512_mask_swizzle_epi32(p_j_tmp1a_vec, k_2x2_1, p_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
              __m512i p_j_tmp2b_vec = _mm512_mask_swizzle_epi32(p_j_tmp2a_vec, k_2x2_0, p_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
              __m512i p_j_tmp3b_vec = _mm512_mask_swizzle_epi32(p_j_tmp3a_vec, k_2x2_1, p_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
              __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
              __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
              __m512i p_j_tmp0c_vec = _mm512_mask_swizzle_epi32(p_j_tmp0b_vec, k_4x4_0, p_j_tmp2b_vec, _MM_SWIZ_REG_BADC);
              __m512i p_j_tmp1c_vec = _mm512_mask_swizzle_epi32(p_j_tmp1b_vec, k_4x4_0, p_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
              __m512i p_j_tmp2c_vec = _mm512_mask_swizzle_epi32(p_j_tmp2b_vec, k_4x4_1, p_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
              __m512i p_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
              __m512 p_j_x_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp0c_vec));
              __m512 p_j_y_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp1c_vec));
              __m512 p_j_z_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp2c_vec));
            #else
              __m512 p_j_x_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_x + j);
              __m512 p_j_y_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_y + j);
              __m512 p_j_z_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_z + j);
            #endif

            // Calculate the distance between the "i" atom and these "j" atoms (in single precision)
            __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
            __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
            __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
            __m512 r2_vec = _mm512_mul_ps(p_ij_x_vec, p_ij_x_vec);
            r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_y_vec, p_ij_y_vec));
            r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_z_vec, p_ij_z_vec));

            // Do cutoff distance check.  If there are no particles within cutoff, move on
            //   to the next iteration.
            __mmask16 cutoff_mask = _mm512_mask_cmple_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps((float)(plcutoff2)));

            // If nothing passed the cutoff check, then move on to the next set of atoms (iteration)
            if (_mm512_kortestz(cutoff_mask, cutoff_mask)) { continue; }

            #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
              __m512i pExt_j_tmp0a_vec = _mm512_load_epi32(pExt_1 + j     ); // Load first set of 4 atoms
              __m512i pExt_j_tmp1a_vec = _mm512_load_epi32(pExt_1 + j +  4); // Load second set of 4 atoms
              __m512i pExt_j_tmp2a_vec = _mm512_load_epi32(pExt_1 + j +  8); // Load third set of 4 atoms
              __m512i pExt_j_tmp3a_vec = _mm512_load_epi32(pExt_1 + j + 12); // Load fourth set of 4 atoms
              __m512i pExt_j_tmp0b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp0a_vec, k_2x2_0, pExt_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
              __m512i pExt_j_tmp1b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1a_vec, k_2x2_1, pExt_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
              __m512i pExt_j_tmp2b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2a_vec, k_2x2_0, pExt_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
              __m512i pExt_j_tmp3b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3a_vec, k_2x2_1, pExt_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
              __m512i pExt_j_tmp1c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1b_vec, k_4x4_0, pExt_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
              __m512i pExt_j_tmp2c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2b_vec, k_4x4_1, pExt_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
              __m512i pExt_j_tmp3c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3b_vec, k_4x4_1, pExt_j_tmp1b_vec, _MM_SWIZ_REG_BADC);
              __m512i pExt_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
              __m512i pExt_j_index_vec     = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp1c_vec);
              __m512i pExt_j_exclIndex_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp2c_vec);
              __m512i pExt_j_maxDiff_vec   = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp3c_vec);
            #else
              __m512i pExt_j_index_vec     = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_index + j);
              __m512i pExt_j_maxDiff_vec   = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclMaxDiff + j);
              __m512i pExt_j_exclIndex_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclIndex + j);
            #endif

            // Check the index difference vs the maxDiff value to see whether the exclusion
            //   bits for this particular pair of atoms need to be loaded.
            __m512i indexDiff_vec = _mm512_mask_sub_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_i_index_vec, pExt_j_index_vec);
            __mmask16 indexRange_min_mask = _mm512_mask_cmpge_epi32_mask(cutoff_mask, indexDiff_vec, _mm512_sub_epi32(_mm512_setzero_epi32(), pExt_j_maxDiff_vec)); // NOTE: indexDiff >= -1 * maxDiff
            __mmask16 indexRange_max_mask = _mm512_mask_cmple_epi32_mask(cutoff_mask, indexDiff_vec, pExt_j_maxDiff_vec);
            __mmask16 indexRange_mask = _mm512_kand(indexRange_min_mask, indexRange_max_mask);

            // Calculate offsets into the exclusion flags
            __m512i offset_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), indexRange_mask, _mm512_slli_epi32(indexDiff_vec, 1), pExt_j_exclIndex_vec);
            __m512i offset_major_vec = _mm512_srli_epi32(offset_vec, 5); // NOTE : offset / 32
            __m512i offset_minor_vec = _mm512_and_epi32(_mm512_set_1to16_epi32(0x001f), offset_vec); // NOTE : offset % 32

            // Gather exclFlags using offset_major values and then extract 2-bit fields using offset_minor.
            __m512i exclFlags_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, offset_major_vec, params.exclusion_bits, _MM_SCALE_4);
            exclFlags_vec = _mm512_mask_srlv_epi32(_mm512_setzero_epi32(), indexRange_mask, exclFlags_vec, offset_minor_vec);
            exclFlags_vec = _mm512_and_epi32(exclFlags_vec, _mm512_set_1to16_epi32(0x03)); // NOTE : Mask out all but 2 LSBs

            // Create masks for each type of interaction (normal, modified, excluded) and
            //   store the generated pairlist entry values into the pairlists
            __mmask16 norm_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_setzero_epi32());
            #if MIC_CONDITION_NORMAL != 0
              __mmask16 normhi_mask = _mm512_mask_cmplt_ps_mask(norm_mask, _mm512_set_1to16_ps(normhi_split), r2_vec);
              norm_mask = _mm512_kxor(norm_mask, normhi_mask); // Unset any bits that were set in normhi
            #endif
            __mmask16 excl_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(1));
            __mmask16 mod_mask  = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(2));
            _mm512_mask_packstorelo_epi32(pairlist_norm + plOffset_norm     , norm_mask, plEntry_vec);
            _mm512_mask_packstorehi_epi32(pairlist_norm + plOffset_norm + 16, norm_mask, plEntry_vec);
            #if MIC_CONDITION_NORMAL != 0
              _mm512_mask_packstorelo_epi32(pairlist_normhi + plOffset_normhi     , normhi_mask, plEntry_vec);
              _mm512_mask_packstorehi_epi32(pairlist_normhi + plOffset_normhi + 16, normhi_mask, plEntry_vec);
            #endif
            _mm512_mask_packstorelo_epi32(pairlist_excl + plOffset_excl     , excl_mask, plEntry_vec);
            _mm512_mask_packstorehi_epi32(pairlist_excl + plOffset_excl + 16, excl_mask, plEntry_vec);
            _mm512_mask_packstorelo_epi32(pairlist_mod + plOffset_mod     , mod_mask, plEntry_vec);
            _mm512_mask_packstorehi_epi32(pairlist_mod + plOffset_mod + 16, mod_mask, plEntry_vec);
            __m512i one_vec = _mm512_set_1to16_epi32(1);

            // Move the offsets forward by the number of atoms added to each list
            plOffset_norm += _mm512_mask_reduce_add_epi32(norm_mask, one_vec);
            #if MIC_CONDITION_NORMAL != 0
              plOffset_normhi += _mm512_mask_reduce_add_epi32(normhi_mask, one_vec);
            #endif
            plOffset_excl += _mm512_mask_reduce_add_epi32(excl_mask, one_vec);
            plOffset_mod  += _mm512_mask_reduce_add_epi32( mod_mask, one_vec);
          }

        #else // if MIC_HANDCODE_PLGEN != 0

          // The "j" loop, iterating over the second list of atoms
          #if (0 PAIR(+1))
            #pragma loop_count (30)
          #elif (0 SELF(+1))
            #pragma loop_count (300)
          #endif
          for (int j = 0 SELF(+i+1); j < j_upper; j++) {
1187 
1188  // Load the "j" atom's position, calculate/check the distance (squared)
1189  // between the "i" and "j" atoms
1190  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1191  CALC_TYPE p_d_x = p_i_x - p_1[j].x;
1192  CALC_TYPE p_d_y = p_i_y - p_1[j].y;
1193  CALC_TYPE p_d_z = p_i_z - p_1[j].z;
1194  #else
1195  CALC_TYPE p_d_x = p_i_x - p_1_x[j];
1196  CALC_TYPE p_d_y = p_i_y - p_1_y[j];
1197  CALC_TYPE p_d_z = p_i_z - p_1_z[j];
1198  #endif
1199  CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
1200  if (r2 <= plcutoff2) {
1201 
1202  // Check the exclusion bits to set the modified and excluded flags
1203  // NOTE: This code MUST match the exclusion-list generation code
1204  // in ComputeNonbondedMIC.C, which creates these flags. In
1205  // particular, the order of the 2 bits must be correct, per the
1206  // pairlist format listed below (i.e. isModified -> MSB).
1207  int exclFlags = 0x00;
1208  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1209  const int indexDiff = pExt_0[i].index - pExt_1[j].index;
1210  const int maxDiff = pExt_1[j].excl_maxdiff;
1211  #else
1212  const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
1213  const int maxDiff = pExt_1_exclMaxDiff[j];
1214  #endif
1215  if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
1216  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1217  const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
1218  #else
1219  const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
1220  #endif
1221  const int offset_major = offset / (sizeof(unsigned int) * 8);
1222  const int offset_minor = offset % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
1223  exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
1224  }
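  // Worked example with hypothetical values: for excl_index = 40 and
  // indexDiff = 3, offset = 2 * 3 + 40 = 46, giving offset_major = 46 / 32 = 1
  // and offset_minor = 46 % 32 = 14, i.e. the flags are bits 14..15 of
  // params.exclusion_bits[1]. Pairs with |indexDiff| > maxDiff have no entry
  // in the bit array and keep exclFlags == 0 (normal).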
1225 
1226  // Create the pairlist entry value (the packed i and j values) and store it into the
1227  // appropriate pairlist based on the type of interaction (exclFlags value)
1228  const int plEntry = ((i & 0xFFFF) << 16) | (j & 0xFFFF);
1229  if (exclFlags == 0) {
1230  #if MIC_CONDITION_NORMAL != 0
1231  if (r2 <= normhi_split) {
1232  pairlist_norm[plOffset_norm++] = plEntry;
1233  } else {
1234  pairlist_normhi[plOffset_normhi++] = plEntry;
1235  }
1236  #else
1237  pairlist_norm[plOffset_norm++] = plEntry;
1238  #endif
1239  } else if (exclFlags == 1) {
1240  pairlist_excl[plOffset_excl++] = plEntry;
1241  } else if (exclFlags == 2) {
1242  pairlist_mod[plOffset_mod++] = plEntry;
1243  }
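  // Each entry packs both atom indices into a single int; consumers of the
  // pairlists recover them with the inverse shifts. Minimal sketch
  // (hypothetical variable names):
  #if 0
  const int entry_i = (plEntry >> 16) & 0xFFFF;
  const int entry_j = plEntry & 0xFFFF; // j == 0xFFFF marks a padding entry
  #endif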
1244 
  } // end if (r2 <= plcutoff2)
1246  } // end for (j < j_upper)
1247 
1248  #endif // if MIC_HANDCODE_PLGEN != 0
1249 
1250  //#if MIC_PAD_PLGEN != 0
1251  #if __MIC_PAD_PLGEN_CTRL != 0
1252  //#if (MIC_HANDCODE_FORCE != 0) && (MIC_HANDCODE_FORCE_SINGLE != 0)
1253  #if MIC_HANDCODE_FORCE_SINGLE != 0
1254  const int padLen = 16;
1255  #else
1256  const int padLen = 8;
1257  #endif
1258  const int padValue = (i << 16) | 0xFFFF;
1259  // NOTE: xxx % padLen != 2 because offset includes first two ints (pairlist sizes), so 0 entries <-> offset 2, 16 entries <-> offset 18, etc.
1260  // Alignment is setup so that the entries (minus the first two ints) are cacheline aligned.
1261  while (plOffset_norm % padLen != 2) { pairlist_norm[plOffset_norm++] = padValue; }
1262  while (plOffset_mod % padLen != 2) { pairlist_mod [plOffset_mod++ ] = padValue; }
1263  while (plOffset_excl % padLen != 2) { pairlist_excl[plOffset_excl++] = padValue; }
1264  #if MIC_CONDITION_NORMAL != 0
1265  while (plOffset_normhi % padLen != 2) { pairlist_normhi[plOffset_normhi++] = padValue; }
1266  #endif
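  // Post-condition of the padding loops above (sketch): each list length is
  // left congruent to 2 modulo padLen, so the entry region beyond the two
  // header ints is a multiple of padLen ints (a full cacheline when
  // padLen == 16):
  #if 0
  assert(plOffset_norm % padLen == 2);
  assert(plOffset_mod % padLen == 2);
  assert(plOffset_excl % padLen == 2);
  #endif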
1267  #endif
1268 
1269  } // end for (i < i_upper)
1270 
1271  #endif // if MIC_TILE_PLGEN != 0
1272 
1273  // If we are conditioning the normal pairlist, place the contents of pairlist_normhi
1274  // at the end of pairlist_norm
1275  #if MIC_CONDITION_NORMAL != 0
1276 
1277  // Make sure there is enough room in pairlist_norm for the contents of pairlist_normhi
1278  if (plOffset_norm + plOffset_normhi > pairlist_norm[0]) {
1279  int newPlAllocSize = pairlist_norm[0] + 2 * plOffset_normhi + 8;
1280  int* RESTRICT newPl = (int*)_MM_MALLOC_WRAPPER(newPlAllocSize * sizeof(int) + 64, 64, "pairlist_norm grow for conditioning");
1281  __ASSERT(newPl != NULL);
1282  newPl += 14; // Align pairlist entries, which start at index 2
1283  memcpy(newPl, pairlist_norm, plOffset_norm * sizeof(int));
1284  _MM_FREE_WRAPPER(pairlist_norm - 14);
1285  pairlist_norm = newPl;
1286  pairlist_norm[0] = newPlAllocSize;
1287  }
1288 
1289  // Copy the contents of pairlist_normhi into pairlist_norm
1290  for (int i = 0; i < plOffset_normhi - 2; i++) {
1291  pairlist_norm[plOffset_norm + i] = pairlist_normhi[2 + i];
1292  }
1293  plOffset_norm += plOffset_normhi - 2;
1294  plOffset_normhi = 2;
1295 
1296  #endif
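  // Layout of each pairlist buffer implied by the grow-and-copy code above
  // (offsets in ints, relative to the pairlist pointer):
  //   [-14 .. -1]  alignment padding from the 64-byte-aligned allocation
  //                ([-1] doubles as a high-water mark when
  //                MIC_TRACK_DEVICE_MEM_USAGE != 0, see below)
  //   [0]          allocated size, in ints
  //   [1]          current used size, in ints (including the two header ints)
  //   [2 ...]      packed entries, starting on a 64-byte (cacheline) boundary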
1297 
1298  // Store the current offsets (pairlist lengths) into the pairlist data structure itself
1299  pairlist_norm[1] = plOffset_norm; // NOTE: The size includes the initial 2 ints
1300  pairlist_mod[1] = plOffset_mod;
1301  pairlist_excl[1] = plOffset_excl;
1302  #if MIC_CONDITION_NORMAL != 0
1303  pairlist_normhi[1] = plOffset_normhi;
1304  #endif
1305 
1306  // DMK - DEBUG - Pairlist memory size info
1307  #if MIC_TRACK_DEVICE_MEM_USAGE != 0
1308  if (plOffset_norm > pairlist_norm[-1]) { pairlist_norm[-1] = plOffset_norm; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
1309  if (plOffset_mod > pairlist_mod[-1]) { pairlist_mod[-1] = plOffset_mod; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
1310  if (plOffset_excl > pairlist_excl[-1]) { pairlist_excl[-1] = plOffset_excl; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
1311  #if MIC_CONDITION_NORMAL != 0
1312  if (plOffset_normhi > pairlist_normhi[-1]) { pairlist_normhi[-1] = plOffset_normhi; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
1313  #endif
1314  #endif
1315 
1316  // If prefetch is enabled, pad out some extra (valid) entries in the pairlists
1317  // that can be safely prefetched (i.e. without segfaulting), even though they
1318  // have no actual effect (i.e. no forces are generated for them)
1319  #if MIC_PREFETCH_DISTANCE > 0
1320  for (int pfI = 0; pfI < MIC_PREFETCH_DISTANCE; pfI++) {
1321  pairlist_norm[plOffset_norm + pfI] = 0;
1322  pairlist_mod [plOffset_mod + pfI] = 0;
1323  pairlist_excl[plOffset_excl + pfI] = 0;
1324  }
1325  #endif
1326 
1327  // If we need to pad the pairlists with valid entries, create "zero" entries at the end
1328  #if MIC_HANDCODE_FORCE != 0 && MIC_HANDCODE_FORCE_PFDIST > 0
1329  for (int pfI = 0; pfI < 16 * MIC_HANDCODE_FORCE_PFDIST; pfI++) {
1330  #if MIC_HANDCODE_FORCE_SINGLE != 0
1331  pairlist_norm[plOffset_norm + pfI] = -1;
1332  pairlist_mod [plOffset_mod + pfI] = -1;
1333  pairlist_excl[plOffset_excl + pfI] = -1;
1334  #else
1335  pairlist_norm[plOffset_norm + pfI] = 0;
1336  pairlist_mod [plOffset_mod + pfI] = 0;
1337  pairlist_excl[plOffset_excl + pfI] = 0;
1338  #endif
1339  }
1340  #endif
1341 
1342  // Save the pointer values back into the pairlist pointer array, so the buffers can
1343  // be reused across timesteps, in case the pointers were changed here
1344  params.pairlists_ptr[PL_NORM_INDEX] = pairlist_norm;
1345  params.pairlists_ptr[ PL_MOD_INDEX] = pairlist_mod;
1346  params.pairlists_ptr[PL_EXCL_INDEX] = pairlist_excl;
1347  #if MIC_CONDITION_NORMAL != 0
1348  params.pairlists_ptr[PL_NORMHI_INDEX] = pairlist_normhi;
1349  #endif
1350 
1351  } // end if (params.savePairlists)
1352 
1353  #undef PAIRLIST_ALLOC_CHECK
1354  #undef PAIRLIST_GROW_CHECK
1355 
1356  // Declare the scalars (or vector-width-long arrays if force splitting is in use)
1357  // used to accumulate the virial and energy output, initializing them to zero
1358  #if (0 FAST(+1))
1359  #if (0 ENERGY(+1))
1360  double vdwEnergy = 0;
1361  SHORT( double electEnergy = 0; )
1362  #endif
1363  #if (0 SHORT(+1))
1364  double virial_xx = 0;
1365  double virial_xy = 0;
1366  double virial_xz = 0;
1367  double virial_yy = 0;
1368  double virial_yz = 0;
1369  double virial_zz = 0;
1370  #endif
1371  #endif
1372  #if (0 FULL(+1))
1373  #if (0 ENERGY(+1))
1374  double fullElectEnergy = 0;
1375  #endif
1376  double fullElectVirial_xx = 0;
1377  double fullElectVirial_xy = 0;
1378  double fullElectVirial_xz = 0;
1379  double fullElectVirial_yy = 0;
1380  double fullElectVirial_yz = 0;
1381  double fullElectVirial_zz = 0;
1382  #endif
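  // (The "(0 FAST(+1))" pattern above evaluates to 1 when FAST(X) expands to
  // X and to 0 when FAST(X) expands to nothing, letting the function-style
  // feature macros drive ordinary #if conditions.)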
1383 
1384  // NORMAL LOOP
1385  DEVICE_FPRINTF("N");
1386  #define PAIRLIST pairlist_norm
1387  #define NORMAL(X) X
1388  #define EXCLUDED(X)
1389  #define MODIFIED(X)
1390  #include "ComputeNonbondedMICKernelBase2.h"
1391  #undef PAIRLIST
1392  #undef NORMAL
1393  #undef EXCLUDED
1394  #undef MODIFIED
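 // Each loop block instantiates the inner force kernel by defining PAIRLIST
 // plus exactly one of NORMAL/EXCLUDED/MODIFIED, including
 // ComputeNonbondedMICKernelBase2.h, and then undefining the macros so the
 // next block can re-instantiate the kernel over a different pairlist.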
1395 
1396  // MODIFIED LOOP
1397  DEVICE_FPRINTF("M");
1398  #define PAIRLIST pairlist_mod
1399  #define NORMAL(X)
1400  #define EXCLUDED(X)
1401  #define MODIFIED(X) X
1402  #include "ComputeNonbondedMICKernelBase2.h"
1403  #undef PAIRLIST
1404  #undef NORMAL
1405  #undef EXCLUDED
1406  #undef MODIFIED
1407 
1408  // EXCLUDED LOOP
1409  #if defined(FULLELECT)
1410 
1411  DEVICE_FPRINTF("E");
1412  #define PAIRLIST pairlist_excl
1413  #undef FAST
1414  #define FAST(X)
1415  #define NORMAL(X)
1416  #define EXCLUDED(X) X
1417  #define MODIFIED(X)
1418    #include "ComputeNonbondedMICKernelBase2.h"
1419  #undef PAIRLIST
1420  #undef FAST
1421  #ifdef SLOWONLY
1422  #define FAST(X)
1423  #else
1424  #define FAST(X) X
1425  #endif
1426  #undef NORMAL
1427  #undef EXCLUDED
1428  #undef MODIFIED
1429 
1430  #else // defined(FULLELECT)
1431 
1432  // If we are not executing the excluded loop, we still need to count excluded
1433  // interactions per atom, so contribute the number of entries in the EXCLUDED
1434  // pairlist (counting only the valid entries when the pairlists are padded)
1435  DEVICE_FPRINTF("e");
1436  #if MIC_EXCL_CHECKSUM_FULL != 0
1437  {
1438  #if __MIC_PAD_PLGEN_CTRL != 0
1439  // NOTE: If using padding, the pairlist will have 'invalid' entries mixed in with the
1440  // valid entries, so we need to scan through and only count the valid entries
1441  const int * const pairlist_excl_base = pairlist_excl + 2;
1442  const int pairlist_excl_len = pairlist_excl[1] - 2;
1443  int exclSum = 0;
1444  __ASSUME_ALIGNED(pairlist_excl_base);
1445  #pragma omp simd reduction(+ : exclSum)
1446  for (int plI = 0; plI < pairlist_excl_len; plI++) {
1447  if ((pairlist_excl_base[plI] & 0xFFFF) != 0xFFFF) {
1448  exclSum += 1;
1449  }
1450  }
1451  params.exclusionSum += exclSum;
1452  #else
1453  params.exclusionSum += pairlist_excl[1] - 2; // NOTE: Size includes first two elements (alloc size and size), don't count those
1454  #endif
1455  }
1456  #endif
1457 
1458  #endif // defined(FULLELECT)
1459 
1460  // If this is a self compute, do the virial calculation here
1461  #if (0 SHORT( FAST( SELF(+1))))
1462  for (int i = 0; i < i_upper; i++) {
1463  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1464  virial_xx += f_0[i].x * (p_0[i].x + params.patch1_center_x);
1465  virial_xy += f_0[i].x * (p_0[i].y + params.patch1_center_y);
1466  virial_xz += f_0[i].x * (p_0[i].z + params.patch1_center_z);
1467  virial_yy += f_0[i].y * (p_0[i].y + params.patch1_center_y);
1468  virial_yz += f_0[i].y * (p_0[i].z + params.patch1_center_z);
1469  virial_zz += f_0[i].z * (p_0[i].z + params.patch1_center_z);
1470  #else
1471  virial_xx += f_0_x[i] * (p_0_x[i] + params.patch1_center_x);
1472  virial_xy += f_0_x[i] * (p_0_y[i] + params.patch1_center_y);
1473  virial_xz += f_0_x[i] * (p_0_z[i] + params.patch1_center_z);
1474  virial_yy += f_0_y[i] * (p_0_y[i] + params.patch1_center_y);
1475  virial_yz += f_0_y[i] * (p_0_z[i] + params.patch1_center_z);
1476  virial_zz += f_0_z[i] * (p_0_z[i] + params.patch1_center_z);
1477  #endif
1478  }
1479  #endif
1480  #if (0 FULL( SELF(+1)))
1481  for (int i = 0; i < i_upper; i++) {
1482  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1483  fullElectVirial_xx += fullf_0[i].x * (p_0[i].x + params.patch1_center_x);
1484  fullElectVirial_xy += fullf_0[i].x * (p_0[i].y + params.patch1_center_y);
1485  fullElectVirial_xz += fullf_0[i].x * (p_0[i].z + params.patch1_center_z);
1486  fullElectVirial_yy += fullf_0[i].y * (p_0[i].y + params.patch1_center_y);
1487  fullElectVirial_yz += fullf_0[i].y * (p_0[i].z + params.patch1_center_z);
1488  fullElectVirial_zz += fullf_0[i].z * (p_0[i].z + params.patch1_center_z);
1489  #else
1490  fullElectVirial_xx += fullf_0_x[i] * (p_0_x[i] + params.patch1_center_x);
1491  fullElectVirial_xy += fullf_0_x[i] * (p_0_y[i] + params.patch1_center_y);
1492  fullElectVirial_xz += fullf_0_x[i] * (p_0_z[i] + params.patch1_center_z);
1493  fullElectVirial_yy += fullf_0_y[i] * (p_0_y[i] + params.patch1_center_y);
1494  fullElectVirial_yz += fullf_0_y[i] * (p_0_z[i] + params.patch1_center_z);
1495  fullElectVirial_zz += fullf_0_z[i] * (p_0_z[i] + params.patch1_center_z);
1496  #endif
1497  }
1498  #endif
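  // Both loops above accumulate the self-compute virial as the outer product
  // of each atom's net force with its global-frame position (patch-relative
  // coordinate plus patch center), i.e. virial_ab += f[a] * r[b]. Only the
  // six components xx, xy, xz, yy, yz, zz are kept, since for pairwise
  // central forces the virial tensor is symmetric.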
1499 
1500  FAST( SHORT( params.virial_xx = virial_xx; ) )
1501  FAST( SHORT( params.virial_xy = virial_xy; ) )
1502  FAST( SHORT( params.virial_xz = virial_xz; ) )
1503  FAST( SHORT( params.virial_yy = virial_yy; ) )
1504  FAST( SHORT( params.virial_yz = virial_yz; ) )
1505  FAST( SHORT( params.virial_zz = virial_zz; ) )
1506  FULL( params.fullElectVirial_xx = fullElectVirial_xx; )
1507  FULL( params.fullElectVirial_xy = fullElectVirial_xy; )
1508  FULL( params.fullElectVirial_xz = fullElectVirial_xz; )
1509  FULL( params.fullElectVirial_yy = fullElectVirial_yy; )
1510  FULL( params.fullElectVirial_yz = fullElectVirial_yz; )
1511  FULL( params.fullElectVirial_zz = fullElectVirial_zz; )
1512  FAST( ENERGY( params.vdwEnergy = vdwEnergy; ) )
1513  FAST( ENERGY( SHORT( params.electEnergy = electEnergy; ) ) )
1514  FULL( ENERGY( params.fullElectEnergy = fullElectEnergy; ) )
1515 
1516  #undef CALC_TYPE
1517 }
1518 
1519 
1520 // Undefine atom and force macros for SELF computes
1521 #if (0 SELF(+1))
1522  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1523  #undef p_1
1524  #undef pExt_1
1525  #undef f_1
1526  #if (0 FULL(+1))
1527  #undef fullf_1
1528  #endif
1529  #else
1530  #undef p_1_x
1531  #undef p_1_y
1532  #undef p_1_z
1533  #undef p_1_q
1534  #undef p_1_vdwType
1535  #undef p_1_index
1536  #undef p_1_exclIndex
1537  #undef p_1_exclMaxDiff
1538  #undef f_1_x
1539  #undef f_1_y
1540  #undef f_1_z
1541  #undef f_1_w
1542  #if (0 FULL(+1))
1543  #undef fullf_1_x
1544  #undef fullf_1_y
1545  #undef fullf_1_z
1546  #undef fullf_1_w
1547  #endif
1548  #endif
1549 #endif
1550 
1551 #undef __MIC_PAD_PLGEN_CTRL
1552 
1553 #else // NAMD_MIC
1554 
1556 
1557 #endif // NAMD_MIC
1558 