NAMD
ComputeNonbondedMICKernelBase2_handcode_single.h
1 #ifdef NAMD_MIC
2 
3  #define GATHER_PS_I32_OFFSET(v, p, i, o) \
4  { \
5  __mmask16 k0 = _mm512_int2mask(0x0008); \
6  __mmask16 k1 = _mm512_int2mask(0x0080); \
7  __mmask16 k2 = _mm512_int2mask(0x0800); \
8  __mmask16 k3 = _mm512_int2mask(0x8000); \
9  (v) = _mm512_mask_loadunpacklo_ps( _mm512_setzero_ps(), k3, &((p)[(i)[13] + (o)])); \
10  (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[ 9] + (o)])); \
11  (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 5] + (o)])); \
12  (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 1] + (o)])); \
13  (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[12] + (o)])); \
14  (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[ 8] + (o)])); \
15  (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 4] + (o)])); \
16  (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 0] + (o)])); \
17  (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_BADC), k3, &((p)[(i)[14] + (o)])); \
18  (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[10] + (o)])); \
19  (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 6] + (o)])); \
20  (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 2] + (o)])); \
21  (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[15] + (o)])); \
22  (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[11] + (o)])); \
23  (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 7] + (o)])); \
24  (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 3] + (o)])); \
25  }
26  #define GATHER_PS_I32(v, p, i) GATHER_PS_I32_OFFSET((v), (p), (i), 0)
27 
28 
29  #define GATHER_EPI32_I32_OFFSET(v, p, i, o) \
30  { \
31  __mmask16 k0 = _mm512_int2mask(0x0008); \
32  __mmask16 k1 = _mm512_int2mask(0x0080); \
33  __mmask16 k2 = _mm512_int2mask(0x0800); \
34  __mmask16 k3 = _mm512_int2mask(0x8000); \
35  (v) = _mm512_mask_loadunpacklo_epi32( _mm512_setzero_epi32(), k3, &((p)[(i)[13] + (o)])); \
36  (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[ 9] + (o)])); \
37  (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 5] + (o)])); \
38  (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 1] + (o)])); \
39  (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[12] + (o)])); \
40  (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[ 8] + (o)])); \
41  (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 4] + (o)])); \
42  (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 0] + (o)])); \
43  (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_BADC), k3, &((p)[(i)[14] + (o)])); \
44  (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[10] + (o)])); \
45  (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 6] + (o)])); \
46  (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 2] + (o)])); \
47  (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[15] + (o)])); \
48  (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[11] + (o)])); \
49  (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 7] + (o)])); \
50  (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 3] + (o)])); \
51  }
52  #define GATHER_EPI32_I32(v, p, i) GATHER_EPI32_I32_OFFSET((v), (p), (i), 0)
53 
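 /* Illustrative sketch (not part of the original kernel code): the GATHER_PS_I32*
  * and GATHER_EPI32_I32* macros above emulate a 16-lane gather with masked
  * loadunpacklo and swizzle steps. Assuming 'p' points to floats and 'i' holds
  * 16 valid indices, GATHER_PS_I32(v, p, i) is functionally equivalent to:
  *
  *   float lanes[16] __attribute__((aligned(64)));
  *   for (int l = 0; l < 16; l++) { lanes[l] = p[i[l]]; }   // lane l <- p[i[l]]
  *   v = _mm512_load_ps(lanes);
  *
  * GATHER_EPI32_I32 does the same for 32-bit integers. */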
54 
55  #define SCATTER_INC_PD_I32_STEP(v, p, i, k, idx, pattern) \
56  t = _mm512_mask_loadunpacklo_pd(t, (k), &((p)[(i)[(idx)]])); \
57  t = _mm512_mask_add_pd(t, (k), _mm512_swizzle_pd((v), (pattern)), t); \
58  _mm512_mask_packstorelo_pd(&((p)[(i)[(idx)]]), (k), t);
59 
60  #define SCATTER_INC_PD_I32(v, p, i) \
61  { \
62  __mmask16 k0 = _mm512_int2mask(0x02); \
63  __mmask16 k1 = _mm512_int2mask(0x20); \
64  __m512d t = _mm512_setzero_pd(); \
65  SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 0, _MM_SWIZ_REG_CDAB) \
66  SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 1, _MM_SWIZ_REG_DCBA) \
67  SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 2, _MM_SWIZ_REG_DACB) \
68  SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 3, _MM_SWIZ_REG_BADC) \
69  SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 4, _MM_SWIZ_REG_CDAB) \
70  SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 5, _MM_SWIZ_REG_DCBA) \
71  SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 6, _MM_SWIZ_REG_DACB) \
72  SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 7, _MM_SWIZ_REG_BADC) \
73  }
74 
75 
76  #define SCATTER_INC_PS_I32_STEP(v, p, i, k, idx, pattern) \
77  t = _mm512_mask_loadunpacklo_ps(t, (k), &((p)[(i)[(idx)]])); \
78  t = _mm512_mask_add_ps(t, (k), _mm512_swizzle_ps((v), (pattern)), t); \
79  _mm512_mask_packstorelo_ps(&((p)[(i)[(idx)]]), (k), t);
80 
81  #define SCATTER_INC_PS_I32(v, p, i) \
82  { \
83  __mmask16 k0 = _mm512_int2mask(0x0008); \
84  __mmask16 k1 = _mm512_int2mask(0x0080); \
85  __mmask16 k2 = _mm512_int2mask(0x0800); \
86  __mmask16 k3 = _mm512_int2mask(0x8000); \
87  __m512 t = _mm512_setzero_ps(); \
88  SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 0, _MM_SWIZ_REG_CDAB) \
89  SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 1, _MM_SWIZ_REG_DCBA) \
90  SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 2, _MM_SWIZ_REG_DACB) \
91  SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 3, _MM_SWIZ_REG_BADC) \
92  SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 4, _MM_SWIZ_REG_CDAB) \
93  SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 5, _MM_SWIZ_REG_DCBA) \
94  SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 6, _MM_SWIZ_REG_DACB) \
95  SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 7, _MM_SWIZ_REG_BADC) \
96  SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 8, _MM_SWIZ_REG_CDAB) \
97  SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 9, _MM_SWIZ_REG_DCBA) \
98  SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 10, _MM_SWIZ_REG_DACB) \
99  SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 11, _MM_SWIZ_REG_BADC) \
100  SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 12, _MM_SWIZ_REG_CDAB) \
101  SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 13, _MM_SWIZ_REG_DCBA) \
102  SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 14, _MM_SWIZ_REG_DACB) \
103  SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 15, _MM_SWIZ_REG_BADC) \
104  }
105 
106  // DMK - NOTE - The instructions for these macros are the way they are so
107  //   values are converted to double before accumulating (i.e. rather than
108  //   summing the floats, converting, and then accumulating).
109  #define CONTRIB_ADD_PS2PD(pd_vec, ps_vec) \
110  { __m512 ps_tmp_vec = (ps_vec); \
111  (pd_vec) = _mm512_add_pd((pd_vec), _mm512_cvtpslo_pd(ps_tmp_vec)); \
112  (pd_vec) = _mm512_add_pd((pd_vec), _mm512_cvtpslo_pd(_mm512_permute4f128_ps(ps_tmp_vec, _MM_PERM_BADC))); \
113  }
114  #define CONTRIB_SUB_PS2PD(pd_vec, ps_vec) \
115  { __m512 ps_tmp_vec = (ps_vec); \
116  (pd_vec) = _mm512_sub_pd((pd_vec), _mm512_cvtpslo_pd(ps_tmp_vec)); \
117  (pd_vec) = _mm512_sub_pd((pd_vec), _mm512_cvtpslo_pd(_mm512_permute4f128_ps(ps_tmp_vec, _MM_PERM_BADC))); \
118  }
119 
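 /* Illustrative sketch (not part of the original kernel code): the CONTRIB_*
  * macros convert each float vector to double before adding because a
  * long-running sum kept in single precision loses low-order bits.
  * Conceptually, per lane:
  *
  *   double acc = 0.0;          // running accumulator (one lane of pd_vec)
  *   float  contrib;            // per-iteration contribution (one lane of ps_vec)
  *   acc += (double)contrib;    // convert first, then accumulate in double
  *
  * rather than summing in float and converting the final sum once. */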
120 
121  #define APPLY_FORCES_PS2PD_STEP_ADD(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k) \
122  tx = _mm512_mask_loadunpacklo_pd(tx, (k), (f_x) + (i)); \
123  ty = _mm512_mask_loadunpacklo_pd(ty, (k), (f_y) + (i)); \
124  tz = _mm512_mask_loadunpacklo_pd(tz, (k), (f_z) + (i)); \
125  tx = _mm512_mask_add_pd(tx, (k), tx, _mm512_swizzle_pd((v_x), (p))); \
126  ty = _mm512_mask_add_pd(ty, (k), ty, _mm512_swizzle_pd((v_y), (p))); \
127  tz = _mm512_mask_add_pd(tz, (k), tz, _mm512_swizzle_pd((v_z), (p))); \
128  _mm512_mask_packstorelo_pd((f_x) + (i), (k), tx); \
129  _mm512_mask_packstorelo_pd((f_y) + (i), (k), ty); \
130  _mm512_mask_packstorelo_pd((f_z) + (i), (k), tz);
131 
132  #define APPLY_FORCES_PS2PD_STEP_SUB(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k) \
133  tx = _mm512_mask_loadunpacklo_pd(tx, (k), (f_x) + (i)); \
134  ty = _mm512_mask_loadunpacklo_pd(ty, (k), (f_y) + (i)); \
135  tz = _mm512_mask_loadunpacklo_pd(tz, (k), (f_z) + (i)); \
136  tx = _mm512_mask_sub_pd(tx, (k), tx, _mm512_swizzle_pd((v_x), (p))); \
137  ty = _mm512_mask_sub_pd(ty, (k), ty, _mm512_swizzle_pd((v_y), (p))); \
138  tz = _mm512_mask_sub_pd(tz, (k), tz, _mm512_swizzle_pd((v_z), (p))); \
139  _mm512_mask_packstorelo_pd((f_x) + (i), (k), tx); \
140  _mm512_mask_packstorelo_pd((f_y) + (i), (k), ty); \
141  _mm512_mask_packstorelo_pd((f_z) + (i), (k), tz);
142 
143  //#if MIC_PAD_PLGEN != 0
144  #if __MIC_PAD_PLGEN_CTRL != 0
145 
146  #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
147 
148  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
149 
150  #define APPLY_FORCES_PS2PD_JSTEP(j_lo, j_hi, v, fv) \
151  { \
152  FAST(SHORT( v_tmp = _mm512_mask_loadunpacklo_pd( v_tmp, k_lo, f_1 + tmpJ32[(j_lo)]); )) \
153  FULL( fv_tmp = _mm512_mask_loadunpacklo_pd(fv_tmp, k_lo, fullf_1 + tmpJ32[(j_lo)]); ) \
154  FAST(SHORT( v_tmp = _mm512_sub_pd( v_tmp, ( v)); )) \
155  FULL( fv_tmp = _mm512_sub_pd(fv_tmp, (fv)); ) \
156  FAST(SHORT( _mm512_mask_packstorelo_pd( f_1 + tmpJ32[(j_lo)], k_lo, v_tmp); )) \
157  FULL( _mm512_mask_packstorelo_pd(fullf_1 + tmpJ32[(j_lo)], k_lo, fv_tmp); ) \
158  FAST(SHORT( v_tmp = _mm512_mask_loadunpacklo_pd( v_tmp, k_hi, f_1 + tmpJ32[(j_hi)]); )) \
159  FULL( fv_tmp = _mm512_mask_loadunpacklo_pd(fv_tmp, k_hi, fullf_1 + tmpJ32[(j_hi)]); ) \
160  FAST(SHORT( v_tmp = _mm512_sub_pd( v_tmp, ( v)); )) \
161  FULL( fv_tmp = _mm512_sub_pd(fv_tmp, (fv)); ) \
162  FAST(SHORT( _mm512_mask_packstorelo_pd( f_1 + tmpJ32[(j_hi)], k_hi, v_tmp); )) \
163  FULL( _mm512_mask_packstorelo_pd(fullf_1 + tmpJ32[(j_hi)], k_hi, fv_tmp); ) \
164  }
165 
166  #define APPLY_FORCES_PS2PD \
167  { \
168  /* Set 'w' values (exclusion checksum) to -1.0f when counting so 'sub' operation for x,y,z,w actually adds 1.0f */ \
169  FAST(SHORT( __m512 tmp_w_vec = _mm512_mask_mov_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_set_1to16_ps(0.0f MODIFIED(- 1.0f) EXCLUDED(- 1.0f))); )) \
170  FULL( __m512 fulltmp_w_vec = _mm512_setzero_ps(); ) \
171  /* Transpose the values so that each set of 4 lanes has x,y,z,w values for a given interaction. */ \
172  /* NOTE: This rearranges the values via a 4x4 transpose. */ \
173  __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA); \
174  __mmask16 k_2x2_1 = _mm512_int2mask(0x5555); \
175  FAST(SHORT( __m512i tmp_a0 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_x_vec), k_2x2_0, _mm512_castps_si512( tmp_y_vec), _MM_SWIZ_REG_CDAB); )) \
176  FAST(SHORT( __m512i tmp_a1 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_y_vec), k_2x2_1, _mm512_castps_si512( tmp_x_vec), _MM_SWIZ_REG_CDAB); )) \
177  FAST(SHORT( __m512i tmp_a2 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_z_vec), k_2x2_0, _mm512_castps_si512( tmp_w_vec), _MM_SWIZ_REG_CDAB); )) \
178  FAST(SHORT( __m512i tmp_a3 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_w_vec), k_2x2_1, _mm512_castps_si512( tmp_z_vec), _MM_SWIZ_REG_CDAB); )) \
179  FULL( __m512i tmp_b0 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_x_vec), k_2x2_0, _mm512_castps_si512(fulltmp_y_vec), _MM_SWIZ_REG_CDAB); ) \
180  FULL( __m512i tmp_b1 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_y_vec), k_2x2_1, _mm512_castps_si512(fulltmp_x_vec), _MM_SWIZ_REG_CDAB); ) \
181  FULL( __m512i tmp_b2 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_z_vec), k_2x2_0, _mm512_castps_si512(fulltmp_w_vec), _MM_SWIZ_REG_CDAB); ) \
182  FULL( __m512i tmp_b3 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_w_vec), k_2x2_1, _mm512_castps_si512(fulltmp_z_vec), _MM_SWIZ_REG_CDAB); ) \
183  __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC); \
184  __mmask16 k_4x4_1 = _mm512_int2mask(0x3333); \
185  FAST(SHORT( __m512 tmp_0_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a0, k_4x4_0, tmp_a2, _MM_SWIZ_REG_BADC)); )) \
186  FAST(SHORT( __m512 tmp_1_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a1, k_4x4_0, tmp_a3, _MM_SWIZ_REG_BADC)); )) \
187  FAST(SHORT( __m512 tmp_2_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a2, k_4x4_1, tmp_a0, _MM_SWIZ_REG_BADC)); )) \
188  FAST(SHORT( __m512 tmp_3_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a3, k_4x4_1, tmp_a1, _MM_SWIZ_REG_BADC)); )) \
189  FULL( __m512 fulltmp_0_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b0, k_4x4_0, tmp_b2, _MM_SWIZ_REG_BADC)); ) \
190  FULL( __m512 fulltmp_1_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b1, k_4x4_0, tmp_b3, _MM_SWIZ_REG_BADC)); ) \
191  FULL( __m512 fulltmp_2_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b2, k_4x4_1, tmp_b0, _MM_SWIZ_REG_BADC)); ) \
192  FULL( __m512 fulltmp_3_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b3, k_4x4_1, tmp_b1, _MM_SWIZ_REG_BADC)); ) \
193  /* Convert the floats to doubles. NOTE: This must be done prior to any math because of the difference in magnitudes. */ \
194  FAST(SHORT( __m512d v_0_lo = _mm512_cvtpslo_pd( tmp_0_vec ); )) \
195  FAST(SHORT( __m512d v_0_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_0_vec, _MM_PERM_BADC)); )) \
196  FAST(SHORT( __m512d v_1_lo = _mm512_cvtpslo_pd( tmp_1_vec ); )) \
197  FAST(SHORT( __m512d v_1_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_1_vec, _MM_PERM_BADC)); )) \
198  FAST(SHORT( __m512d v_2_lo = _mm512_cvtpslo_pd( tmp_2_vec ); )) \
199  FAST(SHORT( __m512d v_2_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_2_vec, _MM_PERM_BADC)); )) \
200  FAST(SHORT( __m512d v_3_lo = _mm512_cvtpslo_pd( tmp_3_vec ); )) \
201  FAST(SHORT( __m512d v_3_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_3_vec, _MM_PERM_BADC)); )) \
202  FULL( __m512d fv_0_lo = _mm512_cvtpslo_pd( fulltmp_0_vec ); ) \
203  FULL( __m512d fv_0_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_0_vec, _MM_PERM_BADC)); ) \
204  FULL( __m512d fv_1_lo = _mm512_cvtpslo_pd( fulltmp_1_vec ); ) \
205  FULL( __m512d fv_1_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_1_vec, _MM_PERM_BADC)); ) \
206  FULL( __m512d fv_2_lo = _mm512_cvtpslo_pd( fulltmp_2_vec ); ) \
207  FULL( __m512d fv_2_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_2_vec, _MM_PERM_BADC)); ) \
208  FULL( __m512d fv_3_lo = _mm512_cvtpslo_pd( fulltmp_3_vec ); ) \
209  FULL( __m512d fv_3_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_3_vec, _MM_PERM_BADC)); ) \
210  /* Apply the forces to the 'j' atoms. */ \
211  __mmask16 k_lo = _mm512_int2mask(0x000F); \
212  __mmask16 k_hi = _mm512_int2mask(0x00F0); \
213  __m512d v_tmp = _mm512_setzero_pd(); \
214  __m512d fv_tmp = _mm512_setzero_pd(); \
215  APPLY_FORCES_PS2PD_JSTEP( 0, 4, v_0_lo, fv_0_lo); \
216  APPLY_FORCES_PS2PD_JSTEP( 1, 5, v_1_lo, fv_1_lo); \
217  APPLY_FORCES_PS2PD_JSTEP( 2, 6, v_2_lo, fv_2_lo); \
218  APPLY_FORCES_PS2PD_JSTEP( 3, 7, v_3_lo, fv_3_lo); \
219  APPLY_FORCES_PS2PD_JSTEP( 8, 12, v_0_hi, fv_0_hi); \
220  APPLY_FORCES_PS2PD_JSTEP( 9, 13, v_1_hi, fv_1_hi); \
221  APPLY_FORCES_PS2PD_JSTEP(10, 14, v_2_hi, fv_2_hi); \
222  APPLY_FORCES_PS2PD_JSTEP(11, 15, v_3_hi, fv_3_hi); \
223  /* Reduce (add) the forces into a single force vector and apply it to the 'i' atom. */ \
224  FAST(SHORT( __m512d v_tmp0_xyzw = _mm512_add_pd( v_0_lo, v_0_hi); )) \
225  FULL( __m512d fv_tmp0_xyzw = _mm512_add_pd(fv_0_lo, fv_0_hi); ) \
226  FAST(SHORT( __m512d v_tmp1_xyzw = _mm512_add_pd( v_1_lo, v_1_hi); )) \
227  FULL( __m512d fv_tmp1_xyzw = _mm512_add_pd(fv_1_lo, fv_1_hi); ) \
228  FAST(SHORT( __m512d v_tmp2_xyzw = _mm512_add_pd( v_2_lo, v_2_hi); )) \
229  FULL( __m512d fv_tmp2_xyzw = _mm512_add_pd(fv_2_lo, fv_2_hi); ) \
230  FAST(SHORT( __m512d v_tmp3_xyzw = _mm512_add_pd( v_3_lo, v_3_hi); )) \
231  FULL( __m512d fv_tmp3_xyzw = _mm512_add_pd(fv_3_lo, fv_3_hi); ) \
232  FAST(SHORT( __m512d v_tmp4_xyzw = _mm512_add_pd( v_tmp0_xyzw, v_tmp1_xyzw); )) \
233  FULL( __m512d fv_tmp4_xyzw = _mm512_add_pd(fv_tmp0_xyzw, fv_tmp1_xyzw); ) \
234  FAST(SHORT( __m512d v_tmp5_xyzw = _mm512_add_pd( v_tmp2_xyzw, v_tmp3_xyzw); )) \
235  FULL( __m512d fv_tmp5_xyzw = _mm512_add_pd(fv_tmp2_xyzw, fv_tmp3_xyzw); ) \
236  FAST(SHORT( __m512d v_tmp6_xyzw = _mm512_add_pd( v_tmp4_xyzw, v_tmp5_xyzw); )) \
237  FULL( __m512d fv_tmp6_xyzw = _mm512_add_pd(fv_tmp4_xyzw, fv_tmp5_xyzw); ) \
238  FAST(SHORT( __m512d v_xyzw = _mm512_add_pd( v_tmp6_xyzw, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512( v_tmp6_xyzw), _MM_PERM_BADC))); )) \
239  FULL( __m512d fv_xyzw = _mm512_add_pd(fv_tmp6_xyzw, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(fv_tmp6_xyzw), _MM_PERM_BADC))); ) \
240  FAST(SHORT(MODIFIED( v_xyzw = _mm512_mask_sub_pd( v_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), v_xyzw); ))) /* NOTE: w *= -1.0 for add operation below */ \
241  FAST(SHORT(EXCLUDED( v_xyzw = _mm512_mask_sub_pd( v_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), v_xyzw); ))) \
242  FULL(MODIFIED( fv_xyzw = _mm512_mask_sub_pd(fv_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), fv_xyzw); )) \
243  FULL(EXCLUDED( fv_xyzw = _mm512_mask_sub_pd(fv_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), fv_xyzw); )) \
244  int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
245  uintptr_t iTest_i = tmpI32[iTest_index]; \
246  FAST(SHORT( __m512d i_xyzw = _mm512_mask_loadunpacklo_pd(_mm512_setzero_pd(), k_lo, f_0 + iTest_i); )) \
247  FULL( __m512d fi_xyzw = _mm512_mask_loadunpacklo_pd(_mm512_setzero_pd(), k_lo, fullf_0 + iTest_i); ) \
248  FAST(SHORT( i_xyzw = _mm512_mask_add_pd( i_xyzw, k_lo, i_xyzw, v_xyzw); )) \
249  FULL( fi_xyzw = _mm512_mask_add_pd(fi_xyzw, k_lo, fi_xyzw, fv_xyzw); ) \
250  FAST(SHORT( _mm512_mask_packstorelo_pd( f_0 + iTest_i, k_lo, i_xyzw); )) \
251  FULL( _mm512_mask_packstorelo_pd(fullf_0 + iTest_i, k_lo, fi_xyzw); ) \
252  }
253 
254  #else // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
255 
256  #define APPLY_FORCES_PS2PD \
257  { \
258  FAST(SHORT( __m512d v_x_lo = _mm512_cvtpslo_pd( tmp_x_vec ); )) \
259  FAST(SHORT( __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_x_vec, _MM_PERM_BADC)); )) \
260  FAST(SHORT( __m512d v_y_lo = _mm512_cvtpslo_pd( tmp_y_vec ); )) \
261  FAST(SHORT( __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_y_vec, _MM_PERM_BADC)); )) \
262  FAST(SHORT( __m512d v_z_lo = _mm512_cvtpslo_pd( tmp_z_vec ); )) \
263  FAST(SHORT( __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_z_vec, _MM_PERM_BADC)); )) \
264  FULL( __m512d fv_x_lo = _mm512_cvtpslo_pd( fulltmp_x_vec ); ) \
265  FULL( __m512d fv_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_x_vec, _MM_PERM_BADC)); ) \
266  FULL( __m512d fv_y_lo = _mm512_cvtpslo_pd( fulltmp_y_vec ); ) \
267  FULL( __m512d fv_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_y_vec, _MM_PERM_BADC)); ) \
268  FULL( __m512d fv_z_lo = _mm512_cvtpslo_pd( fulltmp_z_vec ); ) \
269  FULL( __m512d fv_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_z_vec, _MM_PERM_BADC)); ) \
270  int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
271  uintptr_t iTest_i = tmpI32[iTest_index]; \
272  __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
273  __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
274  FAST(SHORT( f_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_x_hi, v_x_lo)); )) \
275  FAST(SHORT( f_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_y_hi, v_y_lo)); )) \
276  FAST(SHORT( f_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_z_hi, v_z_lo)); )) \
277  FULL( fullf_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_x_hi, fv_x_lo)); ) \
278  FULL( fullf_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_y_hi, fv_y_lo)); ) \
279  FULL( fullf_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_z_hi, fv_z_lo)); ) \
280  __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
281  __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
282  FAST(SHORT( __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_x, _MM_SCALE_8); )) \
283  FAST(SHORT( __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_y, _MM_SCALE_8); )) \
284  FAST(SHORT( __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_z, _MM_SCALE_8); )) \
285  FAST(SHORT( __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_x, _MM_SCALE_8); )) \
286  FAST(SHORT( __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_y, _MM_SCALE_8); )) \
287  FAST(SHORT( __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_z, _MM_SCALE_8); )) \
288  FULL( __m512d ff_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_x, _MM_SCALE_8); ) \
289  FULL( __m512d ff_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_y, _MM_SCALE_8); ) \
290  FULL( __m512d ff_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_z, _MM_SCALE_8); ) \
291  FULL( __m512d ff_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_x, _MM_SCALE_8); ) \
292  FULL( __m512d ff_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_y, _MM_SCALE_8); ) \
293  FULL( __m512d ff_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_z, _MM_SCALE_8); ) \
294  FAST(SHORT( f_j_x_lo_vec = _mm512_mask_sub_pd( f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); )) \
295  FAST(SHORT( f_j_y_lo_vec = _mm512_mask_sub_pd( f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); )) \
296  FAST(SHORT( f_j_z_lo_vec = _mm512_mask_sub_pd( f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); )) \
297  FAST(SHORT( f_j_x_hi_vec = _mm512_mask_sub_pd( f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); )) \
298  FAST(SHORT( f_j_y_hi_vec = _mm512_mask_sub_pd( f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); )) \
299  FAST(SHORT( f_j_z_hi_vec = _mm512_mask_sub_pd( f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); )) \
300  FULL( ff_j_x_lo_vec = _mm512_mask_sub_pd(ff_j_x_lo_vec, cutoff_lo, ff_j_x_lo_vec, fv_x_lo); ) \
301  FULL( ff_j_y_lo_vec = _mm512_mask_sub_pd(ff_j_y_lo_vec, cutoff_lo, ff_j_y_lo_vec, fv_y_lo); ) \
302  FULL( ff_j_z_lo_vec = _mm512_mask_sub_pd(ff_j_z_lo_vec, cutoff_lo, ff_j_z_lo_vec, fv_z_lo); ) \
303  FULL( ff_j_x_hi_vec = _mm512_mask_sub_pd(ff_j_x_hi_vec, cutoff_hi, ff_j_x_hi_vec, fv_x_hi); ) \
304  FULL( ff_j_y_hi_vec = _mm512_mask_sub_pd(ff_j_y_hi_vec, cutoff_hi, ff_j_y_hi_vec, fv_y_hi); ) \
305  FULL( ff_j_z_hi_vec = _mm512_mask_sub_pd(ff_j_z_hi_vec, cutoff_hi, ff_j_z_hi_vec, fv_z_hi); ) \
306  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); )) \
307  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); )) \
308  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); )) \
309  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); )) \
310  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); )) \
311  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); )) \
312  FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_lo, j_lo_vec, ff_j_x_lo_vec, _MM_SCALE_8); ) \
313  FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_lo, j_lo_vec, ff_j_y_lo_vec, _MM_SCALE_8); ) \
314  FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_lo, j_lo_vec, ff_j_z_lo_vec, _MM_SCALE_8); ) \
315  FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_hi, j_hi_vec, ff_j_x_hi_vec, _MM_SCALE_8); ) \
316  FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_hi, j_hi_vec, ff_j_y_hi_vec, _MM_SCALE_8); ) \
317  FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_hi, j_hi_vec, ff_j_z_hi_vec, _MM_SCALE_8); ) \
318  }
319 
320  #endif // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
321 
322  #else // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
323 
324  #define APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, f_i_y, f_i_z, f_j_x, f_j_y, f_j_z, i, j) \
325  { \
326  __m512d v_x_lo = _mm512_cvtpslo_pd(v_x); \
327  __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_x), _MM_PERM_BADC)); \
328  __m512d v_y_lo = _mm512_cvtpslo_pd(v_y); \
329  __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_y), _MM_PERM_BADC)); \
330  __m512d v_z_lo = _mm512_cvtpslo_pd(v_z); \
331  __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_z), _MM_PERM_BADC)); \
332  __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
333  __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
334  __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
335  __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
336  int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
337  uintptr_t iTest_i = tmpI32[iTest_index]; \
338  f_i_x[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_x_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_x_lo); \
339  f_i_y[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_y_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_y_lo); \
340  f_i_z[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_z_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_z_lo); \
341  __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_x), _MM_SCALE_8); \
342  __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_y), _MM_SCALE_8); \
343  __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_z), _MM_SCALE_8); \
344  __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_x), _MM_SCALE_8); \
345  __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_y), _MM_SCALE_8); \
346  __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_z), _MM_SCALE_8); \
347  f_j_x_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_x_lo_vec, v_x_lo); \
348  f_j_y_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_y_lo_vec, v_y_lo); \
349  f_j_z_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_z_lo_vec, v_z_lo); \
350  f_j_x_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_x_hi_vec, v_x_hi); \
351  f_j_y_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_y_hi_vec, v_y_hi); \
352  f_j_z_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_z_hi_vec, v_z_hi); \
353  _mm512_mask_i32loscatter_pd((f_j_x), cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); \
354  _mm512_mask_i32loscatter_pd((f_j_y), cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); \
355  _mm512_mask_i32loscatter_pd((f_j_z), cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); \
356  _mm512_mask_i32loscatter_pd((f_j_x), cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); \
357  _mm512_mask_i32loscatter_pd((f_j_y), cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); \
358  _mm512_mask_i32loscatter_pd((f_j_z), cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); \
359  }
360 
361  #endif // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
362 
363  #else // __MIC_PAD_PLGEN_CTRL != 0
364 
365  #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
366 
367  #define APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k) \
368  FAST(SHORT( tx = _mm512_mask_loadunpacklo_pd( tx, (k), f_0_x + (i)); )) \
369  FAST(SHORT( ty = _mm512_mask_loadunpacklo_pd( ty, (k), f_0_y + (i)); )) \
370  FAST(SHORT( tz = _mm512_mask_loadunpacklo_pd( tz, (k), f_0_z + (i)); )) \
371  FULL( ftx = _mm512_mask_loadunpacklo_pd(ftx, (k), fullf_0_x + (i)); ) \
372  FULL( fty = _mm512_mask_loadunpacklo_pd(fty, (k), fullf_0_y + (i)); ) \
373  FULL( ftz = _mm512_mask_loadunpacklo_pd(ftz, (k), fullf_0_z + (i)); ) \
374  FAST(SHORT( tx = _mm512_mask_add_pd( tx, (k), tx, _mm512_swizzle_pd(( v_x), (p))); )) \
375  FAST(SHORT( ty = _mm512_mask_add_pd( ty, (k), ty, _mm512_swizzle_pd(( v_y), (p))); )) \
376  FAST(SHORT( tz = _mm512_mask_add_pd( tz, (k), tz, _mm512_swizzle_pd(( v_z), (p))); )) \
377  FULL( ftx = _mm512_mask_add_pd(ftx, (k), ftx, _mm512_swizzle_pd((fv_x), (p))); ) \
378  FULL( fty = _mm512_mask_add_pd(fty, (k), fty, _mm512_swizzle_pd((fv_y), (p))); ) \
379  FULL( ftz = _mm512_mask_add_pd(ftz, (k), ftz, _mm512_swizzle_pd((fv_z), (p))); ) \
380  FAST(SHORT( _mm512_mask_packstorelo_pd( f_0_x + (i), (k), tx); )) \
381  FAST(SHORT( _mm512_mask_packstorelo_pd( f_0_y + (i), (k), ty); )) \
382  FAST(SHORT( _mm512_mask_packstorelo_pd( f_0_z + (i), (k), tz); )) \
383  FULL( _mm512_mask_packstorelo_pd(fullf_0_x + (i), (k), ftx); ) \
384  FULL( _mm512_mask_packstorelo_pd(fullf_0_y + (i), (k), fty); ) \
385  FULL( _mm512_mask_packstorelo_pd(fullf_0_z + (i), (k), ftz); )
386 
387  #define APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k) \
388  FAST(SHORT( tx = _mm512_mask_loadunpacklo_pd( tx, (k), f_1_x + (i)); )) \
389  FAST(SHORT( ty = _mm512_mask_loadunpacklo_pd( ty, (k), f_1_y + (i)); )) \
390  FAST(SHORT( tz = _mm512_mask_loadunpacklo_pd( tz, (k), f_1_z + (i)); )) \
391  FULL( ftx = _mm512_mask_loadunpacklo_pd(ftx, (k), fullf_1_x + (i)); ) \
392  FULL( fty = _mm512_mask_loadunpacklo_pd(fty, (k), fullf_1_y + (i)); ) \
393  FULL( ftz = _mm512_mask_loadunpacklo_pd(ftz, (k), fullf_1_z + (i)); ) \
394  FAST(SHORT( tx = _mm512_mask_sub_pd( tx, (k), tx, _mm512_swizzle_pd(( v_x), (p))); )) \
395  FAST(SHORT( ty = _mm512_mask_sub_pd( ty, (k), ty, _mm512_swizzle_pd(( v_y), (p))); )) \
396  FAST(SHORT( tz = _mm512_mask_sub_pd( tz, (k), tz, _mm512_swizzle_pd(( v_z), (p))); )) \
397  FULL( ftx = _mm512_mask_sub_pd(ftx, (k), ftx, _mm512_swizzle_pd((fv_x), (p))); ) \
398  FULL( fty = _mm512_mask_sub_pd(fty, (k), fty, _mm512_swizzle_pd((fv_y), (p))); ) \
399  FULL( ftz = _mm512_mask_sub_pd(ftz, (k), ftz, _mm512_swizzle_pd((fv_z), (p))); ) \
400  FAST(SHORT( _mm512_mask_packstorelo_pd( f_1_x + (i), (k), tx); )) \
401  FAST(SHORT( _mm512_mask_packstorelo_pd( f_1_y + (i), (k), ty); )) \
402  FAST(SHORT( _mm512_mask_packstorelo_pd( f_1_z + (i), (k), tz); )) \
403  FULL( _mm512_mask_packstorelo_pd(fullf_1_x + (i), (k), ftx); ) \
404  FULL( _mm512_mask_packstorelo_pd(fullf_1_y + (i), (k), fty); ) \
405  FULL( _mm512_mask_packstorelo_pd(fullf_1_z + (i), (k), ftz); )
406 
407  #define APPLY_FORCES_PS2PD \
408  { \
409  __mmask16 k_lo = _mm512_int2mask(0x02); \
410  __mmask16 k_hi = _mm512_int2mask(0x20); \
411  FAST(SHORT( __m512d tx = _mm512_setzero_pd(); )) \
412  FAST(SHORT( __m512d ty = _mm512_setzero_pd(); )) \
413  FAST(SHORT( __m512d tz = _mm512_setzero_pd(); )) \
414  FULL( __m512d ftx = _mm512_setzero_pd(); ) \
415  FULL( __m512d fty = _mm512_setzero_pd(); ) \
416  FULL( __m512d ftz = _mm512_setzero_pd(); ) \
417  FAST(SHORT( __m512d v_x_lo = _mm512_cvtpslo_pd( tmp_x_vec ); )) \
418  FAST(SHORT( __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_x_vec, _MM_PERM_BADC)); )) \
419  FAST(SHORT( __m512d v_y_lo = _mm512_cvtpslo_pd( tmp_y_vec ); )) \
420  FAST(SHORT( __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_y_vec, _MM_PERM_BADC)); )) \
421  FAST(SHORT( __m512d v_z_lo = _mm512_cvtpslo_pd( tmp_z_vec ); )) \
422  FAST(SHORT( __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_z_vec, _MM_PERM_BADC)); )) \
423  FULL( __m512d fv_x_lo = _mm512_cvtpslo_pd( fulltmp_x_vec ); ) \
424  FULL( __m512d fv_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_x_vec, _MM_PERM_BADC)); ) \
425  FULL( __m512d fv_y_lo = _mm512_cvtpslo_pd( fulltmp_y_vec ); ) \
426  FULL( __m512d fv_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_y_vec, _MM_PERM_BADC)); ) \
427  FULL( __m512d fv_z_lo = _mm512_cvtpslo_pd( fulltmp_z_vec ); ) \
428  FULL( __m512d fv_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_z_vec, _MM_PERM_BADC)); ) \
429  int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
430  uintptr_t iTest_i = tmpI32[iTest_index]; \
431  __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, _mm512_set_1to16_epi32(iTest_i), i_vec); \
432  iTest_mask = _mm512_kxor(iTest_mask, cutoff_mask); \
433  if (_mm512_kortestz(iTest_mask, iTest_mask)) { \
434  __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
435  __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
436  FAST(SHORT( f_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_x_hi, v_x_lo)); )) \
437  FAST(SHORT( f_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_y_hi, v_y_lo)); )) \
438  FAST(SHORT( f_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_z_hi, v_z_lo)); )) \
439  FULL( fullf_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_x_hi, fv_x_lo)); ) \
440  FULL( fullf_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_y_hi, fv_y_lo)); ) \
441  FULL( fullf_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_z_hi, fv_z_lo)); ) \
442  __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
443  __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
444  FAST(SHORT( __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_x, _MM_SCALE_8); )) \
445  FAST(SHORT( __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_y, _MM_SCALE_8); )) \
446  FAST(SHORT( __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_z, _MM_SCALE_8); )) \
447  FAST(SHORT( __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_x, _MM_SCALE_8); )) \
448  FAST(SHORT( __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_y, _MM_SCALE_8); )) \
449  FAST(SHORT( __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_z, _MM_SCALE_8); )) \
450  FULL( __m512d ff_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_x, _MM_SCALE_8); ) \
451  FULL( __m512d ff_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_y, _MM_SCALE_8); ) \
452  FULL( __m512d ff_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_z, _MM_SCALE_8); ) \
453  FULL( __m512d ff_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_x, _MM_SCALE_8); ) \
454  FULL( __m512d ff_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_y, _MM_SCALE_8); ) \
455  FULL( __m512d ff_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_z, _MM_SCALE_8); ) \
456  FAST(SHORT( f_j_x_lo_vec = _mm512_mask_sub_pd( f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); )) \
457  FAST(SHORT( f_j_y_lo_vec = _mm512_mask_sub_pd( f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); )) \
458  FAST(SHORT( f_j_z_lo_vec = _mm512_mask_sub_pd( f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); )) \
459  FAST(SHORT( f_j_x_hi_vec = _mm512_mask_sub_pd( f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); )) \
460  FAST(SHORT( f_j_y_hi_vec = _mm512_mask_sub_pd( f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); )) \
461  FAST(SHORT( f_j_z_hi_vec = _mm512_mask_sub_pd( f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); )) \
462  FULL( ff_j_x_lo_vec = _mm512_mask_sub_pd(ff_j_x_lo_vec, cutoff_lo, ff_j_x_lo_vec, fv_x_lo); ) \
463  FULL( ff_j_y_lo_vec = _mm512_mask_sub_pd(ff_j_y_lo_vec, cutoff_lo, ff_j_y_lo_vec, fv_y_lo); ) \
464  FULL( ff_j_z_lo_vec = _mm512_mask_sub_pd(ff_j_z_lo_vec, cutoff_lo, ff_j_z_lo_vec, fv_z_lo); ) \
465  FULL( ff_j_x_hi_vec = _mm512_mask_sub_pd(ff_j_x_hi_vec, cutoff_hi, ff_j_x_hi_vec, fv_x_hi); ) \
466  FULL( ff_j_y_hi_vec = _mm512_mask_sub_pd(ff_j_y_hi_vec, cutoff_hi, ff_j_y_hi_vec, fv_y_hi); ) \
467  FULL( ff_j_z_hi_vec = _mm512_mask_sub_pd(ff_j_z_hi_vec, cutoff_hi, ff_j_z_hi_vec, fv_z_hi); ) \
468  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); )) \
469  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); )) \
470  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); )) \
471  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); )) \
472  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); )) \
473  FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); )) \
474  FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_lo, j_lo_vec, ff_j_x_lo_vec, _MM_SCALE_8); ) \
475  FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_lo, j_lo_vec, ff_j_y_lo_vec, _MM_SCALE_8); ) \
476  FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_lo, j_lo_vec, ff_j_z_lo_vec, _MM_SCALE_8); ) \
477  FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_hi, j_hi_vec, ff_j_x_hi_vec, _MM_SCALE_8); ) \
478  FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_hi, j_hi_vec, ff_j_y_hi_vec, _MM_SCALE_8); ) \
479  FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_hi, j_hi_vec, ff_j_z_hi_vec, _MM_SCALE_8); ) \
480  } else { \
481  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 0], _MM_SWIZ_REG_CDAB, k_lo); \
482  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 1], _MM_SWIZ_REG_DCBA, k_lo); \
483  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 2], _MM_SWIZ_REG_DACB, k_lo); \
484  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 3], _MM_SWIZ_REG_BADC, k_lo); \
485  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 4], _MM_SWIZ_REG_CDAB, k_hi); \
486  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 5], _MM_SWIZ_REG_DCBA, k_hi); \
487  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 6], _MM_SWIZ_REG_DACB, k_hi); \
488  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 7], _MM_SWIZ_REG_BADC, k_hi); \
489  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[ 8], _MM_SWIZ_REG_CDAB, k_lo); \
490  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[ 9], _MM_SWIZ_REG_DCBA, k_lo); \
491  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[10], _MM_SWIZ_REG_DACB, k_lo); \
492  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[11], _MM_SWIZ_REG_BADC, k_lo); \
493  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[12], _MM_SWIZ_REG_CDAB, k_hi); \
494  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[13], _MM_SWIZ_REG_DCBA, k_hi); \
495  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[14], _MM_SWIZ_REG_DACB, k_hi); \
496  APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[15], _MM_SWIZ_REG_BADC, k_hi); \
497  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 0], _MM_SWIZ_REG_CDAB, k_lo); \
498  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 1], _MM_SWIZ_REG_DCBA, k_lo); \
499  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 2], _MM_SWIZ_REG_DACB, k_lo); \
500  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 3], _MM_SWIZ_REG_BADC, k_lo); \
501  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 4], _MM_SWIZ_REG_CDAB, k_hi); \
502  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 5], _MM_SWIZ_REG_DCBA, k_hi); \
503  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 6], _MM_SWIZ_REG_DACB, k_hi); \
504  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 7], _MM_SWIZ_REG_BADC, k_hi); \
505  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[ 8], _MM_SWIZ_REG_CDAB, k_lo); \
506  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[ 9], _MM_SWIZ_REG_DCBA, k_lo); \
507  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[10], _MM_SWIZ_REG_DACB, k_lo); \
508  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[11], _MM_SWIZ_REG_BADC, k_lo); \
509  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[12], _MM_SWIZ_REG_CDAB, k_hi); \
510  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[13], _MM_SWIZ_REG_DCBA, k_hi); \
511  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[14], _MM_SWIZ_REG_DACB, k_hi); \
512  APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[15], _MM_SWIZ_REG_BADC, k_hi); \
513  } \
514  }
515 
516  #else // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
517 
518  #define APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, f_i_y, f_i_z, f_j_x, f_j_y, f_j_z, i, j) \
519  { \
520  __mmask16 k_lo = _mm512_int2mask(0x02); \
521  __mmask16 k_hi = _mm512_int2mask(0x20); \
522  __m512d tx = _mm512_setzero_pd(); \
523  __m512d ty = _mm512_setzero_pd(); \
524  __m512d tz = _mm512_setzero_pd(); \
525  __m512d v_x_lo = _mm512_cvtpslo_pd(v_x); \
526  __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_x), _MM_PERM_BADC)); \
527  __m512d v_y_lo = _mm512_cvtpslo_pd(v_y); \
528  __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_y), _MM_PERM_BADC)); \
529  __m512d v_z_lo = _mm512_cvtpslo_pd(v_z); \
530  __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_z), _MM_PERM_BADC)); \
531  int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
532  uintptr_t iTest_i = tmpI32[iTest_index]; \
533  __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, _mm512_set_1to16_epi32(iTest_i), i_vec); \
534  iTest_mask = _mm512_kxor(iTest_mask, cutoff_mask); \
535  if (_mm512_kortestz(iTest_mask, iTest_mask)) { \
536  __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
537  __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
538  f_i_x[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_x_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_x_lo); \
539  f_i_y[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_y_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_y_lo); \
540  f_i_z[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_z_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_z_lo); \
541  __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
542  __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
543  __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_x), _MM_SCALE_8); \
544  __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_y), _MM_SCALE_8); \
545  __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_z), _MM_SCALE_8); \
546  __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_x), _MM_SCALE_8); \
547  __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_y), _MM_SCALE_8); \
548  __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_z), _MM_SCALE_8); \
549  f_j_x_lo_vec = _mm512_mask_sub_pd(f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); \
550  f_j_y_lo_vec = _mm512_mask_sub_pd(f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); \
551  f_j_z_lo_vec = _mm512_mask_sub_pd(f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); \
552  f_j_x_hi_vec = _mm512_mask_sub_pd(f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); \
553  f_j_y_hi_vec = _mm512_mask_sub_pd(f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); \
554  f_j_z_hi_vec = _mm512_mask_sub_pd(f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); \
555  _mm512_mask_i32loscatter_pd((f_j_x), cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); \
556  _mm512_mask_i32loscatter_pd((f_j_y), cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); \
557  _mm512_mask_i32loscatter_pd((f_j_z), cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); \
558  _mm512_mask_i32loscatter_pd((f_j_x), cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); \
559  _mm512_mask_i32loscatter_pd((f_j_y), cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); \
560  _mm512_mask_i32loscatter_pd((f_j_z), cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); \
561  } else { \
562  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 0], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_lo); \
563  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 1], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_lo); \
564  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 2], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_lo); \
565  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 3], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_lo); \
566  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 4], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_hi); \
567  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 5], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_hi); \
568  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 6], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_hi); \
569  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 7], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_hi); \
570  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 8], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_lo); \
571  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 9], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_lo); \
572  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[10], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_lo); \
573  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[11], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_lo); \
574  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[12], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_hi); \
575  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[13], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_hi); \
576  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[14], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_hi); \
577  APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[15], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_hi); \
578  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 0], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_lo); \
579  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 1], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_lo); \
580  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 2], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_lo); \
581  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 3], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_lo); \
582  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 4], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_hi); \
583  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 5], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_hi); \
584  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 6], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_hi); \
585  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 7], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_hi); \
586  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 8], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_lo); \
587  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 9], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_lo); \
588  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[10], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_lo); \
589  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[11], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_lo); \
590  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[12], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_hi); \
591  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[13], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_hi); \
592  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[14], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_hi); \
593  APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[15], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_hi); \
594  } \
595  }
596 
597  #endif // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
598  #endif // __MIC_PAD_PLGEN_CTRL != 0
599 
600  __m512i plI_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10, 9, 8,
601  7, 6, 5, 4, 3, 2, 1, 0);
602  const int plSize_16 = (plSize + 15) & (~15); // Round up to a multiple of 16
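 /* Worked example (not part of the original kernel code): with plSize = 20,
  * plSize_16 = (20 + 15) & ~15 = 32; with plSize = 16, plSize_16 stays 16. */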
603 
604  FAST(ENERGY( __m512d vdwEnergy_vec = _mm512_setzero_pd(); ))
605  FAST(SHORT(ENERGY( __m512d electEnergy_vec = _mm512_setzero_pd(); )))
606  FULL(ENERGY( __m512d fullElectEnergy_vec = _mm512_setzero_pd(); ))
607  #if (0 PAIR(FAST(SHORT(+1))))
608  __m512d virial_xx_vec = _mm512_setzero_pd();
609  __m512d virial_xy_vec = _mm512_setzero_pd();
610  __m512d virial_xz_vec = _mm512_setzero_pd();
611  __m512d virial_yy_vec = _mm512_setzero_pd();
612  __m512d virial_yz_vec = _mm512_setzero_pd();
613  __m512d virial_zz_vec = _mm512_setzero_pd();
614  #endif
615  #if (0 PAIR(FULL(+1)))
616  __m512d fullElectVirial_xx_vec = _mm512_setzero_pd();
617  __m512d fullElectVirial_xy_vec = _mm512_setzero_pd();
618  __m512d fullElectVirial_xz_vec = _mm512_setzero_pd();
619  __m512d fullElectVirial_yy_vec = _mm512_setzero_pd();
620  __m512d fullElectVirial_yz_vec = _mm512_setzero_pd();
621  __m512d fullElectVirial_zz_vec = _mm512_setzero_pd();
622  #endif
623 
624  __m512i ij_mask_vec = _mm512_set_1to16_epi32(0x0000FFFF);
625  __m512i ij_store_perm_pattern = _mm512_set_16to16_epi32( 9, 7, 9, 6, 9, 5, 9, 4,
626  9, 3, 9, 2, 9, 1, 9, 0 );
627 
628  #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
629  __m512i exclusionSum_vec = _mm512_setzero_epi32();
630  #endif
631 
632  #if (0 NORMAL(+1))
633  #if (0 PAIR(+1))
634  #pragma loop count (500)
635  #else
636  #pragma loop count (7000)
637  #endif
638  #else
639  #if (0 PAIR(+1))
640  #pragma loop count (2)
641  #else
642  #pragma loop count (30)
643  #endif
644  #endif
645  #pragma prefetch plArray:_MM_HINT_NTA
646  #pragma novector
647  for (int plI = 0; plI < plSize_16; plI += 16) {
648 
649  // Create the active_mask
650  __mmask16 active_mask = _mm512_cmplt_epi32_mask(plI_vec, _mm512_set_1to16_epi32(plSize));
651 
652  #if MIC_HANDCODE_FORCE_PFDIST != 0
653  __m512i future_ij_vec = _mm512_load_epi32(plArray + plI + (16 * MIC_HANDCODE_FORCE_PFDIST));
654  __mmask16 future_ij_mask = _mm512_cmpneq_epi32_mask(future_ij_vec, _mm512_set_1to16_epi32(-1));
655  __m512i future_j_vec = _mm512_and_epi32(future_ij_vec, ij_mask_vec);
656  _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_x, _MM_SCALE_4, _MM_HINT_T0);
657  _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_y, _MM_SCALE_4, _MM_HINT_T0);
658  _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_z, _MM_SCALE_4, _MM_HINT_T0);
659  #endif
660 
661  // Load the i and j values from the pairlist array and modify the active_mask accordingly
662  __m512i ij_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), active_mask, plArray + plI);
663  #if __MIC_PAD_PLGEN_CTRL != 0
664  __m512i ij_mask_vec = _mm512_set_1to16_epi32(0xFFFF);
665  __m512i ij_lo_vec = _mm512_and_epi32(ij_vec, ij_mask_vec);
666  active_mask = _mm512_mask_cmpneq_epi32_mask(active_mask, ij_lo_vec, ij_mask_vec);
667  #endif
668  __m512i i_vec = _mm512_and_epi32(_mm512_mask_srli_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, 16), ij_mask_vec);
669  __m512i j_vec = _mm512_mask_and_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, ij_mask_vec);
670 
671  // Store the index out to an array of scalar values so they can be individually accessed by lane
672  uintptr_t tmpI32[16] __attribute__((aligned(64)));
673  uintptr_t tmpJ32[16] __attribute__((aligned(64)));
674  _mm512_store_epi64(tmpI32 , _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, i_vec ));
675  _mm512_store_epi64(tmpI32 + 8, _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, _mm512_permute4f128_epi32(i_vec, _MM_PERM_BADC)));
676  _mm512_store_epi64(tmpJ32 , _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, j_vec ));
677  _mm512_store_epi64(tmpJ32 + 8, _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, _mm512_permute4f128_epi32(j_vec, _MM_PERM_BADC)));
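 /* Illustrative sketch (not part of the original kernel code): the four masked
  * permute/store operations above zero-extend the sixteen 32-bit i and j
  * indices into 64-bit slots so they can be read back as uintptr_t values.
  * Conceptually, with iIdx[l]/jIdx[l] denoting the packed 32-bit lanes of
  * i_vec and j_vec (hypothetical names used only for this illustration):
  *
  *   for (int l = 0; l < 16; l++) { tmpI32[l] = (uintptr_t)iIdx[l]; tmpJ32[l] = (uintptr_t)jIdx[l]; }
  */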
678 
679  // Increment the vectorized loop counter
680  plI_vec = _mm512_add_epi32(plI_vec, _mm512_set_1to16_epi32(16));
681 
682  // Load position/charge data for the i and j atoms
683  __m512 p_i_x_vec, p_i_y_vec, p_i_z_vec, p_i_q_vec, p_j_x_vec, p_j_y_vec, p_j_z_vec, p_j_q_vec;
684  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
685  __m512i pExt_i_vdwType_vec, pExt_j_vdwType_vec;
686  #endif
687  {
688  __m512 zero_vec = _mm512_setzero_ps();
689 
690  int iTest_index = _mm_tzcnt_32(_mm512_mask2int(active_mask)); // Index of least significant 1 bit in active mask (NOTE: active_mask should not be all zeros)
691  unsigned int iTest_i = tmpI32[iTest_index]; // NOTE: iTest_i = a valid "i" value
692  #if __MIC_PAD_PLGEN_CTRL != 0
693  #else
694  __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(active_mask, _mm512_set_1to16_epi32(iTest_i), i_vec);
695  iTest_mask = _mm512_kxor(iTest_mask, active_mask);
696  if (_mm512_kortestz(iTest_mask, iTest_mask)) {
697  #endif
698  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
699  p_i_x_vec = _mm512_set_1to16_ps(p_0[iTest_i].x);
700  p_i_y_vec = _mm512_set_1to16_ps(p_0[iTest_i].y);
701  p_i_z_vec = _mm512_set_1to16_ps(p_0[iTest_i].z);
702  p_i_q_vec = _mm512_set_1to16_ps(p_0[iTest_i].charge);
703  {
704  __mmask16 k0 = _mm512_int2mask(0x000F);
705  __mmask16 k1 = _mm512_int2mask(0x00F0);
706  __mmask16 k2 = _mm512_int2mask(0x0F00);
707  __mmask16 k3 = _mm512_int2mask(0xF000);
708  __m512i tmp_a0 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 0]);
709  tmp_a0 = _mm512_mask_loadunpacklo_epi32( tmp_a0, k1, p_1 + tmpJ32[ 4]);
710  tmp_a0 = _mm512_mask_loadunpacklo_epi32( tmp_a0, k2, p_1 + tmpJ32[ 8]);
711  tmp_a0 = _mm512_mask_loadunpacklo_epi32( tmp_a0, k3, p_1 + tmpJ32[12]);
712  __m512i tmp_a1 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 1]);
713  tmp_a1 = _mm512_mask_loadunpacklo_epi32( tmp_a1, k1, p_1 + tmpJ32[ 5]);
714  tmp_a1 = _mm512_mask_loadunpacklo_epi32( tmp_a1, k2, p_1 + tmpJ32[ 9]);
715  tmp_a1 = _mm512_mask_loadunpacklo_epi32( tmp_a1, k3, p_1 + tmpJ32[13]);
716  __m512i tmp_a2 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 2]);
717  tmp_a2 = _mm512_mask_loadunpacklo_epi32( tmp_a2, k1, p_1 + tmpJ32[ 6]);
718  tmp_a2 = _mm512_mask_loadunpacklo_epi32( tmp_a2, k2, p_1 + tmpJ32[10]);
719  tmp_a2 = _mm512_mask_loadunpacklo_epi32( tmp_a2, k3, p_1 + tmpJ32[14]);
720  __m512i tmp_a3 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 3]);
721  tmp_a3 = _mm512_mask_loadunpacklo_epi32( tmp_a3, k1, p_1 + tmpJ32[ 7]);
722  tmp_a3 = _mm512_mask_loadunpacklo_epi32( tmp_a3, k2, p_1 + tmpJ32[11]);
723  tmp_a3 = _mm512_mask_loadunpacklo_epi32( tmp_a3, k3, p_1 + tmpJ32[15]);
724  __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
725  __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
726  __m512i tmp_b0 = _mm512_mask_swizzle_epi32(tmp_a0, k_2x2_0, tmp_a1, _MM_SWIZ_REG_CDAB);
727  __m512i tmp_b1 = _mm512_mask_swizzle_epi32(tmp_a1, k_2x2_1, tmp_a0, _MM_SWIZ_REG_CDAB);
728  __m512i tmp_b2 = _mm512_mask_swizzle_epi32(tmp_a2, k_2x2_0, tmp_a3, _MM_SWIZ_REG_CDAB);
729  __m512i tmp_b3 = _mm512_mask_swizzle_epi32(tmp_a3, k_2x2_1, tmp_a2, _MM_SWIZ_REG_CDAB);
730  __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
731  __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
732  p_j_x_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b0, k_4x4_0, tmp_b2, _MM_SWIZ_REG_BADC));
733  p_j_y_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b1, k_4x4_0, tmp_b3, _MM_SWIZ_REG_BADC));
734  p_j_z_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b2, k_4x4_1, tmp_b0, _MM_SWIZ_REG_BADC));
735  p_j_q_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b3, k_4x4_1, tmp_b1, _MM_SWIZ_REG_BADC));
736  }
737  #else
738  p_i_x_vec = _mm512_set_1to16_ps(p_0_x[iTest_i]);
739  p_i_y_vec = _mm512_set_1to16_ps(p_0_y[iTest_i]);
740  p_i_z_vec = _mm512_set_1to16_ps(p_0_z[iTest_i]);
741  p_i_q_vec = _mm512_set_1to16_ps(p_0_q[iTest_i]);
742  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
743  pExt_i_vdwType_vec = _mm512_set_1to16_epi32(pExt_0_vdwType[iTest_i]);
744  #endif
745  #if MIC_HANDCODE_FORCE_USEGATHER != 0
746  p_j_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_x, _MM_SCALE_4);
747  p_j_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_y, _MM_SCALE_4);
748  p_j_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_z, _MM_SCALE_4);
749  p_j_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_q, _MM_SCALE_4);
750  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
751  pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
752  #endif
753  #else
754  GATHER_PS_I32(p_j_x_vec, p_1_x, tmpJ32);
755  GATHER_PS_I32(p_j_y_vec, p_1_y, tmpJ32);
756  GATHER_PS_I32(p_j_z_vec, p_1_z, tmpJ32);
757  GATHER_PS_I32(p_j_q_vec, p_1_q, tmpJ32);
758  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
759  GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
760  #endif
761  #endif
762  #endif // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
763  #if __MIC_PAD_PLGEN_CTRL != 0
764  #else
765  } else {
766  #if MIC_HANDCODE_FORCE_USEGATHER != 0
767  p_i_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_x, _MM_SCALE_4);
768  p_i_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_y, _MM_SCALE_4);
769  p_i_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_z, _MM_SCALE_4);
770  p_i_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_q, _MM_SCALE_4);
771  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
772  pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, i_vec, pExt_0_vdwType, _MM_SCALE_4);
773  #endif
774  p_j_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_x, _MM_SCALE_4);
775  p_j_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_y, _MM_SCALE_4);
776  p_j_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_z, _MM_SCALE_4);
777  p_j_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_q, _MM_SCALE_4);
778  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
779  pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
780  #endif
781  #else
782  GATHER_PS_I32(p_i_x_vec, p_0_x, tmpI32);
783  GATHER_PS_I32(p_i_y_vec, p_0_y, tmpI32);
784  GATHER_PS_I32(p_i_z_vec, p_0_z, tmpI32);
785  GATHER_PS_I32(p_i_q_vec, p_0_q, tmpI32);
786  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
787  GATHER_EPI32_I32(pExt_i_vdwType_vec, pExt_0_vdwType, tmpI32);
788  #endif
789  GATHER_PS_I32(p_j_x_vec, p_1_x, tmpJ32);
790  GATHER_PS_I32(p_j_y_vec, p_1_y, tmpJ32);
791  GATHER_PS_I32(p_j_z_vec, p_1_z, tmpJ32);
792  GATHER_PS_I32(p_j_q_vec, p_1_q, tmpJ32);
793  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
794  GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
795  #endif
796  #endif
797  }
798  #endif
799  }
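  /* Illustrative sketch (not part of the original kernel, kept under "#if 0"): the net effect
   * of the gather / transpose block above is a per-lane load of the j-atom data.  Shown here
   * for the SoA layout (p_1_x/p_1_y/p_1_z/p_1_q); out_* are hypothetical scratch arrays
   * standing in for the lanes of the p_j_*_vec registers.
   */
  #if 0
  float out_x[16], out_y[16], out_z[16], out_q[16];
  for (int lane = 0; lane < 16; lane++) {
    out_x[lane] = p_1_x[tmpJ32[lane]];   // x coordinate of the j atom in this lane
    out_y[lane] = p_1_y[tmpJ32[lane]];   // y coordinate
    out_z[lane] = p_1_z[tmpJ32[lane]];   // z coordinate
    out_q[lane] = p_1_q[tmpJ32[lane]];   // charge
  }
  #endif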
800 
801  // Calculate the delta-x, delta-y, delta-z, and radius-squared
802  p_i_x_vec = _mm512_add_ps(p_i_x_vec, _mm512_set_1to16_ps(offset_x));
803  p_i_y_vec = _mm512_add_ps(p_i_y_vec, _mm512_set_1to16_ps(offset_y));
804  p_i_z_vec = _mm512_add_ps(p_i_z_vec, _mm512_set_1to16_ps(offset_z));
805  __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
806  __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
807  __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
808  __m512 r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_x_vec, p_ij_x_vec), _mm512_set_1to16_ps(r2_delta));
809  r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_y_vec, p_ij_y_vec), r2_vec);
810  r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_z_vec, p_ij_z_vec), r2_vec);
811 
812  // Do the 'within cutoff' check
813  // DMK - NOTE - When intrinsics are used, the compiler generates extra instructions to clear all but the least significant
814  // 8 bits of the mask register storing the result of the comparison.
815  __mmask16 cutoff_mask = _mm512_mask_cmplt_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps(cutoff2_delta));
816  if (_mm512_kortestz(cutoff_mask, cutoff_mask)) { continue; } // If the mask is completely unset, move on to the next vector
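  /* Illustrative sketch (not part of the original kernel): scalar equivalent of the masked
   * cutoff test above.  "active", "within", and "r2" are hypothetical per-lane values
   * corresponding to active_mask, cutoff_mask, and r2_vec.
   */
  #if 0
  int within[16], anyWithin = 0;
  for (int lane = 0; lane < 16; lane++) {
    within[lane] = active[lane] && (r2[lane] < cutoff2_delta);
    anyWithin |= within[lane];
  }
  if (!anyWithin) { continue; }   // matches the _mm512_kortestz early-out above
  #endif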
817 
818  #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
819  exclusionSum_vec = _mm512_mask_add_epi32(exclusionSum_vec, cutoff_mask, exclusionSum_vec, _mm512_set_1to16_epi32(1));
820  #endif
821 
822  // Calculate kqq = p_0_q[i] * p_1_q[j]
823  __m512 kqq_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, p_i_q_vec, p_j_q_vec);
824 
825  // Calculate table_i = (r2 bit pattern >> 17) + r2_delta_expc (the single-precision analogue of the (r2 >> 46) indexing used by the generic kernel)
826  __m512i r2i_vec = _mm512_castps_si512(r2_vec);
827  __m512i table_i_vec = _mm512_srli_epi32(r2i_vec, 17);
828  table_i_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_i_vec, _mm512_set_1to16_epi32(r2_delta_expc));
829 
830  // Calculate diffa = r2 - r2_table[table_i]
831  __m512 r2_table_vec;
832  {
833  // From ComputeNonbondedUtil.C:                      | Simplified (as computed here):
834  //   r2_base = r2_delta * (1 << (i/64))               |   r2_base = r2_delta * (1 << (i/64))
835  //   r2_del = r2_base / 64.0;                          |   r2_del = r2_base / 64.0;
836  //   r2 = r2_base - r2_delta + r2_del * (i%64)         |   r2_table[i] = r2_base - r2_delta + r2_del * (i%64) + r2_delta
837  //   r2_table[i] = r2 + r2_delta;                      |               = r2_base + r2_del * (i%64)
838  // NOTE: For i = 0, r2_table[0] = r2_delta + (r2_delta / 64) * 0 = r2_delta, so there is no need
839  // to special-case table_i = 0 to r2_table[0] = r2_delta (see ComputeNonbondedUtil.C:606)
840  __m512 r2_delta_vec = _mm512_set_1to16_ps(r2_delta);
841  __m512 t0_vec = _mm512_cvtfxpnt_round_adjustepi32_ps(_mm512_sllv_epi32(_mm512_set_1to16_epi32(1), _mm512_srli_epi32(table_i_vec, 6)), _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
842  __m512 r2_base_vec = _mm512_mul_ps(r2_delta_vec, t0_vec); // NOTE: r2_delta * (1 << (i/64))
843  __m512 r2_del_vec = _mm512_mul_ps(r2_base_vec, _mm512_set_1to16_ps(0.015625f)); // NOTE: r2_base / 64
844  __m512i t1_vec = _mm512_and_epi32(table_i_vec, _mm512_set_1to16_epi32(0x3F)); // NOTE: (i%64)
845  __m512 t2_vec = _mm512_cvtfxpnt_round_adjustepi32_ps(t1_vec, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); // NOTE: (float)(i%64)
846  r2_table_vec = _mm512_add_ps(_mm512_mul_ps(r2_del_vec, t2_vec), r2_base_vec);
847  }
848  __m512 diffa_vec = _mm512_sub_ps(r2_vec, r2_table_vec);
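  /* Illustrative sketch (not part of the original kernel): scalar equivalent of the table
   * index extraction and in-register r2_table reconstruction above, for a single lane with
   * float r2.  The union is used only to read the float's bit pattern.
   */
  #if 0
  union { float f; unsigned int u; } r2_bits;  r2_bits.f = r2;
  int   table_i  = (int)(r2_bits.u >> 17) + r2_delta_expc;   // same shift as _mm512_srli_epi32(..., 17)
  float r2_base  = r2_delta * (float)(1 << (table_i / 64));
  float r2_del   = r2_base * 0.015625f;                       // r2_base / 64
  float r2_table = r2_base + r2_del * (float)(table_i % 64);
  float diffa    = r2 - r2_table;                             // interpolation offset used by the spline code below
  #endif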
849 
850  // Load LJ A and B values
851  #if (0 FAST(+1))
852 
853  // Load lj_pars from the table and multiply by scaling
854  __m512 A_vec, B_vec;
855  __m512i pExt_i_vdwType_vec; // NOTE : Moved outside block so could be printed in large force detection code below
856  __m512i pExt_j_vdwType_vec;
857  {
858  // Load VDW types for the "i" and "j" atoms
859  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT == 0
860  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
861  // DMK - NOTE : This code assumes that vdw_type is the first of four members of atom_param
862  pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, _mm512_slli_epi32(i_vec, 2), pExt_0, _MM_SCALE_4);
863  pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, _mm512_slli_epi32(j_vec, 2), pExt_1, _MM_SCALE_4);
864  #else
865  #if MIC_HANDCODE_FORCE_USEGATHER != 0
866  pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, i_vec, pExt_0_vdwType, _MM_SCALE_4);
867  pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
868  #else
869  GATHER_EPI32_I32(pExt_i_vdwType_vec, pExt_0_vdwType, tmpI32);
870  GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
871  #endif
872  #endif
873  #endif
874 
875  // Calculate offsets into the LJ table
876  __m512i t0_vec = _mm512_fmadd_epi32(pExt_i_vdwType_vec, _mm512_set_1to16_epi32(lj_table_dim), pExt_j_vdwType_vec);
877  __m512i ljOffset_vec = _mm512_slli_epi32(t0_vec, 2); // NOTE: *4
878  MODIFIED( ljOffset_vec = _mm512_add_epi32(ljOffset_vec, _mm512_set_1to16_epi32(2)); )
879 
880  // Gather A and B values
881  #if MIC_HANDCODE_FORCE_USEGATHER != 0
882  __m512i ljOffset_p1_vec = _mm512_add_epi32(ljOffset_vec, _mm512_set_1to16_epi32(1));
883  A_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, ljOffset_vec, lj_table_base_ptr, _MM_SCALE_4);
884  B_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, ljOffset_p1_vec, lj_table_base_ptr, _MM_SCALE_4);
885  #else
886  unsigned int ljOffset[16] __attribute__((aligned(64)));
887  _mm512_store_epi32(ljOffset, ljOffset_vec);
888  GATHER_PS_I32 (A_vec, lj_table_base_ptr, ljOffset );
889  GATHER_PS_I32_OFFSET(B_vec, lj_table_base_ptr, ljOffset, 1);
890  #endif
891 
892  // Scale the A and B values
893  __m512 scaling_vec = _mm512_set_1to16_ps(scaling);
894  A_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, A_vec, scaling_vec);
895  B_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, B_vec, scaling_vec);
896  }
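  /* Illustrative sketch (not part of the original kernel): scalar equivalent of the LJ
   * parameter lookup above for one lane.  vdwType_i / vdwType_j are hypothetical per-lane
   * scalars standing in for the values held in pExt_i_vdwType_vec / pExt_j_vdwType_vec.
   */
  #if 0
  int ljOffset = 4 * (vdwType_i * lj_table_dim + vdwType_j);   // 4 floats per table entry
  MODIFIED( ljOffset += 2; )                                    // MODIFIED kernels use the second A/B pair of the entry
  float A_lane = scaling * lj_table_base_ptr[ljOffset    ];
  float B_lane = scaling * lj_table_base_ptr[ljOffset + 1];
  #endif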
897  #endif // FAST
898 
899  // Load the table_four values
900  // NOTE : In this loop, each vector lane is calculating an interaction between two particles, i and j.
901  // As part of that calculation, a table lookup is done (this code) where each table entry holds 16
902  // values. Rather than issue 16 gathers, extract the index for each interaction, do a vector load of
903  // each table entry (i.e. 1 table entry = 16 values = one 512-bit vector register), and then transpose
904  // those values in-register to create the vectorized registers used by the code below
905  // (i.e. 1 vector register per each of the 16 fields). Once the entries have been loaded, only
906  // swizzles and permutes are required for the transpose, so it should not be very expensive.
907  const CALC_TYPE * const table_four_base = SHORT(table_short) NOSHORT(table_noshort);
908  __m512 table_four_i_0_vec, table_four_i_1_vec, table_four_i_2_vec, table_four_i_3_vec,
909  table_four_i_4_vec, table_four_i_5_vec, table_four_i_6_vec, table_four_i_7_vec,
910  table_four_i_8_vec, table_four_i_9_vec, table_four_i_10_vec, table_four_i_11_vec,
911  table_four_i_12_vec, table_four_i_13_vec, table_four_i_14_vec, table_four_i_15_vec;
912  {
913  #if MIC_HANDCODE_FORCE_USEGATHER_NBTBL != 0
914 
915  #if 0
916 
917  __m512i table_four_i_offsets = _mm512_slli_epi32(table_i_vec, 4); // NOTE: table_i * 16 (16 elements per table entry)
918  table_four_i_0_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets , table_four_base, _MM_SCALE_4);
919  table_four_i_1_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 1)), table_four_base, _MM_SCALE_4);
920  table_four_i_2_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 2)), table_four_base, _MM_SCALE_4);
921  table_four_i_3_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 3)), table_four_base, _MM_SCALE_4);
922  table_four_i_4_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 4)), table_four_base, _MM_SCALE_4);
923  table_four_i_5_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 5)), table_four_base, _MM_SCALE_4);
924  table_four_i_6_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 6)), table_four_base, _MM_SCALE_4);
925  table_four_i_7_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 7)), table_four_base, _MM_SCALE_4);
926  table_four_i_8_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 8)), table_four_base, _MM_SCALE_4);
927  table_four_i_9_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 9)), table_four_base, _MM_SCALE_4);
928  table_four_i_10_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(10)), table_four_base, _MM_SCALE_4);
929  table_four_i_11_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(11)), table_four_base, _MM_SCALE_4);
930  table_four_i_12_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(12)), table_four_base, _MM_SCALE_4);
931  table_four_i_13_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(13)), table_four_base, _MM_SCALE_4);
932  table_four_i_14_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(14)), table_four_base, _MM_SCALE_4);
933  table_four_i_15_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(15)), table_four_base, _MM_SCALE_4);
934 
935  #else
936 
937  __m512i table_four_i_offsets = _mm512_slli_epi32(table_i_vec, 4); // NOTE: table_i * 16 (16 elements per table entry)
938  table_four_i_0_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base, _MM_SCALE_4);
939  table_four_i_1_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 1, _MM_SCALE_4);
940  table_four_i_2_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 2, _MM_SCALE_4);
941  table_four_i_3_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 3, _MM_SCALE_4);
942  table_four_i_4_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 4, _MM_SCALE_4);
943  table_four_i_5_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 5, _MM_SCALE_4);
944  table_four_i_6_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 6, _MM_SCALE_4);
945  table_four_i_7_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 7, _MM_SCALE_4);
946  table_four_i_8_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 8, _MM_SCALE_4);
947  table_four_i_9_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 9, _MM_SCALE_4);
948  table_four_i_10_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 10, _MM_SCALE_4);
949  table_four_i_11_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 11, _MM_SCALE_4);
950  table_four_i_12_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 12, _MM_SCALE_4);
951  table_four_i_13_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 13, _MM_SCALE_4);
952  table_four_i_14_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 14, _MM_SCALE_4);
953  table_four_i_15_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 15, _MM_SCALE_4);
954 
955  #endif
956 
957  #else
958 
959  unsigned int table_four_i_offsets[16] __attribute__((aligned(64)));
960  _mm512_store_epi32(table_four_i_offsets, _mm512_slli_epi32(table_i_vec, 4)); // NOTE: table_i * 16 (16 elements per table entry)
961 
962  // Load and transpose 256 values (16 x 16 matrix)
963  __m512i tmpA_00 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 0]])));
964  __m512i tmpA_01 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 1]])));
965  __m512i tmpA_02 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 2]])));
966  __m512i tmpA_03 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 3]])));
967  __m512i tmpA_04 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 4]])));
968  __m512i tmpA_05 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 5]])));
969  __m512i tmpA_06 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 6]])));
970  __m512i tmpA_07 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 7]])));
971  __m512i tmpA_08 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 8]])));
972  __m512i tmpA_09 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 9]])));
973  __m512i tmpA_10 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[10]])));
974  __m512i tmpA_11 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[11]])));
975  __m512i tmpA_12 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[12]])));
976  __m512i tmpA_13 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[13]])));
977  __m512i tmpA_14 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[14]])));
978  __m512i tmpA_15 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[15]])));
979  __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
980  __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
981  __m512i tmpB_00 = _mm512_mask_swizzle_epi32(tmpA_00, k_2x2_0, tmpA_01, _MM_SWIZ_REG_CDAB);
982  __m512i tmpB_01 = _mm512_mask_swizzle_epi32(tmpA_01, k_2x2_1, tmpA_00, _MM_SWIZ_REG_CDAB);
983  __m512i tmpB_02 = _mm512_mask_swizzle_epi32(tmpA_02, k_2x2_0, tmpA_03, _MM_SWIZ_REG_CDAB);
984  __m512i tmpB_03 = _mm512_mask_swizzle_epi32(tmpA_03, k_2x2_1, tmpA_02, _MM_SWIZ_REG_CDAB);
985  __m512i tmpB_04 = _mm512_mask_swizzle_epi32(tmpA_04, k_2x2_0, tmpA_05, _MM_SWIZ_REG_CDAB);
986  __m512i tmpB_05 = _mm512_mask_swizzle_epi32(tmpA_05, k_2x2_1, tmpA_04, _MM_SWIZ_REG_CDAB);
987  __m512i tmpB_06 = _mm512_mask_swizzle_epi32(tmpA_06, k_2x2_0, tmpA_07, _MM_SWIZ_REG_CDAB);
988  __m512i tmpB_07 = _mm512_mask_swizzle_epi32(tmpA_07, k_2x2_1, tmpA_06, _MM_SWIZ_REG_CDAB);
989  __m512i tmpB_08 = _mm512_mask_swizzle_epi32(tmpA_08, k_2x2_0, tmpA_09, _MM_SWIZ_REG_CDAB);
990  __m512i tmpB_09 = _mm512_mask_swizzle_epi32(tmpA_09, k_2x2_1, tmpA_08, _MM_SWIZ_REG_CDAB);
991  __m512i tmpB_10 = _mm512_mask_swizzle_epi32(tmpA_10, k_2x2_0, tmpA_11, _MM_SWIZ_REG_CDAB);
992  __m512i tmpB_11 = _mm512_mask_swizzle_epi32(tmpA_11, k_2x2_1, tmpA_10, _MM_SWIZ_REG_CDAB);
993  __m512i tmpB_12 = _mm512_mask_swizzle_epi32(tmpA_12, k_2x2_0, tmpA_13, _MM_SWIZ_REG_CDAB);
994  __m512i tmpB_13 = _mm512_mask_swizzle_epi32(tmpA_13, k_2x2_1, tmpA_12, _MM_SWIZ_REG_CDAB);
995  __m512i tmpB_14 = _mm512_mask_swizzle_epi32(tmpA_14, k_2x2_0, tmpA_15, _MM_SWIZ_REG_CDAB);
996  __m512i tmpB_15 = _mm512_mask_swizzle_epi32(tmpA_15, k_2x2_1, tmpA_14, _MM_SWIZ_REG_CDAB);
997  __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
998  __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
999  __m512i tmpC_00 = _mm512_mask_swizzle_epi32(tmpB_00, k_4x4_0, tmpB_02, _MM_SWIZ_REG_BADC);
1000  __m512i tmpC_01 = _mm512_mask_swizzle_epi32(tmpB_01, k_4x4_0, tmpB_03, _MM_SWIZ_REG_BADC);
1001  __m512i tmpC_02 = _mm512_mask_swizzle_epi32(tmpB_02, k_4x4_1, tmpB_00, _MM_SWIZ_REG_BADC);
1002  __m512i tmpC_03 = _mm512_mask_swizzle_epi32(tmpB_03, k_4x4_1, tmpB_01, _MM_SWIZ_REG_BADC);
1003  __m512i tmpC_04 = _mm512_mask_swizzle_epi32(tmpB_04, k_4x4_0, tmpB_06, _MM_SWIZ_REG_BADC);
1004  __m512i tmpC_05 = _mm512_mask_swizzle_epi32(tmpB_05, k_4x4_0, tmpB_07, _MM_SWIZ_REG_BADC);
1005  __m512i tmpC_06 = _mm512_mask_swizzle_epi32(tmpB_06, k_4x4_1, tmpB_04, _MM_SWIZ_REG_BADC);
1006  __m512i tmpC_07 = _mm512_mask_swizzle_epi32(tmpB_07, k_4x4_1, tmpB_05, _MM_SWIZ_REG_BADC);
1007  __m512i tmpC_08 = _mm512_mask_swizzle_epi32(tmpB_08, k_4x4_0, tmpB_10, _MM_SWIZ_REG_BADC);
1008  __m512i tmpC_09 = _mm512_mask_swizzle_epi32(tmpB_09, k_4x4_0, tmpB_11, _MM_SWIZ_REG_BADC);
1009  __m512i tmpC_10 = _mm512_mask_swizzle_epi32(tmpB_10, k_4x4_1, tmpB_08, _MM_SWIZ_REG_BADC);
1010  __m512i tmpC_11 = _mm512_mask_swizzle_epi32(tmpB_11, k_4x4_1, tmpB_09, _MM_SWIZ_REG_BADC);
1011  __m512i tmpC_12 = _mm512_mask_swizzle_epi32(tmpB_12, k_4x4_0, tmpB_14, _MM_SWIZ_REG_BADC);
1012  __m512i tmpC_13 = _mm512_mask_swizzle_epi32(tmpB_13, k_4x4_0, tmpB_15, _MM_SWIZ_REG_BADC);
1013  __m512i tmpC_14 = _mm512_mask_swizzle_epi32(tmpB_14, k_4x4_1, tmpB_12, _MM_SWIZ_REG_BADC);
1014  __m512i tmpC_15 = _mm512_mask_swizzle_epi32(tmpB_15, k_4x4_1, tmpB_13, _MM_SWIZ_REG_BADC);
1015  __mmask16 k_8x8_0 = _mm512_int2mask(0xF0F0);
1016  __mmask16 k_8x8_1 = _mm512_int2mask(0x0F0F);
1017  __m512i tmpD_00 = _mm512_mask_permute4f128_epi32(tmpC_00, k_8x8_0, tmpC_04, _MM_PERM_CDAB);
1018  __m512i tmpD_01 = _mm512_mask_permute4f128_epi32(tmpC_01, k_8x8_0, tmpC_05, _MM_PERM_CDAB);
1019  __m512i tmpD_02 = _mm512_mask_permute4f128_epi32(tmpC_02, k_8x8_0, tmpC_06, _MM_PERM_CDAB);
1020  __m512i tmpD_03 = _mm512_mask_permute4f128_epi32(tmpC_03, k_8x8_0, tmpC_07, _MM_PERM_CDAB);
1021  __m512i tmpD_04 = _mm512_mask_permute4f128_epi32(tmpC_04, k_8x8_1, tmpC_00, _MM_PERM_CDAB);
1022  __m512i tmpD_05 = _mm512_mask_permute4f128_epi32(tmpC_05, k_8x8_1, tmpC_01, _MM_PERM_CDAB);
1023  __m512i tmpD_06 = _mm512_mask_permute4f128_epi32(tmpC_06, k_8x8_1, tmpC_02, _MM_PERM_CDAB);
1024  __m512i tmpD_07 = _mm512_mask_permute4f128_epi32(tmpC_07, k_8x8_1, tmpC_03, _MM_PERM_CDAB);
1025  __m512i tmpD_08 = _mm512_mask_permute4f128_epi32(tmpC_08, k_8x8_0, tmpC_12, _MM_PERM_CDAB);
1026  __m512i tmpD_09 = _mm512_mask_permute4f128_epi32(tmpC_09, k_8x8_0, tmpC_13, _MM_PERM_CDAB);
1027  __m512i tmpD_10 = _mm512_mask_permute4f128_epi32(tmpC_10, k_8x8_0, tmpC_14, _MM_PERM_CDAB);
1028  __m512i tmpD_11 = _mm512_mask_permute4f128_epi32(tmpC_11, k_8x8_0, tmpC_15, _MM_PERM_CDAB);
1029  __m512i tmpD_12 = _mm512_mask_permute4f128_epi32(tmpC_12, k_8x8_1, tmpC_08, _MM_PERM_CDAB);
1030  __m512i tmpD_13 = _mm512_mask_permute4f128_epi32(tmpC_13, k_8x8_1, tmpC_09, _MM_PERM_CDAB);
1031  __m512i tmpD_14 = _mm512_mask_permute4f128_epi32(tmpC_14, k_8x8_1, tmpC_10, _MM_PERM_CDAB);
1032  __m512i tmpD_15 = _mm512_mask_permute4f128_epi32(tmpC_15, k_8x8_1, tmpC_11, _MM_PERM_CDAB);
1033  __mmask16 k_16x16_0 = _mm512_int2mask(0xFF00);
1034  __mmask16 k_16x16_1 = _mm512_int2mask(0x00FF);
1035  table_four_i_0_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_00, k_16x16_0, tmpD_08, _MM_PERM_BADC));
1036  table_four_i_1_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_01, k_16x16_0, tmpD_09, _MM_PERM_BADC));
1037  table_four_i_2_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_02, k_16x16_0, tmpD_10, _MM_PERM_BADC));
1038  table_four_i_3_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_03, k_16x16_0, tmpD_11, _MM_PERM_BADC));
1039  table_four_i_4_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_04, k_16x16_0, tmpD_12, _MM_PERM_BADC));
1040  table_four_i_5_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_05, k_16x16_0, tmpD_13, _MM_PERM_BADC));
1041  table_four_i_6_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_06, k_16x16_0, tmpD_14, _MM_PERM_BADC));
1042  table_four_i_7_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_07, k_16x16_0, tmpD_15, _MM_PERM_BADC));
1043  table_four_i_8_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_08, k_16x16_1, tmpD_00, _MM_PERM_BADC));
1044  table_four_i_9_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_09, k_16x16_1, tmpD_01, _MM_PERM_BADC));
1045  table_four_i_10_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_10, k_16x16_1, tmpD_02, _MM_PERM_BADC));
1046  table_four_i_11_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_11, k_16x16_1, tmpD_03, _MM_PERM_BADC));
1047  table_four_i_12_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_12, k_16x16_1, tmpD_04, _MM_PERM_BADC));
1048  table_four_i_13_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_13, k_16x16_1, tmpD_05, _MM_PERM_BADC));
1049  table_four_i_14_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_14, k_16x16_1, tmpD_06, _MM_PERM_BADC));
1050  table_four_i_15_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_15, k_16x16_1, tmpD_07, _MM_PERM_BADC));
1051 
1052  #endif
1053  }
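  /* Illustrative sketch (not part of the original kernel): scalar equivalent of the table_four
   * load/transpose above.  Each lane reads one 16-value table entry, and field k of every lane's
   * entry ends up in table_four_i_<k>_vec; "fields" is a hypothetical 16x16 scratch array
   * standing in for those registers.
   */
  #if 0
  float fields[16][16];
  for (int lane = 0; lane < 16; lane++) {
    const CALC_TYPE * entry = table_four_base + table_four_i_offsets[lane];
    for (int k = 0; k < 16; k++) {
      fields[k][lane] = entry[k];   // fields[k][*] corresponds to table_four_i_<k>_vec
    }
  }
  #endif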
1054 
1055  #if (0 FAST(+1))
1056 
1057  __m512 vdw_d_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_0_vec), _mm512_mul_ps(B_vec, table_four_i_4_vec));
1058  __m512 vdw_c_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_1_vec), _mm512_mul_ps(B_vec, table_four_i_5_vec));
1059  __m512 vdw_b_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_2_vec), _mm512_mul_ps(B_vec, table_four_i_6_vec));
1060  __m512 vdw_a_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_3_vec), _mm512_mul_ps(B_vec, table_four_i_7_vec));
1061 
1062  #if (0 ENERGY(+1))
1063  __m512 vdw_val_tmp_0_vec = _mm512_mul_ps(vdw_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
1064  __m512 vdw_val_tmp_1_vec = _mm512_mul_ps(vdw_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
1065  __m512 vdw_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_0_vec), vdw_val_tmp_1_vec);
1066  __m512 vdw_val_tmp_3_vec = _mm512_mul_ps(vdw_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
1067  __m512 vdw_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_2_vec), vdw_val_tmp_3_vec);
1068  __m512 vdw_val_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_4_vec), vdw_a_vec);
1069  CONTRIB_SUB_PS2PD(vdwEnergy_vec, vdw_val_vec);
1070  #endif
1071 
1072  #if (0 SHORT(+1))
1073 
1074  #if (0 NORMAL(+1))
1075  __m512 fast_d_vec = _mm512_mul_ps(kqq_vec, table_four_i_8_vec);
1076  __m512 fast_c_vec = _mm512_mul_ps(kqq_vec, table_four_i_9_vec);
1077  __m512 fast_b_vec = _mm512_mul_ps(kqq_vec, table_four_i_10_vec);
1078  __m512 fast_a_vec = _mm512_mul_ps(kqq_vec, table_four_i_11_vec);
1079  #endif
1080  #if (0 MODIFIED(+1))
1081  __m512 modfckqq_vec = _mm512_mul_ps(_mm512_set_1to16_ps(1.0f - modf_mod), kqq_vec);
1082  __m512 fast_d_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_8_vec);
1083  __m512 fast_c_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_9_vec);
1084  __m512 fast_b_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_10_vec);
1085  __m512 fast_a_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_11_vec);
1086  #endif
1087 
1088  #if (0 ENERGY(+1))
1089  __m512 fast_val_tmp_0_vec = _mm512_mul_ps(fast_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
1090  __m512 fast_val_tmp_1_vec = _mm512_mul_ps(fast_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
1091  __m512 fast_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_0_vec), fast_val_tmp_1_vec);
1092  __m512 fast_val_tmp_3_vec = _mm512_mul_ps(fast_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
1093  __m512 fast_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_2_vec), fast_val_tmp_3_vec);
1094  __m512 fast_val_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_4_vec), fast_a_vec);
1095  CONTRIB_SUB_PS2PD(electEnergy_vec, fast_val_vec);
1096  #endif
1097 
1098  #if (0 NOT_ALCHPAIR(+1))
1099  fast_d_vec = _mm512_add_ps(fast_d_vec, vdw_d_vec);
1100  fast_c_vec = _mm512_add_ps(fast_c_vec, vdw_c_vec);
1101  fast_b_vec = _mm512_add_ps(fast_b_vec, vdw_b_vec);
1102  fast_a_vec = _mm512_add_ps(fast_a_vec, vdw_a_vec);
1103  #endif
1104 
1105  __m512 fast_dir_tmp_0_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_d_vec), fast_c_vec);
1106  __m512 fast_dir_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_dir_tmp_0_vec), fast_b_vec);
1107  __m512 force_r_vec = fast_dir_vec; // NOTE: No-op left in as placeholder for when LAM is added
1108 
1109  const __m512 tmp_x_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_x_vec);
1110  PAIR( CONTRIB_ADD_PS2PD(virial_xx_vec, _mm512_mul_ps(tmp_x_vec, p_ij_x_vec)); )
1111  PAIR( CONTRIB_ADD_PS2PD(virial_xy_vec, _mm512_mul_ps(tmp_x_vec, p_ij_y_vec)); )
1112  PAIR( CONTRIB_ADD_PS2PD(virial_xz_vec, _mm512_mul_ps(tmp_x_vec, p_ij_z_vec)); )
1113 
1114  const __m512 tmp_y_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_y_vec);
1115  PAIR( CONTRIB_ADD_PS2PD(virial_yy_vec, _mm512_mul_ps(tmp_y_vec, p_ij_y_vec)); )
1116  PAIR( CONTRIB_ADD_PS2PD(virial_yz_vec, _mm512_mul_ps(tmp_y_vec, p_ij_z_vec)); )
1117 
1118  const __m512 tmp_z_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_z_vec);
1119  PAIR( CONTRIB_ADD_PS2PD(virial_zz_vec, _mm512_mul_ps(tmp_z_vec, p_ij_z_vec)); )
1120 
1121  #if MIC_HANDCODE_FORCE_COMBINE_FORCES == 0
1122  APPLY_FORCES_PS2PD(tmp_x_vec, tmp_y_vec, tmp_z_vec, f_0_x, f_0_y, f_0_z, f_1_x, f_1_y, f_1_z, tmpI32, tmpJ32);
1123  #endif
1124 
1125  #endif // SHORT
1126  #endif // FAST
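  /* Illustrative sketch (not part of the original kernel): scalar form of the spline evaluation
   * used above.  (a, b, c, d) stand for the per-lane values of the *_a/_b/_c/_d coefficient
   * vectors and diffa for the per-lane interpolation offset; both terms are evaluated by
   * Horner's rule.
   */
  #if 0
  float energy = ((diffa * (d * (1.0f/6.0f)) + c * (1.0f/4.0f)) * diffa
                  + b * (1.0f/2.0f)) * diffa + a;   // accumulated via CONTRIB_SUB_PS2PD
  float dir    = (diffa * d + c) * diffa + b;        // force_r; scales p_ij_x/y/z to give the force components
  #endif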
1127 
1128  #if (0 FULL(+1))
1129 
1130  // Load the slow_table values, if need be
1131  #if (0 SHORT( EXCLUDED(+1) MODIFIED(+1) ))
1132 
1133  __m512 slow_i_0_vec, slow_i_1_vec, slow_i_2_vec, slow_i_3_vec;
1134  {
1135  // Create the indexes
1136  unsigned int slow_i_offsets[16] __attribute__((aligned(64)));
1137  _mm512_store_epi32(slow_i_offsets, _mm512_slli_epi32(table_i_vec, 2)); // table_i * 4
1138 
1139  // Load the values (64 values, 16x4 matrix, so we only care about elements 0-3)
1140  // DMK - NOTE : Since the slow table is aligned, all four elements being loaded by the loadunpacklo
1141  // will be grouped on a single cacheline, so no need to also include a loadunpackhi instruction
1142  #define __LOAD_SLOW(dstA, offset) \
1143  __m512i (dstA) = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x000F), slow_table + (offset));
1144  __LOAD_SLOW(slow_i_tmp00_vec, slow_i_offsets[ 0]);
1145  __LOAD_SLOW(slow_i_tmp01_vec, slow_i_offsets[ 1]);
1146  __LOAD_SLOW(slow_i_tmp02_vec, slow_i_offsets[ 2]);
1147  __LOAD_SLOW(slow_i_tmp03_vec, slow_i_offsets[ 3]);
1148  __LOAD_SLOW(slow_i_tmp04_vec, slow_i_offsets[ 4]);
1149  __LOAD_SLOW(slow_i_tmp05_vec, slow_i_offsets[ 5]);
1150  __LOAD_SLOW(slow_i_tmp06_vec, slow_i_offsets[ 6]);
1151  __LOAD_SLOW(slow_i_tmp07_vec, slow_i_offsets[ 7]);
1152  __LOAD_SLOW(slow_i_tmp08_vec, slow_i_offsets[ 8]);
1153  __LOAD_SLOW(slow_i_tmp09_vec, slow_i_offsets[ 9]);
1154  __LOAD_SLOW(slow_i_tmp10_vec, slow_i_offsets[10]);
1155  __LOAD_SLOW(slow_i_tmp11_vec, slow_i_offsets[11]);
1156  __LOAD_SLOW(slow_i_tmp12_vec, slow_i_offsets[12]);
1157  __LOAD_SLOW(slow_i_tmp13_vec, slow_i_offsets[13]);
1158  __LOAD_SLOW(slow_i_tmp14_vec, slow_i_offsets[14]);
1159  __LOAD_SLOW(slow_i_tmp15_vec, slow_i_offsets[15]);
1160  #undef __LOAD_SLOW
1161 
1162  // Transpose the values
1163  // NOTE : Transpose each 2x2 sub-matrix, but only elements 0-3 have data we care about
1164  #define __TRANSPOSE_2x2__(dstA, dstB, srcA, srcB) \
1165  __m512i dstA = _mm512_mask_swizzle_epi32((srcA), _mm512_int2mask(0xAAAA), (srcB), _MM_SWIZ_REG_CDAB); \
1166  __m512i dstB = _mm512_mask_swizzle_epi32((srcB), _mm512_int2mask(0x5555), (srcA), _MM_SWIZ_REG_CDAB);
1167  __TRANSPOSE_2x2__(slow_2x2_tmp00_vec, slow_2x2_tmp01_vec, slow_i_tmp00_vec, slow_i_tmp01_vec);
1168  __TRANSPOSE_2x2__(slow_2x2_tmp02_vec, slow_2x2_tmp03_vec, slow_i_tmp02_vec, slow_i_tmp03_vec);
1169  __TRANSPOSE_2x2__(slow_2x2_tmp04_vec, slow_2x2_tmp05_vec, slow_i_tmp04_vec, slow_i_tmp05_vec);
1170  __TRANSPOSE_2x2__(slow_2x2_tmp06_vec, slow_2x2_tmp07_vec, slow_i_tmp06_vec, slow_i_tmp07_vec);
1171  __TRANSPOSE_2x2__(slow_2x2_tmp08_vec, slow_2x2_tmp09_vec, slow_i_tmp08_vec, slow_i_tmp09_vec);
1172  __TRANSPOSE_2x2__(slow_2x2_tmp10_vec, slow_2x2_tmp11_vec, slow_i_tmp10_vec, slow_i_tmp11_vec);
1173  __TRANSPOSE_2x2__(slow_2x2_tmp12_vec, slow_2x2_tmp13_vec, slow_i_tmp12_vec, slow_i_tmp13_vec);
1174  __TRANSPOSE_2x2__(slow_2x2_tmp14_vec, slow_2x2_tmp15_vec, slow_i_tmp14_vec, slow_i_tmp15_vec);
1175  #undef __TRANSPOSE_2x2__
1176  #define __TRANSPOSE_4x4__(dstA, dstB, dstC, dstD, srcA, srcB, srcC, srcD) \
1177  __m512i dstA = _mm512_mask_swizzle_epi32((srcA), _mm512_int2mask(0xCCCC), (srcC), _MM_SWIZ_REG_BADC); \
1178  __m512i dstB = _mm512_mask_swizzle_epi32((srcB), _mm512_int2mask(0xCCCC), (srcD), _MM_SWIZ_REG_BADC); \
1179  __m512i dstC = _mm512_mask_swizzle_epi32((srcC), _mm512_int2mask(0x3333), (srcA), _MM_SWIZ_REG_BADC); \
1180  __m512i dstD = _mm512_mask_swizzle_epi32((srcD), _mm512_int2mask(0x3333), (srcB), _MM_SWIZ_REG_BADC);
1181  __TRANSPOSE_4x4__(slow_4x4_tmp00_vec, slow_4x4_tmp01_vec, slow_4x4_tmp02_vec, slow_4x4_tmp03_vec, slow_2x2_tmp00_vec, slow_2x2_tmp01_vec, slow_2x2_tmp02_vec, slow_2x2_tmp03_vec);
1182  __TRANSPOSE_4x4__(slow_4x4_tmp04_vec, slow_4x4_tmp05_vec, slow_4x4_tmp06_vec, slow_4x4_tmp07_vec, slow_2x2_tmp04_vec, slow_2x2_tmp05_vec, slow_2x2_tmp06_vec, slow_2x2_tmp07_vec);
1183  __TRANSPOSE_4x4__(slow_4x4_tmp08_vec, slow_4x4_tmp09_vec, slow_4x4_tmp10_vec, slow_4x4_tmp11_vec, slow_2x2_tmp08_vec, slow_2x2_tmp09_vec, slow_2x2_tmp10_vec, slow_2x2_tmp11_vec);
1184  __TRANSPOSE_4x4__(slow_4x4_tmp12_vec, slow_4x4_tmp13_vec, slow_4x4_tmp14_vec, slow_4x4_tmp15_vec, slow_2x2_tmp12_vec, slow_2x2_tmp13_vec, slow_2x2_tmp14_vec, slow_2x2_tmp15_vec);
1185  #undef __TRANSPOSE_4x4__
1186  __m512i slow_4x8_tmp00_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp00_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp04_vec, _MM_PERM_CDAB);
1187  __m512i slow_4x8_tmp01_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp01_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp05_vec, _MM_PERM_CDAB);
1188  __m512i slow_4x8_tmp02_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp02_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp06_vec, _MM_PERM_CDAB);
1189  __m512i slow_4x8_tmp03_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp03_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp07_vec, _MM_PERM_CDAB);
1190  __m512i slow_4x8_tmp04_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp08_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp12_vec, _MM_PERM_CDAB);
1191  __m512i slow_4x8_tmp05_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp09_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp13_vec, _MM_PERM_CDAB);
1192  __m512i slow_4x8_tmp06_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp10_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp14_vec, _MM_PERM_CDAB);
1193  __m512i slow_4x8_tmp07_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp11_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp15_vec, _MM_PERM_CDAB);
1194  slow_i_0_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp00_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp04_vec, _MM_PERM_BADC));
1195  slow_i_1_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp01_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp05_vec, _MM_PERM_BADC));
1196  slow_i_2_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp02_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp06_vec, _MM_PERM_BADC));
1197  slow_i_3_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp03_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp07_vec, _MM_PERM_BADC));
1198  }
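  /* Illustrative sketch (not part of the original kernel): scalar equivalent of the slow_table
   * load/transpose above, assuming 4 single-precision values per table entry (as the *4 offset
   * above suggests).  slow0..slow3 are hypothetical scratch arrays standing in for the lanes of
   * slow_i_0_vec .. slow_i_3_vec.
   */
  #if 0
  float slow0[16], slow1[16], slow2[16], slow3[16];
  for (int lane = 0; lane < 16; lane++) {
    const float * entry = slow_table + slow_i_offsets[lane];
    slow0[lane] = entry[0];
    slow1[lane] = entry[1];
    slow2[lane] = entry[2];
    slow3[lane] = entry[3];
  }
  #endif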
1199 
1200  #endif // SHORT && (EXCLUDED || MODIFIED)
1201 
1202  __m512 slow_d_vec = NOSHORT( table_four_i_8_vec) SHORT(table_four_i_12_vec);
1203  __m512 slow_c_vec = NOSHORT( table_four_i_9_vec) SHORT(table_four_i_13_vec);
1204  __m512 slow_b_vec = NOSHORT(table_four_i_10_vec) SHORT(table_four_i_14_vec);
1205  __m512 slow_a_vec = NOSHORT(table_four_i_11_vec) SHORT(table_four_i_15_vec);
1206 
1207  #if (0 EXCLUDED(+1))
1208  #if (0 SHORT(+1))
1209  slow_a_vec = _mm512_add_ps( slow_i_3_vec , slow_a_vec);
1210  slow_b_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(2.0f), slow_i_2_vec), slow_b_vec);
1211  slow_c_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(4.0f), slow_i_1_vec), slow_c_vec);
1212  slow_d_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(6.0f), slow_i_0_vec), slow_d_vec);
1213  #endif
1214  #if (0 NOSHORT(+1))
1215  slow_d_vec = _mm512_sub_ps(slow_d_vec, table_four_i_12_vec);
1216  slow_c_vec = _mm512_sub_ps(slow_c_vec, table_four_i_13_vec);
1217  slow_b_vec = _mm512_sub_ps(slow_b_vec, table_four_i_14_vec);
1218  slow_a_vec = _mm512_sub_ps(slow_a_vec, table_four_i_15_vec);
1219  #endif
1220  #endif // EXCLUDED
1221  #if (0 MODIFIED(+1))
1222  #if (0 SHORT(+1))
1223  slow_a_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps( modf_mod), slow_i_3_vec), slow_a_vec);
1224  slow_b_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(2.0f * modf_mod), slow_i_2_vec), slow_b_vec);
1225  slow_c_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(4.0f * modf_mod), slow_i_1_vec), slow_c_vec);
1226  slow_d_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(6.0f * modf_mod), slow_i_0_vec), slow_d_vec);
1227  #endif
1228  #if (0 NOSHORT(+1))
1229  __m512 modf_mod_vec = _mm512_set_1to16_ps(modf_mod);
1230  slow_d_vec = _mm512_sub_ps(slow_d_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_12_vec));
1231  slow_c_vec = _mm512_sub_ps(slow_c_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_13_vec));
1232  slow_b_vec = _mm512_sub_ps(slow_b_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_14_vec));
1233  slow_a_vec = _mm512_sub_ps(slow_a_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_15_vec));
1234  #endif
1235  #endif // MODIFIED
1236  slow_d_vec = _mm512_mul_ps(slow_d_vec, kqq_vec);
1237  slow_c_vec = _mm512_mul_ps(slow_c_vec, kqq_vec);
1238  slow_b_vec = _mm512_mul_ps(slow_b_vec, kqq_vec);
1239  slow_a_vec = _mm512_mul_ps(slow_a_vec, kqq_vec);
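  /* Illustrative sketch (not part of the original kernel): scalar form of the slow coefficient
   * assembly above for one lane of the SHORT+EXCLUDED case.  d, c, b, a stand for the per-lane
   * table_four values (entries 12-15) and slow0..slow3 for the slow_table values loaded earlier.
   */
  #if 0
  float sd = d + 6.0f * slow0;   // for MODIFIED, the slow* terms are scaled by modf_mod instead
  float sc = c + 4.0f * slow1;
  float sb = b + 2.0f * slow2;
  float sa = a +        slow3;
  sd *= kqq;  sc *= kqq;  sb *= kqq;  sa *= kqq;   // kqq = p_i_q * p_j_q
  #endif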
1240 
1241  #if (0 ENERGY(+1))
1242  __m512 slow_val_tmp_0_vec = _mm512_mul_ps(slow_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
1243  __m512 slow_val_tmp_1_vec = _mm512_mul_ps(slow_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
1244  __m512 slow_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_0_vec), slow_val_tmp_1_vec);
1245  __m512 slow_val_tmp_3_vec = _mm512_mul_ps(slow_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
1246  __m512 slow_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_2_vec), slow_val_tmp_3_vec);
1247  __m512 slow_val_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_4_vec), slow_a_vec);
1248  CONTRIB_SUB_PS2PD(fullElectEnergy_vec, slow_val_vec);
1249  #endif
1250 
1251  #if (0 NOT_ALCHPAIR(FAST(NOSHORT(+1))))
1252  slow_d_vec = _mm512_add_ps(vdw_d_vec, slow_d_vec);
1253  slow_c_vec = _mm512_add_ps(vdw_c_vec, slow_c_vec);
1254  slow_b_vec = _mm512_add_ps(vdw_b_vec, slow_b_vec);
1255  slow_a_vec = _mm512_add_ps(vdw_a_vec, slow_a_vec);
1256  #endif
1257 
1258  __m512 slow_dir_tmp_0_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_d_vec), slow_c_vec);
1259  __m512 slow_dir_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_dir_tmp_0_vec), slow_b_vec);
1260  __m512 fullforce_r_vec = slow_dir_vec; // NOTE: No-op left in as a placeholder for when LAM is added
1261 
1262  const __m512 fulltmp_x_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_x_vec);
1263  PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xx_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_x_vec)); )
1264  PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xy_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_y_vec)); )
1265  PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xz_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_z_vec)); )
1266 
1267  const __m512 fulltmp_y_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_y_vec);
1268  PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_yy_vec, _mm512_mul_ps(fulltmp_y_vec, p_ij_y_vec)); )
1269  PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_yz_vec, _mm512_mul_ps(fulltmp_y_vec, p_ij_z_vec)); )
1270 
1271  const __m512 fulltmp_z_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_z_vec);
1272  PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_zz_vec, _mm512_mul_ps(fulltmp_z_vec, p_ij_z_vec)); )
1273 
1274  #if MIC_HANDCODE_FORCE_COMBINE_FORCES == 0
1275  APPLY_FORCES_PS2PD(fulltmp_x_vec, fulltmp_y_vec, fulltmp_z_vec, fullf_0_x, fullf_0_y, fullf_0_z, fullf_1_x, fullf_1_y, fullf_1_z, tmpI32, tmpJ32);
1276  #endif
1277 
1278  #endif // FULL
1279 
1280  #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
1281  APPLY_FORCES_PS2PD;
1282  #endif
1283 
1284  // DMK - DEBUG - LARGE FORCE DETECTION
1285  #if (MIC_DATA_STRUCT_VERIFY != 0) && (MIC_DATA_STRUCT_VERIFY_KERNELS != 0)
1286  const float lfd_const = 3.2e2f; // Set to 5.0e1 to print a few per timestep for ApoA1
1287  __mmask16 lfd_mask = _mm512_int2mask(0x0000);
1288  #define LFD_CHECK(v) _mm512_mask_cmplt_ps_mask(cutoff_mask, _mm512_set_1to16_ps(lfd_const), _mm512_mask_mul_ps(v, _mm512_cmplt_ps_mask(v, _mm512_setzero_ps()), v, _mm512_set_1to16_ps(-1.0f)))
1289  #if (0 FAST(SHORT(+1)))
1290  lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_x_vec));
1291  lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_y_vec));
1292  lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_z_vec));
1293  #endif
1294  #if (0 FULL(+1))
1295  lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_x_vec));
1296  lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_y_vec));
1297  lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_z_vec));
1298  #endif
1299  #undef LFD_CHECK
1300  #pragma unroll(16)
1301  for (int _k = 0; _k < 16; _k++) {
1302  lfd_mask = _mm512_kor(lfd_mask, _mm512_int2mask(((pExt_0[tmpI32[_k]].index == 53839 && pExt_1[tmpJ32[_k]].index == 53827) ? (0x1) : (0x0)) << _k));
1303  }
1304  if (_mm512_mask2int(lfd_mask) != 0) {
1305 
1306  float tmp_force[16] __attribute__((aligned(64)));
1307  float tmp_fullforce[16] __attribute__((aligned(64)));
1308  float tmp_ijx[16] __attribute__((aligned(64)));
1309  float tmp_ijy[16] __attribute__((aligned(64)));
1310  float tmp_ijz[16] __attribute__((aligned(64)));
1311  float tmp_x[16] __attribute__((aligned(64)));
1312  float tmp_y[16] __attribute__((aligned(64)));
1313  float tmp_z[16] __attribute__((aligned(64)));
1314  float fulltmp_x[16] __attribute__((aligned(64)));
1315  float fulltmp_y[16] __attribute__((aligned(64)));
1316  float fulltmp_z[16] __attribute__((aligned(64)));
1317  float pi_x[16] __attribute__((aligned(64)));
1318  float pi_y[16] __attribute__((aligned(64)));
1319  float pi_z[16] __attribute__((aligned(64)));
1320  float pi_q[16] __attribute__((aligned(64)));
1321  float pj_x[16] __attribute__((aligned(64)));
1322  float pj_y[16] __attribute__((aligned(64)));
1323  float pj_z[16] __attribute__((aligned(64)));
1324  float pj_q[16] __attribute__((aligned(64)));
1325  float r2[16] __attribute__((aligned(64)));
1326  int table_i[16] __attribute__((aligned(64)));
1327  float kqq[16] __attribute__((aligned(64)));
1328  float diffa[16] __attribute__((aligned(64)));
1329  float vdw_a[16] __attribute__((aligned(64)));
1330  float vdw_b[16] __attribute__((aligned(64)));
1331  float vdw_c[16] __attribute__((aligned(64)));
1332  float vdw_d[16] __attribute__((aligned(64)));
1333  float fast_a[16] __attribute__((aligned(64)));
1334  float fast_b[16] __attribute__((aligned(64)));
1335  float fast_c[16] __attribute__((aligned(64)));
1336  float fast_d[16] __attribute__((aligned(64)));
1337  float A[16] __attribute__((aligned(64)));
1338  float B[16] __attribute__((aligned(64)));
1339  float table_four_i_0[16] __attribute__((aligned(64)));
1340  float table_four_i_1[16] __attribute__((aligned(64)));
1341  float table_four_i_2[16] __attribute__((aligned(64)));
1342  float table_four_i_3[16] __attribute__((aligned(64)));
1343  float table_four_i_4[16] __attribute__((aligned(64)));
1344  float table_four_i_5[16] __attribute__((aligned(64)));
1345  float table_four_i_6[16] __attribute__((aligned(64)));
1346  float table_four_i_7[16] __attribute__((aligned(64)));
1347  int i_vdwType[16] __attribute__((aligned(64)));
1348  int j_vdwType[16] __attribute__((aligned(64)));
1349 
1350  _mm512_store_ps(pi_x, p_i_x_vec);
1351  _mm512_store_ps(pi_y, p_i_y_vec);
1352  _mm512_store_ps(pi_z, p_i_z_vec);
1353  _mm512_store_ps(pi_q, p_i_q_vec);
1354  _mm512_store_ps(pj_x, p_j_x_vec);
1355  _mm512_store_ps(pj_y, p_j_y_vec);
1356  _mm512_store_ps(pj_z, p_j_z_vec);
1357  _mm512_store_ps(pj_q, p_j_q_vec);
1358  _mm512_store_ps(r2, r2_vec);
1359  _mm512_store_epi32(table_i, table_i_vec);
1360  _mm512_store_ps(kqq, kqq_vec);
1361  _mm512_store_ps(diffa, diffa_vec);
1362  _mm512_store_ps(table_four_i_0, table_four_i_0_vec);
1363  _mm512_store_ps(table_four_i_1, table_four_i_1_vec);
1364  _mm512_store_ps(table_four_i_2, table_four_i_2_vec);
1365  _mm512_store_ps(table_four_i_3, table_four_i_3_vec);
1366  _mm512_store_ps(table_four_i_4, table_four_i_4_vec);
1367  _mm512_store_ps(table_four_i_5, table_four_i_5_vec);
1368  _mm512_store_ps(table_four_i_6, table_four_i_6_vec);
1369  _mm512_store_ps(table_four_i_7, table_four_i_7_vec);
1370  #if (0 FAST(+1))
1371  _mm512_store_ps(vdw_a, vdw_a_vec);
1372  _mm512_store_ps(vdw_b, vdw_b_vec);
1373  _mm512_store_ps(vdw_c, vdw_c_vec);
1374  _mm512_store_ps(vdw_d, vdw_d_vec);
1375  _mm512_store_ps(A, A_vec);
1376  _mm512_store_ps(B, B_vec);
1377  _mm512_store_epi64(i_vdwType, pExt_i_vdwType_vec);
1378  _mm512_store_epi64(j_vdwType, pExt_j_vdwType_vec);
1379  #else
1380  _mm512_store_ps(vdw_a, _mm512_setzero_ps());
1381  _mm512_store_ps(vdw_b, _mm512_setzero_ps());
1382  _mm512_store_ps(vdw_c, _mm512_setzero_ps());
1383  _mm512_store_ps(vdw_d, _mm512_setzero_ps());
1384  _mm512_store_ps(A, _mm512_setzero_ps());
1385  _mm512_store_ps(B, _mm512_setzero_ps());
1386  _mm512_store_epi64(i_vdwType, _mm512_setzero_epi32());
1387  _mm512_store_epi64(j_vdwType, _mm512_setzero_epi32());
1388  #endif
1389  #if (0 FAST(SHORT(+1)))
1390  _mm512_store_ps(tmp_force, force_r_vec);
1391  _mm512_store_ps(tmp_x, tmp_x_vec);
1392  _mm512_store_ps(tmp_y, tmp_y_vec);
1393  _mm512_store_ps(tmp_z, tmp_z_vec);
1394  _mm512_store_ps(fast_a, fast_a_vec);
1395  _mm512_store_ps(fast_b, fast_b_vec);
1396  _mm512_store_ps(fast_c, fast_c_vec);
1397  _mm512_store_ps(fast_d, fast_d_vec);
1398  #else
1399  _mm512_store_ps(tmp_force, _mm512_setzero_ps());
1400  _mm512_store_ps(tmp_x, _mm512_setzero_ps());
1401  _mm512_store_ps(tmp_y, _mm512_setzero_ps());
1402  _mm512_store_ps(tmp_z, _mm512_setzero_ps());
1403  _mm512_store_ps(fast_a, _mm512_setzero_ps());
1404  _mm512_store_ps(fast_b, _mm512_setzero_ps());
1405  _mm512_store_ps(fast_c, _mm512_setzero_ps());
1406  _mm512_store_ps(fast_d, _mm512_setzero_ps());
1407  #endif
1408  #if (0 FULL(+1))
1409  _mm512_store_ps(tmp_fullforce, fullforce_r_vec);
1410  _mm512_store_ps(fulltmp_x, fulltmp_x_vec);
1411  _mm512_store_ps(fulltmp_y, fulltmp_y_vec);
1412  _mm512_store_ps(fulltmp_z, fulltmp_z_vec);
1413  #else
1414  _mm512_store_ps(tmp_fullforce, _mm512_setzero_ps());
1415  _mm512_store_ps(fulltmp_x, _mm512_setzero_ps());
1416  _mm512_store_ps(fulltmp_y, _mm512_setzero_ps());
1417  _mm512_store_ps(fulltmp_z, _mm512_setzero_ps());
1418  #endif
1419  _mm512_store_ps(tmp_ijx, p_ij_x_vec);
1420  _mm512_store_ps(tmp_ijy, p_ij_y_vec);
1421  _mm512_store_ps(tmp_ijz, p_ij_z_vec);
1422 
1423  #define PRNT(idx) \
1424  if (_mm512_mask2int(lfd_mask) & (1 << (idx))) { \
1425  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : tmp x:%+.3e, y:%+.3e, z:%+.3e : fulltmp x:%+.3e, y:%+.3e, z:%+.3e\n", \
1426  params.pe, params.timestep, \
1427  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1428  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1429  tmp_x[idx], tmp_y[idx], tmp_z[idx], \
1430  fulltmp_x[idx], fulltmp_y[idx], fulltmp_z[idx] \
1431  ); \
1432  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : calc_" SELF("self") PAIR("pair") FULL("_fullelect") ENERGY("_energy") "." NORMAL("NORM") MODIFIED("MOD") EXCLUDED("EXCL") "\n", \
1433  params.pe, params.timestep, \
1434  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1435  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]) \
1436  ); \
1437  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : i:%d, j:%d, i_index:%d, j_index:%d\n", \
1438  params.pe, params.timestep, \
1439  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1440  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1441  (int)(tmpI32[idx]), (int)(tmpJ32[idx]), pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index \
1442  ); \
1443  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : p_i x:%+.3e, y:%+.3e, z:%+.3e, q:%+.3e : p_j x:%+.3e, y:%+.3e, z:%+.3e, q:%+.3e : r2 %+.3e\n", \
1444  params.pe, params.timestep, \
1445  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1446  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1447  pi_x[idx], pi_y[idx], pi_z[idx], pi_q[idx], \
1448  pj_x[idx], pj_y[idx], pj_z[idx], pj_q[idx], \
1449  r2[idx] \
1450  ); \
1451  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : offset x:%+.3e, y:%+.3e, z:%+.3e\n", \
1452  params.pe, params.timestep, \
1453  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1454  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1455  offset_x, offset_y, offset_z \
1456  ); \
1457  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : forces f&s:%+.3e, full:%+.3e : p_ij x:%+.3e, y:%+.3e, z:%+.3e\n", \
1458  params.pe, params.timestep, \
1459  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1460  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1461  tmp_force[idx], tmp_fullforce[idx], \
1462  tmp_ijx[idx], tmp_ijy[idx], tmp_ijz[idx] \
1463  ); \
1464  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : table_i:%d, kqq::%+.3e, diffa:%+.3e\n", \
1465  params.pe, params.timestep, \
1466  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1467  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1468  table_i[idx], kqq[idx], diffa[idx] \
1469  ); \
1470  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : vdw a:%+.3e, b:%+.3e, c:%+.3e, d:%+.3e : fast a:%+.3e, b:%+.3e, c:%+.3e, d:%+.3e, \n", \
1471  params.pe, params.timestep, \
1472  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1473  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1474  vdw_a[idx], vdw_b[idx], vdw_c[idx], vdw_d[idx], \
1475  fast_a[idx], fast_b[idx], fast_c[idx], fast_d[idx] \
1476  ); \
1477  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : A:%+.3e, B:%+.3e : vdwType i:%d, j:%d\n", \
1478  params.pe, params.timestep, \
1479  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1480  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1481  A[idx], B[idx], i_vdwType[idx], j_vdwType[idx] \
1482  ); \
1483  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : table_four_i 0:%+.3e, 1:%+.3e, 2:%+.3e, 3:%+.3e, 4:%+.3e, 5:%+.3e, 6:%+.3e, 7:%+.3e\n", \
1484  params.pe, params.timestep, \
1485  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1486  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1487  table_four_i_0[idx], table_four_i_1[idx], table_four_i_2[idx], table_four_i_3[idx], table_four_i_4[idx], table_four_i_5[idx], table_four_i_6[idx], table_four_i_7[idx] \
1488  ); \
1489  int indexDiff = pExt_0[tmpI32[idx]].index - pExt_1[tmpJ32[idx]].index; \
1490  int maxDiff = pExt_1[tmpJ32[idx]].excl_maxdiff; \
1491  int offset = -1, offset_major = -1, offset_minor = -1, flags = 0; \
1492  if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) { \
1493  offset = (2 * indexDiff) + pExt_1[tmpJ32[idx]].excl_index; \
1494  offset_major = offset / (sizeof(unsigned int) * 8); \
1495  offset_minor = offset % (sizeof(unsigned int) * 8); \
1496  flags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03; \
1497  } \
1498  printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : excl: exclIndex:%d, indexDiff:%d, maxDiff:%d, offset:%d, flags:%d\n", \
1499  params.pe, params.timestep, \
1500  pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1501  params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1502  pExt_1[tmpJ32[idx]].excl_index, indexDiff, maxDiff, offset, flags \
1503  ); \
1504  }
1505  PRNT( 0); PRNT( 1); PRNT( 2); PRNT( 3); PRNT( 4); PRNT( 5); PRNT( 6); PRNT( 7);
1506  PRNT( 8); PRNT( 9); PRNT(10); PRNT(11); PRNT(12); PRNT(13); PRNT(14); PRNT(15);
1507  #undef PRNT
1508  } // end if (_mm512_mask2int(lfd_mask) != 0)
1509  #endif // (MIC_DATA_STRUCT_VERIFY != 0) && (MIC_DATA_STRUCT_VERIFY_KERNELS != 0)
1510  }
1511 
1512  FAST(ENERGY( vdwEnergy += _mm512_reduce_add_pd(vdwEnergy_vec); ))
1513  FAST(SHORT(ENERGY( electEnergy += _mm512_reduce_add_pd(electEnergy_vec); )))
1514  FULL(ENERGY( fullElectEnergy += _mm512_reduce_add_pd(fullElectEnergy_vec); ))
1515  #if (0 PAIR(FAST(SHORT(+1))))
1516  virial_xx += _mm512_reduce_add_pd(virial_xx_vec);
1517  virial_xy += _mm512_reduce_add_pd(virial_xy_vec);
1518  virial_xz += _mm512_reduce_add_pd(virial_xz_vec);
1519  virial_yy += _mm512_reduce_add_pd(virial_yy_vec);
1520  virial_yz += _mm512_reduce_add_pd(virial_yz_vec);
1521  virial_zz += _mm512_reduce_add_pd(virial_zz_vec);
1522  #endif
1523  #if (0 PAIR(FULL(+1)))
1524  fullElectVirial_xx += _mm512_reduce_add_pd(fullElectVirial_xx_vec);
1525  fullElectVirial_xy += _mm512_reduce_add_pd(fullElectVirial_xy_vec);
1526  fullElectVirial_xz += _mm512_reduce_add_pd(fullElectVirial_xz_vec);
1527  fullElectVirial_yy += _mm512_reduce_add_pd(fullElectVirial_yy_vec);
1528  fullElectVirial_yz += _mm512_reduce_add_pd(fullElectVirial_yz_vec);
1529  fullElectVirial_zz += _mm512_reduce_add_pd(fullElectVirial_zz_vec);
1530  #endif
1531 
1532  #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
1533  params.exclusionSum += _mm512_reduce_add_epi32(exclusionSum_vec);
1534  #endif
1535 
1536  #undef GATHER_PS_I32_OFFSET
1537  #undef GATHER_PS_I32
1538  #undef GATHER_EPI32_I32_OFFSET
1539  #undef GATHER_EPI32_I32
1540  #undef SCATTER_INC_PD_I32_STEP
1541  #undef SCATTER_INC_PD_I32
1542  #undef SCATTER_INC_PS_I32_STEP
1543  #undef SCATTER_INC_PS_I32
1544  #undef CONTRIB_ADD_PS2PD
1545  #undef CONTRIB_SUB_PS2PD
1546  #undef APPLY_FORCES_PS2PD_STEP_ADD
1547  #undef APPLY_FORCES_PS2PD_STEP_SUB
1548  #undef APPLY_FORCES_PS2PD_STEP_ADD_COMBO
1549  #undef APPLY_FORCES_PS2PD_STEP_SUB_COMBO
1550  #undef APPLY_FORCES_PS2PD_JSTEP
1551  #undef APPLY_FORCES_PS2PD
1552 
1553 #endif // NAMD_MIC