/* GATHER_PS_I32_OFFSET(v, p, i, o):
 * Emulated 16-wide float gather for KNC (no hardware ps gather here):
 * sets lane L of __m512 (v) to (p)[(i)[L] + (o)].
 * Each masked loadunpacklo deposits one element into lane 3/7/11/15 of a
 * 4-lane group (masks k0-k3); the interleaved swizzles (CDAB, BADC, CDAB)
 * rotate previously loaded lanes into place, which is why elements are
 * loaded in the order 13,9,5,1 / 12,8,4,0 / 14,10,6,2 / 15,11,7,3.
 * NOTE(review): assumes (p) is float* and (i) holds >= 16 int indices --
 * confirm at call sites. */
3 #define GATHER_PS_I32_OFFSET(v, p, i, o) \
5 __mmask16 k0 = _mm512_int2mask(0x0008); \
6 __mmask16 k1 = _mm512_int2mask(0x0080); \
7 __mmask16 k2 = _mm512_int2mask(0x0800); \
8 __mmask16 k3 = _mm512_int2mask(0x8000); \
9 (v) = _mm512_mask_loadunpacklo_ps( _mm512_setzero_ps(), k3, &((p)[(i)[13] + (o)])); \
10 (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[ 9] + (o)])); \
11 (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 5] + (o)])); \
12 (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 1] + (o)])); \
13 (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[12] + (o)])); \
14 (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[ 8] + (o)])); \
15 (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 4] + (o)])); \
16 (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 0] + (o)])); \
17 (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_BADC), k3, &((p)[(i)[14] + (o)])); \
18 (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[10] + (o)])); \
19 (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 6] + (o)])); \
20 (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 2] + (o)])); \
21 (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[15] + (o)])); \
22 (v) = _mm512_mask_loadunpacklo_ps( (v), k2, &((p)[(i)[11] + (o)])); \
23 (v) = _mm512_mask_loadunpacklo_ps( (v), k1, &((p)[(i)[ 7] + (o)])); \
24 (v) = _mm512_mask_loadunpacklo_ps( (v), k0, &((p)[(i)[ 3] + (o)])); \
/* Convenience wrapper: float gather with no extra element offset. */
26 #define GATHER_PS_I32(v, p, i) GATHER_PS_I32_OFFSET((v), (p), (i), 0)
/* GATHER_EPI32_I32_OFFSET(v, p, i, o):
 * int32 twin of GATHER_PS_I32_OFFSET: sets lane L of __m512i (v) to
 * (p)[(i)[L] + (o)] using the same masked-loadunpacklo / swizzle dance
 * (same masks, same 13,9,5,1 / 12,8,4,0 / 14,10,6,2 / 15,11,7,3 order).
 * NOTE(review): assumes (p) points to 32-bit integers -- confirm at call
 * sites. */
29 #define GATHER_EPI32_I32_OFFSET(v, p, i, o) \
31 __mmask16 k0 = _mm512_int2mask(0x0008); \
32 __mmask16 k1 = _mm512_int2mask(0x0080); \
33 __mmask16 k2 = _mm512_int2mask(0x0800); \
34 __mmask16 k3 = _mm512_int2mask(0x8000); \
35 (v) = _mm512_mask_loadunpacklo_epi32( _mm512_setzero_epi32(), k3, &((p)[(i)[13] + (o)])); \
36 (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[ 9] + (o)])); \
37 (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 5] + (o)])); \
38 (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 1] + (o)])); \
39 (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[12] + (o)])); \
40 (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[ 8] + (o)])); \
41 (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 4] + (o)])); \
42 (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 0] + (o)])); \
43 (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_BADC), k3, &((p)[(i)[14] + (o)])); \
44 (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[10] + (o)])); \
45 (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 6] + (o)])); \
46 (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 2] + (o)])); \
47 (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[15] + (o)])); \
48 (v) = _mm512_mask_loadunpacklo_epi32( (v), k2, &((p)[(i)[11] + (o)])); \
49 (v) = _mm512_mask_loadunpacklo_epi32( (v), k1, &((p)[(i)[ 7] + (o)])); \
50 (v) = _mm512_mask_loadunpacklo_epi32( (v), k0, &((p)[(i)[ 3] + (o)])); \
/* Convenience wrapper: int32 gather with no extra element offset. */
52 #define GATHER_EPI32_I32(v, p, i) GATHER_EPI32_I32_OFFSET((v), (p), (i), 0)
/* SCATTER_INC_PD_I32_STEP(v, p, i, k, idx, pattern):
 * One element of an emulated double scatter-increment:
 *   (p)[(i)[(idx)]] += (the lane of (v) that (pattern) rotates into the
 *                       single lane selected by mask (k)).
 * Expects a __m512d temporary named `t` to be in scope at the expansion
 * site (see SCATTER_INC_PD_I32).  Sequence: masked loadunpack of the
 * current memory value into t, masked add of the swizzled (v), masked
 * packstore back to the same address.
 * FIX(review): the original expansion was missing a ')' after (pattern),
 * which made `t` a third argument of _mm512_swizzle_pd and left
 * _mm512_mask_add_pd with only three arguments -- it could not compile. */
55 #define SCATTER_INC_PD_I32_STEP(v, p, i, k, idx, pattern) \
56 t = _mm512_mask_loadunpacklo_pd(t, (k), &((p)[(i)[(idx)]])); \
57 t = _mm512_mask_add_pd(t, (k), _mm512_swizzle_pd((v), (pattern)), t); \
58 _mm512_mask_packstorelo_pd(&((p)[(i)[(idx)]]), (k), t);
/* SCATTER_INC_PD_I32(v, p, i):
 * Emulated 8-wide double scatter-increment: (p)[(i)[L]] += lane L of
 * __m512d (v), for L = 0..7.  Masks 0x02/0x20 select one lane in the low
 * and high 4-lane groups; the four swizzle patterns rotate each source
 * lane of the group into that slot in turn.
 * FIX(review): the original expanded the *_PS_* step macro here, applying
 * float (_ps) loadunpack/add/packstore intrinsics to the __m512d
 * temporary `t` -- a type mismatch; the _PD_ step macro is the intended
 * one (it is also the only consumer of the __m512d t declared below). */
60 #define SCATTER_INC_PD_I32(v, p, i) \
62 __mmask16 k0 = _mm512_int2mask(0x02); \
63 __mmask16 k1 = _mm512_int2mask(0x20); \
64 __m512d t = _mm512_setzero_pd(); \
65 SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 0, _MM_SWIZ_REG_CDAB) \
66 SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 1, _MM_SWIZ_REG_DCBA) \
67 SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 2, _MM_SWIZ_REG_DACB) \
68 SCATTER_INC_PD_I32_STEP((v), (p), (i), k0, 3, _MM_SWIZ_REG_BADC) \
69 SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 4, _MM_SWIZ_REG_CDAB) \
70 SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 5, _MM_SWIZ_REG_DCBA) \
71 SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 6, _MM_SWIZ_REG_DACB) \
72 SCATTER_INC_PD_I32_STEP((v), (p), (i), k1, 7, _MM_SWIZ_REG_BADC) \
/* SCATTER_INC_PS_I32_STEP(v, p, i, k, idx, pattern):
 * One element of an emulated float scatter-increment:
 *   (p)[(i)[(idx)]] += (the lane of (v) that (pattern) rotates into the
 *                       single lane selected by mask (k)).
 * Expects a __m512 temporary named `t` in scope at the expansion site
 * (see SCATTER_INC_PS_I32): masked loadunpack, masked add of the swizzled
 * (v), masked packstore back.
 * FIX(review): the original expansion was missing a ')' after (pattern),
 * which made `t` a third argument of _mm512_swizzle_ps and left
 * _mm512_mask_add_ps with only three arguments -- it could not compile. */
76 #define SCATTER_INC_PS_I32_STEP(v, p, i, k, idx, pattern) \
77 t = _mm512_mask_loadunpacklo_ps(t, (k), &((p)[(i)[(idx)]])); \
78 t = _mm512_mask_add_ps(t, (k), _mm512_swizzle_ps((v), (pattern)), t); \
79 _mm512_mask_packstorelo_ps(&((p)[(i)[(idx)]]), (k), t);
/* SCATTER_INC_PS_I32(v, p, i):
 * Emulated 16-wide float scatter-increment: (p)[(i)[L]] += lane L of
 * __m512 (v), for L = 0..15.  Masks k0-k3 each select one lane per
 * 4-lane group; the four swizzle patterns rotate every source lane of a
 * group into the selected slot in turn.  Declares the __m512 temporary
 * `t` used by each expanded SCATTER_INC_PS_I32_STEP. */
81 #define SCATTER_INC_PS_I32(v, p, i) \
83 __mmask16 k0 = _mm512_int2mask(0x0008); \
84 __mmask16 k1 = _mm512_int2mask(0x0080); \
85 __mmask16 k2 = _mm512_int2mask(0x0800); \
86 __mmask16 k3 = _mm512_int2mask(0x8000); \
87 __m512 t = _mm512_setzero_ps(); \
88 SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 0, _MM_SWIZ_REG_CDAB) \
89 SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 1, _MM_SWIZ_REG_DCBA) \
90 SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 2, _MM_SWIZ_REG_DACB) \
91 SCATTER_INC_PS_I32_STEP((v), (p), (i), k0, 3, _MM_SWIZ_REG_BADC) \
92 SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 4, _MM_SWIZ_REG_CDAB) \
93 SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 5, _MM_SWIZ_REG_DCBA) \
94 SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 6, _MM_SWIZ_REG_DACB) \
95 SCATTER_INC_PS_I32_STEP((v), (p), (i), k1, 7, _MM_SWIZ_REG_BADC) \
96 SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 8, _MM_SWIZ_REG_CDAB) \
97 SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 9, _MM_SWIZ_REG_DCBA) \
98 SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 10, _MM_SWIZ_REG_DACB) \
99 SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 11, _MM_SWIZ_REG_BADC) \
100 SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 12, _MM_SWIZ_REG_CDAB) \
101 SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 13, _MM_SWIZ_REG_DCBA) \
102 SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 14, _MM_SWIZ_REG_DACB) \
103 SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 15, _MM_SWIZ_REG_BADC) \
/* CONTRIB_ADD_PS2PD(pd_vec, ps_vec):
 * Accumulate a 16-lane float contribution into an 8-lane double
 * accumulator: widens the low 8 float lanes (cvtpslo) and then the high
 * 8 (after a BADC 128-bit permute) and adds both into (pd_vec).
 * (ps_vec) is captured once in a local so it is evaluated only once. */
109 #define CONTRIB_ADD_PS2PD(pd_vec, ps_vec) \
110 { __m512 ps_tmp_vec = (ps_vec); \
111 (pd_vec) = _mm512_add_pd((pd_vec), _mm512_cvtpslo_pd(ps_tmp_vec)); \
112 (pd_vec) = _mm512_add_pd((pd_vec), _mm512_cvtpslo_pd(_mm512_permute4f128_ps(ps_tmp_vec, _MM_PERM_BADC))); \
/* CONTRIB_SUB_PS2PD(pd_vec, ps_vec):
 * Subtractive twin of CONTRIB_ADD_PS2PD: widens both float halves of
 * (ps_vec) to double and subtracts each from (pd_vec). */
114 #define CONTRIB_SUB_PS2PD(pd_vec, ps_vec) \
115 { __m512 ps_tmp_vec = (ps_vec); \
116 (pd_vec) = _mm512_sub_pd((pd_vec), _mm512_cvtpslo_pd(ps_tmp_vec)); \
117 (pd_vec) = _mm512_sub_pd((pd_vec), _mm512_cvtpslo_pd(_mm512_permute4f128_ps(ps_tmp_vec, _MM_PERM_BADC))); \
/* APPLY_FORCES_PS2PD_STEP_ADD(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k):
 * Read-modify-write one force slot: loads f_{x,y,z}[i] into the lane(s)
 * selected by mask (k), adds the lane of v_{x,y,z} that swizzle pattern
 * (p) rotates into place, and stores back.  Expects __m512d temporaries
 * tx/ty/tz in scope at the expansion site. */
121 #define APPLY_FORCES_PS2PD_STEP_ADD(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k) \
122 tx = _mm512_mask_loadunpacklo_pd(tx, (k), (f_x) + (i)); \
123 ty = _mm512_mask_loadunpacklo_pd(ty, (k), (f_y) + (i)); \
124 tz = _mm512_mask_loadunpacklo_pd(tz, (k), (f_z) + (i)); \
125 tx = _mm512_mask_add_pd(tx, (k), tx, _mm512_swizzle_pd((v_x), (p))); \
126 ty = _mm512_mask_add_pd(ty, (k), ty, _mm512_swizzle_pd((v_y), (p))); \
127 tz = _mm512_mask_add_pd(tz, (k), tz, _mm512_swizzle_pd((v_z), (p))); \
128 _mm512_mask_packstorelo_pd((f_x) + (i), (k), tx); \
129 _mm512_mask_packstorelo_pd((f_y) + (i), (k), ty); \
130 _mm512_mask_packstorelo_pd((f_z) + (i), (k), tz);
/* APPLY_FORCES_PS2PD_STEP_SUB(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k):
 * Subtractive twin of APPLY_FORCES_PS2PD_STEP_ADD (Newton's third law
 * side): f_{x,y,z}[i] -= swizzled v_{x,y,z} in the lane(s) selected by
 * (k).  Expects __m512d temporaries tx/ty/tz in scope. */
132 #define APPLY_FORCES_PS2PD_STEP_SUB(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k) \
133 tx = _mm512_mask_loadunpacklo_pd(tx, (k), (f_x) + (i)); \
134 ty = _mm512_mask_loadunpacklo_pd(ty, (k), (f_y) + (i)); \
135 tz = _mm512_mask_loadunpacklo_pd(tz, (k), (f_z) + (i)); \
136 tx = _mm512_mask_sub_pd(tx, (k), tx, _mm512_swizzle_pd((v_x), (p))); \
137 ty = _mm512_mask_sub_pd(ty, (k), ty, _mm512_swizzle_pd((v_y), (p))); \
138 tz = _mm512_mask_sub_pd(tz, (k), tz, _mm512_swizzle_pd((v_z), (p))); \
139 _mm512_mask_packstorelo_pd((f_x) + (i), (k), tx); \
140 _mm512_mask_packstorelo_pd((f_y) + (i), (k), ty); \
141 _mm512_mask_packstorelo_pd((f_z) + (i), (k), tz);
144 #if __MIC_PAD_PLGEN_CTRL != 0
146 #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
148 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
/* APPLY_FORCES_PS2PD_JSTEP(j_lo, j_hi, v, fv):
 * Subtracts the xyzw force quadruple (v) (and FULL-electrostatics twin
 * (fv)) from the j-atom force slots indexed by tmpJ32[j_lo] and
 * tmpJ32[j_hi], via masked loadunpack / sub / packstore.  Relies on
 * v_tmp, fv_tmp, k_lo, k_hi, f_1, fullf_1 and tmpJ32 plus the
 * FAST/SHORT/FULL conditional-compilation macros at the expansion site.
 * NOTE(review): the subs are unmasked, which is fine because only the
 * k_lo/k_hi lanes are stored back. */
150 #define APPLY_FORCES_PS2PD_JSTEP(j_lo, j_hi, v, fv) \
152 FAST(SHORT( v_tmp = _mm512_mask_loadunpacklo_pd( v_tmp, k_lo, f_1 + tmpJ32[(j_lo)]); )) \
153 FULL( fv_tmp = _mm512_mask_loadunpacklo_pd(fv_tmp, k_lo, fullf_1 + tmpJ32[(j_lo)]); ) \
154 FAST(SHORT( v_tmp = _mm512_sub_pd( v_tmp, ( v)); )) \
155 FULL( fv_tmp = _mm512_sub_pd(fv_tmp, (fv)); ) \
156 FAST(SHORT( _mm512_mask_packstorelo_pd( f_1 + tmpJ32[(j_lo)], k_lo, v_tmp); )) \
157 FULL( _mm512_mask_packstorelo_pd(fullf_1 + tmpJ32[(j_lo)], k_lo, fv_tmp); ) \
158 FAST(SHORT( v_tmp = _mm512_mask_loadunpacklo_pd( v_tmp, k_hi, f_1 + tmpJ32[(j_hi)]); )) \
159 FULL( fv_tmp = _mm512_mask_loadunpacklo_pd(fv_tmp, k_hi, fullf_1 + tmpJ32[(j_hi)]); ) \
160 FAST(SHORT( v_tmp = _mm512_sub_pd( v_tmp, ( v)); )) \
161 FULL( fv_tmp = _mm512_sub_pd(fv_tmp, (fv)); ) \
162 FAST(SHORT( _mm512_mask_packstorelo_pd( f_1 + tmpJ32[(j_hi)], k_hi, v_tmp); )) \
163 FULL( _mm512_mask_packstorelo_pd(fullf_1 + tmpJ32[(j_hi)], k_hi, fv_tmp); ) \
/* APPLY_FORCES_PS2PD (padded plgen, combined forces, AOS force layout):
 * Folds the accumulated single-precision force components tmp_{x,y,z}_vec
 * (and FULL twins fulltmp_*) into the double-precision AOS force arrays.
 *   1. Builds a w component (0, or -1 per interaction for MODIFIED /
 *      EXCLUDED exclusion counting) under cutoff_mask.
 *   2. 2x2 then 4x4 swizzle transposes regroup the x/y/z/w vectors into
 *      per-atom xyzw quadruples (tmp_0..3_vec / fulltmp_0..3_vec).
 *   3. cvtpslo + BADC permute widen each half to double precision.
 *   4. JSTEP subtracts each quadruple from its j-atom slot.
 *   5. An add tree reduces all quadruples to one xyzw sum, negating the
 *      w lanes (mask 0x88) for MODIFIED/EXCLUDED, and adds it to the
 *      i-atom slot f_0/fullf_0 + iTest_i (first set bit of cutoff_mask).
 * NOTE(review): relies on tmp_*_vec, cutoff_mask, tmpI32/tmpJ32, f_0,
 * f_1, fullf_0, fullf_1 and the FAST/SHORT/FULL/MODIFIED/EXCLUDED macros
 * being defined at the expansion site; assumes all lanes under
 * cutoff_mask share the same i atom -- confirm against the caller. */
166 #define APPLY_FORCES_PS2PD \
169 FAST(SHORT( __m512 tmp_w_vec = _mm512_mask_mov_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_set_1to16_ps(0.0f MODIFIED(- 1.0f) EXCLUDED(- 1.0f))); )) \
170 FULL( __m512 fulltmp_w_vec = _mm512_setzero_ps(); ) \
/* -- 2x2 transpose: pair x/y and z/w lanes -- */ \
173 __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA); \
174 __mmask16 k_2x2_1 = _mm512_int2mask(0x5555); \
175 FAST(SHORT( __m512i tmp_a0 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_x_vec), k_2x2_0, _mm512_castps_si512( tmp_y_vec), _MM_SWIZ_REG_CDAB); )) \
176 FAST(SHORT( __m512i tmp_a1 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_y_vec), k_2x2_1, _mm512_castps_si512( tmp_x_vec), _MM_SWIZ_REG_CDAB); )) \
177 FAST(SHORT( __m512i tmp_a2 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_z_vec), k_2x2_0, _mm512_castps_si512( tmp_w_vec), _MM_SWIZ_REG_CDAB); )) \
178 FAST(SHORT( __m512i tmp_a3 = _mm512_mask_swizzle_epi32(_mm512_castps_si512( tmp_w_vec), k_2x2_1, _mm512_castps_si512( tmp_z_vec), _MM_SWIZ_REG_CDAB); )) \
179 FULL( __m512i tmp_b0 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_x_vec), k_2x2_0, _mm512_castps_si512(fulltmp_y_vec), _MM_SWIZ_REG_CDAB); ) \
180 FULL( __m512i tmp_b1 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_y_vec), k_2x2_1, _mm512_castps_si512(fulltmp_x_vec), _MM_SWIZ_REG_CDAB); ) \
181 FULL( __m512i tmp_b2 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_z_vec), k_2x2_0, _mm512_castps_si512(fulltmp_w_vec), _MM_SWIZ_REG_CDAB); ) \
182 FULL( __m512i tmp_b3 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_w_vec), k_2x2_1, _mm512_castps_si512(fulltmp_z_vec), _MM_SWIZ_REG_CDAB); ) \
/* -- 4x4 transpose: combine pairs into xyzw quadruples -- */ \
183 __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC); \
184 __mmask16 k_4x4_1 = _mm512_int2mask(0x3333); \
185 FAST(SHORT( __m512 tmp_0_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a0, k_4x4_0, tmp_a2, _MM_SWIZ_REG_BADC)); )) \
186 FAST(SHORT( __m512 tmp_1_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a1, k_4x4_0, tmp_a3, _MM_SWIZ_REG_BADC)); )) \
187 FAST(SHORT( __m512 tmp_2_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a2, k_4x4_1, tmp_a0, _MM_SWIZ_REG_BADC)); )) \
188 FAST(SHORT( __m512 tmp_3_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a3, k_4x4_1, tmp_a1, _MM_SWIZ_REG_BADC)); )) \
189 FULL( __m512 fulltmp_0_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b0, k_4x4_0, tmp_b2, _MM_SWIZ_REG_BADC)); ) \
190 FULL( __m512 fulltmp_1_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b1, k_4x4_0, tmp_b3, _MM_SWIZ_REG_BADC)); ) \
191 FULL( __m512 fulltmp_2_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b2, k_4x4_1, tmp_b0, _MM_SWIZ_REG_BADC)); ) \
192 FULL( __m512 fulltmp_3_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b3, k_4x4_1, tmp_b1, _MM_SWIZ_REG_BADC)); ) \
/* -- widen each quadruple's halves to double -- */ \
194 FAST(SHORT( __m512d v_0_lo = _mm512_cvtpslo_pd( tmp_0_vec ); )) \
195 FAST(SHORT( __m512d v_0_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_0_vec, _MM_PERM_BADC)); )) \
196 FAST(SHORT( __m512d v_1_lo = _mm512_cvtpslo_pd( tmp_1_vec ); )) \
197 FAST(SHORT( __m512d v_1_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_1_vec, _MM_PERM_BADC)); )) \
198 FAST(SHORT( __m512d v_2_lo = _mm512_cvtpslo_pd( tmp_2_vec ); )) \
199 FAST(SHORT( __m512d v_2_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_2_vec, _MM_PERM_BADC)); )) \
200 FAST(SHORT( __m512d v_3_lo = _mm512_cvtpslo_pd( tmp_3_vec ); )) \
201 FAST(SHORT( __m512d v_3_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_3_vec, _MM_PERM_BADC)); )) \
202 FULL( __m512d fv_0_lo = _mm512_cvtpslo_pd( fulltmp_0_vec ); ) \
203 FULL( __m512d fv_0_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_0_vec, _MM_PERM_BADC)); ) \
204 FULL( __m512d fv_1_lo = _mm512_cvtpslo_pd( fulltmp_1_vec ); ) \
205 FULL( __m512d fv_1_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_1_vec, _MM_PERM_BADC)); ) \
206 FULL( __m512d fv_2_lo = _mm512_cvtpslo_pd( fulltmp_2_vec ); ) \
207 FULL( __m512d fv_2_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_2_vec, _MM_PERM_BADC)); ) \
208 FULL( __m512d fv_3_lo = _mm512_cvtpslo_pd( fulltmp_3_vec ); ) \
209 FULL( __m512d fv_3_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_3_vec, _MM_PERM_BADC)); ) \
/* -- scatter-subtract into j-atom force slots -- */ \
211 __mmask16 k_lo = _mm512_int2mask(0x000F); \
212 __mmask16 k_hi = _mm512_int2mask(0x00F0); \
213 __m512d v_tmp = _mm512_setzero_pd(); \
214 __m512d fv_tmp = _mm512_setzero_pd(); \
215 APPLY_FORCES_PS2PD_JSTEP( 0, 4, v_0_lo, fv_0_lo); \
216 APPLY_FORCES_PS2PD_JSTEP( 1, 5, v_1_lo, fv_1_lo); \
217 APPLY_FORCES_PS2PD_JSTEP( 2, 6, v_2_lo, fv_2_lo); \
218 APPLY_FORCES_PS2PD_JSTEP( 3, 7, v_3_lo, fv_3_lo); \
219 APPLY_FORCES_PS2PD_JSTEP( 8, 12, v_0_hi, fv_0_hi); \
220 APPLY_FORCES_PS2PD_JSTEP( 9, 13, v_1_hi, fv_1_hi); \
221 APPLY_FORCES_PS2PD_JSTEP(10, 14, v_2_hi, fv_2_hi); \
222 APPLY_FORCES_PS2PD_JSTEP(11, 15, v_3_hi, fv_3_hi); \
/* -- reduce all quadruples and add to the i-atom force slot -- */ \
224 FAST(SHORT( __m512d v_tmp0_xyzw = _mm512_add_pd( v_0_lo, v_0_hi); )) \
225 FULL( __m512d fv_tmp0_xyzw = _mm512_add_pd(fv_0_lo, fv_0_hi); ) \
226 FAST(SHORT( __m512d v_tmp1_xyzw = _mm512_add_pd( v_1_lo, v_1_hi); )) \
227 FULL( __m512d fv_tmp1_xyzw = _mm512_add_pd(fv_1_lo, fv_1_hi); ) \
228 FAST(SHORT( __m512d v_tmp2_xyzw = _mm512_add_pd( v_2_lo, v_2_hi); )) \
229 FULL( __m512d fv_tmp2_xyzw = _mm512_add_pd(fv_2_lo, fv_2_hi); ) \
230 FAST(SHORT( __m512d v_tmp3_xyzw = _mm512_add_pd( v_3_lo, v_3_hi); )) \
231 FULL( __m512d fv_tmp3_xyzw = _mm512_add_pd(fv_3_lo, fv_3_hi); ) \
232 FAST(SHORT( __m512d v_tmp4_xyzw = _mm512_add_pd( v_tmp0_xyzw, v_tmp1_xyzw); )) \
233 FULL( __m512d fv_tmp4_xyzw = _mm512_add_pd(fv_tmp0_xyzw, fv_tmp1_xyzw); ) \
234 FAST(SHORT( __m512d v_tmp5_xyzw = _mm512_add_pd( v_tmp2_xyzw, v_tmp3_xyzw); )) \
235 FULL( __m512d fv_tmp5_xyzw = _mm512_add_pd(fv_tmp2_xyzw, fv_tmp3_xyzw); ) \
236 FAST(SHORT( __m512d v_tmp6_xyzw = _mm512_add_pd( v_tmp4_xyzw, v_tmp5_xyzw); )) \
237 FULL( __m512d fv_tmp6_xyzw = _mm512_add_pd(fv_tmp4_xyzw, fv_tmp5_xyzw); ) \
238 FAST(SHORT( __m512d v_xyzw = _mm512_add_pd( v_tmp6_xyzw, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512( v_tmp6_xyzw), _MM_PERM_BADC))); )) \
239 FULL( __m512d fv_xyzw = _mm512_add_pd(fv_tmp6_xyzw, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(fv_tmp6_xyzw), _MM_PERM_BADC))); ) \
240 FAST(SHORT(MODIFIED( v_xyzw = _mm512_mask_sub_pd( v_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), v_xyzw); ))) \
241 FAST(SHORT(EXCLUDED( v_xyzw = _mm512_mask_sub_pd( v_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), v_xyzw); ))) \
242 FULL(MODIFIED( fv_xyzw = _mm512_mask_sub_pd(fv_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), fv_xyzw); )) \
243 FULL(EXCLUDED( fv_xyzw = _mm512_mask_sub_pd(fv_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), fv_xyzw); )) \
244 int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
245 uintptr_t iTest_i = tmpI32[iTest_index]; \
246 FAST(SHORT( __m512d i_xyzw = _mm512_mask_loadunpacklo_pd(_mm512_setzero_pd(), k_lo, f_0 + iTest_i); )) \
247 FULL( __m512d fi_xyzw = _mm512_mask_loadunpacklo_pd(_mm512_setzero_pd(), k_lo, fullf_0 + iTest_i); ) \
248 FAST(SHORT( i_xyzw = _mm512_mask_add_pd( i_xyzw, k_lo, i_xyzw, v_xyzw); )) \
249 FULL( fi_xyzw = _mm512_mask_add_pd(fi_xyzw, k_lo, fi_xyzw, fv_xyzw); ) \
250 FAST(SHORT( _mm512_mask_packstorelo_pd( f_0 + iTest_i, k_lo, i_xyzw); )) \
251 FULL( _mm512_mask_packstorelo_pd(fullf_0 + iTest_i, k_lo, fi_xyzw); ) \
254 #else // if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
/* APPLY_FORCES_PS2PD (padded plgen, combined forces, SOA force layout):
 * Folds the accumulated single-precision force components tmp_{x,y,z}_vec
 * (and FULL twins fulltmp_*) into the separate double-precision x/y/z
 * force arrays:
 *   1. cvtpslo + BADC permute widen low/high float halves to double.
 *   2. The i-atom contribution is a scalar reduce_add of all 16 lanes
 *      added to slot iTest_i (first set bit of cutoff_mask).
 *   3. j-atom contributions use masked i32logather / sub / i32loscatter
 *      on the low (cutoff_lo) and high (cutoff_hi, via kmerge2l1h) lane
 *      halves of j_vec.
 * NOTE(review): relies on tmp_*_vec, cutoff_mask, i/j index vectors
 * (j_vec), tmpI32, f_0_*/f_1_* and fullf_0_*/fullf_1_* arrays, plus
 * FAST/SHORT/FULL macros at the expansion site; assumes all lanes under
 * cutoff_mask share the same i atom -- confirm against the caller. */
256 #define APPLY_FORCES_PS2PD \
258 FAST(SHORT( __m512d v_x_lo = _mm512_cvtpslo_pd( tmp_x_vec ); )) \
259 FAST(SHORT( __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_x_vec, _MM_PERM_BADC)); )) \
260 FAST(SHORT( __m512d v_y_lo = _mm512_cvtpslo_pd( tmp_y_vec ); )) \
261 FAST(SHORT( __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_y_vec, _MM_PERM_BADC)); )) \
262 FAST(SHORT( __m512d v_z_lo = _mm512_cvtpslo_pd( tmp_z_vec ); )) \
263 FAST(SHORT( __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_z_vec, _MM_PERM_BADC)); )) \
264 FULL( __m512d fv_x_lo = _mm512_cvtpslo_pd( fulltmp_x_vec ); ) \
265 FULL( __m512d fv_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_x_vec, _MM_PERM_BADC)); ) \
266 FULL( __m512d fv_y_lo = _mm512_cvtpslo_pd( fulltmp_y_vec ); ) \
267 FULL( __m512d fv_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_y_vec, _MM_PERM_BADC)); ) \
268 FULL( __m512d fv_z_lo = _mm512_cvtpslo_pd( fulltmp_z_vec ); ) \
269 FULL( __m512d fv_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_z_vec, _MM_PERM_BADC)); ) \
270 int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
271 uintptr_t iTest_i = tmpI32[iTest_index]; \
272 __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
273 __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
/* -- i-atom: scalar reduction of all lanes -- */ \
274 FAST(SHORT( f_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_x_hi, v_x_lo)); )) \
275 FAST(SHORT( f_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_y_hi, v_y_lo)); )) \
276 FAST(SHORT( f_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_z_hi, v_z_lo)); )) \
277 FULL( fullf_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_x_hi, fv_x_lo)); ) \
278 FULL( fullf_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_y_hi, fv_y_lo)); ) \
279 FULL( fullf_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_z_hi, fv_z_lo)); ) \
/* -- j-atoms: gather / subtract / scatter per half -- */ \
280 __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
281 __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
282 FAST(SHORT( __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_x, _MM_SCALE_8); )) \
283 FAST(SHORT( __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_y, _MM_SCALE_8); )) \
284 FAST(SHORT( __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_z, _MM_SCALE_8); )) \
285 FAST(SHORT( __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_x, _MM_SCALE_8); )) \
286 FAST(SHORT( __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_y, _MM_SCALE_8); )) \
287 FAST(SHORT( __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_z, _MM_SCALE_8); )) \
288 FULL( __m512d ff_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_x, _MM_SCALE_8); ) \
289 FULL( __m512d ff_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_y, _MM_SCALE_8); ) \
290 FULL( __m512d ff_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_z, _MM_SCALE_8); ) \
291 FULL( __m512d ff_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_x, _MM_SCALE_8); ) \
292 FULL( __m512d ff_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_y, _MM_SCALE_8); ) \
293 FULL( __m512d ff_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_z, _MM_SCALE_8); ) \
294 FAST(SHORT( f_j_x_lo_vec = _mm512_mask_sub_pd( f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); )) \
295 FAST(SHORT( f_j_y_lo_vec = _mm512_mask_sub_pd( f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); )) \
296 FAST(SHORT( f_j_z_lo_vec = _mm512_mask_sub_pd( f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); )) \
297 FAST(SHORT( f_j_x_hi_vec = _mm512_mask_sub_pd( f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); )) \
298 FAST(SHORT( f_j_y_hi_vec = _mm512_mask_sub_pd( f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); )) \
299 FAST(SHORT( f_j_z_hi_vec = _mm512_mask_sub_pd( f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); )) \
300 FULL( ff_j_x_lo_vec = _mm512_mask_sub_pd(ff_j_x_lo_vec, cutoff_lo, ff_j_x_lo_vec, fv_x_lo); ) \
301 FULL( ff_j_y_lo_vec = _mm512_mask_sub_pd(ff_j_y_lo_vec, cutoff_lo, ff_j_y_lo_vec, fv_y_lo); ) \
302 FULL( ff_j_z_lo_vec = _mm512_mask_sub_pd(ff_j_z_lo_vec, cutoff_lo, ff_j_z_lo_vec, fv_z_lo); ) \
303 FULL( ff_j_x_hi_vec = _mm512_mask_sub_pd(ff_j_x_hi_vec, cutoff_hi, ff_j_x_hi_vec, fv_x_hi); ) \
304 FULL( ff_j_y_hi_vec = _mm512_mask_sub_pd(ff_j_y_hi_vec, cutoff_hi, ff_j_y_hi_vec, fv_y_hi); ) \
305 FULL( ff_j_z_hi_vec = _mm512_mask_sub_pd(ff_j_z_hi_vec, cutoff_hi, ff_j_z_hi_vec, fv_z_hi); ) \
306 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); )) \
307 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); )) \
308 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); )) \
309 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); )) \
310 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); )) \
311 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); )) \
312 FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_lo, j_lo_vec, ff_j_x_lo_vec, _MM_SCALE_8); ) \
313 FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_lo, j_lo_vec, ff_j_y_lo_vec, _MM_SCALE_8); ) \
314 FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_lo, j_lo_vec, ff_j_z_lo_vec, _MM_SCALE_8); ) \
315 FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_hi, j_hi_vec, ff_j_x_hi_vec, _MM_SCALE_8); ) \
316 FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_hi, j_hi_vec, ff_j_y_hi_vec, _MM_SCALE_8); ) \
317 FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_hi, j_hi_vec, ff_j_z_hi_vec, _MM_SCALE_8); ) \
320 #endif // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
322 #else // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
/* APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, ..., f_j_z, i, j)
 * (padded plgen, forces NOT combined -- parameterized variant):
 * Applies one float force vector pair-wise: adds the masked lane sums to
 * the single i-atom slot f_i_{x,y,z}[iTest_i] and subtracts the widened
 * per-lane values from the j-atom arrays f_j_{x,y,z} via masked
 * i32logather / sub / i32loscatter on the low and high lane halves.
 * NOTE(review): the (i) and (j) parameters are unused in this expansion;
 * indices come from cutoff_mask, tmpI32 and j_vec at the expansion site.
 * The setzero first argument of each mask_sub is harmless: masked-off
 * lanes are never scattered back. */
324 #define APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, f_i_y, f_i_z, f_j_x, f_j_y, f_j_z, i, j) \
326 __m512d v_x_lo = _mm512_cvtpslo_pd(v_x); \
327 __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_x), _MM_PERM_BADC)); \
328 __m512d v_y_lo = _mm512_cvtpslo_pd(v_y); \
329 __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_y), _MM_PERM_BADC)); \
330 __m512d v_z_lo = _mm512_cvtpslo_pd(v_z); \
331 __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_z), _MM_PERM_BADC)); \
332 __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
333 __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
334 __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
335 __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
336 int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
337 uintptr_t iTest_i = tmpI32[iTest_index]; \
338 f_i_x[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_x_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_x_lo); \
339 f_i_y[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_y_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_y_lo); \
340 f_i_z[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_z_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_z_lo); \
341 __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_x), _MM_SCALE_8); \
342 __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_y), _MM_SCALE_8); \
343 __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_z), _MM_SCALE_8); \
344 __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_x), _MM_SCALE_8); \
345 __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_y), _MM_SCALE_8); \
346 __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_z), _MM_SCALE_8); \
347 f_j_x_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_x_lo_vec, v_x_lo); \
348 f_j_y_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_y_lo_vec, v_y_lo); \
349 f_j_z_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_z_lo_vec, v_z_lo); \
350 f_j_x_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_x_hi_vec, v_x_hi); \
351 f_j_y_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_y_hi_vec, v_y_hi); \
352 f_j_z_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_z_hi_vec, v_z_hi); \
353 _mm512_mask_i32loscatter_pd((f_j_x), cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); \
354 _mm512_mask_i32loscatter_pd((f_j_y), cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); \
355 _mm512_mask_i32loscatter_pd((f_j_z), cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); \
356 _mm512_mask_i32loscatter_pd((f_j_x), cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); \
357 _mm512_mask_i32loscatter_pd((f_j_y), cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); \
358 _mm512_mask_i32loscatter_pd((f_j_z), cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); \
361 #endif // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
363 #else // __MIC_PAD_PLGEN_CTRL != 0
365 #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
/* APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k):
 * Combined-forces i-atom accumulate step: for the lane(s) selected by
 * mask (k), adds the (p)-swizzled v_* (FAST/SHORT force) and fv_* (FULL
 * force) components into f_0_* and fullf_0_* at index (i) via masked
 * loadunpack / add / packstore.  Expects __m512d temporaries
 * tx/ty/tz (and FULL ftx/fty/ftz) in scope at the expansion site. */
367 #define APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k) \
368 FAST(SHORT( tx = _mm512_mask_loadunpacklo_pd( tx, (k), f_0_x + (i)); )) \
369 FAST(SHORT( ty = _mm512_mask_loadunpacklo_pd( ty, (k), f_0_y + (i)); )) \
370 FAST(SHORT( tz = _mm512_mask_loadunpacklo_pd( tz, (k), f_0_z + (i)); )) \
371 FULL( ftx = _mm512_mask_loadunpacklo_pd(ftx, (k), fullf_0_x + (i)); ) \
372 FULL( fty = _mm512_mask_loadunpacklo_pd(fty, (k), fullf_0_y + (i)); ) \
373 FULL( ftz = _mm512_mask_loadunpacklo_pd(ftz, (k), fullf_0_z + (i)); ) \
374 FAST(SHORT( tx = _mm512_mask_add_pd( tx, (k), tx, _mm512_swizzle_pd(( v_x), (p))); )) \
375 FAST(SHORT( ty = _mm512_mask_add_pd( ty, (k), ty, _mm512_swizzle_pd(( v_y), (p))); )) \
376 FAST(SHORT( tz = _mm512_mask_add_pd( tz, (k), tz, _mm512_swizzle_pd(( v_z), (p))); )) \
377 FULL( ftx = _mm512_mask_add_pd(ftx, (k), ftx, _mm512_swizzle_pd((fv_x), (p))); ) \
378 FULL( fty = _mm512_mask_add_pd(fty, (k), fty, _mm512_swizzle_pd((fv_y), (p))); ) \
379 FULL( ftz = _mm512_mask_add_pd(ftz, (k), ftz, _mm512_swizzle_pd((fv_z), (p))); ) \
380 FAST(SHORT( _mm512_mask_packstorelo_pd( f_0_x + (i), (k), tx); )) \
381 FAST(SHORT( _mm512_mask_packstorelo_pd( f_0_y + (i), (k), ty); )) \
382 FAST(SHORT( _mm512_mask_packstorelo_pd( f_0_z + (i), (k), tz); )) \
383 FULL( _mm512_mask_packstorelo_pd(fullf_0_x + (i), (k), ftx); ) \
384 FULL( _mm512_mask_packstorelo_pd(fullf_0_y + (i), (k), fty); ) \
385 FULL( _mm512_mask_packstorelo_pd(fullf_0_z + (i), (k), ftz); )
/* APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k):
 * Subtractive twin of the ADD_COMBO step, applied to the j-atom arrays
 * f_1_* / fullf_1_* (Newton's third law side): subtracts the
 * (p)-swizzled v_*/fv_* components at index (i) for the lane(s) selected
 * by (k).  Expects tx/ty/tz (and FULL ftx/fty/ftz) in scope. */
387 #define APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k) \
388 FAST(SHORT( tx = _mm512_mask_loadunpacklo_pd( tx, (k), f_1_x + (i)); )) \
389 FAST(SHORT( ty = _mm512_mask_loadunpacklo_pd( ty, (k), f_1_y + (i)); )) \
390 FAST(SHORT( tz = _mm512_mask_loadunpacklo_pd( tz, (k), f_1_z + (i)); )) \
391 FULL( ftx = _mm512_mask_loadunpacklo_pd(ftx, (k), fullf_1_x + (i)); ) \
392 FULL( fty = _mm512_mask_loadunpacklo_pd(fty, (k), fullf_1_y + (i)); ) \
393 FULL( ftz = _mm512_mask_loadunpacklo_pd(ftz, (k), fullf_1_z + (i)); ) \
394 FAST(SHORT( tx = _mm512_mask_sub_pd( tx, (k), tx, _mm512_swizzle_pd(( v_x), (p))); )) \
395 FAST(SHORT( ty = _mm512_mask_sub_pd( ty, (k), ty, _mm512_swizzle_pd(( v_y), (p))); )) \
396 FAST(SHORT( tz = _mm512_mask_sub_pd( tz, (k), tz, _mm512_swizzle_pd(( v_z), (p))); )) \
397 FULL( ftx = _mm512_mask_sub_pd(ftx, (k), ftx, _mm512_swizzle_pd((fv_x), (p))); ) \
398 FULL( fty = _mm512_mask_sub_pd(fty, (k), fty, _mm512_swizzle_pd((fv_y), (p))); ) \
399 FULL( ftz = _mm512_mask_sub_pd(ftz, (k), ftz, _mm512_swizzle_pd((fv_z), (p))); ) \
400 FAST(SHORT( _mm512_mask_packstorelo_pd( f_1_x + (i), (k), tx); )) \
401 FAST(SHORT( _mm512_mask_packstorelo_pd( f_1_y + (i), (k), ty); )) \
402 FAST(SHORT( _mm512_mask_packstorelo_pd( f_1_z + (i), (k), tz); )) \
403 FULL( _mm512_mask_packstorelo_pd(fullf_1_x + (i), (k), ftx); ) \
404 FULL( _mm512_mask_packstorelo_pd(fullf_1_y + (i), (k), fty); ) \
405 FULL( _mm512_mask_packstorelo_pd(fullf_1_z + (i), (k), ftz); )
407 #define APPLY_FORCES_PS2PD \
409 __mmask16 k_lo = _mm512_int2mask(0x02); \
410 __mmask16 k_hi = _mm512_int2mask(0x20); \
411 FAST(SHORT( __m512d tx = _mm512_setzero_pd(); )) \
412 FAST(SHORT( __m512d ty = _mm512_setzero_pd(); )) \
413 FAST(SHORT( __m512d tz = _mm512_setzero_pd(); )) \
414 FULL( __m512d ftx = _mm512_setzero_pd(); ) \
415 FULL( __m512d fty = _mm512_setzero_pd(); ) \
416 FULL( __m512d ftz = _mm512_setzero_pd(); ) \
417 FAST(SHORT( __m512d v_x_lo = _mm512_cvtpslo_pd( tmp_x_vec ); )) \
418 FAST(SHORT( __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_x_vec, _MM_PERM_BADC)); )) \
419 FAST(SHORT( __m512d v_y_lo = _mm512_cvtpslo_pd( tmp_y_vec ); )) \
420 FAST(SHORT( __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_y_vec, _MM_PERM_BADC)); )) \
421 FAST(SHORT( __m512d v_z_lo = _mm512_cvtpslo_pd( tmp_z_vec ); )) \
422 FAST(SHORT( __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps( tmp_z_vec, _MM_PERM_BADC)); )) \
423 FULL( __m512d fv_x_lo = _mm512_cvtpslo_pd( fulltmp_x_vec ); ) \
424 FULL( __m512d fv_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_x_vec, _MM_PERM_BADC)); ) \
425 FULL( __m512d fv_y_lo = _mm512_cvtpslo_pd( fulltmp_y_vec ); ) \
426 FULL( __m512d fv_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_y_vec, _MM_PERM_BADC)); ) \
427 FULL( __m512d fv_z_lo = _mm512_cvtpslo_pd( fulltmp_z_vec ); ) \
428 FULL( __m512d fv_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_z_vec, _MM_PERM_BADC)); ) \
429 int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
430 uintptr_t iTest_i = tmpI32[iTest_index]; \
431 __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, _mm512_set_1to16_epi32(iTest_i), i_vec); \
432 iTest_mask = _mm512_kxor(iTest_mask, cutoff_mask); \
433 if (_mm512_kortestz(iTest_mask, iTest_mask)) { \
434 __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
435 __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
436 FAST(SHORT( f_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_x_hi, v_x_lo)); )) \
437 FAST(SHORT( f_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_y_hi, v_y_lo)); )) \
438 FAST(SHORT( f_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_z_hi, v_z_lo)); )) \
439 FULL( fullf_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_x_hi, fv_x_lo)); ) \
440 FULL( fullf_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_y_hi, fv_y_lo)); ) \
441 FULL( fullf_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_z_hi, fv_z_lo)); ) \
442 __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
443 __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
444 FAST(SHORT( __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_x, _MM_SCALE_8); )) \
445 FAST(SHORT( __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_y, _MM_SCALE_8); )) \
446 FAST(SHORT( __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, f_1_z, _MM_SCALE_8); )) \
447 FAST(SHORT( __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_x, _MM_SCALE_8); )) \
448 FAST(SHORT( __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_y, _MM_SCALE_8); )) \
449 FAST(SHORT( __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, f_1_z, _MM_SCALE_8); )) \
450 FULL( __m512d ff_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_x, _MM_SCALE_8); ) \
451 FULL( __m512d ff_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_y, _MM_SCALE_8); ) \
452 FULL( __m512d ff_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_z, _MM_SCALE_8); ) \
453 FULL( __m512d ff_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_x, _MM_SCALE_8); ) \
454 FULL( __m512d ff_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_y, _MM_SCALE_8); ) \
455 FULL( __m512d ff_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_z, _MM_SCALE_8); ) \
456 FAST(SHORT( f_j_x_lo_vec = _mm512_mask_sub_pd( f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); )) \
457 FAST(SHORT( f_j_y_lo_vec = _mm512_mask_sub_pd( f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); )) \
458 FAST(SHORT( f_j_z_lo_vec = _mm512_mask_sub_pd( f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); )) \
459 FAST(SHORT( f_j_x_hi_vec = _mm512_mask_sub_pd( f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); )) \
460 FAST(SHORT( f_j_y_hi_vec = _mm512_mask_sub_pd( f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); )) \
461 FAST(SHORT( f_j_z_hi_vec = _mm512_mask_sub_pd( f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); )) \
462 FULL( ff_j_x_lo_vec = _mm512_mask_sub_pd(ff_j_x_lo_vec, cutoff_lo, ff_j_x_lo_vec, fv_x_lo); ) \
463 FULL( ff_j_y_lo_vec = _mm512_mask_sub_pd(ff_j_y_lo_vec, cutoff_lo, ff_j_y_lo_vec, fv_y_lo); ) \
464 FULL( ff_j_z_lo_vec = _mm512_mask_sub_pd(ff_j_z_lo_vec, cutoff_lo, ff_j_z_lo_vec, fv_z_lo); ) \
465 FULL( ff_j_x_hi_vec = _mm512_mask_sub_pd(ff_j_x_hi_vec, cutoff_hi, ff_j_x_hi_vec, fv_x_hi); ) \
466 FULL( ff_j_y_hi_vec = _mm512_mask_sub_pd(ff_j_y_hi_vec, cutoff_hi, ff_j_y_hi_vec, fv_y_hi); ) \
467 FULL( ff_j_z_hi_vec = _mm512_mask_sub_pd(ff_j_z_hi_vec, cutoff_hi, ff_j_z_hi_vec, fv_z_hi); ) \
468 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); )) \
469 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); )) \
470 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); )) \
471 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_x, cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); )) \
472 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_y, cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); )) \
473 FAST(SHORT( _mm512_mask_i32loscatter_pd( f_1_z, cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); )) \
474 FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_lo, j_lo_vec, ff_j_x_lo_vec, _MM_SCALE_8); ) \
475 FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_lo, j_lo_vec, ff_j_y_lo_vec, _MM_SCALE_8); ) \
476 FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_lo, j_lo_vec, ff_j_z_lo_vec, _MM_SCALE_8); ) \
477 FULL( _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_hi, j_hi_vec, ff_j_x_hi_vec, _MM_SCALE_8); ) \
478 FULL( _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_hi, j_hi_vec, ff_j_y_hi_vec, _MM_SCALE_8); ) \
479 FULL( _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_hi, j_hi_vec, ff_j_z_hi_vec, _MM_SCALE_8); ) \
481 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 0], _MM_SWIZ_REG_CDAB, k_lo); \
482 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 1], _MM_SWIZ_REG_DCBA, k_lo); \
483 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 2], _MM_SWIZ_REG_DACB, k_lo); \
484 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 3], _MM_SWIZ_REG_BADC, k_lo); \
485 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 4], _MM_SWIZ_REG_CDAB, k_hi); \
486 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 5], _MM_SWIZ_REG_DCBA, k_hi); \
487 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 6], _MM_SWIZ_REG_DACB, k_hi); \
488 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 7], _MM_SWIZ_REG_BADC, k_hi); \
489 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[ 8], _MM_SWIZ_REG_CDAB, k_lo); \
490 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[ 9], _MM_SWIZ_REG_DCBA, k_lo); \
491 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[10], _MM_SWIZ_REG_DACB, k_lo); \
492 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[11], _MM_SWIZ_REG_BADC, k_lo); \
493 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[12], _MM_SWIZ_REG_CDAB, k_hi); \
494 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[13], _MM_SWIZ_REG_DCBA, k_hi); \
495 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[14], _MM_SWIZ_REG_DACB, k_hi); \
496 APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[15], _MM_SWIZ_REG_BADC, k_hi); \
497 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 0], _MM_SWIZ_REG_CDAB, k_lo); \
498 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 1], _MM_SWIZ_REG_DCBA, k_lo); \
499 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 2], _MM_SWIZ_REG_DACB, k_lo); \
500 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 3], _MM_SWIZ_REG_BADC, k_lo); \
501 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 4], _MM_SWIZ_REG_CDAB, k_hi); \
502 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 5], _MM_SWIZ_REG_DCBA, k_hi); \
503 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 6], _MM_SWIZ_REG_DACB, k_hi); \
504 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 7], _MM_SWIZ_REG_BADC, k_hi); \
505 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[ 8], _MM_SWIZ_REG_CDAB, k_lo); \
506 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[ 9], _MM_SWIZ_REG_DCBA, k_lo); \
507 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[10], _MM_SWIZ_REG_DACB, k_lo); \
508 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[11], _MM_SWIZ_REG_BADC, k_lo); \
509 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[12], _MM_SWIZ_REG_CDAB, k_hi); \
510 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[13], _MM_SWIZ_REG_DCBA, k_hi); \
511 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[14], _MM_SWIZ_REG_DACB, k_hi); \
512 APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[15], _MM_SWIZ_REG_BADC, k_hi); \
516 #else // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
/*
 * APPLY_FORCES_PS2PD: widens the packed-single force components
 * (v_x, v_y, v_z) to double precision in two 8-lane halves (lo = lanes 0-7,
 * hi = lanes 8-15) and applies them: each lane's contribution is ADDED to
 * the i-atom force arrays (f_i_x/y/z) and SUBTRACTED from the j-atom force
 * arrays (f_j_x/y/z).
 * Fast path (taken when every active lane shares the same i index, detected
 * via iTest_mask below): a single masked horizontal reduction updates the
 * one i atom, and masked gather / subtract / scatter updates the j atoms.
 * Fallback path: per-lane scalar updates through the
 * APPLY_FORCES_PS2PD_STEP_ADD / _STEP_SUB helper macros.
 * NOTE(review): this macro also reads cutoff_mask, i_vec, j_vec, and tmpI32
 * from the enclosing scope (not parameters) -- confirm against the
 * surrounding kernel body, parts of which are outside this view.
 */
518 #define APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, f_i_y, f_i_z, f_j_x, f_j_y, f_j_z, i, j) \
520 __mmask16 k_lo = _mm512_int2mask(0x02); /* lane masks used by the STEP helpers below */ \
521 __mmask16 k_hi = _mm512_int2mask(0x20); \
522 __m512d tx = _mm512_setzero_pd(); \
523 __m512d ty = _mm512_setzero_pd(); \
524 __m512d tz = _mm512_setzero_pd(); \
525 __m512d v_x_lo = _mm512_cvtpslo_pd(v_x); /* widen lanes 0-7 to double */ \
526 __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_x), _MM_PERM_BADC)); /* widen lanes 8-15 */ \
527 __m512d v_y_lo = _mm512_cvtpslo_pd(v_y); \
528 __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_y), _MM_PERM_BADC)); \
529 __m512d v_z_lo = _mm512_cvtpslo_pd(v_z); \
530 __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_z), _MM_PERM_BADC)); \
531 int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); /* first active lane */ \
532 uintptr_t iTest_i = tmpI32[iTest_index]; \
533 __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, _mm512_set_1to16_epi32(iTest_i), i_vec); \
534 iTest_mask = _mm512_kxor(iTest_mask, cutoff_mask); /* zero iff all active lanes share i == iTest_i */ \
535 if (_mm512_kortestz(iTest_mask, iTest_mask)) { /* fast path: one shared i atom */ \
536 __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); /* active lanes in the low half */ \
537 __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); /* high-half active lanes, moved into bits 0-7 */ \
538 f_i_x[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_x_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_x_lo); /* horizontal-sum onto the one i atom */ \
539 f_i_y[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_y_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_y_lo); \
540 f_i_z[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_z_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_z_lo); \
541 __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
542 __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); /* bring high-half j indices into lanes 0-7 */ \
543 __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_x), _MM_SCALE_8); /* gather current f_j values */ \
544 __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_y), _MM_SCALE_8); \
545 __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_z), _MM_SCALE_8); \
546 __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_x), _MM_SCALE_8); \
547 __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_y), _MM_SCALE_8); \
548 __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_z), _MM_SCALE_8); \
549 f_j_x_lo_vec = _mm512_mask_sub_pd(f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); /* subtract each lane's contribution */ \
550 f_j_y_lo_vec = _mm512_mask_sub_pd(f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); \
551 f_j_z_lo_vec = _mm512_mask_sub_pd(f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); \
552 f_j_x_hi_vec = _mm512_mask_sub_pd(f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); \
553 f_j_y_hi_vec = _mm512_mask_sub_pd(f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); \
554 f_j_z_hi_vec = _mm512_mask_sub_pd(f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); \
555 _mm512_mask_i32loscatter_pd((f_j_x), cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); /* write updated f_j back */ \
556 _mm512_mask_i32loscatter_pd((f_j_y), cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); \
557 _mm512_mask_i32loscatter_pd((f_j_z), cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); \
558 _mm512_mask_i32loscatter_pd((f_j_x), cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); \
559 _mm512_mask_i32loscatter_pd((f_j_y), cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); \
560 _mm512_mask_i32loscatter_pd((f_j_z), cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); \
562 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 0], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_lo); /* general path: per-lane scalar add to f_i */ \
563 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 1], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_lo); \
564 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 2], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_lo); \
565 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 3], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_lo); \
566 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 4], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_hi); \
567 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 5], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_hi); \
568 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 6], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_hi); \
569 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 7], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_hi); \
570 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 8], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_lo); \
571 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 9], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_lo); \
572 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[10], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_lo); \
573 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[11], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_lo); \
574 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[12], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_hi); \
575 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[13], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_hi); \
576 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[14], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_hi); \
577 APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[15], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_hi); \
578 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 0], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_lo); /* ... and per-lane scalar subtract from f_j */ \
579 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 1], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_lo); \
580 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 2], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_lo); \
581 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 3], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_lo); \
582 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 4], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_hi); \
583 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 5], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_hi); \
584 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 6], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_hi); \
585 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 7], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_hi); \
586 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 8], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_lo); \
587 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 9], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_lo); \
588 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[10], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_lo); \
589 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[11], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_lo); \
590 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[12], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_hi); \
591 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[13], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_hi); \
592 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[14], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_hi); \
593 APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[15], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_hi); \
597 #endif // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
598 #endif // MIC_PAD_PLGEN != 0
// Per-lane pairlist positions for the 16 vector lanes (plI+0 .. plI+15);
// compared against plSize each iteration to mask off tail lanes.
600 __m512i plI_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10, 9, 8,
601 7, 6, 5, 4, 3, 2, 1, 0);
// Pairlist length rounded up to a multiple of 16 (one full vector).
602 const int plSize_16 = (plSize + 15) & (~15);
// Double-precision energy accumulators; FAST/FULL/ENERGY are build-config
// macros that expand to their argument or to nothing.
604 FAST(
ENERGY( __m512d vdwEnergy_vec = _mm512_setzero_pd(); ))
606 FULL(
ENERGY( __m512d fullElectEnergy_vec = _mm512_setzero_pd(); ))
607 #if (0 PAIR(FAST(SHORT(+1))))
// Virial tensor accumulators (upper triangle) for the short-range forces.
608 __m512d virial_xx_vec = _mm512_setzero_pd();
609 __m512d virial_xy_vec = _mm512_setzero_pd();
610 __m512d virial_xz_vec = _mm512_setzero_pd();
611 __m512d virial_yy_vec = _mm512_setzero_pd();
612 __m512d virial_yz_vec = _mm512_setzero_pd();
613 __m512d virial_zz_vec = _mm512_setzero_pd();
615 #if (0 PAIR(FULL(+1)))
// Virial tensor accumulators for the full-electrostatics forces.
616 __m512d fullElectVirial_xx_vec = _mm512_setzero_pd();
617 __m512d fullElectVirial_xy_vec = _mm512_setzero_pd();
618 __m512d fullElectVirial_xz_vec = _mm512_setzero_pd();
619 __m512d fullElectVirial_yy_vec = _mm512_setzero_pd();
620 __m512d fullElectVirial_yz_vec = _mm512_setzero_pd();
621 __m512d fullElectVirial_zz_vec = _mm512_setzero_pd();
// Each packed pairlist entry is (i << 16) | j; this mask extracts j.
624 __m512i ij_mask_vec = _mm512_set_1to16_epi32(0x0000FFFF);
// Permute pattern used when spilling indices: even 32-bit slots take source
// elements 0..7; odd slots are zeroed by a 0x5555 write mask at the store,
// zero-extending each index to 64 bits.
625 __m512i ij_store_perm_pattern = _mm512_set_16to16_epi32( 9, 7, 9, 6, 9, 5, 9, 4,
626 9, 3, 9, 2, 9, 1, 9, 0 );
628 #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
// Counts pairs that pass the cutoff test, for exclusion checksumming.
629 __m512i exclusionSum_vec = _mm512_setzero_epi32();
// Trip-count and prefetch hints for the compiler (values are heuristic).
634 #pragma loop count (500)
636 #pragma loop count (7000)
640 #pragma loop count (2)
642 #pragma loop count (30)
645 #pragma prefetch plArray:_MM_HINT_NTA
// Main interaction loop: consume the pairlist 16 packed (i,j) entries at a
// time; the padded bound plSize_16 lets every iteration run full-width.
647 for (
int plI = 0; plI < plSize_16; plI += 16) {
// Lanes past the true pairlist length are inactive (tail iteration only).
650 __mmask16 active_mask = _mm512_cmplt_epi32_mask(plI_vec, _mm512_set_1to16_epi32(plSize));
652 #if MIC_HANDCODE_FORCE_PFDIST != 0
// Software prefetch of j-atom positions a fixed distance ahead in the
// pairlist; entries equal to -1 are skipped via the mask.
653 __m512i future_ij_vec = _mm512_load_epi32(plArray + plI + (16 * MIC_HANDCODE_FORCE_PFDIST));
654 __mmask16 future_ij_mask = _mm512_cmpneq_epi32_mask(future_ij_vec, _mm512_set_1to16_epi32(-1));
655 __m512i future_j_vec = _mm512_and_epi32(future_ij_vec, ij_mask_vec);
656 _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_x, _MM_SCALE_4, _MM_HINT_T0);
657 _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_y, _MM_SCALE_4, _MM_HINT_T0);
658 _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_z, _MM_SCALE_4, _MM_HINT_T0);
// Load 16 packed entries; each is (i << 16) | j.
662 __m512i ij_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), active_mask, plArray + plI);
663 #if __MIC_PAD_PLGEN_CTRL != 0
// Padded pairlists mark filler entries with 0xFFFF in the low half; drop
// those lanes from active_mask.
664 __m512i ij_mask_vec = _mm512_set_1to16_epi32(0xFFFF);
665 __m512i ij_lo_vec = _mm512_and_epi32(ij_vec, ij_mask_vec);
666 active_mask = _mm512_mask_cmpneq_epi32_mask(active_mask, ij_lo_vec, ij_mask_vec);
// Split the packed entries into separate i (high 16 bits) and j (low 16
// bits) index vectors.
668 __m512i i_vec = _mm512_and_epi32(_mm512_mask_srli_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, 16), ij_mask_vec);
669 __m512i j_vec = _mm512_mask_and_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, ij_mask_vec);
// Spill the 16 indices as zero-extended 64-bit values (the 0x5555 write
// mask leaves the odd 32-bit slots zero) for scalar addressing below.
672 uintptr_t tmpI32[16] __attribute__((aligned(64)));
673 uintptr_t tmpJ32[16] __attribute__((aligned(64)));
674 _mm512_store_epi64(tmpI32 , _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, i_vec ));
675 _mm512_store_epi64(tmpI32 + 8, _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, _mm512_permute4f128_epi32(i_vec, _MM_PERM_BADC)));
676 _mm512_store_epi64(tmpJ32 , _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, j_vec ));
677 _mm512_store_epi64(tmpJ32 + 8, _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, _mm512_permute4f128_epi32(j_vec, _MM_PERM_BADC)));
// Advance the per-lane pairlist positions for the next iteration.
680 plI_vec = _mm512_add_epi32(plI_vec, _mm512_set_1to16_epi32(16));
// Working registers for the i- and j-atom positions and charges.
683 __m512 p_i_x_vec, p_i_y_vec, p_i_z_vec, p_i_q_vec, p_j_x_vec, p_j_y_vec, p_j_z_vec, p_j_q_vec;
684 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
685 __m512i pExt_i_vdwType_vec, pExt_j_vdwType_vec;
688 __m512 zero_vec = _mm512_setzero_ps();
// Detect whether all active lanes share the same i atom (common when the
// pairlist is grouped by i); if so, the i-atom data can be broadcast
// instead of gathered.
690 int iTest_index = _mm_tzcnt_32(_mm512_mask2int(active_mask));
691 unsigned int iTest_i = tmpI32[iTest_index];
692 #if __MIC_PAD_PLGEN_CTRL != 0
694 __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(active_mask, _mm512_set_1to16_epi32(iTest_i), i_vec);
695 iTest_mask =_mm512_kxor(iTest_mask, active_mask);
// Zero mask means every active lane has i == iTest_i: take the fast path.
696 if (_mm512_kortestz(iTest_mask, iTest_mask)) {
698 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
// AOS layout: broadcast the shared i atom's x/y/z/charge fields.
699 p_i_x_vec = _mm512_set_1to16_ps(p_0[iTest_i].
x);
700 p_i_y_vec = _mm512_set_1to16_ps(p_0[iTest_i].
y);
701 p_i_z_vec = _mm512_set_1to16_ps(p_0[iTest_i].
z);
702 p_i_q_vec = _mm512_set_1to16_ps(p_0[iTest_i].
charge);
// Load four j entries (each a 4-float x/y/z/q group) per register ...
704 __mmask16 k0 = _mm512_int2mask(0x000F);
705 __mmask16 k1 = _mm512_int2mask(0x00F0);
706 __mmask16 k2 = _mm512_int2mask(0x0F00);
707 __mmask16 k3 = _mm512_int2mask(0xF000);
708 __m512i tmp_a0 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 0]);
709 tmp_a0 = _mm512_mask_loadunpacklo_epi32( tmp_a0, k1, p_1 + tmpJ32[ 4]);
710 tmp_a0 = _mm512_mask_loadunpacklo_epi32( tmp_a0, k2, p_1 + tmpJ32[ 8]);
711 tmp_a0 = _mm512_mask_loadunpacklo_epi32( tmp_a0, k3, p_1 + tmpJ32[12]);
712 __m512i tmp_a1 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 1]);
713 tmp_a1 = _mm512_mask_loadunpacklo_epi32( tmp_a1, k1, p_1 + tmpJ32[ 5]);
714 tmp_a1 = _mm512_mask_loadunpacklo_epi32( tmp_a1, k2, p_1 + tmpJ32[ 9]);
715 tmp_a1 = _mm512_mask_loadunpacklo_epi32( tmp_a1, k3, p_1 + tmpJ32[13]);
716 __m512i tmp_a2 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 2]);
717 tmp_a2 = _mm512_mask_loadunpacklo_epi32( tmp_a2, k1, p_1 + tmpJ32[ 6]);
718 tmp_a2 = _mm512_mask_loadunpacklo_epi32( tmp_a2, k2, p_1 + tmpJ32[10]);
719 tmp_a2 = _mm512_mask_loadunpacklo_epi32( tmp_a2, k3, p_1 + tmpJ32[14]);
720 __m512i tmp_a3 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 3]);
721 tmp_a3 = _mm512_mask_loadunpacklo_epi32( tmp_a3, k1, p_1 + tmpJ32[ 7]);
722 tmp_a3 = _mm512_mask_loadunpacklo_epi32( tmp_a3, k2, p_1 + tmpJ32[11]);
723 tmp_a3 = _mm512_mask_loadunpacklo_epi32( tmp_a3, k3, p_1 + tmpJ32[15]);
// ... then transpose the 4-element groups via paired 2x2 swizzle merges so
// each output register holds one component (x, y, z, q) for all 16 atoms.
724 __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
725 __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
726 __m512i tmp_b0 = _mm512_mask_swizzle_epi32(tmp_a0, k_2x2_0, tmp_a1, _MM_SWIZ_REG_CDAB);
727 __m512i tmp_b1 = _mm512_mask_swizzle_epi32(tmp_a1, k_2x2_1, tmp_a0, _MM_SWIZ_REG_CDAB);
728 __m512i tmp_b2 = _mm512_mask_swizzle_epi32(tmp_a2, k_2x2_0, tmp_a3, _MM_SWIZ_REG_CDAB);
729 __m512i tmp_b3 = _mm512_mask_swizzle_epi32(tmp_a3, k_2x2_1, tmp_a2, _MM_SWIZ_REG_CDAB);
730 __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
731 __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
732 p_j_x_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b0, k_4x4_0, tmp_b2, _MM_SWIZ_REG_BADC));
733 p_j_y_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b1, k_4x4_0, tmp_b3, _MM_SWIZ_REG_BADC));
734 p_j_z_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b2, k_4x4_1, tmp_b0, _MM_SWIZ_REG_BADC));
735 p_j_q_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b3, k_4x4_1, tmp_b1, _MM_SWIZ_REG_BADC));
// SOA layout: broadcast the shared i atom's components ...
738 p_i_x_vec = _mm512_set_1to16_ps(p_0_x[iTest_i]);
739 p_i_y_vec = _mm512_set_1to16_ps(p_0_y[iTest_i]);
740 p_i_z_vec = _mm512_set_1to16_ps(p_0_z[iTest_i]);
741 p_i_q_vec = _mm512_set_1to16_ps(p_0_q[iTest_i]);
742 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
743 pExt_i_vdwType_vec = _mm512_set_1to16_epi32(pExt_0_vdwType[iTest_i]);
745 #if MIC_HANDCODE_FORCE_USEGATHER != 0
// ... and gather the j-atom data (hardware gather vs. the GATHER_* macros
// built from loadunpack/swizzle, selected at build time).
746 p_j_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_x, _MM_SCALE_4);
747 p_j_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_y, _MM_SCALE_4);
748 p_j_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_z, _MM_SCALE_4);
749 p_j_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_q, _MM_SCALE_4);
750 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
751 pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
754 GATHER_PS_I32(p_j_x_vec, p_1_x, tmpJ32);
755 GATHER_PS_I32(p_j_y_vec, p_1_y, tmpJ32);
756 GATHER_PS_I32(p_j_z_vec, p_1_z, tmpJ32);
757 GATHER_PS_I32(p_j_q_vec, p_1_q, tmpJ32);
758 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
759 GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
762 #endif // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
763 #if __MIC_PAD_PLGEN_CTRL != 0
// General path: active lanes reference multiple i atoms, so both the i and
// j data must be gathered per lane (hardware gather vs. the GATHER_*
// loadunpack macros, selected at build time).
766 #if MIC_HANDCODE_FORCE_USEGATHER != 0
767 p_i_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_x, _MM_SCALE_4);
768 p_i_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_y, _MM_SCALE_4);
769 p_i_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_z, _MM_SCALE_4);
770 p_i_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_q, _MM_SCALE_4);
771 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
772 pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, i_vec, pExt_0_vdwType, _MM_SCALE_4);
774 p_j_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_x, _MM_SCALE_4);
775 p_j_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_y, _MM_SCALE_4);
776 p_j_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_z, _MM_SCALE_4);
777 p_j_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_q, _MM_SCALE_4);
778 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
779 pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
782 GATHER_PS_I32(p_i_x_vec, p_0_x, tmpI32);
783 GATHER_PS_I32(p_i_y_vec, p_0_y, tmpI32);
784 GATHER_PS_I32(p_i_z_vec, p_0_z, tmpI32);
785 GATHER_PS_I32(p_i_q_vec, p_0_q, tmpI32);
786 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
787 GATHER_EPI32_I32(pExt_i_vdwType_vec, pExt_0_vdwType, tmpI32);
789 GATHER_PS_I32(p_j_x_vec, p_1_x, tmpJ32);
790 GATHER_PS_I32(p_j_y_vec, p_1_y, tmpJ32);
791 GATHER_PS_I32(p_j_z_vec, p_1_z, tmpJ32);
792 GATHER_PS_I32(p_j_q_vec, p_1_q, tmpJ32);
793 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
794 GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
// Shift the i coordinates by the patch-pair offset, then form the
// displacement vector and r^2 (with the table's r2_delta bias added in).
802 p_i_x_vec = _mm512_add_ps(p_i_x_vec, _mm512_set_1to16_ps(offset_x));
803 p_i_y_vec = _mm512_add_ps(p_i_y_vec, _mm512_set_1to16_ps(offset_y));
804 p_i_z_vec = _mm512_add_ps(p_i_z_vec, _mm512_set_1to16_ps(offset_z));
805 __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
806 __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
807 __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
808 __m512 r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_x_vec, p_ij_x_vec), _mm512_set_1to16_ps(r2_delta));
809 r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_y_vec, p_ij_y_vec), r2_vec);
810 r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_z_vec, p_ij_z_vec), r2_vec);
// Lanes within the cutoff distance; skip this iteration if none remain.
815 __mmask16 cutoff_mask = _mm512_mask_cmplt_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps(cutoff2_delta));
816 if (_mm512_kortestz(cutoff_mask, cutoff_mask)) {
continue; }
818 #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
// Count pairs passing the cutoff for the exclusion checksum.
819 exclusionSum_vec = _mm512_mask_add_epi32(exclusionSum_vec, cutoff_mask, exclusionSum_vec, _mm512_set_1to16_epi32(1));
// Charge product q_i * q_j used by the electrostatic terms.
823 __m512 kqq_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, p_i_q_vec, p_j_q_vec);
// Interpolation-table index derived from the raw IEEE-754 bits of r2
// (top exponent/mantissa bits via the >>17), plus the constant exponent
// correction r2_delta_expc.
826 __m512i r2i_vec = _mm512_castps_si512(r2_vec);
827 __m512i table_i_vec = _mm512_srli_epi32(r2i_vec, 17);
828 table_i_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_i_vec, _mm512_set_1to16_epi32(r2_delta_expc));
// Reconstruct the r2 value the table entry corresponds to: a power-of-two
// base (from the index's top bits) plus 1/64 steps from its low 6 bits.
840 __m512 r2_delta_vec = _mm512_set_1to16_ps(r2_delta);
841 __m512 t0_vec = _mm512_cvtfxpnt_round_adjustepi32_ps(_mm512_sllv_epi32(_mm512_set_1to16_epi32(1), _mm512_srli_epi32(table_i_vec, 6)), _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
842 __m512 r2_base_vec = _mm512_mul_ps(r2_delta_vec, t0_vec);
843 __m512 r2_del_vec = _mm512_mul_ps(r2_base_vec, _mm512_set_1to16_ps(0.015625f));
844 __m512i t1_vec = _mm512_and_epi32(table_i_vec, _mm512_set_1to16_epi32(0x3F));
845 __m512 t2_vec = _mm512_cvtfxpnt_round_adjustepi32_ps(t1_vec, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
846 r2_table_vec = _mm512_add_ps(_mm512_mul_ps(r2_del_vec, t2_vec), r2_base_vec);
// Fractional distance of r2 past the table point, used for interpolation.
848 __m512 diffa_vec = _mm512_sub_ps(r2_vec, r2_table_vec);
// Per-atom van der Waals type indices (loaded here unless the
// LOAD_VDW_UPFRONT build already fetched them above).
855 __m512i pExt_i_vdwType_vec;
856 __m512i pExt_j_vdwType_vec;
859 #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT == 0
860 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
// AOS: gather from the pExt structs (index scaled by 4 elements).
862 pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, _mm512_slli_epi32(i_vec, 2), pExt_0, _MM_SCALE_4);
863 pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, _mm512_slli_epi32(j_vec, 2), pExt_1, _MM_SCALE_4);
865 #if MIC_HANDCODE_FORCE_USEGATHER != 0
866 pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, i_vec, pExt_0_vdwType, _MM_SCALE_4);
867 pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
869 GATHER_EPI32_I32(pExt_i_vdwType_vec, pExt_0_vdwType, tmpI32);
870 GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
// Row-major index into the LJ parameter table:
// (i_type * lj_table_dim + j_type) * 4 floats per entry; MODIFIED
// exclusions use the entry's second coefficient pair (+2).
876 __m512i t0_vec = _mm512_fmadd_epi32(pExt_i_vdwType_vec, _mm512_set_1to16_epi32(lj_table_dim), pExt_j_vdwType_vec);
877 __m512i ljOffset_vec = _mm512_slli_epi32(t0_vec, 2);
878 MODIFIED( ljOffset_vec = _mm512_add_epi32(ljOffset_vec, _mm512_set_1to16_epi32(2)); )
// Fetch the A and B coefficients (B at offset +1 within the entry), via
// hardware gather or the GATHER_PS_I32 macros.
881 #
if MIC_HANDCODE_FORCE_USEGATHER != 0
882 __m512i ljOffset_p1_vec = _mm512_add_epi32(ljOffset_vec, _mm512_set_1to16_epi32(1));
883 A_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, ljOffset_vec, lj_table_base_ptr, _MM_SCALE_4);
884 B_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, ljOffset_p1_vec, lj_table_base_ptr, _MM_SCALE_4);
886 unsigned int ljOffset[16] __attribute__((aligned(64)));
887 _mm512_store_epi32(ljOffset, ljOffset_vec);
888 GATHER_PS_I32 (A_vec, lj_table_base_ptr, ljOffset );
889 GATHER_PS_I32_OFFSET(B_vec, lj_table_base_ptr, ljOffset, 1);
// Apply the global scaling factor to the LJ coefficients.
893 __m512 scaling_vec = _mm512_set_1to16_ps(scaling);
894 A_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, A_vec, scaling_vec);
895 B_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, B_vec, scaling_vec);
// Select the interpolation table (short- vs no-short-range variant) and load the
// 16 table entries (16 floats each) addressed by table_i for all 16 lanes.
907 const CALC_TYPE *
const table_four_base =
SHORT(table_short)
NOSHORT(table_noshort);
908 __m512 table_four_i_0_vec, table_four_i_1_vec, table_four_i_2_vec, table_four_i_3_vec,
909 table_four_i_4_vec, table_four_i_5_vec, table_four_i_6_vec, table_four_i_7_vec,
910 table_four_i_8_vec, table_four_i_9_vec, table_four_i_10_vec, table_four_i_11_vec,
911 table_four_i_12_vec, table_four_i_13_vec, table_four_i_14_vec, table_four_i_15_vec;
913 #if MIC_HANDCODE_FORCE_USEGATHER_NBTBL != 0
// Variant 1: add the element index (0..15) to the per-lane offset vector, gather
// from a fixed base pointer. table_i << 4 == entry index * 16 floats per entry.
917 __m512i table_four_i_offsets = _mm512_slli_epi32(table_i_vec, 4);
918 table_four_i_0_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets , table_four_base, _MM_SCALE_4);
919 table_four_i_1_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 1)), table_four_base, _MM_SCALE_4);
920 table_four_i_2_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 2)), table_four_base, _MM_SCALE_4);
921 table_four_i_3_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 3)), table_four_base, _MM_SCALE_4);
922 table_four_i_4_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 4)), table_four_base, _MM_SCALE_4);
923 table_four_i_5_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 5)), table_four_base, _MM_SCALE_4);
924 table_four_i_6_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 6)), table_four_base, _MM_SCALE_4);
925 table_four_i_7_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 7)), table_four_base, _MM_SCALE_4);
926 table_four_i_8_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 8)), table_four_base, _MM_SCALE_4);
927 table_four_i_9_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 9)), table_four_base, _MM_SCALE_4);
928 table_four_i_10_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(10)), table_four_base, _MM_SCALE_4);
929 table_four_i_11_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(11)), table_four_base, _MM_SCALE_4);
930 table_four_i_12_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(12)), table_four_base, _MM_SCALE_4);
931 table_four_i_13_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(13)), table_four_base, _MM_SCALE_4);
932 table_four_i_14_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(14)), table_four_base, _MM_SCALE_4);
933 table_four_i_15_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(15)), table_four_base, _MM_SCALE_4);
// Variant 2: same offsets, but bump the base pointer by the element index instead
// (equivalent addressing, avoids the 15 extra vector adds of variant 1).
937 __m512i table_four_i_offsets = _mm512_slli_epi32(table_i_vec, 4);
938 table_four_i_0_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base, _MM_SCALE_4);
939 table_four_i_1_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 1, _MM_SCALE_4);
940 table_four_i_2_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 2, _MM_SCALE_4);
941 table_four_i_3_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 3, _MM_SCALE_4);
942 table_four_i_4_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 4, _MM_SCALE_4);
943 table_four_i_5_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 5, _MM_SCALE_4);
944 table_four_i_6_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 6, _MM_SCALE_4);
945 table_four_i_7_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 7, _MM_SCALE_4);
946 table_four_i_8_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 8, _MM_SCALE_4);
947 table_four_i_9_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 9, _MM_SCALE_4);
948 table_four_i_10_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 10, _MM_SCALE_4);
949 table_four_i_11_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 11, _MM_SCALE_4);
950 table_four_i_12_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 12, _MM_SCALE_4);
951 table_four_i_13_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 13, _MM_SCALE_4);
952 table_four_i_14_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 14, _MM_SCALE_4);
953 table_four_i_15_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 15, _MM_SCALE_4);
// Variant 3 (no hardware gather): spill the per-lane entry offsets, then do one
// aligned 16-float load per lane (each table entry is one full cache line) and
// transpose the resulting 16x16 matrix below so element k of every entry lands
// in table_four_i_k_vec.
959 unsigned int table_four_i_offsets[16] __attribute__((aligned(64)));
960 _mm512_store_epi32(table_four_i_offsets, _mm512_slli_epi32(table_i_vec, 4));
963 __m512i tmpA_00 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 0]])));
964 __m512i tmpA_01 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 1]])));
965 __m512i tmpA_02 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 2]])));
966 __m512i tmpA_03 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 3]])));
967 __m512i tmpA_04 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 4]])));
968 __m512i tmpA_05 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 5]])));
969 __m512i tmpA_06 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 6]])));
970 __m512i tmpA_07 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 7]])));
971 __m512i tmpA_08 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 8]])));
972 __m512i tmpA_09 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 9]])));
973 __m512i tmpA_10 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[10]])));
974 __m512i tmpA_11 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[11]])));
975 __m512i tmpA_12 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[12]])));
976 __m512i tmpA_13 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[13]])));
977 __m512i tmpA_14 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[14]])));
978 __m512i tmpA_15 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[15]])));
// 16x16 transpose, stage 1: exchange odd/even elements between adjacent row
// pairs (2x2 blocks) using masked CDAB swizzles.
979 __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
980 __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
981 __m512i tmpB_00 = _mm512_mask_swizzle_epi32(tmpA_00, k_2x2_0, tmpA_01, _MM_SWIZ_REG_CDAB);
982 __m512i tmpB_01 = _mm512_mask_swizzle_epi32(tmpA_01, k_2x2_1, tmpA_00, _MM_SWIZ_REG_CDAB);
983 __m512i tmpB_02 = _mm512_mask_swizzle_epi32(tmpA_02, k_2x2_0, tmpA_03, _MM_SWIZ_REG_CDAB);
984 __m512i tmpB_03 = _mm512_mask_swizzle_epi32(tmpA_03, k_2x2_1, tmpA_02, _MM_SWIZ_REG_CDAB);
985 __m512i tmpB_04 = _mm512_mask_swizzle_epi32(tmpA_04, k_2x2_0, tmpA_05, _MM_SWIZ_REG_CDAB);
986 __m512i tmpB_05 = _mm512_mask_swizzle_epi32(tmpA_05, k_2x2_1, tmpA_04, _MM_SWIZ_REG_CDAB);
987 __m512i tmpB_06 = _mm512_mask_swizzle_epi32(tmpA_06, k_2x2_0, tmpA_07, _MM_SWIZ_REG_CDAB);
988 __m512i tmpB_07 = _mm512_mask_swizzle_epi32(tmpA_07, k_2x2_1, tmpA_06, _MM_SWIZ_REG_CDAB);
989 __m512i tmpB_08 = _mm512_mask_swizzle_epi32(tmpA_08, k_2x2_0, tmpA_09, _MM_SWIZ_REG_CDAB);
990 __m512i tmpB_09 = _mm512_mask_swizzle_epi32(tmpA_09, k_2x2_1, tmpA_08, _MM_SWIZ_REG_CDAB);
991 __m512i tmpB_10 = _mm512_mask_swizzle_epi32(tmpA_10, k_2x2_0, tmpA_11, _MM_SWIZ_REG_CDAB);
992 __m512i tmpB_11 = _mm512_mask_swizzle_epi32(tmpA_11, k_2x2_1, tmpA_10, _MM_SWIZ_REG_CDAB);
993 __m512i tmpB_12 = _mm512_mask_swizzle_epi32(tmpA_12, k_2x2_0, tmpA_13, _MM_SWIZ_REG_CDAB);
994 __m512i tmpB_13 = _mm512_mask_swizzle_epi32(tmpA_13, k_2x2_1, tmpA_12, _MM_SWIZ_REG_CDAB);
995 __m512i tmpB_14 = _mm512_mask_swizzle_epi32(tmpA_14, k_2x2_0, tmpA_15, _MM_SWIZ_REG_CDAB);
996 __m512i tmpB_15 = _mm512_mask_swizzle_epi32(tmpA_15, k_2x2_1, tmpA_14, _MM_SWIZ_REG_CDAB);
// Stage 2: exchange element pairs between rows two apart (4x4 blocks, BADC swizzle).
997 __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
998 __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
999 __m512i tmpC_00 = _mm512_mask_swizzle_epi32(tmpB_00, k_4x4_0, tmpB_02, _MM_SWIZ_REG_BADC);
1000 __m512i tmpC_01 = _mm512_mask_swizzle_epi32(tmpB_01, k_4x4_0, tmpB_03, _MM_SWIZ_REG_BADC);
1001 __m512i tmpC_02 = _mm512_mask_swizzle_epi32(tmpB_02, k_4x4_1, tmpB_00, _MM_SWIZ_REG_BADC);
1002 __m512i tmpC_03 = _mm512_mask_swizzle_epi32(tmpB_03, k_4x4_1, tmpB_01, _MM_SWIZ_REG_BADC);
1003 __m512i tmpC_04 = _mm512_mask_swizzle_epi32(tmpB_04, k_4x4_0, tmpB_06, _MM_SWIZ_REG_BADC);
1004 __m512i tmpC_05 = _mm512_mask_swizzle_epi32(tmpB_05, k_4x4_0, tmpB_07, _MM_SWIZ_REG_BADC);
1005 __m512i tmpC_06 = _mm512_mask_swizzle_epi32(tmpB_06, k_4x4_1, tmpB_04, _MM_SWIZ_REG_BADC);
1006 __m512i tmpC_07 = _mm512_mask_swizzle_epi32(tmpB_07, k_4x4_1, tmpB_05, _MM_SWIZ_REG_BADC);
1007 __m512i tmpC_08 = _mm512_mask_swizzle_epi32(tmpB_08, k_4x4_0, tmpB_10, _MM_SWIZ_REG_BADC);
1008 __m512i tmpC_09 = _mm512_mask_swizzle_epi32(tmpB_09, k_4x4_0, tmpB_11, _MM_SWIZ_REG_BADC);
1009 __m512i tmpC_10 = _mm512_mask_swizzle_epi32(tmpB_10, k_4x4_1, tmpB_08, _MM_SWIZ_REG_BADC);
1010 __m512i tmpC_11 = _mm512_mask_swizzle_epi32(tmpB_11, k_4x4_1, tmpB_09, _MM_SWIZ_REG_BADC);
1011 __m512i tmpC_12 = _mm512_mask_swizzle_epi32(tmpB_12, k_4x4_0, tmpB_14, _MM_SWIZ_REG_BADC);
1012 __m512i tmpC_13 = _mm512_mask_swizzle_epi32(tmpB_13, k_4x4_0, tmpB_15, _MM_SWIZ_REG_BADC);
1013 __m512i tmpC_14 = _mm512_mask_swizzle_epi32(tmpB_14, k_4x4_1, tmpB_12, _MM_SWIZ_REG_BADC);
1014 __m512i tmpC_15 = _mm512_mask_swizzle_epi32(tmpB_15, k_4x4_1, tmpB_13, _MM_SWIZ_REG_BADC);
// Stage 3: exchange 128-bit lanes between rows four apart (8x8 blocks).
1015 __mmask16 k_8x8_0 = _mm512_int2mask(0xF0F0);
1016 __mmask16 k_8x8_1 = _mm512_int2mask(0x0F0F);
1017 __m512i tmpD_00 = _mm512_mask_permute4f128_epi32(tmpC_00, k_8x8_0, tmpC_04, _MM_PERM_CDAB);
1018 __m512i tmpD_01 = _mm512_mask_permute4f128_epi32(tmpC_01, k_8x8_0, tmpC_05, _MM_PERM_CDAB);
1019 __m512i tmpD_02 = _mm512_mask_permute4f128_epi32(tmpC_02, k_8x8_0, tmpC_06, _MM_PERM_CDAB);
1020 __m512i tmpD_03 = _mm512_mask_permute4f128_epi32(tmpC_03, k_8x8_0, tmpC_07, _MM_PERM_CDAB);
1021 __m512i tmpD_04 = _mm512_mask_permute4f128_epi32(tmpC_04, k_8x8_1, tmpC_00, _MM_PERM_CDAB);
1022 __m512i tmpD_05 = _mm512_mask_permute4f128_epi32(tmpC_05, k_8x8_1, tmpC_01, _MM_PERM_CDAB);
1023 __m512i tmpD_06 = _mm512_mask_permute4f128_epi32(tmpC_06, k_8x8_1, tmpC_02, _MM_PERM_CDAB);
1024 __m512i tmpD_07 = _mm512_mask_permute4f128_epi32(tmpC_07, k_8x8_1, tmpC_03, _MM_PERM_CDAB);
1025 __m512i tmpD_08 = _mm512_mask_permute4f128_epi32(tmpC_08, k_8x8_0, tmpC_12, _MM_PERM_CDAB);
1026 __m512i tmpD_09 = _mm512_mask_permute4f128_epi32(tmpC_09, k_8x8_0, tmpC_13, _MM_PERM_CDAB);
1027 __m512i tmpD_10 = _mm512_mask_permute4f128_epi32(tmpC_10, k_8x8_0, tmpC_14, _MM_PERM_CDAB);
1028 __m512i tmpD_11 = _mm512_mask_permute4f128_epi32(tmpC_11, k_8x8_0, tmpC_15, _MM_PERM_CDAB);
1029 __m512i tmpD_12 = _mm512_mask_permute4f128_epi32(tmpC_12, k_8x8_1, tmpC_08, _MM_PERM_CDAB);
1030 __m512i tmpD_13 = _mm512_mask_permute4f128_epi32(tmpC_13, k_8x8_1, tmpC_09, _MM_PERM_CDAB);
1031 __m512i tmpD_14 = _mm512_mask_permute4f128_epi32(tmpC_14, k_8x8_1, tmpC_10, _MM_PERM_CDAB);
1032 __m512i tmpD_15 = _mm512_mask_permute4f128_epi32(tmpC_15, k_8x8_1, tmpC_11, _MM_PERM_CDAB);
// Stage 4: exchange 256-bit halves between rows eight apart, completing the
// transpose; results go straight into the table_four_i_*_vec outputs.
1033 __mmask16 k_16x16_0 = _mm512_int2mask(0xFF00);
1034 __mmask16 k_16x16_1 = _mm512_int2mask(0x00FF);
1035 table_four_i_0_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_00, k_16x16_0, tmpD_08, _MM_PERM_BADC));
1036 table_four_i_1_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_01, k_16x16_0, tmpD_09, _MM_PERM_BADC));
1037 table_four_i_2_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_02, k_16x16_0, tmpD_10, _MM_PERM_BADC));
1038 table_four_i_3_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_03, k_16x16_0, tmpD_11, _MM_PERM_BADC));
1039 table_four_i_4_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_04, k_16x16_0, tmpD_12, _MM_PERM_BADC));
1040 table_four_i_5_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_05, k_16x16_0, tmpD_13, _MM_PERM_BADC));
1041 table_four_i_6_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_06, k_16x16_0, tmpD_14, _MM_PERM_BADC));
1042 table_four_i_7_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_07, k_16x16_0, tmpD_15, _MM_PERM_BADC));
1043 table_four_i_8_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_08, k_16x16_1, tmpD_00, _MM_PERM_BADC));
1044 table_four_i_9_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_09, k_16x16_1, tmpD_01, _MM_PERM_BADC));
1045 table_four_i_10_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_10, k_16x16_1, tmpD_02, _MM_PERM_BADC));
1046 table_four_i_11_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_11, k_16x16_1, tmpD_03, _MM_PERM_BADC));
1047 table_four_i_12_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_12, k_16x16_1, tmpD_04, _MM_PERM_BADC));
1048 table_four_i_13_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_13, k_16x16_1, tmpD_05, _MM_PERM_BADC));
1049 table_four_i_14_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_14, k_16x16_1, tmpD_06, _MM_PERM_BADC));
1050 table_four_i_15_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_15, k_16x16_1, tmpD_07, _MM_PERM_BADC));
// vdW interpolation coefficients: entry elements 0..3 scaled by A, 4..7 by B.
1057 __m512 vdw_d_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_0_vec), _mm512_mul_ps(B_vec, table_four_i_4_vec));
1058 __m512 vdw_c_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_1_vec), _mm512_mul_ps(B_vec, table_four_i_5_vec));
1059 __m512 vdw_b_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_2_vec), _mm512_mul_ps(B_vec, table_four_i_6_vec));
1060 __m512 vdw_a_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_3_vec), _mm512_mul_ps(B_vec, table_four_i_7_vec));
// Horner evaluation of the cubic in diffa: ((d/6*x + c/4)*x + b/2)*x + a,
// accumulated (subtracted) into the vdW energy.
1063 __m512 vdw_val_tmp_0_vec = _mm512_mul_ps(vdw_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
1064 __m512 vdw_val_tmp_1_vec = _mm512_mul_ps(vdw_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
1065 __m512 vdw_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_0_vec), vdw_val_tmp_1_vec);
1066 __m512 vdw_val_tmp_3_vec = _mm512_mul_ps(vdw_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
1067 __m512 vdw_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_2_vec), vdw_val_tmp_3_vec);
1068 __m512 vdw_val_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_4_vec), vdw_a_vec);
1069 CONTRIB_SUB_PS2PD(vdwEnergy_vec, vdw_val_vec);
// Electrostatic ("fast") coefficients: entry elements 8..11 scaled by kqq.
1075 __m512 fast_d_vec = _mm512_mul_ps(kqq_vec, table_four_i_8_vec);
1076 __m512 fast_c_vec = _mm512_mul_ps(kqq_vec, table_four_i_9_vec);
1077 __m512 fast_b_vec = _mm512_mul_ps(kqq_vec, table_four_i_10_vec);
1078 __m512 fast_a_vec = _mm512_mul_ps(kqq_vec, table_four_i_11_vec);
// MODIFIED pairs scale the charge product by (1 - modf_mod) instead.
1080 #if (0 MODIFIED(+1))
1081 __m512 modfckqq_vec = _mm512_mul_ps(_mm512_set_1to16_ps(1.0f - modf_mod), kqq_vec);
1082 __m512 fast_d_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_8_vec);
1083 __m512 fast_c_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_9_vec);
1084 __m512 fast_b_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_10_vec);
1085 __m512 fast_a_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_11_vec);
// Same Horner cubic for the electrostatic energy contribution.
1089 __m512 fast_val_tmp_0_vec = _mm512_mul_ps(fast_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
1090 __m512 fast_val_tmp_1_vec = _mm512_mul_ps(fast_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
1091 __m512 fast_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_0_vec), fast_val_tmp_1_vec);
1092 __m512 fast_val_tmp_3_vec = _mm512_mul_ps(fast_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
1093 __m512 fast_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_2_vec), fast_val_tmp_3_vec);
1094 __m512 fast_val_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_4_vec), fast_a_vec);
1095 CONTRIB_SUB_PS2PD(electEnergy_vec, fast_val_vec);
// For non-alchemical pairs, fold the vdW coefficients into the fast force.
1098 #if (0 NOT_ALCHPAIR(+1))
1099 fast_d_vec = _mm512_add_ps(fast_d_vec, vdw_d_vec);
1100 fast_c_vec = _mm512_add_ps(fast_c_vec, vdw_c_vec);
1101 fast_b_vec = _mm512_add_ps(fast_b_vec, vdw_b_vec);
1102 fast_a_vec = _mm512_add_ps(fast_a_vec, vdw_a_vec);
// Force magnitude over r: quadratic Horner (d*x + c)*x + b.
1105 __m512 fast_dir_tmp_0_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_d_vec), fast_c_vec);
1106 __m512 fast_dir_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_dir_tmp_0_vec), fast_b_vec);
1107 __m512 force_r_vec = fast_dir_vec;
// Per-component forces (zeroed outside cutoff) and PAIR-only virial accumulation.
1109 const __m512 tmp_x_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_x_vec);
1110 PAIR( CONTRIB_ADD_PS2PD(virial_xx_vec, _mm512_mul_ps(tmp_x_vec, p_ij_x_vec)); )
1111 PAIR( CONTRIB_ADD_PS2PD(virial_xy_vec, _mm512_mul_ps(tmp_x_vec, p_ij_y_vec)); )
1112 PAIR( CONTRIB_ADD_PS2PD(virial_xz_vec, _mm512_mul_ps(tmp_x_vec, p_ij_z_vec)); )
1114 const __m512 tmp_y_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_y_vec);
1115 PAIR( CONTRIB_ADD_PS2PD(virial_yy_vec, _mm512_mul_ps(tmp_y_vec, p_ij_y_vec)); )
1116 PAIR( CONTRIB_ADD_PS2PD(virial_yz_vec, _mm512_mul_ps(tmp_y_vec, p_ij_z_vec)); )
1118 const __m512 tmp_z_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_z_vec);
1119 PAIR( CONTRIB_ADD_PS2PD(virial_zz_vec, _mm512_mul_ps(tmp_z_vec, p_ij_z_vec)); )
// When forces are not combined with the full-electrostatics pass, apply now.
1121 #
if MIC_HANDCODE_FORCE_COMBINE_FORCES == 0
1122 APPLY_FORCES_PS2PD(tmp_x_vec, tmp_y_vec, tmp_z_vec, f_0_x, f_0_y, f_0_z, f_1_x, f_1_y, f_1_z, tmpI32, tmpJ32);
// EXCLUDED/MODIFIED pairs in the SHORT path also need the 4-float slow-table
// entries; load one entry per lane, transposed below into slow_i_{0..3}_vec.
1131 #if (0 SHORT( EXCLUDED(+1) MODIFIED(+1) ))
1133 __m512 slow_i_0_vec, slow_i_1_vec, slow_i_2_vec, slow_i_3_vec;
1136 unsigned int slow_i_offsets[16] __attribute__((aligned(64)));
1137 _mm512_store_epi32(slow_i_offsets, _mm512_slli_epi32(table_i_vec, 2));
// Loads the 4 floats of one slow-table entry into lanes 0-3 (mask 0x000F).
// NOTE(review): no #undef __LOAD_SLOW is visible in this chunk -- presumably below.
1142 #define __LOAD_SLOW(dstA, offset) \
1143 __m512i (dstA) = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x000F), slow_table + (offset));
1144 __LOAD_SLOW(slow_i_tmp00_vec, slow_i_offsets[ 0]);
1145 __LOAD_SLOW(slow_i_tmp01_vec, slow_i_offsets[ 1]);
1146 __LOAD_SLOW(slow_i_tmp02_vec, slow_i_offsets[ 2]);
1147 __LOAD_SLOW(slow_i_tmp03_vec, slow_i_offsets[ 3]);
1148 __LOAD_SLOW(slow_i_tmp04_vec, slow_i_offsets[ 4]);
1149 __LOAD_SLOW(slow_i_tmp05_vec, slow_i_offsets[ 5]);
1150 __LOAD_SLOW(slow_i_tmp06_vec, slow_i_offsets[ 6]);
1151 __LOAD_SLOW(slow_i_tmp07_vec, slow_i_offsets[ 7]);
1152 __LOAD_SLOW(slow_i_tmp08_vec, slow_i_offsets[ 8]);
1153 __LOAD_SLOW(slow_i_tmp09_vec, slow_i_offsets[ 9]);
1154 __LOAD_SLOW(slow_i_tmp10_vec, slow_i_offsets[10]);
1155 __LOAD_SLOW(slow_i_tmp11_vec, slow_i_offsets[11]);
1156 __LOAD_SLOW(slow_i_tmp12_vec, slow_i_offsets[12]);
1157 __LOAD_SLOW(slow_i_tmp13_vec, slow_i_offsets[13]);
1158 __LOAD_SLOW(slow_i_tmp14_vec, slow_i_offsets[14]);
1159 __LOAD_SLOW(slow_i_tmp15_vec, slow_i_offsets[15]);
// Helper macros for the 16x4 -> 4x16 transpose of the slow-table entries
// (same swizzle network as the table_four transpose above).
1164 #define __TRANSPOSE_2x2__(dstA, dstB, srcA, srcB) \
1165 __m512i dstA = _mm512_mask_swizzle_epi32((srcA), _mm512_int2mask(0xAAAA), (srcB), _MM_SWIZ_REG_CDAB); \
1166 __m512i dstB = _mm512_mask_swizzle_epi32((srcB), _mm512_int2mask(0x5555), (srcA), _MM_SWIZ_REG_CDAB);
1167 __TRANSPOSE_2x2__(slow_2x2_tmp00_vec, slow_2x2_tmp01_vec, slow_i_tmp00_vec, slow_i_tmp01_vec);
1168 __TRANSPOSE_2x2__(slow_2x2_tmp02_vec, slow_2x2_tmp03_vec, slow_i_tmp02_vec, slow_i_tmp03_vec);
1169 __TRANSPOSE_2x2__(slow_2x2_tmp04_vec, slow_2x2_tmp05_vec, slow_i_tmp04_vec, slow_i_tmp05_vec);
1170 __TRANSPOSE_2x2__(slow_2x2_tmp06_vec, slow_2x2_tmp07_vec, slow_i_tmp06_vec, slow_i_tmp07_vec);
1171 __TRANSPOSE_2x2__(slow_2x2_tmp08_vec, slow_2x2_tmp09_vec, slow_i_tmp08_vec, slow_i_tmp09_vec);
1172 __TRANSPOSE_2x2__(slow_2x2_tmp10_vec, slow_2x2_tmp11_vec, slow_i_tmp10_vec, slow_i_tmp11_vec);
1173 __TRANSPOSE_2x2__(slow_2x2_tmp12_vec, slow_2x2_tmp13_vec, slow_i_tmp12_vec, slow_i_tmp13_vec);
1174 __TRANSPOSE_2x2__(slow_2x2_tmp14_vec, slow_2x2_tmp15_vec, slow_i_tmp14_vec, slow_i_tmp15_vec);
1175 #undef __TRANSPOSE_2x2__
1176 #define __TRANSPOSE_4x4__(dstA, dstB, dstC, dstD, srcA, srcB, srcC, srcD) \
1177 __m512i dstA = _mm512_mask_swizzle_epi32((srcA), _mm512_int2mask(0xCCCC), (srcC), _MM_SWIZ_REG_BADC); \
1178 __m512i dstB = _mm512_mask_swizzle_epi32((srcB), _mm512_int2mask(0xCCCC), (srcD), _MM_SWIZ_REG_BADC); \
1179 __m512i dstC = _mm512_mask_swizzle_epi32((srcC), _mm512_int2mask(0x3333), (srcA), _MM_SWIZ_REG_BADC); \
1180 __m512i dstD = _mm512_mask_swizzle_epi32((srcD), _mm512_int2mask(0x3333), (srcB), _MM_SWIZ_REG_BADC);
1181 __TRANSPOSE_4x4__(slow_4x4_tmp00_vec, slow_4x4_tmp01_vec, slow_4x4_tmp02_vec, slow_4x4_tmp03_vec, slow_2x2_tmp00_vec, slow_2x2_tmp01_vec, slow_2x2_tmp02_vec, slow_2x2_tmp03_vec);
1182 __TRANSPOSE_4x4__(slow_4x4_tmp04_vec, slow_4x4_tmp05_vec, slow_4x4_tmp06_vec, slow_4x4_tmp07_vec, slow_2x2_tmp04_vec, slow_2x2_tmp05_vec, slow_2x2_tmp06_vec, slow_2x2_tmp07_vec);
1183 __TRANSPOSE_4x4__(slow_4x4_tmp08_vec, slow_4x4_tmp09_vec, slow_4x4_tmp10_vec, slow_4x4_tmp11_vec, slow_2x2_tmp08_vec, slow_2x2_tmp09_vec, slow_2x2_tmp10_vec, slow_2x2_tmp11_vec);
1184 __TRANSPOSE_4x4__(slow_4x4_tmp12_vec, slow_4x4_tmp13_vec, slow_4x4_tmp14_vec, slow_4x4_tmp15_vec, slow_2x2_tmp12_vec, slow_2x2_tmp13_vec, slow_2x2_tmp14_vec, slow_2x2_tmp15_vec);
// Fix: __TRANSPOSE_2x2__ was already undefined above (after its last use); the
// macro whose uses just ended is __TRANSPOSE_4x4__, so undefine that one to
// avoid leaking it (and risking a redefinition warning) past this scope.
1185 #undef __TRANSPOSE_4x4__
// Final permute stages of the slow-table transpose (lane-level exchanges).
1186 __m512i slow_4x8_tmp00_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp00_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp04_vec, _MM_PERM_CDAB);
1187 __m512i slow_4x8_tmp01_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp01_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp05_vec, _MM_PERM_CDAB);
1188 __m512i slow_4x8_tmp02_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp02_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp06_vec, _MM_PERM_CDAB);
1189 __m512i slow_4x8_tmp03_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp03_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp07_vec, _MM_PERM_CDAB);
1190 __m512i slow_4x8_tmp04_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp08_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp12_vec, _MM_PERM_CDAB);
1191 __m512i slow_4x8_tmp05_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp09_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp13_vec, _MM_PERM_CDAB);
1192 __m512i slow_4x8_tmp06_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp10_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp14_vec, _MM_PERM_CDAB);
1193 __m512i slow_4x8_tmp07_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp11_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp15_vec, _MM_PERM_CDAB);
1194 slow_i_0_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp00_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp04_vec, _MM_PERM_BADC));
1195 slow_i_1_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp01_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp05_vec, _MM_PERM_BADC));
1196 slow_i_2_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp02_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp06_vec, _MM_PERM_BADC));
1197 slow_i_3_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp03_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp07_vec, _MM_PERM_BADC));
1200 #endif // SHORT && (EXCLUDED || MODIFIED)
// Base slow-electrostatics coefficients: table elements 8..11 (NOSHORT) or
// 12..15 (SHORT) of the four-table entry.
1202 __m512 slow_d_vec =
NOSHORT( table_four_i_8_vec)
SHORT(table_four_i_12_vec);
1203 __m512 slow_c_vec =
NOSHORT( table_four_i_9_vec)
SHORT(table_four_i_13_vec);
1204 __m512 slow_b_vec =
NOSHORT(table_four_i_10_vec)
SHORT(table_four_i_14_vec);
1205 __m512 slow_a_vec =
NOSHORT(table_four_i_11_vec)
SHORT(table_four_i_15_vec);
// EXCLUDED pairs: add the slow-table correction (derivative factors 6,4,2,1)
// then remove the short-range table contribution (elements 12..15).
1207 #if (0 EXCLUDED(+1))
1209 slow_a_vec = _mm512_add_ps( slow_i_3_vec , slow_a_vec);
1210 slow_b_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(2.0f), slow_i_2_vec), slow_b_vec);
1211 slow_c_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(4.0f), slow_i_1_vec), slow_c_vec);
1212 slow_d_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(6.0f), slow_i_0_vec), slow_d_vec);
1215 slow_d_vec = _mm512_sub_ps(slow_d_vec, table_four_i_12_vec);
1216 slow_c_vec = _mm512_sub_ps(slow_c_vec, table_four_i_13_vec);
1217 slow_b_vec = _mm512_sub_ps(slow_b_vec, table_four_i_14_vec);
1218 slow_a_vec = _mm512_sub_ps(slow_a_vec, table_four_i_15_vec);
// MODIFIED pairs: same corrections, scaled by modf_mod.
1221 #if (0 MODIFIED(+1))
1223 slow_a_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps( modf_mod), slow_i_3_vec), slow_a_vec);
1224 slow_b_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(2.0f * modf_mod), slow_i_2_vec), slow_b_vec);
1225 slow_c_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(4.0f * modf_mod), slow_i_1_vec), slow_c_vec);
1226 slow_d_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(6.0f * modf_mod), slow_i_0_vec), slow_d_vec);
// Subtract the modf_mod-scaled short-range table contribution (elements 12..15).
// Fix: the original passed the scalar `modf_mod` to _mm512_mul_ps, which takes
// two __m512 operands and therefore cannot compile; the broadcast vector
// `modf_mod_vec` created on the line above was declared but never used. Use it.
1229 __m512 modf_mod_vec = _mm512_set_1to16_ps(modf_mod);
1230 slow_d_vec = _mm512_sub_ps(slow_d_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_12_vec));
1231 slow_c_vec = _mm512_sub_ps(slow_c_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_13_vec));
1232 slow_b_vec = _mm512_sub_ps(slow_b_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_14_vec));
1233 slow_a_vec = _mm512_sub_ps(slow_a_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_15_vec));
// Scale the slow coefficients by the charge product.
1236 slow_d_vec = _mm512_mul_ps(slow_d_vec, kqq_vec);
1237 slow_c_vec = _mm512_mul_ps(slow_c_vec, kqq_vec);
1238 slow_b_vec = _mm512_mul_ps(slow_b_vec, kqq_vec);
1239 slow_a_vec = _mm512_mul_ps(slow_a_vec, kqq_vec);
// Horner cubic in diffa (same scheme as the fast/vdW paths) -> full-elect energy.
1242 __m512 slow_val_tmp_0_vec = _mm512_mul_ps(slow_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
1243 __m512 slow_val_tmp_1_vec = _mm512_mul_ps(slow_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
1244 __m512 slow_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_0_vec), slow_val_tmp_1_vec);
1245 __m512 slow_val_tmp_3_vec = _mm512_mul_ps(slow_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
1246 __m512 slow_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_2_vec), slow_val_tmp_3_vec);
1247 __m512 slow_val_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_4_vec), slow_a_vec);
1248 CONTRIB_SUB_PS2PD(fullElectEnergy_vec, slow_val_vec);
// FAST+NOSHORT non-alch pairs fold the vdW coefficients into the slow force.
1251 #if (0 NOT_ALCHPAIR(FAST(NOSHORT(+1))))
1252 slow_d_vec = _mm512_add_ps(vdw_d_vec, slow_d_vec);
1253 slow_c_vec = _mm512_add_ps(vdw_c_vec, slow_c_vec);
1254 slow_b_vec = _mm512_add_ps(vdw_b_vec, slow_b_vec);
1255 slow_a_vec = _mm512_add_ps(vdw_a_vec, slow_a_vec);
// Full-electrostatics force magnitude and PAIR-only virial accumulation.
1258 __m512 slow_dir_tmp_0_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_d_vec), slow_c_vec);
1259 __m512 slow_dir_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_dir_tmp_0_vec), slow_b_vec);
1260 __m512 fullforce_r_vec = slow_dir_vec;
1262 const __m512 fulltmp_x_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_x_vec);
1263 PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xx_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_x_vec)); )
1264 PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xy_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_y_vec)); )
1265 PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xz_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_z_vec)); )
1267 const __m512 fulltmp_y_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_y_vec);
1268 PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_yy_vec, _mm512_mul_ps(fulltmp_y_vec, p_ij_y_vec)); )
1269 PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_yz_vec, _mm512_mul_ps(fulltmp_y_vec, p_ij_z_vec)); )
1271 const __m512 fulltmp_z_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_z_vec);
1272 PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_zz_vec, _mm512_mul_ps(fulltmp_z_vec, p_ij_z_vec)); )
// Apply full-electrostatics forces immediately when not combining passes.
1274 #
if MIC_HANDCODE_FORCE_COMBINE_FORCES == 0
1275 APPLY_FORCES_PS2PD(fulltmp_x_vec, fulltmp_y_vec, fulltmp_z_vec, fullf_0_x, fullf_0_y, fullf_0_z, fullf_1_x, fullf_1_y, fullf_1_z, tmpI32, tmpJ32);
1280 #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
// ---------------------------------------------------------------------------
// Debug-only verification block ("lfd" -- presumably "large force detection";
// TODO confirm naming).  Builds a 16-lane mask of suspicious interactions so
// they can be dumped below.  NOTE(review): the embedded original line numbers
// are discontinuous through this region (e.g. 1293-1294, 1298-1300, 1303
// missing), so some #if/#endif/closing-brace lines fall in extraction gaps.
// ---------------------------------------------------------------------------
1285 #if (MIC_DATA_STRUCT_VERIFY != 0) && (MIC_DATA_STRUCT_VERIFY_KERNELS != 0)
1286 const float lfd_const = 3.2e2f;
1287 __mmask16 lfd_mask = _mm512_int2mask(0x0000);
// LFD_CHECK(v): within cutoff lanes, flag lanes where |v| > lfd_const.  The
// absolute value is formed by multiplying only the negative lanes by -1.
1288 #define LFD_CHECK(v) _mm512_mask_cmplt_ps_mask(cutoff_mask, _mm512_set_1to16_ps(lfd_const), _mm512_mask_mul_ps(v, _mm512_cmplt_ps_mask(v, _mm512_setzero_ps()), v, _mm512_set_1to16_ps(-1.0f)))
// Active only when both FAST and SHORT expand (fast/short-range force path).
1289 #if (0 FAST(SHORT(+1)))
1290 lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_x_vec));
1291 lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_y_vec));
1292 lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_z_vec));
1295 lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_x_vec));
1296 lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_y_vec));
1297 lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_z_vec));
// Additionally flag one hard-coded atom-index pair (53839/53827) -- targeted
// debugging of a specific interaction; indices are build-specific.
1301 for (
int _k = 0; _k < 16; _k++) {
1302 lfd_mask = _mm512_kor(lfd_mask, _mm512_int2mask(((pExt_0[tmpI32[_k]].index == 53839 && pExt_1[tmpJ32[_k]].index == 53827) ? (0x1) : (0x0)) << _k));
// If any lane was flagged, spill state and print (scratch arrays follow).
1304 if (_mm512_mask2int(lfd_mask) != 0) {
// 64-byte-aligned scratch arrays (one float/int per vector lane) used to
// spill the kernel's vector state for the scalar debug printfs below.
// Forces and displacement vector:
1306 float tmp_force[16] __attribute__((aligned(64)));
1307 float tmp_fullforce[16] __attribute__((aligned(64)));
1308 float tmp_ijx[16] __attribute__((aligned(64)));
1309 float tmp_ijy[16] __attribute__((aligned(64)));
1310 float tmp_ijz[16] __attribute__((aligned(64)));
1311 float tmp_x[16] __attribute__((aligned(64)));
1312 float tmp_y[16] __attribute__((aligned(64)));
1313 float tmp_z[16] __attribute__((aligned(64)));
1314 float fulltmp_x[16] __attribute__((aligned(64)));
1315 float fulltmp_y[16] __attribute__((aligned(64)));
1316 float fulltmp_z[16] __attribute__((aligned(64)));
// Particle i / j positions and charges:
1317 float pi_x[16] __attribute__((aligned(64)));
1318 float pi_y[16] __attribute__((aligned(64)));
1319 float pi_z[16] __attribute__((aligned(64)));
1320 float pi_q[16] __attribute__((aligned(64)));
1321 float pj_x[16] __attribute__((aligned(64)));
1322 float pj_y[16] __attribute__((aligned(64)));
1323 float pj_z[16] __attribute__((aligned(64)));
1324 float pj_q[16] __attribute__((aligned(64)));
// Interpolation-table state (r^2, table index, kq_i*q_j, interpolation delta):
1325 float r2[16] __attribute__((aligned(64)));
1326 int table_i[16] __attribute__((aligned(64)));
1327 float kqq[16] __attribute__((aligned(64)));
1328 float diffa[16] __attribute__((aligned(64)));
// Cubic-interpolation coefficients for the vdW and fast (short-range elect)
// energy/force tables, plus LJ A/B parameters:
1329 float vdw_a[16] __attribute__((aligned(64)));
1330 float vdw_b[16] __attribute__((aligned(64)));
1331 float vdw_c[16] __attribute__((aligned(64)));
1332 float vdw_d[16] __attribute__((aligned(64)));
1333 float fast_a[16] __attribute__((aligned(64)));
1334 float fast_b[16] __attribute__((aligned(64)));
1335 float fast_c[16] __attribute__((aligned(64)));
1336 float fast_d[16] __attribute__((aligned(64)));
1337 float A[16] __attribute__((aligned(64)));
1338 float B[16] __attribute__((aligned(64)));
// Eight interleaved table_four entries per lane:
1339 float table_four_i_0[16] __attribute__((aligned(64)));
1340 float table_four_i_1[16] __attribute__((aligned(64)));
1341 float table_four_i_2[16] __attribute__((aligned(64)));
1342 float table_four_i_3[16] __attribute__((aligned(64)));
1343 float table_four_i_4[16] __attribute__((aligned(64)));
1344 float table_four_i_5[16] __attribute__((aligned(64)));
1345 float table_four_i_6[16] __attribute__((aligned(64)));
1346 float table_four_i_7[16] __attribute__((aligned(64)));
// vdW type indices of the i/j particles:
1347 int i_vdwType[16] __attribute__((aligned(64)));
1348 int j_vdwType[16] __attribute__((aligned(64)));
// Spill the live vector registers into the scratch arrays so the printf
// macro below can index individual lanes.
1350 _mm512_store_ps(pi_x, p_i_x_vec);
1351 _mm512_store_ps(pi_y, p_i_y_vec);
1352 _mm512_store_ps(pi_z, p_i_z_vec);
1353 _mm512_store_ps(pi_q, p_i_q_vec);
1354 _mm512_store_ps(pj_x, p_j_x_vec);
1355 _mm512_store_ps(pj_y, p_j_y_vec);
1356 _mm512_store_ps(pj_z, p_j_z_vec);
1357 _mm512_store_ps(pj_q, p_j_q_vec);
1358 _mm512_store_ps(r2, r2_vec);
1359 _mm512_store_epi32(table_i, table_i_vec);
1360 _mm512_store_ps(kqq, kqq_vec);
1361 _mm512_store_ps(diffa, diffa_vec);
1362 _mm512_store_ps(table_four_i_0, table_four_i_0_vec);
1363 _mm512_store_ps(table_four_i_1, table_four_i_1_vec);
1364 _mm512_store_ps(table_four_i_2, table_four_i_2_vec);
1365 _mm512_store_ps(table_four_i_3, table_four_i_3_vec);
1366 _mm512_store_ps(table_four_i_4, table_four_i_4_vec);
1367 _mm512_store_ps(table_four_i_5, table_four_i_5_vec);
1368 _mm512_store_ps(table_four_i_6, table_four_i_6_vec);
1369 _mm512_store_ps(table_four_i_7, table_four_i_7_vec);
// vdW/LJ state (real values branch).  NOTE(review): the next group rewrites
// the same arrays with zeros -- an enclosing #if/#else pair appears to have
// been lost in extraction (embedded line numbers jump 1370 and 1379).
1371 _mm512_store_ps(
vdw_a, vdw_a_vec);
1372 _mm512_store_ps(
vdw_b, vdw_b_vec);
1373 _mm512_store_ps(
vdw_c, vdw_c_vec);
1374 _mm512_store_ps(
vdw_d, vdw_d_vec);
1375 _mm512_store_ps(
A, A_vec);
1376 _mm512_store_ps(
B, B_vec);
1377 _mm512_store_epi64(i_vdwType, pExt_i_vdwType_vec);
1378 _mm512_store_epi64(j_vdwType, pExt_j_vdwType_vec);
// Zero-fill branch (compiled when the vdW vectors do not exist):
1380 _mm512_store_ps(
vdw_a, _mm512_setzero_ps());
1381 _mm512_store_ps(
vdw_b, _mm512_setzero_ps());
1382 _mm512_store_ps(
vdw_c, _mm512_setzero_ps());
1383 _mm512_store_ps(
vdw_d, _mm512_setzero_ps());
1384 _mm512_store_ps(
A, _mm512_setzero_ps());
1385 _mm512_store_ps(
B, _mm512_setzero_ps());
1386 _mm512_store_epi64(i_vdwType, _mm512_setzero_epi32());
1387 _mm512_store_epi64(j_vdwType, _mm512_setzero_epi32());
// Fast (short-range) force state, real values when FAST+SHORT are compiled:
1389 #if (0 FAST(SHORT(+1)))
1390 _mm512_store_ps(tmp_force, force_r_vec);
1391 _mm512_store_ps(tmp_x, tmp_x_vec);
1392 _mm512_store_ps(tmp_y, tmp_y_vec);
1393 _mm512_store_ps(tmp_z, tmp_z_vec);
1394 _mm512_store_ps(
fast_a, fast_a_vec);
1395 _mm512_store_ps(
fast_b, fast_b_vec);
1396 _mm512_store_ps(
fast_c, fast_c_vec);
1397 _mm512_store_ps(fast_d, fast_d_vec);
// NOTE(review): zero-fill #else branch; the "#else" itself (orig. line 1398)
// is in an extraction gap.
1399 _mm512_store_ps(tmp_force, _mm512_setzero_ps());
1400 _mm512_store_ps(tmp_x, _mm512_setzero_ps());
1401 _mm512_store_ps(tmp_y, _mm512_setzero_ps());
1402 _mm512_store_ps(tmp_z, _mm512_setzero_ps());
1403 _mm512_store_ps(
fast_a, _mm512_setzero_ps());
1404 _mm512_store_ps(
fast_b, _mm512_setzero_ps());
1405 _mm512_store_ps(
fast_c, _mm512_setzero_ps());
1406 _mm512_store_ps(fast_d, _mm512_setzero_ps());
// Full-electrostatics force state (real-values branch; its #if, orig. lines
// 1407-1408, is in an extraction gap):
1409 _mm512_store_ps(tmp_fullforce, fullforce_r_vec);
1410 _mm512_store_ps(fulltmp_x, fulltmp_x_vec);
1411 _mm512_store_ps(fulltmp_y, fulltmp_y_vec);
1412 _mm512_store_ps(fulltmp_z, fulltmp_z_vec);
// Zero-fill branch:
1414 _mm512_store_ps(tmp_fullforce, _mm512_setzero_ps());
1415 _mm512_store_ps(fulltmp_x, _mm512_setzero_ps());
1416 _mm512_store_ps(fulltmp_y, _mm512_setzero_ps());
1417 _mm512_store_ps(fulltmp_z, _mm512_setzero_ps());
// Displacement components (always available):
1419 _mm512_store_ps(tmp_ijx, p_ij_x_vec);
1420 _mm512_store_ps(tmp_ijy, p_ij_y_vec);
1421 _mm512_store_ps(tmp_ijz, p_ij_z_vec);
/* NOTE(review): body of a multi-line debug-print macro (presumably "#define PRNT(idx)"; its define line, orig. 1423, and the ");" terminators closing each printf fall in extraction gaps -- the embedded original line numbers are discontinuous).  Each printf dumps one lane's state for a flagged interaction.  All inserted comments here end with a backslash so macro line-splicing is preserved. */ \
1424 if (_mm512_mask2int(lfd_mask) & (1 << (idx))) { \
1425 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : tmp x:%+.3e, y:%+.3e, z:%+.3e : fulltmp x:%+.3e, y:%+.3e, z:%+.3e\n", \
1426 params.pe, params.timestep, \
1427 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1428 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1429 tmp_x[idx], tmp_y[idx], tmp_z[idx], \
1430 fulltmp_x[idx], fulltmp_y[idx], fulltmp_z[idx] \
1432 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : calc_" SELF("self") PAIR("pair") FULL("_fullelect") ENERGY("_energy") "." NORMAL("NORM") MODIFIED("MOD") EXCLUDED("EXCL") "\n", \
1433 params.pe, params.timestep, \
1434 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1435 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]) \
1437 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : i:%ld, j:%ld, i_index:%d, j_index:%d\n", \
1438 params.pe, params.timestep, \
1439 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1440 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1441 tmpI32[idx], tmpJ32[idx], pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index \
1443 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : p_i x:%+.3e, y:%+.3e, z:%+.3e, q:%+.3e : p_j x:%+.3e, y:%+.3e, z:%+.3e, q:%+.3e : r2 %+.3e\n", \
1444 params.pe, params.timestep, \
1445 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1446 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1447 pi_x[idx], pi_y[idx], pi_z[idx], pi_q[idx], \
1448 pj_x[idx], pj_y[idx], pj_z[idx], pj_q[idx], \
1451 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : offset x:%+.3e, y:%+.3e, z:%+.3e\n", \
1452 params.pe, params.timestep, \
1453 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1454 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1455 offset_x, offset_y, offset_z \
1457 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : forces f&s:%+.3e, full:%+.3e : p_ij x:%+.3e, y:%+.3e, z:%+.3e\n", \
1458 params.pe, params.timestep, \
1459 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1460 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1461 tmp_force[idx], tmp_fullforce[idx], \
1462 tmp_ijx[idx], tmp_ijy[idx], tmp_ijz[idx] \
1464 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : table_i:%d, kqq::%+.3e, diffa:%+.3e\n", \
1465 params.pe, params.timestep, \
1466 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1467 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1468 table_i[idx], kqq[idx], diffa[idx] \
1470 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : vdw a:%+.3e, b:%+.3e, c:%+.3e, d:%+.3e : fast a:%+.3e, b:%+.3e, c:%+.3e, d:%+.3e, \n", \
1471 params.pe, params.timestep, \
1472 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1473 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1474 vdw_a[idx], vdw_b[idx], vdw_c[idx], vdw_d[idx], \
1475 fast_a[idx], fast_b[idx], fast_c[idx], fast_d[idx] \
1477 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : A:%+.3e, B:%+.3e : vdwType i:%d, j:%d\n", \
1478 params.pe, params.timestep, \
1479 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1480 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1481 A[idx], B[idx], i_vdwType[idx], j_vdwType[idx] \
1483 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : table_four_i 0:%+.3e, 1:%+.3e, 2:%+.3e, 3:%+.3e, 4:%+.3e, 5:%+.3e, 6:%+.3e, 7:%+.3e\n", \
1484 params.pe, params.timestep, \
1485 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1486 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1487 table_four_i_0[idx], table_four_i_1[idx], table_four_i_2[idx], table_four_i_3[idx], table_four_i_4[idx], table_four_i_5[idx], table_four_i_6[idx], table_four_i_7[idx] \
/* Recompute the exclusion-bit lookup for this pair (mirrors the kernel's exclusion logic: signed index difference -> 2-bit flags in params.exclusion_bits) purely for the debug dump. */ \
1489 int indexDiff = pExt_0[tmpI32[idx]].index - pExt_1[tmpJ32[idx]].index; \
1490 int maxDiff = pExt_1[tmpJ32[idx]].excl_maxdiff; \
1491 int offset = -1, offset_major = -1, offset_minor = -1, flags = 0; \
1492 if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) { \
1493 offset = (2 * indexDiff) + pExt_1[tmpJ32[idx]].excl_index; \
1494 offset_major = offset / (sizeof(unsigned int) * 8); \
1495 offset_minor = offset % (sizeof(unsigned int) * 8); \
1496 flags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03; \
1498 printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d : excl: exclIndex:%d, indexDiff:%d, maxDiff:%d, offset:%d, flags:%d\n", \
1499 params.pe, params.timestep, \
1500 pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
1501 params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
1502 pExt_1[tmpJ32[idx]].excl_index, indexDiff, maxDiff, offset, flags \
/* Emit the debug dump for each of the 16 vector lanes. */ \
1505 PRNT( 0); PRNT( 1); PRNT( 2); PRNT( 3); PRNT( 4); PRNT( 5); PRNT( 6); PRNT( 7);
1506 PRNT( 8); PRNT( 9); PRNT(10); PRNT(11); PRNT(12); PRNT(13); PRNT(14); PRNT(15);
1509 #endif // (MIC_DATA_STRUCT_VERIFY != 0) && (MIC_DATA_STRUCT_VERIFY_KERNELS != 0)
// ---------------------------------------------------------------------------
// End of the vectorized loop: horizontally reduce the per-lane double
// accumulators into the kernel's scalar energy/virial outputs.
// NOTE(review): embedded original line numbers show gaps here (1513, 1522,
// 1530-1531, 1534-1535) -- likely the electEnergy reduction and the matching
// #endif lines, lost in extraction.
// ---------------------------------------------------------------------------
1512 FAST(
ENERGY( vdwEnergy += _mm512_reduce_add_pd(vdwEnergy_vec); ))
1514 FULL(
ENERGY( fullElectEnergy += _mm512_reduce_add_pd(fullElectEnergy_vec); ))
// Pair kernels with fast/short-range forces: reduce the short-range virial.
1515 #if (0 PAIR(FAST(SHORT(+1))))
1516 virial_xx += _mm512_reduce_add_pd(virial_xx_vec);
1517 virial_xy += _mm512_reduce_add_pd(virial_xy_vec);
1518 virial_xz += _mm512_reduce_add_pd(virial_xz_vec);
1519 virial_yy += _mm512_reduce_add_pd(virial_yy_vec);
1520 virial_yz += _mm512_reduce_add_pd(virial_yz_vec);
1521 virial_zz += _mm512_reduce_add_pd(virial_zz_vec);
// Pair kernels with full electrostatics: reduce the full-elect virial.
1523 #if (0 PAIR(FULL(+1)))
1524 fullElectVirial_xx += _mm512_reduce_add_pd(fullElectVirial_xx_vec);
1525 fullElectVirial_xy += _mm512_reduce_add_pd(fullElectVirial_xy_vec);
1526 fullElectVirial_xz += _mm512_reduce_add_pd(fullElectVirial_xz_vec);
1527 fullElectVirial_yy += _mm512_reduce_add_pd(fullElectVirial_yy_vec);
1528 fullElectVirial_yz += _mm512_reduce_add_pd(fullElectVirial_yz_vec);
1529 fullElectVirial_zz += _mm512_reduce_add_pd(fullElectVirial_zz_vec);
// Exclusion-count checksum (EXCLUDED/MODIFIED kernel variants only).
1532 #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
1533 params.exclusionSum += _mm512_reduce_add_epi32(exclusionSum_vec);
// Tear down the file-local helper macros (vector gather/scatter,
// packed-single -> packed-double contribution, and force application) so the
// next kernel variant generated from this header can redefine them cleanly.
1536 #undef GATHER_PS_I32_OFFSET
1537 #undef GATHER_PS_I32
1538 #undef GATHER_EPI32_I32_OFFSET
1539 #undef GATHER_EPI32_I32
1540 #undef SCATTER_INC_PD_I32_STEP
1541 #undef SCATTER_INC_PD_I32
1542 #undef SCATTER_INC_PS_I32_STEP
1543 #undef SCATTER_INC_PS_I32
1544 #undef CONTRIB_ADD_PS2PD
1545 #undef CONTRIB_SUB_PS2PD
1546 #undef APPLY_FORCES_PS2PD_STEP_ADD
1547 #undef APPLY_FORCES_PS2PD_STEP_SUB
1548 #undef APPLY_FORCES_PS2PD_STEP_ADD_COMBO
1549 #undef APPLY_FORCES_PS2PD_STEP_SUB_COMBO
1550 #undef APPLY_FORCES_PS2PD_JSTEP
1551 #undef APPLY_FORCES_PS2PD
// NOTE(review): everything below is mangled extraction residue -- register
// declarations stripped of their initializers/semicolons and a scalar
// fallback loop collapsed onto a single line (its "for(" header and several
// statements are missing).  Do not edit in place; restore this region from
// the upstream file.
register BigReal virial_xy
register BigReal virial_xz
register BigReal virial_yz
register BigReal electEnergy
register BigReal virial_yy
register BigReal virial_zz
register BigReal virial_xx
k< npairi;++k){TABENERGY(const int numtypes=simParams->tableNumTypes;const float table_spacing=simParams->tableSpacing;const int npertype=(int)(namdnearbyint(simParams->tableMaxDist/simParams->tableSpacing)+1);) int table_i=(r2iilist[2 *k] >> 14)+r2_delta_expc;const int j=pairlisti[k];#define p_j BigReal diffa=r2list[k]-r2_table[table_i];#define table_four_i TABENERGY(register const int tabtype=-1-(lj_pars->A< 0?lj_pars->A:0);) BigReal kqq=kq_i *p_j-> charge