ComputeNonbondedMICKernelBase2_handcode_single.h

#ifdef NAMD_MIC

  #define GATHER_PS_I32_OFFSET(v, p, i, o) \
  { \
    __mmask16 k0 = _mm512_int2mask(0x0008); \
    __mmask16 k1 = _mm512_int2mask(0x0080); \
    __mmask16 k2 = _mm512_int2mask(0x0800); \
    __mmask16 k3 = _mm512_int2mask(0x8000); \
    (v) = _mm512_mask_loadunpacklo_ps(                      _mm512_setzero_ps(), k3, &((p)[(i)[13] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k2, &((p)[(i)[ 9] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k1, &((p)[(i)[ 5] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k0, &((p)[(i)[ 1] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[12] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k2, &((p)[(i)[ 8] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k1, &((p)[(i)[ 4] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k0, &((p)[(i)[ 0] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_BADC), k3, &((p)[(i)[14] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k2, &((p)[(i)[10] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k1, &((p)[(i)[ 6] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k0, &((p)[(i)[ 2] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(_mm512_swizzle_ps((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[15] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k2, &((p)[(i)[11] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k1, &((p)[(i)[ 7] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_ps(                                      (v), k0, &((p)[(i)[ 3] + (o)])); \
  }
  #define GATHER_PS_I32(v, p, i)  GATHER_PS_I32_OFFSET((v), (p), (i), 0)


  #define GATHER_EPI32_I32_OFFSET(v, p, i, o) \
  { \
    __mmask16 k0 = _mm512_int2mask(0x0008); \
    __mmask16 k1 = _mm512_int2mask(0x0080); \
    __mmask16 k2 = _mm512_int2mask(0x0800); \
    __mmask16 k3 = _mm512_int2mask(0x8000); \
    (v) = _mm512_mask_loadunpacklo_epi32(                      _mm512_setzero_epi32(), k3, &((p)[(i)[13] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k2, &((p)[(i)[ 9] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k1, &((p)[(i)[ 5] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k0, &((p)[(i)[ 1] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[12] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k2, &((p)[(i)[ 8] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k1, &((p)[(i)[ 4] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k0, &((p)[(i)[ 0] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_BADC), k3, &((p)[(i)[14] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k2, &((p)[(i)[10] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k1, &((p)[(i)[ 6] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k0, &((p)[(i)[ 2] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(_mm512_swizzle_epi32((v), _MM_SWIZ_REG_CDAB), k3, &((p)[(i)[15] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k2, &((p)[(i)[11] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k1, &((p)[(i)[ 7] + (o)])); \
    (v) = _mm512_mask_loadunpacklo_epi32(                                         (v), k0, &((p)[(i)[ 3] + (o)])); \
  }
  #define GATHER_EPI32_I32(v, p, i)  GATHER_EPI32_I32_OFFSET((v), (p), (i), 0)

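  // NOTE - Rough scalar equivalent of the two gather macros above, which emulate
  //   a 16-element gather by chaining masked loadunpacklo loads with lane
  //   swizzles.  Sketch only; the MIC_KERNEL_REFERENCE_SKETCH guard is
  //   illustrative and not part of any build configuration.
  #if defined(MIC_KERNEL_REFERENCE_SKETCH)
    static inline void gather_ps_i32_offset_sketch(float v[16], const float *p,
                                                   const int *i, int o) {
      for (int n = 0; n < 16; n++) { v[n] = p[i[n] + o]; }  // one element per lane
    }
    // GATHER_EPI32_I32_OFFSET follows the same pattern over 32-bit integers.
  #endif
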
  #define SCATTER_INC_PD_I32_STEP(v, p, i, k, idx, pattern) \
    t = _mm512_mask_loadunpacklo_pd(t, (k), &((p)[(i)[(idx)]])); \
    t = _mm512_mask_add_pd(t, (k), _mm512_swizzle_pd((v), (pattern)), t); \
    _mm512_mask_packstorelo_pd(&((p)[(i)[(idx)]]), (k), t);

  #define SCATTER_INC_PD_I32(v, p, i) \
  { \
    __mmask16 k0 = _mm512_int2mask(0x02); \
    __mmask16 k1 = _mm512_int2mask(0x20); \
    __m512d t = _mm512_setzero_pd(); \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k0,  0, _MM_SWIZ_REG_CDAB) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k0,  1, _MM_SWIZ_REG_DCBA) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k0,  2, _MM_SWIZ_REG_DACB) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k0,  3, _MM_SWIZ_REG_BADC) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k1,  4, _MM_SWIZ_REG_CDAB) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k1,  5, _MM_SWIZ_REG_DCBA) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k1,  6, _MM_SWIZ_REG_DACB) \
    SCATTER_INC_PD_I32_STEP((v), (p), (i), k1,  7, _MM_SWIZ_REG_BADC) \
  }


  #define SCATTER_INC_PS_I32_STEP(v, p, i, k, idx, pattern) \
    t = _mm512_mask_loadunpacklo_ps(t, (k), &((p)[(i)[(idx)]])); \
    t = _mm512_mask_add_ps(t, (k), _mm512_swizzle_ps((v), (pattern)), t); \
    _mm512_mask_packstorelo_ps(&((p)[(i)[(idx)]]), (k), t);

  #define SCATTER_INC_PS_I32(v, p, i) \
  { \
    __mmask16 k0 = _mm512_int2mask(0x0008); \
    __mmask16 k1 = _mm512_int2mask(0x0080); \
    __mmask16 k2 = _mm512_int2mask(0x0800); \
    __mmask16 k3 = _mm512_int2mask(0x8000); \
    __m512 t = _mm512_setzero_ps(); \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k0,  0, _MM_SWIZ_REG_CDAB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k0,  1, _MM_SWIZ_REG_DCBA) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k0,  2, _MM_SWIZ_REG_DACB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k0,  3, _MM_SWIZ_REG_BADC) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k1,  4, _MM_SWIZ_REG_CDAB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k1,  5, _MM_SWIZ_REG_DCBA) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k1,  6, _MM_SWIZ_REG_DACB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k1,  7, _MM_SWIZ_REG_BADC) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k2,  8, _MM_SWIZ_REG_CDAB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k2,  9, _MM_SWIZ_REG_DCBA) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 10, _MM_SWIZ_REG_DACB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k2, 11, _MM_SWIZ_REG_BADC) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 12, _MM_SWIZ_REG_CDAB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 13, _MM_SWIZ_REG_DCBA) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 14, _MM_SWIZ_REG_DACB) \
    SCATTER_INC_PS_I32_STEP((v), (p), (i), k3, 15, _MM_SWIZ_REG_BADC) \
  }
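
  // NOTE - The SCATTER_INC macros above perform a masked read-modify-write
  //   scatter: each lane's value is added into memory at that lane's index.
  //   Rough scalar sketch (illustrative guard only, as above):
  #if defined(MIC_KERNEL_REFERENCE_SKETCH)
    static inline void scatter_inc_pd_i32_sketch(const double v[8], double *p,
                                                 const int *i) {
      for (int n = 0; n < 8; n++) { p[i[n]] += v[n]; }  // load, add, store back
    }
    // SCATTER_INC_PS_I32 is the same pattern over all 16 single-precision lanes.
  #endif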

  // DMK - NOTE - The instructions for these macros are the way they are so
  //   values are converted to double before accumulating (e.g. rather than
  //   sum the floats, convert, and then accumulate).
  #define CONTRIB_ADD_PS2PD(pd_vec, ps_vec) \
  { __m512 ps_tmp_vec = (ps_vec); \
    (pd_vec) = _mm512_add_pd((pd_vec), _mm512_cvtpslo_pd(ps_tmp_vec)); \
    (pd_vec) = _mm512_add_pd((pd_vec), _mm512_cvtpslo_pd(_mm512_permute4f128_ps(ps_tmp_vec, _MM_PERM_BADC))); \
  }
  #define CONTRIB_SUB_PS2PD(pd_vec, ps_vec) \
  { __m512 ps_tmp_vec = (ps_vec); \
    (pd_vec) = _mm512_sub_pd((pd_vec), _mm512_cvtpslo_pd(ps_tmp_vec)); \
    (pd_vec) = _mm512_sub_pd((pd_vec), _mm512_cvtpslo_pd(_mm512_permute4f128_ps(ps_tmp_vec, _MM_PERM_BADC))); \
  }
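
  // NOTE - Rough scalar sketch of CONTRIB_ADD_PS2PD (CONTRIB_SUB_PS2PD is the
  //   same with '-='), showing that each float is widened to double before it
  //   is accumulated, per the note above (illustrative guard only):
  #if defined(MIC_KERNEL_REFERENCE_SKETCH)
    static inline void contrib_add_ps2pd_sketch(double pd[8], const float ps[16]) {
      for (int n = 0; n < 8; n++) {
        pd[n] += (double)ps[n];      // low 8 lanes (cvtpslo)
        pd[n] += (double)ps[n + 8];  // high 8 lanes (permute4f128 then cvtpslo)
      }
    }
  #endif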


  #define APPLY_FORCES_PS2PD_STEP_ADD(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k) \
    tx = _mm512_mask_loadunpacklo_pd(tx, (k), (f_x) + (i)); \
    ty = _mm512_mask_loadunpacklo_pd(ty, (k), (f_y) + (i)); \
    tz = _mm512_mask_loadunpacklo_pd(tz, (k), (f_z) + (i)); \
    tx = _mm512_mask_add_pd(tx, (k), tx, _mm512_swizzle_pd((v_x), (p))); \
    ty = _mm512_mask_add_pd(ty, (k), ty, _mm512_swizzle_pd((v_y), (p))); \
    tz = _mm512_mask_add_pd(tz, (k), tz, _mm512_swizzle_pd((v_z), (p))); \
    _mm512_mask_packstorelo_pd((f_x) + (i), (k), tx); \
    _mm512_mask_packstorelo_pd((f_y) + (i), (k), ty); \
    _mm512_mask_packstorelo_pd((f_z) + (i), (k), tz);

  #define APPLY_FORCES_PS2PD_STEP_SUB(f_x, f_y, f_z, i, v_x, v_y, v_z, p, k) \
    tx = _mm512_mask_loadunpacklo_pd(tx, (k), (f_x) + (i)); \
    ty = _mm512_mask_loadunpacklo_pd(ty, (k), (f_y) + (i)); \
    tz = _mm512_mask_loadunpacklo_pd(tz, (k), (f_z) + (i)); \
    tx = _mm512_mask_sub_pd(tx, (k), tx, _mm512_swizzle_pd((v_x), (p))); \
    ty = _mm512_mask_sub_pd(ty, (k), ty, _mm512_swizzle_pd((v_y), (p))); \
    tz = _mm512_mask_sub_pd(tz, (k), tz, _mm512_swizzle_pd((v_z), (p))); \
    _mm512_mask_packstorelo_pd((f_x) + (i), (k), tx); \
    _mm512_mask_packstorelo_pd((f_y) + (i), (k), ty); \
    _mm512_mask_packstorelo_pd((f_z) + (i), (k), tz);
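
  // NOTE - Each APPLY_FORCES_PS2PD_STEP_{ADD,SUB} invocation updates one atom's
  //   x/y/z force: it loads f_{x,y,z}[i], adds (or subtracts) the lane of
  //   v_{x,y,z} selected by the mask/swizzle pair, and packs the result back.
  //   Rough scalar sketch of one ADD step (illustrative guard only):
  #if defined(MIC_KERNEL_REFERENCE_SKETCH)
    static inline void apply_forces_step_add_sketch(double *f_x, double *f_y,
                                                    double *f_z, int i,
                                                    double vx, double vy, double vz) {
      f_x[i] += vx;  f_y[i] += vy;  f_z[i] += vz;  // one interaction's contribution
    }
  #endif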

  //#if MIC_PAD_PLGEN != 0
  #if __MIC_PAD_PLGEN_CTRL != 0

    #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0

      #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0

        #define APPLY_FORCES_PS2PD_JSTEP(j_lo, j_hi, v, fv) \
        { \
          FAST(SHORT(  v_tmp = _mm512_mask_loadunpacklo_pd( v_tmp, k_lo,     f_1 + tmpJ32[(j_lo)]); )) \
          FULL(       fv_tmp = _mm512_mask_loadunpacklo_pd(fv_tmp, k_lo, fullf_1 + tmpJ32[(j_lo)]);  ) \
          FAST(SHORT(  v_tmp = _mm512_sub_pd( v_tmp, ( v)); )) \
          FULL(       fv_tmp = _mm512_sub_pd(fv_tmp, (fv));  ) \
          FAST(SHORT(          _mm512_mask_packstorelo_pd(    f_1 + tmpJ32[(j_lo)], k_lo,  v_tmp); )) \
          FULL(                _mm512_mask_packstorelo_pd(fullf_1 + tmpJ32[(j_lo)], k_lo, fv_tmp);  ) \
          FAST(SHORT(  v_tmp = _mm512_mask_loadunpacklo_pd( v_tmp, k_hi,     f_1 + tmpJ32[(j_hi)]); )) \
          FULL(       fv_tmp = _mm512_mask_loadunpacklo_pd(fv_tmp, k_hi, fullf_1 + tmpJ32[(j_hi)]);  ) \
          FAST(SHORT(  v_tmp = _mm512_sub_pd( v_tmp, ( v)); )) \
          FULL(       fv_tmp = _mm512_sub_pd(fv_tmp, (fv));  ) \
          FAST(SHORT(          _mm512_mask_packstorelo_pd(    f_1 + tmpJ32[(j_hi)], k_hi,  v_tmp); )) \
          FULL(                _mm512_mask_packstorelo_pd(fullf_1 + tmpJ32[(j_hi)], k_hi, fv_tmp);  ) \
        }

        #define APPLY_FORCES_PS2PD \
          { \
          /* Set 'w' values (exclusion checksum) to -1.0f when counting so 'sub' operation for x,y,z,w actually adds 1.0f */ \
          FAST(SHORT( __m512     tmp_w_vec = _mm512_mask_mov_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_set_1to16_ps(0.0f MODIFIED(- 1.0f) EXCLUDED(- 1.0f))); )) \
          FULL(       __m512 fulltmp_w_vec = _mm512_setzero_ps();  ) \
          /* Transpose the values so that each set of 4 lanes has x,y,z,w values for a given interaction. */ \
          /*   NOTE: This rearranges the values via a 4x4 transpose.                                      */ \
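          /*   After the 2x2 and 4x4 swizzle passes below, tmp_<k>_vec (k = 0..3) holds the x,y,z,w quad  */ \
          /*   for interactions k, k+4, k+8, and k+12 (one interaction per group of 4 lanes).             */ \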
          __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA); \
          __mmask16 k_2x2_1 = _mm512_int2mask(0x5555); \
          FAST(SHORT( __m512i tmp_a0 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(    tmp_x_vec), k_2x2_0, _mm512_castps_si512(    tmp_y_vec), _MM_SWIZ_REG_CDAB); )) \
          FAST(SHORT( __m512i tmp_a1 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(    tmp_y_vec), k_2x2_1, _mm512_castps_si512(    tmp_x_vec), _MM_SWIZ_REG_CDAB); )) \
          FAST(SHORT( __m512i tmp_a2 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(    tmp_z_vec), k_2x2_0, _mm512_castps_si512(    tmp_w_vec), _MM_SWIZ_REG_CDAB); )) \
          FAST(SHORT( __m512i tmp_a3 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(    tmp_w_vec), k_2x2_1, _mm512_castps_si512(    tmp_z_vec), _MM_SWIZ_REG_CDAB); )) \
          FULL(       __m512i tmp_b0 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_x_vec), k_2x2_0, _mm512_castps_si512(fulltmp_y_vec), _MM_SWIZ_REG_CDAB);  ) \
          FULL(       __m512i tmp_b1 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_y_vec), k_2x2_1, _mm512_castps_si512(fulltmp_x_vec), _MM_SWIZ_REG_CDAB);  ) \
          FULL(       __m512i tmp_b2 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_z_vec), k_2x2_0, _mm512_castps_si512(fulltmp_w_vec), _MM_SWIZ_REG_CDAB);  ) \
          FULL(       __m512i tmp_b3 = _mm512_mask_swizzle_epi32(_mm512_castps_si512(fulltmp_w_vec), k_2x2_1, _mm512_castps_si512(fulltmp_z_vec), _MM_SWIZ_REG_CDAB);  ) \
          __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC); \
          __mmask16 k_4x4_1 = _mm512_int2mask(0x3333); \
          FAST(SHORT( __m512     tmp_0_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a0, k_4x4_0, tmp_a2, _MM_SWIZ_REG_BADC)); )) \
          FAST(SHORT( __m512     tmp_1_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a1, k_4x4_0, tmp_a3, _MM_SWIZ_REG_BADC)); )) \
          FAST(SHORT( __m512     tmp_2_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a2, k_4x4_1, tmp_a0, _MM_SWIZ_REG_BADC)); )) \
          FAST(SHORT( __m512     tmp_3_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_a3, k_4x4_1, tmp_a1, _MM_SWIZ_REG_BADC)); )) \
          FULL(       __m512 fulltmp_0_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b0, k_4x4_0, tmp_b2, _MM_SWIZ_REG_BADC));  ) \
          FULL(       __m512 fulltmp_1_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b1, k_4x4_0, tmp_b3, _MM_SWIZ_REG_BADC));  ) \
          FULL(       __m512 fulltmp_2_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b2, k_4x4_1, tmp_b0, _MM_SWIZ_REG_BADC));  ) \
          FULL(       __m512 fulltmp_3_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b3, k_4x4_1, tmp_b1, _MM_SWIZ_REG_BADC));  ) \
          /* Convert the floats to doubles.  NOTE: This must be done prior to any math because of the difference in magnitudes. */ \
          FAST(SHORT( __m512d  v_0_lo = _mm512_cvtpslo_pd(                           tmp_0_vec                ); )) \
          FAST(SHORT( __m512d  v_0_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_0_vec, _MM_PERM_BADC)); )) \
          FAST(SHORT( __m512d  v_1_lo = _mm512_cvtpslo_pd(                           tmp_1_vec                ); )) \
          FAST(SHORT( __m512d  v_1_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_1_vec, _MM_PERM_BADC)); )) \
          FAST(SHORT( __m512d  v_2_lo = _mm512_cvtpslo_pd(                           tmp_2_vec                ); )) \
          FAST(SHORT( __m512d  v_2_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_2_vec, _MM_PERM_BADC)); )) \
          FAST(SHORT( __m512d  v_3_lo = _mm512_cvtpslo_pd(                           tmp_3_vec                ); )) \
          FAST(SHORT( __m512d  v_3_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_3_vec, _MM_PERM_BADC)); )) \
          FULL(       __m512d fv_0_lo = _mm512_cvtpslo_pd(                       fulltmp_0_vec                );  ) \
          FULL(       __m512d fv_0_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_0_vec, _MM_PERM_BADC));  ) \
          FULL(       __m512d fv_1_lo = _mm512_cvtpslo_pd(                       fulltmp_1_vec                );  ) \
          FULL(       __m512d fv_1_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_1_vec, _MM_PERM_BADC));  ) \
          FULL(       __m512d fv_2_lo = _mm512_cvtpslo_pd(                       fulltmp_2_vec                );  ) \
          FULL(       __m512d fv_2_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_2_vec, _MM_PERM_BADC));  ) \
          FULL(       __m512d fv_3_lo = _mm512_cvtpslo_pd(                       fulltmp_3_vec                );  ) \
          FULL(       __m512d fv_3_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_3_vec, _MM_PERM_BADC));  ) \
          /* Apply the forces to the 'j' atoms. */ \
          __mmask16 k_lo = _mm512_int2mask(0x000F); \
          __mmask16 k_hi = _mm512_int2mask(0x00F0); \
          __m512d  v_tmp = _mm512_setzero_pd(); \
          __m512d fv_tmp = _mm512_setzero_pd(); \
          APPLY_FORCES_PS2PD_JSTEP( 0,  4, v_0_lo, fv_0_lo); \
          APPLY_FORCES_PS2PD_JSTEP( 1,  5, v_1_lo, fv_1_lo); \
          APPLY_FORCES_PS2PD_JSTEP( 2,  6, v_2_lo, fv_2_lo); \
          APPLY_FORCES_PS2PD_JSTEP( 3,  7, v_3_lo, fv_3_lo); \
          APPLY_FORCES_PS2PD_JSTEP( 8, 12, v_0_hi, fv_0_hi); \
          APPLY_FORCES_PS2PD_JSTEP( 9, 13, v_1_hi, fv_1_hi); \
          APPLY_FORCES_PS2PD_JSTEP(10, 14, v_2_hi, fv_2_hi); \
          APPLY_FORCES_PS2PD_JSTEP(11, 15, v_3_hi, fv_3_hi); \
          /* Reduce (add) the forces into a single force vector and apply it to the 'i' atom. */ \
          FAST(SHORT( __m512d  v_tmp0_xyzw = _mm512_add_pd( v_0_lo,  v_0_hi); )) \
          FULL(       __m512d fv_tmp0_xyzw = _mm512_add_pd(fv_0_lo, fv_0_hi);  ) \
          FAST(SHORT( __m512d  v_tmp1_xyzw = _mm512_add_pd( v_1_lo,  v_1_hi); )) \
          FULL(       __m512d fv_tmp1_xyzw = _mm512_add_pd(fv_1_lo, fv_1_hi);  ) \
          FAST(SHORT( __m512d  v_tmp2_xyzw = _mm512_add_pd( v_2_lo,  v_2_hi); )) \
          FULL(       __m512d fv_tmp2_xyzw = _mm512_add_pd(fv_2_lo, fv_2_hi);  ) \
          FAST(SHORT( __m512d  v_tmp3_xyzw = _mm512_add_pd( v_3_lo,  v_3_hi); )) \
          FULL(       __m512d fv_tmp3_xyzw = _mm512_add_pd(fv_3_lo, fv_3_hi);  ) \
          FAST(SHORT( __m512d  v_tmp4_xyzw = _mm512_add_pd( v_tmp0_xyzw,  v_tmp1_xyzw); )) \
          FULL(       __m512d fv_tmp4_xyzw = _mm512_add_pd(fv_tmp0_xyzw, fv_tmp1_xyzw);  ) \
          FAST(SHORT( __m512d  v_tmp5_xyzw = _mm512_add_pd( v_tmp2_xyzw,  v_tmp3_xyzw); )) \
          FULL(       __m512d fv_tmp5_xyzw = _mm512_add_pd(fv_tmp2_xyzw, fv_tmp3_xyzw);  ) \
          FAST(SHORT( __m512d  v_tmp6_xyzw = _mm512_add_pd( v_tmp4_xyzw,  v_tmp5_xyzw); )) \
          FULL(       __m512d fv_tmp6_xyzw = _mm512_add_pd(fv_tmp4_xyzw, fv_tmp5_xyzw);  ) \
          FAST(SHORT( __m512d  v_xyzw = _mm512_add_pd( v_tmp6_xyzw, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512( v_tmp6_xyzw), _MM_PERM_BADC))); )) \
          FULL(       __m512d fv_xyzw = _mm512_add_pd(fv_tmp6_xyzw, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(fv_tmp6_xyzw), _MM_PERM_BADC)));  ) \
          FAST(SHORT(MODIFIED(  v_xyzw = _mm512_mask_sub_pd( v_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(),  v_xyzw); ))) /* NOTE: w *= -1.0 for add operation below */ \
          FAST(SHORT(EXCLUDED(  v_xyzw = _mm512_mask_sub_pd( v_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(),  v_xyzw); ))) \
          FULL(MODIFIED(       fv_xyzw = _mm512_mask_sub_pd(fv_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), fv_xyzw);  )) \
          FULL(EXCLUDED(       fv_xyzw = _mm512_mask_sub_pd(fv_xyzw, _mm512_int2mask(0x88), _mm512_setzero_pd(), fv_xyzw);  )) \
          int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
          uintptr_t iTest_i = tmpI32[iTest_index]; \
          FAST(SHORT( __m512d  i_xyzw = _mm512_mask_loadunpacklo_pd(_mm512_setzero_pd(), k_lo,     f_0 + iTest_i); )) \
          FULL(       __m512d fi_xyzw = _mm512_mask_loadunpacklo_pd(_mm512_setzero_pd(), k_lo, fullf_0 + iTest_i);  ) \
          FAST(SHORT(          i_xyzw = _mm512_mask_add_pd( i_xyzw, k_lo,  i_xyzw,  v_xyzw); )) \
          FULL(               fi_xyzw = _mm512_mask_add_pd(fi_xyzw, k_lo, fi_xyzw, fv_xyzw);  ) \
          FAST(SHORT(                   _mm512_mask_packstorelo_pd(    f_0 + iTest_i, k_lo,  i_xyzw); )) \
          FULL(                         _mm512_mask_packstorelo_pd(fullf_0 + iTest_i, k_lo, fi_xyzw);  ) \
          }

      #else // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0

        #define APPLY_FORCES_PS2PD \
        { \
          FAST(SHORT( __m512d  v_x_lo = _mm512_cvtpslo_pd(                           tmp_x_vec                ); )) \
          FAST(SHORT( __m512d  v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_x_vec, _MM_PERM_BADC)); )) \
          FAST(SHORT( __m512d  v_y_lo = _mm512_cvtpslo_pd(                           tmp_y_vec                ); )) \
          FAST(SHORT( __m512d  v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_y_vec, _MM_PERM_BADC)); )) \
          FAST(SHORT( __m512d  v_z_lo = _mm512_cvtpslo_pd(                           tmp_z_vec                ); )) \
          FAST(SHORT( __m512d  v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_z_vec, _MM_PERM_BADC)); )) \
          FULL(       __m512d fv_x_lo = _mm512_cvtpslo_pd(                       fulltmp_x_vec                );  ) \
          FULL(       __m512d fv_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_x_vec, _MM_PERM_BADC));  ) \
          FULL(       __m512d fv_y_lo = _mm512_cvtpslo_pd(                       fulltmp_y_vec                );  ) \
          FULL(       __m512d fv_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_y_vec, _MM_PERM_BADC));  ) \
          FULL(       __m512d fv_z_lo = _mm512_cvtpslo_pd(                       fulltmp_z_vec                );  ) \
          FULL(       __m512d fv_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_z_vec, _MM_PERM_BADC));  ) \
          int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
          uintptr_t iTest_i = tmpI32[iTest_index]; \
          __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
          __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
          FAST(SHORT(     f_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_x_hi,  v_x_lo)); )) \
          FAST(SHORT(     f_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_y_hi,  v_y_lo)); )) \
          FAST(SHORT(     f_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_z_hi,  v_z_lo)); )) \
          FULL(       fullf_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_x_hi, fv_x_lo));  ) \
          FULL(       fullf_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_y_hi, fv_y_lo));  ) \
          FULL(       fullf_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_z_hi, fv_z_lo));  ) \
          __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
          __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
          FAST(SHORT( __m512d  f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec,     f_1_x, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec,     f_1_y, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec,     f_1_z, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec,     f_1_x, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec,     f_1_y, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec,     f_1_z, _MM_SCALE_8); )) \
          FULL(       __m512d ff_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_x, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_y, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_z, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_x, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_y, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_z, _MM_SCALE_8);  ) \
          FAST(SHORT(  f_j_x_lo_vec = _mm512_mask_sub_pd( f_j_x_lo_vec, cutoff_lo,  f_j_x_lo_vec,  v_x_lo); )) \
          FAST(SHORT(  f_j_y_lo_vec = _mm512_mask_sub_pd( f_j_y_lo_vec, cutoff_lo,  f_j_y_lo_vec,  v_y_lo); )) \
          FAST(SHORT(  f_j_z_lo_vec = _mm512_mask_sub_pd( f_j_z_lo_vec, cutoff_lo,  f_j_z_lo_vec,  v_z_lo); )) \
          FAST(SHORT(  f_j_x_hi_vec = _mm512_mask_sub_pd( f_j_x_hi_vec, cutoff_hi,  f_j_x_hi_vec,  v_x_hi); )) \
          FAST(SHORT(  f_j_y_hi_vec = _mm512_mask_sub_pd( f_j_y_hi_vec, cutoff_hi,  f_j_y_hi_vec,  v_y_hi); )) \
          FAST(SHORT(  f_j_z_hi_vec = _mm512_mask_sub_pd( f_j_z_hi_vec, cutoff_hi,  f_j_z_hi_vec,  v_z_hi); )) \
          FULL(       ff_j_x_lo_vec = _mm512_mask_sub_pd(ff_j_x_lo_vec, cutoff_lo, ff_j_x_lo_vec, fv_x_lo);  ) \
          FULL(       ff_j_y_lo_vec = _mm512_mask_sub_pd(ff_j_y_lo_vec, cutoff_lo, ff_j_y_lo_vec, fv_y_lo);  ) \
          FULL(       ff_j_z_lo_vec = _mm512_mask_sub_pd(ff_j_z_lo_vec, cutoff_lo, ff_j_z_lo_vec, fv_z_lo);  ) \
          FULL(       ff_j_x_hi_vec = _mm512_mask_sub_pd(ff_j_x_hi_vec, cutoff_hi, ff_j_x_hi_vec, fv_x_hi);  ) \
          FULL(       ff_j_y_hi_vec = _mm512_mask_sub_pd(ff_j_y_hi_vec, cutoff_hi, ff_j_y_hi_vec, fv_y_hi);  ) \
          FULL(       ff_j_z_hi_vec = _mm512_mask_sub_pd(ff_j_z_hi_vec, cutoff_hi, ff_j_z_hi_vec, fv_z_hi);  ) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_x, cutoff_lo, j_lo_vec,  f_j_x_lo_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_y, cutoff_lo, j_lo_vec,  f_j_y_lo_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_z, cutoff_lo, j_lo_vec,  f_j_z_lo_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_x, cutoff_hi, j_hi_vec,  f_j_x_hi_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_y, cutoff_hi, j_hi_vec,  f_j_y_hi_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_z, cutoff_hi, j_hi_vec,  f_j_z_hi_vec, _MM_SCALE_8); )) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_lo, j_lo_vec, ff_j_x_lo_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_lo, j_lo_vec, ff_j_y_lo_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_lo, j_lo_vec, ff_j_z_lo_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_hi, j_hi_vec, ff_j_x_hi_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_hi, j_hi_vec, ff_j_y_hi_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_hi, j_hi_vec, ff_j_z_hi_vec, _MM_SCALE_8);  ) \
        }

      #endif // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0

    #else // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0

      #define APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, f_i_y, f_i_z, f_j_x, f_j_y, f_j_z, i, j) \
      { \
        __m512d v_x_lo = _mm512_cvtpslo_pd(v_x); \
        __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_x), _MM_PERM_BADC)); \
        __m512d v_y_lo = _mm512_cvtpslo_pd(v_y); \
        __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_y), _MM_PERM_BADC)); \
        __m512d v_z_lo = _mm512_cvtpslo_pd(v_z); \
        __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_z), _MM_PERM_BADC)); \
        __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
        __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
        __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
        __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
        int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
        uintptr_t iTest_i = tmpI32[iTest_index]; \
        f_i_x[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_x_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_x_lo); \
        f_i_y[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_y_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_y_lo); \
        f_i_z[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_z_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_z_lo); \
        __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_x), _MM_SCALE_8); \
        __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_y), _MM_SCALE_8); \
        __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_z), _MM_SCALE_8); \
        __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_x), _MM_SCALE_8); \
        __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_y), _MM_SCALE_8); \
        __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_z), _MM_SCALE_8); \
        f_j_x_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_x_lo_vec, v_x_lo); \
        f_j_y_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_y_lo_vec, v_y_lo); \
        f_j_z_lo_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_lo, f_j_z_lo_vec, v_z_lo); \
        f_j_x_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_x_hi_vec, v_x_hi); \
        f_j_y_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_y_hi_vec, v_y_hi); \
        f_j_z_hi_vec = _mm512_mask_sub_pd(_mm512_setzero_pd(), cutoff_hi, f_j_z_hi_vec, v_z_hi); \
        _mm512_mask_i32loscatter_pd((f_j_x), cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); \
        _mm512_mask_i32loscatter_pd((f_j_y), cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); \
        _mm512_mask_i32loscatter_pd((f_j_z), cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); \
        _mm512_mask_i32loscatter_pd((f_j_x), cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); \
        _mm512_mask_i32loscatter_pd((f_j_y), cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); \
        _mm512_mask_i32loscatter_pd((f_j_z), cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); \
      }
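
      // NOTE - Rough scalar sketch of the macro above for one force component:
      //   the single 'i' atom accumulates the masked lane reduction of v, and
      //   each in-cutoff 'j' atom receives the equal-and-opposite per-lane
      //   value via the gather/subtract/scatter sequence (illustrative guard
      //   only, as above):
      #if defined(MIC_KERNEL_REFERENCE_SKETCH)
        static inline void apply_forces_ps2pd_sketch(double *f_i, double *f_j,
                                                     int i, const int *j,
                                                     const double *v, int nLanes) {
          for (int n = 0; n < nLanes; n++) {
            f_i[i]    += v[n];  // reduce into the single 'i' atom
            f_j[j[n]] -= v[n];  // scatter the opposite force to each 'j' atom
          }
        }
      #endif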

    #endif // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0

  #else // __MIC_PAD_PLGEN_CTRL != 0

    #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0

      #define APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k) \
        FAST(SHORT(  tx = _mm512_mask_loadunpacklo_pd( tx, (k),     f_0_x + (i)); )) \
        FAST(SHORT(  ty = _mm512_mask_loadunpacklo_pd( ty, (k),     f_0_y + (i)); )) \
        FAST(SHORT(  tz = _mm512_mask_loadunpacklo_pd( tz, (k),     f_0_z + (i)); )) \
        FULL(       ftx = _mm512_mask_loadunpacklo_pd(ftx, (k), fullf_0_x + (i));  ) \
        FULL(       fty = _mm512_mask_loadunpacklo_pd(fty, (k), fullf_0_y + (i));  ) \
        FULL(       ftz = _mm512_mask_loadunpacklo_pd(ftz, (k), fullf_0_z + (i));  ) \
        FAST(SHORT(  tx = _mm512_mask_add_pd( tx, (k),  tx, _mm512_swizzle_pd(( v_x), (p))); )) \
        FAST(SHORT(  ty = _mm512_mask_add_pd( ty, (k),  ty, _mm512_swizzle_pd(( v_y), (p))); )) \
        FAST(SHORT(  tz = _mm512_mask_add_pd( tz, (k),  tz, _mm512_swizzle_pd(( v_z), (p))); )) \
        FULL(       ftx = _mm512_mask_add_pd(ftx, (k), ftx, _mm512_swizzle_pd((fv_x), (p)));  ) \
        FULL(       fty = _mm512_mask_add_pd(fty, (k), fty, _mm512_swizzle_pd((fv_y), (p)));  ) \
        FULL(       ftz = _mm512_mask_add_pd(ftz, (k), ftz, _mm512_swizzle_pd((fv_z), (p)));  ) \
        FAST(SHORT( _mm512_mask_packstorelo_pd(    f_0_x + (i), (k),  tx); )) \
        FAST(SHORT( _mm512_mask_packstorelo_pd(    f_0_y + (i), (k),  ty); )) \
        FAST(SHORT( _mm512_mask_packstorelo_pd(    f_0_z + (i), (k),  tz); )) \
        FULL(       _mm512_mask_packstorelo_pd(fullf_0_x + (i), (k), ftx);  ) \
        FULL(       _mm512_mask_packstorelo_pd(fullf_0_y + (i), (k), fty);  ) \
        FULL(       _mm512_mask_packstorelo_pd(fullf_0_z + (i), (k), ftz);  )

      #define APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x, v_y, v_z, fv_x, fv_y, fv_z, i, p, k) \
        FAST(SHORT(  tx = _mm512_mask_loadunpacklo_pd( tx, (k),     f_1_x + (i)); )) \
        FAST(SHORT(  ty = _mm512_mask_loadunpacklo_pd( ty, (k),     f_1_y + (i)); )) \
        FAST(SHORT(  tz = _mm512_mask_loadunpacklo_pd( tz, (k),     f_1_z + (i)); )) \
        FULL(       ftx = _mm512_mask_loadunpacklo_pd(ftx, (k), fullf_1_x + (i));  ) \
        FULL(       fty = _mm512_mask_loadunpacklo_pd(fty, (k), fullf_1_y + (i));  ) \
        FULL(       ftz = _mm512_mask_loadunpacklo_pd(ftz, (k), fullf_1_z + (i));  ) \
        FAST(SHORT(  tx = _mm512_mask_sub_pd( tx, (k),  tx, _mm512_swizzle_pd(( v_x), (p))); )) \
        FAST(SHORT(  ty = _mm512_mask_sub_pd( ty, (k),  ty, _mm512_swizzle_pd(( v_y), (p))); )) \
        FAST(SHORT(  tz = _mm512_mask_sub_pd( tz, (k),  tz, _mm512_swizzle_pd(( v_z), (p))); )) \
        FULL(       ftx = _mm512_mask_sub_pd(ftx, (k), ftx, _mm512_swizzle_pd((fv_x), (p)));  ) \
        FULL(       fty = _mm512_mask_sub_pd(fty, (k), fty, _mm512_swizzle_pd((fv_y), (p)));  ) \
        FULL(       ftz = _mm512_mask_sub_pd(ftz, (k), ftz, _mm512_swizzle_pd((fv_z), (p)));  ) \
        FAST(SHORT( _mm512_mask_packstorelo_pd(    f_1_x + (i), (k),  tx); )) \
        FAST(SHORT( _mm512_mask_packstorelo_pd(    f_1_y + (i), (k),  ty); )) \
        FAST(SHORT( _mm512_mask_packstorelo_pd(    f_1_z + (i), (k),  tz); )) \
        FULL(       _mm512_mask_packstorelo_pd(fullf_1_x + (i), (k), ftx);  ) \
        FULL(       _mm512_mask_packstorelo_pd(fullf_1_y + (i), (k), fty);  ) \
        FULL(       _mm512_mask_packstorelo_pd(fullf_1_z + (i), (k), ftz);  )

      #define APPLY_FORCES_PS2PD \
      { \
        __mmask16 k_lo = _mm512_int2mask(0x02); \
        __mmask16 k_hi = _mm512_int2mask(0x20); \
        FAST(SHORT( __m512d  tx = _mm512_setzero_pd(); )) \
        FAST(SHORT( __m512d  ty = _mm512_setzero_pd(); )) \
        FAST(SHORT( __m512d  tz = _mm512_setzero_pd(); )) \
        FULL(       __m512d ftx = _mm512_setzero_pd();  ) \
        FULL(       __m512d fty = _mm512_setzero_pd();  ) \
        FULL(       __m512d ftz = _mm512_setzero_pd();  ) \
        FAST(SHORT( __m512d  v_x_lo = _mm512_cvtpslo_pd(                           tmp_x_vec                ); )) \
        FAST(SHORT( __m512d  v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_x_vec, _MM_PERM_BADC)); )) \
        FAST(SHORT( __m512d  v_y_lo = _mm512_cvtpslo_pd(                           tmp_y_vec                ); )) \
        FAST(SHORT( __m512d  v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_y_vec, _MM_PERM_BADC)); )) \
        FAST(SHORT( __m512d  v_z_lo = _mm512_cvtpslo_pd(                           tmp_z_vec                ); )) \
        FAST(SHORT( __m512d  v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(    tmp_z_vec, _MM_PERM_BADC)); )) \
        FULL(       __m512d fv_x_lo = _mm512_cvtpslo_pd(                       fulltmp_x_vec                );  ) \
        FULL(       __m512d fv_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_x_vec, _MM_PERM_BADC));  ) \
        FULL(       __m512d fv_y_lo = _mm512_cvtpslo_pd(                       fulltmp_y_vec                );  ) \
        FULL(       __m512d fv_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_y_vec, _MM_PERM_BADC));  ) \
        FULL(       __m512d fv_z_lo = _mm512_cvtpslo_pd(                       fulltmp_z_vec                );  ) \
        FULL(       __m512d fv_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(fulltmp_z_vec, _MM_PERM_BADC));  ) \
        int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
        uintptr_t iTest_i = tmpI32[iTest_index]; \
        __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, _mm512_set_1to16_epi32(iTest_i), i_vec); \
        iTest_mask = _mm512_kxor(iTest_mask, cutoff_mask); \
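        /* If every lane within the cutoff shares this same 'i' atom, take the */ \
        /* reduce-and-gather/scatter fast path below; otherwise fall back to   */ \
        /* the per-lane swizzle steps in the else branch.                      */ \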
        if (_mm512_kortestz(iTest_mask, iTest_mask)) { \
          __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
          __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
          FAST(SHORT(     f_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_x_hi,  v_x_lo)); )) \
          FAST(SHORT(     f_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_y_hi,  v_y_lo)); )) \
          FAST(SHORT(     f_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd( v_z_hi,  v_z_lo)); )) \
          FULL(       fullf_0_x[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_x_hi, fv_x_lo));  ) \
          FULL(       fullf_0_y[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_y_hi, fv_y_lo));  ) \
          FULL(       fullf_0_z[iTest_i] += _mm512_reduce_add_pd(_mm512_add_pd(fv_z_hi, fv_z_lo));  ) \
          __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
          __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
          FAST(SHORT( __m512d  f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec,     f_1_x, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec,     f_1_y, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec,     f_1_z, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec,     f_1_x, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec,     f_1_y, _MM_SCALE_8); )) \
          FAST(SHORT( __m512d  f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec,     f_1_z, _MM_SCALE_8); )) \
          FULL(       __m512d ff_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_x, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_y, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, fullf_1_z, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_x, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_y, _MM_SCALE_8);  ) \
          FULL(       __m512d ff_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, fullf_1_z, _MM_SCALE_8);  ) \
          FAST(SHORT(  f_j_x_lo_vec = _mm512_mask_sub_pd( f_j_x_lo_vec, cutoff_lo,  f_j_x_lo_vec,  v_x_lo); )) \
          FAST(SHORT(  f_j_y_lo_vec = _mm512_mask_sub_pd( f_j_y_lo_vec, cutoff_lo,  f_j_y_lo_vec,  v_y_lo); )) \
          FAST(SHORT(  f_j_z_lo_vec = _mm512_mask_sub_pd( f_j_z_lo_vec, cutoff_lo,  f_j_z_lo_vec,  v_z_lo); )) \
          FAST(SHORT(  f_j_x_hi_vec = _mm512_mask_sub_pd( f_j_x_hi_vec, cutoff_hi,  f_j_x_hi_vec,  v_x_hi); )) \
          FAST(SHORT(  f_j_y_hi_vec = _mm512_mask_sub_pd( f_j_y_hi_vec, cutoff_hi,  f_j_y_hi_vec,  v_y_hi); )) \
          FAST(SHORT(  f_j_z_hi_vec = _mm512_mask_sub_pd( f_j_z_hi_vec, cutoff_hi,  f_j_z_hi_vec,  v_z_hi); )) \
          FULL(       ff_j_x_lo_vec = _mm512_mask_sub_pd(ff_j_x_lo_vec, cutoff_lo, ff_j_x_lo_vec, fv_x_lo);  ) \
          FULL(       ff_j_y_lo_vec = _mm512_mask_sub_pd(ff_j_y_lo_vec, cutoff_lo, ff_j_y_lo_vec, fv_y_lo);  ) \
          FULL(       ff_j_z_lo_vec = _mm512_mask_sub_pd(ff_j_z_lo_vec, cutoff_lo, ff_j_z_lo_vec, fv_z_lo);  ) \
          FULL(       ff_j_x_hi_vec = _mm512_mask_sub_pd(ff_j_x_hi_vec, cutoff_hi, ff_j_x_hi_vec, fv_x_hi);  ) \
          FULL(       ff_j_y_hi_vec = _mm512_mask_sub_pd(ff_j_y_hi_vec, cutoff_hi, ff_j_y_hi_vec, fv_y_hi);  ) \
          FULL(       ff_j_z_hi_vec = _mm512_mask_sub_pd(ff_j_z_hi_vec, cutoff_hi, ff_j_z_hi_vec, fv_z_hi);  ) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_x, cutoff_lo, j_lo_vec,  f_j_x_lo_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_y, cutoff_lo, j_lo_vec,  f_j_y_lo_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_z, cutoff_lo, j_lo_vec,  f_j_z_lo_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_x, cutoff_hi, j_hi_vec,  f_j_x_hi_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_y, cutoff_hi, j_hi_vec,  f_j_y_hi_vec, _MM_SCALE_8); )) \
          FAST(SHORT( _mm512_mask_i32loscatter_pd(    f_1_z, cutoff_hi, j_hi_vec,  f_j_z_hi_vec, _MM_SCALE_8); )) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_lo, j_lo_vec, ff_j_x_lo_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_lo, j_lo_vec, ff_j_y_lo_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_lo, j_lo_vec, ff_j_z_lo_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_x, cutoff_hi, j_hi_vec, ff_j_x_hi_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_y, cutoff_hi, j_hi_vec, ff_j_y_hi_vec, _MM_SCALE_8);  ) \
          FULL(       _mm512_mask_i32loscatter_pd(fullf_1_z, cutoff_hi, j_hi_vec, ff_j_z_hi_vec, _MM_SCALE_8);  ) \
        } else { \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 0], _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 1], _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 2], _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 3], _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 4], _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 5], _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 6], _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpI32[ 7], _MM_SWIZ_REG_BADC, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[ 8], _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[ 9], _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[10], _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[11], _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[12], _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[13], _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[14], _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpI32[15], _MM_SWIZ_REG_BADC, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 0], _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 1], _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 2], _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 3], _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 4], _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 5], _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 6], _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_lo, v_y_lo, v_z_lo, fv_x_lo, fv_y_lo, fv_z_lo, tmpJ32[ 7], _MM_SWIZ_REG_BADC, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[ 8], _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[ 9], _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[10], _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[11], _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[12], _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[13], _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[14], _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB_COMBO(v_x_hi, v_y_hi, v_z_hi, fv_x_hi, fv_y_hi, fv_z_hi, tmpJ32[15], _MM_SWIZ_REG_BADC, k_hi); \
        } \
      }

    #else // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0

      #define APPLY_FORCES_PS2PD(v_x, v_y, v_z, f_i_x, f_i_y, f_i_z, f_j_x, f_j_y, f_j_z, i, j) \
      { \
        __mmask16 k_lo = _mm512_int2mask(0x02); \
        __mmask16 k_hi = _mm512_int2mask(0x20); \
        __m512d tx = _mm512_setzero_pd(); \
        __m512d ty = _mm512_setzero_pd(); \
        __m512d tz = _mm512_setzero_pd(); \
        __m512d v_x_lo = _mm512_cvtpslo_pd(v_x); \
        __m512d v_x_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_x), _MM_PERM_BADC)); \
        __m512d v_y_lo = _mm512_cvtpslo_pd(v_y); \
        __m512d v_y_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_y), _MM_PERM_BADC)); \
        __m512d v_z_lo = _mm512_cvtpslo_pd(v_z); \
        __m512d v_z_hi = _mm512_cvtpslo_pd(_mm512_permute4f128_ps((v_z), _MM_PERM_BADC)); \
        int iTest_index = _mm_tzcnt_32(_mm512_mask2int(cutoff_mask)); \
        uintptr_t iTest_i = tmpI32[iTest_index]; \
        __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, _mm512_set_1to16_epi32(iTest_i), i_vec); \
        iTest_mask = _mm512_kxor(iTest_mask, cutoff_mask); \
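        /* As in the combined variant above: take the fast path when every    */ \
        /* in-cutoff lane shares the same 'i' atom, else update lane by lane. */ \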
        if (_mm512_kortestz(iTest_mask, iTest_mask)) { \
          __mmask16 cutoff_lo = _mm512_kand(cutoff_mask, _mm512_int2mask(0xFF)); \
          __mmask16 cutoff_hi = _mm512_kmerge2l1h(cutoff_mask, _mm512_kxor(cutoff_mask, cutoff_mask)); \
          f_i_x[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_x_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_x_lo); \
          f_i_y[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_y_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_y_lo); \
          f_i_z[iTest_i] += _mm512_mask_reduce_add_pd(cutoff_hi, v_z_hi) + _mm512_mask_reduce_add_pd(cutoff_lo, v_z_lo); \
          __m512i j_lo_vec = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), cutoff_lo, j_vec); \
          __m512i j_hi_vec = _mm512_mask_permute4f128_epi32(_mm512_setzero_epi32(), cutoff_hi, j_vec, _MM_PERM_BADC); \
          __m512d f_j_x_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_x), _MM_SCALE_8); \
          __m512d f_j_y_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_y), _MM_SCALE_8); \
          __m512d f_j_z_lo_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_lo, j_lo_vec, (f_j_z), _MM_SCALE_8); \
          __m512d f_j_x_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_x), _MM_SCALE_8); \
          __m512d f_j_y_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_y), _MM_SCALE_8); \
          __m512d f_j_z_hi_vec = _mm512_mask_i32logather_pd(_mm512_setzero_pd(), cutoff_hi, j_hi_vec, (f_j_z), _MM_SCALE_8); \
          f_j_x_lo_vec = _mm512_mask_sub_pd(f_j_x_lo_vec, cutoff_lo, f_j_x_lo_vec, v_x_lo); \
          f_j_y_lo_vec = _mm512_mask_sub_pd(f_j_y_lo_vec, cutoff_lo, f_j_y_lo_vec, v_y_lo); \
          f_j_z_lo_vec = _mm512_mask_sub_pd(f_j_z_lo_vec, cutoff_lo, f_j_z_lo_vec, v_z_lo); \
          f_j_x_hi_vec = _mm512_mask_sub_pd(f_j_x_hi_vec, cutoff_hi, f_j_x_hi_vec, v_x_hi); \
          f_j_y_hi_vec = _mm512_mask_sub_pd(f_j_y_hi_vec, cutoff_hi, f_j_y_hi_vec, v_y_hi); \
          f_j_z_hi_vec = _mm512_mask_sub_pd(f_j_z_hi_vec, cutoff_hi, f_j_z_hi_vec, v_z_hi); \
          _mm512_mask_i32loscatter_pd((f_j_x), cutoff_lo, j_lo_vec, f_j_x_lo_vec, _MM_SCALE_8); \
          _mm512_mask_i32loscatter_pd((f_j_y), cutoff_lo, j_lo_vec, f_j_y_lo_vec, _MM_SCALE_8); \
          _mm512_mask_i32loscatter_pd((f_j_z), cutoff_lo, j_lo_vec, f_j_z_lo_vec, _MM_SCALE_8); \
          _mm512_mask_i32loscatter_pd((f_j_x), cutoff_hi, j_hi_vec, f_j_x_hi_vec, _MM_SCALE_8); \
          _mm512_mask_i32loscatter_pd((f_j_y), cutoff_hi, j_hi_vec, f_j_y_hi_vec, _MM_SCALE_8); \
          _mm512_mask_i32loscatter_pd((f_j_z), cutoff_hi, j_hi_vec, f_j_z_hi_vec, _MM_SCALE_8); \
        } else { \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 0], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 1], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 2], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 3], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 4], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 5], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 6], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 7], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 8], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[ 9], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[10], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[11], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[12], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[13], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[14], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_ADD(f_i_x, f_i_y, f_i_z, (i)[15], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 0], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 1], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 2], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 3], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 4], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 5], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 6], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 7], v_x_lo, v_y_lo, v_z_lo, _MM_SWIZ_REG_BADC, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 8], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[ 9], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[10], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[11], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_lo); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[12], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_CDAB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[13], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DCBA, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[14], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_DACB, k_hi); \
          APPLY_FORCES_PS2PD_STEP_SUB(f_j_x, f_j_y, f_j_z, (j)[15], v_x_hi, v_y_hi, v_z_hi, _MM_SWIZ_REG_BADC, k_hi); \
        } \
      }

    #endif // MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
00598   #endif // MIC_PAD_PLGEN != 0
00599 
00600   __m512i plI_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10,  9,  8,
00601                                              7,  6,  5,  4,  3,  2,  1,  0);
00602   const int plSize_16 = (plSize + 15) & (~15); // Round up to a multiple of 16
00603 
00604   FAST(ENERGY( __m512d vdwEnergy_vec = _mm512_setzero_pd(); ))
00605   FAST(SHORT(ENERGY( __m512d electEnergy_vec = _mm512_setzero_pd(); )))
00606   FULL(ENERGY( __m512d fullElectEnergy_vec = _mm512_setzero_pd(); ))
00607   #if (0 PAIR(FAST(SHORT(+1))))
00608     __m512d virial_xx_vec = _mm512_setzero_pd();
00609     __m512d virial_xy_vec = _mm512_setzero_pd();
00610     __m512d virial_xz_vec = _mm512_setzero_pd();
00611     __m512d virial_yy_vec = _mm512_setzero_pd();
00612     __m512d virial_yz_vec = _mm512_setzero_pd();
00613     __m512d virial_zz_vec = _mm512_setzero_pd();
00614   #endif
00615   #if (0 PAIR(FULL(+1)))
00616     __m512d fullElectVirial_xx_vec = _mm512_setzero_pd();
00617     __m512d fullElectVirial_xy_vec = _mm512_setzero_pd();
00618     __m512d fullElectVirial_xz_vec = _mm512_setzero_pd();
00619     __m512d fullElectVirial_yy_vec = _mm512_setzero_pd();
00620     __m512d fullElectVirial_yz_vec = _mm512_setzero_pd();
00621     __m512d fullElectVirial_zz_vec = _mm512_setzero_pd();
00622   #endif
00623 
00624   __m512i ij_mask_vec = _mm512_set_1to16_epi32(0x0000FFFF);
00625   __m512i ij_store_perm_pattern = _mm512_set_16to16_epi32( 9,  7,  9,  6,  9,  5,  9,  4,
00626                                                            9,  3,  9,  2,  9,  1,  9,  0  );
00627 
00628   #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
00629     __m512i exclusionSum_vec = _mm512_setzero_epi32();
00630   #endif
00631 
00632   #if (0 NORMAL(+1))
00633     #if (0 PAIR(+1))
00634       #pragma loop count (500)
00635     #else
00636       #pragma loop count (7000)
00637     #endif
00638   #else
00639     #if (0 PAIR(+1))
00640       #pragma loop count (2)
00641     #else
00642       #pragma loop count (30)
00643     #endif
00644   #endif
00645   #pragma prefetch plArray:_MM_HINT_NTA
00646   #pragma novector
00647   for (int plI = 0; plI < plSize_16; plI += 16) {
00648 
00649     // Create the active_mask
00650     __mmask16 active_mask = _mm512_cmplt_epi32_mask(plI_vec, _mm512_set_1to16_epi32(plSize));
00651 
00652     #if MIC_HANDCODE_FORCE_PFDIST != 0
00653       __m512i future_ij_vec = _mm512_load_epi32(plArray + plI + (16 * MIC_HANDCODE_FORCE_PFDIST));
00654       __mmask16 future_ij_mask = _mm512_cmpneq_epi32_mask(future_ij_vec, _mm512_set_1to16_epi32(-1));
00655       __m512i future_j_vec = _mm512_and_epi32(future_ij_vec, ij_mask_vec);
00656       _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_x, _MM_SCALE_4, _MM_HINT_T0);
00657       _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_y, _MM_SCALE_4, _MM_HINT_T0);
00658       _mm512_mask_prefetch_i32gather_ps(future_j_vec, future_ij_mask, p_1_z, _MM_SCALE_4, _MM_HINT_T0);
00659     #endif
00660 
00661     // Load the i and j values from the pairlist array and modify the active_mask accordingly
00662     __m512i ij_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), active_mask, plArray + plI);
00663     #if __MIC_PAD_PLGEN_CTRL != 0
00664       __m512i ij_mask_vec = _mm512_set_1to16_epi32(0xFFFF);
00665       __m512i ij_lo_vec = _mm512_and_epi32(ij_vec, ij_mask_vec);
00666       active_mask = _mm512_mask_cmpneq_epi32_mask(active_mask, ij_lo_vec, ij_mask_vec);
00667     #endif
00668     __m512i i_vec = _mm512_and_epi32(_mm512_mask_srli_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, 16), ij_mask_vec);
00669     __m512i j_vec = _mm512_mask_and_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, ij_mask_vec);
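          // Illustrative sketch (not part of the kernel): each 32-bit pairlist entry packs the
          // "i" index in its upper 16 bits and the "j" index in its lower 16 bits, which is what
          // the srli/and pair above extracts per lane.  A per-lane scalar equivalent ("lane" is
          // a hypothetical name):
          #if 0
            unsigned int ij = plArray[plI + lane];  // one packed pairlist entry
            unsigned int i  = (ij >> 16) & 0xFFFF;  // upper half-word = "i" atom index
            unsigned int j  =  ij        & 0xFFFF;  // lower half-word = "j" atom index
          #endif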
00670 
00671     // Store the indices out to arrays of scalar values so they can be individually accessed by lane
00672     uintptr_t tmpI32[16] __attribute__((aligned(64)));
00673     uintptr_t tmpJ32[16] __attribute__((aligned(64)));
00674     _mm512_store_epi64(tmpI32    , _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern,                           i_vec                ));
00675     _mm512_store_epi64(tmpI32 + 8, _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, _mm512_permute4f128_epi32(i_vec, _MM_PERM_BADC)));
00676     _mm512_store_epi64(tmpJ32    , _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern,                           j_vec                ));
00677     _mm512_store_epi64(tmpJ32 + 8, _mm512_mask_permutevar_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x5555), ij_store_perm_pattern, _mm512_permute4f128_epi32(j_vec, _MM_PERM_BADC)));
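          // Illustrative sketch (not part of the kernel): the masked permutes above zero-extend
          // each 32-bit index into a 64-bit uintptr_t slot (mask 0x5555 writes the even 32-bit
          // lanes; the odd lanes keep the zeroed background).  A per-lane scalar equivalent
          // ("lane", "iOfLane", and "jOfLane" are hypothetical names):
          #if 0
            tmpI32[lane] = (uintptr_t)iOfLane;  // zero-extended "i" index for this lane
            tmpJ32[lane] = (uintptr_t)jOfLane;  // zero-extended "j" index for this lane
          #endif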
00678 
00679     // Increment the vectorized loop counter
00680     plI_vec = _mm512_add_epi32(plI_vec, _mm512_set_1to16_epi32(16));
00681 
00682     // Load position/charge data for the i and j atoms
00683     __m512 p_i_x_vec, p_i_y_vec, p_i_z_vec, p_i_q_vec, p_j_x_vec, p_j_y_vec, p_j_z_vec, p_j_q_vec;
00684     #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00685       __m512i pExt_i_vdwType_vec, pExt_j_vdwType_vec;
00686     #endif
00687     {
00688       __m512 zero_vec = _mm512_setzero_ps();
00689 
00690       int iTest_index = _mm_tzcnt_32(_mm512_mask2int(active_mask)); // Index of least significant 1 bit in active mask (NOTE: active_mask should not be all zeros)
00691       unsigned int iTest_i = tmpI32[iTest_index];  // NOTE: iTest_i = a valid "i" value
00692       #if __MIC_PAD_PLGEN_CTRL != 0
00693       #else
00694       __mmask16 iTest_mask = _mm512_mask_cmpeq_epi32_mask(active_mask, _mm512_set_1to16_epi32(iTest_i), i_vec);
00695       iTest_mask = _mm512_kxor(iTest_mask, active_mask);
00696       if (_mm512_kortestz(iTest_mask, iTest_mask)) {
00697       #endif
00698         #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00699           p_i_x_vec = _mm512_set_1to16_ps(p_0[iTest_i].x);
00700           p_i_y_vec = _mm512_set_1to16_ps(p_0[iTest_i].y);
00701           p_i_z_vec = _mm512_set_1to16_ps(p_0[iTest_i].z);
00702           p_i_q_vec = _mm512_set_1to16_ps(p_0[iTest_i].charge);
00703           {
00704             __mmask16 k0 = _mm512_int2mask(0x000F);
00705             __mmask16 k1 = _mm512_int2mask(0x00F0);
00706             __mmask16 k2 = _mm512_int2mask(0x0F00);
00707             __mmask16 k3 = _mm512_int2mask(0xF000);
00708             __m512i tmp_a0 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 0]);
00709                     tmp_a0 = _mm512_mask_loadunpacklo_epi32(                tmp_a0, k1, p_1 + tmpJ32[ 4]);
00710                     tmp_a0 = _mm512_mask_loadunpacklo_epi32(                tmp_a0, k2, p_1 + tmpJ32[ 8]);
00711                     tmp_a0 = _mm512_mask_loadunpacklo_epi32(                tmp_a0, k3, p_1 + tmpJ32[12]);
00712             __m512i tmp_a1 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 1]);
00713                     tmp_a1 = _mm512_mask_loadunpacklo_epi32(                tmp_a1, k1, p_1 + tmpJ32[ 5]);
00714                     tmp_a1 = _mm512_mask_loadunpacklo_epi32(                tmp_a1, k2, p_1 + tmpJ32[ 9]);
00715                     tmp_a1 = _mm512_mask_loadunpacklo_epi32(                tmp_a1, k3, p_1 + tmpJ32[13]);
00716             __m512i tmp_a2 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 2]);
00717                     tmp_a2 = _mm512_mask_loadunpacklo_epi32(                tmp_a2, k1, p_1 + tmpJ32[ 6]);
00718                     tmp_a2 = _mm512_mask_loadunpacklo_epi32(                tmp_a2, k2, p_1 + tmpJ32[10]);
00719                     tmp_a2 = _mm512_mask_loadunpacklo_epi32(                tmp_a2, k3, p_1 + tmpJ32[14]);
00720             __m512i tmp_a3 = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k0, p_1 + tmpJ32[ 3]);
00721                     tmp_a3 = _mm512_mask_loadunpacklo_epi32(                tmp_a3, k1, p_1 + tmpJ32[ 7]);
00722                     tmp_a3 = _mm512_mask_loadunpacklo_epi32(                tmp_a3, k2, p_1 + tmpJ32[11]);
00723                     tmp_a3 = _mm512_mask_loadunpacklo_epi32(                tmp_a3, k3, p_1 + tmpJ32[15]);
00724             __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
00725             __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
00726             __m512i tmp_b0 = _mm512_mask_swizzle_epi32(tmp_a0, k_2x2_0, tmp_a1, _MM_SWIZ_REG_CDAB);
00727             __m512i tmp_b1 = _mm512_mask_swizzle_epi32(tmp_a1, k_2x2_1, tmp_a0, _MM_SWIZ_REG_CDAB);
00728             __m512i tmp_b2 = _mm512_mask_swizzle_epi32(tmp_a2, k_2x2_0, tmp_a3, _MM_SWIZ_REG_CDAB);
00729             __m512i tmp_b3 = _mm512_mask_swizzle_epi32(tmp_a3, k_2x2_1, tmp_a2, _MM_SWIZ_REG_CDAB);
00730             __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
00731             __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
00732             p_j_x_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b0, k_4x4_0, tmp_b2, _MM_SWIZ_REG_BADC));
00733             p_j_y_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b1, k_4x4_0, tmp_b3, _MM_SWIZ_REG_BADC));
00734             p_j_z_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b2, k_4x4_1, tmp_b0, _MM_SWIZ_REG_BADC));
00735             p_j_q_vec = _mm512_castsi512_ps(_mm512_mask_swizzle_epi32(tmp_b3, k_4x4_1, tmp_b1, _MM_SWIZ_REG_BADC));
00736           }
00737         #else
00738           p_i_x_vec = _mm512_set_1to16_ps(p_0_x[iTest_i]);
00739           p_i_y_vec = _mm512_set_1to16_ps(p_0_y[iTest_i]);
00740           p_i_z_vec = _mm512_set_1to16_ps(p_0_z[iTest_i]);
00741           p_i_q_vec = _mm512_set_1to16_ps(p_0_q[iTest_i]);
00742           #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00743             pExt_i_vdwType_vec = _mm512_set_1to16_epi32(pExt_0_vdwType[iTest_i]);
00744           #endif
00745           #if MIC_HANDCODE_FORCE_USEGATHER != 0
00746             p_j_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_x, _MM_SCALE_4);
00747             p_j_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_y, _MM_SCALE_4);
00748             p_j_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_z, _MM_SCALE_4);
00749             p_j_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_q, _MM_SCALE_4);
00750             #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00751               pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
00752             #endif
00753           #else
00754             GATHER_PS_I32(p_j_x_vec, p_1_x, tmpJ32);
00755             GATHER_PS_I32(p_j_y_vec, p_1_y, tmpJ32);
00756             GATHER_PS_I32(p_j_z_vec, p_1_z, tmpJ32);
00757             GATHER_PS_I32(p_j_q_vec, p_1_q, tmpJ32);
00758             #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00759               GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
00760             #endif
00761           #endif
00762         #endif // MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00763       #if __MIC_PAD_PLGEN_CTRL != 0
00764       #else
00765       } else {
00766         #if MIC_HANDCODE_FORCE_USEGATHER != 0
00767           p_i_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_x, _MM_SCALE_4);
00768           p_i_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_y, _MM_SCALE_4);
00769           p_i_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_z, _MM_SCALE_4);
00770           p_i_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, i_vec, p_0_q, _MM_SCALE_4);
00771           #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00772             pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, i_vec, pExt_0_vdwType, _MM_SCALE_4);
00773           #endif
00774           p_j_x_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_x, _MM_SCALE_4);
00775           p_j_y_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_y, _MM_SCALE_4);
00776           p_j_z_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_z, _MM_SCALE_4);
00777           p_j_q_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), active_mask, j_vec, p_1_q, _MM_SCALE_4);
00778           #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00779             pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), active_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
00780           #endif
00781         #else
00782           GATHER_PS_I32(p_i_x_vec, p_0_x, tmpI32);
00783           GATHER_PS_I32(p_i_y_vec, p_0_y, tmpI32);
00784           GATHER_PS_I32(p_i_z_vec, p_0_z, tmpI32);
00785           GATHER_PS_I32(p_i_q_vec, p_0_q, tmpI32);
00786           #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00787             GATHER_EPI32_I32(pExt_i_vdwType_vec, pExt_0_vdwType, tmpI32);
00788           #endif
00789           GATHER_PS_I32(p_j_x_vec, p_1_x, tmpJ32);
00790           GATHER_PS_I32(p_j_y_vec, p_1_y, tmpJ32);
00791           GATHER_PS_I32(p_j_z_vec, p_1_z, tmpJ32);
00792           GATHER_PS_I32(p_j_q_vec, p_1_q, tmpJ32);
00793           #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00794             GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
00795           #endif
00796         #endif
00797       }
00798       #endif
00799     }
00800 
00801     // Calculate the delta-x, delta-y, delta-z, and radius-squared
00802     p_i_x_vec = _mm512_add_ps(p_i_x_vec, _mm512_set_1to16_ps(offset_x));
00803     p_i_y_vec = _mm512_add_ps(p_i_y_vec, _mm512_set_1to16_ps(offset_y));
00804     p_i_z_vec = _mm512_add_ps(p_i_z_vec, _mm512_set_1to16_ps(offset_z));
00805     __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
00806     __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
00807     __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
00808     __m512 r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_x_vec, p_ij_x_vec), _mm512_set_1to16_ps(r2_delta));
00809     r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_y_vec, p_ij_y_vec), r2_vec);
00810     r2_vec = _mm512_add_ps(_mm512_mul_ps(p_ij_z_vec, p_ij_z_vec), r2_vec);
00811 
00812     // Do the 'within cutoff' check
00813     // DMK - NOTE - When intrinsics are used, the compiler generates extra instructions to clear all but the least significant
00814     //   8 bits of the mask register storing the result of the comparison.
00815     __mmask16 cutoff_mask = _mm512_mask_cmplt_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps(cutoff2_delta));
00816     if (_mm512_kortestz(cutoff_mask, cutoff_mask)) { continue; }  // If the mask is completely unset, move on to the next vector
00817 
00818     #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
00819       exclusionSum_vec = _mm512_mask_add_epi32(exclusionSum_vec, cutoff_mask, exclusionSum_vec, _mm512_set_1to16_epi32(1));
00820     #endif
00821 
00822     // Calculate kqq = p_0_q[i] * p_1_q[j]
00823     __m512 kqq_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, p_i_q_vec, p_j_q_vec);
00824 
00825     // Calculate table_i = (r2 >> 17) + r2_delta_expc, treating r2 as its single-precision bit pattern
00826     __m512i r2i_vec = _mm512_castps_si512(r2_vec);
00827     __m512i table_i_vec = _mm512_srli_epi32(r2i_vec, 17);
00828     table_i_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_i_vec, _mm512_set_1to16_epi32(r2_delta_expc));
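          // Illustrative sketch (not part of the kernel): the shift-by-17 operates on the IEEE-754
          // bit pattern of the single-precision r2, keeping the exponent plus the top 6 mantissa
          // bits (64 table entries per power-of-two range).  A per-lane scalar equivalent:
          #if 0
            union { float f; unsigned int u; } r2_bits;
            r2_bits.f = r2;  // scalar r2 for one lane (already includes r2_delta)
            int table_i = (int)(r2_bits.u >> 17) + r2_delta_expc;
          #endif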
00829 
00830     // Calculate diffa = r2 - r2_table[table_i]
00831     __m512 r2_table_vec;
00832     {
00833       // From ComputeNonbondedUtil.C                    Simplified:
00834       //   r2_base = r2_delta * (1 << (i/64))             r2_base = r2_delta * (1 << (i/64))
00835       //   r2_del = r2_base / 64.0;                       r2_del = r2_base / 64.0;
00836       //   r2 = r2_base - r2_delta + r2_del * (i%64)      r2_table[i] = r2_base - r2_delta + r2_del * (i%64) + r2_delta;
00837       //   r2_table[i] = r2 + r2_delta;                               = r2_base + r2_del * (i%64)
00838       // NOTE: For i = 0, r2_table[0] = r2_delta + (r2_delta / 64) * 0 = r2_delta, so there is no need
00839       //   to special-case table_i = 0 to r2_table[0] = r2_delta (see ComputeNonbondedUtil.C:606)
00840       __m512 r2_delta_vec = _mm512_set_1to16_ps(r2_delta);
00841       __m512 t0_vec = _mm512_cvtfxpnt_round_adjustepi32_ps(_mm512_sllv_epi32(_mm512_set_1to16_epi32(1), _mm512_srli_epi32(table_i_vec, 6)), _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
00842       __m512 r2_base_vec = _mm512_mul_ps(r2_delta_vec, t0_vec); // NOTE: r2_delta * (1 << (i/64))
00843       __m512 r2_del_vec = _mm512_mul_ps(r2_base_vec, _mm512_set_1to16_ps(0.015625f)); // NOTE: r2_base / 64
00844       __m512i t1_vec = _mm512_and_epi32(table_i_vec, _mm512_set_1to16_epi32(0x3F)); // NOTE: (i%64)
00845       __m512 t2_vec = _mm512_cvtfxpnt_round_adjustepi32_ps(t1_vec, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE); // NOTE: (float)(i%64)
00846       r2_table_vec = _mm512_add_ps(_mm512_mul_ps(r2_del_vec, t2_vec), r2_base_vec);
00847     }
00848     __m512 diffa_vec = _mm512_sub_ps(r2_vec, r2_table_vec);
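          // Illustrative sketch (not part of the kernel): the block above recomputes
          // r2_table[table_i] in-register rather than gathering it.  A per-lane scalar equivalent
          // of the simplified formula from the comment above:
          #if 0
            float r2_base = r2_delta * (float)(1 << (table_i / 64));
            float r2_del  = r2_base * 0.015625f;                      // r2_base / 64
            float r2_tbl  = r2_base + r2_del * (float)(table_i % 64); // == r2_table[table_i]
            float diffa   = r2 - r2_tbl;
          #endif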
00849 
00850     // Load LJ A and B values
00851     #if (0 FAST(+1))
00852 
00853       // Load lj_pars from the table and multiply by scaling
00854       __m512 A_vec, B_vec;
00855       __m512i pExt_i_vdwType_vec;  // NOTE : Declared outside the block below so the values can be printed by the large-force detection code further down
00856       __m512i pExt_j_vdwType_vec;
00857       {
00858         // Load VDW types for the "i" and "j" atoms
00859         #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT == 0
00860           #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00861             // DMK - NOTE : This code assumes that vdw_type is the first of four members of atom_param
00862             pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, _mm512_slli_epi32(i_vec, 2), pExt_0, _MM_SCALE_4);
00863             pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, _mm512_slli_epi32(j_vec, 2), pExt_1, _MM_SCALE_4);
00864           #else
00865             #if MIC_HANDCODE_FORCE_USEGATHER != 0
00866               pExt_i_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, i_vec, pExt_0_vdwType, _MM_SCALE_4);
00867               pExt_j_vdwType_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, j_vec, pExt_1_vdwType, _MM_SCALE_4);
00868             #else
00869               GATHER_EPI32_I32(pExt_i_vdwType_vec, pExt_0_vdwType, tmpI32);
00870               GATHER_EPI32_I32(pExt_j_vdwType_vec, pExt_1_vdwType, tmpJ32);
00871             #endif
00872           #endif
00873         #endif
00874 
00875         // Calculate offsets into LJ table
00876         __m512i t0_vec = _mm512_fmadd_epi32(pExt_i_vdwType_vec, _mm512_set_1to16_epi32(lj_table_dim), pExt_j_vdwType_vec);
00877         __m512i ljOffset_vec = _mm512_slli_epi32(t0_vec, 2); // NOTE: *4
00878         MODIFIED( ljOffset_vec = _mm512_add_epi32(ljOffset_vec, _mm512_set_1to16_epi32(2)); )
00879 
00880         // Gather A and B values
00881         #if MIC_HANDCODE_FORCE_USEGATHER != 0
00882           __m512i ljOffset_p1_vec = _mm512_add_epi32(ljOffset_vec, _mm512_set_1to16_epi32(1));
00883           A_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask,    ljOffset_vec, lj_table_base_ptr, _MM_SCALE_4);
00884           B_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, ljOffset_p1_vec, lj_table_base_ptr, _MM_SCALE_4);
00885         #else
00886           unsigned int ljOffset[16] __attribute__((aligned(64)));
00887           _mm512_store_epi32(ljOffset, ljOffset_vec);
00888           GATHER_PS_I32       (A_vec, lj_table_base_ptr, ljOffset   );
00889           GATHER_PS_I32_OFFSET(B_vec, lj_table_base_ptr, ljOffset, 1);
00890         #endif
00891 
00892         // Scale the A and B values
00893         __m512 scaling_vec = _mm512_set_1to16_ps(scaling);
00894         A_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, A_vec, scaling_vec);
00895         B_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, B_vec, scaling_vec);
00896       }
00897     #endif // FAST
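          // Illustrative sketch (not part of the kernel): the LJ lookup above reduces to the
          // following scalar indexing, assuming 4 floats per (i,j) type pair with the MODIFIED
          // values 2 floats into each entry ("vdwTypeI"/"vdwTypeJ" are hypothetical names):
          #if 0
            int ljOffset = (vdwTypeI * lj_table_dim + vdwTypeJ) * 4;
            MODIFIED( ljOffset += 2; )
            float A = lj_table_base_ptr[ljOffset    ] * scaling;
            float B = lj_table_base_ptr[ljOffset + 1] * scaling;
          #endif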
00898 
00899     // Load the table_four values
00900     // NOTE : In this loop, each vector lane is calculating an interaction between two particles, i and j.
00901     //   As part of this calculation, a table lookup is done (this code) where each table entry has 16
00902     //   values. Rather than do 16 gathers, extract the index for each interaction, do vector loads on
00903     //   the table entries (i.e. 1 table entry = 16 values = 1 512-bit vector register), and then transpose
00904     //   those values in-register to create the vectorized registers that can be used by the code below
00905     //   (i.e. 1 vector register per each of the 16 fields).  Once the entries have been loaded, only
00906     //   swizzles and permutes are required for the transpose, so it should not be very expensive.
00907     const CALC_TYPE * const table_four_base = SHORT(table_short) NOSHORT(table_noshort);
00908     __m512 table_four_i_0_vec,  table_four_i_1_vec,  table_four_i_2_vec,  table_four_i_3_vec,
00909            table_four_i_4_vec,  table_four_i_5_vec,  table_four_i_6_vec,  table_four_i_7_vec,
00910            table_four_i_8_vec,  table_four_i_9_vec,  table_four_i_10_vec, table_four_i_11_vec,
00911            table_four_i_12_vec, table_four_i_13_vec, table_four_i_14_vec, table_four_i_15_vec;
00912     {
00913       #if MIC_HANDCODE_FORCE_USEGATHER_NBTBL != 0
00914 
00915         #if 0
00916 
00917           __m512i table_four_i_offsets = _mm512_slli_epi32(table_i_vec, 4); // NOTE: table_i * 16 (16 elements per table entry)
00918           table_four_i_0_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask,                                                            table_four_i_offsets                             , table_four_base, _MM_SCALE_4);
00919           table_four_i_1_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 1)), table_four_base, _MM_SCALE_4);
00920           table_four_i_2_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 2)), table_four_base, _MM_SCALE_4);
00921           table_four_i_3_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 3)), table_four_base, _MM_SCALE_4);
00922           table_four_i_4_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 4)), table_four_base, _MM_SCALE_4);
00923           table_four_i_5_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 5)), table_four_base, _MM_SCALE_4);
00924           table_four_i_6_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 6)), table_four_base, _MM_SCALE_4);
00925           table_four_i_7_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 7)), table_four_base, _MM_SCALE_4);
00926           table_four_i_8_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 8)), table_four_base, _MM_SCALE_4);
00927           table_four_i_9_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32( 9)), table_four_base, _MM_SCALE_4);
00928           table_four_i_10_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(10)), table_four_base, _MM_SCALE_4);
00929           table_four_i_11_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(11)), table_four_base, _MM_SCALE_4);
00930           table_four_i_12_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(12)), table_four_base, _MM_SCALE_4);
00931           table_four_i_13_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(13)), table_four_base, _MM_SCALE_4);
00932           table_four_i_14_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(14)), table_four_base, _MM_SCALE_4);
00933           table_four_i_15_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, _mm512_mask_add_epi32(_mm512_setzero_epi32(), cutoff_mask, table_four_i_offsets, _mm512_set_1to16_epi32(15)), table_four_base, _MM_SCALE_4);
00934 
00935         #else
00936 
00937           __m512i table_four_i_offsets = _mm512_slli_epi32(table_i_vec, 4); // NOTE: table_i * 16 (16 elements per table entry)
00938           table_four_i_0_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base, _MM_SCALE_4);
00939           table_four_i_1_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 1, _MM_SCALE_4);
00940           table_four_i_2_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 2, _MM_SCALE_4);
00941           table_four_i_3_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 3, _MM_SCALE_4);
00942           table_four_i_4_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 4, _MM_SCALE_4);
00943           table_four_i_5_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 5, _MM_SCALE_4);
00944           table_four_i_6_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 6, _MM_SCALE_4);
00945           table_four_i_7_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 7, _MM_SCALE_4);
00946           table_four_i_8_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 8, _MM_SCALE_4);
00947           table_four_i_9_vec  = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 9, _MM_SCALE_4);
00948           table_four_i_10_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 10, _MM_SCALE_4);
00949           table_four_i_11_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 11, _MM_SCALE_4);
00950           table_four_i_12_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 12, _MM_SCALE_4);
00951           table_four_i_13_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 13, _MM_SCALE_4);
00952           table_four_i_14_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 14, _MM_SCALE_4);
00953           table_four_i_15_vec = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), cutoff_mask, table_four_i_offsets, table_four_base + 15, _MM_SCALE_4);
00954 
00955         #endif
00956 
00957       #else
00958 
00959       unsigned int table_four_i_offsets[16] __attribute__((aligned(64)));
00960       _mm512_store_epi32(table_four_i_offsets, _mm512_slli_epi32(table_i_vec, 4)); // NOTE: table_i * 16 (16 elements per table entry)
00961 
00962       // Load and transpose 256 values (16 x 16 matrix)
00963       __m512i tmpA_00 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 0]])));
00964       __m512i tmpA_01 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 1]])));
00965       __m512i tmpA_02 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 2]])));
00966       __m512i tmpA_03 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 3]])));
00967       __m512i tmpA_04 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 4]])));
00968       __m512i tmpA_05 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 5]])));
00969       __m512i tmpA_06 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 6]])));
00970       __m512i tmpA_07 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 7]])));
00971       __m512i tmpA_08 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 8]])));
00972       __m512i tmpA_09 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[ 9]])));
00973       __m512i tmpA_10 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[10]])));
00974       __m512i tmpA_11 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[11]])));
00975       __m512i tmpA_12 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[12]])));
00976       __m512i tmpA_13 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[13]])));
00977       __m512i tmpA_14 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[14]])));
00978       __m512i tmpA_15 = _mm512_castps_si512(_mm512_load_ps(&(table_four_base[table_four_i_offsets[15]])));
00979       __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
00980       __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
00981       __m512i tmpB_00 = _mm512_mask_swizzle_epi32(tmpA_00, k_2x2_0, tmpA_01, _MM_SWIZ_REG_CDAB);
00982       __m512i tmpB_01 = _mm512_mask_swizzle_epi32(tmpA_01, k_2x2_1, tmpA_00, _MM_SWIZ_REG_CDAB);
00983       __m512i tmpB_02 = _mm512_mask_swizzle_epi32(tmpA_02, k_2x2_0, tmpA_03, _MM_SWIZ_REG_CDAB);
00984       __m512i tmpB_03 = _mm512_mask_swizzle_epi32(tmpA_03, k_2x2_1, tmpA_02, _MM_SWIZ_REG_CDAB);
00985       __m512i tmpB_04 = _mm512_mask_swizzle_epi32(tmpA_04, k_2x2_0, tmpA_05, _MM_SWIZ_REG_CDAB);
00986       __m512i tmpB_05 = _mm512_mask_swizzle_epi32(tmpA_05, k_2x2_1, tmpA_04, _MM_SWIZ_REG_CDAB);
00987       __m512i tmpB_06 = _mm512_mask_swizzle_epi32(tmpA_06, k_2x2_0, tmpA_07, _MM_SWIZ_REG_CDAB);
00988       __m512i tmpB_07 = _mm512_mask_swizzle_epi32(tmpA_07, k_2x2_1, tmpA_06, _MM_SWIZ_REG_CDAB);
00989       __m512i tmpB_08 = _mm512_mask_swizzle_epi32(tmpA_08, k_2x2_0, tmpA_09, _MM_SWIZ_REG_CDAB);
00990       __m512i tmpB_09 = _mm512_mask_swizzle_epi32(tmpA_09, k_2x2_1, tmpA_08, _MM_SWIZ_REG_CDAB);
00991       __m512i tmpB_10 = _mm512_mask_swizzle_epi32(tmpA_10, k_2x2_0, tmpA_11, _MM_SWIZ_REG_CDAB);
00992       __m512i tmpB_11 = _mm512_mask_swizzle_epi32(tmpA_11, k_2x2_1, tmpA_10, _MM_SWIZ_REG_CDAB);
00993       __m512i tmpB_12 = _mm512_mask_swizzle_epi32(tmpA_12, k_2x2_0, tmpA_13, _MM_SWIZ_REG_CDAB);
00994       __m512i tmpB_13 = _mm512_mask_swizzle_epi32(tmpA_13, k_2x2_1, tmpA_12, _MM_SWIZ_REG_CDAB);
00995       __m512i tmpB_14 = _mm512_mask_swizzle_epi32(tmpA_14, k_2x2_0, tmpA_15, _MM_SWIZ_REG_CDAB);
00996       __m512i tmpB_15 = _mm512_mask_swizzle_epi32(tmpA_15, k_2x2_1, tmpA_14, _MM_SWIZ_REG_CDAB);
00997       __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
00998       __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
00999       __m512i tmpC_00 = _mm512_mask_swizzle_epi32(tmpB_00, k_4x4_0, tmpB_02, _MM_SWIZ_REG_BADC);
01000       __m512i tmpC_01 = _mm512_mask_swizzle_epi32(tmpB_01, k_4x4_0, tmpB_03, _MM_SWIZ_REG_BADC);
01001       __m512i tmpC_02 = _mm512_mask_swizzle_epi32(tmpB_02, k_4x4_1, tmpB_00, _MM_SWIZ_REG_BADC);
01002       __m512i tmpC_03 = _mm512_mask_swizzle_epi32(tmpB_03, k_4x4_1, tmpB_01, _MM_SWIZ_REG_BADC);
01003       __m512i tmpC_04 = _mm512_mask_swizzle_epi32(tmpB_04, k_4x4_0, tmpB_06, _MM_SWIZ_REG_BADC);
01004       __m512i tmpC_05 = _mm512_mask_swizzle_epi32(tmpB_05, k_4x4_0, tmpB_07, _MM_SWIZ_REG_BADC);
01005       __m512i tmpC_06 = _mm512_mask_swizzle_epi32(tmpB_06, k_4x4_1, tmpB_04, _MM_SWIZ_REG_BADC);
01006       __m512i tmpC_07 = _mm512_mask_swizzle_epi32(tmpB_07, k_4x4_1, tmpB_05, _MM_SWIZ_REG_BADC);
01007       __m512i tmpC_08 = _mm512_mask_swizzle_epi32(tmpB_08, k_4x4_0, tmpB_10, _MM_SWIZ_REG_BADC);
01008       __m512i tmpC_09 = _mm512_mask_swizzle_epi32(tmpB_09, k_4x4_0, tmpB_11, _MM_SWIZ_REG_BADC);
01009       __m512i tmpC_10 = _mm512_mask_swizzle_epi32(tmpB_10, k_4x4_1, tmpB_08, _MM_SWIZ_REG_BADC);
01010       __m512i tmpC_11 = _mm512_mask_swizzle_epi32(tmpB_11, k_4x4_1, tmpB_09, _MM_SWIZ_REG_BADC);
01011       __m512i tmpC_12 = _mm512_mask_swizzle_epi32(tmpB_12, k_4x4_0, tmpB_14, _MM_SWIZ_REG_BADC);
01012       __m512i tmpC_13 = _mm512_mask_swizzle_epi32(tmpB_13, k_4x4_0, tmpB_15, _MM_SWIZ_REG_BADC);
01013       __m512i tmpC_14 = _mm512_mask_swizzle_epi32(tmpB_14, k_4x4_1, tmpB_12, _MM_SWIZ_REG_BADC);
01014       __m512i tmpC_15 = _mm512_mask_swizzle_epi32(tmpB_15, k_4x4_1, tmpB_13, _MM_SWIZ_REG_BADC);
01015       __mmask16 k_8x8_0 = _mm512_int2mask(0xF0F0);
01016       __mmask16 k_8x8_1 = _mm512_int2mask(0x0F0F);
01017       __m512i tmpD_00 = _mm512_mask_permute4f128_epi32(tmpC_00, k_8x8_0, tmpC_04, _MM_PERM_CDAB);
01018       __m512i tmpD_01 = _mm512_mask_permute4f128_epi32(tmpC_01, k_8x8_0, tmpC_05, _MM_PERM_CDAB);
01019       __m512i tmpD_02 = _mm512_mask_permute4f128_epi32(tmpC_02, k_8x8_0, tmpC_06, _MM_PERM_CDAB);
01020       __m512i tmpD_03 = _mm512_mask_permute4f128_epi32(tmpC_03, k_8x8_0, tmpC_07, _MM_PERM_CDAB);
01021       __m512i tmpD_04 = _mm512_mask_permute4f128_epi32(tmpC_04, k_8x8_1, tmpC_00, _MM_PERM_CDAB);
01022       __m512i tmpD_05 = _mm512_mask_permute4f128_epi32(tmpC_05, k_8x8_1, tmpC_01, _MM_PERM_CDAB);
01023       __m512i tmpD_06 = _mm512_mask_permute4f128_epi32(tmpC_06, k_8x8_1, tmpC_02, _MM_PERM_CDAB);
01024       __m512i tmpD_07 = _mm512_mask_permute4f128_epi32(tmpC_07, k_8x8_1, tmpC_03, _MM_PERM_CDAB);
01025       __m512i tmpD_08 = _mm512_mask_permute4f128_epi32(tmpC_08, k_8x8_0, tmpC_12, _MM_PERM_CDAB);
01026       __m512i tmpD_09 = _mm512_mask_permute4f128_epi32(tmpC_09, k_8x8_0, tmpC_13, _MM_PERM_CDAB);
01027       __m512i tmpD_10 = _mm512_mask_permute4f128_epi32(tmpC_10, k_8x8_0, tmpC_14, _MM_PERM_CDAB);
01028       __m512i tmpD_11 = _mm512_mask_permute4f128_epi32(tmpC_11, k_8x8_0, tmpC_15, _MM_PERM_CDAB);
01029       __m512i tmpD_12 = _mm512_mask_permute4f128_epi32(tmpC_12, k_8x8_1, tmpC_08, _MM_PERM_CDAB);
01030       __m512i tmpD_13 = _mm512_mask_permute4f128_epi32(tmpC_13, k_8x8_1, tmpC_09, _MM_PERM_CDAB);
01031       __m512i tmpD_14 = _mm512_mask_permute4f128_epi32(tmpC_14, k_8x8_1, tmpC_10, _MM_PERM_CDAB);
01032       __m512i tmpD_15 = _mm512_mask_permute4f128_epi32(tmpC_15, k_8x8_1, tmpC_11, _MM_PERM_CDAB);
01033       __mmask16 k_16x16_0 = _mm512_int2mask(0xFF00);
01034       __mmask16 k_16x16_1 = _mm512_int2mask(0x00FF);
01035       table_four_i_0_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_00, k_16x16_0, tmpD_08, _MM_PERM_BADC));
01036       table_four_i_1_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_01, k_16x16_0, tmpD_09, _MM_PERM_BADC));
01037       table_four_i_2_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_02, k_16x16_0, tmpD_10, _MM_PERM_BADC));
01038       table_four_i_3_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_03, k_16x16_0, tmpD_11, _MM_PERM_BADC));
01039       table_four_i_4_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_04, k_16x16_0, tmpD_12, _MM_PERM_BADC));
01040       table_four_i_5_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_05, k_16x16_0, tmpD_13, _MM_PERM_BADC));
01041       table_four_i_6_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_06, k_16x16_0, tmpD_14, _MM_PERM_BADC));
01042       table_four_i_7_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_07, k_16x16_0, tmpD_15, _MM_PERM_BADC));
01043       table_four_i_8_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_08, k_16x16_1, tmpD_00, _MM_PERM_BADC));
01044       table_four_i_9_vec  = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_09, k_16x16_1, tmpD_01, _MM_PERM_BADC));
01045       table_four_i_10_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_10, k_16x16_1, tmpD_02, _MM_PERM_BADC));
01046       table_four_i_11_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_11, k_16x16_1, tmpD_03, _MM_PERM_BADC));
01047       table_four_i_12_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_12, k_16x16_1, tmpD_04, _MM_PERM_BADC));
01048       table_four_i_13_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_13, k_16x16_1, tmpD_05, _MM_PERM_BADC));
01049       table_four_i_14_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_14, k_16x16_1, tmpD_06, _MM_PERM_BADC));
01050       table_four_i_15_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(tmpD_15, k_16x16_1, tmpD_07, _MM_PERM_BADC));
01051 
01052       #endif
01053     }
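          // Illustrative sketch (not part of the kernel): either path above is equivalent to this
          // scalar gather, one table field per destination register ("table_i_of_lane" is a
          // hypothetical per-lane copy of table_i_vec):
          #if 0
            float table_four_i[16][16];  // [field][lane] staging array
            for (int lane = 0; lane < 16; lane++)
              for (int field = 0; field < 16; field++)
                table_four_i[field][lane] = table_four_base[table_i_of_lane * 16 + field];
          #endif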
01054 
01055     #if (0 FAST(+1))
01056 
01057       __m512 vdw_d_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_0_vec), _mm512_mul_ps(B_vec, table_four_i_4_vec));
01058       __m512 vdw_c_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_1_vec), _mm512_mul_ps(B_vec, table_four_i_5_vec));
01059       __m512 vdw_b_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_2_vec), _mm512_mul_ps(B_vec, table_four_i_6_vec));
01060       __m512 vdw_a_vec = _mm512_sub_ps(_mm512_mul_ps(A_vec, table_four_i_3_vec), _mm512_mul_ps(B_vec, table_four_i_7_vec));
01061 
01062       #if (0 ENERGY(+1))
01063         __m512 vdw_val_tmp_0_vec = _mm512_mul_ps(vdw_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
01064         __m512 vdw_val_tmp_1_vec = _mm512_mul_ps(vdw_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
01065         __m512 vdw_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_0_vec), vdw_val_tmp_1_vec);
01066         __m512 vdw_val_tmp_3_vec = _mm512_mul_ps(vdw_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
01067         __m512 vdw_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_2_vec), vdw_val_tmp_3_vec);
01068         __m512 vdw_val_vec       = _mm512_add_ps(_mm512_mul_ps(diffa_vec, vdw_val_tmp_4_vec), vdw_a_vec);
01069         CONTRIB_SUB_PS2PD(vdwEnergy_vec, vdw_val_vec);
01070       #endif
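            // Illustrative sketch (not part of the kernel): the ENERGY block above is a
            // Horner-form evaluation of the tabulated cubic in x = diffa:
            #if 0
              float vdw_val = ((vdw_d * (1.0f/6.0f) * x + vdw_c * (1.0f/4.0f)) * x
                               + vdw_b * (1.0f/2.0f)) * x + vdw_a;  // a + (b/2)x + (c/4)x^2 + (d/6)x^3
            #endif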
01071 
01072       #if (0 SHORT(+1))
01073 
01074         #if (0 NORMAL(+1))
01075           __m512 fast_d_vec = _mm512_mul_ps(kqq_vec, table_four_i_8_vec);
01076           __m512 fast_c_vec = _mm512_mul_ps(kqq_vec, table_four_i_9_vec);
01077           __m512 fast_b_vec = _mm512_mul_ps(kqq_vec, table_four_i_10_vec);
01078           __m512 fast_a_vec = _mm512_mul_ps(kqq_vec, table_four_i_11_vec);
01079         #endif
01080         #if (0 MODIFIED(+1))
01081           __m512 modfckqq_vec = _mm512_mul_ps(_mm512_set_1to16_ps(1.0f - modf_mod), kqq_vec);
01082           __m512 fast_d_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_8_vec);
01083           __m512 fast_c_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_9_vec);
01084           __m512 fast_b_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_10_vec);
01085           __m512 fast_a_vec = _mm512_mul_ps(modfckqq_vec, table_four_i_11_vec);
01086         #endif
01087 
01088         #if (0 ENERGY(+1))
01089           __m512 fast_val_tmp_0_vec = _mm512_mul_ps(fast_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
01090           __m512 fast_val_tmp_1_vec = _mm512_mul_ps(fast_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
01091           __m512 fast_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_0_vec), fast_val_tmp_1_vec);
01092           __m512 fast_val_tmp_3_vec = _mm512_mul_ps(fast_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
01093           __m512 fast_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_2_vec), fast_val_tmp_3_vec);
01094           __m512 fast_val_vec       = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_val_tmp_4_vec), fast_a_vec);
01095           CONTRIB_SUB_PS2PD(electEnergy_vec, fast_val_vec);
01096         #endif
01097 
01098         #if (0 NOT_ALCHPAIR(+1))
01099           fast_d_vec = _mm512_add_ps(fast_d_vec, vdw_d_vec);
01100           fast_c_vec = _mm512_add_ps(fast_c_vec, vdw_c_vec);
01101           fast_b_vec = _mm512_add_ps(fast_b_vec, vdw_b_vec);
01102           fast_a_vec = _mm512_add_ps(fast_a_vec, vdw_a_vec);
01103         #endif
01104 
01105         __m512 fast_dir_tmp_0_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_d_vec), fast_c_vec);
01106         __m512 fast_dir_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, fast_dir_tmp_0_vec), fast_b_vec);
01107         __m512 force_r_vec = fast_dir_vec;  // NOTE: No-op left in as placeholder for when LAM is added
01108 
01109         const __m512 tmp_x_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_x_vec);
01110         PAIR( CONTRIB_ADD_PS2PD(virial_xx_vec, _mm512_mul_ps(tmp_x_vec, p_ij_x_vec)); )
01111         PAIR( CONTRIB_ADD_PS2PD(virial_xy_vec, _mm512_mul_ps(tmp_x_vec, p_ij_y_vec)); )
01112         PAIR( CONTRIB_ADD_PS2PD(virial_xz_vec, _mm512_mul_ps(tmp_x_vec, p_ij_z_vec)); )
01113 
01114         const __m512 tmp_y_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_y_vec);
01115         PAIR( CONTRIB_ADD_PS2PD(virial_yy_vec, _mm512_mul_ps(tmp_y_vec, p_ij_y_vec)); )
01116         PAIR( CONTRIB_ADD_PS2PD(virial_yz_vec, _mm512_mul_ps(tmp_y_vec, p_ij_z_vec)); )
01117 
01118         const __m512 tmp_z_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, force_r_vec, p_ij_z_vec);
01119         PAIR( CONTRIB_ADD_PS2PD(virial_zz_vec, _mm512_mul_ps(tmp_z_vec, p_ij_z_vec)); )
01120 
01121         #if MIC_HANDCODE_FORCE_COMBINE_FORCES == 0
01122           APPLY_FORCES_PS2PD(tmp_x_vec, tmp_y_vec, tmp_z_vec, f_0_x, f_0_y, f_0_z, f_1_x, f_1_y, f_1_z, tmpI32, tmpJ32);
01123         #endif
01124 
01125       #endif // SHORT
01126     #endif // FAST
01127 
01128     #if (0 FULL(+1))
01129 
01130       // Load the slow_table values, if need be
01131       #if (0 SHORT( EXCLUDED(+1) MODIFIED(+1) ))
01132 
01133         __m512 slow_i_0_vec, slow_i_1_vec, slow_i_2_vec, slow_i_3_vec;
01134         {
01135           // Create the indexes
01136           unsigned int slow_i_offsets[16] __attribute__((aligned(64)));
01137           _mm512_store_epi32(slow_i_offsets, _mm512_slli_epi32(table_i_vec, 2)); // table_i * 4
01138 
01139           // Load the values (64 values, 16x4 matrix, so we only care about elements 0-3)
01140           // DMK - NOTE : Since the slow table is aligned, all four elements being loaded by the loadunpacklo
01141           //   will be grouped on a single cacheline, so no need to also include a loadunpackhi instruction
01142           #define __LOAD_SLOW(dstA, offset) \
01143             __m512i (dstA) = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0x000F), slow_table + (offset));
01144           __LOAD_SLOW(slow_i_tmp00_vec, slow_i_offsets[ 0]);
01145           __LOAD_SLOW(slow_i_tmp01_vec, slow_i_offsets[ 1]);
01146           __LOAD_SLOW(slow_i_tmp02_vec, slow_i_offsets[ 2]);
01147           __LOAD_SLOW(slow_i_tmp03_vec, slow_i_offsets[ 3]);
01148           __LOAD_SLOW(slow_i_tmp04_vec, slow_i_offsets[ 4]);
01149           __LOAD_SLOW(slow_i_tmp05_vec, slow_i_offsets[ 5]);
01150           __LOAD_SLOW(slow_i_tmp06_vec, slow_i_offsets[ 6]);
01151           __LOAD_SLOW(slow_i_tmp07_vec, slow_i_offsets[ 7]);
01152           __LOAD_SLOW(slow_i_tmp08_vec, slow_i_offsets[ 8]);
01153           __LOAD_SLOW(slow_i_tmp09_vec, slow_i_offsets[ 9]);
01154           __LOAD_SLOW(slow_i_tmp10_vec, slow_i_offsets[10]);
01155           __LOAD_SLOW(slow_i_tmp11_vec, slow_i_offsets[11]);
01156           __LOAD_SLOW(slow_i_tmp12_vec, slow_i_offsets[12]);
01157           __LOAD_SLOW(slow_i_tmp13_vec, slow_i_offsets[13]);
01158           __LOAD_SLOW(slow_i_tmp14_vec, slow_i_offsets[14]);
01159           __LOAD_SLOW(slow_i_tmp15_vec, slow_i_offsets[15]);
01160           #undef __LOAD_SLOW
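                // Illustrative sketch (not part of the kernel): each __LOAD_SLOW above is, in
                // scalar terms, a 4-float copy from one aligned slow_table entry ("lane" and
                // "slow_entry" are hypothetical names):
                #if 0
                  for (int e = 0; e < 4; e++)
                    slow_entry[e] = slow_table[slow_i_offsets[lane] + e];
                #endif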
01161 
01162           // Transpose the values
01163           // NOTE : Transpose each 2x2 sub-matrix, but only elements 0-3 hold data we care about
01164           #define __TRANSPOSE_2x2__(dstA, dstB, srcA, srcB) \
01165             __m512i dstA = _mm512_mask_swizzle_epi32((srcA), _mm512_int2mask(0xAAAA), (srcB), _MM_SWIZ_REG_CDAB); \
01166             __m512i dstB = _mm512_mask_swizzle_epi32((srcB), _mm512_int2mask(0x5555), (srcA), _MM_SWIZ_REG_CDAB);
01167           __TRANSPOSE_2x2__(slow_2x2_tmp00_vec, slow_2x2_tmp01_vec, slow_i_tmp00_vec, slow_i_tmp01_vec);
01168           __TRANSPOSE_2x2__(slow_2x2_tmp02_vec, slow_2x2_tmp03_vec, slow_i_tmp02_vec, slow_i_tmp03_vec);
01169           __TRANSPOSE_2x2__(slow_2x2_tmp04_vec, slow_2x2_tmp05_vec, slow_i_tmp04_vec, slow_i_tmp05_vec);
01170           __TRANSPOSE_2x2__(slow_2x2_tmp06_vec, slow_2x2_tmp07_vec, slow_i_tmp06_vec, slow_i_tmp07_vec);
01171           __TRANSPOSE_2x2__(slow_2x2_tmp08_vec, slow_2x2_tmp09_vec, slow_i_tmp08_vec, slow_i_tmp09_vec);
01172           __TRANSPOSE_2x2__(slow_2x2_tmp10_vec, slow_2x2_tmp11_vec, slow_i_tmp10_vec, slow_i_tmp11_vec);
01173           __TRANSPOSE_2x2__(slow_2x2_tmp12_vec, slow_2x2_tmp13_vec, slow_i_tmp12_vec, slow_i_tmp13_vec);
01174           __TRANSPOSE_2x2__(slow_2x2_tmp14_vec, slow_2x2_tmp15_vec, slow_i_tmp14_vec, slow_i_tmp15_vec);
01175           #undef __TRANSPOSE_2x2__
01176           #define __TRANSPOSE_4x4__(dstA, dstB, dstC, dstD, srcA, srcB, srcC, srcD) \
01177             __m512i dstA = _mm512_mask_swizzle_epi32((srcA), _mm512_int2mask(0xCCCC), (srcC), _MM_SWIZ_REG_BADC); \
01178             __m512i dstB = _mm512_mask_swizzle_epi32((srcB), _mm512_int2mask(0xCCCC), (srcD), _MM_SWIZ_REG_BADC); \
01179             __m512i dstC = _mm512_mask_swizzle_epi32((srcC), _mm512_int2mask(0x3333), (srcA), _MM_SWIZ_REG_BADC); \
01180             __m512i dstD = _mm512_mask_swizzle_epi32((srcD), _mm512_int2mask(0x3333), (srcB), _MM_SWIZ_REG_BADC);
01181           __TRANSPOSE_4x4__(slow_4x4_tmp00_vec, slow_4x4_tmp01_vec, slow_4x4_tmp02_vec, slow_4x4_tmp03_vec, slow_2x2_tmp00_vec, slow_2x2_tmp01_vec, slow_2x2_tmp02_vec, slow_2x2_tmp03_vec);
01182           __TRANSPOSE_4x4__(slow_4x4_tmp04_vec, slow_4x4_tmp05_vec, slow_4x4_tmp06_vec, slow_4x4_tmp07_vec, slow_2x2_tmp04_vec, slow_2x2_tmp05_vec, slow_2x2_tmp06_vec, slow_2x2_tmp07_vec);
01183           __TRANSPOSE_4x4__(slow_4x4_tmp08_vec, slow_4x4_tmp09_vec, slow_4x4_tmp10_vec, slow_4x4_tmp11_vec, slow_2x2_tmp08_vec, slow_2x2_tmp09_vec, slow_2x2_tmp10_vec, slow_2x2_tmp11_vec);
01184           __TRANSPOSE_4x4__(slow_4x4_tmp12_vec, slow_4x4_tmp13_vec, slow_4x4_tmp14_vec, slow_4x4_tmp15_vec, slow_2x2_tmp12_vec, slow_2x2_tmp13_vec, slow_2x2_tmp14_vec, slow_2x2_tmp15_vec);
01185           #undef __TRANSPOSE_4x4__
01186           __m512i slow_4x8_tmp00_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp00_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp04_vec, _MM_PERM_CDAB);
01187           __m512i slow_4x8_tmp01_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp01_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp05_vec, _MM_PERM_CDAB);
01188           __m512i slow_4x8_tmp02_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp02_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp06_vec, _MM_PERM_CDAB);
01189           __m512i slow_4x8_tmp03_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp03_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp07_vec, _MM_PERM_CDAB);
01190           __m512i slow_4x8_tmp04_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp08_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp12_vec, _MM_PERM_CDAB);
01191           __m512i slow_4x8_tmp05_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp09_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp13_vec, _MM_PERM_CDAB);
01192           __m512i slow_4x8_tmp06_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp10_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp14_vec, _MM_PERM_CDAB);
01193           __m512i slow_4x8_tmp07_vec = _mm512_mask_permute4f128_epi32(slow_4x4_tmp11_vec, _mm512_int2mask(0x00F0), slow_4x4_tmp15_vec, _MM_PERM_CDAB);
01194           slow_i_0_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp00_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp04_vec, _MM_PERM_BADC));
01195           slow_i_1_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp01_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp05_vec, _MM_PERM_BADC));
01196           slow_i_2_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp02_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp06_vec, _MM_PERM_BADC));
01197           slow_i_3_vec = _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(slow_4x8_tmp03_vec, _mm512_int2mask(0xFF00), slow_4x8_tmp07_vec, _MM_PERM_BADC));
01198         }
01199 
01200       #endif // SHORT && (EXCLUDED || MODIFIED)
01201 
01202       __m512 slow_d_vec = NOSHORT( table_four_i_8_vec) SHORT(table_four_i_12_vec);
01203       __m512 slow_c_vec = NOSHORT( table_four_i_9_vec) SHORT(table_four_i_13_vec);
01204       __m512 slow_b_vec = NOSHORT(table_four_i_10_vec) SHORT(table_four_i_14_vec);
01205       __m512 slow_a_vec = NOSHORT(table_four_i_11_vec) SHORT(table_four_i_15_vec);
01206 
01207       #if (0 EXCLUDED(+1))
01208         #if (0 SHORT(+1))
01209           slow_a_vec = _mm512_add_ps(                                         slow_i_3_vec, slow_a_vec);
01210           slow_b_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(2.0f), slow_i_2_vec), slow_b_vec);
01211           slow_c_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(4.0f), slow_i_1_vec), slow_c_vec);
01212           slow_d_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(6.0f), slow_i_0_vec), slow_d_vec);
01213         #endif
01214         #if (0 NOSHORT(+1))
01215           slow_d_vec = _mm512_sub_ps(slow_d_vec, table_four_i_12_vec);
01216           slow_c_vec = _mm512_sub_ps(slow_c_vec, table_four_i_13_vec);
01217           slow_b_vec = _mm512_sub_ps(slow_b_vec, table_four_i_14_vec);
01218           slow_a_vec = _mm512_sub_ps(slow_a_vec, table_four_i_15_vec);
01219         #endif
01220       #endif // EXCLUDED
01221       #if (0 MODIFIED(+1))
01222         #if (0 SHORT(+1))
01223           slow_a_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(       modf_mod), slow_i_3_vec), slow_a_vec);
01224           slow_b_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(2.0f * modf_mod), slow_i_2_vec), slow_b_vec);
01225           slow_c_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(4.0f * modf_mod), slow_i_1_vec), slow_c_vec);
01226           slow_d_vec = _mm512_add_ps(_mm512_mul_ps(_mm512_set_1to16_ps(6.0f * modf_mod), slow_i_0_vec), slow_d_vec);
01227         #endif
01228         #if (0 NOSHORT(+1))
01229           __m512 modf_mod_vec = _mm512_set_1to16_ps(modf_mod);
01230           slow_d_vec = _mm512_sub_ps(slow_d_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_12_vec));
01231           slow_c_vec = _mm512_sub_ps(slow_c_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_13_vec));
01232           slow_b_vec = _mm512_sub_ps(slow_b_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_14_vec));
01233           slow_a_vec = _mm512_sub_ps(slow_a_vec, _mm512_mul_ps(modf_mod_vec, table_four_i_15_vec));
01234         #endif
01235       #endif // MODIFIED
01236       slow_d_vec = _mm512_mul_ps(slow_d_vec, kqq_vec);
01237       slow_c_vec = _mm512_mul_ps(slow_c_vec, kqq_vec);
01238       slow_b_vec = _mm512_mul_ps(slow_b_vec, kqq_vec);
01239       slow_a_vec = _mm512_mul_ps(slow_a_vec, kqq_vec);
01240 
01241       #if (0 ENERGY(+1))
01242         __m512 slow_val_tmp_0_vec = _mm512_mul_ps(slow_d_vec, _mm512_set_1to16_ps(1.0f/6.0f));
01243         __m512 slow_val_tmp_1_vec = _mm512_mul_ps(slow_c_vec, _mm512_set_1to16_ps(1.0f/4.0f));
01244         __m512 slow_val_tmp_2_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_0_vec), slow_val_tmp_1_vec);
01245         __m512 slow_val_tmp_3_vec = _mm512_mul_ps(slow_b_vec, _mm512_set_1to16_ps(1.0f/2.0f));
01246         __m512 slow_val_tmp_4_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_2_vec), slow_val_tmp_3_vec);
01247         __m512 slow_val_vec       = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_val_tmp_4_vec), slow_a_vec);
01248         CONTRIB_SUB_PS2PD(fullElectEnergy_vec, slow_val_vec);
01249       #endif
01250 
01251       #if (0 NOT_ALCHPAIR(FAST(NOSHORT(+1))))
01252         slow_d_vec = _mm512_add_ps(vdw_d_vec, slow_d_vec);
01253         slow_c_vec = _mm512_add_ps(vdw_c_vec, slow_c_vec);
01254         slow_b_vec = _mm512_add_ps(vdw_b_vec, slow_b_vec);
01255         slow_a_vec = _mm512_add_ps(vdw_a_vec, slow_a_vec);
01256       #endif
01257 
01258       __m512 slow_dir_tmp_0_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_d_vec), slow_c_vec);
01259       __m512 slow_dir_vec = _mm512_add_ps(_mm512_mul_ps(diffa_vec, slow_dir_tmp_0_vec), slow_b_vec);
01260       __m512 fullforce_r_vec = slow_dir_vec;  // NOTE: No-op left in as a placeholder for when LAM is added
01261 
01262       const __m512 fulltmp_x_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_x_vec);
01263       PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xx_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_x_vec)); )
01264       PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xy_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_y_vec)); )
01265       PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_xz_vec, _mm512_mul_ps(fulltmp_x_vec, p_ij_z_vec)); )
01266 
01267       const __m512 fulltmp_y_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_y_vec);
01268       PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_yy_vec, _mm512_mul_ps(fulltmp_y_vec, p_ij_y_vec)); )
01269       PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_yz_vec, _mm512_mul_ps(fulltmp_y_vec, p_ij_z_vec)); )
01270 
01271       const __m512 fulltmp_z_vec = _mm512_mask_mul_ps(_mm512_setzero_ps(), cutoff_mask, fullforce_r_vec, p_ij_z_vec);
01272       PAIR( CONTRIB_ADD_PS2PD(fullElectVirial_zz_vec, _mm512_mul_ps(fulltmp_z_vec, p_ij_z_vec)); )
01273 
01274       #if MIC_HANDCODE_FORCE_COMBINE_FORCES == 0
01275         APPLY_FORCES_PS2PD(fulltmp_x_vec, fulltmp_y_vec, fulltmp_z_vec, fullf_0_x, fullf_0_y, fullf_0_z, fullf_1_x, fullf_1_y, fullf_1_z, tmpI32, tmpJ32);
01276       #endif
01277 
01278     #endif // FULL
01279 
01280     #if MIC_HANDCODE_FORCE_COMBINE_FORCES != 0
01281       APPLY_FORCES_PS2PD;
01282     #endif
01283 
01284     // DMK - DEBUG - LARGE FORCE DETECTION
01285     #if (MIC_DATA_STRUCT_VERIFY != 0) && (MIC_DATA_STRUCT_VERIFY_KERNELS != 0)
01286       const float lfd_const = 3.2e2f; // Set to 5.0e1 to print a few per timestep for ApoA1
01287       __mmask16 lfd_mask = _mm512_int2mask(0x0000);
01288       #define LFD_CHECK(v)  _mm512_mask_cmplt_ps_mask(cutoff_mask, _mm512_set_1to16_ps(lfd_const), _mm512_mask_mul_ps(v, _mm512_cmplt_ps_mask(v, _mm512_setzero_ps()), v, _mm512_set_1to16_ps(-1.0f)))
01289       #if (0 FAST(SHORT(+1)))
01290         lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_x_vec));
01291         lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_y_vec));
01292         lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(tmp_z_vec));
01293       #endif
01294       #if (0 FULL(+1))
01295         lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_x_vec));
01296         lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_y_vec));
01297         lfd_mask = _mm512_kor(lfd_mask, LFD_CHECK(fulltmp_z_vec));
01298       #endif
01299       #undef LFD_CHECK
01300       #pragma unroll(16)
01301       for (int _k = 0; _k < 16; _k++) {
01302         lfd_mask = _mm512_kor(lfd_mask, _mm512_int2mask(((pExt_0[tmpI32[_k]].index == 53839 && pExt_1[tmpJ32[_k]].index == 53827) ? (0x1) : (0x0)) << _k));
01303       }
01304       if (_mm512_mask2int(lfd_mask) != 0) {
01305 
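              // Spill every vector quantity used above into 64-byte-aligned scratch
              //   arrays so the offending lanes can be printed with scalar code.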
01306         float tmp_force[16] __attribute__((aligned(64)));
01307         float tmp_fullforce[16] __attribute__((aligned(64)));
01308         float tmp_ijx[16] __attribute__((aligned(64)));
01309         float tmp_ijy[16] __attribute__((aligned(64)));
01310         float tmp_ijz[16] __attribute__((aligned(64)));
01311         float tmp_x[16] __attribute__((aligned(64)));
01312         float tmp_y[16] __attribute__((aligned(64)));
01313         float tmp_z[16] __attribute__((aligned(64)));
01314         float fulltmp_x[16] __attribute__((aligned(64)));
01315         float fulltmp_y[16] __attribute__((aligned(64)));
01316         float fulltmp_z[16] __attribute__((aligned(64)));
01317         float pi_x[16] __attribute__((aligned(64)));
01318         float pi_y[16] __attribute__((aligned(64)));
01319         float pi_z[16] __attribute__((aligned(64)));
01320         float pi_q[16] __attribute__((aligned(64)));
01321         float pj_x[16] __attribute__((aligned(64)));
01322         float pj_y[16] __attribute__((aligned(64)));
01323         float pj_z[16] __attribute__((aligned(64)));
01324         float pj_q[16] __attribute__((aligned(64)));
01325         float r2[16] __attribute__((aligned(64)));
01326         int table_i[16] __attribute__((aligned(64)));
01327         float kqq[16] __attribute__((aligned(64)));
01328         float diffa[16] __attribute__((aligned(64)));
01329         float vdw_a[16] __attribute__((aligned(64)));
01330         float vdw_b[16] __attribute__((aligned(64)));
01331         float vdw_c[16] __attribute__((aligned(64)));
01332         float vdw_d[16] __attribute__((aligned(64)));
01333         float fast_a[16] __attribute__((aligned(64)));
01334         float fast_b[16] __attribute__((aligned(64)));
01335         float fast_c[16] __attribute__((aligned(64)));
01336         float fast_d[16] __attribute__((aligned(64)));
01337         float A[16] __attribute__((aligned(64)));
01338         float B[16] __attribute__((aligned(64)));
01339         float table_four_i_0[16] __attribute__((aligned(64)));
01340         float table_four_i_1[16] __attribute__((aligned(64)));
01341         float table_four_i_2[16] __attribute__((aligned(64)));
01342         float table_four_i_3[16] __attribute__((aligned(64)));
01343         float table_four_i_4[16] __attribute__((aligned(64)));
01344         float table_four_i_5[16] __attribute__((aligned(64)));
01345         float table_four_i_6[16] __attribute__((aligned(64)));
01346         float table_four_i_7[16] __attribute__((aligned(64)));
01347         int i_vdwType[16] __attribute__((aligned(64)));
01348         int j_vdwType[16] __attribute__((aligned(64)));
01349 
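              // Store the live vector registers; inputs that are compiled out under the
              //   current FAST/SHORT/FULL flag combination are dumped as zeros instead.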
01350         _mm512_store_ps(pi_x, p_i_x_vec);
01351         _mm512_store_ps(pi_y, p_i_y_vec);
01352         _mm512_store_ps(pi_z, p_i_z_vec);
01353         _mm512_store_ps(pi_q, p_i_q_vec);
01354         _mm512_store_ps(pj_x, p_j_x_vec);
01355         _mm512_store_ps(pj_y, p_j_y_vec);
01356         _mm512_store_ps(pj_z, p_j_z_vec);
01357         _mm512_store_ps(pj_q, p_j_q_vec);
01358         _mm512_store_ps(r2, r2_vec);
01359         _mm512_store_epi32(table_i, table_i_vec);
01360         _mm512_store_ps(kqq, kqq_vec);
01361         _mm512_store_ps(diffa, diffa_vec);
01362         _mm512_store_ps(table_four_i_0, table_four_i_0_vec);
01363         _mm512_store_ps(table_four_i_1, table_four_i_1_vec);
01364         _mm512_store_ps(table_four_i_2, table_four_i_2_vec);
01365         _mm512_store_ps(table_four_i_3, table_four_i_3_vec);
01366         _mm512_store_ps(table_four_i_4, table_four_i_4_vec);
01367         _mm512_store_ps(table_four_i_5, table_four_i_5_vec);
01368         _mm512_store_ps(table_four_i_6, table_four_i_6_vec);
01369         _mm512_store_ps(table_four_i_7, table_four_i_7_vec);
01370         #if (0 FAST(+1))
01371           _mm512_store_ps(vdw_a, vdw_a_vec);
01372           _mm512_store_ps(vdw_b, vdw_b_vec);
01373           _mm512_store_ps(vdw_c, vdw_c_vec);
01374           _mm512_store_ps(vdw_d, vdw_d_vec);
01375           _mm512_store_ps(A, A_vec);
01376           _mm512_store_ps(B, B_vec);
01377           _mm512_store_epi64(i_vdwType, pExt_i_vdwType_vec);
01378           _mm512_store_epi64(j_vdwType, pExt_j_vdwType_vec);
01379         #else
01380           _mm512_store_ps(vdw_a, _mm512_setzero_ps());
01381           _mm512_store_ps(vdw_b, _mm512_setzero_ps());
01382           _mm512_store_ps(vdw_c, _mm512_setzero_ps());
01383           _mm512_store_ps(vdw_d, _mm512_setzero_ps());
01384           _mm512_store_ps(A, _mm512_setzero_ps());
01385           _mm512_store_ps(B, _mm512_setzero_ps());
01386           _mm512_store_epi64(i_vdwType, _mm512_setzero_epi32());
01387           _mm512_store_epi64(j_vdwType, _mm512_setzero_epi32());
01388         #endif
01389         #if (0 FAST(SHORT(+1)))
01390           _mm512_store_ps(tmp_force, force_r_vec);
01391           _mm512_store_ps(tmp_x, tmp_x_vec);
01392           _mm512_store_ps(tmp_y, tmp_y_vec);
01393           _mm512_store_ps(tmp_z, tmp_z_vec);
01394           _mm512_store_ps(fast_a, fast_a_vec);
01395           _mm512_store_ps(fast_b, fast_b_vec);
01396           _mm512_store_ps(fast_c, fast_c_vec);
01397           _mm512_store_ps(fast_d, fast_d_vec);
01398         #else
01399           _mm512_store_ps(tmp_force, _mm512_setzero_ps());
01400           _mm512_store_ps(tmp_x, _mm512_setzero_ps());
01401           _mm512_store_ps(tmp_y, _mm512_setzero_ps());
01402           _mm512_store_ps(tmp_z, _mm512_setzero_ps());
01403           _mm512_store_ps(fast_a, _mm512_setzero_ps());
01404           _mm512_store_ps(fast_b, _mm512_setzero_ps());
01405           _mm512_store_ps(fast_c, _mm512_setzero_ps());
01406           _mm512_store_ps(fast_d, _mm512_setzero_ps());
01407         #endif
01408         #if (0 FULL(+1))
01409           _mm512_store_ps(tmp_fullforce, fullforce_r_vec);
01410           _mm512_store_ps(fulltmp_x, fulltmp_x_vec);
01411           _mm512_store_ps(fulltmp_y, fulltmp_y_vec);
01412           _mm512_store_ps(fulltmp_z, fulltmp_z_vec);
01413         #else
01414           _mm512_store_ps(tmp_fullforce, _mm512_setzero_ps());
01415           _mm512_store_ps(fulltmp_x, _mm512_setzero_ps());
01416           _mm512_store_ps(fulltmp_y, _mm512_setzero_ps());
01417           _mm512_store_ps(fulltmp_z, _mm512_setzero_ps());
01418         #endif
01419         _mm512_store_ps(tmp_ijx, p_ij_x_vec);
01420         _mm512_store_ps(tmp_ijy, p_ij_y_vec);
01421         _mm512_store_ps(tmp_ijz, p_ij_z_vec);
01422 
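              // PRNT(idx) prints the full interaction state for lane idx when its bit is
              //   set in lfd_mask, including a recomputation of the exclusion-flag lookup
              //   for that i-j pair.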
01423         #define PRNT(idx) \
01424           if (_mm512_mask2int(lfd_mask) & (1 << (idx))) { \
01425             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :  tmp x:%+.3e, y:%+.3e, z:%+.3e  :  fulltmp x:%+.3e, y:%+.3e, z:%+.3e\n", \
01426                    params.pe, params.timestep, \
01427                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01428                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01429                    tmp_x[idx], tmp_y[idx], tmp_z[idx], \
01430                    fulltmp_x[idx], fulltmp_y[idx], fulltmp_z[idx] \
01431                   ); \
01432             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    calc_" SELF("self") PAIR("pair") FULL("_fullelect") ENERGY("_energy") "." NORMAL("NORM") MODIFIED("MOD") EXCLUDED("EXCL") "\n", \
01433                    params.pe, params.timestep, \
01434                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01435                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]) \
01436                   ); \
01437             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    i:%ld, j:%ld, i_index:%d, j_index:%d\n", \
01438                    params.pe, params.timestep, \
01439                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01440                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01441                    tmpI32[idx], tmpJ32[idx], pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index \
01442                   ); \
01443             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    p_i x:%+.3e, y:%+.3e, z:%+.3e, q:%+.3e  :  p_j x:%+.3e, y:%+.3e, z:%+.3e, q:%+.3e  :  r2 %+.3e\n", \
01444                    params.pe, params.timestep, \
01445                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01446                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01447                    pi_x[idx], pi_y[idx], pi_z[idx], pi_q[idx], \
01448                    pj_x[idx], pj_y[idx], pj_z[idx], pj_q[idx], \
01449                    r2[idx] \
01450                   ); \
01451             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    offset x:%+.3e, y:%+.3e, z:%+.3e\n", \
01452                    params.pe, params.timestep, \
01453                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01454                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01455                    offset_x, offset_y, offset_z \
01456                   ); \
01457             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    forces f&s:%+.3e, full:%+.3e  :  p_ij x:%+.3e, y:%+.3e, z:%+.3e\n", \
01458                    params.pe, params.timestep, \
01459                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01460                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01461                    tmp_force[idx], tmp_fullforce[idx], \
01462                    tmp_ijx[idx], tmp_ijy[idx], tmp_ijz[idx] \
01463                   ); \
01464             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    table_i:%d, kqq::%+.3e, diffa:%+.3e\n", \
01465                    params.pe, params.timestep, \
01466                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01467                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01468                    table_i[idx], kqq[idx], diffa[idx] \
01469                   ); \
01470             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    vdw a:%+.3e, b:%+.3e, c:%+.3e, d:%+.3e  :  fast a:%+.3e, b:%+.3e, c:%+.3e, d:%+.3e, \n", \
01471                    params.pe, params.timestep, \
01472                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01473                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01474                    vdw_a[idx], vdw_b[idx], vdw_c[idx], vdw_d[idx], \
01475                    fast_a[idx], fast_b[idx], fast_c[idx], fast_d[idx] \
01476                   ); \
01477             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    A:%+.3e, B:%+.3e  :  vdwType i:%d, j:%d\n", \
01478                    params.pe, params.timestep, \
01479                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01480                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01481                    A[idx], B[idx], i_vdwType[idx], j_vdwType[idx] \
01482                   ); \
01483             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    table_four_i 0:%+.3e, 1:%+.3e, 2:%+.3e, 3:%+.3e, 4:%+.3e, 5:%+.3e, 6:%+.3e, 7:%+.3e\n", \
01484                    params.pe, params.timestep, \
01485                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01486                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01487                    table_four_i_0[idx], table_four_i_1[idx], table_four_i_2[idx], table_four_i_3[idx], table_four_i_4[idx], table_four_i_5[idx], table_four_i_6[idx], table_four_i_7[idx] \
01488                   ); \
01489             int indexDiff = pExt_0[tmpI32[idx]].index - pExt_1[tmpJ32[idx]].index; \
01490             int maxDiff = pExt_1[tmpJ32[idx]].excl_maxdiff; \
01491             int offset = -1, offset_major = -1, offset_minor = -1, flags = 0; \
01492             if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) { \
01493               offset = (2 * indexDiff) + pExt_1[tmpJ32[idx]].excl_index; \
01494               offset_major = offset / (sizeof(unsigned int) * 8); \
01495               offset_minor = offset % (sizeof(unsigned int) * 8); \
01496               flags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03; \
01497             } \
01498             printf("_lfd :: %05d %06d : %8d, %8d : %06d, %3d, %3d, %3d, %3d  :    excl: exclIndex:%d, indexDiff:%d, maxDiff:%d, offset:%d, flags:%d\n", \
01499                    params.pe, params.timestep, \
01500                    pExt_0[tmpI32[idx]].index, pExt_1[tmpJ32[idx]].index, \
01501                    params.ppI, params.p1, params.p2, (int)(tmpI32[idx]), (int)(tmpJ32[idx]), \
01502                    pExt_1[tmpJ32[idx]].excl_index, indexDiff, maxDiff, offset, flags \
01503                   ); \
01504           }
01505         PRNT( 0); PRNT( 1); PRNT( 2); PRNT( 3); PRNT( 4); PRNT( 5); PRNT( 6); PRNT( 7);
01506         PRNT( 8); PRNT( 9); PRNT(10); PRNT(11); PRNT(12); PRNT(13); PRNT(14); PRNT(15);
01507         #undef PRNT
01508       } // end if (_mm512_mask2int(lfd_mask) != 0)
01509     #endif  // (MIC_DATA_STRUCT_VERIFY != 0) && (MIC_DATA_STRUCT_VERIFY_KERNELS != 0)
01510   }
01511 
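     // Horizontally reduce the per-lane double-precision accumulators back into the
     //   scalar energy and virial totals expected by the caller.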
01512   FAST(ENERGY( vdwEnergy += _mm512_reduce_add_pd(vdwEnergy_vec); ))
01513   FAST(SHORT(ENERGY( electEnergy += _mm512_reduce_add_pd(electEnergy_vec); )))
01514   FULL(ENERGY( fullElectEnergy += _mm512_reduce_add_pd(fullElectEnergy_vec); ))
01515   #if (0 PAIR(FAST(SHORT(+1))))
01516     virial_xx += _mm512_reduce_add_pd(virial_xx_vec);
01517     virial_xy += _mm512_reduce_add_pd(virial_xy_vec);
01518     virial_xz += _mm512_reduce_add_pd(virial_xz_vec);
01519     virial_yy += _mm512_reduce_add_pd(virial_yy_vec);
01520     virial_yz += _mm512_reduce_add_pd(virial_yz_vec);
01521     virial_zz += _mm512_reduce_add_pd(virial_zz_vec);
01522   #endif
01523   #if (0 PAIR(FULL(+1)))
01524     fullElectVirial_xx += _mm512_reduce_add_pd(fullElectVirial_xx_vec);
01525     fullElectVirial_xy += _mm512_reduce_add_pd(fullElectVirial_xy_vec);
01526     fullElectVirial_xz += _mm512_reduce_add_pd(fullElectVirial_xz_vec);
01527     fullElectVirial_yy += _mm512_reduce_add_pd(fullElectVirial_yy_vec);
01528     fullElectVirial_yz += _mm512_reduce_add_pd(fullElectVirial_yz_vec);
01529     fullElectVirial_zz += _mm512_reduce_add_pd(fullElectVirial_zz_vec);
01530   #endif
01531 
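     // For verification builds, fold this kernel's per-lane exclusion counts into the
     //   running exclusion checksum.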
01532   #if (MIC_EXCL_CHECKSUM_FULL != 0) && (0 EXCLUDED(+1) MODIFIED(+1))
01533     params.exclusionSum += _mm512_reduce_add_epi32(exclusionSum_vec);
01534   #endif
01535 
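     // Undefine the helper macros defined by this header so the file can be cleanly
     //   re-included with a different combination of kernel flags.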
01536   #undef GATHER_PS_I32_OFFSET
01537   #undef GATHER_PS_I32
01538   #undef GATHER_EPI32_I32_OFFSET
01539   #undef GATHER_EPI32_I32
01540   #undef SCATTER_INC_PD_I32_STEP
01541   #undef SCATTER_INC_PD_I32
01542   #undef SCATTER_INC_PS_I32_STEP
01543   #undef SCATTER_INC_PS_I32
01544   #undef CONTRIB_ADD_PS2PD
01545   #undef CONTRIB_SUB_PS2PD
01546   #undef APPLY_FORCES_PS2PD_STEP_ADD
01547   #undef APPLY_FORCES_PS2PD_STEP_SUB
01548   #undef APPLY_FORCES_PS2PD_STEP_ADD_COMBO
01549   #undef APPLY_FORCES_PS2PD_STEP_SUB_COMBO
01550   #undef APPLY_FORCES_PS2PD_JSTEP
01551   #undef APPLY_FORCES_PS2PD
01552 
01553 #endif  // NAMD_MIC
