// Pairlist-refinement buffer setup (compiled only when REFINE_PAIRLISTS is
// non-zero): read the cached output buffers through the pointers held in
// `params`, and reallocate them when the incoming pairlist needs more room.
19 #if REFINE_PAIRLISTS != 0
// Local copies of the current capacity and of the two buffer pointers.
21 int plSize = *(params.plSize_ptr);
22 int * RESTRICT plArray = *(params.plArray_ptr);
23 double * RESTRICT r2Array = *(params.r2Array_ptr);
// Required capacity. Two alternative definitions of plMinSize/plMallocSize
// follow; the #else/#endif that separates them is not visible in this excerpt.
26 #if REFINE_PAIRLISTS_XYZ != 0
// XYZ variant: (PAIRLIST[1] - 2) rounded up to a multiple of 8, and four
// segments of that size are requested.
27 const int plMinSize = ((PAIRLIST[1] - 2) + 7) & (~(7));
28 const int plMallocSize = 4 * plMinSize;
// Other variant: exactly PAIRLIST[1] - 2 elements.
30 const int plMinSize = PAIRLIST[1] - 2;
31 const int plMallocSize = plMinSize;
// Grow path: taken only when the required size exceeds the cached capacity.
// Old buffers are released (not copied) before new ones are allocated.
33 if (plMallocSize > plSize) {
34 if (plArray != NULL) { _mm_free(plArray); }
35 if (r2Array != NULL) { _mm_free(r2Array); }
// The product is evaluated in double and truncated on conversion to int,
// which gives roughly 20% headroom over the required size.
36 const int newPlSize = plMallocSize * 1.2;
// Both buffers get the same element count (newPlSize) and a 64-byte
// alignment request; each result is checked with __ASSERT.
37 plArray = (
int*)_mm_malloc(newPlSize *
sizeof(
int), 64); __ASSERT(plArray != NULL);
38 r2Array = (
double*)_mm_malloc(newPlSize *
sizeof(
double), 64); __ASSERT(r2Array != NULL);
// Publish the new buffers and the new capacity back through `params`.
39 *(params.plArray_ptr) = plArray;
40 *(params.r2Array_ptr) = r2Array;
41 *(params.plSize_ptr) = newPlSize;
// NOTE(review): __ASSUME_ALIGNED is a project macro not defined in this
// excerpt; presumably an alignment hint to the compiler - confirm.
44 __ASSUME_ALIGNED(r2Array);
45 #if REFINE_PAIRLISTS_XYZ != 0
// The x/y/z displacement arrays are not separate allocations: they are
// carved out of r2Array at offsets of 1x, 2x and 3x plMinSize elements.
46 double * RESTRICT p_ij_x_array = r2Array + (1 * plMinSize); __ASSUME_ALIGNED(p_ij_x_array);
47 double * RESTRICT p_ij_y_array = r2Array + (2 * plMinSize); __ASSUME_ALIGNED(p_ij_y_array);
48 double * RESTRICT p_ij_z_array = r2Array + (3 * plMinSize); __ASSUME_ALIGNED(p_ij_z_array);
// Hand-coded (512-bit intrinsic) pairlist refinement: walks the packed
// pairlist eight entries at a time, computes r2_delta + |p_i + offset - p_j|^2
// for each entry, and compacts the entries that fall below cutoff2_delta into
// plArray / r2Array (and the x/y/z arrays when REFINE_PAIRLISTS_XYZ is set).
// Several structural lines (closing braces, the end of the asm macro, the
// declaration of `offset`) are not visible in this excerpt.
51 #if REFINE_PAIRLIST_HANDCODE != 0
// Running entry index for the eight lanes in use. NOTE(review): the argument
// list appears to run from the highest lane down, i.e. lanes 0..7 hold 0..7
// and lanes 8..15 hold 0 - confirm against the intrinsic's documentation.
54 __m512i plI_vec = _mm512_set_16to16_epi32(0, 0, 0, 0, 0, 0, 0, 0,
55 7, 6, 5, 4, 3, 2, 1, 0);
// Keeps the low 16 bits of each 32-bit lane.
56 __m512i ij_mask_vec = _mm512_set_1to16_epi32(0x0000FFFF);
// Permutation used before spilling indices to memory: source lanes 0..7 are
// interleaved with source lane 9. Lane 9 of i_vec/j_vec is zero because only
// lanes 0..7 are ever active, so this presumably zero-extends each 32-bit
// index into a 64-bit slot (assumes 64-bit uintptr_t - confirm).
57 __m512i ij_store_perm_pattern = _mm512_set_16to16_epi32( 9, 7, 9, 6, 9, 5, 9, 4,
58 9, 3, 9, 2, 9, 1, 9, 0 );
// From here on plSize is the number of entries in the input list, and
// orig_plArray points past the first two ints of PAIRLIST.
60 plSize = PAIRLIST[1] - 2;
61 int * RESTRICT orig_plArray = PAIRLIST + 2;
63 #pragma loop count (100)
// Eight entries per iteration, up to plMinSize.
65 for (
int plI = 0; plI < plMinSize; plI += 8) {
// Lanes (restricted to the low eight) whose entry index is below plSize.
68 __mmask16 active_mask = _mm512_mask_cmplt_epi32_mask(_mm512_int2mask(0x00FF), plI_vec, _mm512_set_1to16_epi32(plSize));
// loadunpacklo/hi pair: masked load of the entries starting at
// orig_plArray + plI; inactive lanes keep the zero fill.
73 __m512i ij_vec = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), active_mask, orig_plArray + plI );
74 ij_vec = _mm512_mask_loadunpackhi_epi32( ij_vec, active_mask, orig_plArray + plI + 16);
// Unpack each entry: i is the upper 16 bits, j the lower 16 bits. Inactive
// lanes are zero in both results.
75 __m512i i_vec = _mm512_and_epi32(_mm512_mask_srli_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, 16), ij_mask_vec);
76 __m512i j_vec = _mm512_mask_and_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, ij_mask_vec);
// Spill the permuted indices to aligned stack arrays so they can be read
// back one at a time as scalar index registers for the asm below.
80 uintptr_t tmpI[8] __attribute__((aligned(64)));
81 uintptr_t tmpJ[8] __attribute__((aligned(64)));
82 _mm512_store_epi32(tmpI, _mm512_permutevar_epi32(ij_store_perm_pattern, i_vec));
83 _mm512_store_epi32(tmpJ, _mm512_permutevar_epi32(ij_store_perm_pattern, j_vec));
// Advance the low eight lanes of the running index by 8 for the next pass.
86 plI_vec = _mm512_mask_add_epi32(plI_vec, _mm512_int2mask(0x00FF), plI_vec, _mm512_set_1to16_epi32(8));
// Position vectors start zeroed; the macro below fills them one lane at a time.
89 __m512d p_i_x_vec, p_i_y_vec, p_i_z_vec, p_j_x_vec, p_j_y_vec, p_j_z_vec;
91 __m512d zero_vec = _mm512_setzero_pd();
92 p_i_x_vec = p_i_y_vec = p_i_z_vec = zero_vec;
93 p_j_x_vec = p_j_y_vec = p_j_z_vec = zero_vec;
// __VCVTPS2PD_LOAD_ELEM(i): builds a one-bit write mask for lane i, reads the
// scalar indices tmpI[i] / tmpJ[i], and issues six vcvtps2pd instructions that
// each take a {1to8} broadcast memory operand at base + index*4 (p_0_* with
// the i index, p_1_* with the j index) and write, under the one-bit mask,
// into the matching position vector. The tail of the macro (end of the asm
// statement and closing brace) is not visible in this excerpt.
97 #define __VCVTPS2PD_LOAD_ELEM(i) \
98 { __mmask16 _k = _mm512_int2mask(0x01 << (i)); uintptr_t _i = tmpI[(i)]; uintptr_t _j = tmpJ[(i)]; \
99 asm("vcvtps2pd (%6,%12,4){{1to8}}, %0{{%14}}\n" \
100 "vcvtps2pd (%7,%12,4){{1to8}}, %1{{%14}}\n" \
101 "vcvtps2pd (%8,%12,4){{1to8}}, %2{{%14}}\n" \
102 "vcvtps2pd (%9,%13,4){{1to8}}, %3{{%14}}\n" \
103 "vcvtps2pd (%10,%13,4){{1to8}}, %4{{%14}}\n" \
104 "vcvtps2pd (%11,%13,4){{1to8}}, %5{{%14}}\n" \
105 : "+v"(p_i_x_vec), "+v"(p_i_y_vec), "+v"(p_i_z_vec), \
106 "+v"(p_j_x_vec), "+v"(p_j_y_vec), "+v"(p_j_z_vec) \
107 : "r"(p_0_x), "r"(p_0_y), "r"(p_0_z), \
108 "r"(p_1_x), "r"(p_1_y), "r"(p_1_z), \
109 "r"(_i), "r"(_j), "Yk"(_k) \
112 __VCVTPS2PD_LOAD_ELEM(0);
113 __VCVTPS2PD_LOAD_ELEM(1);
114 __VCVTPS2PD_LOAD_ELEM(2);
115 __VCVTPS2PD_LOAD_ELEM(3);
116 __VCVTPS2PD_LOAD_ELEM(4);
117 __VCVTPS2PD_LOAD_ELEM(5);
118 __VCVTPS2PD_LOAD_ELEM(6);
119 __VCVTPS2PD_LOAD_ELEM(7);
120 #undef __VCVTPS2PD_LOAD_ELEM
// Shift the i-side positions by params.offset, then form the i - j displacement.
123 p_i_x_vec = _mm512_add_pd(p_i_x_vec, _mm512_set_1to8_pd(params.offset.x));
124 p_i_y_vec = _mm512_add_pd(p_i_y_vec, _mm512_set_1to8_pd(params.offset.y));
125 p_i_z_vec = _mm512_add_pd(p_i_z_vec, _mm512_set_1to8_pd(params.offset.z));
126 __m512d p_ij_x_vec = _mm512_sub_pd(p_i_x_vec, p_j_x_vec);
127 __m512d p_ij_y_vec = _mm512_sub_pd(p_i_y_vec, p_j_y_vec);
128 __m512d p_ij_z_vec = _mm512_sub_pd(p_i_z_vec, p_j_z_vec);
// r2 = r2_delta + dx*dx + dy*dy + dz*dz, accumulated in that order.
130 __m512d r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_x_vec, p_ij_x_vec), _mm512_set_1to8_pd(r2_delta));
131 r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_y_vec, p_ij_y_vec), r2_vec);
132 r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_z_vec, p_ij_z_vec), r2_vec);
// Keep only active lanes whose r2 is strictly below cutoff2_delta.
134 __mmask16 cutoff_mask = _mm512_mask_cmplt_pd_mask(active_mask, r2_vec, _mm512_set_1to8_pd(cutoff2_delta));
// packstorelo/hi pairs: write the selected lanes contiguously starting at
// element `offset` of each output array (`offset` is declared outside the
// visible lines).
136 _mm512_mask_packstorelo_epi32(plArray + offset , cutoff_mask, ij_vec);
137 _mm512_mask_packstorehi_epi32(plArray + offset + 16, cutoff_mask, ij_vec);
138 _mm512_mask_packstorelo_pd(r2Array + offset , cutoff_mask, r2_vec);
139 _mm512_mask_packstorehi_pd(r2Array + offset + 8, cutoff_mask, r2_vec);
140 #if REFINE_PAIRLISTS_XYZ != 0
141 _mm512_mask_packstorelo_pd(p_ij_x_array + offset , cutoff_mask, p_ij_x_vec);
142 _mm512_mask_packstorehi_pd(p_ij_x_array + offset + 8, cutoff_mask, p_ij_x_vec);
143 _mm512_mask_packstorelo_pd(p_ij_y_array + offset , cutoff_mask, p_ij_y_vec);
144 _mm512_mask_packstorehi_pd(p_ij_y_array + offset + 8, cutoff_mask, p_ij_y_vec);
145 _mm512_mask_packstorelo_pd(p_ij_z_array + offset , cutoff_mask, p_ij_z_vec);
146 _mm512_mask_packstorehi_pd(p_ij_z_array + offset + 8, cutoff_mask, p_ij_z_vec);
// Advance the output cursor by the number of lanes kept (masked sum of 1s).
149 offset += _mm512_mask_reduce_add_epi32(cutoff_mask, _mm512_set_1to16_epi32(1));
// Scalar pairlist refinement: visits every entry of PAIRLIST from index 2 up
// to PAIRLIST[1], recomputes the shifted squared distance for the (i, j) pair
// and appends the entry to the output arrays when it is below cutoff2_delta.
// The declarations/initial values of the output counters (plSize, r2Size,
// ijXSize, ...) and the closing braces are not visible in this excerpt.
158 const int savedPlSize = PAIRLIST[1];
// NOTE(review): the body of this conditional is missing from the excerpt;
// presumably it sets up the x/y/z counters used below - confirm.
164 #if REFINE_PAIRLISTS_XYZ != 0
169 for (
int plI = 2; plI < savedPlSize; plI++) {
// Each entry packs two 16-bit indices: i in the upper half, j in the lower.
170 const int plEntry = PAIRLIST[plI];
171 const int i = (plEntry >> 16) & 0xFFFF;
172 const int j = (plEntry ) & 0xFFFF;
// Displacement from p_1[j] to p_0[i] after shifting p_0[i] by params.offset.
173 double p_ij_x = (p_0_x[i] + params.offset.x) - p_1_x[j];
174 double p_ij_y = (p_0_y[i] + params.offset.y) - p_1_y[j];
175 double p_ij_z = (p_0_z[i] + params.offset.z) - p_1_z[j];
// Squared distance with r2_delta already added in.
176 double r2_ij_delta = r2_delta + (p_ij_x *
p_ij_x) + (p_ij_y * p_ij_y) + (p_ij_z *
p_ij_z);
// Strict comparison: entries exactly at cutoff2_delta are dropped.
177 if (r2_ij_delta < cutoff2_delta) {
// Append the packed entry and its r2 value, advancing each counter.
178 plArray[plSize++] = plEntry;
179 r2Array[r2Size++] = r2_ij_delta;
// Optionally keep the displacement components as well.
181 #if REFINE_PAIRLISTS_XYZ != 0
182 p_ij_x_array[ijXSize++] =
p_ij_x;
183 p_ij_y_array[ijYSize++] =
p_ij_y;
184 p_ij_z_array[ijZSize++] =
p_ij_z;
// NOTE(review): the opening conditional visible earlier in the file tests
// REFINE_PAIRLIST_HANDCODE, while the trailing comment on this #endif names
// MIC_HANDCODE_FORCE - confirm which conditional this closes.
189 #endif // MIC_HANDCODE_FORCE check
// No-refinement path: the pairlist is used in place. plSize is the stored
// length PAIRLIST[1] minus 2, and plArray starts just after the first two
// ints of PAIRLIST.
191 #else // REFINE_PAIRLISTS != 0
193 const int plSize = PAIRLIST[1] - 2;
194 const int * RESTRICT plArray = PAIRLIST + 2;
196 #endif // REFINE_PAIRLISTS != 0
// Sanity checks applied whichever path produced plArray/plSize.
// NOTE(review): __ASSERT and __ASSUME_ALIGNED are project macros not defined
// in this excerpt; presumably a debug assertion and an alignment hint - confirm.
203 __ASSERT(plArray != NULL);
204 __ASSERT(plSize >= 0);
205 __ASSUME_ALIGNED(plArray);
208 #if (MIC_HANDCODE_FORCE != 0) && (MIC_HANDCODE_FORCE_SINGLE != 0)
register const BigReal p_ij_z
register const BigReal p_ij_x
register const BigReal p_ij_y