NAMD
ComputeNonbondedMICKernelBase2.h

#ifdef NAMD_MIC

{ // Start a block so variables can be declared outside the force loop

  // Validate the control macros for this file by ensuring that certain invalid
  // combinations result in compiler errors, preventing those combinations from being used
  EXCLUDED( FAST( foo bar ) )
  EXCLUDED( MODIFIED( foo bar ) )
  EXCLUDED( NORMAL( foo bar ) )
  NORMAL( MODIFIED( foo bar ) )
  ALCHPAIR( NOT_ALCHPAIR( foo bar ) )
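  // (Each of these macros expands to its argument in the kernel variants it guards
  //  and to nothing otherwise, so an invalid combination would paste the bare
  //  tokens "foo bar" into the code and fail to compile.)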

  // If pairlist refinement is enabled, then refine the pairlists by doing the exact
  // distance checks for all the atom pairs in the current pairlist as a separate
  // loop before the actual force computation loop (removing the distance check
  // from the force computation loop and ensuring that all iterations of the force
  // computation loop are within cutoff).
  #if REFINE_PAIRLISTS != 0

    int plSize = *(params.plSize_ptr);
    int * RESTRICT plArray = *(params.plArray_ptr);
    double * RESTRICT r2Array = *(params.r2Array_ptr);

    // Grow the scratch arrays as required
    #if REFINE_PAIRLISTS_XYZ != 0
      const int plMinSize = ((PAIRLIST[1] - 2) + 7) & (~(7));  // Round up to a multiple of 8
      const int plMallocSize = 4 * plMinSize;
    #else
      const int plMinSize = PAIRLIST[1] - 2;
      const int plMallocSize = plMinSize;
    #endif
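    // (The bitwise roundup pads the pairlist length to the next multiple of 8,
    //  e.g. 13 -> 16, so that each plMinSize-double slice of the scratch buffer
    //  below stays 64-byte aligned.)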
    if (plMallocSize > plSize) {
      if (plArray != NULL) { _mm_free(plArray); }
      if (r2Array != NULL) { _mm_free(r2Array); }
      const int newPlSize = plMallocSize * 1.2;  // Allocate 20% headroom to reduce future reallocations
      plArray = (   int*)_mm_malloc(newPlSize * sizeof(   int), 64);  __ASSERT(plArray != NULL);
      r2Array = (double*)_mm_malloc(newPlSize * sizeof(double), 64);  __ASSERT(r2Array != NULL);
      *(params.plArray_ptr) = plArray;
      *(params.r2Array_ptr) = r2Array;
      *(params.plSize_ptr) = newPlSize;
    }

    __ASSUME_ALIGNED(r2Array);  // NOTE: plArray handled below (common code)
    #if REFINE_PAIRLISTS_XYZ != 0
      double * RESTRICT p_ij_x_array = r2Array + (1 * plMinSize);  __ASSUME_ALIGNED(p_ij_x_array);
      double * RESTRICT p_ij_y_array = r2Array + (2 * plMinSize);  __ASSUME_ALIGNED(p_ij_y_array);
      double * RESTRICT p_ij_z_array = r2Array + (3 * plMinSize);  __ASSUME_ALIGNED(p_ij_z_array);
    #endif
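    // (r2Array is a single 4 * plMinSize allocation partitioned into four
    //  plMinSize-double slices: r2 followed by the x, y, and z components of
    //  each i-j displacement.)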

    #if REFINE_PAIRLIST_HANDCODE != 0
    {

      __m512i plI_vec = _mm512_set_16to16_epi32(0, 0, 0, 0, 0, 0, 0, 0,
                                                7, 6, 5, 4, 3, 2, 1, 0);
      __m512i ij_mask_vec = _mm512_set_1to16_epi32(0x0000FFFF);
      __m512i ij_store_perm_pattern = _mm512_set_16to16_epi32( 9, 7, 9, 6, 9, 5, 9, 4,
                                                               9, 3, 9, 2, 9, 1, 9, 0 );
      int offset = 0;
      plSize = PAIRLIST[1] - 2;
      int * RESTRICT orig_plArray = PAIRLIST + 2;

      #pragma loop count (100)
      #pragma novector
      for (int plI = 0; plI < plMinSize; plI += 8) {

        // Create the active_mask
        __mmask16 active_mask = _mm512_mask_cmplt_epi32_mask(_mm512_int2mask(0x00FF), plI_vec, _mm512_set_1to16_epi32(plSize));
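        // (The mask restricts work to the low 8 lanes, and the plI_vec < plSize
        //  comparison disables the tail lanes when the pairlist length is not a
        //  multiple of 8.)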

        // Load the i and j values from the pairlist array
        // NOTE: The "hi" part of this "unaligned load" should never be required since plArray is actually
        //   aligned (unaligned load being used because of 32-bit indexes vs 64-bit forces).
        __m512i ij_vec = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), active_mask, orig_plArray + plI     );
        ij_vec         = _mm512_mask_loadunpackhi_epi32(                ij_vec, active_mask, orig_plArray + plI + 16);
        __m512i i_vec = _mm512_and_epi32(_mm512_mask_srli_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, 16), ij_mask_vec);
        __m512i j_vec = _mm512_mask_and_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, ij_mask_vec);
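        // (Each 32-bit pairlist entry packs the i index in its upper 16 bits and
        //  the j index in its lower 16 bits, matching the shift/mask decode in the
        //  scalar refinement path below.)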

        // NOTE: Make these the same size as pointers so that they can be directly loaded into
        //   the index register that can be used with a base pointer register.
        uintptr_t tmpI[8] __attribute__((aligned(64)));
        uintptr_t tmpJ[8] __attribute__((aligned(64)));
        _mm512_store_epi32(tmpI, _mm512_permutevar_epi32(ij_store_perm_pattern, i_vec));
        _mm512_store_epi32(tmpJ, _mm512_permutevar_epi32(ij_store_perm_pattern, j_vec));
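        // (The permute interleaves each 32-bit index with an always-zero source
        //  element, so the stores leave zero-extended 64-bit values in tmpI/tmpJ.)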

        // Increment the vectorized loop counter
        plI_vec = _mm512_mask_add_epi32(plI_vec, _mm512_int2mask(0x00FF), plI_vec, _mm512_set_1to16_epi32(8));

        // Load position/charge data for the i and j atoms
        __m512d p_i_x_vec, p_i_y_vec, p_i_z_vec, p_j_x_vec, p_j_y_vec, p_j_z_vec;
        {
          __m512d zero_vec = _mm512_setzero_pd();
          p_i_x_vec = p_i_y_vec = p_i_z_vec = zero_vec;
          p_j_x_vec = p_j_y_vec = p_j_z_vec = zero_vec;

          // NOTE: If one asm statement is used per vcvtps2pd instruction, each using the same mask (_k),
          //   the compiler resorts to using multiple masks instead of just one (a single statement is
          //   used here to avoid that).
          #define __VCVTPS2PD_LOAD_ELEM(i) \
            { __mmask16 _k = _mm512_int2mask(0x01 << (i));  uintptr_t _i = tmpI[(i)];  uintptr_t _j = tmpJ[(i)]; \
              asm("vcvtps2pd (%6,%12,4){{1to8}}, %0{{%14}}\n" \
                  "vcvtps2pd (%7,%12,4){{1to8}}, %1{{%14}}\n" \
                  "vcvtps2pd (%8,%12,4){{1to8}}, %2{{%14}}\n" \
                  "vcvtps2pd (%9,%13,4){{1to8}}, %3{{%14}}\n" \
                  "vcvtps2pd (%10,%13,4){{1to8}}, %4{{%14}}\n" \
                  "vcvtps2pd (%11,%13,4){{1to8}}, %5{{%14}}\n" \
                  : "+v"(p_i_x_vec), "+v"(p_i_y_vec), "+v"(p_i_z_vec), \
                    "+v"(p_j_x_vec), "+v"(p_j_y_vec), "+v"(p_j_z_vec) \
                  : "r"(p_0_x), "r"(p_0_y), "r"(p_0_z), \
                    "r"(p_1_x), "r"(p_1_y), "r"(p_1_z), \
                    "r"(_i), "r"(_j), "Yk"(_k) \
              ); \
            }
          __VCVTPS2PD_LOAD_ELEM(0);
          __VCVTPS2PD_LOAD_ELEM(1);
          __VCVTPS2PD_LOAD_ELEM(2);
          __VCVTPS2PD_LOAD_ELEM(3);
          __VCVTPS2PD_LOAD_ELEM(4);
          __VCVTPS2PD_LOAD_ELEM(5);
          __VCVTPS2PD_LOAD_ELEM(6);
          __VCVTPS2PD_LOAD_ELEM(7);
          #undef __VCVTPS2PD_LOAD_ELEM
        }
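        // (Each macro invocation broadcast-converts one single-precision position
        //  element for atoms i and j to double and, via the one-bit mask _k,
        //  writes it into lane (i) of the corresponding vector, effectively
        //  gathering the positions for the eight pairlist entries handled in
        //  this iteration.)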

        p_i_x_vec = _mm512_add_pd(p_i_x_vec, _mm512_set_1to8_pd(params.offset.x));
        p_i_y_vec = _mm512_add_pd(p_i_y_vec, _mm512_set_1to8_pd(params.offset.y));
        p_i_z_vec = _mm512_add_pd(p_i_z_vec, _mm512_set_1to8_pd(params.offset.z));
        __m512d p_ij_x_vec = _mm512_sub_pd(p_i_x_vec, p_j_x_vec);
        __m512d p_ij_y_vec = _mm512_sub_pd(p_i_y_vec, p_j_y_vec);
        __m512d p_ij_z_vec = _mm512_sub_pd(p_i_z_vec, p_j_z_vec);

        __m512d r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_x_vec, p_ij_x_vec), _mm512_set_1to8_pd(r2_delta));
        r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_y_vec, p_ij_y_vec), r2_vec);
        r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_z_vec, p_ij_z_vec), r2_vec);

        __mmask16 cutoff_mask = _mm512_mask_cmplt_pd_mask(active_mask, r2_vec, _mm512_set_1to8_pd(cutoff2_delta));

        _mm512_mask_packstorelo_epi32(plArray + offset     , cutoff_mask, ij_vec);
        _mm512_mask_packstorehi_epi32(plArray + offset + 16, cutoff_mask, ij_vec);
        _mm512_mask_packstorelo_pd(r2Array + offset    , cutoff_mask, r2_vec);
        _mm512_mask_packstorehi_pd(r2Array + offset + 8, cutoff_mask, r2_vec);
        #if REFINE_PAIRLISTS_XYZ != 0
          _mm512_mask_packstorelo_pd(p_ij_x_array + offset    , cutoff_mask, p_ij_x_vec);
          _mm512_mask_packstorehi_pd(p_ij_x_array + offset + 8, cutoff_mask, p_ij_x_vec);
          _mm512_mask_packstorelo_pd(p_ij_y_array + offset    , cutoff_mask, p_ij_y_vec);
          _mm512_mask_packstorehi_pd(p_ij_y_array + offset + 8, cutoff_mask, p_ij_y_vec);
          _mm512_mask_packstorelo_pd(p_ij_z_array + offset    , cutoff_mask, p_ij_z_vec);
          _mm512_mask_packstorehi_pd(p_ij_z_array + offset + 8, cutoff_mask, p_ij_z_vec);
        #endif

        offset += _mm512_mask_reduce_add_epi32(cutoff_mask, _mm512_set_1to16_epi32(1));
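        // (The masked packstores write only the lanes that passed the cutoff test,
        //  contiguously starting at plArray/r2Array + offset, and the reduce_add of
        //  ones under cutoff_mask advances offset by the number of surviving pairs:
        //  a vectorized stream compaction.)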
      }

      plSize = offset;

    }
    #else

      // Refine the saved pairlist
      const int savedPlSize = PAIRLIST[1];
      plSize = 0;
      // DMK - NOTE | TODO | FIXME : If plSize alone is used to index both arrays, the compiler
      //   does not recognize the compress idiom, so r2Size is used separately.  Once the compiler
      //   recognizes this pattern, fix and/or report it.  For now, use separate index variables
      //   so the compress optimization happens.
      int r2Size = 0;
      #if REFINE_PAIRLISTS_XYZ != 0
        int ijXSize = 0;
        int ijYSize = 0;
        int ijZSize = 0;
      #endif
      for (int plI = 2; plI < savedPlSize; plI++) {
        const int plEntry = PAIRLIST[plI];
        const int i = (plEntry >> 16) & 0xFFFF;
        const int j = (plEntry      ) & 0xFFFF;
        double p_ij_x = (p_0_x[i] + params.offset.x) - p_1_x[j];
        double p_ij_y = (p_0_y[i] + params.offset.y) - p_1_y[j];
        double p_ij_z = (p_0_z[i] + params.offset.z) - p_1_z[j];
        double r2_ij_delta = r2_delta + (p_ij_x * p_ij_x) + (p_ij_y * p_ij_y) + (p_ij_z * p_ij_z);
        if (r2_ij_delta < cutoff2_delta) {
          plArray[plSize++] = plEntry;
          r2Array[r2Size++] = r2_ij_delta;
          #if REFINE_PAIRLISTS_XYZ != 0
            p_ij_x_array[ijXSize++] = p_ij_x;
            p_ij_y_array[ijYSize++] = p_ij_y;
            p_ij_z_array[ijZSize++] = p_ij_z;
          #endif
        }
      }
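
      // A minimal sketch of the compress idiom referred to above (illustrative only;
      // in[], out[], n, and limit are hypothetical):
      //
      //   int nOut = 0;
      //   for (int k = 0; k < n; k++) {
      //     if (in[k] < limit) { out[nOut++] = in[k]; }
      //   }
      //
      // Keeping one write index per output array lets the compiler map each
      // conditional append onto a vector compress/packstore rather than a scalar loop.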

    #endif  // REFINE_PAIRLIST_HANDCODE != 0

  #else  // REFINE_PAIRLISTS != 0

    const int plSize = PAIRLIST[1] - 2;
    const int * RESTRICT plArray = PAIRLIST + 2;

  #endif  // REFINE_PAIRLISTS != 0

  // NOTE : The above code defines plArray and plSize, which are the pointer and the
  //   length of the pairlist to be processed by the force computation loop, whether
  //   or not pairlist refinement is being used.  Neither the pointer nor the size should
  //   include the first two elements (allocated size and actual size) of the pairlists
  //   that are saved across iterations.
  __ASSERT(plArray != NULL);
  __ASSERT(plSize >= 0);
  __ASSUME_ALIGNED(plArray);

  // Include code for the force computation loop itself
  #if (MIC_HANDCODE_FORCE != 0) && (MIC_HANDCODE_FORCE_SINGLE != 0)

  #else

  #endif

} // End block

#else  // NAMD_MIC


#endif  // NAMD_MIC