ComputeNonbondedMICKernelBase2.h

Go to the documentation of this file.
00001 
00002 #ifdef NAMD_MIC
00003 
00004 { // Start a block so variables can be declared outside the force loop
00005 
00006   // Validate the control macros for this file, by ensuring that certain invalid
00007   //   combinations result in compiler errors, preventing those combinations
00008   EXCLUDED( FAST( foo bar ) )
00009   EXCLUDED( MODIFIED( foo bar ) )
00010   EXCLUDED( NORMAL( foo bar ) )
00011   NORMAL( MODIFIED( foo bar ) )
00012   ALCHPAIR( NOT_ALCHPAIR( foo bar ) )
00013 
00014   // If pairlist refinement is enabled, then refine the pairlists by doing the exact
00015   //   distance checks for all the atom pairs in the current pairlist as a separate
00016   //   loop before the actual force computation loop (removing the distance check
00017   //   from the force computation loop and ensuring that all iterations of the force
00018   //   computation loop are within cutoff).
00019   #if REFINE_PAIRLISTS != 0
00020 
00021     int plSize = *(params.plSize_ptr);  // current allocated length of the persistent scratch arrays
00022     int * RESTRICT plArray = *(params.plArray_ptr);
00023     double * RESTRICT r2Array = *(params.r2Array_ptr);
00024 
00025     // Grow the scratch arrays as required
00026     #if REFINE_PAIRLISTS_XYZ != 0
00027       const int plMinSize = ((PAIRLIST[1] - 2) + 7) & (~(7));  // Round up to a multiple of 8 (8 doubles = 64 bytes, keeping the x/y/z sub-arrays below 64-byte aligned)
00028       const int plMallocSize = 4 * plMinSize;  // one region holding r2 plus the x, y, and z component arrays
00029     #else
00030       const int plMinSize = PAIRLIST[1] - 2;
00031       const int plMallocSize = plMinSize;
00032     #endif
00033     if (plMallocSize > plSize) {
00034       if (plArray != NULL) { _mm_free(plArray); }
00035       if (r2Array != NULL) { _mm_free(r2Array); }
00036       const int newPlSize = plMallocSize * 1.2;  // over-allocate by 20% to amortize future regrowth
00037       plArray = (   int*)_mm_malloc(newPlSize * sizeof(   int), 64); __ASSERT(plArray != NULL);
00038       r2Array = (double*)_mm_malloc(newPlSize * sizeof(double), 64); __ASSERT(r2Array != NULL);
00039       *(params.plArray_ptr) = plArray;
00040       *(params.r2Array_ptr) = r2Array;
00041       *(params.plSize_ptr) = newPlSize;
00042     }
00043 
00044     __ASSUME_ALIGNED(r2Array); // NOTE: plArray handled below (common code)
00045     #if REFINE_PAIRLISTS_XYZ != 0
00046       double * RESTRICT p_ij_x_array = r2Array + (1 * plMinSize); __ASSUME_ALIGNED(p_ij_x_array);
00047       double * RESTRICT p_ij_y_array = r2Array + (2 * plMinSize); __ASSUME_ALIGNED(p_ij_y_array);
00048       double * RESTRICT p_ij_z_array = r2Array + (3 * plMinSize); __ASSUME_ALIGNED(p_ij_z_array);
00049     #endif
00050 
00051     #if REFINE_PAIRLIST_HANDCODE != 0
00052     {
00053 
00054       __m512i plI_vec = _mm512_set_16to16_epi32(0, 0, 0, 0, 0, 0, 0, 0,
00055                                                 7, 6, 5, 4, 3, 2, 1, 0);  // lane indices 0..7 in the low half (only 8 pairs processed per iteration)
00056       __m512i ij_mask_vec = _mm512_set_1to16_epi32(0x0000FFFF);  // extracts the low 16-bit j index from a packed i:j entry
00057       __m512i ij_store_perm_pattern = _mm512_set_16to16_epi32( 9,  7,  9,  6,  9,  5,  9,  4,
00058                                                                9,  3,  9,  2,  9,  1,  9,  0  );  // interleave each 32-bit index with a zero lane (lanes 8+ are zero) to widen to 64-bit
00059       int offset = 0;
00060       plSize = PAIRLIST[1] - 2;
00061       int * RESTRICT orig_plArray = PAIRLIST + 2;  // skip the two header elements (allocated size, actual size)
00062 
00063       #pragma loop count (100)
00064       #pragma novector
00065       for (int plI = 0; plI < plMinSize; plI += 8) {
00066 
00067         // Create the active_mask (low 8 lanes, limited to entries still within plSize)
00068         __mmask16 active_mask = _mm512_mask_cmplt_epi32_mask(_mm512_int2mask(0x00FF), plI_vec, _mm512_set_1to16_epi32(plSize));
00069 
00070         // Load the i and j values from the pairlist array
00071         // NOTE: The "hi" part of this "unaligned load" should never be required since plArray is actually
00072         //   aligned (unaligned load being used because of 32-bit indexes vs 64-bit forces).
00073         __m512i ij_vec = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), active_mask, orig_plArray + plI     );
00074                 ij_vec = _mm512_mask_loadunpackhi_epi32(                ij_vec, active_mask, orig_plArray + plI + 16);
00075         __m512i i_vec = _mm512_and_epi32(_mm512_mask_srli_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, 16), ij_mask_vec);
00076         __m512i j_vec = _mm512_mask_and_epi32(_mm512_setzero_epi32(), active_mask, ij_vec, ij_mask_vec);
00077 
00078         // NOTE: Make these the same size as pointers so that they can be directly loaded in to
00079         //   the index register that can be used with a base pointer register.
00080         uintptr_t tmpI[8] __attribute__((aligned(64)));
00081         uintptr_t tmpJ[8] __attribute__((aligned(64)));
00082         _mm512_store_epi32(tmpI, _mm512_permutevar_epi32(ij_store_perm_pattern, i_vec));
00083         _mm512_store_epi32(tmpJ, _mm512_permutevar_epi32(ij_store_perm_pattern, j_vec));
00084 
00085         // Increment the vectorized loop counter
00086         plI_vec = _mm512_mask_add_epi32(plI_vec, _mm512_int2mask(0x00FF), plI_vec, _mm512_set_1to16_epi32(8));
00087 
00088         // Load position/charge data for the i and j atoms (single-precision in memory,
00089         //   converted up to double per-lane by the vcvtps2pd gather below)
00090         __m512d p_i_x_vec, p_i_y_vec, p_i_z_vec, p_j_x_vec, p_j_y_vec, p_j_z_vec;
00091         {
00092           __m512d zero_vec = _mm512_setzero_pd();
00093           p_i_x_vec = p_i_y_vec = p_i_z_vec = zero_vec;
00094           p_j_x_vec = p_j_y_vec = p_j_z_vec = zero_vec;
00095 
00096           // NOTE: If 1 asm statement per 1 vcvtps2pd instruction with each using the same mask (_k), the
00097           //   compiler resorts to using multiple masks instead of just one (using a single statement to avoid).
00098           #define __VCVTPS2PD_LOAD_ELEM(i) \
00099           { __mmask16 _k = _mm512_int2mask(0x01 << (i)); uintptr_t _i = tmpI[(i)]; uintptr_t _j = tmpJ[(i)]; \
00100             asm("vcvtps2pd (%6,%12,4){{1to8}}, %0{{%14}}\n" \
00101                 "vcvtps2pd (%7,%12,4){{1to8}}, %1{{%14}}\n" \
00102                 "vcvtps2pd (%8,%12,4){{1to8}}, %2{{%14}}\n" \
00103                 "vcvtps2pd (%9,%13,4){{1to8}}, %3{{%14}}\n" \
00104                 "vcvtps2pd (%10,%13,4){{1to8}}, %4{{%14}}\n" \
00105                 "vcvtps2pd (%11,%13,4){{1to8}}, %5{{%14}}\n" \
00106                 : "+v"(p_i_x_vec), "+v"(p_i_y_vec), "+v"(p_i_z_vec), \
00107                   "+v"(p_j_x_vec), "+v"(p_j_y_vec), "+v"(p_j_z_vec)  \
00108                 : "r"(p_0_x), "r"(p_0_y), "r"(p_0_z), \
00109                   "r"(p_1_x), "r"(p_1_y), "r"(p_1_z), \
00110                   "r"(_i), "r"(_j), "Yk"(_k) \
00111                ); \
00112           }
00113           __VCVTPS2PD_LOAD_ELEM(0);
00114           __VCVTPS2PD_LOAD_ELEM(1);
00115           __VCVTPS2PD_LOAD_ELEM(2);
00116           __VCVTPS2PD_LOAD_ELEM(3);
00117           __VCVTPS2PD_LOAD_ELEM(4);
00118           __VCVTPS2PD_LOAD_ELEM(5);
00119           __VCVTPS2PD_LOAD_ELEM(6);
00120           __VCVTPS2PD_LOAD_ELEM(7);
00121           #undef __VCVTPS2PD_LOAD_ELEM
00122         }
00123 
00124         p_i_x_vec = _mm512_add_pd(p_i_x_vec, _mm512_set_1to8_pd(params.offset.x));
00125         p_i_y_vec = _mm512_add_pd(p_i_y_vec, _mm512_set_1to8_pd(params.offset.y));
00126         p_i_z_vec = _mm512_add_pd(p_i_z_vec, _mm512_set_1to8_pd(params.offset.z));
00127         __m512d p_ij_x_vec = _mm512_sub_pd(p_i_x_vec, p_j_x_vec);
00128         __m512d p_ij_y_vec = _mm512_sub_pd(p_i_y_vec, p_j_y_vec);
00129         __m512d p_ij_z_vec = _mm512_sub_pd(p_i_z_vec, p_j_z_vec);
00130 
00131         // r2 + r2_delta for each of the 8 pairs
00132         __m512d r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_x_vec, p_ij_x_vec), _mm512_set_1to8_pd(r2_delta));
00133         r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_y_vec, p_ij_y_vec), r2_vec);
00134         r2_vec = _mm512_add_pd(_mm512_mul_pd(p_ij_z_vec, p_ij_z_vec), r2_vec);
00135 
00136         __mmask16 cutoff_mask = _mm512_mask_cmplt_pd_mask(active_mask, r2_vec, _mm512_set_1to8_pd(cutoff2_delta));
00137 
00138         // Masked pack-stores compress the passing lanes contiguously at plArray/r2Array + offset
00139         _mm512_mask_packstorelo_epi32(plArray + offset     , cutoff_mask, ij_vec);
00140         _mm512_mask_packstorehi_epi32(plArray + offset + 16, cutoff_mask, ij_vec);
00141         _mm512_mask_packstorelo_pd(r2Array + offset    , cutoff_mask, r2_vec);
00142         _mm512_mask_packstorehi_pd(r2Array + offset + 8, cutoff_mask, r2_vec);
00143         #if REFINE_PAIRLISTS_XYZ != 0
00144           _mm512_mask_packstorelo_pd(p_ij_x_array + offset    , cutoff_mask, p_ij_x_vec);
00145           _mm512_mask_packstorehi_pd(p_ij_x_array + offset + 8, cutoff_mask, p_ij_x_vec);
00146           _mm512_mask_packstorelo_pd(p_ij_y_array + offset    , cutoff_mask, p_ij_y_vec);
00147           _mm512_mask_packstorehi_pd(p_ij_y_array + offset + 8, cutoff_mask, p_ij_y_vec);
00148           _mm512_mask_packstorelo_pd(p_ij_z_array + offset    , cutoff_mask, p_ij_z_vec);
00149           _mm512_mask_packstorehi_pd(p_ij_z_array + offset + 8, cutoff_mask, p_ij_z_vec);
00150         #endif
00151 
00152         offset += _mm512_mask_reduce_add_epi32(cutoff_mask, _mm512_set_1to16_epi32(1));  // advance by the number of pairs that passed the cutoff test
00153       }
00154 
00155       plSize = offset;  // refined pairlist length
00156 
00157     }
00158     #else
00159 
00160       // Refine the saved pairlist (scalar version of the loop above)
00161       const int savedPlSize = PAIRLIST[1];
00162       plSize = 0;
00163       // DMK - NOTE | TODO | FIXME : If just plSize is used, the compiler doesn't recognize
00164       //   the compress idiom, so used r2Size separately, and once the compiler recognizes
00165       //   this, fix and/or report it.  For now, use two index variables so compress happens.
00166       int r2Size = 0;
00167       #if REFINE_PAIRLISTS_XYZ != 0
00168         int ijXSize = 0;
00169         int ijYSize = 0;
00170         int ijZSize = 0;
00171       #endif
00172       for (int plI = 2; plI < savedPlSize; plI++) {  // start at 2 to skip the pairlist header
00173         const int plEntry = PAIRLIST[plI];
00174         const int i = (plEntry >> 16) & 0xFFFF;  // i index packed in the high 16 bits
00175         const int j = (plEntry      ) & 0xFFFF;  // j index packed in the low 16 bits
00176         double p_ij_x = (p_0_x[i] + params.offset.x) - p_1_x[j];
00177         double p_ij_y = (p_0_y[i] + params.offset.y) - p_1_y[j];
00178         double p_ij_z = (p_0_z[i] + params.offset.z) - p_1_z[j];
00179         double r2_ij_delta = r2_delta + (p_ij_x * p_ij_x) + (p_ij_y * p_ij_y) + (p_ij_z * p_ij_z);
00180         if (r2_ij_delta < cutoff2_delta) {
00181           plArray[plSize++] = plEntry;
00182           r2Array[r2Size++] = r2_ij_delta;
00183           //plSize++;
00184           #if REFINE_PAIRLISTS_XYZ != 0
00185             p_ij_x_array[ijXSize++] = p_ij_x;
00186             p_ij_y_array[ijYSize++] = p_ij_y;
00187             p_ij_z_array[ijZSize++] = p_ij_z;
00188           #endif
00189         }
00190       }
00191 
00192     #endif // REFINE_PAIRLIST_HANDCODE check
00193 
00194   #else  // REFINE_PAIRLISTS != 0
00195 
00196     const int plSize = PAIRLIST[1] - 2;
00197     const int * RESTRICT plArray = PAIRLIST + 2;
00198 
00199   #endif  // REFINE_PAIRLISTS != 0
00200 
00201   // NOTE : The above code defines plArray and plSize, which are the pointer and the
00202   //   length of the pairlist to be processed by the force computation loop, whether
00203   //   or not pairlist refinement is being used.  Neither the pointer or the size should
00204   //   include the first two elements (allocated size and actual size) of the pairlists
00205   //   that are saved across iterations.
00206   __ASSERT(plArray != NULL);
00207   __ASSERT(plSize >= 0);
00208   __ASSUME_ALIGNED(plArray);
00209 
00210   // Include code for the force computation loop itself
00211   #if (MIC_HANDCODE_FORCE != 0) && (MIC_HANDCODE_FORCE_SINGLE != 0)
00212     #include "ComputeNonbondedMICKernelBase2_handcode_single.h"
00213   #else
00214     #include "ComputeNonbondedMICKernelBase2_scalar.h"
00215   #endif
00216 
00217 } // End block
00218 
00219 #else  // NAMD_MIC
00220 
00221 #include "ComputeNonbondedMICKernelBase2_handcode_single.h"  // NOTE(review): presumably these headers compile to nothing without NAMD_MIC -- confirm they guard on it
00222 #include "ComputeNonbondedMICKernelBase2_scalar.h"
00223 
00224 #endif  // NAMD_MIC
00222 

Generated on Thu Sep 21 01:17:11 2017 for NAMD by  doxygen 1.4.7