ComputeNonbondedMICKernelBase.h

Go to the documentation of this file.
00001 
00002 #ifdef NAMD_MIC
00003 
00006 
00007 #ifdef ARCH_POWERPC
00008   #include <builtins.h>
00009 #endif
00010 
00011 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)
00012   #include <emmintrin.h>  // We're using SSE2 intrinsics
00013   #if defined(__INTEL_COMPILER)
00014     #define __align(X) __declspec(align(X) )
00015   #elif defined(__GNUC__) || defined(__PGI)
00016     #define __align(X)  __attribute__((aligned(X) ))
00017   #else
00018     #define __align(X) __declspec(align(X) )
00019   #endif
00020 #endif
00021 
00022 // DMK - NOTE : Removing the includes for now, since we are using the "raw data"
00023 //   from the host data structures (i.e. packing them up at startup and passing
00024 //   that raw data to the card).  As data structures are included on the device
00025 //   as well (if that happens), then some of these could be included.
00026 //#ifdef DEFINITION // (
00027 //  #include "LJTable.h"
00028 //  #include "Molecule.h"
00029 //  #include "ComputeNonbondedUtil.h"
00030 //#endif // )
00031 //#include "Parameters.h"
00032 //#if NAMD_ComputeNonbonded_SortAtoms != 0
00033 //  #include "PatchMap.h"
00034 //#endif
00035 
00036 // determining class name
00037 #undef NAME
00038 #undef CLASS
00039 #undef CLASSNAME
00040 #define NAME CLASSNAME(calc)
00041 
00042 #undef PAIR
00043 #if NBTYPE == NBPAIR
00044   #define PAIR(X) X
00045   #define CLASS ComputeNonbondedPair
00046   #define CLASSNAME(X) ENERGYNAME( X ## _pair )
00047 #else
00048   #define PAIR(X)
00049 #endif
00050 
00051 #undef SELF
00052 #if NBTYPE == NBSELF
00053   #define SELF(X) X
00054   #define CLASS ComputeNonbondedSelf
00055   #define CLASSNAME(X) ENERGYNAME( X ## _self )
00056 #else
00057   #define SELF(X)
00058 #endif
00059 
00060 #undef ENERGYNAME
00061 #undef ENERGY
00062 #undef NOENERGY
00063 #ifdef CALCENERGY
00064   #define ENERGY(X) X
00065   #define NOENERGY(X)
00066   #define ENERGYNAME(X) SLOWONLYNAME( X ## _energy )
00067 #else
00068   #define ENERGY(X)
00069   #define NOENERGY(X) X
00070   #define ENERGYNAME(X) SLOWONLYNAME( X )
00071 #endif
00072 
00073 #undef SLOWONLYNAME
00074 #undef FAST
00075 #ifdef SLOWONLY
00076   #define FAST(X)
00077   #define SLOWONLYNAME(X) MERGEELECTNAME( X ## _slow )
00078 #else
00079   #define FAST(X) X
00080   #define SLOWONLYNAME(X) MERGEELECTNAME( X )
00081 #endif
00082 
00083 #undef MERGEELECTNAME
00084 #undef SHORT
00085 #undef NOSHORT
00086 #ifdef MERGEELECT
00087   #define SHORT(X)
00088   #define NOSHORT(X) X
00089   #define MERGEELECTNAME(X) FULLELECTNAME( X ## _merge )
00090 #else
00091   #define SHORT(X) X
00092   #define NOSHORT(X)
00093   #define MERGEELECTNAME(X) FULLELECTNAME( X )
00094 #endif
00095 
00096 #undef FULLELECTNAME
00097 #undef FULL
00098 #undef NOFULL
00099 #ifdef FULLELECT
00100   #define FULLELECTNAME(X) TABENERGYNAME( X ## _fullelect )
00101   #define FULL(X) X
00102   #define NOFULL(X)
00103 #else
00104   #define FULLELECTNAME(X) TABENERGYNAME( X )
00105   #define FULL(X)
00106   #define NOFULL(X) X
00107 #endif
00108 
00109 #undef TABENERGYNAME
00110 #undef TABENERGY
00111 #undef NOTABENERGY
00112 #ifdef TABENERGYFLAG
00113   #define TABENERGYNAME(X) FEPNAME( X ## _tabener )
00114   #define TABENERGY(X) X
00115   #define NOTABENERGY(X)
00116 #else
00117   #define TABENERGYNAME(X) FEPNAME( X )
00118   #define TABENERGY(X)
00119   #define NOTABENERGY(X) X
00120 #endif
00121 
00122 // Here are what these macros stand for:
00123 // FEP/NOT_FEP: FEP free energy perturbation is active/inactive 
00124 //      (does NOT use LAM)
00125 // LES: locally-enhanced sampling is active
00126 // LAM: scale the potential by a factor lambda (except FEP)
00127 // INT: measure interaction energies
00128 // PPROF: pressure profiling
00129 
00130 #undef FEPNAME
00131 #undef FEP
00132 #undef LES
00133 #undef INT
00134 #undef PPROF
00135 #undef LAM
00136 #undef CUDA
00137 #undef MIC
00138 #undef ALCH
00139 #undef TI
00140 #undef GO
00141 #define FEPNAME(X) LAST( X )
00142 #define FEP(X)
00143 #define ALCHPAIR(X)
00144 #define NOT_ALCHPAIR(X) X
00145 #define LES(X)
00146 #define INT(X)
00147 #define PPROF(X)
00148 #define LAM(X)
00149 #define CUDA(X)
00150 #define MIC(X)
00151 #define ALCH(X)
00152 #define TI(X)
00153 #define GO(X)
00154 #ifdef FEPFLAG
00155   #undef FEPNAME
00156   #undef FEP
00157   #undef ALCH
00158   #define FEPNAME(X) LAST( X ## _fep )
00159   #define FEP(X) X
00160   #define ALCH(X) X
00161 #endif
00162 #ifdef TIFLAG
00163   #undef FEPNAME
00164   #undef TI
00165   #undef ALCH
00166   #define FEPNAME(X) LAST( X ## _ti )
00167   #define TI(X) X
00168   #define ALCH(X) X
00169 #endif
00170 #ifdef LESFLAG
00171   #undef FEPNAME
00172   #undef LES
00173   #undef LAM
00174   #define FEPNAME(X) LAST( X ## _les )
00175   #define LES(X) X
00176   #define LAM(X) X
00177 #endif
00178 #ifdef INTFLAG
00179   #undef FEPNAME
00180   #undef INT
00181   #define FEPNAME(X) LAST( X ## _int )
00182   #define INT(X) X
00183 #endif
00184 #ifdef PPROFFLAG
00185   #undef FEPNAME
00186   #undef INT
00187   #undef PPROF
00188   #define FEPNAME(X) LAST( X ## _pprof )
00189   #define INT(X) X
00190   #define PPROF(X) X
00191 #endif
00192 #ifdef GOFORCES
00193   #undef FEPNAME
00194   #undef GO
00195   #define FEPNAME(X) LAST( X ## _go )
00196   #define GO(X) X
00197 #endif
00198 #ifdef NAMD_CUDA
00199   #undef CUDA
00200   #define CUDA(X) X
00201 #endif
00202 #ifdef NAMD_MIC
00203   #undef MIC
00204   #define MIC(X) X
00205 #endif
00206 
00207 #define LAST(X) X
00208 
00209 // see if things are really messed up
00210 SELF( PAIR( foo bar ) )
00211 LES( FEP( foo bar ) )
00212 LES( INT( foo bar ) )
00213 FEP( INT( foo bar ) )
00214 LAM( INT( foo bar ) )
00215 FEP( NOENERGY( foo bar ) )
00216 ENERGY( NOENERGY( foo bar ) )
00217 TABENERGY(NOTABENERGY( foo bar ) )
00218 
// Allocate (s) bytes of scratch space on the stack via alloca and return, in
// (p), a pointer rounded up to the next (a)-byte boundary.  (a) must be a
// power of two; the macro over-allocates by (a) bytes so the rounded pointer
// still has (s) usable bytes.  NOTE: alloca storage is released when the
// enclosing *function* returns, so do not use this inside a loop that expects
// per-iteration reclamation.
#define ALLOCA_ALIGNED(p, s, a) do { \
  char *ptr = alloca((s) + (a)); \
  int64 a1 = (a) - 1; \
  int64 ptr64 = ((int64)ptr) + a1; /* BUGFIX: removed stray ')' that broke compilation */ \
  (p) = (void*)(ptr64 - (ptr64 & a1)); /* equivalent to ptr64 & ~a1: round down to boundary */ \
} while (0)
00225 
00226 #define __MIC_PAD_PLGEN_CTRL (MIC_PAD_PLGEN)
00227 
00230 
00231 
00233 // Declare the compute function
00234 
00235 __attribute__((target(mic))) void NAME (mic_params &params) {
00236 
00238 
00239   #if MIC_HANDCODE_FORCE_SINGLE != 0
00240     #define CALC_TYPE float
00241   #else
00242     #define CALC_TYPE double
00243   #endif
00244 
00245   // Setup nonbonded table pointers (using the main table pointer, create pointers for each
00246   //   of the non-overlapping sub-tables)
00247   const int table_four_n_16 = params.table_four_n_16;
00248   __ASSUME(table_four_n_16 % 16 == 0);
00249   __ASSERT(params.table_four_base_ptr != NULL);
00250   const CALC_TYPE * RESTRICT const table_noshort = (CALC_TYPE*)(params.table_four_base_ptr)                       ; __ASSUME_ALIGNED(table_noshort);
00251   const CALC_TYPE * RESTRICT const   table_short = (CALC_TYPE*)(params.table_four_base_ptr) + 16 * table_four_n_16; __ASSUME_ALIGNED(table_short);
00252   const CALC_TYPE * RESTRICT const    slow_table = (CALC_TYPE*)(params.table_four_base_ptr) + 32 * table_four_n_16; __ASSUME_ALIGNED(slow_table);
00253   //const CALC_TYPE * RESTRICT const    fast_table = (CALC_TYPE*)(params.table_four_base_ptr) + 36 * table_four_n_16; __ASSUME_ALIGNED(fast_table);
00254   //const CALC_TYPE * RESTRICT const    scor_table = (CALC_TYPE*)(params.table_four_base_ptr) + 40 * table_four_n_16; __ASSUME_ALIGNED(scor_table);
00255   //const CALC_TYPE * RESTRICT const    corr_table = (CALC_TYPE*)(params.table_four_base_ptr) + 44 * table_four_n_16; __ASSUME_ALIGNED(corr_table);
00256   //const CALC_TYPE * RESTRICT const    full_table = (CALC_TYPE*)(params.table_four_base_ptr) + 48 * table_four_n_16; __ASSUME_ALIGNED(full_table);
00257   //const CALC_TYPE * RESTRICT const    vdwa_table = (CALC_TYPE*)(params.table_four_base_ptr) + 52 * table_four_n_16; __ASSUME_ALIGNED(vdwa_table);
00258   //const CALC_TYPE * RESTRICT const    vdwb_table = (CALC_TYPE*)(params.table_four_base_ptr) + 56 * table_four_n_16; __ASSUME_ALIGNED(vdwb_table);
00259   //const CALC_TYPE * RESTRICT const      r2_table = (CALC_TYPE*)(params.table_four_base_ptr) + 60 * table_four_n_16; __ASSUME_ALIGNED(r2_table);
00260   // DMK - NOTE : Other table pointers will be useful as this code grows in scope, so including them all now
00261 
00262   // Setup LJ table pointers
00263   const CALC_TYPE * RESTRICT const lj_table_base_ptr = (CALC_TYPE*)(params.lj_table_base_ptr);
00264   __ASSERT(lj_table_base_ptr != NULL);
00265   const int lj_table_dim = params.lj_table_dim;
00266   __ASSUME_ALIGNED(lj_table_base_ptr);
00267 
00268   // Constants
00269   const CALC_TYPE r2_delta = params.constants->r2_delta;
00270   const int r2_delta_exp = params.constants->r2_delta_exp;
00271   const int r2_delta_expc = params.constants->r2_delta_expc;
00272   const CALC_TYPE scaling = params.constants->scaling;
00273   const CALC_TYPE modf_mod = params.constants->modf_mod;
00274 
00275   // Cutoff values
00276   const CALC_TYPE plcutoff = params.pp->plcutoff;
00277   const CALC_TYPE plcutoff2 = plcutoff * plcutoff;
00278   const CALC_TYPE plcutoff2_delta = plcutoff2 + r2_delta;
00279   const CALC_TYPE cutoff2 = params.constants->cutoff2;
00280   const CALC_TYPE cutoff2_delta = cutoff2 + r2_delta;
00281 
00282   // Number of atoms in each list of atoms
00283   const int i_upper = params.numAtoms[0];
00284   const int j_upper = params.numAtoms[1];
00285   const int i_upper_16 = params.numAtoms_16[0];
00286   const int j_upper_16 = params.numAtoms_16[1];
00287   __ASSERT(i_upper >= 0); __ASSERT(j_upper >= 0);
00288   __ASSERT(i_upper_16 >= 0); __ASSERT(j_upper_16 >= 0);
00289   __ASSUME(i_upper_16 % 16 == 0);
00290   __ASSUME(j_upper_16 % 16 == 0);
00291 
00292   // Setup pointers to atom input arrays
00293   #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00294     atom * RESTRICT p_0 = params.p[0]; __ASSERT(p_0 != NULL);  __ASSUME_ALIGNED(p_0);
00295     atom_param * RESTRICT pExt_0 = params.pExt[0]; __ASSERT(pExt_0 != NULL);  __ASSUME_ALIGNED(pExt_0);
00296     #if (0 SELF(+1))
00297       #define p_1  p_0
00298       #define pExt_1  pExt_0
00299     #else
00300       atom * RESTRICT p_1 = params.p[1]; __ASSERT(p_1 != NULL);  __ASSUME_ALIGNED(p_1);
00301       atom_param * RESTRICT pExt_1 = params.pExt[1]; __ASSERT(pExt_1 != NULL);  __ASSUME_ALIGNED(pExt_1);
00302     #endif
00303   #else
00304     mic_position_t * RESTRICT p_0_x = params.p_x[0];
00305     mic_position_t * RESTRICT p_0_y = params.p_y[0];
00306     mic_position_t * RESTRICT p_0_z = params.p_z[0];
00307     mic_position_t * RESTRICT p_0_q = params.p_q[0];
00308     int * RESTRICT pExt_0_vdwType = params.pExt_vdwType[0];
00309     int * RESTRICT pExt_0_index = params.pExt_index[0];
00310     int * RESTRICT pExt_0_exclIndex = params.pExt_exclIndex[0];
00311     int * RESTRICT pExt_0_exclMaxDiff = params.pExt_exclMaxDiff[0];
00312     #if (0 SELF(+1))
00313       #define p_1_x  p_0_x
00314       #define p_1_y  p_0_y
00315       #define p_1_z  p_0_z
00316       #define p_1_q  p_0_q
00317       #define p_1_vdwType  p_0_vdwType
00318       #define p_1_index  p_0_index
00319       #define p_1_exclIndex  p_0_exclIndex
00320       #define p_1_exclMaxDiff  p_0_exclMaxDiff
00321     #else
00322       mic_position_t * RESTRICT p_1_x = params.p_x[1];
00323       mic_position_t * RESTRICT p_1_y = params.p_y[1];
00324       mic_position_t * RESTRICT p_1_z = params.p_z[1];
00325       mic_position_t * RESTRICT p_1_q = params.p_q[1];
00326       int * RESTRICT pExt_1_vdwType = params.pExt_vdwType[1];
00327       int * RESTRICT pExt_1_index = params.pExt_index[1];
00328       int * RESTRICT pExt_1_exclIndex = params.pExt_exclIndex[1];
00329       int * RESTRICT pExt_1_exclMaxDiff = params.pExt_exclMaxDiff[1];
00330     #endif
00331     __ASSERT(p_0_x != NULL); __ASSERT(p_0_y != NULL); __ASSERT(p_0_z != NULL); __ASSERT(p_0_q != NULL);
00332     __ASSERT(p_1_x != NULL); __ASSERT(p_1_y != NULL); __ASSERT(p_1_z != NULL); __ASSERT(p_1_q != NULL);
00333     __ASSERT(pExt_0_vdwType != NULL); __ASSERT(pExt_0_index != NULL);
00334     __ASSERT(pExt_1_vdwType != NULL); __ASSERT(pExt_1_index != NULL);
00335     __ASSERT(pExt_0_exclIndex != NULL); __ASSERT(pExt_0_exclMaxDiff != NULL);
00336     __ASSERT(pExt_1_exclIndex != NULL); __ASSERT(pExt_1_exclMaxDiff != NULL);
00337     __ASSUME_ALIGNED(p_0_x); __ASSUME_ALIGNED(p_1_x);
00338     __ASSUME_ALIGNED(p_0_y); __ASSUME_ALIGNED(p_1_y);
00339     __ASSUME_ALIGNED(p_0_z); __ASSUME_ALIGNED(p_1_z);
00340     __ASSUME_ALIGNED(p_0_q); __ASSUME_ALIGNED(p_1_q);
00341     __ASSUME_ALIGNED(pExt_0_vdwType); __ASSUME_ALIGNED(pExt_0_index);
00342     __ASSUME_ALIGNED(pExt_1_vdwType); __ASSUME_ALIGNED(pExt_1_index);
00343     __ASSUME_ALIGNED(pExt_0_exclIndex); __ASSUME_ALIGNED(pExt_0_exclMaxDiff);
00344     __ASSUME_ALIGNED(pExt_1_exclIndex); __ASSUME_ALIGNED(pExt_1_exclMaxDiff);
00345   #endif
00346 
00347   // Setup pointers to force output arrays and clear those arrays (init to zero)
00348   #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00349     double4 * RESTRICT f_0 = params.ff[0];
00350     #if (0 SELF(+1))
00351       #define f_1  f_0
00352     #else
00353       double4 * RESTRICT f_1 = params.ff[1];
00354     #endif
00355     __ASSERT(f_0 != NULL); __ASSUME_ALIGNED(f_0);
00356     __ASSERT(f_1 != NULL); __ASSUME_ALIGNED(f_1);
00357     memset(f_0, 0, sizeof(double4) * i_upper_16);
00358     PAIR( memset(f_1, 0, sizeof(double4) * j_upper_16); )
00359     #if (0 FULL(+1))
00360       double4 * RESTRICT fullf_0 = params.fullf[0];
00361       #if (0 SELF(+1))
00362         #define fullf_1  fullf_0
00363       #else
00364         double4 * RESTRICT fullf_1 = params.fullf[1];
00365       #endif
00366       __ASSERT(fullf_0 != NULL); __ASSUME_ALIGNED(fullf_0);
00367       __ASSERT(fullf_1 != NULL); __ASSUME_ALIGNED(fullf_1);
00368       memset(fullf_0, 0, sizeof(double4) * i_upper_16);
00369       PAIR( memset(fullf_1, 0, sizeof(double4) * j_upper_16); )
00370     #endif
00371   #else
00372     double * RESTRICT f_0_x = params.ff_x[0];
00373     double * RESTRICT f_0_y = params.ff_y[0];
00374     double * RESTRICT f_0_z = params.ff_z[0];
00375     double * RESTRICT f_0_w = params.ff_w[0];
00376     #if (0 SELF(+1))
00377       #define f_1_x  f_0_x
00378       #define f_1_y  f_0_y
00379       #define f_1_z  f_0_z
00380       #define f_1_w  f_0_w
00381     #else
00382       double * RESTRICT f_1_x = params.ff_x[1];
00383       double * RESTRICT f_1_y = params.ff_y[1];
00384       double * RESTRICT f_1_z = params.ff_z[1];
00385       double * RESTRICT f_1_w = params.ff_w[1];
00386     #endif
00387     __ASSERT(f_0_x != NULL); __ASSERT(f_0_y != NULL); __ASSERT(f_0_z != NULL); __ASSERT(f_0_w != NULL);
00388     __ASSERT(f_1_x != NULL); __ASSERT(f_1_y != NULL); __ASSERT(f_1_z != NULL); __ASSERT(f_1_w != NULL);
00389     __ASSUME_ALIGNED(f_0_x); __ASSUME_ALIGNED(f_1_x);
00390     __ASSUME_ALIGNED(f_0_y); __ASSUME_ALIGNED(f_1_y);
00391     __ASSUME_ALIGNED(f_0_z); __ASSUME_ALIGNED(f_1_z);
00392     __ASSUME_ALIGNED(f_0_w); __ASSUME_ALIGNED(f_1_w);
00393     memset(f_0_x, 0, 4 * sizeof(double) * i_upper_16);
00394     PAIR( memset(f_1_x, 0, 4 * sizeof(double) * j_upper_16); )
00395     #if (0 FULL(+1))
00396       double * RESTRICT fullf_0_x = params.fullf_x[0];
00397       double * RESTRICT fullf_0_y = params.fullf_y[0];
00398       double * RESTRICT fullf_0_z = params.fullf_z[0];
00399       double * RESTRICT fullf_0_w = params.fullf_w[0];
00400       #if (0 SELF(+1))
00401         #define fullf_1_x  fullf_0_x
00402         #define fullf_1_y  fullf_0_y
00403         #define fullf_1_z  fullf_0_z
00404         #define fullf_1_w  fullf_0_w
00405       #else
00406         double * RESTRICT fullf_1_x = params.fullf_x[1];
00407         double * RESTRICT fullf_1_y = params.fullf_y[1];
00408         double * RESTRICT fullf_1_z = params.fullf_z[1];
00409         double * RESTRICT fullf_1_w = params.fullf_w[1];
00410       #endif
00411       __ASSERT(fullf_0_x != NULL); __ASSERT(fullf_0_y != NULL); __ASSERT(fullf_0_z != NULL); __ASSERT(fullf_0_w != NULL);
00412       __ASSERT(fullf_1_x != NULL); __ASSERT(fullf_1_y != NULL); __ASSERT(fullf_1_z != NULL); __ASSERT(fullf_1_w != NULL);
00413       __ASSUME_ALIGNED(fullf_0_x); __ASSUME_ALIGNED(fullf_1_x);
00414       __ASSUME_ALIGNED(fullf_0_y); __ASSUME_ALIGNED(fullf_1_y);
00415       __ASSUME_ALIGNED(fullf_0_z); __ASSUME_ALIGNED(fullf_1_z);
00416       __ASSUME_ALIGNED(fullf_0_w); __ASSUME_ALIGNED(fullf_1_w);
00417       memset(fullf_0_x, 0, 4 * sizeof(double) * i_upper_16);
00418       PAIR( memset(fullf_1_x, 0, 4 * sizeof(double) * j_upper_16); )
00419     #endif
00420   #endif
00421 
00422   // If the commOnly flag has been set, return now (after having zero'd the output forces)
00423   if (params.constants->commOnly) { return; }
00424 
00425   // If either atom list is size zero, return now (after having zero'd the output forces)
00426   if (i_upper <= 0 || j_upper <= 0) { return; }
00427 
00428   CALC_TYPE offset_x = (CALC_TYPE)(params.offset.x);
00429   CALC_TYPE offset_y = (CALC_TYPE)(params.offset.y);
00430   CALC_TYPE offset_z = (CALC_TYPE)(params.offset.z);
00431 
00433 
00434   // Grab the pairlist pointers for the various pairlists that will be used
00435   int * RESTRICT pairlist_norm = params.pairlists_ptr[PL_NORM_INDEX];
00436   int * RESTRICT pairlist_mod  = params.pairlists_ptr[PL_MOD_INDEX];
00437   int * RESTRICT pairlist_excl = params.pairlists_ptr[PL_EXCL_INDEX];
00438   #if MIC_CONDITION_NORMAL != 0
00439     int * RESTRICT pairlist_normhi = params.pairlists_ptr[PL_NORMHI_INDEX];
00440   #endif
00441 
00442   // NOTE : The first 2 integers are used for sizing information.  The actual
00443   //   pairlist entries (i.e. what we want to be 64 byte aligned) start at
00444   //   element 2 (thus the +/- 14 shifts in the macros below).
00445   #define PAIRLIST_ALLOC_CHECK(pl, init_size) { \
00446     if ((pl) == NULL) { \
00447       const int plInitSize = (init_size) + (16 * (MIC_HANDCODE_FORCE_PFDIST + 1)); \
00448       if (device__timestep == 0) { /* NOTE: Avoid all the prints on timestep 0 */ \
00449         (pl) = (int*)(_mm_malloc(plInitSize * sizeof(int) + 64, 64)); \
00450       } else { \
00451         (pl) = (int*)_MM_MALLOC_WRAPPER(plInitSize * sizeof(int) + 64, 64, "pairlist_alloc_check"); \
00452       } \
00453       __ASSERT((pl) != NULL); \
00454       (pl) += 14; \
00455       (pl)[0] = plInitSize; \
00456       (pl)[1] = 2; \
00457     } \
00458   }
00459 
00460   #define PAIRLIST_GROW_CHECK(pl, offset, mul) { \
00461     if ((offset) + j_upper + 8 + MIC_PREFETCH_DISTANCE + (16 * (MIC_HANDCODE_FORCE_PFDIST + 1)) > (pl)[0]) { \
00462       int newPlAllocSize = (int)(((pl)[0]) * (mul) + j_upper + 64); /* NOTE: add at least j_upper (max that can be added before next check) in case pl[0] is small to begin with */ \
00463       newPlAllocSize = (~2047) & (newPlAllocSize + 2047); /* NOTE: round up to 2K, add at least 2K */ \
00464       int* RESTRICT newPl = (int*)_MM_MALLOC_WRAPPER(newPlAllocSize * sizeof(int), 64, "pairlist_grow_check"); \
00465       __ASSERT((newPl) != NULL); \
00466       newPl += 14; \
00467       memcpy(newPl, (pl), (offset) * sizeof(int)); \
00468       _MM_FREE_WRAPPER((pl) - 14); \
00469       (pl) = newPl; \
00470       (pl)[0] = newPlAllocSize; \
00471     } \
00472   }
00473 
00474   DEVICE_FPRINTF("P");
00475 
00476   // Regenerate the pairlists, if need be
00477   if ((!(params.usePairlists)) || (params.savePairlists)) {
00478 
00479     // For the sake of getting a good estimate of the virtual memory require for pairlists,
00480     //   do an actual count of the interactions within this compute (part).
00481     // NOTE: This will only occur during timestep 0.  Later allocations grow the buffer by
00482     //   a fixed factor.
00483 
00484     if (pairlist_norm == NULL) { // NOTE: Only checking one pointer, but all will be allocated together
00485       // NOTE: This only occurs once when the compute is run for the first time
00486 
00487       int plCount_norm = 16;
00488       int plCount_mod = 16;
00489       int plCount_excl = 16;
00490       #if MIC_CONDITION_NORMAL != 0
00491         int plCount_normhi = 16;
00492       #endif
00493 
00494       int numParts = params.pp->numParts;
00495       int part = params.pp->part;
00496       for (int i = part; i < i_upper; i += numParts) {
00497 
00498         // Load position information for the current "i" atom
00499         #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00500           CALC_TYPE p_i_x = p_0[i].x + offset_x;
00501           CALC_TYPE p_i_y = p_0[i].y + offset_y;
00502           CALC_TYPE p_i_z = p_0[i].z + offset_z;
00503         #else
00504           CALC_TYPE p_i_x = p_0_x[i] + offset_x;
00505           CALC_TYPE p_i_y = p_0_y[i] + offset_y;
00506           CALC_TYPE p_i_z = p_0_z[i] + offset_z;
00507         #endif
00508           
00509         for (int j = 0 SELF(+i+1); j < j_upper; j++) {
00510 
00511           // Load the "j" atom's position, calculate/check the distance (squared)
00512           //   between the "i" and "j" atoms
00513           #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00514             CALC_TYPE p_d_x = p_i_x - p_1[j].x;
00515             CALC_TYPE p_d_y = p_i_y - p_1[j].y;
00516             CALC_TYPE p_d_z = p_i_z - p_1[j].z;
00517           #else
00518             CALC_TYPE p_d_x = p_i_x - p_1_x[j];
00519             CALC_TYPE p_d_y = p_i_y - p_1_y[j];
00520             CALC_TYPE p_d_z = p_i_z - p_1_z[j];
00521           #endif
00522           CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
00523           if (r2 <= plcutoff2) {
00524 
00525             // Check the exclusion bits to set the modified and excluded flags
00526             // NOTE: This code MUST match the exclusion lists generation code
00527             //   on ComputeNonbondedMIC.C, which generates the flags.  In
00528             //   particular, the order of the 2 bits must be correct, per the
00529             //   pairlist format listed below (i.e. isModified -> MSB).
00530             int exclFlags = 0x00;
00531             #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00532               const int indexDiff = pExt_0[i].index - pExt_1[j].index;
00533               const int maxDiff = pExt_1[j].excl_maxdiff;
00534             #else
00535               const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
00536               const int maxDiff = pExt_1_exclMaxDiff[j];
00537             #endif
00538             if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
00539               #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00540                 const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
00541               #else
00542                 const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
00543               #endif
00544               const int offset_major = offset / (sizeof(unsigned int) * 8);
00545               const int offset_minor = offset % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
00546               exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
00547             }
00548 
00549             // Create the pairlist entry value (i and j value) and store it in to the
00550             //   appropriate pairlist based on the type of interaction (exclFlags value)
00551             if (exclFlags == 0) {
00552               #if MIC_CONDITION_NORMAL != 0
00553                 if (r2 <= normhi_split) { plCount_norm++; }
00554                 else { plCount_normhi++; }
00555               #else
00556                 plCount_norm++;
00557               #endif
00558             } else if (exclFlags == 1) {
00559               plCount_excl++;
00560             } else if (exclFlags == 2) {
00561               plCount_mod++;
00562             }
00563               
00564           } // end if (r2 <= plcutoff2)
00565         } // end for (j < j_upper)
00566       } // end for (i < i_upper)
00567 
00568       // If padding the pairlists, add some extra room for padding ('vector width * num sub-lists' total)
00569       #if __MIC_PAD_PLGEN_CTRL != 0
00570         plCount_norm += (16 * i_upper);
00571         plCount_mod += (16 * i_upper);
00572         plCount_excl += (16 * i_upper);
00573         #if MIC_CONDITION_NORMAL != 0
00574           plCount_normhi += (16 * i_upper);
00575         #endif
00576       #endif
00577 
00578       // Add a constant percent and round up to a multiple of 1024
00579       const float plPercent = 1.4;
00580       const int plRound = 2048;
00581       plCount_norm = (int)(plCount_norm * plPercent); plCount_norm = (~(plRound-1)) & (plCount_norm + (plRound-1));
00582       plCount_mod = (int)(plCount_mod * plPercent); plCount_mod = (~(plRound-1)) & (plCount_mod + (plRound-1));
00583       plCount_excl = (int)(plCount_excl * plPercent); plCount_excl = (~(plRound-1)) & (plCount_excl + (plRound-1));
00584       #if MIC_CONDITION_NORMAL != 0
00585         plCount_normhi = (int)(plCount_normhi * plPercent); plCount_normhi = (~(plRound-1)) & (plCount_normhi + (plRound-1));
00586         plCount_norm += plCount_normhi;
00587       #endif
00588 
00589       // DMK - DEBUG - printing allocations, but reduce output by skipping initial pairlist allocations in timestep 0 (lots of them)
00590       // NOTE: This check is temporary for debugging, remove (keep wrapper version) when no longer needed
00591       if (device__timestep == 0) { /* NOTE: Avoid all the allocation prints during timestep 0 */
00592         pairlist_norm = (int*)(_mm_malloc(plCount_norm * sizeof(int), 64)); __ASSERT(pairlist_norm != NULL);
00593         pairlist_mod = (int*)(_mm_malloc(plCount_mod * sizeof(int), 64)); __ASSERT(pairlist_mod != NULL);
00594         pairlist_excl = (int*)(_mm_malloc(plCount_excl * sizeof(int), 64)); __ASSERT(pairlist_excl != NULL);
00595         #if MIC_CONDITION_NORMAL != 0
00596           pairlist_normhi = (int*)(_mm_malloc(plCount_normhi * sizeof(int), 64)); __ASSERT(pairlist_normhi != NULL);
00597         #endif
00598       } else {
00599         pairlist_norm = (int*)(_MM_MALLOC_WRAPPER(plCount_norm * sizeof(int), 64, "pairlist_norm")); __ASSERT(pairlist_norm != NULL);
00600         pairlist_mod = (int*)(_MM_MALLOC_WRAPPER(plCount_mod * sizeof(int), 64, "pairlist_mod")); __ASSERT(pairlist_mod != NULL);
00601         pairlist_excl = (int*)(_MM_MALLOC_WRAPPER(plCount_excl * sizeof(int), 64, "pairlist_excl")); __ASSERT(pairlist_excl != NULL);
00602         #if MIC_CONDITION_NORMAL != 0
00603           pairlist_normhi = (int*)(_MM_MALLOC_WRAPPER(plCount_normhi * sizeof(int), 64, "pairlist_normhi")); __ASSERT(pairlist_normhi != NULL);
00604         #endif
00605       }
00606 
00607       pairlist_norm += 14; pairlist_norm[0] = plCount_norm; pairlist_norm[1] = 2;
00608       pairlist_mod += 14; pairlist_mod[0] = plCount_mod; pairlist_mod[1] = 2;
00609       pairlist_excl += 14; pairlist_excl[0] = plCount_excl; pairlist_excl[1] = 2;
00610       #if MIC_CONDITION_NORMAL != 0
00611         pairlist_normhi += 14; pairlist_normhi[0] = plCount_normhi; pairlist_normhi[1] = 2;
00612       #endif
00613 
00614       // DMK - DEBUG - Pairlist memory stats
00615       #if MIC_TRACK_DEVICE_MEM_USAGE != 0
00616         pairlist_norm[-1] = 0;
00617         pairlist_mod[-1] = 0;
00618         pairlist_excl[-1] = 0;
00619         #if MIC_CONDITION_NORMAL != 0
00620           pairlist_normhi[-1] = 0;
00621         #endif
00622       #endif
00623 
00624     } // end if (pairlist_norm == NULL)
00625 
00626     // Create the pairlists from scratch.  The first two elements in each pairlist
00627     //   are special values (element 0 is the allocated size of the memory buffer
00628     //   and element 1 is the length of the pairlist itself, including the these
00629     //   first two elements).
00630     int plOffset_norm = 2;  // Current length of the normal pairlist
00631     int plOffset_mod = 2;   // Current length of the modified pairlist
00632     int plOffset_excl = 2;  // Current length of the excluded pairlist
00633     #if MIC_CONDITION_NORMAL != 0
00634       double normhi_split = (0.25 * cutoff2) + (0.75 * plcutoff2);  // Weighted average of cutoff2 and plcutoff2
00635       int plOffset_normhi = 2;  // Current length of the "normal high" pairlist (entires
00636                                 //   that belong in the normal pairlist, but that are
00637                                 //   further than the normhi_split distance, such that
00638                                 //   cutoff2 <= normhi_split <= plcutoff2).
00639     #endif
00640 
00641     // If tiling of the pairlist is enabled...
00642     #if MIC_TILE_PLGEN != 0
00643 
00644       // Calculate the loop tiling size
00645       int plTileSize = (i_upper > j_upper) ? (i_upper) : (j_upper); //(i_upper + j_upper) / 2;  // Avg. of list sizes ...
00646       plTileSize /= MIC_TILE_PLGEN;              // ... divided by MIC_TILE_PLGEN and ...
00647       plTileSize = (plTileSize + 15) & (~15);    // ... rounded up to multiple of 16
00648       __ASSUME(plTileSize % 16 == 0);
00649 
00650       // The "i" and "j" tile loops
00651       for (int _i = 0; _i < i_upper SELF(-1); _i += plTileSize) {
00652         for (int _j = 0 SELF(+ _i); _j < j_upper; _j += plTileSize) {
00653 
00654           // Calculate the "i" loop bounds for the current tile
00655           int i_lo = _i;
00656           int i_hi = _i + plTileSize;
00657           i_hi = (i_hi > i_upper SELF(-1)) ? (i_upper SELF(-1)) : (i_hi);
00658 
00659           // The "i" loop, iterating over the first list of atoms
00660           for (int i = i_lo; i < i_hi; i++) {
00661 
00662             // If the pairlist is not long enough, grow the pairlist
00663             PAIRLIST_GROW_CHECK(pairlist_norm, plOffset_norm, 1.4);
            // Make sure each pairlist has headroom before appending entries for
            //   this "i" atom (the macro grows/reallocates the list if needed;
            //   the last argument scales the expected number of new entries).
            PAIRLIST_GROW_CHECK(pairlist_mod , plOffset_mod,  1.2);
            PAIRLIST_GROW_CHECK(pairlist_excl, plOffset_excl, 1.2);
            #if MIC_CONDITION_NORMAL != 0
              PAIRLIST_GROW_CHECK(pairlist_normhi, plOffset_normhi, 1.3);
            #endif

            // Load position information for the current "i" atom
            #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
              CALC_TYPE p_i_x = p_0[i].x + offset_x;
              CALC_TYPE p_i_y = p_0[i].y + offset_y;
              CALC_TYPE p_i_z = p_0[i].z + offset_z;
            #else
              CALC_TYPE p_i_x = p_0_x[i] + offset_x; //params.offset.x;
              CALC_TYPE p_i_y = p_0_y[i] + offset_y; //params.offset.y;
              CALC_TYPE p_i_z = p_0_z[i] + offset_z; //params.offset.z;
            #endif

            // Calculate the "j" loop bounds for the current tile.  For self
            //   computes on a diagonal tile (_i == _j), start at i+1 so each
            //   unordered pair of atoms is only considered once.
            int j_lo = PAIR(_j) SELF((_i == _j) ? (i+1) : (_j));
            int j_hi = _j + plTileSize;
            j_hi = (j_hi > j_upper) ? (j_upper) : (j_hi);

            #if MIC_HANDCODE_PLGEN != 0

              // Broadcast the "i" atom's position and exclusion-check index to
              //   all 16 vector lanes.
              // NOTE(review): this always reads the SoA array pExt_0_index[i],
              //   while the non-tiled copy of this code switches on
              //   MIC_HANDCODE_FORCE_SOA_VS_AOS (pExt_0[i].index for AoS) --
              //   confirm the tiled path is SoA-only.
              __m512 p_i_x_vec = _mm512_set_1to16_ps(p_i_x);
              __m512 p_i_y_vec = _mm512_set_1to16_ps(p_i_y);
              __m512 p_i_z_vec = _mm512_set_1to16_ps(p_i_z);
              __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0_index[i]);

              // Pairlist entries pack i into the upper 16 bits; j_vec holds the
              //   16 candidate "j" indices for the current vector iteration.
              __m512i i_shifted_vec = _mm512_slli_epi32(_mm512_set_1to16_epi32(i), 16); 
              __m512i j_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                                       7,  6,  5,  4,  3,  2,  1,  0);
              // For self computes, round (i+1) down to nearest multiple of 16 and then
              //   add that value to j_vec (skipping iterations where all j values
              //   are < i+1)
              PAIR( j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(j_lo)); )
              #if (0 SELF(+1))
                const int j_lo_16 = (j_lo) & (~0x0F);
                j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(j_lo_16));
                const int j_lo_m1 = j_lo - 1;
              #endif
00705 
00706               // The "j" loop, iterating over the second list of atoms
00707               #pragma novector
00708               #pragma loop count(35)
00709               for (int j = PAIR(j_lo) SELF(j_lo_16); j < j_hi; j += 16) {
00710       
00711                 // Create the active_mask
00712                 __mmask16 active_mask = _mm512_cmplt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_hi));
00713                 #if (0 SELF(+1))
00714                   active_mask = _mm512_kand(active_mask, _mm512_cmpgt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_lo_m1)));
00715                 #endif
00716 
00717                 // Create the pairlist entry values for these would-be interactions before advancing
00718                 //   the j_vec values (i in upper 16 bits and j in lower 16 bits)
00719                 __m512i plEntry_vec = _mm512_or_epi32(i_shifted_vec, j_vec);
00720 
00721                 // Advance j_vec counter
00722                 j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(16));
00723 
00724                 // Load the positions for the "j" atoms
00725                 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00726                   __m512i p_j_tmp0a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j     )); // Load  first set of 4 atoms
00727                   __m512i p_j_tmp1a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  4)); // Load second set of 4 atoms
00728                   __m512i p_j_tmp2a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  8)); // Load  third set of 4 atoms
00729                   __m512i p_j_tmp3a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 12)); // Load fourth set of 4 atoms
00730                   __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
00731                   __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
00732                   __m512i p_j_tmp0b_vec = _mm512_mask_swizzle_epi32(p_j_tmp0a_vec, k_2x2_0, p_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
00733                   __m512i p_j_tmp1b_vec = _mm512_mask_swizzle_epi32(p_j_tmp1a_vec, k_2x2_1, p_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
00734                   __m512i p_j_tmp2b_vec = _mm512_mask_swizzle_epi32(p_j_tmp2a_vec, k_2x2_0, p_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
00735                   __m512i p_j_tmp3b_vec = _mm512_mask_swizzle_epi32(p_j_tmp3a_vec, k_2x2_1, p_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
00736                   __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
00737                   __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
00738                   __m512i p_j_tmp0c_vec = _mm512_mask_swizzle_epi32(p_j_tmp0b_vec, k_4x4_0, p_j_tmp2b_vec, _MM_SWIZ_REG_BADC);
00739                   __m512i p_j_tmp1c_vec = _mm512_mask_swizzle_epi32(p_j_tmp1b_vec, k_4x4_0, p_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
00740                   __m512i p_j_tmp2c_vec = _mm512_mask_swizzle_epi32(p_j_tmp2b_vec, k_4x4_1, p_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
00741                   __m512i p_j_perm_pattern = _mm512_set_1to16_epi32(15, 11, 7, 3,  14, 10, 6, 2,  13, 9, 5, 1,  12, 8, 4, 0);
00742                   __m512 p_j_x_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp0c_vec));
00743                   __m512 p_j_y_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp1c_vec));
00744                   __m512 p_j_z_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp2c_vec));
00745                 #else
00746                   __m512 p_j_x_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_x + j);
00747                   __m512 p_j_y_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_y + j);
00748                   __m512 p_j_z_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_z + j);
00749                 #endif
00750 
00751                 // Calculate the distance between "i" atom and these "j" atoms (in double)
00752                 __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
00753                 __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
00754                 __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
00755                 __m512 r2_vec = _mm512_mul_ps(p_ij_x_vec, p_ij_x_vec);
00756                 r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_y_vec, p_ij_y_vec));
00757                 r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_z_vec, p_ij_z_vec));
00758 
00759                 // Do cutoff distance check.  If there are no particles within cutoff, move on
00760                 //   to the next iteration.
00761                 __mmask16 cutoff_mask = _mm512_mask_cmple_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps((float)(plcutoff2)));
00762 
00763                 if (_mm512_kortestz(cutoff_mask, cutoff_mask)) { continue; }
00764 
00765                 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00766                   __m512i pExt_j_tmp0a_vec = _mm512_load_epi32(pExt_1 + j     ); // Load  first set of 4 atoms
00767                   __m512i pExt_j_tmp1a_vec = _mm512_load_epi32(pExt_1 + j +  4); // Load second set of 4 atoms
00768                   __m512i pExt_j_tmp2a_vec = _mm512_load_epi32(pExt_1 + j +  8); // Load  third set of 4 atoms
00769                   __m512i pExt_j_tmp3a_vec = _mm512_load_epi32(pExt_1 + j + 12); // Load fourth set of 4 atoms
00770                   __m512i pExt_j_tmp0b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp0a_vec, k_2x2_0, pExt_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
00771                   __m512i pExt_j_tmp1b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1a_vec, k_2x2_1, pExt_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
00772                   __m512i pExt_j_tmp2b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2a_vec, k_2x2_0, pExt_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
00773                   __m512i pExt_j_tmp3b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3a_vec, k_2x2_1, pExt_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
00774                   __m512i pExt_j_tmp1c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1b_vec, k_4x4_0, pExt_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
00775                   __m512i pExt_j_tmp2c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2b_vec, k_4x4_1, pExt_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
00776                   __m512i pExt_j_tmp3c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3b_vec, k_4x4_1, pExt_j_tmp1b_vec, _MM_SWIZ_REG_BADC);
00777                   __m512i pExt_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3,  14, 10, 6, 2,  13, 9, 5, 1,  12, 8, 4, 0);
00778                   __m512i pExt_j_index_vec     = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp1c_vec);
00779                   __m512i pExt_j_exclIndex_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp2c_vec);
00780                   __m512i pExt_j_maxDiff_vec   = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp3c_vec);
00781                 #else
00782                   __m512i pExt_j_index_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_index + j);
00783                   __m512i pExt_j_maxDiff_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclMaxDiff + j);
00784                   __m512i pExt_j_exclIndex_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclIndex + j);
00785                 #endif
00786 
00787                 // Check the index difference vs the maxDiff value to see if there the exclusion bits
00788                 //   for this particular pair of atoms needs to be loaded.
00789                 __m512i indexDiff_vec = _mm512_mask_sub_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_i_index_vec, pExt_j_index_vec);
00790                 __mmask16 indexRange_min_mask = _mm512_mask_cmpge_epi32_mask(cutoff_mask, indexDiff_vec, _mm512_sub_epi32(_mm512_setzero_epi32(), pExt_j_maxDiff_vec)); // NOTE: indexDiff >= -1 * maxDiff
00791                 __mmask16 indexRange_max_mask = _mm512_mask_cmple_epi32_mask(cutoff_mask, indexDiff_vec, pExt_j_maxDiff_vec);
00792                 __mmask16 indexRange_mask = _mm512_kand(indexRange_min_mask, indexRange_max_mask);
00793 
00794                 // Calculate offsets into the exclusion flags
00795                 __m512i offset_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), indexRange_mask, _mm512_slli_epi32(indexDiff_vec, 1), pExt_j_exclIndex_vec);
00796                 __m512i offset_major_vec = _mm512_srli_epi32(offset_vec, 5); // NOTE : offset / 32
00797                 __m512i offset_minor_vec = _mm512_and_epi32(_mm512_set_1to16_epi32(0x001f), offset_vec); // NOTE : offset % 32
00798 
00799                 // Gather exclFlags using offset_major values and then extra 2-bit fields using offset_minor.
00800                 #if 0
00801                   int offset_major[16] __attribute__((aligned(64)));
00802                   int exclFlags[16] __attribute__((aligned(64)));
00803                   _mm512_store_epi32(offset_major, offset_major_vec);
00804                   exclFlags[ 0] = params.exclusion_bits[offset_major[ 0]];
00805                   exclFlags[ 1] = params.exclusion_bits[offset_major[ 1]];
00806                   exclFlags[ 2] = params.exclusion_bits[offset_major[ 2]];
00807                   exclFlags[ 3] = params.exclusion_bits[offset_major[ 3]];
00808                   exclFlags[ 4] = params.exclusion_bits[offset_major[ 4]];
00809                   exclFlags[ 5] = params.exclusion_bits[offset_major[ 5]];
00810                   exclFlags[ 6] = params.exclusion_bits[offset_major[ 6]];
00811                   exclFlags[ 7] = params.exclusion_bits[offset_major[ 7]];
00812                   exclFlags[ 8] = params.exclusion_bits[offset_major[ 8]];
00813                   exclFlags[ 9] = params.exclusion_bits[offset_major[ 9]];
00814                   exclFlags[10] = params.exclusion_bits[offset_major[10]];
00815                   exclFlags[11] = params.exclusion_bits[offset_major[11]];
00816                   exclFlags[12] = params.exclusion_bits[offset_major[12]];
00817                   exclFlags[13] = params.exclusion_bits[offset_major[13]];
00818                   exclFlags[14] = params.exclusion_bits[offset_major[14]];
00819                   exclFlags[15] = params.exclusion_bits[offset_major[15]];
00820                   __m512i exclFlags_vec = _mm512_load_epi32(exclFlags);
00821                 #else
00822                   __m512i exclFlags_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, offset_major_vec, params.exclusion_bits, _MM_SCALE_4);
00823                 #endif
00824                 exclFlags_vec = _mm512_mask_srlv_epi32(_mm512_setzero_epi32(), indexRange_mask, exclFlags_vec, offset_minor_vec);
00825                 exclFlags_vec = _mm512_and_epi32(exclFlags_vec, _mm512_set_1to16_epi32(0x03));  // NOTE : Mask out all but 2 LSBs
00826 
00827                 // Create masks for each type of interaction (normal, modified, excluded) and
00828                 //   store the generated pairlist entry values into the pairlists
00829                 __mmask16 norm_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_setzero_epi32());
00830                 #if MIC_CONDITION_NORMAL != 0
00831                   __mmask16 normhi_mask = _mm512_mask_cmplt_ps_mask(norm_mask, _mm512_set_1to16_ps(normhi_split), r2_vec);
00832                   norm_mask = _mm512_kxor(norm_mask, normhi_mask); // Unset any bits that were set in normhi
00833                 #endif            
00834                 __mmask16 excl_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(1));
00835                 __mmask16  mod_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(2));
00836                 _mm512_mask_packstorelo_epi32(pairlist_norm + plOffset_norm     , norm_mask, plEntry_vec);
00837                 _mm512_mask_packstorehi_epi32(pairlist_norm + plOffset_norm + 16, norm_mask, plEntry_vec);
00838                 #if MIC_CONDITION_NORMAL != 0
00839                   _mm512_mask_packstorelo_epi32(pairlist_normhi + plOffset_normhi     , normhi_mask, plEntry_vec);
00840                   _mm512_mask_packstorehi_epi32(pairlist_normhi + plOffset_normhi + 16, normhi_mask, plEntry_vec);
00841                 #endif
00842                 _mm512_mask_packstorelo_epi32(pairlist_excl + plOffset_excl     , excl_mask, plEntry_vec);
00843                 _mm512_mask_packstorehi_epi32(pairlist_excl + plOffset_excl + 16, excl_mask, plEntry_vec);
00844                 _mm512_mask_packstorelo_epi32(pairlist_mod  + plOffset_mod      ,  mod_mask, plEntry_vec);
00845                 _mm512_mask_packstorehi_epi32(pairlist_mod  + plOffset_mod  + 16,  mod_mask, plEntry_vec);
00846                 __m512i one_vec = _mm512_set_1to16_epi32(1);
00847                 plOffset_norm += _mm512_mask_reduce_add_epi32(norm_mask, one_vec);
00848                 #if MIC_CONDITION_NORMAL != 0
00849                   plOffset_normhi += _mm512_mask_reduce_add_epi32(normhi_mask, one_vec);
00850                 #endif
00851                 plOffset_excl += _mm512_mask_reduce_add_epi32(excl_mask, one_vec);
00852                 plOffset_mod  += _mm512_mask_reduce_add_epi32( mod_mask, one_vec);
00853               }
00854 
00855             #else
00856 
00857               // The "j" loop, iterating over the second list of atoms
00858               for (int j = j_lo; j < j_hi; j++) {
00859 
00860                 // Load the "j" atom's position, calculate/check the distance (squared)
00861                 //   between the "i" and "j" atoms
00862                 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00863                   CALC_TYPE p_d_x = p_i_x - p_1[j].x;
00864                   CALC_TYPE p_d_y = p_i_y - p_1[j].y;
00865                   CALC_TYPE p_d_z = p_i_z - p_1[j].z;
00866                 #else
00867                   CALC_TYPE p_d_x = p_i_x - p_1_x[j];
00868                   CALC_TYPE p_d_y = p_i_y - p_1_y[j];
00869                   CALC_TYPE p_d_z = p_i_z - p_1_z[j];
00870                 #endif
00871                 CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
00872                 if (r2 <= plcutoff2) {
00873 
00874                   // Check the exclusion bits to set the modified and excluded flags
00875                   // NOTE: This code MUST match the exclusion lists generation code
00876                   //   on ComputeNonbondedMIC.C, which generates the flags.  In
00877                   //   particular, the order of the 2 bits must be correct, per the
00878                   //   pairlist format listed below (i.e. isModified -> MSB).
00879                   int exclFlags = 0x00;
00880                   #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00881                     const int indexDiff = pExt_0[i].index - pExt_1[i].index;
00882                     const int maxDiff = pExt_1[j].excl_maxdiff;
00883                   #else
00884                     const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
00885                     const int maxDiff = pExt_1_exclMaxDiff[j];
00886                   #endif
00887                   if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
00888                     #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00889                       const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
00890                     #else
00891                       const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
00892                     #endif
00893                     const int offset_major = offset / (sizeof(unsigned int) * 8);
00894                     const int offset_minor = offset % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
00895                     exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
00896                   }
00897 
00898                   // Create the pairlist entry value and store it
00899                   const int plEntry = ((i & 0xFFFF) << 16) | (j & 0xFFFF);
00900                   if (exclFlags == 0) {
00901                     #if MIC_CONDITION_NORMAL != 0
00902                       if (r2 <= normhi_split) {
00903                         pairlist_norm[plOffset_norm++] = plEntry;
00904                       } else {
00905                         pairlist_normhi[plOffset_normhi++] = plEntry;
00906                       }
00907                     #else
00908                       pairlist_norm[plOffset_norm++] = plEntry;
00909                     #endif
00910                   } else if (exclFlags == 1) {
00911                     pairlist_excl[plOffset_excl++] = plEntry;
00912                   } else if (exclFlags == 2) {
00913                     pairlist_mod[plOffset_mod++] = plEntry;
00914                   }
00915 
00916                 } // end if (r2 <= plcutoff2
00917               } // end for (j < j_hi)
00918 
00919             #endif
00920 
            // Pad each pairlist to a vector-width boundary with sentinel entries
            //   (current "i" in the upper 16 bits, j field all ones) so the
            //   force kernels can always consume full vectors.
            #if __MIC_PAD_PLGEN_CTRL != 0
              #if MIC_HANDCODE_FORCE_SINGLE != 0
                const int padLen = 16;
              #else
                const int padLen = 8;
              #endif
              const int padValue = (i << 16) | 0xFFFF;
              // NOTE: xxx % 8 != 2 because pairlist offset includes first two ints (pairlist size and alloc size)
              while (plOffset_norm % padLen != 2) { pairlist_norm[plOffset_norm++] = padValue; }
              while (plOffset_mod  % padLen != 2) { pairlist_mod [plOffset_mod++ ] = padValue; }
              while (plOffset_excl % padLen != 2) { pairlist_excl[plOffset_excl++] = padValue; }
              #if MIC_CONDITION_NORMAL != 0
                while (plOffset_normhi % padLen != 2) { pairlist_normhi[plOffset_normhi++] = padValue; }
              #endif 
           #endif
00936 
00937           } // end for (i < i_hi)
00938         } // end (_j < j_upper; _j += plTileSize)
00939       } // end (_i < i_upper; _i += plTileSize)
00940 
00941     #else  // MIC_TILE_PLGEN
00942 
00943       // Do the distance checks for all possible pairs of atoms between the
00944       //   two input atom arrays
00945       // The "i" loop, iterating over the first list of atoms
00946 
00947       int numParts = params.pp->numParts;
00948       int part = params.pp->part;
00949 
00950       // Generate plGenILo and plGenHi values (the portion of the "i" atoms that will be processed by this "part").
00951       //   Note that this is done differently for self computes, because of the unequal work per "i" atom.  For
00952       //   pair computes, the "i" iteration space is just divided up evenly.
00953       #if 0
00954       int plGenILo = 0;
00955       int plGenIHi = i_upper SELF(-1);
00956       if (numParts > 1) {
00957 
00958         #if (0 SELF(+1))
00959 
00960           // For self computes where the iteration space forms a triangle, divide the rows in the
00961           //   iteration space so more "shorter" rows are grouped together while fewer "longer" rows
00962           //   are grouped together.
00963           float totalArea = ((i_upper) * (i_upper - 1)) / 2.0f;
00964           float areaPerPart = totalArea / numParts;
00965 
00966           // NOTE : We want to divide the triangular iteration space (0 <= i < i_upper - 1, i < j < i_upper).
00967           //   Since we know the area per part and the area of the iteration space "before" some arbitrary
00968           //   index X (i.e. 0 <= i < X), we can calculate indexes for each part.
00969           //     A = X(X-1)/2 where A = area "before" X
00970           //     solving for X we get X^2 - X - 2A = 0, or via the quadradic equation, X = (1 +- sqrt(1+8A))/2
00971           // NOTE: This calculation might be a little messy, so double check the border cases
00972           plGenILo = ((part==0)?(0) : ((int)((1.0f+sqrtf(1+8*(areaPerPart*part)))/2.0f)) );
00973           plGenIHi = ((part==numParts-1)?(i_upper-1) : ((int)((1.0f+sqrtf(1+8*(areaPerPart*(part+1))))/2.0f)) );
00974           // Reverse the indexes since this calculation assumes i=0 has little work i=i_upper-1 has lots of
00975           //   work (i.e. upper triangular versus lower triangular)
00976           int plGenTmp = plGenILo;
00977           plGenILo = (i_upper - 1) - plGenIHi;
00978           plGenIHi = (i_upper - 1) - plGenTmp;
00979 
00980         #else  // SELF
00981 
00982           // For pair computes where the iteration space forms a square, divide the rows in the
00983           //   iteration space evenly since they all have the same area
00984           plGenILo = (int)(((float)(i_upper)) * ((float)(part    )) / ((float)(numParts)));
00985           plGenIHi = (int)(((float)(i_upper)) * ((float)(part + 1)) / ((float)(numParts)));
00986 
00987         #endif  // SELF
00988 
00989         // Round up plGenILo and plGenIHi to a multiple of 16 so there is no false sharing
00990         //   on force output cachelines
00991         plGenILo = (plGenILo + 15) & (~15);
00992         plGenIHi = (plGenIHi + 15) & (~15);
00993         if (plGenIHi > i_upper SELF(-1)) { plGenIHi = i_upper SELF(-1); }
00994       }
00995 
00996       #pragma loop_count (300)
00997       for (int i = plGenILo; i < plGenIHi; i++) {
00998 
00999       #else
01000 
01001       #pragma loop_count (300)
01002       for (int i = part; i < i_upper SELF(-1); i += numParts) {
01003 
01004       #endif
01005 
        // If the pairlist is not long enough, grow the pairlist
        //   (constants are headroom estimates per "i" atom for each list type)
        PAIRLIST_GROW_CHECK(pairlist_norm, plOffset_norm, 50);
        PAIRLIST_GROW_CHECK(pairlist_mod , plOffset_mod,   8);
        PAIRLIST_GROW_CHECK(pairlist_excl, plOffset_excl,  8);
        #if MIC_CONDITION_NORMAL != 0
          PAIRLIST_GROW_CHECK(pairlist_normhi, plOffset_normhi, 15);
        #endif

        // Load position information for the current "i" atom
        #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
          CALC_TYPE p_i_x = p_0[i].x + offset_x;
          CALC_TYPE p_i_y = p_0[i].y + offset_y;
          CALC_TYPE p_i_z = p_0[i].z + offset_z;
        #else
          CALC_TYPE p_i_x = p_0_x[i] + offset_x;
          CALC_TYPE p_i_y = p_0_y[i] + offset_y;
          CALC_TYPE p_i_z = p_0_z[i] + offset_z;
        #endif

        #if MIC_HANDCODE_PLGEN != 0

          // Setup loop constants ("i" atom values) and iterator vector.
          //   Broadcast the "i" atom's position/index to all 16 vector lanes.
          __m512 p_i_x_vec = _mm512_set_1to16_ps(p_i_x);
          __m512 p_i_y_vec = _mm512_set_1to16_ps(p_i_y);
          __m512 p_i_z_vec = _mm512_set_1to16_ps(p_i_z);
          #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
            __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0[i].index);
          #else
            __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0_index[i]);
          #endif

          // Pairlist entries pack i into the upper 16 bits; j_vec holds the 16
          //   candidate "j" indices for the current vector iteration.
          __m512i i_shifted_vec = _mm512_slli_epi32(_mm512_set_1to16_epi32(i), 16); 
          __m512i j_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                                   7,  6,  5,  4,  3,  2,  1,  0);

          // For self computes, round (i+1) down to nearest multiple of 16 and then
          //   add that value to j_vec (skipping iterations where all j values
          //   are < i+1)
          #if (0 SELF(+1))
            const int ip1_16 = (i+1) & (~0x0F);
            j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(ip1_16));
          #endif
01048 
01049           // The "j" loop, iterating over the second list of atoms
01050           #pragma novector
01051           #pragma loop count(35)
01052           for (int j = 0 SELF(+ ip1_16); j < j_upper; j += 16) {
01053 
01054             // Create the active_mask
01055             __mmask16 active_mask = _mm512_cmplt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_upper));
01056             #if (0 SELF(+1))
01057               active_mask = _mm512_kand(active_mask, _mm512_cmpgt_epi32_mask(j_vec, _mm512_set_1to16_epi32(i)));
01058             #endif
01059 
01060             // Create the pairlist entry values for these would-be interactions before advancing
01061             //   the j_vec values (i in upper 16 bits and j in lower 16 bits)
01062             __m512i plEntry_vec = _mm512_or_epi32(i_shifted_vec, j_vec);
01063 
01064             // Advance j_vec counter
01065             j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(16));
01066 
01067             // Load the positions for the "j" atoms
01068             #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01069               __m512i p_j_tmp0a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j     )); // Load  first set of 4 atoms
01070               __m512i p_j_tmp1a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  4)); // Load second set of 4 atoms
01071               __m512i p_j_tmp2a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j +  8)); // Load  third set of 4 atoms
01072               __m512i p_j_tmp3a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 12)); // Load fourth set of 4 atoms
01073               __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
01074               __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
01075               __m512i p_j_tmp0b_vec = _mm512_mask_swizzle_epi32(p_j_tmp0a_vec, k_2x2_0, p_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
01076               __m512i p_j_tmp1b_vec = _mm512_mask_swizzle_epi32(p_j_tmp1a_vec, k_2x2_1, p_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
01077               __m512i p_j_tmp2b_vec = _mm512_mask_swizzle_epi32(p_j_tmp2a_vec, k_2x2_0, p_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
01078               __m512i p_j_tmp3b_vec = _mm512_mask_swizzle_epi32(p_j_tmp3a_vec, k_2x2_1, p_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
01079               __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
01080               __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
01081               __m512i p_j_tmp0c_vec = _mm512_mask_swizzle_epi32(p_j_tmp0b_vec, k_4x4_0, p_j_tmp2b_vec, _MM_SWIZ_REG_BADC);
01082               __m512i p_j_tmp1c_vec = _mm512_mask_swizzle_epi32(p_j_tmp1b_vec, k_4x4_0, p_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
01083               __m512i p_j_tmp2c_vec = _mm512_mask_swizzle_epi32(p_j_tmp2b_vec, k_4x4_1, p_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
01084               __m512i p_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3,  14, 10, 6, 2,  13, 9, 5, 1,  12, 8, 4, 0);
01085               __m512 p_j_x_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp0c_vec));
01086               __m512 p_j_y_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp1c_vec));
01087               __m512 p_j_z_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp2c_vec));
01088             #else
01089               __m512 p_j_x_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_x + j);
01090               __m512 p_j_y_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_y + j);
01091               __m512 p_j_z_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_z + j);
01092             #endif
01093 
01094             // Calculate the distance between "i" atom and these "j" atoms (in double)
01095             __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
01096             __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
01097             __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
01098             __m512 r2_vec = _mm512_mul_ps(p_ij_x_vec, p_ij_x_vec);
01099             r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_y_vec, p_ij_y_vec));
01100             r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_z_vec, p_ij_z_vec));
01101 
01102             // Do cutoff distance check.  If there are no particles within cutoff, move on
01103             //   to the next iteration.
01104             __mmask16 cutoff_mask = _mm512_mask_cmple_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps((float)(plcutoff2)));
01105 
01106             // If nothing passed the cutoff check, then move on to the next set of atoms (iteration)
01107             if (_mm512_kortestz(cutoff_mask, cutoff_mask)) { continue; }
01108 
01109             #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01110               __m512i pExt_j_tmp0a_vec = _mm512_load_epi32(pExt_1 + j     ); // Load  first set of 4 atoms
01111               __m512i pExt_j_tmp1a_vec = _mm512_load_epi32(pExt_1 + j +  4); // Load second set of 4 atoms
01112               __m512i pExt_j_tmp2a_vec = _mm512_load_epi32(pExt_1 + j +  8); // Load  third set of 4 atoms
01113               __m512i pExt_j_tmp3a_vec = _mm512_load_epi32(pExt_1 + j + 12); // Load fourth set of 4 atoms
01114               __m512i pExt_j_tmp0b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp0a_vec, k_2x2_0, pExt_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
01115               __m512i pExt_j_tmp1b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1a_vec, k_2x2_1, pExt_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
01116               __m512i pExt_j_tmp2b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2a_vec, k_2x2_0, pExt_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
01117               __m512i pExt_j_tmp3b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3a_vec, k_2x2_1, pExt_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
01118               __m512i pExt_j_tmp1c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1b_vec, k_4x4_0, pExt_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
01119               __m512i pExt_j_tmp2c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2b_vec, k_4x4_1, pExt_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
01120               __m512i pExt_j_tmp3c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3b_vec, k_4x4_1, pExt_j_tmp1b_vec, _MM_SWIZ_REG_BADC);
01121               __m512i pExt_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3,  14, 10, 6, 2,  13, 9, 5, 1,  12, 8, 4, 0);
01122               __m512i pExt_j_index_vec     = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp1c_vec);
01123               __m512i pExt_j_exclIndex_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp2c_vec);
01124               __m512i pExt_j_maxDiff_vec   = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp3c_vec);
01125             #else
01126               __m512i pExt_j_index_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_index + j);
01127               __m512i pExt_j_maxDiff_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclMaxDiff + j);
01128               __m512i pExt_j_exclIndex_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclIndex + j);
01129             #endif
01130 
01131             // Check the index difference vs the maxDiff value to see if there the exclusion bits
01132             //   for this particular pair of atoms needs to be loaded.
01133             __m512i indexDiff_vec = _mm512_mask_sub_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_i_index_vec, pExt_j_index_vec);
01134             __mmask16 indexRange_min_mask = _mm512_mask_cmpge_epi32_mask(cutoff_mask, indexDiff_vec, _mm512_sub_epi32(_mm512_setzero_epi32(), pExt_j_maxDiff_vec)); // NOTE: indexDiff >= -1 * maxDiff
01135             __mmask16 indexRange_max_mask = _mm512_mask_cmple_epi32_mask(cutoff_mask, indexDiff_vec, pExt_j_maxDiff_vec);
01136             __mmask16 indexRange_mask = _mm512_kand(indexRange_min_mask, indexRange_max_mask);
01137 
01138             // Calculate offsets into the exclusion flags
01139             __m512i offset_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), indexRange_mask, _mm512_slli_epi32(indexDiff_vec, 1), pExt_j_exclIndex_vec);
01140             __m512i offset_major_vec = _mm512_srli_epi32(offset_vec, 5); // NOTE : offset / 32
01141             __m512i offset_minor_vec = _mm512_and_epi32(_mm512_set_1to16_epi32(0x001f), offset_vec); // NOTE : offset % 32
01142 
01143             // Gather exclFlags using offset_major values and then extract the 2-bit fields using offset_minor.
01144             __m512i exclFlags_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, offset_major_vec, params.exclusion_bits, _MM_SCALE_4);
01145             exclFlags_vec = _mm512_mask_srlv_epi32(_mm512_setzero_epi32(), indexRange_mask, exclFlags_vec, offset_minor_vec);
01146             exclFlags_vec = _mm512_and_epi32(exclFlags_vec, _mm512_set_1to16_epi32(0x03));  // NOTE : Mask out all but 2 LSBs
01147 
01148             // Create masks for each type of interaction (normal, modified, excluded) and
01149             //   store the generated pairlist entry values into the pairlists
01150             __mmask16 norm_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_setzero_epi32());
01151             #if MIC_CONDITION_NORMAL != 0
01152               __mmask16 normhi_mask = _mm512_mask_cmplt_ps_mask(norm_mask, _mm512_set_1to16_ps(normhi_split), r2_vec);
01153               norm_mask = _mm512_kxor(norm_mask, normhi_mask); // Unset any bits that were set in normhi
01154             #endif            
01155             __mmask16 excl_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(1));
01156             __mmask16  mod_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(2));
01157             _mm512_mask_packstorelo_epi32(pairlist_norm + plOffset_norm     , norm_mask, plEntry_vec);
01158             _mm512_mask_packstorehi_epi32(pairlist_norm + plOffset_norm + 16, norm_mask, plEntry_vec);
01159             #if MIC_CONDITION_NORMAL != 0
01160               _mm512_mask_packstorelo_epi32(pairlist_normhi + plOffset_normhi     , normhi_mask, plEntry_vec);
01161               _mm512_mask_packstorehi_epi32(pairlist_normhi + plOffset_normhi + 16, normhi_mask, plEntry_vec);
01162             #endif
01163             _mm512_mask_packstorelo_epi32(pairlist_excl + plOffset_excl     , excl_mask, plEntry_vec);
01164             _mm512_mask_packstorehi_epi32(pairlist_excl + plOffset_excl + 16, excl_mask, plEntry_vec);
01165             _mm512_mask_packstorelo_epi32(pairlist_mod  + plOffset_mod      ,  mod_mask, plEntry_vec);
01166             _mm512_mask_packstorehi_epi32(pairlist_mod  + plOffset_mod  + 16,  mod_mask, plEntry_vec);
01167             __m512i one_vec = _mm512_set_1to16_epi32(1);
01168 
01169             // Move the offsets forward by the number of atoms added to each list
01170             plOffset_norm += _mm512_mask_reduce_add_epi32(norm_mask, one_vec);
01171             #if MIC_CONDITION_NORMAL != 0
01172               plOffset_normhi += _mm512_mask_reduce_add_epi32(normhi_mask, one_vec);
01173             #endif
01174             plOffset_excl += _mm512_mask_reduce_add_epi32(excl_mask, one_vec);
01175             plOffset_mod  += _mm512_mask_reduce_add_epi32( mod_mask, one_vec);
01176           }
01177 
01178         #else // if MIC_HANDCODE_PLGEN != 0
01179 
01180           // The "j" loop, iterating over the second list of atoms
01181           #if (0 PAIR(+1))
01182             #pragma loop_count (30)
01183           #elif (0 SELF(+1))
01184             #pragma loop_count (300)
01185           #endif
01186           for (int j = 0 SELF(+i+1); j < j_upper; j++) {
01187 
01188             // Load the "j" atom's position, calculate/check the distance (squared)
01189             //   between the "i" and "j" atoms
01190             #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01191               CALC_TYPE p_d_x = p_i_x - p_1[j].x;
01192               CALC_TYPE p_d_y = p_i_y - p_1[j].y;
01193               CALC_TYPE p_d_z = p_i_z - p_1[j].z;
01194             #else
01195               CALC_TYPE p_d_x = p_i_x - p_1_x[j];
01196               CALC_TYPE p_d_y = p_i_y - p_1_y[j];
01197               CALC_TYPE p_d_z = p_i_z - p_1_z[j];
01198             #endif
01199             CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
01200             if (r2 <= plcutoff2) {
01201 
01202               // Check the exclusion bits to set the modified and excluded flags
01203               // NOTE: This code MUST match the exclusion list generation code
01204               //   in ComputeNonbondedMIC.C, which generates the flags.  In
01205               //   particular, the order of the 2 bits must be correct, per the
01206               //   pairlist format listed below (i.e. isModified -> MSB).
01207               int exclFlags = 0x00;
01208               #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01209                 const int indexDiff = pExt_0[i].index - pExt_1[j].index;
01210                 const int maxDiff = pExt_1[j].excl_maxdiff;
01211               #else
01212                 const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
01213                 const int maxDiff = pExt_1_exclMaxDiff[j];
01214               #endif
01215               if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
01216                 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01217                   const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
01218                 #else
01219                   const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
01220                 #endif
01221                 const int offset_major = offset / (sizeof(unsigned int) * 8);
01222                 const int offset_minor = offset % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
01223                 exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
01224               }
01225 
01226               // Create the pairlist entry value (i and j value) and store it in to the
01227               //   appropriate pairlist based on the type of interaction (exclFlags value)
01228               const int plEntry = ((i & 0xFFFF) << 16) | (j & 0xFFFF);
01229               if (exclFlags == 0) {
01230                 #if MIC_CONDITION_NORMAL != 0
01231                   if (r2 <= normhi_split) {
01232                     pairlist_norm[plOffset_norm++] = plEntry;
01233                   } else {
01234                     pairlist_normhi[plOffset_normhi++] = plEntry;
01235                   }
01236                 #else
01237                   pairlist_norm[plOffset_norm++] = plEntry;
01238                 #endif
01239               } else if (exclFlags == 1) {
01240                 pairlist_excl[plOffset_excl++] = plEntry;
01241               } else if (exclFlags == 2) {
01242                 pairlist_mod[plOffset_mod++] = plEntry;
01243               }
01244 
01245             } // end if (r2 <= plcutoff2)
01246           } // end for (j < j_upper)
01247 
01248         #endif // if MIC_HANDCODE_PLGEN != 0
01249 
01250         //#if MIC_PAD_PLGEN != 0
01251         #if __MIC_PAD_PLGEN_CTRL != 0
01252           //#if (MIC_HANDCODE_FORCE != 0) && (MIC_HANDCODE_FORCE_SINGLE != 0)
01253           #if MIC_HANDCODE_FORCE_SINGLE != 0
01254             const int padLen = 16;
01255           #else
01256             const int padLen = 8;
01257           #endif
01258           const int padValue = (i << 16) | 0xFFFF;
01259           // NOTE: xxx % padLen != 2 because offset includes first two ints (pairlist sizes), so 0 entries <-> offset 2, 16 entries <-> offset 18, etc.
01260           //   Alignment is setup so that the entries (minus the first two ints) are cacheline aligned.
01261           while (plOffset_norm % padLen != 2) { pairlist_norm[plOffset_norm++] = padValue; }
01262           while (plOffset_mod  % padLen != 2) { pairlist_mod [plOffset_mod++ ] = padValue; }
01263           while (plOffset_excl % padLen != 2) { pairlist_excl[plOffset_excl++] = padValue; }
01264           #if MIC_CONDITION_NORMAL != 0
01265             while (plOffset_normhi % padLen != 2) { pairlist_normhi[plOffset_normhi++] = padValue; }
01266           #endif
01267         #endif
01268 
01269       } // end for (i < i_upper)
01270 
01271     #endif  // if MIC_TILE_PLGEN != 0
01272 
01273     // If we are conditioning the normal pairlist, place the contents of pairlist_normhi
01274     //   at the end of pairlist_norm
01275     #if MIC_CONDITION_NORMAL != 0
01276 
01277       // Make sure there is enough room in pairlist_norm for the contents of pairlist_normhi
01278       if (plOffset_norm + plOffset_normhi > pairlist_norm[0]) {
01279         int newPlAllocSize = pairlist_norm[0] + 2 * plOffset_normhi + 8;
01280         int* RESTRICT newPl = (int*)_MM_MALLOC_WRAPPER(newPlAllocSize * sizeof(int) + 64, 64, "pairlist_norm grow for conditioning");
01281         __ASSERT(newPl != NULL);
01282         newPl += 14; // Align pairlist entries, which start at index 2
01283         memcpy(newPl, pairlist_norm, plOffset_norm * sizeof(int));
01284         _MM_FREE_WRAPPER(pairlist_norm - 14);
01285         pairlist_norm = newPl;
01286         pairlist_norm[0] = newPlAllocSize;
01287       }
01288 
01289       // Copy the contents of pairlist_normhi into pairlist_norm
01290       for (int i = 0; i < plOffset_normhi - 2; i++) {
01291         pairlist_norm[plOffset_norm + i] = pairlist_normhi[2 + i];
01292       }
01293       plOffset_norm += plOffset_normhi - 2;
01294       plOffset_normhi = 2;
01295 
01296     #endif
01297 
01298     // Store the current offsets (pairlist lengths) in to the pairlist data structure itself
01299     pairlist_norm[1] = plOffset_norm; // NOTE: The size includes the initial 2 ints
01300     pairlist_mod[1] = plOffset_mod;
01301     pairlist_excl[1] = plOffset_excl;
01302     #if MIC_CONDITION_NORMAL != 0
01303       pairlist_normhi[1] = plOffset_normhi;
01304     #endif
01305 
01306     // DMK - DEBUG - Pairlist memory size info
01307     #if MIC_TRACK_DEVICE_MEM_USAGE != 0
01308       if (plOffset_norm > pairlist_norm[-1]) { pairlist_norm[-1] = plOffset_norm; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
01309       if (plOffset_mod > pairlist_mod[-1]) { pairlist_mod[-1] = plOffset_mod; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
01310       if (plOffset_excl > pairlist_excl[-1]) { pairlist_excl[-1] = plOffset_excl; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
01311       #if MIC_CONDITION_NORMAL != 0
01312         if (plOffset_normhi > pairlist_normhi[-1]) { pairlist_normhi[-1] = plOffset_normhi; } // NOTE: Because of alignment of pairlist payload, known to have 14 ints allocated prior to pairlist start
01313       #endif
01314     #endif
01315 
01316     // If prefetch is enabled, pad out some extra (valid) entries in the pairlist
01317     //   that can be safely prefetched (i.e. without segfaulting), even though they will have
01318     //   no actual effect (i.e. no forces will be generated for them)
01319     #if MIC_PREFETCH_DISTANCE > 0
01320       for (int pfI = 0; pfI < MIC_PREFETCH_DISTANCE; pfI++) {
01321         pairlist_norm[plOffset_norm + pfI] = 0;
01322         pairlist_mod [plOffset_mod  + pfI] = 0;
01323         pairlist_excl[plOffset_excl + pfI] = 0;
01324       }
01325     #endif
01326 
01327     // If we need to pad the pairlists with valid entries, create "zero" entries at the end
01328     #if MIC_HANDCODE_FORCE != 0 && MIC_HANDCODE_FORCE_PFDIST > 0
01329       for (int pfI = 0; pfI < 16 * MIC_HANDCODE_FORCE_PFDIST; pfI++) {
01330         #if MIC_HANDCODE_FORCE_SINGLE != 0
01331           pairlist_norm[plOffset_norm + pfI] = -1;
01332           pairlist_mod [plOffset_mod  + pfI] = -1;
01333           pairlist_excl[plOffset_excl + pfI] = -1;
01334         #else
01335           pairlist_norm[plOffset_norm + pfI] = 0;
01336           pairlist_mod [plOffset_mod  + pfI] = 0;
01337           pairlist_excl[plOffset_excl + pfI] = 0;
01338         #endif
01339       }
01340     #endif
01341 
01342     // Save the pointer values back into the pairlist pointer array, so the buffers can
01343     //   be reused across timesteps, in case the pointers were changed here
01344     params.pairlists_ptr[PL_NORM_INDEX] = pairlist_norm;
01345     params.pairlists_ptr[ PL_MOD_INDEX] = pairlist_mod;
01346     params.pairlists_ptr[PL_EXCL_INDEX] = pairlist_excl;
01347     #if MIC_CONDITION_NORMAL != 0
01348       params.pairlists_ptr[PL_NORMHI_INDEX] = pairlist_normhi;
01349     #endif
01350 
01351   } // end if (params.savePairlists)
01352 
01353   #undef PAIRLIST_ALLOC_CHECK
01354   #undef PAIRLIST_GROW_CHECK
01355 
01356   // Declare the scalars (or vector-width-long arrays if using force splitting) related
01357   //   to accumulating virial and energy output, initializing these values to zero
01358   #if (0 FAST(+1))
01359     #if (0 ENERGY(+1))
01360       double vdwEnergy = 0;
01361       SHORT( double electEnergy = 0; )
01362     #endif
01363     #if (0 SHORT(+1))
01364       double virial_xx = 0;
01365       double virial_xy = 0;
01366       double virial_xz = 0;
01367       double virial_yy = 0;
01368       double virial_yz = 0;
01369       double virial_zz = 0;
01370     #endif
01371   #endif
01372   #if (0 FULL(+1))
01373     #if (0 ENERGY(+1))
01374       double fullElectEnergy = 0;
01375     #endif
01376     double fullElectVirial_xx = 0;
01377     double fullElectVirial_xy = 0;
01378     double fullElectVirial_xz = 0;
01379     double fullElectVirial_yy = 0;
01380     double fullElectVirial_yz = 0;
01381     double fullElectVirial_zz = 0;
01382   #endif
01383 
01384   // NORMAL LOOP
01385   DEVICE_FPRINTF("N");
01386   #define PAIRLIST pairlist_norm
01387   #define NORMAL(X) X
01388   #define EXCLUDED(X)
01389   #define MODIFIED(X)
01390     #include "ComputeNonbondedMICKernelBase2.h"
01391   #undef PAIRLIST
01392   #undef NORMAL
01393   #undef EXCLUDED
01394   #undef MODIFIED
01395 
01396   // MODIFIED LOOP
01397   DEVICE_FPRINTF("M");
01398   #define PAIRLIST pairlist_mod
01399   #define NORMAL(X)
01400   #define EXCLUDED(X)
01401   #define MODIFIED(X) X
01402     #include "ComputeNonbondedMICKernelBase2.h"
01403   #undef PAIRLIST
01404   #undef NORMAL
01405   #undef EXCLUDED
01406   #undef MODIFIED
01407 
01408   // EXCLUDED LOOP
01409   #if defined(FULLELECT)
01410 
01411     DEVICE_FPRINTF("E");
01412     #define PAIRLIST pairlist_excl
01413     #undef FAST
01414     #define FAST(X)
01415     #define NORMAL(X)
01416     #define EXCLUDED(X) X
01417     #define MODIFIED(X)
01418       #include "ComputeNonbondedMICKernelBase2.h"
01419     #undef PAIRLIST
01420     #undef FAST
01421     #ifdef SLOWONLY
01422       #define FAST(X)
01423     #else
01424       #define FAST(X) X
01425     #endif
01426     #undef NORMAL
01427     #undef EXCLUDED
01428     #undef MODIFIED
01429 
01430   #else  // defined(FULLELECT)
01431 
01432     // If we are not executing the excluded loop, then we need to count exclusive
01433     //   interactions per atom.  Contribute the number of entries in the EXCLUDED
01434     //   pairlist (valid only if padding pairlists)
01435     DEVICE_FPRINTF("e");
01436     #if MIC_EXCL_CHECKSUM_FULL != 0
01437     {
01438       #if __MIC_PAD_PLGEN_CTRL != 0
01439         // NOTE: If using padding, the pairlist will have 'invalid' entries mixed in with the
01440         //   valid entries, so we need to scan through and only count the valid entries
01441         const int * const pairlist_excl_base = pairlist_excl + 2;
01442         const int pairlist_excl_len = pairlist_excl[1] - 2;
01443         int exclSum = 0;
01444         __ASSUME_ALIGNED(pairlist_excl_base);
01445         #pragma simd reduction(+ : exclSum)
01446         for (int plI = 0; plI < pairlist_excl_len; plI++) {
01447           if ((pairlist_excl_base[plI] & 0xFFFF) != 0xFFFF) {
01448             exclSum += 1;
01449           }
01450         }
01451         params.exclusionSum += exclSum;
01452       #else
01453         params.exclusionSum += pairlist_excl[1] - 2; // NOTE: Size includes first two elements (alloc size and size), don't count those
01454       #endif
01455     }
01456     #endif
01457 
01458   #endif  // defined(FULLELECT)
01459 
01460   // If this is a self compute, do the virial calculation here
01461   #if (0 SHORT( FAST( SELF(+1))))
01462     for (int i = 0; i < i_upper; i++) {
01463       #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01464         virial_xx += f_0[i].x * (p_0[i].x + params.patch1_center_x);
01465         virial_xy += f_0[i].x * (p_0[i].y + params.patch1_center_y);
01466         virial_xz += f_0[i].x * (p_0[i].z + params.patch1_center_z);
01467         virial_yy += f_0[i].y * (p_0[i].y + params.patch1_center_y);
01468         virial_yz += f_0[i].y * (p_0[i].z + params.patch1_center_z);
01469         virial_zz += f_0[i].z * (p_0[i].z + params.patch1_center_z);
01470       #else
01471         virial_xx += f_0_x[i] * (p_0_x[i] + params.patch1_center_x);
01472         virial_xy += f_0_x[i] * (p_0_y[i] + params.patch1_center_y);
01473         virial_xz += f_0_x[i] * (p_0_z[i] + params.patch1_center_z);
01474         virial_yy += f_0_y[i] * (p_0_y[i] + params.patch1_center_y);
01475         virial_yz += f_0_y[i] * (p_0_z[i] + params.patch1_center_z);
01476         virial_zz += f_0_z[i] * (p_0_z[i] + params.patch1_center_z);
01477       #endif
01478     }
01479   #endif
01480   #if (0 FULL( SELF(+1)))
01481     for (int i = 0; i < i_upper; i++) {
01482       #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01483         fullElectVirial_xx += fullf_0[i].x * (p_0[i].x + params.patch1_center_x);
01484         fullElectVirial_xy += fullf_0[i].x * (p_0[i].y + params.patch1_center_y);
01485         fullElectVirial_xz += fullf_0[i].x * (p_0[i].z + params.patch1_center_z);
01486         fullElectVirial_yy += fullf_0[i].y * (p_0[i].y + params.patch1_center_y);
01487         fullElectVirial_yz += fullf_0[i].y * (p_0[i].z + params.patch1_center_z);
01488         fullElectVirial_zz += fullf_0[i].z * (p_0[i].z + params.patch1_center_z);
01489       #else
01490         fullElectVirial_xx += fullf_0_x[i] * (p_0_x[i] + params.patch1_center_x);
01491         fullElectVirial_xy += fullf_0_x[i] * (p_0_y[i] + params.patch1_center_y);
01492         fullElectVirial_xz += fullf_0_x[i] * (p_0_z[i] + params.patch1_center_z);
01493         fullElectVirial_yy += fullf_0_y[i] * (p_0_y[i] + params.patch1_center_y);
01494         fullElectVirial_yz += fullf_0_y[i] * (p_0_z[i] + params.patch1_center_z);
01495         fullElectVirial_zz += fullf_0_z[i] * (p_0_z[i] + params.patch1_center_z);
01496       #endif
01497     }
01498   #endif
01499 
01500   FAST( SHORT( params.virial_xx = virial_xx; ) )
01501   FAST( SHORT( params.virial_xy = virial_xy; ) )
01502   FAST( SHORT( params.virial_xz = virial_xz; ) )
01503   FAST( SHORT( params.virial_yy = virial_yy; ) )
01504   FAST( SHORT( params.virial_yz = virial_yz; ) )
01505   FAST( SHORT( params.virial_zz = virial_zz; ) )
01506   FULL( params.fullElectVirial_xx = fullElectVirial_xx; )
01507   FULL( params.fullElectVirial_xy = fullElectVirial_xy; )
01508   FULL( params.fullElectVirial_xz = fullElectVirial_xz; )
01509   FULL( params.fullElectVirial_yy = fullElectVirial_yy; )
01510   FULL( params.fullElectVirial_yz = fullElectVirial_yz; )
01511   FULL( params.fullElectVirial_zz = fullElectVirial_zz; )
01512   FAST( ENERGY( params.vdwEnergy = vdwEnergy; ) )
01513   FAST( ENERGY( SHORT( params.electEnergy = electEnergy; ) ) )
01514   FULL( ENERGY( params.fullElectEnergy = fullElectEnergy; ) )
01515 
01516   #undef CALC_TYPE
01517 }
01518 
01519 
01520 // Undefine atom and force macros for SELF computes
01521 #if (0 SELF(+1))
01522   #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01523     #undef p_1
01524     #undef pExt_1
01525     #undef f_1
01526     #if (0 FULL(+1))
01527       #undef fullf_1
01528     #endif
01529   #else
01530     #undef p_1_x
01531     #undef p_1_y
01532     #undef p_1_z
01533     #undef p_1_q
01534     #undef p_1_vdwType
01535     #undef p_1_index
01536     #undef p_1_exclIndex
01537     #undef p_1_exclMaxDiff
01538     #undef f_1_x
01539     #undef f_1_y
01540     #undef f_1_z
01541     #undef f_1_w
01542     #if (0 FULL(+1))
01543       #undef fullf_1_x
01544       #undef fullf_1_y
01545       #undef fullf_1_z
01546       #undef fullf_1_w
01547     #endif
01548   #endif
01549 #endif
01550 
01551 #undef __MIC_PAD_PLGEN_CTRL
01552 
01553 #else  // NAMD_MIC
01554 
01555 #include "ComputeNonbondedMICKernelBase2.h"
01556 
01557 #endif  // NAMD_MIC
01558 

Generated on Wed Nov 22 01:17:13 2017 for NAMD by  doxygen 1.4.7