// When SSE2 is available and NAMD_DISABLE_SSE is not defined, pull in the SSE2
// intrinsics header and define __align(X) with the alignment syntax of the
// compiler in use: __declspec(align(X)) for the Intel compiler,
// __attribute__((aligned(X))) for GNU-compatible and PGI compilers, and
// __declspec(align(X)) again as the remaining alternative.
// NOTE(review): the directive introducing the last alternative and the closing
// #endif lines are not visible in this excerpt.
11 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)
12 #include <emmintrin.h>
13 #if defined(__INTEL_COMPILER)
14 #define __align(X) __declspec(align(X) )
15 #elif defined(__GNUC__) || defined(__PGI)
16 #define __align(X) __attribute__((aligned(X) ))
18 #define __align(X) __declspec(align(X) )
// Kernel name construction.  NAME starts from the token `calc` and passes it
// through a chain of function-like macros; each stage either pastes a suffix
// onto the token (via ##) or forwards it unchanged, then hands the result to
// the next stage.  The chain order is:
//   CLASSNAME -> ENERGYNAME -> SLOWONLYNAME -> MERGEELECTNAME ->
//   FULLELECTNAME -> TABENERGYNAME -> FEPNAME -> LAST
// NOTE(review): every stage below appears with more than one definition; the
// preprocessor conditionals that select between them are not visible in this
// excerpt, and LAST is defined elsewhere.
40 #define NAME CLASSNAME(calc)
// Pair variant: CLASS names the pair compute class and the token gains "_pair".
45 #define CLASS ComputeNonbondedPair
46 #define CLASSNAME(X) ENERGYNAME( X ## _pair )
// Self variant: CLASS names the self compute class and the token gains "_self".
54 #define CLASS ComputeNonbondedSelf
55 #define CLASSNAME(X) ENERGYNAME( X ## _self )
// ENERGYNAME: append "_energy", or forward unchanged.
66 #define ENERGYNAME(X) SLOWONLYNAME( X ## _energy )
70 #define ENERGYNAME(X) SLOWONLYNAME( X )
// SLOWONLYNAME: append "_slow", or forward unchanged.
77 #define SLOWONLYNAME(X) MERGEELECTNAME( X ## _slow )
80 #define SLOWONLYNAME(X) MERGEELECTNAME( X )
// MERGEELECTNAME: append "_merge", or forward unchanged.
89 #define MERGEELECTNAME(X) FULLELECTNAME( X ## _merge )
93 #define MERGEELECTNAME(X) FULLELECTNAME( X )
// FULLELECTNAME: append "_fullelect", or forward unchanged.
100 #define FULLELECTNAME(X) TABENERGYNAME( X ## _fullelect )
104 #define FULLELECTNAME(X) TABENERGYNAME( X )
// TABENERGYNAME: append "_tabener", or forward unchanged.  The companion
// helpers TABENERGY(X) / NOTABENERGY(X) either keep their argument or expand
// to nothing, so code wrapped in them is compiled in or dropped.
113 #define TABENERGYNAME(X) FEPNAME( X ## _tabener )
114 #define TABENERGY(X) X
115 #define NOTABENERGY(X)
117 #define TABENERGYNAME(X) FEPNAME( X )
119 #define NOTABENERGY(X) X
// FEPNAME: final suffix stage before LAST.  The visible alternatives append
// nothing, "_fep", "_ti", "_les", "_int", "_pprof" or "_go".
141 #define FEPNAME(X) LAST( X )
// NOT_ALCHPAIR(X) keeps its argument in the definition visible here.
144 #define NOT_ALCHPAIR(X) X
158 #define FEPNAME(X) LAST( X ## _fep )
166 #define FEPNAME(X) LAST( X ## _ti )
174 #define FEPNAME(X) LAST( X ## _les )
181 #define FEPNAME(X) LAST( X ## _int )
188 #define FEPNAME(X) LAST( X ## _pprof )
195 #define FEPNAME(X) LAST( X ## _go )
198 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// ALLOCA_ALIGNED(p, s, a): reserve (s) + (a) bytes with alloca() and store in
// (p) an address inside that reservation, computed by adding (a) - 1 to the
// start address and then subtracting the bits selected by the mask (a) - 1.
//   p - lvalue that receives the resulting pointer (assigned as void*)
//   s - number of usable bytes requested
//   a - requested alignment in bytes
// NOTE(review): the mask arithmetic only rounds up to a multiple of (a) when
// (a) is a power of two - confirm at call sites.
// The alloca() result is cast explicitly because C++ does not convert void*
// to char* implicitly, and the ptr64 initializer is a single balanced
// expression so the macro body parses.
219 #define ALLOCA_ALIGNED(p, s, a) { \
220 char *ptr = (char*)alloca((s) + (a)); \
221 int64 a1 = (a) - 1; \
222 int64 ptr64 = ((int64)ptr) + a1; \
223 (p) = (void*)(ptr64 - (ptr64 & a1)); \
// Local alias for the MIC_PAD_PLGEN setting.  The pairlist padding sections in
// this file are guarded by `#if __MIC_PAD_PLGEN_CTRL != 0`.
226 #define __MIC_PAD_PLGEN_CTRL (MIC_PAD_PLGEN)
235 __attribute__((target(mic)))
void NAME (mic_params ¶ms) {
239 #if MIC_HANDCODE_FORCE_SINGLE != 0
240 #define CALC_TYPE float
242 #define CALC_TYPE double
247 const int table_four_n_16 = params.table_four_n_16;
248 __ASSUME(table_four_n_16 % 16 == 0);
249 __ASSERT(params.table_four_base_ptr != NULL);
250 const CALC_TYPE * RESTRICT
const table_noshort = (CALC_TYPE*)(params.table_four_base_ptr) ; __ASSUME_ALIGNED(table_noshort);
251 const CALC_TYPE * RESTRICT
const table_short = (CALC_TYPE*)(params.table_four_base_ptr) + 16 * table_four_n_16; __ASSUME_ALIGNED(table_short);
252 const CALC_TYPE * RESTRICT
const slow_table = (CALC_TYPE*)(params.table_four_base_ptr) + 32 * table_four_n_16; __ASSUME_ALIGNED(slow_table);
263 const CALC_TYPE * RESTRICT
const lj_table_base_ptr = (CALC_TYPE*)(params.lj_table_base_ptr);
264 __ASSERT(lj_table_base_ptr != NULL);
265 const int lj_table_dim = params.lj_table_dim;
266 __ASSUME_ALIGNED(lj_table_base_ptr);
269 const CALC_TYPE r2_delta = params.constants->r2_delta;
270 const int r2_delta_exp = params.constants->r2_delta_exp;
271 const int r2_delta_expc = params.constants->r2_delta_expc;
272 const CALC_TYPE scaling = params.constants->scaling;
273 const CALC_TYPE modf_mod = params.constants->modf_mod;
276 const CALC_TYPE plcutoff = params.pp->plcutoff;
277 const CALC_TYPE
plcutoff2 = plcutoff * plcutoff;
278 const CALC_TYPE plcutoff2_delta = plcutoff2 + r2_delta;
279 const CALC_TYPE
cutoff2 = params.constants->cutoff2;
280 const CALC_TYPE cutoff2_delta = cutoff2 + r2_delta;
283 const int i_upper = params.numAtoms[0];
284 const int j_upper = params.numAtoms[1];
285 const int i_upper_16 = params.numAtoms_16[0];
286 const int j_upper_16 = params.numAtoms_16[1];
287 __ASSERT(i_upper >= 0); __ASSERT(j_upper >= 0);
288 __ASSERT(i_upper_16 >= 0); __ASSERT(j_upper_16 >= 0);
289 __ASSUME(i_upper_16 % 16 == 0);
290 __ASSUME(j_upper_16 % 16 == 0);
293 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
294 atom * RESTRICT p_0 = params.p[0]; __ASSERT(p_0 != NULL); __ASSUME_ALIGNED(p_0);
295 atom_param * RESTRICT pExt_0 = params.pExt[0]; __ASSERT(pExt_0 != NULL); __ASSUME_ALIGNED(pExt_0);
298 #define pExt_1 pExt_0
300 atom * RESTRICT p_1 = params.p[1]; __ASSERT(p_1 != NULL); __ASSUME_ALIGNED(p_1);
301 atom_param * RESTRICT pExt_1 = params.pExt[1]; __ASSERT(pExt_1 != NULL); __ASSUME_ALIGNED(pExt_1);
304 mic_position_t * RESTRICT p_0_x = params.p_x[0];
305 mic_position_t * RESTRICT p_0_y = params.p_y[0];
306 mic_position_t * RESTRICT p_0_z = params.p_z[0];
307 mic_position_t * RESTRICT p_0_q = params.p_q[0];
308 int * RESTRICT pExt_0_vdwType = params.pExt_vdwType[0];
309 int * RESTRICT pExt_0_index = params.pExt_index[0];
310 int * RESTRICT pExt_0_exclIndex = params.pExt_exclIndex[0];
311 int * RESTRICT pExt_0_exclMaxDiff = params.pExt_exclMaxDiff[0];
317 #define p_1_vdwType p_0_vdwType
318 #define p_1_index p_0_index
319 #define p_1_exclIndex p_0_exclIndex
320 #define p_1_exclMaxDiff p_0_exclMaxDiff
322 mic_position_t * RESTRICT p_1_x = params.p_x[1];
323 mic_position_t * RESTRICT p_1_y = params.p_y[1];
324 mic_position_t * RESTRICT p_1_z = params.p_z[1];
325 mic_position_t * RESTRICT p_1_q = params.p_q[1];
326 int * RESTRICT pExt_1_vdwType = params.pExt_vdwType[1];
327 int * RESTRICT pExt_1_index = params.pExt_index[1];
328 int * RESTRICT pExt_1_exclIndex = params.pExt_exclIndex[1];
329 int * RESTRICT pExt_1_exclMaxDiff = params.pExt_exclMaxDiff[1];
331 __ASSERT(p_0_x != NULL); __ASSERT(p_0_y != NULL); __ASSERT(p_0_z != NULL); __ASSERT(p_0_q != NULL);
332 __ASSERT(p_1_x != NULL); __ASSERT(p_1_y != NULL); __ASSERT(p_1_z != NULL); __ASSERT(p_1_q != NULL);
333 __ASSERT(pExt_0_vdwType != NULL); __ASSERT(pExt_0_index != NULL);
334 __ASSERT(pExt_1_vdwType != NULL); __ASSERT(pExt_1_index != NULL);
335 __ASSERT(pExt_0_exclIndex != NULL); __ASSERT(pExt_0_exclMaxDiff != NULL);
336 __ASSERT(pExt_1_exclIndex != NULL); __ASSERT(pExt_1_exclMaxDiff != NULL);
337 __ASSUME_ALIGNED(p_0_x); __ASSUME_ALIGNED(p_1_x);
338 __ASSUME_ALIGNED(p_0_y); __ASSUME_ALIGNED(p_1_y);
339 __ASSUME_ALIGNED(p_0_z); __ASSUME_ALIGNED(p_1_z);
340 __ASSUME_ALIGNED(p_0_q); __ASSUME_ALIGNED(p_1_q);
341 __ASSUME_ALIGNED(pExt_0_vdwType); __ASSUME_ALIGNED(pExt_0_index);
342 __ASSUME_ALIGNED(pExt_1_vdwType); __ASSUME_ALIGNED(pExt_1_index);
343 __ASSUME_ALIGNED(pExt_0_exclIndex); __ASSUME_ALIGNED(pExt_0_exclMaxDiff);
344 __ASSUME_ALIGNED(pExt_1_exclIndex); __ASSUME_ALIGNED(pExt_1_exclMaxDiff);
348 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
349 double4 * RESTRICT f_0 = params.ff[0];
353 double4 * RESTRICT f_1 = params.ff[1];
355 __ASSERT(f_0 != NULL); __ASSUME_ALIGNED(f_0);
356 __ASSERT(f_1 != NULL); __ASSUME_ALIGNED(f_1);
357 memset(f_0, 0,
sizeof(double4) * i_upper_16);
358 PAIR( memset(f_1, 0,
sizeof(double4) * j_upper_16); )
360 double4 * RESTRICT fullf_0 = params.fullf[0];
362 #define fullf_1 fullf_0
364 double4 * RESTRICT fullf_1 = params.fullf[1];
366 __ASSERT(fullf_0 != NULL); __ASSUME_ALIGNED(fullf_0);
367 __ASSERT(fullf_1 != NULL); __ASSUME_ALIGNED(fullf_1);
368 memset(fullf_0, 0,
sizeof(double4) * i_upper_16);
369 PAIR( memset(fullf_1, 0,
sizeof(double4) * j_upper_16); )
372 double * RESTRICT f_0_x = params.ff_x[0];
373 double * RESTRICT f_0_y = params.ff_y[0];
374 double * RESTRICT f_0_z = params.ff_z[0];
375 double * RESTRICT f_0_w = params.ff_w[0];
382 double * RESTRICT f_1_x = params.ff_x[1];
383 double * RESTRICT f_1_y = params.ff_y[1];
384 double * RESTRICT f_1_z = params.ff_z[1];
385 double * RESTRICT f_1_w = params.ff_w[1];
387 __ASSERT(f_0_x != NULL); __ASSERT(f_0_y != NULL); __ASSERT(f_0_z != NULL); __ASSERT(f_0_w != NULL);
388 __ASSERT(f_1_x != NULL); __ASSERT(f_1_y != NULL); __ASSERT(f_1_z != NULL); __ASSERT(f_1_w != NULL);
389 __ASSUME_ALIGNED(f_0_x); __ASSUME_ALIGNED(f_1_x);
390 __ASSUME_ALIGNED(f_0_y); __ASSUME_ALIGNED(f_1_y);
391 __ASSUME_ALIGNED(f_0_z); __ASSUME_ALIGNED(f_1_z);
392 __ASSUME_ALIGNED(f_0_w); __ASSUME_ALIGNED(f_1_w);
393 memset(f_0_x, 0, 4 *
sizeof(
double) * i_upper_16);
394 PAIR( memset(f_1_x, 0, 4 *
sizeof(
double) * j_upper_16); )
396 double * RESTRICT fullf_0_x = params.fullf_x[0];
397 double * RESTRICT fullf_0_y = params.fullf_y[0];
398 double * RESTRICT fullf_0_z = params.fullf_z[0];
399 double * RESTRICT fullf_0_w = params.fullf_w[0];
401 #define fullf_1_x fullf_0_x
402 #define fullf_1_y fullf_0_y
403 #define fullf_1_z fullf_0_z
404 #define fullf_1_w fullf_0_w
406 double * RESTRICT fullf_1_x = params.fullf_x[1];
407 double * RESTRICT fullf_1_y = params.fullf_y[1];
408 double * RESTRICT fullf_1_z = params.fullf_z[1];
409 double * RESTRICT fullf_1_w = params.fullf_w[1];
411 __ASSERT(fullf_0_x != NULL); __ASSERT(fullf_0_y != NULL); __ASSERT(fullf_0_z != NULL); __ASSERT(fullf_0_w != NULL);
412 __ASSERT(fullf_1_x != NULL); __ASSERT(fullf_1_y != NULL); __ASSERT(fullf_1_z != NULL); __ASSERT(fullf_1_w != NULL);
413 __ASSUME_ALIGNED(fullf_0_x); __ASSUME_ALIGNED(fullf_1_x);
414 __ASSUME_ALIGNED(fullf_0_y); __ASSUME_ALIGNED(fullf_1_y);
415 __ASSUME_ALIGNED(fullf_0_z); __ASSUME_ALIGNED(fullf_1_z);
416 __ASSUME_ALIGNED(fullf_0_w); __ASSUME_ALIGNED(fullf_1_w);
417 memset(fullf_0_x, 0, 4 *
sizeof(
double) * i_upper_16);
418 PAIR( memset(fullf_1_x, 0, 4 *
sizeof(
double) * j_upper_16); )
423 if (params.constants->commOnly) {
return; }
426 if (i_upper <= 0 || j_upper <= 0) {
return; }
428 CALC_TYPE offset_x = (CALC_TYPE)(params.offset.x);
429 CALC_TYPE offset_y = (CALC_TYPE)(params.offset.y);
430 CALC_TYPE offset_z = (CALC_TYPE)(params.offset.z);
435 int * RESTRICT pairlist_norm = params.pairlists_ptr[PL_NORM_INDEX];
436 int * RESTRICT pairlist_mod = params.pairlists_ptr[PL_MOD_INDEX];
437 int * RESTRICT pairlist_excl = params.pairlists_ptr[PL_EXCL_INDEX];
438 #if MIC_CONDITION_NORMAL != 0
439 int * RESTRICT pairlist_normhi = params.pairlists_ptr[PL_NORMHI_INDEX];
445 #define PAIRLIST_ALLOC_CHECK(pl, init_size) { \
446 if ((pl) == NULL) { \
447 const int plInitSize = (init_size) + (16 * (MIC_HANDCODE_FORCE_PFDIST + 1)); \
448 if (device__timestep == 0) { \
449 (pl) = (int*)(_mm_malloc(plInitSize * sizeof(int) + 64, 64)); \
451 (pl) = (int*)_MM_MALLOC_WRAPPER(plInitSize * sizeof(int) + 64, 64, "pairlist_alloc_check"); \
453 __ASSERT((pl) != NULL); \
455 (pl)[0] = plInitSize; \
460 #define PAIRLIST_GROW_CHECK(pl, offset, mul) { \
461 if ((offset) + j_upper + 8 + MIC_PREFETCH_DISTANCE + (16 * (MIC_HANDCODE_FORCE_PFDIST + 1)) > (pl)[0]) { \
462 int newPlAllocSize = (int)(((pl)[0]) * (mul) + j_upper + 64); \
463 newPlAllocSize = (~2047) & (newPlAllocSize + 2047); \
464 int* RESTRICT newPl = (int*)_MM_MALLOC_WRAPPER(newPlAllocSize * sizeof(int), 64, "pairlist_grow_check"); \
465 __ASSERT((newPl) != NULL); \
467 memcpy(newPl, (pl), (offset) * sizeof(int)); \
468 _MM_FREE_WRAPPER((pl) - 14); \
470 (pl)[0] = newPlAllocSize; \
477 if ((!(params.usePairlists)) || (params.savePairlists)) {
484 if (pairlist_norm == NULL) {
487 int plCount_norm = 16;
488 int plCount_mod = 16;
489 int plCount_excl = 16;
490 #if MIC_CONDITION_NORMAL != 0
491 int plCount_normhi = 16;
494 int numParts = params.pp->numParts;
495 int part = params.pp->part;
496 for (
int i = part; i < i_upper; i += numParts) {
499 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
500 CALC_TYPE p_i_x = p_0[i].x + offset_x;
501 CALC_TYPE p_i_y = p_0[i].y + offset_y;
502 CALC_TYPE p_i_z = p_0[i].z + offset_z;
504 CALC_TYPE p_i_x = p_0_x[i] + offset_x;
505 CALC_TYPE p_i_y = p_0_y[i] + offset_y;
506 CALC_TYPE p_i_z = p_0_z[i] + offset_z;
509 for (
int j = 0
SELF(+i+1); j < j_upper; j++) {
513 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
514 CALC_TYPE p_d_x = p_i_x - p_1[j].x;
515 CALC_TYPE p_d_y = p_i_y - p_1[j].y;
516 CALC_TYPE p_d_z = p_i_z - p_1[j].z;
518 CALC_TYPE p_d_x = p_i_x - p_1_x[j];
519 CALC_TYPE p_d_y = p_i_y - p_1_y[j];
520 CALC_TYPE p_d_z = p_i_z - p_1_z[j];
522 CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
523 if (r2 <= plcutoff2) {
530 int exclFlags = 0x00;
531 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
532 const int indexDiff = pExt_0[i].index - pExt_1[j].index;
533 const int maxDiff = pExt_1[j].excl_maxdiff;
535 const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
536 const int maxDiff = pExt_1_exclMaxDiff[j];
538 if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
539 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
540 const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
542 const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
544 const int offset_major = offset / (
sizeof(
unsigned int) * 8);
545 const int offset_minor = offset % (
sizeof(
unsigned int) * 8);
546 exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
551 if (exclFlags == 0) {
552 #if MIC_CONDITION_NORMAL != 0
553 if (r2 <= normhi_split) { plCount_norm++; }
554 else { plCount_normhi++; }
558 }
else if (exclFlags == 1) {
560 }
else if (exclFlags == 2) {
569 #if __MIC_PAD_PLGEN_CTRL != 0
570 plCount_norm += (16 * i_upper);
571 plCount_mod += (16 * i_upper);
572 plCount_excl += (16 * i_upper);
573 #if MIC_CONDITION_NORMAL != 0
574 plCount_normhi += (16 * i_upper);
579 const float plPercent = 1.4;
580 const int plRound = 2048;
581 plCount_norm = (int)(plCount_norm * plPercent); plCount_norm = (~(plRound-1)) & (plCount_norm + (plRound-1));
582 plCount_mod = (int)(plCount_mod * plPercent); plCount_mod = (~(plRound-1)) & (plCount_mod + (plRound-1));
583 plCount_excl = (int)(plCount_excl * plPercent); plCount_excl = (~(plRound-1)) & (plCount_excl + (plRound-1));
584 #if MIC_CONDITION_NORMAL != 0
585 plCount_normhi = (int)(plCount_normhi * plPercent); plCount_normhi = (~(plRound-1)) & (plCount_normhi + (plRound-1));
586 plCount_norm += plCount_normhi;
591 if (device__timestep == 0) {
592 pairlist_norm = (
int*)(_mm_malloc(plCount_norm *
sizeof(
int), 64)); __ASSERT(pairlist_norm != NULL);
593 pairlist_mod = (
int*)(_mm_malloc(plCount_mod *
sizeof(
int), 64)); __ASSERT(pairlist_mod != NULL);
594 pairlist_excl = (
int*)(_mm_malloc(plCount_excl *
sizeof(
int), 64)); __ASSERT(pairlist_excl != NULL);
595 #if MIC_CONDITION_NORMAL != 0
596 pairlist_normhi = (
int*)(_mm_malloc(plCount_normhi *
sizeof(
int), 64)); __ASSERT(pairlist_normhi != NULL);
599 pairlist_norm = (
int*)(_MM_MALLOC_WRAPPER(plCount_norm *
sizeof(
int), 64,
"pairlist_norm")); __ASSERT(pairlist_norm != NULL);
600 pairlist_mod = (
int*)(_MM_MALLOC_WRAPPER(plCount_mod *
sizeof(
int), 64,
"pairlist_mod")); __ASSERT(pairlist_mod != NULL);
601 pairlist_excl = (
int*)(_MM_MALLOC_WRAPPER(plCount_excl *
sizeof(
int), 64,
"pairlist_excl")); __ASSERT(pairlist_excl != NULL);
602 #if MIC_CONDITION_NORMAL != 0
603 pairlist_normhi = (
int*)(_MM_MALLOC_WRAPPER(plCount_normhi *
sizeof(
int), 64,
"pairlist_normhi")); __ASSERT(pairlist_normhi != NULL);
607 pairlist_norm += 14; pairlist_norm[0] = plCount_norm; pairlist_norm[1] = 2;
608 pairlist_mod += 14; pairlist_mod[0] = plCount_mod; pairlist_mod[1] = 2;
609 pairlist_excl += 14; pairlist_excl[0] = plCount_excl; pairlist_excl[1] = 2;
610 #if MIC_CONDITION_NORMAL != 0
611 pairlist_normhi += 14; pairlist_normhi[0] = plCount_normhi; pairlist_normhi[1] = 2;
615 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
616 pairlist_norm[-1] = 0;
617 pairlist_mod[-1] = 0;
618 pairlist_excl[-1] = 0;
619 #if MIC_CONDITION_NORMAL != 0
620 pairlist_normhi[-1] = 0;
630 int plOffset_norm = 2;
631 int plOffset_mod = 2;
632 int plOffset_excl = 2;
633 #if MIC_CONDITION_NORMAL != 0
634 double normhi_split = (0.25 *
cutoff2) + (0.75 * plcutoff2);
635 int plOffset_normhi = 2;
642 #if MIC_TILE_PLGEN != 0
645 int plTileSize = (i_upper > j_upper) ? (i_upper) : (j_upper);
646 plTileSize /= MIC_TILE_PLGEN;
647 plTileSize = (plTileSize + 15) & (~15);
648 __ASSUME(plTileSize % 16 == 0);
651 for (
int _i = 0; _i < i_upper
SELF(-1); _i += plTileSize) {
652 for (
int _j = 0
SELF(+ _i); _j < j_upper; _j += plTileSize) {
656 int i_hi = _i + plTileSize;
657 i_hi = (i_hi > i_upper
SELF(-1)) ? (i_upper
SELF(-1)) : (i_hi);
660 for (
int i = i_lo; i < i_hi; i++) {
663 PAIRLIST_GROW_CHECK(pairlist_norm, plOffset_norm, 1.4);
664 PAIRLIST_GROW_CHECK(pairlist_mod , plOffset_mod, 1.2);
665 PAIRLIST_GROW_CHECK(pairlist_excl, plOffset_excl, 1.2);
666 #if MIC_CONDITION_NORMAL != 0
667 PAIRLIST_GROW_CHECK(pairlist_normhi, plOffset_normhi, 1.3);
671 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
672 CALC_TYPE p_i_x = p_0[i].x + offset_x;
673 CALC_TYPE p_i_y = p_0[i].y + offset_y;
674 CALC_TYPE p_i_z = p_0[i].z + offset_z;
676 CALC_TYPE p_i_x = p_0_x[i] + offset_x;
677 CALC_TYPE p_i_y = p_0_y[i] + offset_y;
678 CALC_TYPE p_i_z = p_0_z[i] + offset_z;
682 int j_lo =
PAIR(_j)
SELF((_i == _j) ? (i+1) : (_j));
683 int j_hi = _j + plTileSize;
684 j_hi = (j_hi > j_upper) ? (j_upper) : (j_hi);
686 #if MIC_HANDCODE_PLGEN != 0
688 __m512 p_i_x_vec = _mm512_set_1to16_ps(p_i_x);
689 __m512 p_i_y_vec = _mm512_set_1to16_ps(p_i_y);
690 __m512 p_i_z_vec = _mm512_set_1to16_ps(p_i_z);
691 __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0_index[i]);
693 __m512i i_shifted_vec = _mm512_slli_epi32(_mm512_set_1to16_epi32(i), 16);
694 __m512i j_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10, 9, 8,
695 7, 6, 5, 4, 3, 2, 1, 0);
699 PAIR( j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(j_lo)); )
701 const
int j_lo_16 = (j_lo) & (~0x0F);
702 j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(j_lo_16));
703 const
int j_lo_m1 = j_lo - 1;
708 #pragma loop count(35)
709 for (
int j =
PAIR(j_lo)
SELF(j_lo_16); j < j_hi; j += 16) {
712 __mmask16 active_mask = _mm512_cmplt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_hi));
714 active_mask = _mm512_kand(active_mask, _mm512_cmpgt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_lo_m1)));
719 __m512i plEntry_vec = _mm512_or_epi32(i_shifted_vec, j_vec);
722 j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(16));
725 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
726 __m512i p_j_tmp0a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j ));
727 __m512i p_j_tmp1a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 4));
728 __m512i p_j_tmp2a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 8));
729 __m512i p_j_tmp3a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 12));
730 __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
731 __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
732 __m512i p_j_tmp0b_vec = _mm512_mask_swizzle_epi32(p_j_tmp0a_vec, k_2x2_0, p_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
733 __m512i p_j_tmp1b_vec = _mm512_mask_swizzle_epi32(p_j_tmp1a_vec, k_2x2_1, p_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
734 __m512i p_j_tmp2b_vec = _mm512_mask_swizzle_epi32(p_j_tmp2a_vec, k_2x2_0, p_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
735 __m512i p_j_tmp3b_vec = _mm512_mask_swizzle_epi32(p_j_tmp3a_vec, k_2x2_1, p_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
736 __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
737 __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
738 __m512i p_j_tmp0c_vec = _mm512_mask_swizzle_epi32(p_j_tmp0b_vec, k_4x4_0, p_j_tmp2b_vec, _MM_SWIZ_REG_BADC);
739 __m512i p_j_tmp1c_vec = _mm512_mask_swizzle_epi32(p_j_tmp1b_vec, k_4x4_0, p_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
740 __m512i p_j_tmp2c_vec = _mm512_mask_swizzle_epi32(p_j_tmp2b_vec, k_4x4_1, p_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
741 __m512i p_j_perm_pattern = _mm512_set_1to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
742 __m512 p_j_x_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp0c_vec));
743 __m512 p_j_y_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp1c_vec));
744 __m512 p_j_z_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp2c_vec));
746 __m512 p_j_x_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_x + j);
747 __m512 p_j_y_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_y + j);
748 __m512 p_j_z_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_z + j);
752 __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
753 __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
754 __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
755 __m512 r2_vec = _mm512_mul_ps(p_ij_x_vec, p_ij_x_vec);
756 r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_y_vec, p_ij_y_vec));
757 r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_z_vec, p_ij_z_vec));
761 __mmask16 cutoff_mask = _mm512_mask_cmple_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps((
float)(plcutoff2)));
763 if (_mm512_kortestz(cutoff_mask, cutoff_mask)) {
continue; }
765 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
766 __m512i pExt_j_tmp0a_vec = _mm512_load_epi32(pExt_1 + j );
767 __m512i pExt_j_tmp1a_vec = _mm512_load_epi32(pExt_1 + j + 4);
768 __m512i pExt_j_tmp2a_vec = _mm512_load_epi32(pExt_1 + j + 8);
769 __m512i pExt_j_tmp3a_vec = _mm512_load_epi32(pExt_1 + j + 12);
770 __m512i pExt_j_tmp0b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp0a_vec, k_2x2_0, pExt_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
771 __m512i pExt_j_tmp1b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1a_vec, k_2x2_1, pExt_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
772 __m512i pExt_j_tmp2b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2a_vec, k_2x2_0, pExt_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
773 __m512i pExt_j_tmp3b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3a_vec, k_2x2_1, pExt_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
774 __m512i pExt_j_tmp1c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1b_vec, k_4x4_0, pExt_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
775 __m512i pExt_j_tmp2c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2b_vec, k_4x4_1, pExt_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
776 __m512i pExt_j_tmp3c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3b_vec, k_4x4_1, pExt_j_tmp1b_vec, _MM_SWIZ_REG_BADC);
777 __m512i pExt_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
778 __m512i pExt_j_index_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp1c_vec);
779 __m512i pExt_j_exclIndex_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp2c_vec);
780 __m512i pExt_j_maxDiff_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp3c_vec);
782 __m512i pExt_j_index_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_index + j);
783 __m512i pExt_j_maxDiff_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclMaxDiff + j);
784 __m512i pExt_j_exclIndex_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclIndex + j);
789 __m512i indexDiff_vec = _mm512_mask_sub_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_i_index_vec, pExt_j_index_vec);
790 __mmask16 indexRange_min_mask = _mm512_mask_cmpge_epi32_mask(cutoff_mask, indexDiff_vec, _mm512_sub_epi32(_mm512_setzero_epi32(), pExt_j_maxDiff_vec));
791 __mmask16 indexRange_max_mask = _mm512_mask_cmple_epi32_mask(cutoff_mask, indexDiff_vec, pExt_j_maxDiff_vec);
792 __mmask16 indexRange_mask = _mm512_kand(indexRange_min_mask, indexRange_max_mask);
795 __m512i offset_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), indexRange_mask, _mm512_slli_epi32(indexDiff_vec, 1), pExt_j_exclIndex_vec);
796 __m512i offset_major_vec = _mm512_srli_epi32(offset_vec, 5);
797 __m512i offset_minor_vec = _mm512_and_epi32(_mm512_set_1to16_epi32(0x001f), offset_vec);
801 int offset_major[16] __attribute__((aligned(64)));
802 int exclFlags[16] __attribute__((aligned(64)));
803 _mm512_store_epi32(offset_major, offset_major_vec);
804 exclFlags[ 0] = params.exclusion_bits[offset_major[ 0]];
805 exclFlags[ 1] = params.exclusion_bits[offset_major[ 1]];
806 exclFlags[ 2] = params.exclusion_bits[offset_major[ 2]];
807 exclFlags[ 3] = params.exclusion_bits[offset_major[ 3]];
808 exclFlags[ 4] = params.exclusion_bits[offset_major[ 4]];
809 exclFlags[ 5] = params.exclusion_bits[offset_major[ 5]];
810 exclFlags[ 6] = params.exclusion_bits[offset_major[ 6]];
811 exclFlags[ 7] = params.exclusion_bits[offset_major[ 7]];
812 exclFlags[ 8] = params.exclusion_bits[offset_major[ 8]];
813 exclFlags[ 9] = params.exclusion_bits[offset_major[ 9]];
814 exclFlags[10] = params.exclusion_bits[offset_major[10]];
815 exclFlags[11] = params.exclusion_bits[offset_major[11]];
816 exclFlags[12] = params.exclusion_bits[offset_major[12]];
817 exclFlags[13] = params.exclusion_bits[offset_major[13]];
818 exclFlags[14] = params.exclusion_bits[offset_major[14]];
819 exclFlags[15] = params.exclusion_bits[offset_major[15]];
820 __m512i exclFlags_vec = _mm512_load_epi32(exclFlags);
822 __m512i exclFlags_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, offset_major_vec, params.exclusion_bits, _MM_SCALE_4);
824 exclFlags_vec = _mm512_mask_srlv_epi32(_mm512_setzero_epi32(), indexRange_mask, exclFlags_vec, offset_minor_vec);
825 exclFlags_vec = _mm512_and_epi32(exclFlags_vec, _mm512_set_1to16_epi32(0x03));
829 __mmask16 norm_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_setzero_epi32());
830 #if MIC_CONDITION_NORMAL != 0
831 __mmask16 normhi_mask = _mm512_mask_cmplt_ps_mask(norm_mask, _mm512_set_1to16_ps(normhi_split), r2_vec);
832 norm_mask = _mm512_kxor(norm_mask, normhi_mask);
834 __mmask16 excl_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(1));
835 __mmask16 mod_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(2));
836 _mm512_mask_packstorelo_epi32(pairlist_norm + plOffset_norm , norm_mask, plEntry_vec);
837 _mm512_mask_packstorehi_epi32(pairlist_norm + plOffset_norm + 16, norm_mask, plEntry_vec);
838 #if MIC_CONDITION_NORMAL != 0
839 _mm512_mask_packstorelo_epi32(pairlist_normhi + plOffset_normhi , normhi_mask, plEntry_vec);
840 _mm512_mask_packstorehi_epi32(pairlist_normhi + plOffset_normhi + 16, normhi_mask, plEntry_vec);
842 _mm512_mask_packstorelo_epi32(pairlist_excl + plOffset_excl , excl_mask, plEntry_vec);
843 _mm512_mask_packstorehi_epi32(pairlist_excl + plOffset_excl + 16, excl_mask, plEntry_vec);
844 _mm512_mask_packstorelo_epi32(pairlist_mod + plOffset_mod , mod_mask, plEntry_vec);
845 _mm512_mask_packstorehi_epi32(pairlist_mod + plOffset_mod + 16, mod_mask, plEntry_vec);
846 __m512i one_vec = _mm512_set_1to16_epi32(1);
847 plOffset_norm += _mm512_mask_reduce_add_epi32(norm_mask, one_vec);
848 #if MIC_CONDITION_NORMAL != 0
849 plOffset_normhi += _mm512_mask_reduce_add_epi32(normhi_mask, one_vec);
851 plOffset_excl += _mm512_mask_reduce_add_epi32(excl_mask, one_vec);
852 plOffset_mod += _mm512_mask_reduce_add_epi32( mod_mask, one_vec);
858 for (
int j = j_lo; j < j_hi; j++) {
862 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
863 CALC_TYPE p_d_x = p_i_x - p_1[j].x;
864 CALC_TYPE p_d_y = p_i_y - p_1[j].y;
865 CALC_TYPE p_d_z = p_i_z - p_1[j].z;
867 CALC_TYPE p_d_x = p_i_x - p_1_x[j];
868 CALC_TYPE p_d_y = p_i_y - p_1_y[j];
869 CALC_TYPE p_d_z = p_i_z - p_1_z[j];
871 CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
872 if (r2 <= plcutoff2) {
879 int exclFlags = 0x00;
880 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
881 const int indexDiff = pExt_0[i].index - pExt_1[i].index;
882 const int maxDiff = pExt_1[j].excl_maxdiff;
884 const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
885 const int maxDiff = pExt_1_exclMaxDiff[j];
887 if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
888 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
889 const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
891 const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
893 const int offset_major = offset / (
sizeof(
unsigned int) * 8);
894 const int offset_minor = offset % (
sizeof(
unsigned int) * 8);
895 exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
899 const int plEntry = ((i & 0xFFFF) << 16) | (j & 0xFFFF);
900 if (exclFlags == 0) {
901 #if MIC_CONDITION_NORMAL != 0
902 if (r2 <= normhi_split) {
903 pairlist_norm[plOffset_norm++] = plEntry;
905 pairlist_normhi[plOffset_normhi++] = plEntry;
908 pairlist_norm[plOffset_norm++] = plEntry;
910 }
else if (exclFlags == 1) {
911 pairlist_excl[plOffset_excl++] = plEntry;
912 }
else if (exclFlags == 2) {
913 pairlist_mod[plOffset_mod++] = plEntry;
921 #if __MIC_PAD_PLGEN_CTRL != 0
922 #if MIC_HANDCODE_FORCE_SINGLE != 0
923 const int padLen = 16;
925 const int padLen = 8;
927 const int padValue = (i << 16) | 0xFFFF;
929 while (plOffset_norm % padLen != 2) { pairlist_norm[plOffset_norm++] = padValue; }
930 while (plOffset_mod % padLen != 2) { pairlist_mod [plOffset_mod++ ] = padValue; }
931 while (plOffset_excl % padLen != 2) { pairlist_excl[plOffset_excl++] = padValue; }
932 #if MIC_CONDITION_NORMAL != 0
933 while (plOffset_normhi % padLen != 2) { pairlist_normhi[plOffset_normhi++] = padValue; }
941 #else // MIC_TILE_PLGEN
947 int numParts = params.pp->numParts;
948 int part = params.pp->part;
955 int plGenIHi = i_upper
SELF(-1);
963 float totalArea = ((i_upper) * (i_upper - 1)) / 2.0f;
964 float areaPerPart = totalArea / numParts;
972 plGenILo = ((part==0)?(0) : ((int)((1.0f+sqrtf(1+8*(areaPerPart*part)))/2.0f)) );
973 plGenIHi = ((part==numParts-1)?(i_upper-1) : ((int)((1.0f+sqrtf(1+8*(areaPerPart*(part+1))))/2.0f)) );
976 int plGenTmp = plGenILo;
977 plGenILo = (i_upper - 1) - plGenIHi;
978 plGenIHi = (i_upper - 1) - plGenTmp;
984 plGenILo = (int)(((
float)(i_upper)) * ((float)(part )) / ((
float)(numParts)));
985 plGenIHi = (int)(((
float)(i_upper)) * ((float)(part + 1)) / ((
float)(numParts)));
991 plGenILo = (plGenILo + 15) & (~15);
992 plGenIHi = (plGenIHi + 15) & (~15);
993 if (plGenIHi > i_upper
SELF(-1)) { plGenIHi = i_upper
SELF(-1); }
996 #pragma loop_count (300)
997 for (
int i = plGenILo; i < plGenIHi; i++) {
1001 #pragma loop_count (300)
1002 for (
int i = part; i < i_upper
SELF(-1); i += numParts) {
1007 PAIRLIST_GROW_CHECK(pairlist_norm, plOffset_norm, 50);
1008 PAIRLIST_GROW_CHECK(pairlist_mod , plOffset_mod, 8);
1009 PAIRLIST_GROW_CHECK(pairlist_excl, plOffset_excl, 8);
1010 #if MIC_CONDITION_NORMAL != 0
1011 PAIRLIST_GROW_CHECK(pairlist_normhi, plOffset_normhi, 15);
1015 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1016 CALC_TYPE p_i_x = p_0[i].x + offset_x;
1017 CALC_TYPE p_i_y = p_0[i].y + offset_y;
1018 CALC_TYPE p_i_z = p_0[i].z + offset_z;
1020 CALC_TYPE p_i_x = p_0_x[i] + offset_x;
1021 CALC_TYPE p_i_y = p_0_y[i] + offset_y;
1022 CALC_TYPE p_i_z = p_0_z[i] + offset_z;
1025 #if MIC_HANDCODE_PLGEN != 0
1028 __m512 p_i_x_vec = _mm512_set_1to16_ps(p_i_x);
1029 __m512 p_i_y_vec = _mm512_set_1to16_ps(p_i_y);
1030 __m512 p_i_z_vec = _mm512_set_1to16_ps(p_i_z);
1031 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1032 __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0[i].index);
1034 __m512i pExt_i_index_vec = _mm512_set_1to16_epi32(pExt_0_index[i]);
1037 __m512i i_shifted_vec = _mm512_slli_epi32(_mm512_set_1to16_epi32(i), 16);
1038 __m512i j_vec = _mm512_set_16to16_epi32(15, 14, 13, 12, 11, 10, 9, 8,
1039 7, 6, 5, 4, 3, 2, 1, 0);
1045 const int ip1_16 = (i+1) & (~0x0F);
1046 j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(ip1_16));
1051 #pragma loop count(35)
1052 for (
int j = 0
SELF(+ ip1_16); j < j_upper; j += 16) {
1055 __mmask16 active_mask = _mm512_cmplt_epi32_mask(j_vec, _mm512_set_1to16_epi32(j_upper));
1057 active_mask = _mm512_kand(active_mask, _mm512_cmpgt_epi32_mask(j_vec, _mm512_set_1to16_epi32(i)));
1062 __m512i plEntry_vec = _mm512_or_epi32(i_shifted_vec, j_vec);
1065 j_vec = _mm512_add_epi32(j_vec, _mm512_set_1to16_epi32(16));
1068 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1069 __m512i p_j_tmp0a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j ));
1070 __m512i p_j_tmp1a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 4));
1071 __m512i p_j_tmp2a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 8));
1072 __m512i p_j_tmp3a_vec = _mm512_castps_si512(_mm512_load_ps(p_1 + j + 12));
1073 __mmask16 k_2x2_0 = _mm512_int2mask(0xAAAA);
1074 __mmask16 k_2x2_1 = _mm512_int2mask(0x5555);
1075 __m512i p_j_tmp0b_vec = _mm512_mask_swizzle_epi32(p_j_tmp0a_vec, k_2x2_0, p_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
1076 __m512i p_j_tmp1b_vec = _mm512_mask_swizzle_epi32(p_j_tmp1a_vec, k_2x2_1, p_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
1077 __m512i p_j_tmp2b_vec = _mm512_mask_swizzle_epi32(p_j_tmp2a_vec, k_2x2_0, p_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
1078 __m512i p_j_tmp3b_vec = _mm512_mask_swizzle_epi32(p_j_tmp3a_vec, k_2x2_1, p_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
1079 __mmask16 k_4x4_0 = _mm512_int2mask(0xCCCC);
1080 __mmask16 k_4x4_1 = _mm512_int2mask(0x3333);
1081 __m512i p_j_tmp0c_vec = _mm512_mask_swizzle_epi32(p_j_tmp0b_vec, k_4x4_0, p_j_tmp2b_vec, _MM_SWIZ_REG_BADC);
1082 __m512i p_j_tmp1c_vec = _mm512_mask_swizzle_epi32(p_j_tmp1b_vec, k_4x4_0, p_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
1083 __m512i p_j_tmp2c_vec = _mm512_mask_swizzle_epi32(p_j_tmp2b_vec, k_4x4_1, p_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
1084 __m512i p_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
1085 __m512 p_j_x_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp0c_vec));
1086 __m512 p_j_y_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp1c_vec));
1087 __m512 p_j_z_vec = _mm512_castsi512_ps(_mm512_permutevar_epi32(p_j_perm_pattern, p_j_tmp2c_vec));
1089 __m512 p_j_x_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_x + j);
1090 __m512 p_j_y_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_y + j);
1091 __m512 p_j_z_vec = _mm512_mask_load_ps(_mm512_setzero_ps(), active_mask, p_1_z + j);
1095 __m512 p_ij_x_vec = _mm512_sub_ps(p_i_x_vec, p_j_x_vec);
1096 __m512 p_ij_y_vec = _mm512_sub_ps(p_i_y_vec, p_j_y_vec);
1097 __m512 p_ij_z_vec = _mm512_sub_ps(p_i_z_vec, p_j_z_vec);
1098 __m512 r2_vec = _mm512_mul_ps(p_ij_x_vec, p_ij_x_vec);
1099 r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_y_vec, p_ij_y_vec));
1100 r2_vec = _mm512_add_ps(r2_vec, _mm512_mul_ps(p_ij_z_vec, p_ij_z_vec));
1104 __mmask16 cutoff_mask = _mm512_mask_cmple_ps_mask(active_mask, r2_vec, _mm512_set_1to16_ps((
float)(plcutoff2)));
1107 if (_mm512_kortestz(cutoff_mask, cutoff_mask)) {
continue; }
1109 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1110 __m512i pExt_j_tmp0a_vec = _mm512_load_epi32(pExt_1 + j );
1111 __m512i pExt_j_tmp1a_vec = _mm512_load_epi32(pExt_1 + j + 4);
1112 __m512i pExt_j_tmp2a_vec = _mm512_load_epi32(pExt_1 + j + 8);
1113 __m512i pExt_j_tmp3a_vec = _mm512_load_epi32(pExt_1 + j + 12);
1114 __m512i pExt_j_tmp0b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp0a_vec, k_2x2_0, pExt_j_tmp1a_vec, _MM_SWIZ_REG_CDAB);
1115 __m512i pExt_j_tmp1b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1a_vec, k_2x2_1, pExt_j_tmp0a_vec, _MM_SWIZ_REG_CDAB);
1116 __m512i pExt_j_tmp2b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2a_vec, k_2x2_0, pExt_j_tmp3a_vec, _MM_SWIZ_REG_CDAB);
1117 __m512i pExt_j_tmp3b_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3a_vec, k_2x2_1, pExt_j_tmp2a_vec, _MM_SWIZ_REG_CDAB);
1118 __m512i pExt_j_tmp1c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp1b_vec, k_4x4_0, pExt_j_tmp3b_vec, _MM_SWIZ_REG_BADC);
1119 __m512i pExt_j_tmp2c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp2b_vec, k_4x4_1, pExt_j_tmp0b_vec, _MM_SWIZ_REG_BADC);
1120 __m512i pExt_j_tmp3c_vec = _mm512_mask_swizzle_epi32(pExt_j_tmp3b_vec, k_4x4_1, pExt_j_tmp1b_vec, _MM_SWIZ_REG_BADC);
1121 __m512i pExt_j_perm_pattern = _mm512_set_16to16_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
1122 __m512i pExt_j_index_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp1c_vec);
1123 __m512i pExt_j_exclIndex_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp2c_vec);
1124 __m512i pExt_j_maxDiff_vec = _mm512_permutevar_epi32(p_j_perm_pattern, pExt_j_tmp3c_vec);
1126 __m512i pExt_j_index_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_index + j);
1127 __m512i pExt_j_maxDiff_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclMaxDiff + j);
1128 __m512i pExt_j_exclIndex_vec = _mm512_mask_load_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_1_exclIndex + j);
1133 __m512i indexDiff_vec = _mm512_mask_sub_epi32(_mm512_setzero_epi32(), cutoff_mask, pExt_i_index_vec, pExt_j_index_vec);
1134 __mmask16 indexRange_min_mask = _mm512_mask_cmpge_epi32_mask(cutoff_mask, indexDiff_vec, _mm512_sub_epi32(_mm512_setzero_epi32(), pExt_j_maxDiff_vec));
1135 __mmask16 indexRange_max_mask = _mm512_mask_cmple_epi32_mask(cutoff_mask, indexDiff_vec, pExt_j_maxDiff_vec);
1136 __mmask16 indexRange_mask = _mm512_kand(indexRange_min_mask, indexRange_max_mask);
1139 __m512i offset_vec = _mm512_mask_add_epi32(_mm512_setzero_epi32(), indexRange_mask, _mm512_slli_epi32(indexDiff_vec, 1), pExt_j_exclIndex_vec);
1140 __m512i offset_major_vec = _mm512_srli_epi32(offset_vec, 5);
1141 __m512i offset_minor_vec = _mm512_and_epi32(_mm512_set_1to16_epi32(0x001f), offset_vec);
1144 __m512i exclFlags_vec = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), cutoff_mask, offset_major_vec, params.exclusion_bits, _MM_SCALE_4);
1145 exclFlags_vec = _mm512_mask_srlv_epi32(_mm512_setzero_epi32(), indexRange_mask, exclFlags_vec, offset_minor_vec);
1146 exclFlags_vec = _mm512_and_epi32(exclFlags_vec, _mm512_set_1to16_epi32(0x03));
1150 __mmask16 norm_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_setzero_epi32());
1151 #if MIC_CONDITION_NORMAL != 0
1152 __mmask16 normhi_mask = _mm512_mask_cmplt_ps_mask(norm_mask, _mm512_set_1to16_ps(normhi_split), r2_vec);
1153 norm_mask = _mm512_kxor(norm_mask, normhi_mask);
1155 __mmask16 excl_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(1));
1156 __mmask16 mod_mask = _mm512_mask_cmpeq_epi32_mask(cutoff_mask, exclFlags_vec, _mm512_set_1to16_epi32(2));
1157 _mm512_mask_packstorelo_epi32(pairlist_norm + plOffset_norm , norm_mask, plEntry_vec);
1158 _mm512_mask_packstorehi_epi32(pairlist_norm + plOffset_norm + 16, norm_mask, plEntry_vec);
1159 #if MIC_CONDITION_NORMAL != 0
1160 _mm512_mask_packstorelo_epi32(pairlist_normhi + plOffset_normhi , normhi_mask, plEntry_vec);
1161 _mm512_mask_packstorehi_epi32(pairlist_normhi + plOffset_normhi + 16, normhi_mask, plEntry_vec);
1163 _mm512_mask_packstorelo_epi32(pairlist_excl + plOffset_excl , excl_mask, plEntry_vec);
1164 _mm512_mask_packstorehi_epi32(pairlist_excl + plOffset_excl + 16, excl_mask, plEntry_vec);
1165 _mm512_mask_packstorelo_epi32(pairlist_mod + plOffset_mod , mod_mask, plEntry_vec);
1166 _mm512_mask_packstorehi_epi32(pairlist_mod + plOffset_mod + 16, mod_mask, plEntry_vec);
1167 __m512i one_vec = _mm512_set_1to16_epi32(1);
1170 plOffset_norm += _mm512_mask_reduce_add_epi32(norm_mask, one_vec);
1171 #if MIC_CONDITION_NORMAL != 0
1172 plOffset_normhi += _mm512_mask_reduce_add_epi32(normhi_mask, one_vec);
1174 plOffset_excl += _mm512_mask_reduce_add_epi32(excl_mask, one_vec);
1175 plOffset_mod += _mm512_mask_reduce_add_epi32( mod_mask, one_vec);
1178 #else // if MIC_HANDCODE_PLGEN != 0
1182 #pragma loop_count (30)
1184 #pragma loop_count (300)
1186 for (
int j = 0
SELF(+i+1); j < j_upper; j++) {
1190 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1191 CALC_TYPE p_d_x = p_i_x - p_1[j].x;
1192 CALC_TYPE p_d_y = p_i_y - p_1[j].y;
1193 CALC_TYPE p_d_z = p_i_z - p_1[j].z;
1195 CALC_TYPE p_d_x = p_i_x - p_1_x[j];
1196 CALC_TYPE p_d_y = p_i_y - p_1_y[j];
1197 CALC_TYPE p_d_z = p_i_z - p_1_z[j];
1199 CALC_TYPE r2 = (p_d_x * p_d_x) + (p_d_y * p_d_y) + (p_d_z * p_d_z);
1200 if (r2 <= plcutoff2) {
1207 int exclFlags = 0x00;
1208 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1209 const int indexDiff = pExt_0[i].index - pExt_1[j].index;
1210 const int maxDiff = pExt_1[j].excl_maxdiff;
1212 const int indexDiff = pExt_0_index[i] - pExt_1_index[j];
1213 const int maxDiff = pExt_1_exclMaxDiff[j];
1215 if (indexDiff >= -1 * maxDiff && indexDiff <= maxDiff) {
1216 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1217 const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
1219 const int offset = (2 * indexDiff) + pExt_1_exclIndex[j];
1221 const int offset_major = offset / (
sizeof(
unsigned int) * 8);
1222 const int offset_minor = offset % (
sizeof(
unsigned int) * 8);
1223 exclFlags = ((params.exclusion_bits[offset_major]) >> offset_minor) & 0x03;
1228 const int plEntry = ((i & 0xFFFF) << 16) | (j & 0xFFFF);
1229 if (exclFlags == 0) {
1230 #if MIC_CONDITION_NORMAL != 0
1231 if (r2 <= normhi_split) {
1232 pairlist_norm[plOffset_norm++] = plEntry;
1234 pairlist_normhi[plOffset_normhi++] = plEntry;
1237 pairlist_norm[plOffset_norm++] = plEntry;
1239 }
else if (exclFlags == 1) {
1240 pairlist_excl[plOffset_excl++] = plEntry;
1241 }
else if (exclFlags == 2) {
1242 pairlist_mod[plOffset_mod++] = plEntry;
1248 #endif // if MIC_HANDCODE_PLGEN != 0
1251 #if __MIC_PAD_PLGEN_CTRL != 0
1253 #if MIC_HANDCODE_FORCE_SINGLE != 0
1254 const int padLen = 16;
1256 const int padLen = 8;
1258 const int padValue = (i << 16) | 0xFFFF;
1261 while (plOffset_norm % padLen != 2) { pairlist_norm[plOffset_norm++] = padValue; }
1262 while (plOffset_mod % padLen != 2) { pairlist_mod [plOffset_mod++ ] = padValue; }
1263 while (plOffset_excl % padLen != 2) { pairlist_excl[plOffset_excl++] = padValue; }
1264 #if MIC_CONDITION_NORMAL != 0
1265 while (plOffset_normhi % padLen != 2) { pairlist_normhi[plOffset_normhi++] = padValue; }
1271 #endif // if MIC_TILE_PLGEN != 0
1275 #if MIC_CONDITION_NORMAL != 0
1278 if (plOffset_norm + plOffset_normhi > pairlist_norm[0]) {
1279 int newPlAllocSize = pairlist_norm[0] + 2 * plOffset_normhi + 8;
1280 int* RESTRICT newPl = (
int*)_MM_MALLOC_WRAPPER(newPlAllocSize *
sizeof(
int) + 64, 64,
"pairlist_norm grow for conditioning");
1281 __ASSERT(newPl != NULL);
1283 memcpy(newPl, pairlist_norm, plOffset_norm *
sizeof(
int));
1284 _MM_FREE_WRAPPER(pairlist_norm - 14);
1285 pairlist_norm = newPl;
1286 pairlist_norm[0] = newPlAllocSize;
1290 for (
int i = 0; i < plOffset_normhi - 2; i++) {
1291 pairlist_norm[plOffset_norm + i] = pairlist_normhi[2 + i];
1293 plOffset_norm += plOffset_normhi - 2;
1294 plOffset_normhi = 2;
1299 pairlist_norm[1] = plOffset_norm;
1300 pairlist_mod[1] = plOffset_mod;
1301 pairlist_excl[1] = plOffset_excl;
1302 #if MIC_CONDITION_NORMAL != 0
1303 pairlist_normhi[1] = plOffset_normhi;
1307 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
1308 if (plOffset_norm > pairlist_norm[-1]) { pairlist_norm[-1] = plOffset_norm; }
1309 if (plOffset_mod > pairlist_mod[-1]) { pairlist_mod[-1] = plOffset_mod; }
1310 if (plOffset_excl > pairlist_excl[-1]) { pairlist_excl[-1] = plOffset_excl; }
1311 #if MIC_CONDITION_NORMAL != 0
1312 if (plOffset_normhi > pairlist_normhi[-1]) { pairlist_normhi[-1] = plOffset_normhi; }
1319 #if MIC_PREFETCH_DISTANCE > 0
1320 for (
int pfI = 0; pfI < MIC_PREFETCH_DISTANCE; pfI++) {
1321 pairlist_norm[plOffset_norm + pfI] = 0;
1322 pairlist_mod [plOffset_mod + pfI] = 0;
1323 pairlist_excl[plOffset_excl + pfI] = 0;
1328 #if MIC_HANDCODE_FORCE != 0 && MIC_HANDCODE_FORCE_PFDIST > 0
1329 for (
int pfI = 0; pfI < 16 * MIC_HANDCODE_FORCE_PFDIST; pfI++) {
1330 #if MIC_HANDCODE_FORCE_SINGLE != 0
1331 pairlist_norm[plOffset_norm + pfI] = -1;
1332 pairlist_mod [plOffset_mod + pfI] = -1;
1333 pairlist_excl[plOffset_excl + pfI] = -1;
1335 pairlist_norm[plOffset_norm + pfI] = 0;
1336 pairlist_mod [plOffset_mod + pfI] = 0;
1337 pairlist_excl[plOffset_excl + pfI] = 0;
1344 params.pairlists_ptr[PL_NORM_INDEX] = pairlist_norm;
1345 params.pairlists_ptr[ PL_MOD_INDEX] = pairlist_mod;
1346 params.pairlists_ptr[PL_EXCL_INDEX] = pairlist_excl;
1347 #if MIC_CONDITION_NORMAL != 0
1348 params.pairlists_ptr[PL_NORMHI_INDEX] = pairlist_normhi;
1353 #undef PAIRLIST_ALLOC_CHECK
1354 #undef PAIRLIST_GROW_CHECK
1360 double vdwEnergy = 0;
1374 double fullElectEnergy = 0;
1376 double fullElectVirial_xx = 0;
1377 double fullElectVirial_xy = 0;
1378 double fullElectVirial_xz = 0;
1379 double fullElectVirial_yy = 0;
1380 double fullElectVirial_yz = 0;
1381 double fullElectVirial_zz = 0;
1385 DEVICE_FPRINTF(
"N");
1386 #define PAIRLIST pairlist_norm
1397 DEVICE_FPRINTF(
"M");
1398 #define PAIRLIST pairlist_mod
1401 #define MODIFIED(X) X
1409 #if defined(FULLELECT)
1411 DEVICE_FPRINTF(
"E");
1412 #define PAIRLIST pairlist_excl
1416 #define EXCLUDED(X) X
1430 #else // defined(FULLELECT)
1435 DEVICE_FPRINTF(
"e");
1436 #if MIC_EXCL_CHECKSUM_FULL != 0
1438 #if __MIC_PAD_PLGEN_CTRL != 0
1441 const int *
const pairlist_excl_base = pairlist_excl + 2;
1442 const int pairlist_excl_len = pairlist_excl[1] - 2;
1444 __ASSUME_ALIGNED(pairlist_excl_base);
1445 #pragma omp simd reduction(+ : exclSum)
1446 for (
int plI = 0; plI < pairlist_excl_len; plI++) {
1447 if ((pairlist_excl_base[plI] & 0xFFFF) != 0xFFFF) {
1451 params.exclusionSum += exclSum;
1453 params.exclusionSum += pairlist_excl[1] - 2;
1458 #endif // defined(FULLELECT)
1461 #if (0 SHORT( FAST( SELF(+1))))
1462 for (
int i = 0; i < i_upper; i++) {
1463 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1464 virial_xx += f_0[i].x * (p_0[i].x + params.patch1_center_x);
1465 virial_xy += f_0[i].x * (p_0[i].y + params.patch1_center_y);
1466 virial_xz += f_0[i].x * (p_0[i].z + params.patch1_center_z);
1467 virial_yy += f_0[i].y * (p_0[i].y + params.patch1_center_y);
1468 virial_yz += f_0[i].y * (p_0[i].z + params.patch1_center_z);
1469 virial_zz += f_0[i].z * (p_0[i].z + params.patch1_center_z);
1471 virial_xx += f_0_x[i] * (p_0_x[i] + params.patch1_center_x);
1472 virial_xy += f_0_x[i] * (p_0_y[i] + params.patch1_center_y);
1473 virial_xz += f_0_x[i] * (p_0_z[i] + params.patch1_center_z);
1474 virial_yy += f_0_y[i] * (p_0_y[i] + params.patch1_center_y);
1475 virial_yz += f_0_y[i] * (p_0_z[i] + params.patch1_center_z);
1476 virial_zz += f_0_z[i] * (p_0_z[i] + params.patch1_center_z);
1480 #if (0 FULL( SELF(+1)))
1481 for (
int i = 0; i < i_upper; i++) {
1482 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1483 fullElectVirial_xx += fullf_0[i].x * (p_0[i].x + params.patch1_center_x);
1484 fullElectVirial_xy += fullf_0[i].x * (p_0[i].y + params.patch1_center_y);
1485 fullElectVirial_xz += fullf_0[i].x * (p_0[i].z + params.patch1_center_z);
1486 fullElectVirial_yy += fullf_0[i].y * (p_0[i].y + params.patch1_center_y);
1487 fullElectVirial_yz += fullf_0[i].y * (p_0[i].z + params.patch1_center_z);
1488 fullElectVirial_zz += fullf_0[i].z * (p_0[i].z + params.patch1_center_z);
1490 fullElectVirial_xx += fullf_0_x[i] * (p_0_x[i] + params.patch1_center_x);
1491 fullElectVirial_xy += fullf_0_x[i] * (p_0_y[i] + params.patch1_center_y);
1492 fullElectVirial_xz += fullf_0_x[i] * (p_0_z[i] + params.patch1_center_z);
1493 fullElectVirial_yy += fullf_0_y[i] * (p_0_y[i] + params.patch1_center_y);
1494 fullElectVirial_yz += fullf_0_y[i] * (p_0_z[i] + params.patch1_center_z);
1495 fullElectVirial_zz += fullf_0_z[i] * (p_0_z[i] + params.patch1_center_z);
1500 FAST(
SHORT( params.virial_xx = virial_xx; ) )
1501 FAST(
SHORT( params.virial_xy = virial_xy; ) )
1502 FAST(
SHORT( params.virial_xz = virial_xz; ) )
1503 FAST(
SHORT( params.virial_yy = virial_yy; ) )
1504 FAST(
SHORT( params.virial_yz = virial_yz; ) )
1505 FAST(
SHORT( params.virial_zz = virial_zz; ) )
1506 FULL( params.fullElectVirial_xx = fullElectVirial_xx; )
1507 FULL( params.fullElectVirial_xy = fullElectVirial_xy; )
1508 FULL( params.fullElectVirial_xz = fullElectVirial_xz; )
1509 FULL( params.fullElectVirial_yy = fullElectVirial_yy; )
1510 FULL( params.fullElectVirial_yz = fullElectVirial_yz; )
1511 FULL( params.fullElectVirial_zz = fullElectVirial_zz; )
1512 FAST(
ENERGY( params.vdwEnergy = vdwEnergy; ) )
1514 FULL(
ENERGY( params.fullElectEnergy = fullElectEnergy; ) )
1522 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1536 #undef p_1_exclIndex
1537 #undef p_1_exclMaxDiff
1551 #undef __MIC_PAD_PLGEN_CTRL
register BigReal virial_xy
register BigReal virial_xz
register BigReal virial_yz
static __global__ void(const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const int *vdw_types, unsigned int *plist, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials, unsigned int *global_counters, int *force_ready_queue, const unsigned int *overflow_exclusions, const int npatches, const int block_begin, const int total_block_count, int *block_order, exclmask *exclmasks, const int lj_table_size, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const float plcutoff2, const int doSlow)
register BigReal electEnergy
if(ComputeNonbondedUtil::goMethod==2)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
register BigReal virial_yy
register BigReal virial_zz
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
register BigReal virial_xx