#include "DpcppComputeNonbondedKernel.h"
#include "DpcppTileListKernel.h"
#include "DpcppDevice.h"
#include "DpcppUtils.h"
#include "NamdTypes.h"
#include <cmath>
#include <algorithm>

// #include <sycl/ext/intel/experimental/kernel_properties.hpp>
// using namespace sycl::ext::intel::experimental;

#if defined(BACKUP) || defined(LOAD)
#include "boost/archive/text_oarchive.hpp"
#include "boost/archive/text_iarchive.hpp"
#endif

#ifdef NAMD_DPCPP
#include <CL/sycl.hpp>

// #define DPCPP_FORCES_SLM
// #define DPCPP_FORCES_EXLM

#if defined(BACKUP) || defined(LOAD)
namespace boost::serialization {

// Boost.Serialization hook for sycl::float3: passes the x, y and z
// components to the archive, in that order. The version argument is not
// consulted.
template<class Archive>
void serialize(Archive & ar, sycl::float3 & f, const unsigned int version)
{
  ar & f.x() & f.y() & f.z();
}

} // namespace boost::serialization
#endif

using namespace sycl;

#ifdef WIN32
#define __thread __declspec(thread)
#endif

extern __thread DpcppDevice *dpcppDevice;
#define ROTATE_UP(data, i) sg.shuffle_down(data,i)
// #define ROTATE_UP(data, i) select(sg.shuffle_up(data, AVXSIZE-i), sg.shuffle_down(data,i), (int)((AVXSIZE-i-sg.get_local_linear_id())>=0))

#define OVERALLOC 1.2f

void NAMD_die(const char *);
// Number of sub-groups ("warps") per work-group used by the nonbonded kernel.
// NOTE(review): the original remark reads "4 will be 128, 16 will be 512",
// which presumably means work-items per work-group for AVXSIZE == 32 -- confirm.
#ifdef TEST_SLMENERGYTABLE
  #define NONBONDKERNEL_NUM_WARP 32
#else
  #define NONBONDKERNEL_NUM_WARP 4
#endif

// Element count of the per-work-group local arrays declared under
// DPCPP_FORCES_EXLM. Parenthesized so the product stays a single operand when
// the macro is used inside a larger expression (e.g. n / EXLMSIZE).
#define EXLMSIZE (NONBONDKERNEL_NUM_WARP*AVXSIZE)

#ifdef TEST_ENERGYMATH

/*
 * Approximation of erfc taken from Abramowitz and Stegun.
 *
 * DMC measured this as slower than the rational approximator because it
 * needs an exponential and a reciprocal evaluation. It is still required for
 * energy evaluation, since pmeApprox folds all of the PME correction terms
 * together and therefore cannot supply the energy on its own.
 *
 * Thanks to OpenMM for pointing out this approximation.
 *
 * t2 : the square of the argument (t * t)
 * t  : the argument itself
 * Returns the approximate value of erfc(t).
 */
NAMD_INLINE float erfcApprox(const float t2, const float t) {
  const float p  =  0.3275911f;
  // Polynomial coefficients, lowest order first.
  const float a1 =  0.254829592f;
  const float a2 = -0.284496736f;
  const float a3 =  1.421413741f;
  const float a4 = -1.453152027f;
  const float a5 =  1.061405429f;

  const float z     = sycl::native::recip(1.0f + p * t);
  const float gauss = sycl::exp(-t2);

  // Horner evaluation of a1 + a2*z + a3*z^2 + a4*z^3 + a5*z^4.
  float poly = sycl::fma(z, a5, a4);
  poly = sycl::fma(z, poly, a3);
  poly = sycl::fma(z, poly, a2);
  poly = sycl::fma(z, poly, a1);

  return poly * gauss * z;
}

/*
 * Rational approximation of the PME correction term
 *
 *   f(t^2) = erf(t)/t^3 - 2/sqrt(pi) * exp(-t^2) / t^2,   t = r * ewaldcof
 *
 * evaluated as P(t^2) / Q(t^2), with P of degree 6 and Q of degree 4.
 *
 * The polynomial coefficients were generated with Boost's Remez algorithm
 * implementation. The fit gives reasonable values for t from about 0.12 to 3;
 * beyond 3 it starts to overestimate.
 *
 * This approximation is not production ready, but it is representative of
 * the achievable performance. The methodology was taken from GROMACS.
 *
 * t2 : the squared argument, t^2
 */
NAMD_INLINE float pmeApprox(const float t2) {
  // Numerator coefficients, ascending powers of t2.
  const float num0 =  7.52252777892e-01;
  const float num1 = -1.40866053590e-01;
  const float num2 =  2.94127777731e-02;
  const float num3 = -3.01958652118e-03;
  const float num4 =  2.16664879423e-04;
  const float num5 = -9.35168833829e-06;
  const float num6 =  1.88658974667e-07;

  // Denominator coefficients, ascending powers of t2.
  const float den0 = 1.00000000000e+00;
  const float den1 = 4.12741061719e-01;
  const float den2 = 7.24585411593e-02;
  const float den3 = 6.57202856872e-03;
  const float den4 = 2.70980253544e-04;

  // Horner evaluation of Q(t2), followed by its reciprocal.
  float q = sycl::fma(t2, den4, den3);
  q = sycl::fma(t2, q, den2);
  q = sycl::fma(t2, q, den1);
  q = sycl::fma(t2, q, den0);
  const float qInv = 1.0f / q;

  // Horner evaluation of P(t2).
  float p = sycl::fma(t2, num6, num5);
  p = sycl::fma(t2, p, num4);
  p = sycl::fma(t2, p, num3);
  p = sycl::fma(t2, p, num2);
  p = sycl::fma(t2, p, num1);
  p = sycl::fma(t2, p, num0);

  return p * qInv;
}

/// Precomputed scalar coefficients consumed by calcForceEnergyMath().
/// The formula next to each member is the value the member is expected to
/// hold; the member order is part of the layout and must not change.
struct nonbonded_coef {
  // --- Lennard-Jones switching ---
  float lj_0;          ///< denom * cutoff2 - 3.0f * switch2 * denom
  float lj_1;          ///< denom * 2.0f
  float lj_2;          ///< denom * -12.0f
  float lj_3;          ///< denom *  12.0f * switch2
  float lj_4;          ///< cutoff2
  float lj_5;          ///< switch2

  // --- Electrostatics ---
  float e_0;           ///< roff3Inv
  float e_0_slow;      ///< roff3Inv * (1 - slowScale)
  float e_1;           ///< roff2Inv
  float e_2;           ///< roffInv

  // --- Ewald / PME ---
  float ewald_0;       ///< ewaldcof
  float ewald_1;       ///< pi_ewaldcof
  float ewald_2;       ///< ewaldcof ^ 2
  float ewald_3_slow;  ///< ewaldcof ^ 3 * slowScale
};
#endif

#ifdef TEST_PRELOADL0L1
// Pair force/energy evaluation for callers that have already fetched the two
// force-table entries bracketing the lookup position.
//
//   r2, dxyz          squared separation and separation vector of the pair
//   qi, qj            charges of the two atoms
//   vdwtypei/j        their sum indexes vdwCoefTable
//   d_energyTable     energy table, interpolated here when doEnergy is set
//   t0, t1, a         pre-loaded force-table entries and interpolation weight;
//                     the force entry is a * (t1 - t0) + t0
//   iforce/jforce     receive +/- the fast force
//   iforceSlow/jforceSlow  receive +/- the slow force when doSlow is set
//   energyVdw/Elec/Slow    energy accumulators (doEnergy only; Slow also
//                          requires doSlow)
template<bool doEnergy, bool doSlow>
NAMD_INLINE void calcForceEnergyL0L1(const float r2, const float qi, const float qj,
  const float3 dxyz, 
  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
  // float4* d_forceTable, float4* d_energyTable,     // jhr
  float4* d_energyTable,
  float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
  float& energyVdw, float& energyElec, float& energySlow,
  float4 t0, float4 t1, float a) {

  float2 ljab = vdwCoefTable[vdwtypej + vdwtypei];
  const float qiqj = qi * qj;

  if constexpr (doEnergy) {
    // Energies are the only consumer of 1/r in this variant.
    const float rinv = rsqrt(r2);
    float4 ei = linear_interp_cudata_float4(d_energyTable, rinv /*feTableSize*/);
    energyVdw  += ljab.x() * ei.z() + ljab.y() * ei.y();
    energyElec += qiqj * ei.x();
    if constexpr (doSlow) {
      energySlow += qiqj * ei.w();
    }
  }

  // Force-table entry rebuilt from the pre-loaded neighbours.
  float4 fi = a * (t1 - t0) + t0;
  const float f = ljab.x() * fi.z() + ljab.y() * fi.y() + qiqj * fi.x();

  const float3 fxyz = dxyz * f;
  iforce += fxyz;
  jforce -= fxyz;

  if constexpr (doSlow) {
    const float fSlow = qiqj * fi.w();
    const float3 fxyzSlow = dxyz * fSlow;
    iforceSlow += fxyzSlow;
    jforceSlow -= fxyzSlow;
  }
}
#endif

// Table-driven pair force/energy evaluation.
//
//   r2, dxyz          squared separation and separation vector of the pair
//   qi, qj            charges of the two atoms
//   vdwtypei/j        their sum indexes vdwCoefTable
//   d_forceTable      force table, interpolated at 1/sqrt(r2)
//   d_energyTable     energy table, interpolated only when doEnergy is set
//   iforce/jforce     receive +/- the fast force
//   iforceSlow/jforceSlow  receive +/- the slow force when doSlow is set
//   energyVdw/Elec/Slow    energy accumulators (doEnergy only; Slow also
//                          requires doSlow)
template<bool doEnergy, bool doSlow>
NAMD_INLINE void calcForceEnergy(const float r2, const float qi, const float qj,
  const float3 dxyz, 
  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
  float4* d_forceTable, float4* d_energyTable,     // jhr
  /*int feTableSize, int vdwCoefTableSize, */           // jhr
  float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
  float& energyVdw, float& energyElec, float& energySlow) {

  float2 ljab = vdwCoefTable[vdwtypej + vdwtypei];
  const float rinv = rsqrt(r2);
  const float qiqj = qi * qj;

  if constexpr (doEnergy) {
    float4 ei = linear_interp_cudata_float4(d_energyTable, rinv /*feTableSize*/);
    energyVdw  += ljab.x() * ei.z() + ljab.y() * ei.y();
    energyElec += qiqj * ei.x();
    if constexpr (doSlow) {
      energySlow += qiqj * ei.w();
    }
  }

  float4 fi = linear_interp_cudata_float4(d_forceTable, rinv/*, feTableSize*/);
  const float f = ljab.x() * fi.z() + ljab.y() * fi.y() + qiqj * fi.x();

  // Equal and opposite contributions for the two atoms of the pair.
  const float3 fxyz = dxyz * f;
  iforce += fxyz;
  jforce -= fxyz;

  if constexpr (doSlow) {
    const float fSlow = qiqj * fi.w();
    const float3 fxyzSlow = dxyz * fSlow;
    iforceSlow += fxyzSlow;
    jforceSlow -= fxyzSlow;
  }
}

#ifdef TEST_ENERGYMATH
// Analytic (table-free) pair force/energy evaluation.
//
//   r2, dxyz          squared separation and separation vector of the pair
//   qi, qj            charges of the two atoms
//   vdwtypei/j        their sum indexes vdwCoefTable
//   iforce/jforce     receive +/- the combined force
//   energyVdw         accumulated when doEnergy is set
//   energyElec        accumulated when doEnergy is set
//   energySlow        accumulated only when doEnergy, doSlow and APPROX_PME
//                     are all in effect
//   c                 precomputed coefficients (see nonbonded_coef)
//
// iforceSlow, jforceSlow and slowScale are accepted for signature parity with
// the table-based variants but are not referenced here.
//
// NOTE(review): with doSlow set and APPROX_PME undefined there is no
// electrostatic/slow implementation at all; the force applied in that
// configuration is zero (see the initialisation of f below).
template<bool doEnergy, bool doSlow>
NAMD_INLINE void calcForceEnergyMath(const float r2, const float qi, const float qj,
  const float3 dxyz, /*const float dx, const float dy, const float dz,*/
  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
  // cudaTextureObject_t vdwCoefTableTex, 
  // cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
  // float4* d_forceTable, float4* d_energyTable,
  float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
  float& energyVdw, float& energyElec, float& energySlow,
  const nonbonded_coef c, const float slowScale
) {
  int vdwIndex = vdwtypej + vdwtypei;
  float2 ljab = vdwCoefTable[vdwIndex];
  // float2 ljab={0,0};
  float rinv = rsqrt(r2);
  // Zero-initialised on purpose: when doSlow is set and APPROX_PME is not
  // defined, no branch below assigns f, and dxyz would otherwise be scaled by
  // an indeterminate value.
  float f = 0.0f;
 
  float charge = qi * qj;

  const float rinv2 = rinv * rinv;
  const float rinv3 = rinv * rinv2;
  const float rinv6 = rinv3 * rinv3;
  const float rinv8 = rinv6 * rinv2;

  // VDW force
  const float ab_r6 = ljab.x() * rinv6 - ljab.y();
  const float w = ab_r6 * rinv6;
  const float dw_r = (ljab.x() * rinv6 + ab_r6) * -6.0f * rinv8;
  float f_vdw;
  float e_vdw = 0.0f;

  if (r2 > c.lj_5) {
    // Beyond switch2 (c.lj_5): apply the switching function and its derivative.
    const float delta_r = (c.lj_4 - r2);
    const float s = delta_r * delta_r * (c.lj_0 + c.lj_1 * r2);
    const float ds_r = delta_r * (c.lj_3 + c.lj_2 * r2);
    f_vdw = w * ds_r + dw_r * s;
    if (doEnergy) e_vdw = w * s;
  } else {
    f_vdw = dw_r;
    if (doEnergy) e_vdw = w;
  }
  if (doEnergy) energyVdw += e_vdw;

  if (!doSlow) {
    const float elec_fast = -1.0f * rinv3 + c.e_0;
    const float f_elec = charge * elec_fast;

    if (doEnergy) {
      float slow_energy = 0.5f * c.e_2 * (3.0f - r2 * c.e_1);
      float fast_energy = rinv - slow_energy;
      energyElec += charge * fast_energy;
    }

    f = f_elec + f_vdw;
  } else {
#ifdef APPROX_PME
    const float elec_fast = -1.0f * rinv3 + c.e_0_slow;
    float pme_term = pmeApprox(c.ewald_2 * r2);

    if (doEnergy) {
      float slow_energy = 0.5f * c.e_2 * (3.0f - r2 * c.e_1);
      float fast_energy = rinv - slow_energy;
      energyElec += charge * fast_energy;

      const float r = sqrtf(r2);
      const float elec_t = r * c.ewald_0;
      const float elec_t2 = r2 * c.ewald_2;

      const float corr_energy = erfcApprox(elec_t2, elec_t);
      const float scor_energy = slow_energy + (corr_energy - 1.0f) * rinv;
      energySlow += charge * scor_energy;
    }
    const float f_elec = charge * (elec_fast + pme_term * c.ewald_3_slow);
    f = f_elec + f_vdw;
#else
    // TODO: no exact (non-approximated) PME path is implemented; f keeps its
    // zero initial value so no force is applied for this pair.
#endif
  }
  // Equal and opposite contributions for the two atoms of the pair.
  float3 fxyz;
  fxyz = dxyz *f;
  iforce += fxyz;
  jforce -= fxyz;
}
#endif

// Adds one atom's force to entry `pos` of a float4 force array through
// ATOMIC_ADD_FLOAT (defined elsewhere), one component at a time. Only the
// x, y and z components are touched. The slow-force array is updated the
// same way, but only when doSlow is set.
template<bool doSlow>
NAMD_INLINE void storeForces(const int pos, const float3 force, const float3 forceSlow,
  float4* __restrict__ devForces, float4* __restrict__ devForcesSlow) {

  // Fast force
  ATOMIC_ADD_FLOAT(devForces[pos].x(), force.x());
  ATOMIC_ADD_FLOAT(devForces[pos].y(), force.y());
  ATOMIC_ADD_FLOAT(devForces[pos].z(), force.z());

  // Slow force
  if constexpr (doSlow) {
    ATOMIC_ADD_FLOAT(devForcesSlow[pos].x(), forceSlow.x());
    ATOMIC_ADD_FLOAT(devForcesSlow[pos].y(), forceSlow.y());
    ATOMIC_ADD_FLOAT(devForcesSlow[pos].z(), forceSlow.z());
  }
}

// Structure-of-arrays overload: adds one atom's force to element `pos` of
// separate x, y and z arrays through ATOMIC_ADD_FLOAT (defined elsewhere).
// The three slow-force arrays are updated only when doSlow is set.
template<bool doSlow>
NAMD_INLINE void storeForces(const int pos, const float3 force, const float3 forceSlow,
                 float* __restrict__ devForces_x, 
                 float* __restrict__ devForces_y, 
                 float* __restrict__ devForces_z,
                 float* __restrict__ devForcesSlow_x, 
                 float* __restrict__ devForcesSlow_y, 
                 float* __restrict__ devForcesSlow_z)
{
  // Fast force
  ATOMIC_ADD_FLOAT(devForces_x[pos], force.x());
  ATOMIC_ADD_FLOAT(devForces_y[pos], force.y());
  ATOMIC_ADD_FLOAT(devForces_z[pos], force.z());

  // Slow force
  if constexpr (doSlow) {
    ATOMIC_ADD_FLOAT(devForcesSlow_x[pos], forceSlow.x());
    ATOMIC_ADD_FLOAT(devForcesSlow_y[pos], forceSlow.y());
    ATOMIC_ADD_FLOAT(devForcesSlow_z[pos], forceSlow.z());
  }
}

// float3-array overload: adds one atom's force to entry `pos` of `forces`
// through ATOMIC_ADD_FLOAT (defined elsewhere), one component at a time.
// `forcesSlow` is updated the same way, but only when doSlow is set.
template<bool doSlow>
NAMD_INLINE void storeForces(const int pos, const float3 force, const float3 forceSlow,
  float3* __restrict__ forces, float3* __restrict__ forcesSlow) {

  // Fast force
  ATOMIC_ADD_FLOAT(forces[pos].x(), force.x());
  ATOMIC_ADD_FLOAT(forces[pos].y(), force.y());
  ATOMIC_ADD_FLOAT(forces[pos].z(), force.z());

  // Slow force
  if constexpr (doSlow) {
    ATOMIC_ADD_FLOAT(forcesSlow[pos].x(), forceSlow.x());
    ATOMIC_ADD_FLOAT(forcesSlow[pos].y(), forceSlow.y());
    ATOMIC_ADD_FLOAT(forcesSlow[pos].z(), forceSlow.z());
  }
}

// Advances the per-lane j-atom data by one position within the sub-group
// using ROTATE_UP. The charge and vdW type always move; the atom index and
// the exclusion index / maxdiff move only when doPairlist is set.
template<bool doPairlist>
NAMD_INLINE void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, int& jexclMaxdiff, int& jexclIndex,nd_item<3> item_ct1) {
  // The local must be called `sg`: the ROTATE_UP macro refers to it by name.
  auto sg = item_ct1.get_sub_group();

  xyzq_j_w = ROTATE_UP(xyzq_j_w, 1);
  vdwtypej = ROTATE_UP(vdwtypej, 1);

  if constexpr (doPairlist) {
    jatomIndex   = ROTATE_UP(jatomIndex, 1);
    jexclIndex   = ROTATE_UP(jexclIndex, 1);
    jexclMaxdiff = ROTATE_UP(jexclMaxdiff, 1);
  }
}

// Advances the per-lane j-atom data by one position within the sub-group
// using ROTATE_UP. The charge and vdW type always move; the atom index moves
// only when doPairlist is set.
template<bool doPairlist>
NAMD_INLINE void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, nd_item<3> item_ct1) {
  // The local must be called `sg`: the ROTATE_UP macro refers to it by name.
  auto sg = item_ct1.get_sub_group();

  xyzq_j_w = ROTATE_UP(xyzq_j_w, 1);
  vdwtypej = ROTATE_UP(vdwtypej, 1);

  if constexpr (doPairlist) {
    jatomIndex = ROTATE_UP(jatomIndex, 1);
  }
}

// Advances the per-lane j-force accumulators by one position within the
// sub-group using ROTATE_UP, component by component. The slow-force
// accumulator moves only when doSlow is set.
template<bool doSlow>
NAMD_INLINE void shuffleNext(float3& jforce, float3& jforceSlow, nd_item<3> item_ct1) {
  // The local must be called `sg`: the ROTATE_UP macro refers to it by name.
  auto sg = item_ct1.get_sub_group();

  // Fast force
  jforce.x() = ROTATE_UP(jforce.x(), 1);
  jforce.y() = ROTATE_UP(jforce.y(), 1);
  jforce.z() = ROTATE_UP(jforce.z(), 1);

  // Slow force
  if constexpr (doSlow) {
    jforceSlow.x() = ROTATE_UP(jforceSlow.x(), 1);
    jforceSlow.y() = ROTATE_UP(jforceSlow.y(), 1);
    jforceSlow.z() = ROTATE_UP(jforceSlow.z(), 1);
  }
}

//
// Returns a lower estimate of the squared distance between bounding box `a`
// and the position stored in b.x/y/z (b.w is not read). Along each axis the
// gap is |a.xyz - b| - a.wxyz, clamped at zero; a.xyz and a.wxyz are
// presumably the box centre and half-widths -- confirm against BoundingBox.
//
NAMD_INLINE float distsq(const BoundingBox a, const float4 b) {

  const float gapX = max(0.0f, std::fabs(a.xyz.x() - b.x()) - a.wxyz.x());
  const float gapY = max(0.0f, std::fabs(a.xyz.y() - b.y()) - a.wxyz.y());
  const float gapZ = max(0.0f, std::fabs(a.xyz.z() - b.z()) - a.wxyz.z());

  return gapX*gapX + gapY*gapY + gapZ*gapZ;
}

#define LARGE_FLOAT (float)(1.0e10)

//
// Nonbonded force kernel
//
template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
 void
nonbondedForceKernel(
  const int start,
  int numTileLists,
  const TileList* __restrict__ tileLists,
  TileExcl* __restrict__ tileExcls,
  const int* __restrict__ tileJatomStart,
  const int vdwCoefTableWidth,
  const float2* __restrict__ vdwCoefTable,
  const int* __restrict__ vdwTypes,
  const float3 lata,
  const float3 latb,
  const float3 latc,
  const float4* __restrict__ xyzq,
  const float cutoff2,
  #ifdef TEST_ENERGYMATH
  const nonbonded_coef c,
  const float slowScale,
  #endif
  #ifndef TEST_ENERGYMATH
  float4* d_forceTable,
  float4* d_energyTable,
  #endif
  // int feTableSize,
  // int vdwCoefTableSize,
  const int atomStorageSize,
  const float plcutoff2,
  const PatchPairRecord* __restrict__ patchPairs,
  const int* __restrict__ atomIndex,
  const int2* __restrict__ exclIndexMaxDiff,
  const unsigned int* __restrict__ overflowExclusions,
  unsigned int* __restrict__ tileListDepth,
  int* __restrict__ tileListOrder,
  int* __restrict__ jtiles,
  TileListStat* __restrict__ tileListStat,
  const BoundingBox* __restrict__ boundingBoxes,
  float4* __restrict__ devForces,
  float4* __restrict__ devForcesSlow,
  float * __restrict__ devForce_x,
  float * __restrict__ devForce_y,
  float * __restrict__ devForce_z,
  float * __restrict__ devForce_w,
  float * __restrict__ devForceSlow_x,
  float * __restrict__ devForceSlow_y,
  float * __restrict__ devForceSlow_z,
  float * __restrict__ devForceSlow_w,
  const int numPatches,
  unsigned int* __restrict__ patchNumCount,
  const DpcppPatchRecord* __restrict__ dpcppPatches,
  float4* __restrict__ mapForces,
  float4* __restrict__ mapForcesSlow,
  int* __restrict__ mapPatchReadyQueue,
  int* __restrict__ outputOrder,
  TileListVirialEnergy* __restrict__ virialEnergy,
  unsigned int* constExclusions,
  queue &myQ,
  int rangeSize,
  int workGroupSize) {
  myQ.submit(
    [&](handler &cgh) {

#ifdef DPCPP_FORCES_SLM
    sycl::accessor<sycl::float4, 2, sycl::access_mode::read_write,sycl::access::target::local>
        s_xyzq(sycl::range<2>(NONBONDKERNEL_NUM_WARP, AVXSIZE), cgh);
    sycl::accessor<int, 2, sycl::access_mode::read_write,sycl::access::target::local>
        s_vdwtypej(sycl::range<2>(NONBONDKERNEL_NUM_WARP, AVXSIZE), cgh);
    sycl::accessor<sycl::float3, 2, sycl::access_mode::read_write,sycl::access::target::local>
        s_jforce(sycl::range<2>(NONBONDKERNEL_NUM_WARP, AVXSIZE), cgh);
    sycl::accessor<sycl::float3, 2, sycl::access_mode::read_write,sycl::access::target::local>
        s_jforceSlow(sycl::range<2>(NONBONDKERNEL_NUM_WARP, AVXSIZE), cgh);
    sycl::accessor<int, 2, sycl::access_mode::read_write, sycl::access::target::local>
        s_jatomIndex(sycl::range<2>(NONBONDKERNEL_NUM_WARP, AVXSIZE), cgh);
#endif

#ifdef TEST_SLMENERGYTABLE
    int nWG=(FETABLESIZE+workGroupSize-1)/workGroupSize;
    sycl::accessor<sycl::float4, 1, sycl::access_mode::read_write,sycl::access::target::local>
        s_d_forceTable(sycl::range<1>(FETABLESIZE), cgh);
#endif

    cgh.parallel_for(
      nd_range<3>(range<3>(1, 1, rangeSize), range<3>(1, 1, workGroupSize)),
      [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(AVXSIZE)]] {
      
#ifdef DPCPP_FORCES_EXLM
      multi_ptr<sycl::float4[EXLMSIZE], access::address_space::local_space> s_xyzq_ptr = group_local_memory<sycl::float4[EXLMSIZE]>(item_ct1.get_group());
      auto& s_xyzq_ref = *s_xyzq_ptr;
      multi_ptr<int[EXLMSIZE], access::address_space::local_space> s_vdwtypej_ptr = group_local_memory<int[EXLMSIZE]>(item_ct1.get_group());
      auto& s_vdwtypej_ref = *s_vdwtypej_ptr;
      multi_ptr<sycl::float3[EXLMSIZE], access::address_space::local_space> s_jforce_ptr = group_local_memory<sycl::float3[EXLMSIZE]>(item_ct1.get_group());
      auto& s_jforce_ref = *s_jforce_ptr;
      multi_ptr<sycl::float3[EXLMSIZE], access::address_space::local_space> s_jforceSlow_ptr = group_local_memory<sycl::float3[EXLMSIZE]>(item_ct1.get_group());
      auto& s_jforceSlow_ref = *s_jforceSlow_ptr;
      multi_ptr<int[EXLMSIZE], access::address_space::local_space> s_jatomIndex_ptr = group_local_memory<int[EXLMSIZE]>(item_ct1.get_group());
      auto& s_jatomIndex_ref = *s_jatomIndex_ptr;
#endif

  auto sg = item_ct1.get_sub_group();
  // Single sub-group takes care of one list of tiles
  const int tid = item_ct1.get_local_id(2);
  const int gid = item_ct1.get_global_id(2);

  const int wid = sg.get_local_id();
  const int sgid = sg.get_group_id();

#ifdef TEST_SLMENERGYTABLE
  auto s_d_forceTable_ptr=s_d_forceTable.get_pointer().get();
  for(int i=0; i<nWG; i++)
  {
    s_d_forceTable_ptr[i*workGroupSize+tid]=d_forceTable[i*workGroupSize+tid];
    // s_d_forceTable[i*workGroupSize+tid]=d_forceTable[i*workGroupSize+tid];
  }
    item_ct1.barrier(sycl::access::fence_space::local_space);
#endif

  int itileList = start + item_ct1.get_group().get_id(2) * (workGroupSize / AVXSIZE) + sg.get_group_linear_id();
  if (itileList < numTileLists)
  {
    float3 iforce;
    float3 iforceSlow;
    float energyVdw, energyElec, energySlow;
    int nexcluded;
    unsigned int itileListLen;
    int2 patchInd;
    int2 patchNumList;

    // Start computation
    {
      // Warp index (0...warpsize-1)
      const int wid = item_ct1.get_local_id(2) % AVXSIZE;

      TileList tmp = tileLists[itileList];
      int iatomStart = tmp.iatomStart;
      int jtileStart = tmp.jtileStart;
      int jtileEnd   = tmp.jtileEnd;

      float3 sh;
      sh= tmp.offsetXYZ.x() * lata + tmp.offsetXYZ.y() * latb + tmp.offsetXYZ.z() * latc;
      // DH - set zeroShift flag if magnitude of shift vector is zero
      bool zeroShift = !(sycl::dot(sh, sh) > 0.0f);

      int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
      if (doPairlist) {
        PatchPairRecord PPStmp = patchPairs[itileList];
        iatomSize     = PPStmp.iatomSize;
        iatomFreeSize = PPStmp.iatomFreeSize;
        jatomSize     = PPStmp.jatomSize;
        jatomFreeSize = PPStmp.jatomFreeSize;
      }

      // Write to global memory here to avoid register spilling
      if (doVirial) {
        if (wid == 0) {
          virialEnergy[itileList].sh = sh;
        }
      }

      // Load i-atom data (and shift coordinates)
      float4 xyzq_i = xyzq[iatomStart + wid];

      // float x_i=xyzq[iatomStart + wid].x();
      // float y_i=xyzq[iatomStart + wid].y();
      // float z_i=xyzq[iatomStart + wid].z();

      // float x_i=xyzq[iatomStart + wid];
      // float x_i= sg.load((sycl::multi_ptr<int, sycl::access::address_space::global_space>)(int *)&(xyzq[iatomStart + wid])); block_load

#ifndef DPCPP_BLOCK_LOAD
      int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
#else
      int vdwtypei = sg.load((sycl::multi_ptr<int, sycl::access::address_space::global_space>)(int *)&(vdwTypes[iatomStart]));
      vdwtypei *= vdwCoefTableWidth;
#endif
      xyzq_i.x() += sh.x();
      xyzq_i.y() += sh.y();
      xyzq_i.z() += sh.z();

      // Load i-atom data (and shift coordinates)
      BoundingBox boundingBoxI;
      if (doPairlist) {
        boundingBoxI = boundingBoxes[iatomStart/AVXSIZE];
        boundingBoxI.xyz += sh;
      }

      // Get i-atom global index
      int iatomIndex;
      if (doPairlist) {
#ifndef DPCPP_BLOCK_LOAD
      iatomIndex = atomIndex[iatomStart + wid];
#else
      iatomIndex = sg.load((sycl::multi_ptr<int, sycl::access::address_space::global_space>)(int *)&(atomIndex[iatomStart]));
#endif

      }

      // i-forces in registers
      iforce = float3(0.0f);

      // float3 iforceSlow;
      if (doSlow) {
        iforceSlow = float3(0.0f);
      }

      // float energyVdw, energyElec, energySlow;
      if (doEnergy) {
        energyVdw = 0.0f;
        energyElec = 0.0f;
        if (doSlow) energySlow = 0.0f;
      }

      // Number of exclusions
      // NOTE: Lowest bit is used as indicator bit for tile pairs:
      //       bit 0 tile has no atoms within pairlist cutoff
      //       bit 1 tile has atoms within pairlist cutoff
      // int nexcluded;
      if (doPairlist) nexcluded = 0;

      // Number of i loops and free atoms
      int nfreei;
      if (doPairlist) {
        int nloopi = min((int)(iatomSize - iatomStart), AVXSIZE);
        nfreei = max(iatomFreeSize - iatomStart, 0);
        if (wid >= nloopi) {
          xyzq_i.x() = -LARGE_FLOAT;
          xyzq_i.y() = -LARGE_FLOAT;
          xyzq_i.z() = -LARGE_FLOAT;
        }
      }

      // tile list stuff
      // int itileListLen;
      // int minJatomStart;
      if (doPairlist) {
        itileListLen = 0;
      }

      // Exclusion index and maxdiff
      int iexclIndex, iexclMaxdiff;
      if (doPairlist) {
#ifndef DPCPP_BLOCK_LOAD
        int2 tmp = exclIndexMaxDiff[iatomStart + wid];
#else
        int2 tmp = sg.load((sycl::multi_ptr<cl::sycl::int2, sycl::access::address_space::global_space>)(cl::sycl::int2 *)&(exclIndexMaxDiff[iatomStart]));
#endif
        iexclIndex   = tmp.x();
        iexclMaxdiff = tmp.y();
      }


      for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {

        // Load j-atom starting index and exclusion mask
        int jatomStart = tileJatomStart[jtile];

        float4 xyzq_j = xyzq[jatomStart + wid];
#ifdef DPCPP_FORCES_SLM
        sg.barrier();
#endif

#ifdef DPCPP_FORCES_EXLM
        sg.barrier();
#endif

        // Check for early bail
        if (doPairlist) {
          float r2bb = distsq(boundingBoxI, xyzq_j);
          // if (all_of(sg, r2bb > plcutoff2)) {
          //   continue;
          // }
        if (sycl::all_of_group(sg, r2bb > plcutoff2)) {
            continue;
          }
        }
        unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
#ifndef DPCPP_BLOCK_LOAD
        int vdwtypej = vdwTypes[jatomStart + wid];
#else
        int vdwtypej = sg.load((sycl::multi_ptr<int, sycl::access::address_space::global_space>)(int *)&(vdwTypes[jatomStart]));
#endif
// sg.load((sycl::multi_ptr<int, sycl::access::address_space::global_space>)(int *)&(vdwTypes[jatomStart][0]));
#ifdef DPCPP_FORCES_SLM
        s_vdwtypej[sgid][wid] = vdwtypej;  
#endif
#ifdef DPCPP_FORCES_EXLM
        s_vdwtypej_ref[sgid*AVXSIZE + wid] = vdwtypej;
#endif

        // Get i-atom global index
        int jatomIndex;
        if (doPairlist) {
#ifndef DPCPP_BLOCK_LOAD
          jatomIndex = atomIndex[jatomStart + wid];
#else
          jatomIndex = sg.load((sycl::multi_ptr<int, sycl::access::address_space::global_space>)(int *)&(atomIndex[jatomStart]));
#endif
#ifdef DPCPP_FORCES_SLM
          s_jatomIndex[sgid][wid] = jatomIndex;
#endif

#ifdef DPCPP_FORCES_EXLM
          s_jatomIndex_ref[sgid*AVXSIZE + wid] = jatomIndex;
#endif

        }

        // Number of j loops and free atoms
        int nfreej;
        if (doPairlist) {
          int nloopj = min((int)(jatomSize - jatomStart), AVXSIZE);
          nfreej = max(jatomFreeSize - jatomStart, 0);
          if (wid >= nloopj) {
            xyzq_j.x() = LARGE_FLOAT;
            xyzq_j.y() = LARGE_FLOAT;
            xyzq_j.z() = LARGE_FLOAT;
          }
        }

#ifdef DPCPP_FORCES_SLM
        s_xyzq[sgid][wid] = xyzq_j;
#endif

#ifdef DPCPP_FORCES_EXSLM
        s_xyzq_ref[sgid*AVXSIZE+wid] = xyzq_j;
#endif

        // DH - self requires that zeroShift is also set
        const bool self = zeroShift && (iatomStart == jatomStart);
        const int modval = (self) ? 2*AVXSIZE-1 : AVXSIZE-1;

        float3 jforce =float3(0.0f);
        
        float3 jforceSlow;
        if (doSlow) {
          jforceSlow = float3(0.0f);
        }

#ifdef DPCPP_FORCES_SLM
        s_jforce[sgid][wid] = jforce;
        if (doSlow){
          s_jforceSlow[sgid][wid] = jforceSlow;
        }
        sg.barrier();
#endif

#ifdef DPCPP_FORCES_EXLM
        s_jforce_ref[sgid*AVXSIZE + wid] = jforce;
        if (doSlow){
          s_jforceSlow_ref[sgid*AVXSIZE + wid] = jforceSlow;
        }
        sg.barrier();
#endif

        int t = (self) ? 1 : 0;

        if (doPairlist) {
          // Build pair list
          // NOTE: Pairlist update, we must also include the diagonal since this is used
          //       in GBIS phase 2.
          // Clear the lowest (indicator) bit
          nexcluded &= (~1);

          // For self tiles, do the diagonal term (t=0).
          // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
          if (self) {
            int j = (0 + wid) & modval;
            // NOTE: __shfl() operation can give non-sense here because j may be >= AVXSIZE.
            //       However, if (j < AVXSIZE ..) below makes sure that these non-sense
            //       results are not actually every used
            float dx,dy,dz;

#ifdef DPCPP_FORCES_SLM
            xyzq_j = s_xyzq[sgid][j];
            dx = xyzq_j.x() - xyzq_i.x();
            dy = xyzq_j.y() - xyzq_i.y();
            dz = xyzq_j.z() - xyzq_i.z();  
#elif defined(DPCPP_FORCES_EXSLM)
            xyzq_j = s_xyzq_ref[sgid*AVXSIZE + j];
            dx = xyzq_j.x() - xyzq_i.x();
            dy = xyzq_j.y() - xyzq_i.y();
            dz = xyzq_j.z() - xyzq_i.z();  
#else
            // dx =ROTATE_UP(xyzq_j.x(), t) - xyzq_i.x();
            // dy =ROTATE_UP(xyzq_j.y(), t) - xyzq_i.y();
            // dz =ROTATE_UP(xyzq_j.z(), t) - xyzq_i.z();
            dx = xyzq_j.x() - xyzq_i.x();
            dy = xyzq_j.y() - xyzq_i.y();
            dz = xyzq_j.z() - xyzq_i.z();
#endif

            float r2 = dx*dx + dy*dy + dz*dz;

            if (j < AVXSIZE && r2 < plcutoff2) {
              // We have atom pair within the pairlist cutoff => Set indicator bit
              nexcluded |= 1;
            }
#ifdef DPCPP_FORCES_SLM
#elif defined(DPCPP_FORCES_EXSLM)
#else
            xyzq_j.x() = ROTATE_UP(xyzq_j.x(), 1);
            xyzq_j.y() = ROTATE_UP(xyzq_j.y(), 1);
            xyzq_j.z() = ROTATE_UP(xyzq_j.z(), 1);
            shuffleNext<doPairlist>(xyzq_j.w(), vdwtypej, jatomIndex, item_ct1);
#endif
          }

          for (;t < AVXSIZE;t++) {
            int j = (t + wid) & modval;

            excl >>= 1;
            if (j < AVXSIZE) {
            float3 dxyz;
#ifdef DPCPP_FORCES_SLM
            xyzq_j = s_xyzq[sgid][j];
            // dxyz.x() = xyzq_j.x() - xyzq_i.x();
            // dxyz.y() = xyzq_j.y() - xyzq_i.y();
            // dxyz.z() = xyzq_j.z() - xyzq_i.z();
#elif defined(DPCPP_FORCES_EXSLM)
            xyzq_j = s_xyzq_ref[sgid*AVXSIZE + j];
#else
            // dxyz.x() =ROTATE_UP(xyzq_j.x(), t) - xyzq_i.x();
            // dxyz.y() =ROTATE_UP(xyzq_j.y(), t) - xyzq_i.y();
            // dxyz.z() =ROTATE_UP(xyzq_j.z(), t) - xyzq_i.z();
#endif
            dxyz.x() = xyzq_j.x() - xyzq_i.x();
            dxyz.y() = xyzq_j.y() - xyzq_i.y();
            dxyz.z() = xyzq_j.z() - xyzq_i.z();

            float r2 = sycl::dot(dxyz, dxyz);

            // We have atom pair within the pairlist cutoff => Set indicator bit
            if (r2 < plcutoff2) {// preload t0 t1
#ifdef TEST_PRELOADL0L1
  float k = rsqrt(r2);
  const int tableSize  = 4096;
  const float x = k * (float)tableSize - 0.5f;
  const float f = floorf(x);
  const float a = x - f;
  const unsigned int i = (unsigned int)f;
  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
  const int i1 = i0 + 1;
  const sycl::float4 t0 =d_forceTable[i0];
  const sycl::float4 t1 =d_forceTable[i1];
#endif
              nexcluded |= 1;
              if (j < nfreej || wid < nfreei) {
                bool excluded = false;
#ifdef DPCPP_FORCES_SLM
                int indexdiff = s_jatomIndex[sgid][j] - iatomIndex;
#elif defined(DPCPP_FORCES_EXSLM)
                int indexdiff = s_jatomIndex_ref[sgid*AVXSIZE + j] - iatomIndex;
#else
                int indexdiff = jatomIndex - iatomIndex;
#endif
                if ( abs(indexdiff) <= iexclMaxdiff) {
                  indexdiff += iexclIndex;
                  int indexword = ((unsigned int) indexdiff) >> 5;

                  indexword = (indexword < MAX_CONST_EXCLUSIONS)?constExclusions[indexword]: overflowExclusions[indexword];
                  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
                }
                if (excluded) nexcluded += 2;
                if (!excluded) excl |= (1 << (AVXSIZE-1));
                if (!excluded && r2 < cutoff2) {
#ifdef TEST_ENERGYMATH
                    calcForceEnergyMath<doEnergy, doSlow>(r2, xyzq_i.w(), xyzq_j.w(), dxyz,
                      vdwtypei,
  #ifdef DPCPP_FORCES_SLM
                      s_vdwtypej[sgid][j],
  #elif defined(DPCPP_FORCES_EXSLM)
                      s_vdwtypej_ref[sgid*AVXSIZE + j],
  #else
                      vdwtypej,
  #endif
                      vdwCoefTable,
  // #ifdef TEST_SLMENERGYTABLE
  //                     s_d_forceTable_ptr, 
  // #else
  //                     d_forceTable,
  // #endif
  //                     d_energyTable,     // jhr
                      /*feTableSize,*/ /* vdwCoefTableSize, */           // jhr
                      iforce, iforceSlow,
  #ifdef DPCPP_FORCES_SLM
                      s_jforce[sgid][j], s_jforceSlow[sgid][j],
  #elif defined(DPCPP_FORCES_EXSLM)
                      s_jforce_ref[sgid*AVXSIZE +j], s_jforceSlow_ref[sgid*AVXSIZE + j],
  #else
                      jforce, jforceSlow,
  #endif
                      energyVdw, energyElec, energySlow, 
                      c, slowScale);
#elif defined(TEST_PRELOADL0L1)
                    calcForceEnergyL0L1<doEnergy, doSlow>(r2, xyzq_i.w(), xyzq_j.w(), dxyz,
                      vdwtypei,
                      vdwtypej,
                      vdwCoefTable,
                      d_energyTable,     // jhr
                      /*feTableSize,*/ /* vdwCoefTableSize, */           // jhr
                      iforce, iforceSlow,
                      jforce, jforceSlow,
                      energyVdw, energyElec, energySlow, t0, t1, a);
#else
                    calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w(), xyzq_j.w(), dxyz,
                      vdwtypei,
  #ifdef DPCPP_FORCES_SLM
                      s_vdwtypej[sgid][j],
  #elif defined(DPCPP_FORCES_EXSLM)
                      s_vdwtypej_ref[sgid*AVXSIZE + j],
  #else
                      vdwtypej,
  #endif
                      vdwCoefTable,
  #ifdef TEST_SLMENERGYTABLE
                      s_d_forceTable_ptr, 
  #else
                      d_forceTable,
  #endif
                      d_energyTable,     // jhr
                      /*feTableSize,*/ /* vdwCoefTableSize, */           // jhr
                      iforce, iforceSlow,
  #ifdef DPCPP_FORCES_SLM
                      s_jforce[sgid][j], s_jforceSlow[sgid][j],
  #elif defined(DPCPP_FORCES_EXSLM)
                      s_jforce_ref[sgid*AVXSIZE +j], s_jforceSlow_ref[sgid*AVXSIZE + j],
  #else
                      jforce, jforceSlow,
  #endif
                      energyVdw, energyElec, energySlow);
#endif
                }
                }
              }
            }
#ifdef DPCPP_FORCES_SLM
#elif defined(DPCPP_FORCES_EXSLM)
#else
            xyzq_j.x() = ROTATE_UP(xyzq_j.x(), 1);
            xyzq_j.y() = ROTATE_UP(xyzq_j.y(), 1);
            xyzq_j.z() = ROTATE_UP(xyzq_j.z(), 1);
            shuffleNext<doPairlist>(xyzq_j.w(), vdwtypej, jatomIndex, item_ct1);
            shuffleNext<doSlow>(jforce, jforceSlow, item_ct1);
#endif
          } // t
        } else {
          // Just compute forces
          if (self) {
            excl >>= 1;
#ifdef DPCPP_FORCES_SLM
#elif defined(DPCPP_FORCES_EXSLM)
#else
            xyzq_j.x() = ROTATE_UP(xyzq_j.x(), 1);
            xyzq_j.y() = ROTATE_UP(xyzq_j.y(), 1);
            xyzq_j.z() = ROTATE_UP(xyzq_j.z(), 1);
            shuffleNext<doPairlist>(xyzq_j.w(), vdwtypej, jatomIndex, item_ct1);
#endif
          }
          for (;t < AVXSIZE;t++) {
            if ((excl & 1)) {
#ifdef DPCPP_FORCES_SLM
              xyzq_j = s_xyzq[sgid][(wid+t) & (AVXSIZE-1)];
#elif defined(DPCPP_FORCES_EXSLM)
              xyzq_j = s_xyzq_ref[sgid*AVXSIZE + ((wid+t) & (AVXSIZE-1))];
#endif
              float3 dxyz;
              dxyz.x() = xyzq_j.x() - xyzq_i.x();
              dxyz.y() = xyzq_j.y() - xyzq_i.y();
              dxyz.z() = xyzq_j.z() - xyzq_i.z();

              float r2 = sycl::dot(dxyz, dxyz);
#ifdef TEST_PRELOADL0L1
  float k = rsqrt(r2);
  const int tableSize  = 4096;
  const float x = k * (float)tableSize - 0.5f;
  const float f = floorf(x);
  const float a = x - f;
  const unsigned int i = (unsigned int)f;
  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
  const int i1 = i0 + 1;
  const sycl::float4 t0 =d_forceTable[i0]; //
  const sycl::float4 t1 =d_forceTable[i1]; //t1=t0
#endif
              if (r2 < cutoff2) {
#ifdef TEST_ENERGYMATH
                    calcForceEnergyMath<doEnergy, doSlow>(r2, xyzq_i.w(), xyzq_j.w(), dxyz,
                      vdwtypei,
  #ifdef DPCPP_FORCES_SLM
                      s_vdwtypej[sgid][j],
  #elif defined(DPCPP_FORCES_EXSLM)
                      s_vdwtypej_ref[sgid*AVXSIZE + j],
  #else
                      vdwtypej,
  #endif
                      vdwCoefTable,
  // #ifdef TEST_SLMENERGYTABLE
  //                     s_d_forceTable_ptr, 
  // #else
  //                     d_forceTable,
  // #endif
  //                     d_energyTable,     // jhr
                      /*feTableSize,*/ /* vdwCoefTableSize, */           // jhr
                      iforce, iforceSlow,
  #ifdef DPCPP_FORCES_SLM
                      s_jforce[sgid][j], s_jforceSlow[sgid][j],
  #elif defined(DPCPP_FORCES_EXSLM)
                      s_jforce_ref[sgid*AVXSIZE +j], s_jforceSlow_ref[sgid*AVXSIZE + j],
  #else
                      jforce, jforceSlow,
  #endif
                      energyVdw, energyElec, energySlow, 
                      c, slowScale);
#elif defined(TEST_PRELOADL0L1)
                    calcForceEnergyL0L1<doEnergy, doSlow>(r2, xyzq_i.w(), xyzq_j.w(), dxyz,
                      vdwtypei,
                      vdwtypej,
                      vdwCoefTable,
                      d_energyTable,     // jhr
                      /*feTableSize,*/ /* vdwCoefTableSize, */           // jhr
                      iforce, iforceSlow,
                      jforce, jforceSlow,
                      energyVdw, energyElec, energySlow, t0, t1, a);
#else
                calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w(), xyzq_j.w(), dxyz,
                  vdwtypei,
#ifdef DPCPP_FORCES_SLM
                  s_vdwtypej[sgid][(wid+t) & (AVXSIZE-1)],
#elif defined(DPCPP_FORCES_EXSLM)
                  s_vdwtypej_ref[sgid*AVXSIZE +((wid+t) & (AVXSIZE-1))],
#else
                  vdwtypej,
#endif
                  vdwCoefTable,
#ifdef TEST_SLMENERGYTABLE
                    s_d_forceTable_ptr, 
#else
                    d_forceTable,
#endif
                    d_energyTable,     // jhr
                    /*feTableSize, vdwCoefTableSize,*/           // jhr
                  iforce, iforceSlow,
#ifdef DPCPP_FORCES_SLM
                  s_jforce[sgid][(wid+t) & (AVXSIZE-1)],
                  s_jforceSlow[sgid][(wid+t) & (AVXSIZE-1)],
#elif defined(DPCPP_FORCES_EXSLM)
                  s_jforce_ref[sgid*AVXSIZE + ((wid+t) & (AVXSIZE-1))],
                  s_jforceSlow_ref[sgid*AVXSIZE + ((wid+t) & (AVXSIZE-1))],
#else
                  jforce, jforceSlow,
#endif
                  energyVdw, energyElec, energySlow);
#endif
              } // (r2 < cutoff2)
            } // (excl & 1)
            excl >>= 1;
#ifdef DPCPP_FORCES_SLM
#elif defined(DPCPP_FORCES_EXSLM)
#else
            xyzq_j.x() = ROTATE_UP(xyzq_j.x(), 1);
            xyzq_j.y() = ROTATE_UP(xyzq_j.y(), 1);
            xyzq_j.z() = ROTATE_UP(xyzq_j.z(), 1);
            shuffleNext<doPairlist>(xyzq_j.w(), vdwtypej, jatomIndex, item_ct1);
            shuffleNext<doSlow>(jforce, jforceSlow, item_ct1);
#endif
          } // t
#ifdef DPCPP_FORCES_SLM
          sg.barrier();
#elif defined(DPCPP_FORCES_EXSLM)
          sg.barrier();
#endif
        }

        // Write j-forces
        storeForces<doSlow>(jatomStart + wid,
#ifdef DPCPP_FORCES_SLM
        s_jforce[sgid][wid], s_jforceSlow[sgid][wid],
#elif defined(DPCPP_FORCES_EXSLM)
        s_jforce_ref[sgid*AVXSIZE+wid], s_jforceSlow_ref[sgid*AVXSIZE + wid],
#else
        jforce, jforceSlow,
#endif
#ifdef FORCES_SOA
          devForce_x, devForce_y, devForce_z, devForceSlow_x, devForceSlow_y, devForceSlow_z);
#else
          devForces, devForcesSlow);
#endif
        // Write exclusions
        if (doPairlist && sycl::any_of_group(sg, nexcluded & 1)) {
          int anyexcl = (65536 | sycl::any_of_group(sg, excl));
          // Mark this jtile as non-empty:
          //  VdW:      1 if tile has atom pairs within pairlist cutoff and some these atoms interact
          //  GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
          if (wid == 0) jtiles[jtile] = anyexcl;
          // Store exclusions
#ifndef DPCPP_BLOCK_LOAD
          tileExcls[jtile].excl[wid] = excl;
#else
          sg.store((sycl::multi_ptr<unsigned int, sycl::access::address_space::global_space>)(unsigned int *)&(tileExcls[jtile].excl[0]), excl);
#endif
          // itileListLen:
          // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
          // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
          itileListLen += anyexcl;
          // NOTE, this minJatomStart is only stored once for the first tile list entry
        }

      } // jtile

      // Write i-forces
      storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow,
#ifdef FORCES_SOA
                 devForce_x, devForce_y, devForce_z, devForceSlow_x, devForceSlow_y, devForceSlow_z);
#else
                 devForces, devForcesSlow);
#endif
    }
    // Done with computation

    // Save pairlist stuff
    if (doPairlist) {

      // Warp index (0...warpsize-1)
      const int wid = item_ct1.get_local_id(2) % AVXSIZE;

      if (wid == 0) {
        // minJatomStart is in range [0 ... atomStorageSize-1]
        tileListDepth[itileList] = itileListLen;
        tileListOrder[itileList] = itileList;
        // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
        if ((itileListLen & 65535) > 0) ATOMIC_ADD(int, tileListStat->numTileLists, 1);
        // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
        if (itileListLen > 0) ATOMIC_ADD(int, tileListStat->numTileListsGBIS, 1);
        // NOTE: always numTileListsGBIS >= numTileLists
      }

      // Remove indicator bit
      nexcluded >>= 1;
      // reduce_all not needed here. Reduce to lane 0 is sufficient
      int nexcludedWarp = sycl::reduce_over_group(sg, nexcluded, sycl::ext::oneapi::plus<>());
      // int nexcludedWarp = reduce(sg, nexcluded, sycl::ext::oneapi::plus<>());
      if (wid == 0) ATOMIC_ADD(int, tileListStat->numExcluded, nexcludedWarp);
    }

    if (doVirial) {
      // Warp index (0...warpsize-1)
      const int wid = item_ct1.get_local_id(2) % AVXSIZE;

      // reduce_all not needed here. Reduce to lane 0 would be sufficient
      float3 iforceSum;
      iforceSum.x() = sycl::reduce_over_group(sg, iforce.x(), sycl::ext::oneapi::plus<>());
      iforceSum.y() = sycl::reduce_over_group(sg, iforce.y(), sycl::ext::oneapi::plus<>());
      iforceSum.z() = sycl::reduce_over_group(sg, iforce.z(), sycl::ext::oneapi::plus<>());
      // iforceSum.x() = reduce(sg, iforce.x(), sycl::ext::oneapi::plus<>());
      // iforceSum.y() = reduce(sg, iforce.y(), sycl::ext::oneapi::plus<>());
      // iforceSum.z() = reduce(sg, iforce.z(), sycl::ext::oneapi::plus<>());

      if (wid == 0) {
        virialEnergy[itileList].force = iforceSum;
      }

      if (doSlow) {
        // reduce_all not needed here. Reduce to lane 0 would be sufficient
        iforceSum.x() = sycl::reduce_over_group(sg, iforceSlow.x(), sycl::ext::oneapi::plus<>());
        iforceSum.y() = sycl::reduce_over_group(sg, iforceSlow.y(), sycl::ext::oneapi::plus<>());
        iforceSum.z() = sycl::reduce_over_group(sg, iforceSlow.z(), sycl::ext::oneapi::plus<>());
        // iforceSum.x() = reduce(sg, iforceSlow.x(), sycl::ext::oneapi::plus<>());
        // iforceSum.y() = reduce(sg, iforceSlow.y(), sycl::ext::oneapi::plus<>());
        // iforceSum.z() = reduce(sg, iforceSlow.z(), sycl::ext::oneapi::plus<>());
        if (wid == 0) {
          virialEnergy[itileList].forceSlow = iforceSum;
        }
      }
    }

    // Reduce energy
    if (doEnergy) {
      // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
      //       (Why does CUB suck here?)
      energyVdw  = sycl::reduce_over_group(sg, energyVdw,  sycl::ext::oneapi::plus<>());
      energyElec = sycl::reduce_over_group(sg, energyElec, sycl::ext::oneapi::plus<>());
      if (doSlow) energySlow = sycl::reduce_over_group(sg, energySlow, sycl::ext::oneapi::plus<>());
      // energyVdw  = reduce(sg, energyVdw,  sycl::ext::oneapi::plus<>());
      // energyElec = reduce(sg, energyElec, sycl::ext::oneapi::plus<>());
      // if (doSlow) energySlow = reduce(sg, energySlow, sycl::ext::oneapi::plus<>());

      if (item_ct1.get_local_id(2) % AVXSIZE == 0) {
        virialEnergy[itileList].energyVdw  = energyVdw;
        virialEnergy[itileList].energyElec = energyElec;
        if (doSlow) virialEnergy[itileList].energySlow = energySlow;
      }
    }
    // {
    // int itileList = start + item_ct1.get_group().get_id(2) * (workGroupSize / AVXSIZE) + sg.get_group_linear_id();

    if (doStreaming) {
      // Make sure devForces and devForcesSlow have been written into device memory
      sg.barrier();
#ifdef TEST_ATOMIC_FENCE
      sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
      //sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::device);
#else
      sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::sub_group);
#endif

      TileList tmp = tileLists[itileList];
      patchInd     = tmp.patchInd;
      patchNumList = tmp.patchNumList;

      int patchDone[2] = {false, false};
      // int patchDone0 = false;
      // int pathcDone1 = false;

      const int wid = item_ct1.get_local_id(2) % AVXSIZE;
      if (wid == 0) {
      #ifdef TEST_CASEMU
        int patchCountOld0 = ATOMIC_FETCH_ADD(unsigned int, patchNumCount[patchInd.x()], (unsigned int)(1));

        patchDone[0] = (patchCountOld0 + 1 == patchNumList.x());
        if (patchDone[0]) patchNumCount[patchInd.x()] = 0;
        // if(patchCountOld0 + 1 == patchNumList.x()){
        //   patchDone[0] = true;
        //   // patchNumCount[patchInd.x()]=0;
        //   ATOMIC_FETCH_SUB(unsigned int, patchNumCount[patchInd.x()], patchNumList.x());
        // }
      #else
        bool done = false;
        unsigned int fetched;
        unsigned int new_val;
        while (!done) {
          fetched = patchNumCount[patchInd.x()];
          if (fetched == patchNumList.x()-1) new_val = 0;
          else new_val = fetched+1;
          done = ATOMIC_EXCHANGE(unsigned int, patchNumCount[patchInd.x()], fetched, new_val);
        }
        patchDone[0] = (fetched == patchNumList.x()-1);
      #endif

        if (patchInd.x() != patchInd.y()) {
          #ifdef TEST_CASEMU
          int patchCountOld1 = ATOMIC_FETCH_ADD(unsigned int, patchNumCount[patchInd.y()], (unsigned int)(1));
          patchDone[1] = (patchCountOld1 + 1 == patchNumList.y());
          if (patchDone[1]) patchNumCount[patchInd.y()] = 0;
          // if(patchCountOld1 + 1 == patchNumList.y()){
          //   patchDone[1] = true;
          //   // patchNumCount[patchInd.y()]=0;
          //   ATOMIC_FETCH_SUB(unsigned int, patchNumCount[patchInd.y()], patchNumList.y());
          // }
          #else
          done = false;
          while (!done) {
            fetched = patchNumCount[patchInd.y()];
            if (fetched == patchNumList.y()-1) new_val = 0;
            else new_val = fetched+1;
            done = ATOMIC_EXCHANGE(unsigned int, patchNumCount[patchInd.y()], fetched, new_val);
          }
          patchDone[1] = (fetched == patchNumList.y()-1);
          #endif
        }
          
      }

      // patchDone[0] = any_of(sg, patchDone[0]);
      // patchDone[1] = any_of(sg, patchDone[1]);
      patchDone[0] = group_broadcast(sg, patchDone[0], 0);
      patchDone[1] = group_broadcast(sg, patchDone[1], 0);
      if (patchDone[0]) {
        // Patch 1 is done, write onto host-mapped memory
        DpcppPatchRecord patch = dpcppPatches[patchInd.x()];
        int start = patch.atomStart;
        int end   = start + patch.numAtoms;
        for (int i=start+wid;i < end;i+=AVXSIZE) {
#ifdef FORCES_SOA
          mapForces[i] = float4(devForce_x[i],
            devForce_y[i], devForce_z[i], devForce_w[i]);
          // mapForces[i] = 0;
          if (doSlow){
            mapForcesSlow[i] = float4(devForceSlow_x[i],
                                     devForceSlow_y[i], 
                                     devForceSlow_z[i], 
                                     devForceSlow_w[i]);
          }
#else
          mapForces[i] = devForces[i];
          if (doSlow){
            mapForcesSlow[i] = devForcesSlow[i];
          }
#endif

        }
      }

      if (patchDone[1]) {
        // Patch 2 is done
        DpcppPatchRecord patch = dpcppPatches[patchInd.y()];
        int start = patch.atomStart;
        int end   = start + patch.numAtoms;
        for (int i=start+wid;i < end;i+=AVXSIZE) {
#ifdef FORCES_SOA
          mapForces[i] = float4(devForce_x[i],
            devForce_y[i], devForce_z[i], devForce_w[i]);
          // mapForces[i] = 0;
          if (doSlow){
            mapForcesSlow[i] = float4(devForceSlow_x[i],
                                     devForceSlow_y[i], 
                                     devForceSlow_z[i], 
                                     devForceSlow_w[i]);
          }
#else
          mapForces[i] = devForces[i];
          if (doSlow){
            mapForcesSlow[i] = devForcesSlow[i];
          }
#endif
        }
      }

      if (patchDone[0] || patchDone[1]) {
        // Make sure mapForces and mapForcesSlow are up-to-date
#ifdef TEST_ATOMIC_FENCE
        sg.barrier();
        sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
        //sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::system);
#endif
        
        // Add patch into "patchReadyQueue"
        if (wid == 0) {
          if (patchDone[0]) {
            int ind = ATOMIC_FETCH_ADD(int, tileListStat->patchReadyQueueCount, 1);
            mapPatchReadyQueue[ind] = patchInd.x();
          }
          if (patchDone[1]) {
            int ind = ATOMIC_FETCH_ADD(int, tileListStat->patchReadyQueueCount, 1);
            mapPatchReadyQueue[ind] = patchInd.y();
          }
        }
      }
    } // if ( dostreaming)
    // }
    if (doStreaming && outputOrder != NULL && item_ct1.get_local_id(2) % AVXSIZE == 0) {
      int index = ATOMIC_FETCH_ADD(int, tileListStat->outputOrderIndex, 1);
      outputOrder[index] = itileList;
    }

  } // if (itileList < numTileLists)


  }); // cgs.parallel_for
  }); // q.submit
#ifdef TEST_QWAIT
  myQ.wait();
#endif
}

//
// Finish up - reduce virials from nonbonded kernel
//
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
 void reduceNonbondedVirialKernel(const bool doSlow,
  const int atomStorageSize,
  const float4* __restrict__ xyzq,
  const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
  VirialEnergy* __restrict__ virialEnergy, nd_item<3> item_ct1) {
  auto group =  item_ct1.get_group();

  for (int ibase = item_ct1.get_group(2)*item_ct1.get_local_range().get(2);ibase < atomStorageSize;ibase += item_ct1.get_local_range().get(2)*item_ct1.get_group_range(2))
  {
    int i = ibase + item_ct1.get_local_id(2);

    // Set to zero to avoid nan*0
    float4 pos;
    pos.x() = 0.0f;
    pos.y() = 0.0f;
    pos.z() = 0.0f;
    float4 force, forceSlow;
    force.x() = 0.0f;
    force.y() = 0.0f;
    force.z() = 0.0f;
    forceSlow.x() = 0.0f;
    forceSlow.y() = 0.0f;
    forceSlow.z() = 0.0f;
    if (i < atomStorageSize) {
      pos = xyzq[i];
      force = devForces[i];
      if (doSlow) forceSlow = devForcesSlow[i];
    }
    // Reduce across the entire thread block
    float vxxt = force.x()*pos.x();
    float vxyt = force.x()*pos.y();
    float vxzt = force.x()*pos.z();
    float vyxt = force.y()*pos.x();
    float vyyt = force.y()*pos.y();
    float vyzt = force.y()*pos.z();
    float vzxt = force.z()*pos.x();
    float vzyt = force.z()*pos.y();
    float vzzt = force.z()*pos.z();
    // atomicAdd(&virialEnergy->virial[0], (double)vxx);
    // atomicAdd(&virialEnergy->virial[1], (double)vxy);
    // atomicAdd(&virialEnergy->virial[2], (double)vxz);
    // atomicAdd(&virialEnergy->virial[3], (double)vyx);
    // atomicAdd(&virialEnergy->virial[4], (double)vyy);
    // atomicAdd(&virialEnergy->virial[5], (double)vyz);
    // atomicAdd(&virialEnergy->virial[6], (double)vzx);
    // atomicAdd(&virialEnergy->virial[7], (double)vzy);
    // atomicAdd(&virialEnergy->virial[8], (double)vzz);

    // volatile float vxx = sycl::ext::oneapi::reduce(group, vxxt, sycl::ext::oneapi::plus<>());
    // volatile float vxy = sycl::ext::oneapi::reduce(group, vxyt, sycl::ext::oneapi::plus<>());
    // volatile float vxz = sycl::ext::oneapi::reduce(group, vxzt, sycl::ext::oneapi::plus<>());
    // volatile float vyx = sycl::ext::oneapi::reduce(group, vyxt, sycl::ext::oneapi::plus<>());
    // volatile float vyy = sycl::ext::oneapi::reduce(group, vyyt, sycl::ext::oneapi::plus<>());
    // volatile float vyz = sycl::ext::oneapi::reduce(group, vyzt, sycl::ext::oneapi::plus<>());
    // volatile float vzx = sycl::ext::oneapi::reduce(group, vzxt, sycl::ext::oneapi::plus<>());
    // volatile float vzy = sycl::ext::oneapi::reduce(group, vzyt, sycl::ext::oneapi::plus<>());
    // volatile float vzz = sycl::ext::oneapi::reduce(group, vzzt, sycl::ext::oneapi::plus<>());

    volatile float vxx = sycl::reduce_over_group(group, vxxt, sycl::ext::oneapi::plus<>());
    volatile float vxy = sycl::reduce_over_group(group, vxyt, sycl::ext::oneapi::plus<>());
    volatile float vxz = sycl::reduce_over_group(group, vxzt, sycl::ext::oneapi::plus<>());
    volatile float vyx = sycl::reduce_over_group(group, vyxt, sycl::ext::oneapi::plus<>());
    volatile float vyy = sycl::reduce_over_group(group, vyyt, sycl::ext::oneapi::plus<>());
    volatile float vyz = sycl::reduce_over_group(group, vyzt, sycl::ext::oneapi::plus<>());
    volatile float vzx = sycl::reduce_over_group(group, vzxt, sycl::ext::oneapi::plus<>());
    volatile float vzy = sycl::reduce_over_group(group, vzyt, sycl::ext::oneapi::plus<>());
    volatile float vzz = sycl::reduce_over_group(group, vzzt, sycl::ext::oneapi::plus<>());

    if (item_ct1.get_local_id(2) == 0) {
      ATOMIC_ADD(double, virialEnergy->virial[0], (double)vxx);
      ATOMIC_ADD(double, virialEnergy->virial[1], (double)vxy);
      ATOMIC_ADD(double, virialEnergy->virial[2], (double)vxz);
      ATOMIC_ADD(double, virialEnergy->virial[3], (double)vyx);
      ATOMIC_ADD(double, virialEnergy->virial[4], (double)vyy);
      ATOMIC_ADD(double, virialEnergy->virial[5], (double)vyz);
      ATOMIC_ADD(double, virialEnergy->virial[6], (double)vzx);
      ATOMIC_ADD(double, virialEnergy->virial[7], (double)vzy);
      ATOMIC_ADD(double, virialEnergy->virial[8], (double)vzz);
    }

    if (doSlow) {
      // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
      float vxxSlowt = forceSlow.x()*pos.x();
      float vxySlowt = forceSlow.x()*pos.y();
      float vxzSlowt = forceSlow.x()*pos.z();
      float vyxSlowt = forceSlow.y()*pos.x();
      float vyySlowt = forceSlow.y()*pos.y();
      float vyzSlowt = forceSlow.y()*pos.z();
      float vzxSlowt = forceSlow.z()*pos.x();
      float vzySlowt = forceSlow.z()*pos.y();
      float vzzSlowt = forceSlow.z()*pos.z();
      // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
      // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
      // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
      // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
      // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
      // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
      // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
      // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
      // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
      volatile float vxxSlow = sycl::reduce_over_group(group, vxxSlowt, sycl::ext::oneapi::plus<>());
      volatile float vxySlow = sycl::reduce_over_group(group, vxySlowt, sycl::ext::oneapi::plus<>());
      volatile float vxzSlow = sycl::reduce_over_group(group, vxzSlowt, sycl::ext::oneapi::plus<>());
      volatile float vyxSlow = sycl::reduce_over_group(group, vyxSlowt, sycl::ext::oneapi::plus<>());
      volatile float vyySlow = sycl::reduce_over_group(group, vyySlowt, sycl::ext::oneapi::plus<>());
      volatile float vyzSlow = sycl::reduce_over_group(group, vyzSlowt, sycl::ext::oneapi::plus<>());
      volatile float vzxSlow = sycl::reduce_over_group(group, vzxSlowt, sycl::ext::oneapi::plus<>());
      volatile float vzySlow = sycl::reduce_over_group(group, vzySlowt, sycl::ext::oneapi::plus<>());
      volatile float vzzSlow = sycl::reduce_over_group(group, vzzSlowt, sycl::ext::oneapi::plus<>());

      // volatile float vxxSlow = sycl::ext::oneapi::reduce(group, vxxSlowt, sycl::ext::oneapi::plus<>());
      // volatile float vxySlow = sycl::ext::oneapi::reduce(group, vxySlowt, sycl::ext::oneapi::plus<>());
      // volatile float vxzSlow = sycl::ext::oneapi::reduce(group, vxzSlowt, sycl::ext::oneapi::plus<>());
      // volatile float vyxSlow = sycl::ext::oneapi::reduce(group, vyxSlowt, sycl::ext::oneapi::plus<>());
      // volatile float vyySlow = sycl::ext::oneapi::reduce(group, vyySlowt, sycl::ext::oneapi::plus<>());
      // volatile float vyzSlow = sycl::ext::oneapi::reduce(group, vyzSlowt, sycl::ext::oneapi::plus<>());
      // volatile float vzxSlow = sycl::ext::oneapi::reduce(group, vzxSlowt, sycl::ext::oneapi::plus<>());
      // volatile float vzySlow = sycl::ext::oneapi::reduce(group, vzySlowt, sycl::ext::oneapi::plus<>());
      // volatile float vzzSlow = sycl::ext::oneapi::reduce(group, vzzSlowt, sycl::ext::oneapi::plus<>());

      if (item_ct1.get_local_id(2) == 0) {
        ATOMIC_ADD(double, virialEnergy->virialSlow[0], (double)vxxSlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[1], (double)vxySlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[2], (double)vxzSlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[3], (double)vyxSlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[4], (double)vyySlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[5], (double)vyzSlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[6], (double)vzxSlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[7], (double)vzySlow);
        ATOMIC_ADD(double, virialEnergy->virialSlow[8], (double)vzzSlow);
      }
    }
  }

}

#define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
 void reduceVirialEnergyKernel(
  const bool doEnergy, const bool doVirial, const bool doSlow,
  const int numTileLists,
  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
  VirialEnergy* __restrict__ virialEnergy, nd_item<3> item_ct1) {
  auto group =  item_ct1.get_group();

  for (int ibase = item_ct1.get_group(2)*item_ct1.get_local_range().get(2);ibase < numTileLists;ibase += item_ct1.get_local_range().get(2)*item_ct1.get_group_range(2))
  {
    int itileList = ibase + item_ct1.get_local_id(2);
    TileListVirialEnergy ve;
    if (itileList < numTileLists) {
      ve = tileListVirialEnergy[itileList];
    } else {
      // Set to zero to avoid nan*0
      if (doVirial) {
        ve.sh = float3(0.0f);
        ve.force = float3(0.0f);
        ve.forceSlow = float3(0.0f);
      }
      if (doEnergy) {
        ve.energyVdw = 0.0;
        ve.energyElec = 0.0;
        ve.energySlow = 0.0;
        // ve.energyGBIS = 0.0;
      }
    }

    if (doVirial) {
      float vxxt = ve.force.x()*ve.sh.x();
      float vxyt = ve.force.x()*ve.sh.y();
      float vxzt = ve.force.x()*ve.sh.z();
      float vyxt = ve.force.y()*ve.sh.x();
      float vyyt = ve.force.y()*ve.sh.y();
      float vyzt = ve.force.y()*ve.sh.z();
      float vzxt = ve.force.z()*ve.sh.x();
      float vzyt = ve.force.z()*ve.sh.y();
      float vzzt = ve.force.z()*ve.sh.z();

      float vxx = sycl::reduce_over_group(group, vxxt, sycl::ext::oneapi::plus<>());
      float vxy = sycl::reduce_over_group(group, vxyt, sycl::ext::oneapi::plus<>());
      float vxz = sycl::reduce_over_group(group, vxzt, sycl::ext::oneapi::plus<>());
      float vyx = sycl::reduce_over_group(group, vyxt, sycl::ext::oneapi::plus<>());
      float vyy = sycl::reduce_over_group(group, vyyt, sycl::ext::oneapi::plus<>());
      float vyz = sycl::reduce_over_group(group, vyzt, sycl::ext::oneapi::plus<>());
      float vzx = sycl::reduce_over_group(group, vzxt, sycl::ext::oneapi::plus<>());
      float vzy = sycl::reduce_over_group(group, vzyt, sycl::ext::oneapi::plus<>());
      float vzz = sycl::reduce_over_group(group, vzzt, sycl::ext::oneapi::plus<>());

      //       float vxx = sycl::ext::oneapi::reduce(group, vxxt, sycl::ext::oneapi::plus<>());
      // float vxy = sycl::ext::oneapi::reduce(group, vxyt, sycl::ext::oneapi::plus<>());
      // float vxz = sycl::ext::oneapi::reduce(group, vxzt, sycl::ext::oneapi::plus<>());
      // float vyx = sycl::ext::oneapi::reduce(group, vyxt, sycl::ext::oneapi::plus<>());
      // float vyy = sycl::ext::oneapi::reduce(group, vyyt, sycl::ext::oneapi::plus<>());
      // float vyz = sycl::ext::oneapi::reduce(group, vyzt, sycl::ext::oneapi::plus<>());
      // float vzx = sycl::ext::oneapi::reduce(group, vzxt, sycl::ext::oneapi::plus<>());
      // float vzy = sycl::ext::oneapi::reduce(group, vzyt, sycl::ext::oneapi::plus<>());
      // float vzz = sycl::ext::oneapi::reduce(group, vzzt, sycl::ext::oneapi::plus<>());

      if (item_ct1.get_local_id(2) == 0) {
        ATOMIC_ADD(double, virialEnergy->virial[0], (double)vxx);
        ATOMIC_ADD(double, virialEnergy->virial[1], (double)vxy);
        ATOMIC_ADD(double, virialEnergy->virial[2], (double)vxz);
        ATOMIC_ADD(double, virialEnergy->virial[3], (double)vyx);
        ATOMIC_ADD(double, virialEnergy->virial[4], (double)vyy);
        ATOMIC_ADD(double, virialEnergy->virial[5], (double)vyz);
        ATOMIC_ADD(double, virialEnergy->virial[6], (double)vzx);
        ATOMIC_ADD(double, virialEnergy->virial[7], (double)vzy);
        ATOMIC_ADD(double, virialEnergy->virial[8], (double)vzz);
      }

      if (doSlow) {
        vxxt = ve.forceSlow.x()*ve.sh.x();
        vxyt = ve.forceSlow.x()*ve.sh.y();
        vxzt = ve.forceSlow.x()*ve.sh.z();
        vyxt = ve.forceSlow.y()*ve.sh.x();
        vyyt = ve.forceSlow.y()*ve.sh.y();
        vyzt = ve.forceSlow.y()*ve.sh.z();
        vzxt = ve.forceSlow.z()*ve.sh.x();
        vzyt = ve.forceSlow.z()*ve.sh.y();
        vzzt = ve.forceSlow.z()*ve.sh.z();

        float vxx = sycl::reduce_over_group(group, vxxt, sycl::ext::oneapi::plus<>());
        float vxy = sycl::reduce_over_group(group, vxyt, sycl::ext::oneapi::plus<>());
        float vxz = sycl::reduce_over_group(group, vxzt, sycl::ext::oneapi::plus<>());
        float vyx = sycl::reduce_over_group(group, vyxt, sycl::ext::oneapi::plus<>());
        float vyy = sycl::reduce_over_group(group, vyyt, sycl::ext::oneapi::plus<>());
        float vyz = sycl::reduce_over_group(group, vyzt, sycl::ext::oneapi::plus<>());
        float vzx = sycl::reduce_over_group(group, vzxt, sycl::ext::oneapi::plus<>());
        float vzy = sycl::reduce_over_group(group, vzyt, sycl::ext::oneapi::plus<>());
        float vzz = sycl::reduce_over_group(group, vzzt, sycl::ext::oneapi::plus<>());

        if (item_ct1.get_local_id(2) == 0) {
          ATOMIC_ADD(double, virialEnergy->virialSlow[0], (double)vxx);
          ATOMIC_ADD(double, virialEnergy->virialSlow[1], (double)vxy);
          ATOMIC_ADD(double, virialEnergy->virialSlow[2], (double)vxz);
          ATOMIC_ADD(double, virialEnergy->virialSlow[3], (double)vyx);
          ATOMIC_ADD(double, virialEnergy->virialSlow[4], (double)vyy);
          ATOMIC_ADD(double, virialEnergy->virialSlow[5], (double)vyz);
          ATOMIC_ADD(double, virialEnergy->virialSlow[6], (double)vzx);
          ATOMIC_ADD(double, virialEnergy->virialSlow[7], (double)vzy);
          ATOMIC_ADD(double, virialEnergy->virialSlow[8], (double)vzz);
        }
      }
    }

    if (doEnergy) {
      volatile double energyVdw = sycl::reduce_over_group(group, ve.energyVdw, sycl::ext::oneapi::plus<>());
      volatile double energyElec = sycl::reduce_over_group(group, ve.energyElec, sycl::ext::oneapi::plus<>());

      if (item_ct1.get_local_id(2) == 0) {
          ATOMIC_ADD(double, virialEnergy->energyVdw, (double)energyVdw);
          ATOMIC_ADD(double, virialEnergy->energyElec, (double)energyElec);
      }
      if (doSlow) {
        volatile double energySlow = sycl::reduce_over_group(group, ve.energySlow, sycl::ext::oneapi::plus<>());

        if (item_ct1.get_local_id(2) == 0) ATOMIC_ADD(double, virialEnergy->energySlow, (double)energySlow);
      }
      // if (doGBIS) {
      //   double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
      //   if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
      // }
    }
  }
}

#define REDUCEGBISENERGYKERNEL_NUM_WARP 32
 // Sums tileListVirialEnergy[0..numTileLists-1].energyGBIS and adds the total
 // atomically into virialEnergy->energyGBIS. Each loop pass reduces one value
 // per work-item over the work-group; local id 0 performs the atomic add.
 void reduceGBISEnergyKernel(const int numTileLists,
  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
  VirialEnergy* __restrict__ virialEnergy, nd_item<3> item_ct1) {
  auto group = item_ct1.get_group();
  const auto lid       = item_ct1.get_local_id(2);
  const auto blockSize = item_ct1.get_local_range().get(2);
  const auto stride    = blockSize*item_ct1.get_group_range(2);

  for (int ibase = item_ct1.get_group(2)*blockSize; ibase < numTileLists; ibase += stride)
  {
    const int itileList = ibase + lid;

    // Work-items past the end contribute zero to the group sum.
    double energyGBISt = 0.0;
    if (itileList < numTileLists) energyGBISt = tileListVirialEnergy[itileList].energyGBIS;

    volatile double energyGBIS = sycl::reduce_over_group(group, energyGBISt, sycl::ext::oneapi::plus<>());
    if (lid == 0) {
      ATOMIC_ADD(double, virialEnergy->energyGBIS, (double)energyGBIS);
    }
  }

}

// ##############################################################################################
// ##############################################################################################
// ##############################################################################################

//
// Constructs the kernel wrapper for device `deviceID`.
//
// deviceID             - device this object is bound to; made current via dpcppDevice
// dpcppNonbondedTables - table provider, stored by the member of the same name
// doStreaming          - selects the streaming variants of the force kernel
//
// No memory is allocated here: every buffer pointer starts out NULL with a
// recorded size of zero. The queue is supplied later through initialize().
//
DpcppComputeNonbondedKernel::DpcppComputeNonbondedKernel(int deviceID, DpcppNonbondedTables& dpcppNonbondedTables,
  bool doStreaming) : deviceID(deviceID), dpcppNonbondedTables(dpcppNonbondedTables), doStreaming(doStreaming) {
  
  dpcppDevice->setDeviceID(deviceID);

  overflowExclusions = NULL;
  overflowExclusionsSize = 0;

  // The destructor frees constExclusions whenever it is non-NULL, so it must
  // have a defined value even if bindExclusions() is never called.
  constExclusions = NULL;

  exclIndexMaxDiff = NULL;
  exclIndexMaxDiffSize = 0;

  atomIndex = NULL;
  atomIndexSize = 0;

  vdwTypes = NULL;
  vdwTypesSize = 0;

  patchNumCount = NULL;
  patchNumCountSize = 0;

  patchReadyQueue = NULL;
  patchReadyQueueSize = 0;

  force_x = force_y = force_z = force_w = NULL;
  forceSize = 0;
  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
  forceSlowSize = 0;

}

// Records the SYCL queue that all later allocations, copies and kernel
// launches of this object are submitted to. The pointer is stored as-is.
void DpcppComputeNonbondedKernel::initialize(sycl::queue *queue)
{
  myQ = queue;
}

void DpcppComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
{
  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, *myQ, 1.4f);
  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, *myQ, 1.4f);  
}

//
// Releases every buffer this object still holds. The owning device is made
// current first; each pointer is only freed when it is non-null, so the queue
// is dereferenced only if something was actually allocated.
//
DpcppComputeNonbondedKernel::~DpcppComputeNonbondedKernel() {
  dpcppDevice->setDeviceID(deviceID);

  // Exclusion, atom and patch bookkeeping.
  if (overflowExclusions) deallocate_device<unsigned int>(&overflowExclusions, *myQ);
  if (exclIndexMaxDiff) deallocate_device<int2>(&exclIndexMaxDiff, *myQ);
  if (atomIndex) deallocate_device<int>(&atomIndex, *myQ);
  if (vdwTypes) deallocate_device<int>(&vdwTypes, *myQ);
  if (patchNumCount) deallocate_device<unsigned int>(&patchNumCount, *myQ);
  // patchReadyQueue is the only buffer released through the host deallocator.
  if (patchReadyQueue) deallocate_host<int>(&patchReadyQueue, *myQ);

  // Structure-of-arrays force buffers, fast components.
  if (force_x) deallocate_device<float>(&force_x, *myQ);
  if (force_y) deallocate_device<float>(&force_y, *myQ);
  if (force_z) deallocate_device<float>(&force_z, *myQ);
  if (force_w) deallocate_device<float>(&force_w, *myQ);

  // Structure-of-arrays force buffers, slow components.
  if (forceSlow_x) deallocate_device<float>(&forceSlow_x, *myQ);
  if (forceSlow_y) deallocate_device<float>(&forceSlow_y, *myQ);
  if (forceSlow_z) deallocate_device<float>(&forceSlow_z, *myQ);
  if (forceSlow_w) deallocate_device<float>(&forceSlow_w, *myQ);

  // Buffer allocated by bindExclusions().
  if (constExclusions) deallocate_device<unsigned int>(&constExclusions, *myQ);

}

//
// Uploads the per-atom tables used by the force kernel.
//
// atomStorageSize    - number of entries in each of the three host arrays
// h_vdwTypes         - host source for vdwTypes
// h_exclIndexMaxDiff - host source for exclIndexMaxDiff
// h_atomIndex        - host source for atomIndex
//
// The destination buffers are grown first (with OVERALLOC head-room) and the
// three copies are then submitted to the queue via copy_HtoD.
//
void DpcppComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
  const int2* h_exclIndexMaxDiff, const int* h_atomIndex) {
  auto &q = *myQ;
#ifdef TEST_QWAIT
  q.wait();
#endif

  // Grow destinations.
  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, q, OVERALLOC);
  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, q, OVERALLOC);
  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, q, OVERALLOC);

  // Upload contents.
  copy_HtoD(h_vdwTypes, vdwTypes, atomStorageSize, q);
  copy_HtoD(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, q);
  copy_HtoD(h_atomIndex, atomIndex, atomStorageSize, q);
}

// Returns the patch-ready queue buffer. Only meaningful for objects created
// with doStreaming == true; otherwise NAMD_die() is invoked first.
int* DpcppComputeNonbondedKernel::getPatchReadyQueue() {
  if (doStreaming) {
    return patchReadyQueue;
  }
  NAMD_die("DpcppComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
  return patchReadyQueue;
}

//
// Packs the structure-of-arrays force components back into float4 arrays:
//   f[i]     = (fx[i], fy[i], fz[i], fw[i])
//   fSlow[i] = (fSlowx[i], fSlowy[i], fSlowz[i], fSloww[i])   (only when doSlow != 0)
// for i in [0, n). One work-item handles one element; the launch is padded up
// to a whole number of 128-item work-groups and surplus items do nothing.
//
template <int doSlow>
void transposeForcesKernel(float4 *f, float4 *fSlow,
              float *fx, float *fy, float *fz, float *fw,
              float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww, int n, sycl::queue myQ)
{
  const int groupItems = 128;
  const int paddedItems = ((n - 1)/groupItems+1)*groupItems;
  const nd_range<3> launch(range<3>(1, 1, paddedItems), range<3>(1, 1, groupItems));

  myQ.submit([&](handler &cgh) {
    cgh.parallel_for(launch, [=](nd_item<3> item_ct1) {
      const int idx = item_ct1.get_global_id(2);
      if (idx >= n) return;   // padding work-item

      f[idx] = float4(fx[idx], fy[idx], fz[idx], fw[idx]);
      if (doSlow) {
        fSlow[idx] = float4(fSlowx[idx], fSlowy[idx], fSlowz[idx], fSloww[idx]);
      }
    });
  });
#ifdef TEST_QWAIT
  myQ.wait();
#endif
}

//
// Prepares buffers for, and launches, the non-bonded force kernel.
//
// Steps performed, in order:
//   1. When no pairlist is being built, upload h_xyzq into tlKernel.get_xyzq().
//   2. Zero the force accumulators (SOA component arrays under FORCES_SOA,
//      otherwise d_forces / d_forcesSlow).
//   3. For streaming objects, (re)size patchNumCount and patchReadyQueue and
//      hand the host force arrays to the kernel.
//   4. Pick the nonbondedForceKernel<...> instantiation that matches
//      doEnergy / doVirial / doSlow / doPairlist / doStreaming and call it.
//   5. Under FORCES_SOA, pack the component arrays into d_forces / d_forcesSlow.
//
// tlKernel         - tile-list kernel object supplying tile lists, coordinates, patches, ...
// atomStorageSize  - number of atoms the force / coordinate arrays are cleared and copied for
// doPairlist, doEnergy, doVirial, doSlow - runtime flags selecting the kernel variant
// lata, latb, latc - lattice vectors forwarded to the kernel
// h_xyzq           - host coordinates/charges, uploaded only when !doPairlist
// cutoff2          - forwarded to the kernel (presumably the squared cutoff - confirm)
// (TEST_ENERGYMATH only) cutoff ... slowScale - used to fill the nonbonded_coef struct below
// d_forces, d_forcesSlow - device force outputs
// h_forces, h_forcesSlow - host force arrays passed to the kernel when streaming
//
void DpcppComputeNonbondedKernel::nonbondedForce(DpcppTileListKernel& tlKernel,
  const int atomStorageSize, const bool doPairlist,
  const bool doEnergy, const bool doVirial, const bool doSlow,
  const float3 lata, const float3 latb, const float3 latc,
  const float4* h_xyzq, const float cutoff2, 
  #ifdef TEST_ENERGYMATH
  const float cutoff, const float cutoff2Inv, const float scutoff2, const float scutoff2Inv, const float scutoff_denom, 
  const float ewaldcof, const float pi_ewaldcof, const float slowScale,
  #endif
  float4* d_forces, float4* d_forcesSlow,
  float4* h_forces, float4* h_forcesSlow) {
//try {

  // Coordinates are uploaded here only on steps that do not build a pairlist.
  if (!doPairlist) copy_HtoD(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, *myQ);

#ifdef FORCES_SOA
//  tlKernel.clearTileListStat(*myQ);
  tlKernel.clearTileListStat();
  // Zero the per-component accumulators; the slow ones only when they will be used.
  clear_device_array<float>(force_x, atomStorageSize, *myQ);
  clear_device_array<float>(force_y, atomStorageSize, *myQ);
  clear_device_array<float>(force_z, atomStorageSize, *myQ);
  clear_device_array<float>(force_w, atomStorageSize, *myQ);
  if (doSlow) {
    clear_device_array<float>(forceSlow_x, atomStorageSize, *myQ);
    clear_device_array<float>(forceSlow_y, atomStorageSize, *myQ);
    clear_device_array<float>(forceSlow_z, atomStorageSize, *myQ);
    clear_device_array<float>(forceSlow_w, atomStorageSize, *myQ);
  }
#ifdef TEST_QWAIT
  myQ->wait();
#endif
#else
  // Array-of-float4 layout: zero the output arrays directly.
  clear_device_array<float4>(d_forces, atomStorageSize, *myQ);
  if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, *myQ);
#endif

  // --- streaming ----
  // These stay NULL / 0 for non-streaming objects and are passed to the kernel as such.
  float4 *m_forces = NULL;
  float4 *m_forcesSlow = NULL;
  int* m_patchReadyQueue = NULL;
  int numPatches = 0;
  unsigned int* patchNumCountPtr = NULL;
  if (doStreaming) {
    numPatches = tlKernel.getNumPatches();
    #ifdef TEST_QWAIT
    myQ->wait();
    #endif
    if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches, *myQ)) {
      // If re-allocated, clear array
      clear_device_array<unsigned int>(patchNumCount, numPatches, *myQ);
    }
    patchNumCountPtr = patchNumCount;
    bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, *myQ);
    if (re) {
      // If re-allocated, re-set to "-1"
      for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
    }
    // The host pointers are handed to the kernel unchanged; the DPCT_ORIG lines
    // show the CUDA calls (cudaHostGetDevicePointer) these assignments replaced.
/* DPCT_ORIG     cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));*/
    m_patchReadyQueue = patchReadyQueue;
/* DPCT_ORIG     cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));*/
    m_forces = h_forces;
/* DPCT_ORIG     cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));*/
    m_forcesSlow = h_forcesSlow;
  }
  // -----------------

  // One virial/energy slot per tile list is requested whenever either quantity is computed.
  if (doVirial || doEnergy) {
    tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
  }

  // NOTE(review): shMemSize is not used in the code visible here - confirm whether
  // the included debug headers rely on it before removing.
  int shMemSize = 0;

  int* outputOrderPtr = tlKernel.getOutputOrder();

  int numTileLists = tlKernel.getNumTileLists();

  // Project debug hook; its contents are not visible from this file.
 #include "array_debug_dpcpp/ac_pre_nonbondedForceKernel.h"
  
  int nwarp = NONBONDKERNEL_NUM_WARP;
  // Work-group size is nwarp sub-groups of AVXSIZE items, capped by the device limit.
  // NOTE(review): if the cap applies, rangeSize below still divides by nwarp - confirm
  // the kernel tolerates a work-group smaller than AVXSIZE*nwarp.
  int workGroupSize = std::min(dpcppDevice->getMaxWorkGroupSize(), AVXSIZE*nwarp);
  int start = 0;
  
  // Plain local copies of the scalar arguments; these are the names used by the
  // CALL macro and by the optional BACKUP/LOAD archive below.
  auto doEnergy1 = doEnergy;
  auto doVirial1 = doVirial; 
  auto doSlow1 = doSlow; 
  auto doPairlist1 = doPairlist; 
  auto numTileLists1 = numTileLists; 
  auto lata1 = lata;
  auto latb1 = latb;
  auto latc1 = latc;
  auto VdwCoefTableWidth1 = dpcppNonbondedTables.getVdwCoefTableWidth();
  auto cutoff21 = cutoff2;
  auto feTableSize1 = dpcppNonbondedTables.getfeTableSize();
  auto vdwTableSize1 = dpcppNonbondedTables.getvdwTableSize();
  auto atomStorageSize1 = atomStorageSize;
  auto plcutoff21 = tlKernel.get_plcutoff2();
  auto numPatches1 = numPatches;

#ifdef TEST_ENERGYMATH
  // Coefficients handed to the kernel in place of the force/energy tables.
  // Each field is a fixed combination of the cutoff / Ewald parameters.
  const float cutoffInv = 1.0f / cutoff;
  nonbonded_coef c;
  c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
  c.lj_1 = scutoff_denom * 2.0f;
  c.lj_2 = scutoff_denom * -12.0f;
  c.lj_3 = 12.0f * scutoff_denom * scutoff2;
  c.lj_4 = cutoff2;
  c.lj_5 = scutoff2;
  c.e_0 = cutoff2Inv * cutoffInv;
  c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);
  c.e_1 = cutoff2Inv;
  c.e_2 = cutoffInv;
  c.ewald_0 = ewaldcof;
  c.ewald_1 = pi_ewaldcof;
  c.ewald_2 = ewaldcof * ewaldcof;
  c.ewald_3_slow = ewaldcof * ewaldcof * ewaldcof * slowScale;
#endif

//  while (start < tlKernel.getNumTileLists())
//  {

    // With the chunking loop disabled, start is always 0 and nleft is the full tile-list count.
    int nleft = tlKernel.getNumTileLists() - start;
    // Global size = ceil(nleft / nwarp) work-groups of workGroupSize items
    // (presumably one tile list per sub-group - confirm against the kernel).
    int rangeSize = ((nleft-1)/nwarp+1)*workGroupSize;

//#define BACKUP
//#define LOAD
#if defined(BACKUP) || defined(LOAD)
  // Debug aid: write (BACKUP) or read back (LOAD) the scalar launch parameters
  // using a Boost text archive, one file per call.
  auto tag = "pre_nonbondedForceKernel_scalars";
  {
    static int counter=0;
    std::string fname = (std::string)AC_ROOT_DIR +"/" + (std::string)(tag)+"_call_"+std::to_string(++counter)+".bin";
#ifdef BACKUP
    std::ofstream ofs(fname);
    boost::archive::text_oarchive archive(ofs);
#elif defined(LOAD)
    std::ifstream ifs(fname);
    boost::archive::text_iarchive archive(ifs);
#endif
    archive & start;
    archive & rangeSize; 
    archive & workGroupSize;
    archive & doEnergy1;
    archive & doVirial1; 
    archive & doSlow1; 
    archive & doPairlist1; 

    archive & numTileLists1;
    archive & VdwCoefTableWidth1;
    archive & lata1;
    archive & latb1;
    archive & latc1;
    archive & cutoff21;
    archive & feTableSize1;
    archive & vdwTableSize1;
    archive & atomStorageSize1;
    archive & plcutoff21;
    archive & numPatches1;
  } 
#endif

// CALL(...) expands to one invocation of nonbondedForceKernel with the five
// compile-time flags given. The two variants differ only in whether the
// coefficient struct (TEST_ENERGYMATH) or the force/energy tables are passed.
// The macro already ends in ';', so each use below yields an extra empty statement.
#ifdef TEST_ENERGYMATH
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
    nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
    (start, numTileLists1, tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
    VdwCoefTableWidth1, dpcppNonbondedTables.getVdwCoefTable(), \
    vdwTypes, lata1, latb1, latc1, tlKernel.get_xyzq(), cutoff21, \
    c, slowScale, \
    /*dpcppNonbondedTables.getForceTable(),  dpcppNonbondedTables.getEnergyTable(), */ \
    /*feTableSize1-1,*/ /* vdwTableSize1-1, */ \
    atomStorageSize1, plcutoff21, tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
    tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
    tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
    force_x, force_y, force_z, force_w, \
    forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
    numPatches1, patchNumCountPtr, tlKernel.getDpcppPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
    outputOrderPtr, tlKernel.getTileListVirialEnergy(), constExclusions, *myQ, rangeSize, workGroupSize);
#else
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
    nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
    (start, numTileLists1, tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
    VdwCoefTableWidth1, dpcppNonbondedTables.getVdwCoefTable(), \
    vdwTypes, lata1, latb1, latc1, tlKernel.get_xyzq(), cutoff21, \
    dpcppNonbondedTables.getForceTable(),  dpcppNonbondedTables.getEnergyTable(), \
    /*feTableSize1-1,*/ /* vdwTableSize1-1, */ \
    atomStorageSize1, plcutoff21, tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
    tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
    tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
    force_x, force_y, force_z, force_w, \
    forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
    numPatches1, patchNumCountPtr, tlKernel.getDpcppPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
    outputOrderPtr, tlKernel.getTileListVirialEnergy(), constExclusions, *myQ, rangeSize, workGroupSize);
#endif



    // Runtime-to-compile-time dispatch: the conditions are mutually exclusive,
    // so exactly one of the 32 instantiations is invoked per call.
    if (doStreaming) {
      if (!doEnergy1 && !doVirial1 && !doSlow1 && !doPairlist1) CALL(0, 0, 0, 0, 1);
      if (!doEnergy1 && !doVirial1 &&  doSlow1 && !doPairlist1) CALL(0, 0, 1, 0, 1);
      if (!doEnergy1 &&  doVirial1 && !doSlow1 && !doPairlist1) CALL(0, 1, 0, 0, 1);
      if (!doEnergy1 &&  doVirial1 &&  doSlow1 && !doPairlist1) CALL(0, 1, 1, 0, 1);
      if ( doEnergy1 && !doVirial1 && !doSlow1 && !doPairlist1) CALL(1, 0, 0, 0, 1);
      if ( doEnergy1 && !doVirial1 &&  doSlow1 && !doPairlist1) CALL(1, 0, 1, 0, 1);
      if ( doEnergy1 &&  doVirial1 && !doSlow1 && !doPairlist1) CALL(1, 1, 0, 0, 1);
      if ( doEnergy1 &&  doVirial1 &&  doSlow1 && !doPairlist1) CALL(1, 1, 1, 0, 1);

      if (!doEnergy1 && !doVirial1 && !doSlow1 &&  doPairlist1) CALL(0, 0, 0, 1, 1);
      if (!doEnergy1 && !doVirial1 &&  doSlow1 &&  doPairlist1) CALL(0, 0, 1, 1, 1);
      if (!doEnergy1 &&  doVirial1 && !doSlow1 &&  doPairlist1) CALL(0, 1, 0, 1, 1);
      if (!doEnergy1 &&  doVirial1 &&  doSlow1 &&  doPairlist1) CALL(0, 1, 1, 1, 1);
      if ( doEnergy1 && !doVirial1 && !doSlow1 &&  doPairlist1) CALL(1, 0, 0, 1, 1);
      if ( doEnergy1 && !doVirial1 &&  doSlow1 &&  doPairlist1) CALL(1, 0, 1, 1, 1);
      if ( doEnergy1 &&  doVirial1 && !doSlow1 &&  doPairlist1) CALL(1, 1, 0, 1, 1);
      if ( doEnergy1 &&  doVirial1 &&  doSlow1 &&  doPairlist1) CALL(1, 1, 1, 1, 1);
    } else {
      if (!doEnergy1 && !doVirial1 && !doSlow1 && !doPairlist1) CALL(0, 0, 0, 0, 0);
      if (!doEnergy1 && !doVirial1 &&  doSlow1 && !doPairlist1) CALL(0, 0, 1, 0, 0);
      if (!doEnergy1 &&  doVirial1 && !doSlow1 && !doPairlist1) CALL(0, 1, 0, 0, 0);
      if (!doEnergy1 &&  doVirial1 &&  doSlow1 && !doPairlist1) CALL(0, 1, 1, 0, 0);
      if ( doEnergy1 && !doVirial1 && !doSlow1 && !doPairlist1) CALL(1, 0, 0, 0, 0);
      if ( doEnergy1 && !doVirial1 &&  doSlow1 && !doPairlist1) CALL(1, 0, 1, 0, 0);
      if ( doEnergy1 &&  doVirial1 && !doSlow1 && !doPairlist1) CALL(1, 1, 0, 0, 0);
      if ( doEnergy1 &&  doVirial1 &&  doSlow1 && !doPairlist1) CALL(1, 1, 1, 0, 0);

      if (!doEnergy1 && !doVirial1 && !doSlow1 &&  doPairlist1) CALL(0, 0, 0, 1, 0);
      if (!doEnergy1 && !doVirial1 &&  doSlow1 &&  doPairlist1) CALL(0, 0, 1, 1, 0);
      if (!doEnergy1 &&  doVirial1 && !doSlow1 &&  doPairlist1) CALL(0, 1, 0, 1, 0);
      if (!doEnergy1 &&  doVirial1 &&  doSlow1 &&  doPairlist1) CALL(0, 1, 1, 1, 0);
      if ( doEnergy1 && !doVirial1 && !doSlow1 &&  doPairlist1) CALL(1, 0, 0, 1, 0);
      if ( doEnergy1 && !doVirial1 &&  doSlow1 &&  doPairlist1) CALL(1, 0, 1, 1, 0);
      if ( doEnergy1 &&  doVirial1 && !doSlow1 &&  doPairlist1) CALL(1, 1, 0, 1, 0);
      if ( doEnergy1 &&  doVirial1 &&  doSlow1 &&  doPairlist1) CALL(1, 1, 1, 1, 0);
    }

#ifdef FORCES_SOA
    {
      // Pack the component arrays into d_forces (and d_forcesSlow when doSlow).
      if (doSlow) 
        transposeForcesKernel<1>(d_forces, d_forcesSlow,
                       force_x, force_y, force_z, force_w,
                       forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
                       atomStorageSize, *myQ);
      else
        transposeForcesKernel<0>(d_forces, d_forcesSlow,
                       force_x, force_y, force_z, force_w,
                       forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
                       atomStorageSize, *myQ);        
    }
#endif

//    start += rangeSize*nwarp/workGroupSize;
//  }


// Project debug hook; its contents are not visible from this file.
#include "array_debug_dpcpp/ac_post_nonbondedForceKernel.h"

}
//catch (exception const &exc) {
//  std::cerr << exc.what() << "EOE at line " << __LINE__ << std::endl;
//  std::exit(1);
//}

//
// Perform virial and energy reductions for non-bonded force calculation
//
// Submits up to three reduction kernels to the queue, all accumulating into
// the single VirialEnergy record at d_virialEnergy:
//   - reduceNonbondedVirialKernel  when doVirial              (over atomStorageSize atoms)
//   - reduceVirialEnergyKernel     when doVirial || doEnergy  (over the tile-list virial/energy array)
//   - reduceGBISEnergyKernel       when doGBIS && doEnergy    (over the GBIS tile-list energies)
// d_virialEnergy is zeroed first whenever energy or virial is requested.
// Nothing here waits on the queue; the wait calls are commented out.
//
// tlKernel               - source of coordinates and per-tile-list virial/energy data
// atomStorageSize        - atom count for the per-atom virial reduction
// doEnergy/doVirial/doSlow/doGBIS - select which reductions run
// d_forces, d_forcesSlow - force arrays read by the per-atom virial kernel
// d_virialEnergy         - output record
void DpcppComputeNonbondedKernel::reduceVirialEnergy(DpcppTileListKernel& tlKernel,
  const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
  float4* d_forces, float4* d_forcesSlow,
  VirialEnergy* d_virialEnergy) {
//try {

  // The kernels below only add to the record, so start from zero.
  if (doEnergy || doVirial) {
    clear_device_array<VirialEnergy>(d_virialEnergy, 1, *myQ);
  }

  if (doVirial)
  {
//    int nthread = std::min(dpcppDevice->getMaxWorkGroupSize(), REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*AVXSIZE);

    // Global size: atomStorageSize rounded up to a whole number of work-groups.
    int workGroupSize = std::min(dpcppDevice->getMaxWorkGroupSize(), REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*AVXSIZE);
    int rangeSize = ((atomStorageSize-1)/workGroupSize+1)*workGroupSize;

    // Fetched into a local so the kernel lambda captures a plain pointer by value.
    auto tlKernel_get_xyzq =tlKernel.get_xyzq(); 
#include "array_debug_dpcpp/ac_pre_reducenonbondedvirialkernel.h"

    // Ignore args:
    // compared before NBF calc kernel tlKernel.get_xyzq()
    // compared after NBF calc kernel: d_forces, d_forcesSlow
    {
      myQ->submit(
        [&](handler &cgh) {
          cgh.parallel_for<class reduceNonbondedVirialKernel_f2dab6>(
            nd_range<3>(range<3>(1, 1, rangeSize), range<3>(1, 1, workGroupSize)),
            [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(AVXSIZE)]] {
              reduceNonbondedVirialKernel(doSlow, atomStorageSize, tlKernel_get_xyzq, d_forces, d_forcesSlow, d_virialEnergy, item_ct1);
            });
        });
//  myQ->wait();
    }
#include "array_debug_dpcpp/ac_post_reducenonbondedvirialkernel.h"
  }

  if (doVirial || doEnergy)
  {
    // Global size: tile-list virial/energy length rounded up to whole work-groups.
    int workGroupSize = std::min(dpcppDevice->getMaxWorkGroupSize(), REDUCEVIRIALENERGYKERNEL_NUM_WARP*AVXSIZE); 
    int rangeSize = ((tlKernel.getTileListVirialEnergyLength()-1)/workGroupSize+1)*workGroupSize;

    auto tlKernel_getTileListVirialEnergyLength = tlKernel.getTileListVirialEnergyLength();
    // A zero length passes NULL instead of querying the array pointer.
    auto tlKernel_getTileListVirialEnergy = (tlKernel_getTileListVirialEnergyLength) ? tlKernel.getTileListVirialEnergy(): NULL;

#include "array_debug_dpcpp/ac_pre_reducevirialenergykernel.h"
    {
      myQ->submit(
        [&](handler &cgh) {
          cgh.parallel_for<class reduceVirialEnergyKernel_24746d>(
            nd_range<3>(range<3>(1, 1, rangeSize), range<3>(1, 1, workGroupSize)),
            [=](nd_item<3> item_ct1) [[intel::reqd_sub_group_size(AVXSIZE)]] {
              reduceVirialEnergyKernel(doEnergy, doVirial, doSlow, tlKernel_getTileListVirialEnergyLength, tlKernel_getTileListVirialEnergy, d_virialEnergy, item_ct1);
            });
        });
//  myQ->wait();
    }
#include "array_debug_dpcpp/ac_post_reducevirialenergykernel.h"

  }  

  if (doGBIS && doEnergy)
  {
//    int nthread = std::min(dpcppDevice->getMaxWorkGroupSize(), REDUCEGBISENERGYKERNEL_NUM_WARP*AVXSIZE);
//    int nblock = (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1;

    // Global size: GBIS tile-list length rounded up to whole work-groups.
    int workGroupSize = std::min(dpcppDevice->getMaxWorkGroupSize(), REDUCEGBISENERGYKERNEL_NUM_WARP*AVXSIZE);
    int rangeSize = ((tlKernel.getTileListVirialEnergyGBISLength()-1)/workGroupSize+1)*workGroupSize;

    auto tlKernel_getTileListVirialEnergyGBISLength = tlKernel.getTileListVirialEnergyGBISLength();
    auto tlKernel_getTileListVirialEnergy = tlKernel.getTileListVirialEnergy();
    {
      // Unlike the two launches above, this one carries no reqd_sub_group_size attribute.
      myQ->submit(
        [&](handler &cgh) {
          cgh.parallel_for<class reduceGBISEnergyKernel_b32de7>(
            nd_range<3>(range<3>(1, 1, rangeSize), range<3>(1, 1, workGroupSize)),
            [=](nd_item<3> item_ct1) {
              reduceGBISEnergyKernel(tlKernel_getTileListVirialEnergyGBISLength, tlKernel_getTileListVirialEnergy, d_virialEnergy, item_ct1);
            });
        });
    }
  }

}
//catch (exception const &exc) {
//  std::cerr << exc.what() << "EOE at line " << __LINE__ << std::endl;
//  std::exit(1);
//}

//
// Uploads the exclusion bit array to the device.
//
// numExclusions  - number of entries in exclusion_bits
// exclusion_bits - host array of exclusion bit words
//
// The first min(numExclusions, MAX_CONST_EXCLUSIONS) entries are copied into
// constExclusions; the complete array is copied into overflowExclusions with
// the synchronous copy helper.
//
void DpcppComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {

  int nconst = MAX_CONST_EXCLUSIONS;
  if (numExclusions < MAX_CONST_EXCLUSIONS) nconst = numExclusions;
  // TODO: Copy the data from exclusion_bits to the global host array

  // NOTE(review): constExclusions is allocated afresh on every call without
  // releasing a previous buffer - confirm this method is only called once per object.
  allocate_device<unsigned int>(&constExclusions, nconst, *myQ);
  copy_HtoD<unsigned int>(exclusion_bits, constExclusions, nconst, *myQ);

  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions, *myQ);
  copy_HtoD_sync(exclusion_bits, overflowExclusions, numExclusions, *myQ);
}

#endif // NAMD_DPCPP
