#ifndef DPCPPUTILS_H
#define DPCPPUTILS_H

#include <stdio.h>
#include "DpcppDevice.h"
#include "NamdTypes.h"


#ifdef NAMD_DPCPP
#include <CL/sycl.hpp>

#include "array_debug_dpcpp_util.h"

#define AVXSIZE 32

// #define TEST_QWAIT
#define TEST_ASYNCEXSCAN
#define TEST_ASYNSORT
// #define TEST_ESIMDSORT


#define TEST_CASEMU
#define TEST_ATOMIC_FENCE
// #define TEST_SLMENERGYTABLE
// #define TEST_BARRIERLOCAL
// #define TEST_ENERGYMATH
// #define TEST_PRELOADL0L1

#define APPROX_PME

#define FETABLESIZE 4096
// CL_CONSTANT: when compiling the device pass (__SYCL_DEVICE_ONLY__ defined) it
// expands to the opencl_constant address-space attribute; on the host pass it
// expands to nothing.
#ifdef __SYCL_DEVICE_ONLY__
  #define CL_CONSTANT __attribute__((opencl_constant))
#else
  #define CL_CONSTANT
#endif

// PRINTF(format, ...): printf helper usable from kernels.
// The format string is first copied into a static CL_CONSTANT char array and
// then passed, together with the variadic arguments, to
// sycl::ext::oneapi::experimental::printf.
//  - `format` must be a string literal (it initializes a char array).
//  - `## __VA_ARGS__` is the GNU extension that drops the comma when no
//    arguments are given.
//  - The macro expands to a plain braced block (not do/while(0)), so
//    `if (c) PRINTF(...); else ...` does not parse; wrap the call in braces.
#define PRINTF(format, ...) { \
                            static const CL_CONSTANT char _format[] = format; \
                            sycl::ext::oneapi::experimental::printf(_format, ## __VA_ARGS__); \
                            }


// #define ATOMIC_ADD(T, data, val)  (sycl::ext::oneapi::atomic_ref< T, sycl::ext::oneapi::memory_order::relaxed, sycl::ext::oneapi::memory_scope::device, sycl::access::address_space::global_space>((data)) += (val));

// ATOMIC_ADD(T, data, val): atomically performs `data += val` through a
// sycl::atomic_ref<T> with relaxed memory order, device scope and the global
// address space. `data` must be an lvalue of type T (atomic_ref binds a reference).
// NOTE: the expansion already ends in ';', so the macro can only be used as a
// statement, and `if (c) ATOMIC_ADD(...); else ...` will not parse.
#define ATOMIC_ADD(T, data, val)  (sycl::atomic_ref< T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>((data)) += (val));
//template <class T> 
//NAMD_INLINE void atomicAddNAMD(T* data, T val) {sycl::ext::oneapi::atomic_ref< T, sycl::ext::oneapi::memory_order::relaxed, sycl::ext::oneapi::memory_scope::device, sycl::access::address_space::global_space>(data) += val;}

// Atomic helpers built on sycl::atomic_ref<T> with relaxed memory order, device
// scope and the global address space. `data` must be an lvalue of type T.
//  - ATOMIC_FETCH_ADD / ATOMIC_FETCH_SUB call fetch_add / fetch_sub. Their
//    expansions end in ';', so `x = ATOMIC_FETCH_ADD(...);` yields an extra
//    empty statement and the macros cannot be nested inside a larger expression.
//  - ATOMIC_EXCHANGE is, despite its name, a compare_exchange_weak(expected,
//    desired): it may fail spuriously, `expected` must be an lvalue, and the
//    expansion has no trailing ';'.
//  - `data`, `val`, `expected` and `desired` are not parenthesized in the expansions.
#define ATOMIC_FETCH_ADD(T, data, val)  sycl::atomic_ref< T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>(data).fetch_add(val);
#define ATOMIC_FETCH_SUB(T, data, val)  sycl::atomic_ref< T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>(data).fetch_sub(val);
#define ATOMIC_EXCHANGE(T, data, expected, desired)  sycl::atomic_ref< T, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>(data).compare_exchange_weak(expected, desired)

// #define ATOMIC_FETCH_ADD(T, data, val)  sycl::ext::oneapi::atomic_ref< T, sycl::ext::oneapi::memory_order::relaxed, sycl::ext::oneapi::memory_scope::device, sycl::access::address_space::global_space>(data).fetch_add(val);
// #define ATOMIC_FETCH_SUB(T, data, val)  sycl::ext::oneapi::atomic_ref< T, sycl::ext::oneapi::memory_order::relaxed, sycl::ext::oneapi::memory_scope::device, sycl::access::address_space::global_space>(data).fetch_sub(val);
// #define ATOMIC_EXCHANGE(T, data, expected, desired)  sycl::ext::oneapi::atomic_ref< T, sycl::ext::oneapi::memory_order::relaxed, sycl::ext::oneapi::memory_scope::device, sycl::access::address_space::global_space>(data).compare_exchange_weak(expected, desired)

// ATOMIC_ADD_FLOAT(o, v): atomically adds v to the float lvalue o.
// Without ATS_ATOMICS this is simply ATOMIC_ADD(float, o, v).
#ifndef ATS_ATOMICS
#define ATOMIC_ADD_FLOAT(o, v) ATOMIC_ADD(float, o, v)

#else
// With ATS_ATOMICS, the device pass calls an atomic_fetch_add that is only
// declared here (SYCL_EXTERNAL; its definition is not in this header), passing
// the address of `o` cast to a global-address-space _Atomic float pointer.
#ifdef __SYCL_DEVICE_ONLY__
SYCL_EXTERNAL float atomic_fetch_add(volatile __attribute__((opencl_global)) _Atomic float *p, float val);
#define ATOMIC_ADD_FLOAT(o, v) \
  atomic_fetch_add((volatile __attribute__((opencl_global)) _Atomic float *)(&o), v)
#else
// Host pass: this atomic_fetch_add is a placeholder that returns val and does
// not modify *p. ATOMIC_ADD_FLOAT itself does not use it here; it falls back to
// the atomic_ref based ATOMIC_ADD.
inline float atomic_fetch_add(volatile _Atomic float *p, float val) { return val; };
#define ATOMIC_ADD_FLOAT(o, v) ATOMIC_ADD(float, o, v)
#endif

#endif // ATS_ATOMICS

// Masked, segmented sub-group shuffle.
//
// The sub-group is treated as consecutive partitions of `width` lanes. Each
// lane returns the value of `var` held by another lane of its own partition:
//   - if bit <lane id> of `mask` is set, the lane reads from lane
//     (srcLane % width) of its partition;
//   - otherwise it reads from its own lane, i.e. it gets its own `var` back.
//
// sg      = sub-group executing the call
// mask    = per-lane participation bits (bit i corresponds to lane i)
// var     = value contributed by the calling lane
// srcLane = requested source lane, taken modulo width
// width   = partition size in lanes; must be non-zero (used as a divisor)
//
// The lane bit is built as an unsigned long so that it has the same type as
// `mask`. The previous `0x1 << id` was an int shift: for lane 31 the result is
// negative and sign-extends when widened, so bits 32..63 of `mask` leaked into
// the test, and larger lane ids shifted past the width of int.
template <typename T>
NAMD_INLINE T warp_shuffle(const sycl::ext::oneapi::sub_group& sg, unsigned long mask,
                                  T var, int srcLane, int width) {
  const int id = sg.get_local_id();
  const int partition_id = id / width;
  const bool laneSelected = (mask & (1UL << id)) != 0;
  srcLane = laneSelected ? (srcLane % width) : (id % width);
  srcLane += partition_id * width;
  return sg.shuffle(var, srcLane);
}
// Sub-group ballot: returns a bit mask in which bit i is set when lane i of
// the sub-group passed a non-zero `pred`.
//
// Every lane contributes either 0 or its own single bit, and the contributions
// are combined with a plus-reduction over the sub-group. Because the bits of
// different lanes never overlap, the sum equals the bitwise OR.
//
// The lane bit is built with an unsigned long shift. The previous `0x1 << id`
// was an int shift, so lane 31 produced a negative int that sign-extended to
// 0xFFFFFFFF80000000 when stored in a 64-bit unsigned long, corrupting the
// upper half of the returned ballot.
NAMD_INLINE unsigned long warp_ballot(const sycl::ext::oneapi::sub_group& sg, int pred) {
  const int lane = (int)sg.get_local_id();
  const unsigned long vote = (pred != 0) ? (1UL << lane) : 0UL;
  return sycl::reduce_over_group(sg, vote, sycl::ext::oneapi::plus<>());
  // return sycl::ext::oneapi::reduce(sg, vote, sycl::ext::oneapi::plus<>());
}

// Macro spellings of the sub-group helpers; they forward their arguments
// unchanged to warp_shuffle() and warp_ballot().
#define WARP_SHUFFLE_MASK(SG, MASK, VAR, LANE, WIDTH) \
  warp_shuffle(SG, MASK, VAR, LANE, WIDTH)
#define WARP_BALLOT(SG, P)       warp_ballot(SG, P)

// Alias of ATOMIC_ADD_FLOAT. NOTE: when ATOMIC_ADD_FLOAT maps to ATOMIC_ADD the
// expansion ends in ';' and is statement-only, so despite the "FETCH" in the
// name the previous value cannot be relied on as a result.
#define ATOMIC_FETCH_ADD_FLOAT(o, v) ATOMIC_ADD_FLOAT(o, v)

/*
NAMD_INLINE sycl::float4 linear_interp_cudata_float4(const sycl::float4 *__restrict__ mytable, const float rinv, const int tsize000)
{
    const int tsize=4096;
    const float table_rinv = rinv > 1.f ? 1.f : rinv;
    const float table_f = tsize * table_rinv;
    const int table_i = (int) table_f;
    const float table_diff = table_f - table_i;

    return mytable[table_i] + (mytable[table_i + 1] - mytable[table_i]) * table_diff;
*/
// Linear interpolation in a float4 lookup table.
//
// tex = table base pointer
// k   = lookup coordinate; it is scaled by the table size (4096) and shifted
//       down by 0.5 before being split into an integer cell and a fraction
//
// Returns tex[lo] + frac * (tex[lo + 1] - tex[lo]), where lo is the cell index
// clamped to at most 4095. Since lo + 1 can therefore be 4096, `tex` must hold
// at least 4097 entries.
// NOTE(review): for k * 4096 < 0.5 the floor is negative and is converted to
// unsigned int; the result of that conversion is not guaranteed by the
// language - confirm callers never pass such k, or that the target clamps.
NAMD_INLINE sycl::float4 linear_interp_cudata_float4(const sycl::float4* __restrict__ tex, const float k) {
  const int tableSize = 4096;

  // Split the scaled coordinate into whole cell and fractional remainder.
  const float scaled = k * (float)tableSize - 0.5f;
  const float cell = floorf(scaled);
  const float frac = scaled - cell;
  const unsigned int cellIndex = (unsigned int)cell;

  // Clamp the lower sample index (the comparison is done in unsigned arithmetic).
  int lo = tableSize - 1;
  if (cellIndex < tableSize - 1) {
    lo = cellIndex;
  }
  const int hi = lo + 1;

  const sycl::float4 lower = tex[lo];
  const sycl::float4 upper = tex[hi];
  return frac * (upper - lower) + lower;
}

// Error-reporting entry point; declared here, defined elsewhere.
// msg = message text, err = optional error code (defaults to 0).
// NOTE(review): presumably reports the message and terminates - confirm
// against the definition.
void dpcppDie(const char *msg, int err=0);

// Internal-error reporting entry point; declared here, defined elsewhere.
// msg = message text.
// NOTE(review): presumably used for "should never happen" conditions - confirm
// against the definition.
void dpcppNAMD_bug(const char *msg);

//
// Zero-fills an array
// data = memory pointer
// ndata = number of elements of T to clear
// Enqueues a memset on myQ and returns without waiting for it to finish.
//
template <class T>
inline void clear_device_array(T *data, const int ndata, sycl::queue &myQ) {
  const size_t numBytes = sizeof(T) * ndata;
  myQ.memset(static_cast<void *>(data), 0, numBytes);
}

//----------------------------------------------------------------------------------------
//
// Allocate host memory through malloc_host() on the given queue
// pp = address of the pointer that receives the allocation
// len = length of the array (number of elements of T)
//
// The result is stored with a typed assignment. Writing it through a
// (void **) view of pp, as done previously, accessed a T* object as a void*
// and so violated the strict-aliasing rule.
// The result of malloc_host() is stored unchecked; callers that need to detect
// allocation failure must test *pp themselves.
//
template <class T>
inline void allocate_host(T **pp, const int len, sycl::queue &myQ) {
  *pp = static_cast<T *>(malloc_host(sizeof(T)*len, myQ));
}


//----------------------------------------------------------------------------------------
//
// Allocate gpu memory through malloc_device() on the given queue
// pp = address of the pointer that receives the allocation
// len = length of the array (number of elements of T)
//
// The result is stored with a typed assignment. Writing it through a
// (void **) view of pp, as done previously, accessed a T* object as a void*
// and so violated the strict-aliasing rule.
// The result of malloc_device() is stored unchecked; callers that need to
// detect allocation failure must test *pp themselves.
//
template <class T>
inline void allocate_device(T **pp, const int len, sycl::queue &myQ) {
  *pp = static_cast<T *>(malloc_device(sizeof(T)*len, myQ));
}

//----------------------------------------------------------------------------------------
//
// Deallocate gpu memory
// pp = address of the memory pointer
//
// Does nothing when *pp is already NULL; otherwise releases the allocation
// with free(ptr, myQ) and resets *pp to NULL.
//
template <class T>
inline void deallocate_device(T **pp, sycl::queue &myQ) {
  if (*pp == NULL) {
    return;
  }
  free((void *)(*pp), myQ);
  *pp = NULL;
}
//----------------------------------------------------------------------------------------

// Type-erased worker behind reallocate_device(); declared here, defined elsewhere.
// sizeofT = size in bytes of one array element; the remaining parameters are
// the ones documented on reallocate_device().
bool reallocate_device_T(void **pp, int *curlen, const int newlen, sycl::queue &myQ, const float fac, const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate device memory
// pp = address of the memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
//
// Typed front end: forwards to reallocate_device_T() with the pointer viewed
// as void** and sizeof(T) as the element size.
//
// returns true if reallocation happened
//
template <class T>
bool reallocate_device(T **pp, int *curlen, const int newlen, sycl::queue &myQ, const float fac=1.0f) {
  void **untypedSlot = (void **)pp;
  const size_t elementBytes = sizeof(T);
  return reallocate_device_T(untypedSlot, curlen, newlen, myQ, fac, elementBytes);
}
//----------------------------------------------------------------------------------------
// Type-erased worker behind reallocate_host(); declared here, defined elsewhere.
// sizeofT = size in bytes of one array element; the remaining parameters are
// the ones documented on reallocate_host().
bool reallocate_host_T(void **pp, int *curlen, const int newlen, sycl::queue &myQ, const float fac, 
		       const size_t sizeofT);
//----------------------------------------------------------------------------------------
//
// Allocate & re-allocate host memory
// pp = address of the memory pointer
// curlen = current length of the array
// newlen = new required length of the array
// fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
//
// Typed front end: forwards to reallocate_host_T() with the pointer viewed as
// void** and sizeof(T) as the element size. This function takes no
// allocation-type flag.
//
// returns true if reallocation happened
//
template <class T>
bool reallocate_host(T **pp, int *curlen, const int newlen, sycl::queue &myQ,
		     const float fac=1.0f) {
  void **untypedSlot = (void **)pp;
  const size_t elementBytes = sizeof(T);
  return reallocate_host_T(untypedSlot, curlen, newlen, myQ, fac, elementBytes);
}

//----------------------------------------------------------------------------------------
//
// Deallocate host memory
// pp = address of the memory pointer
//
// Does nothing when *pp is already NULL; otherwise releases the allocation
// with free(ptr, myQ) and resets *pp to NULL.
//
template <class T>
inline void deallocate_host(T **pp, sycl::queue &myQ) {
  if (*pp == NULL) {
    return;
  }
  free((void *)(*pp), myQ);
  *pp = NULL;
}
//
// Copies memory Host -> Device
// h_array = source, d_array = destination, array_len = number of elements of T
// Enqueues the memcpy on myQ and returns without waiting for it to finish.
//
template <class T>
inline void copy_HtoD(const T *h_array, T *d_array, int array_len, sycl::queue &myQ) {
  const size_t numBytes = sizeof(T) * array_len;
  myQ.memcpy(static_cast<void *>(d_array), static_cast<const void *>(h_array), numBytes);
}

//----------------------------------------------------------------------------------------
//
// Copies memory Host -> Device using synchronous calls
//
template <class T>
inline void copy_HtoD_sync(const T *h_array, T *d_array, int array_len, sycl::queue &myQ) {
  myQ.memcpy((void*)(d_array), (void*)(h_array), sizeof(T)*array_len).wait();
}

//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host
// d_array = source, h_array = destination, array_len = number of elements of T
// Enqueues the memcpy on myQ and returns without waiting for it to finish.
//
template <class T>
inline void copy_DtoH(const T *d_array, T *h_array, const int array_len, sycl::queue &myQ) {
  const size_t numBytes = sizeof(T) * array_len;
  myQ.memcpy(static_cast<void *>(h_array), static_cast<const void *>(d_array), numBytes);
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Host using synchronous calls
//
template <class T>
inline void copy_DtoH_sync(const T *d_array, T *h_array, const int array_len, sycl::queue &myQ) {
  myQ.memcpy((void*)(h_array), (void*)(d_array), sizeof(T)*array_len).wait();
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device
// d_src = source, d_dst = destination, array_len = number of elements of T
// Enqueues the memcpy on myQ and returns without waiting for it to finish.
//
template <class T>
inline void copy_DtoD(const T *d_src, T *d_dst, const int array_len, sycl::queue &myQ) {
  const size_t numBytes = sizeof(T) * array_len;
  myQ.memcpy(static_cast<void *>(d_dst), static_cast<const void *>(d_src), numBytes);
}
//----------------------------------------------------------------------------------------
//
// Copies memory Device -> Device using synchronous calls
//
template <class T>
inline void copy_DtoD_sync(const T *d_src, T *d_dst, const int array_len, sycl::queue &myQ) {
  myQ.memcpy((void*)(d_dst), (void*)(d_src), sizeof(T)*array_len).wait();
}

//----------------------------------------------------------------------------------------
//
// Copies memory between two peer devices Device -> Device
//
// Type-erased declaration; the definition is not in this header, and the
// copy_PeerDtoD() template in this header does not call it.
// src_dev / dst_dev = device identifiers, array_len = number of elements,
// sizeofT = size in bytes of one element.
// NOTE(review): the "async" in the name suggests the copy is not waited on -
// confirm against the definition.
//
void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
  const void *d_src, void *d_dst, const int array_len, sycl::queue &myQ,
  const size_t sizeofT);

// Peer Device -> Device copy: not implemented yet.
// The assert below always fails, so builds with assertions enabled abort on
// entry. When assertions are compiled out (NDEBUG) the function falls through
// to a plain memcpy on myQ that ignores src_dev and dst_dev and is not waited on.
// NOTE: assert() is used here but this header does not include <cassert> itself.
template <class T>
inline void copy_PeerDtoD(const int src_dev, const int dst_dev,
  const T *d_src, T *d_dst, const int array_len, sycl::queue &myQ) {
  // Deliberate hard stop until the peer-copy cases in the TODO are handled.
  assert(0==1);
  //TODO: Need to handle case of copy within a tile (subdevices),
  // between tiles, and between GPUs within a node
  myQ.memcpy((void*)(d_dst), (void*)(d_src), sizeof(T)*array_len);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Host -> Device
//
// Type-erased declaration; the definition is not in this header.
// sizeofT = size in bytes of one element.
// NOTE(review): presumably (x0, y0, z0) are the starting element offsets in
// each buffer, xsize/ysize the allocated x and y extents of each buffer, and
// width/height/depth the extent of the copied block in elements - confirm
// against the definition.
//
void copy3D_HtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, sycl::queue &myQ);

// Typed front end for copy3D_HtoD_T(): passes every argument through unchanged
// and supplies sizeof(T) as the element size.
template <class T>
void copy3D_HtoD(T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  sycl::queue &myQ) {
  const size_t elementBytes = sizeof(T);
  copy3D_HtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0, src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0, dst_xsize, dst_ysize,
                width, height, depth,
                elementBytes, myQ);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Host
//
// Type-erased declaration; the definition is not in this header.
// sizeofT = size in bytes of one element.
// NOTE(review): presumably (x0, y0, z0) are the starting element offsets in
// each buffer, xsize/ysize the allocated x and y extents of each buffer, and
// width/height/depth the extent of the copied block in elements - confirm
// against the definition.
//
void copy3D_DtoH_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, sycl::queue &myQ);

// Typed front end for copy3D_DtoH_T(): passes every argument through unchanged
// and supplies sizeof(T) as the element size.
template <class T>
void copy3D_DtoH(T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  sycl::queue &myQ) {
  const size_t elementBytes = sizeof(T);
  copy3D_DtoH_T(src_data, dst_data,
                src_x0, src_y0, src_z0, src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0, dst_xsize, dst_ysize,
                width, height, depth,
                elementBytes, myQ);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block Device -> Device
//
// Type-erased declaration; the definition is not in this header.
// sizeofT = size in bytes of one element.
// NOTE(review): presumably (x0, y0, z0) are the starting element offsets in
// each buffer, xsize/ysize the allocated x and y extents of each buffer, and
// width/height/depth the extent of the copied block in elements - confirm
// against the definition.
//
void copy3D_DtoD_T(void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, sycl::queue &myQ);

// Typed front end for copy3D_DtoD_T(): passes every argument through unchanged
// and supplies sizeof(T) as the element size.
template <class T>
void copy3D_DtoD(T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  sycl::queue &myQ) {
  const size_t elementBytes = sizeof(T);
  copy3D_DtoD_T(src_data, dst_data,
                src_x0, src_y0, src_z0, src_xsize, src_ysize,
                dst_x0, dst_y0, dst_z0, dst_xsize, dst_ysize,
                width, height, depth,
                elementBytes, myQ);
}

//----------------------------------------------------------------------------------------
//
// Copies 3D memory block between two peer devices Device -> Device
//
// Type-erased declaration; the definition is not in this header.
// src_dev / dst_dev = device identifiers, sizeofT = size in bytes of one element.
// NOTE(review): presumably (x0, y0, z0) are the starting element offsets in
// each buffer, xsize/ysize the allocated x and y extents of each buffer, and
// width/height/depth the extent of the copied block in elements - confirm
// against the definition.
//
void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
  void* src_data, void* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  size_t sizeofT, sycl::queue &myQ);

// Typed front end for copy3D_PeerDtoD_T(): passes every argument through
// unchanged and supplies sizeof(T) as the element size.
template <class T>
void copy3D_PeerDtoD(int src_dev, int dst_dev,
  T* src_data, T* dst_data,
  int src_x0, int src_y0, int src_z0,
  size_t src_xsize, size_t src_ysize,
  int dst_x0, int dst_y0, int dst_z0,
  size_t dst_xsize, size_t dst_ysize,
  size_t width, size_t height, size_t depth,
  sycl::queue &myQ) {
  const size_t elementBytes = sizeof(T);
  copy3D_PeerDtoD_T(src_dev, dst_dev, src_data, dst_data,
                    src_x0, src_y0, src_z0, src_xsize, src_ysize,
                    dst_x0, dst_y0, dst_z0, dst_xsize, dst_ysize,
                    width, height, depth,
                    elementBytes, myQ);
}
#endif // NAMD_DPCPP

#endif // DPCPPUTILS_H
