/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2009 John E. Stone
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: WKFThreads.h,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.2 $       $Date: 2009/09/16 04:29:54 $
 *
 ***************************************************************************/
/*
 * Copyright (c) 1994-2009 John E. Stone
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef WKF_THREADS_INC
#define WKF_THREADS_INC 1

/*
 * POSIX Threads: unless the build already defined USEPOSIXTHREADS,
 * turn it on automatically for the platforms listed below.
 */
#ifndef USEPOSIXTHREADS
#if defined(__hpux) || defined(__irix) || defined(__linux) || \
    defined(_CRAY) || defined(__osf__) || defined(_AIX) || \
    defined(__APPLE__) || defined(__sun)
#define USEPOSIXTHREADS
#endif
#endif

/*
 * Native thread, mutex, and condition variable types, declared only
 * when the build defines WKFTHREADS.  One backend must be selected:
 * POSIX threads (USEPOSIXTHREADS) or the Win32 API (_MSC_VER).
 */
#ifdef WKFTHREADS
#ifdef USEPOSIXTHREADS
#include <pthread.h>

typedef pthread_t        wkf_thread_t;
typedef pthread_mutex_t   wkf_mutex_t;
typedef pthread_cond_t     wkf_cond_t;
#endif


#ifdef _MSC_VER
#include <windows.h>
typedef HANDLE wkf_thread_t;
typedef CRITICAL_SECTION wkf_mutex_t;

#if 0 && (NTDDI_VERSION >= NTDDI_WS08 || _WIN32_WINNT > 0x0600) 
/* Use native condition variables only with Windows Server 2008 and newer... */
/* NOTE: the leading "0 &&" keeps this branch disabled, so the emulated      */
/* condition variable below is always the one that gets compiled.            */
#define WKFUSEWIN2008CONDVARS 1
typedef  CONDITION_VARIABLE wkf_cond_t;
#else
/* Every version of Windows prior to Vista/WS2008 must emulate */
/* variables using manually resettable events or other schemes */

/* For higher performance, use interlocked memory operations   */
/* rather than locking/unlocking mutexes when manipulating     */
/* internal state.                                             */
#if 1
#define WKFUSEINTERLOCKEDATOMICOPS 1
#endif 
/* presumably indices into wkf_cond_t.events[] -- confirm in the .c file */
#define WKF_COND_SIGNAL    0
#define WKF_COND_BROADCAST 1
typedef struct {
  LONG waiters;     /**< XXX this _MUST_ be 32-bit aligned for correct
                         operation with the InterlockedXXX() APIs      */
  CRITICAL_SECTION waiters_lock; /**< presumably guards 'waiters'; confirm */
  HANDLE events[2]; /**< Signal and broadcast event HANDLEs. */
} wkf_cond_t;
#endif

typedef HANDLE wkf_barrier_t; /**< Not implemented for Windows */
#endif

/* Without a backend none of the wkf_*_t types exist and the rest of  */
/* this header cannot compile; fail here with a readable diagnostic   */
/* instead of a cascade of "unknown type name" errors further down.   */
#if !defined(USEPOSIXTHREADS) && !defined(_MSC_VER)
#error "WKFTHREADS is defined but no thread backend is available: define USEPOSIXTHREADS or build with an MSVC-compatible compiler"
#endif
#endif


/* Single-threaded build: with WKFTHREADS undefined, the thread, mutex, */
/* and condition variable handles are all plain int placeholders.       */
#if !defined(WKFTHREADS)
typedef int wkf_thread_t, wkf_mutex_t, wkf_cond_t;
#endif


/*
 * Barrier state passed to the wkf_thread_run_barrier*() routines.
 * The int arrays at both ends of the struct are padding only.
 */
typedef struct wkf_run_barrier_struct {
  int padding1[8]; /* Leading pad (8 ints) to avoid false sharing and cache aliasing */
  wkf_mutex_t lock;       /**< Mutex lock for the structure */
  int n_clients;          /**< Number of threads to wait for at barrier */
  int n_waiting;          /**< Number of currently waiting threads */
  int phase;              /**< Flag to separate waiters from fast workers */
  void * (*fctn)(void *); /**< Function pointer to call, or NULL if done */
  void * parms;           /**< Parameters for the fctn pointer */
  void * (*rslt)(void *); /**< Function pointer to return to barrier wait callers */
  void * rsltparms;       /**< Parameters to return to barrier wait callers */
  wkf_cond_t wait_cv;     /**< Clients wait on condition variable to proceed */
  int padding2[8]; /* Trailing pad (8 ints) to avoid false sharing and cache aliasing */
} wkf_run_barrier_t;


#ifdef __cplusplus
extern "C" {
#endif

/* count of physical processors available */
int wkf_thread_numphysprocessors(void);

/* count of processors available, subject to user override */
int wkf_thread_numprocessors(void);

/* query the CPU affinity of the WKF process (if allowed by host system); */
/* cpuaffinitycount is a pointer argument, presumably receiving the       */
/* length of the returned list -- confirm against the implementation      */
int *wkf_cpu_affinitylist(int *cpuaffinitycount);

/* bind the current thread to the given CPU (if allowed by host system) */
int wkf_thread_set_self_cpuaffinity(int cpu);

/* set the concurrency level and scheduling scope for threads */
int wkf_thread_setconcurrency(int nthr);


/*
 * Thread creation and joining
 */
/* create a thread; thr is the handle to fill in, and fctn/arg are */
/* presumably the entry point and its argument -- confirm in the   */
/* implementation                                                  */
int wkf_thread_create(wkf_thread_t *thr, void *fctn(void *), void *arg);

/* join (wait for completion of, and merge with) the thread thr */
int wkf_thread_join(wkf_thread_t thr, void **stat);


/*
 * Mutex operations; each one takes a pointer to the mutex it acts on
 */
/* set up a mutex */
int wkf_mutex_init(wkf_mutex_t *mtx);

/* acquire a mutex */
int wkf_mutex_lock(wkf_mutex_t *mtx);

/* attempt to acquire a mutex */
int wkf_mutex_trylock(wkf_mutex_t *mtx);

/* acquire a mutex by spinning only */
int wkf_mutex_spin_lock(wkf_mutex_t *mtx);

/* release a mutex */
int wkf_mutex_unlock(wkf_mutex_t *mtx);

/* tear down a mutex */
int wkf_mutex_destroy(wkf_mutex_t *mtx);


/*
 * Condition variable operations
 */
/* set up a condition variable */
int wkf_cond_init(wkf_cond_t *cvp);

/* tear down a condition variable */
int wkf_cond_destroy(wkf_cond_t *cvp);

/* wait on a condition variable; the mutex is passed alongside it */
int wkf_cond_wait(wkf_cond_t *cvp, wkf_mutex_t *mtx);

/* signal a condition variable, waking at least one thread */
int wkf_cond_signal(wkf_cond_t *cvp);

/* signal a condition variable, waking all threads */
int wkf_cond_broadcast(wkf_cond_t *cvp);


/*
 * This is a symmetric barrier routine designed to be used
 * in implementing a sleepable thread pool.
 */
/* initialize a run barrier; n_clients is the number of threads */
/* the barrier waits for                                        */
int wkf_thread_run_barrier_init(wkf_run_barrier_t *barrier, int n_clients);
/* destroy a run barrier */
void wkf_thread_run_barrier_destroy(wkf_run_barrier_t *barrier);

/* sleeping barrier synchronization                                  */
/* Reading the declarator: wkf_thread_run_barrier() takes (barrier,  */
/* fctn, parms, rsltparms) and returns a pointer to a function of    */
/* type 'void *(void *)'.  fctn and parms are a function and its     */
/* parameter pointer; rsltparms is a void ** argument, presumably    */
/* where the parms for the returned function are stored -- confirm   */
/* against the implementation.                                       */
void * (*wkf_thread_run_barrier(wkf_run_barrier_t *barrier,
                                void * fctn(void*),
                                void * parms,
                                void **rsltparms))(void *);

/* non-blocking poll to see if peers are already at the barrier */
int wkf_thread_run_barrier_poll(wkf_run_barrier_t *barrier);


/* Task tile struct for stack, iterator, and scheduler routines;  */
/* 'start' is inclusive, 'end' is exclusive.  This yields the     */
/* half-open interval [start, end), which corresponds to a        */
/* typical 'for (i = start; i < end; i++)' loop.                  */
typedef struct wkf_tasktile_struct {
  int start;           /**< starting task ID (inclusive) */
  int end;             /**< ending task ID (exclusive) */
} wkf_tasktile_t;


/*
 * tile stack
 */
/* Sentinel value for an empty tile stack (presumably a return code  */
/* of the tile stack routines -- confirm in the implementation).     */
/* Parenthesized so the negative literal stays a single operand in   */
/* whatever expression the macro is expanded into.                   */
#define WKF_TILESTACK_EMPTY (-1)

/* Stack of task tiles together with the mutex that locks it.       */
/* Note that the struct tag is wkf_tasktile_stack_struct while the  */
/* typedef name is wkf_tilestack_t.                                 */
typedef struct wkf_tasktile_stack_struct {
  wkf_mutex_t mtx;   /**< Mutex lock for the structure */
  int growthrate;    /**< stack growth chunk size */
  int size;          /**< current allocated stack size */
  int top;           /**< index of top stack element */
  wkf_tasktile_t *s; /**< stack of task tiles */
} wkf_tilestack_t;

/* tile stack routines; every one takes the stack as its first argument */

/* initialize a tile stack with the given size */
int wkf_tilestack_init(wkf_tilestack_t *s, int size);

/* destroy a tile stack */
void wkf_tilestack_destroy(wkf_tilestack_t *s);

/* compact a tile stack */
int wkf_tilestack_compact(wkf_tilestack_t *s);

/* push a tile; the tile is only read (const) */
int wkf_tilestack_push(wkf_tilestack_t *s, const wkf_tasktile_t *t);

/* pop a tile; the tile argument is writable */
int wkf_tilestack_pop(wkf_tilestack_t *s, wkf_tasktile_t *t);

/* pop all tiles */
int wkf_tilestack_popall(wkf_tilestack_t *s);

/* query whether the stack is empty */
int wkf_tilestack_empty(wkf_tilestack_t *s);


/*
 * Shared iterators intended for trivial CPU/GPU load balancing with no
 * exception handling capability (all work units must complete with
 * no errors, or else the whole thing is canceled).
 */
/* work scheduling macros; the negative value is parenthesized so it */
/* stays a single operand wherever the macro is expanded             */
#define WKF_SCHED_DONE     (-1)
#define WKF_SCHED_CONTINUE  0

/* Shared iterator state: a mutex, the half-open range [start, end), */
/* the current position, and a fatal error flag.                     */
typedef struct wkf_shared_iterator_struct {
  wkf_mutex_t mtx;     /**< mutex lock */
  int start;           /**< starting value (inclusive) */
  int end;             /**< ending value (exclusive) */
  int current;         /**< current value */
  int fatalerror;      /**< cancel processing immediately for all threads */
} wkf_shared_iterator_t;

/* initialize a shared iterator */
int wkf_shared_iterator_init(wkf_shared_iterator_t *it);

/* destroy a shared iterator */
int wkf_shared_iterator_destroy(wkf_shared_iterator_t *it);

/* Set shared iterator state to half-open interval defined by tile */
int wkf_shared_iterator_set(wkf_shared_iterator_t *it, wkf_tasktile_t *tile);

/* Iterate the shared iterator with a requested tile size (reqsize). */
/* The tile received is returned through 'tile'; the return code is  */
/* -1 if no iterations are left or if a fatal error has occurred     */
/* during processing, canceling all worker threads.                  */
int wkf_shared_iterator_next_tile(wkf_shared_iterator_t *it, int reqsize, 
                                  wkf_tasktile_t *tile);

/* worker thread calls this to indicate a fatal error */
int wkf_shared_iterator_setfatalerror(wkf_shared_iterator_t *it);

/* master thread calls this to query for fatal errors */
int wkf_shared_iterator_getfatalerror(wkf_shared_iterator_t *it);


/*
 * Thread pool.
 */
/* shortcut macro to tell the create routine we only want CPU cores; */
/* it expands to NULL, so a header defining NULL (e.g. <stddef.h>)   */
/* must be included wherever the macro is used                       */
#define WKF_THREADPOOL_DEVLIST_CPUSONLY NULL

/* symbolic device ID denoting a CPU worker, used to test whether a  */
/* worker has a GPU or not; parenthesized so the negative literal    */
/* stays a single operand wherever the macro is expanded             */
#define WKF_THREADPOOL_DEVID_CPU (-1)

/** thread-specific handle data for workers; the int arrays at both */
/*  ends of the struct are padding only                             */
typedef struct wkf_threadpool_workerdata_struct {
  int padding1[8]; /* Leading pad (8 ints) to avoid false sharing and cache aliasing */
  wkf_shared_iterator_t *iter;           /**< dynamic work scheduler */
  wkf_tilestack_t *errorstack;           /**< stack of tiles that failed */
  int threadid;                          /**< worker thread's id */
  int threadcount;                       /**< total number of worker threads */
  int devid;                             /**< worker CPU/GPU device ID */
  float devspeed;                        /**< speed scaling for this device */
  void *parms;                           /**< function parameters for this worker */
  void *thrpool;                         /**< untyped pointer to the thread pool struct */
  int padding2[8]; /* Trailing pad (8 ints) to avoid false sharing and cache aliasing */
} wkf_threadpool_workerdata_t;

/* Thread pool state: the worker threads and their per-worker data,  */
/* plus the scheduler iterator, error stack, and run barrier that    */
/* are embedded by value in the pool.                                */
typedef struct wkf_threadpool_struct {
  int workercount;                         /**< number of worker threads */
  int *devlist;                            /**< per-worker CPU/GPU device IDs */
  wkf_shared_iterator_t iter;              /**< dynamic work scheduler */
  wkf_tilestack_t errorstack;              /**< stack of tiles that failed */
  wkf_thread_t *threads;                   /**< worker threads */
  wkf_threadpool_workerdata_t *workerdata; /**< per-worker data */
  wkf_run_barrier_t runbar;                /**< execution barrier */
} wkf_threadpool_t;

/* create a thread pool with a specified number of worker threads;   */
/* devlist supplies the CPU/GPU device IDs, or pass                  */
/* WKF_THREADPOOL_DEVLIST_CPUSONLY when only CPU cores are wanted    */
wkf_threadpool_t * wkf_threadpool_create(int workercount, int *devlist);

/* launch threads onto a new function, with associated parms;        */
/* 'blocking' is an int flag, presumably selecting whether the call  */
/* waits for the workers -- confirm against the implementation       */
int wkf_threadpool_launch(wkf_threadpool_t *thrpool, 
                          void *fctn(void *), void *parms, int blocking);

/* wait for all worker threads to complete their work */
int wkf_threadpool_wait(wkf_threadpool_t *thrpool);

/* join all worker threads and free resources */
int wkf_threadpool_destroy(wkf_threadpool_t *thrpool);

/*
 * Worker-side queries; each takes the worker's untyped handle (voiddata)
 * and hands results back through pointer arguments.
 */

/* get this worker's ID and its number of peers */
int wkf_threadpool_worker_getid(void *voiddata,
                                int *threadid, int *threadcount);

/* get this worker's CPU/GPU device ID */
int wkf_threadpool_worker_getdevid(void *voiddata, int *devid);

/* set the relative speed of this worker's device, as determined by  */
/* the SM/core count and clock rate                                  */
/* Note: this should only be called once, during the worker's        */
/* device initialization process                                     */
int wkf_threadpool_worker_setdevspeed(void *voiddata, float speed);

/* get the relative speed of this worker's device, as determined by  */
/* the SM/core count and clock rate                                  */
int wkf_threadpool_worker_getdevspeed(void *voiddata, float *speed);

/* scale the max tile size by this worker's speed, as determined by  */
/* the SM/core count and clock rate                                  */
int wkf_threadpool_worker_devscaletile(void *voiddata, int *tilesize);

/* get this worker's client data pointer */
int wkf_threadpool_worker_getdata(void *voiddata, void **clientdata);

/* Set dynamic scheduler state to half-open interval defined by tile */
int wkf_threadpool_sched_dynamic(wkf_threadpool_t *thrpool, wkf_tasktile_t *tile);

/* worker thread calls this to get its next work unit            */
/* iterate the shared iterator, returns -1 if no iterations left */
/* NOTE(review): the handle is an untyped void * although it is  */
/* named thrpool; confirm which pointer callers must pass        */
int wkf_threadpool_next_tile(void *thrpool, int reqsize, 
                             wkf_tasktile_t *tile);

/* worker thread calls this when it fails computing a tile after */
/* it has already taken it from the scheduler                    */
int wkf_threadpool_tile_failed(void *thrpool, wkf_tasktile_t *tile);

/* worker thread calls this to indicate that an unrecoverable error occurred */
int wkf_threadpool_setfatalerror(void *thrparms);

/* master thread calls this to query for fatal errors */
int wkf_threadpool_getfatalerror(void *thrparms);



/*
 * Routines to generate a pool of threads which then grind through
 * a dynamically load balanced work queue implemented as a shared iterator.
 * No exception handling is possible, just a simple all-or-nothing attempt.
 * Useful for simple calculations that take very little time.
 * An array of threads is generated, launched, and joined all with one call.
 */
/* Per-worker launch record; the int arrays at both ends are padding only */
typedef struct wkf_threadlaunch_struct {
  int padding1[8]; /* Leading pad (8 ints) to avoid false sharing and cache aliasing */
  wkf_shared_iterator_t *iter;  /**< dynamic scheduler iterator */
  int threadid;                 /**< ID of worker thread */
  int threadcount;              /**< number of workers */
  void * clientdata;            /**< worker parameters */
  int padding2[8]; /* Trailing pad (8 ints) to avoid false sharing and cache aliasing */
} wkf_threadlaunch_t;

/* launch up to numprocs threads using shared iterator as a load balancer; */
/* fctn is the function handed to the threads, clientdata the caller's     */
/* data pointer, and tile the range of work given to the iterator          */
/* (presumably as [start, end) -- confirm against the implementation)      */
int wkf_threadlaunch(int numprocs, void *clientdata, void * fctn(void *),
                     wkf_tasktile_t *tile);

/* worker thread can call this to get its ID and number of peers */
int wkf_threadlaunch_getid(void *thrparms, int *threadid, int *threadcount);

/* worker thread can call this to get its client data pointer */
int wkf_threadlaunch_getdata(void *thrparms, void **clientdata);

/* worker thread calls this to get its next work unit            */
/* iterate the shared iterator, returns -1 if no iterations left */
int wkf_threadlaunch_next_tile(void *voidparms, int reqsize, 
                               wkf_tasktile_t *tile);

/* worker thread calls this to indicate that an unrecoverable error occurred */
int wkf_threadlaunch_setfatalerror(void *thrparms);


#ifdef __cplusplus
}
#endif

#endif
