/**
***  Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
***  The Board of Trustees of the University of Illinois.
***  All rights reserved.
**/

/*
   Currently, WorkDistrib generates the layout of the Patches,
   directs the construction and distribution of Computes and
   associates Computes with Patches.
*/

#include <stdio.h>

#include "charm++.h"

#include "ProcessorPrivate.h"

#include "BOCgroup.h"
#include "WorkDistrib.decl.h"
#include "WorkDistrib.h"

#ifdef USE_COMM_LIB
#include "ComlibManager.h"
#endif

#include "main.decl.h"
#include "main.h"
#include "Node.h"
#include "PatchMgr.h"
#include "PatchMap.inl"
#include "NamdTypes.h"
#include "PDB.h"
#include "SimParameters.h"
#include "Molecule.h"
#include "NamdOneTools.h"
#include "Compute.h"
#include "ComputeMap.h"
#include "RecBisection.h"
#include "Random.h"
#include "varsizemsg.h"

//#define DEBUGM
#define MIN_DEBUG_LEVEL 2
#include "Debug.h"


// Message used to ship the ComputeMap's per-compute "new node" values.
// It is filled in by WorkDistrib::saveComputeMapChanges() and read by
// WorkDistrib::recvComputeMapChanges().
class ComputeMapChangeMsg : public CMessage_ComputeMapChangeMsg
{
public:

  int numNewNodes;  // number of entries the sender stored in newNodes
  int *newNodes;    // newNodes[i] holds ComputeMap::newNode(i) of the sender

//  VARSIZE_DECL(ComputeMapChangeMsg);
};

/*
VARSIZE_MSG(ComputeMapChangeMsg,
  VARSIZE_ARRAY(newNodes);
)
*/


//======================================================================
// Public functions
//----------------------------------------------------------------------
// Constructor: clears the map-arrival flags and records this group's ID
// in the per-processor BOC group table.
WorkDistrib::WorkDistrib()
{
  // No maps have been received or are being waited for yet.
  awaitingMaps = false;
  mapsArrived = false;

  CpvAccess(BOCclass_group).workDistrib = thisgroup;
}

//----------------------------------------------------------------------
// Destructor: there is nothing to release.
WorkDistrib::~WorkDistrib()
{
}


//----------------------------------------------------------------------
void WorkDistrib::saveComputeMapChanges(int ep, CkGroupID chareID)
{
  saveComputeMapReturnEP = ep;
  saveComputeMapReturnChareID = chareID;
  saveComputeMapCount = CkNumPes();

  ComputeMap *computeMap = ComputeMap::Object();

  int i;
  for (i=0; i<computeMap->numComputes(); i++) {
    DebugM(3, "ComputeMap (" << i << ") node = " << computeMap->node(i) << " newNode = " << computeMap->newNode(i) << "\n");
  }
  
  int nnn = computeMap->numComputes();
  ComputeMapChangeMsg *mapMsg 
    = new (nnn,0) ComputeMapChangeMsg ;

  mapMsg->numNewNodes = nnn;
  for(i=0; i<nnn; i++)
    mapMsg->newNodes[i] = computeMap->newNode(i);

  CProxy_WorkDistrib(thisgroup).recvComputeMapChanges(mapMsg);
}

// Applies the new-node values carried by msg to the local ComputeMap,
// frees the message, and acknowledges to branch 0 via doneSaveComputeMap().
void WorkDistrib::recvComputeMapChanges(ComputeMapChangeMsg *msg) {

  ComputeMap *cmap = ComputeMap::Object();
  int c;

  // Copy one entry per local compute out of the message.
  for (c = 0; c < cmap->numComputes(); ++c) {
    cmap->setNewNode(c, msg->newNodes[c]);
  }

  delete msg;

  // Tell branch 0 that this branch has stored the changes.
#if CHARM_VERSION > 050402
  CProxy_WorkDistrib ackProxy(thisgroup);
  ackProxy[0].doneSaveComputeMap();
#else
  CProxy_WorkDistrib(thisgroup).doneSaveComputeMap(0);
#endif

  // Debug dump of the map after the update.
  DebugM(2, "ComputeMap after send!\n");
  for (c = 0; c < cmap->numComputes(); ++c) {
    DebugM(2, "ComputeMap (" << c << ") node = " << cmap->node(c) << " newNode = " << cmap->newNode(c) << " type=" << cmap->type(c) << "\n");
  }
  DebugM(2, "===================================================\n");
}

// Counts one acknowledgement.  When the counter set up by
// saveComputeMapChanges() reaches zero, an empty message is sent to the
// saved return entry point on branch 0 of the saved group.
void WorkDistrib::doneSaveComputeMap() {
  --saveComputeMapCount;
  if ( saveComputeMapCount ) {
    return;  // still waiting for more acknowledgements
  }
  CkSendMsgBranch(saveComputeMapReturnEP, CkAllocMsg(0,0,0), 0, saveComputeMapReturnChareID);
}


//----------------------------------------------------------------------
// This should only be called on node 0.
//----------------------------------------------------------------------
// Builds one FullAtomList per patch from the PDB positions and the
// initial velocities.
//
// Velocities are read from the "velocities" (PDB) or "binvelocities"
// (binary) config entry when params->initialTemp is negative, and are
// generated randomly for params->initialTemp otherwise.  Center-of-mass
// motion is removed unless params->comMove is set.
//
// Returns a new[]-allocated array of patchMap->numPatches() lists; the
// caller is responsible for delete[]-ing it.
FullAtomList *WorkDistrib::createAtomLists(void)
{
  int i;
  StringList *current;	//  Pointer used to retrieve configuration items
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  PatchMap *patchMap = PatchMap::Object();
  SimParameters *params = node->simParameters;
  Molecule *molecule = node->molecule;
  PDB *pdb = node->pdb;

  int numPatches = patchMap->numPatches();
  int numAtoms = pdb->num_atoms();

  Vector *positions = new Position[numAtoms];
  pdb->get_all_positions(positions);

  Vector *velocities = new Velocity[numAtoms];

  if ( params->initialTemp < 0.0 ) {
    Bool binvels=FALSE;

    //  Reading the velocities from a PDB
    current = node->configList->find("velocities");

    if (current == NULL) {
      //  No PDB velocities given, fall back to the binary file entry
      current = node->configList->find("binvelocities");
      binvels = TRUE;
    }

    if (current == NULL) {
      //  Neither entry exists; do not dereference a NULL list item.
      NAMD_die("No velocities or binvelocities given in WorkDistrib::createAtomLists\n");
    }
    else if (!binvels) {
      velocities_from_PDB(current->data, velocities, numAtoms);
    }
    else {
      velocities_from_binfile(current->data, velocities, numAtoms);
    }
  }
  else {
    // Random velocities for a given temperature
    random_velocities(params->initialTemp, molecule, velocities, numAtoms);
  }

  //  If COMMotion == no, remove center of mass motion
  if (!(params->comMove)) {
    remove_com_motion(velocities, molecule, numAtoms);
  }

  FullAtomList *atoms = new FullAtomList[numPatches];

  const Lattice lattice = params->lattice;

  if (params->splitPatch == SPLIT_PATCH_HYDROGEN)
    {
    // split atoms into patches based on hydrogen group and position
    int aid, pid=0;
    for(i=0; i < numAtoms; i++)
      {
      if ( ! ( i % 1000 ) )
        {
        DebugM(3,"Assigned " << i << " atoms to patches so far.\n");
        }
      // Assign atoms to patches without splitting hydrogen groups.
      // We know that the hydrogenGroup array is sorted with group parents
      // listed first.  Thus, only change the pid if an atom is a group parent.
      aid = molecule->hydrogenGroup[i].atomID;
      if (molecule->hydrogenGroup[i].isGP)
        pid = patchMap->assignToPatch(positions[aid],lattice);
      // else: don't change pid
      FullAtom a;
      a.id = aid;
      a.position = positions[aid];
      a.velocity = velocities[aid];
      atoms[pid].add(a);
      }
    }
  else
    {
    // split atoms into patches based on position
    for(i=0; i < numAtoms; i++)
      {
      if ( ! ( i % 1000 ) )
        {
        DebugM(3,"Assigned " << i << " atoms to patches so far.\n");
        }
      int pid = patchMap->assignToPatch(positions[i],lattice);
      FullAtom a;
      a.id = i;
      a.position = positions[i];
      a.velocity = velocities[i];
      atoms[pid].add(a);
      }
    }

  delete [] positions;
  delete [] velocities;

  // These flags do not depend on the patch, so read them once.
//Modifications for alchemical fep
//SD & CC, CNRS - LCTN, Nancy
  Bool fepOn = params->fepOn;
//fepe
  Bool lesOn = params->lesOn;
  Bool pairInteractionOn = params->pairInteractionOn;

  for(i=0; i < numPatches; i++)
  {
    // Center of the patch in scaled coordinates.
    ScaledPosition center(0.5*(patchMap->min_a(i)+patchMap->max_a(i)),
                          0.5*(patchMap->min_b(i)+patchMap->max_b(i)),
                          0.5*(patchMap->min_c(i)+patchMap->max_c(i)));

    int n = atoms[i].size();
    FullAtom *a = atoms[i].begin();
    int j;

    // Transform chosen for the most recent atom with a nonzero
    // hydrogenGroupSize; reused for the atoms that follow it.
    Transform mother_transform;
    for(j=0; j < n; j++)
    {
      int aid = a[j].id;

      if (params->splitPatch == SPLIT_PATCH_HYDROGEN) {
        if ( molecule->is_hydrogenGroupParent(aid) ) {
          a[j].hydrogenGroupSize = molecule->get_groupSize(aid);
        } else {
          a[j].hydrogenGroupSize = 0;
        }
      } else {
        // Without hydrogen-group splitting every atom is its own group.
        a[j].hydrogenGroupSize = 1;
      }

      a[j].nonbondedGroupIsAtom = 0;

      a[j].atomFixed = molecule->is_atom_fixed(aid) ? 1 : 0;
      a[j].fixedPosition = a[j].position;

      if ( a[j].hydrogenGroupSize ) {
        // Group parent: pick the image nearest to the patch center and
        // remember the transform that was applied.
        a[j].position = lattice.nearest(
                a[j].position, center, &(a[j].transform));
        mother_transform = a[j].transform;
      } else {
        // Group member: follow the parent's transform.
        a[j].position = lattice.apply_transform(a[j].position,mother_transform);
        a[j].transform = mother_transform;
      }

      a[j].mass = molecule->atommass(aid);
      a[j].charge = molecule->atomcharge(aid);

//Modifications for alchemical fep
//SD & CC, CNRS - LCTN, Nancy
      if ( fepOn || lesOn || pairInteractionOn ) {
        a[j].partition = molecule->get_fep_type(aid);
      }
      else {
        a[j].partition = 0;
      }
//fepe

    }

    // Walk the list one hydrogen group at a time and mark a group as
    // fixed only when every atom in it is fixed.
    int size, allfixed, k;
    for(j=0; j < n; j+=size) {
      size = a[j].hydrogenGroupSize;
      if ( ! size ) {
        NAMD_bug("Mother atom with hydrogenGroupSize of 0!");
      }
      allfixed = 1;
      for ( k = 0; k < size; ++k ) {
        allfixed = ( allfixed && (a[j+k].atomFixed) );
      }
      for ( k = 0; k < size; ++k ) {
        a[j+k].groupFixed = allfixed ? 1 : 0;
      }
    }

  }

  return atoms;

}


//----------------------------------------------------------------------
// This should only be called on node 0.
//----------------------------------------------------------------------
// Builds the per-patch atom lists, reports the most populated patch, and
// asks the local PatchMgr to create a home patch for every patch ID.
void WorkDistrib::createHomePatches(void)
{
  PatchMap *patchMap = PatchMap::Object();
  CProxy_PatchMgr pm(CpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();

  const int numPatches = patchMap->numPatches();

  FullAtomList *atoms = createAtomLists();

  // Find the first patch holding the largest number of atoms.
  int maxAtoms = -1;
  int maxPatch = -1;
  int p;
  for (p = 0; p < numPatches; ++p) {
    const int count = atoms[p].size();
    if ( maxAtoms < count ) {
      maxAtoms = count;
      maxPatch = p;
    }
  }
  iout << iINFO << "LARGEST PATCH (" << maxPatch <<
        ") HAS " << maxAtoms << " ATOMS\n" << endi;

  for (p = 0; p < numPatches; ++p) {
    if ( (p % 100) == 0 )
    {
      DebugM(3,"Created " << p << " patches so far.\n");
    }
    patchMgr->createHomePatch(p, atoms[p]);
  }

  // The lists came from createAtomLists() as a new[] array.
  delete [] atoms;
}

// Queues a move for every patch whose PatchMap node differs from this
// node's ID, then asks the PatchMgr to send the queued moves.
void WorkDistrib::distributeHomePatches() {
  // Local branches of the Node and PatchMgr groups.
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  CProxy_PatchMgr pm(CpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();
  // Singleton patch map.
  PatchMap *patchMap = PatchMap::Object();

  for (int pid = 0; pid < patchMap->numPatches(); ++pid)
  {
    if ( patchMap->node(pid) == node->myid() ) {
      continue;  // already on the right node
    }
    DebugM(3,"patchMgr->movePatch("
        << pid << "," << patchMap->node(pid) << ")\n");
    patchMgr->movePatch(pid, patchMap->node(pid));
  }
  patchMgr->sendMovePatches();
}

// Rebuilds the per-patch atom lists and hands each one to the local
// PatchMgr through sendAtoms().
void WorkDistrib::reinitAtoms() {

  CProxy_PatchMgr pm(CpvAccess(BOCclass_group).patchMgr);
  PatchMgr *patchMgr = pm.ckLocalBranch();
  PatchMap *patchMap = PatchMap::Object();

  const int numPatches = patchMap->numPatches();

  FullAtomList *atoms = createAtomLists();

  int pid = 0;
  while ( pid < numPatches ) {
    patchMgr->sendAtoms(pid, atoms[pid]);
    ++pid;
  }

  // createAtomLists() returns a new[] array owned by the caller.
  delete [] atoms;

}


//----------------------------------------------------------------------

// Message that carries the packed PatchMap and ComputeMap.  It is filled
// in by WorkDistrib::sendMaps() and consumed by WorkDistrib::saveMaps().
class MapDistribMsg: public CMessage_MapDistribMsg {
  public:
    char *patchMapData;    // buffer written by PatchMap::pack()
    char *computeMapData;  // buffer written by ComputeMap::pack()

//  VARSIZE_DECL(MapDistribMsg);
};

/*
VARSIZE_MSG(MapDistribMsg,
  VARSIZE_ARRAY(patchMapData);
  VARSIZE_ARRAY(computeMapData);
)
*/

void WorkDistrib::sendMaps(void)
{
  if ( CkNumPes() == 1 ) {
    mapsArrived = true;
    return;
  }

  int sizes[2];
  sizes[0] = PatchMap::Object()->packSize();
  sizes[1] = ComputeMap::Object()->packSize();

  MapDistribMsg *mapMsg = new (sizes[0], sizes[1], 0) MapDistribMsg;

  PatchMap::Object()->pack(mapMsg->patchMapData);
  ComputeMap::Object()->pack(mapMsg->computeMapData);

#if CHARM_VERSION > 050402
  CProxy_WorkDistrib workProxy(thisgroup);
  workProxy[0].saveMaps(mapMsg);
#else
  CProxy_WorkDistrib(thisgroup).saveMaps(mapMsg,0);
#endif
}

// saveMaps() is called when the map message is received
// Receives the packed maps.  Each branch sees the message twice: the first
// time it only forwards it, the second time (the copy it sent to itself)
// it unpacks and frees it.
void WorkDistrib::saveMaps(MapDistribMsg *msg)
{
  // Forwarding happens before processing; otherwise the map distribution
  // is slow on many CPUs.  A tree is used rather than a broadcast because
  // some implementations of broadcast generate a copy of the message on
  // the sender for each recipient (MPI doesn't allow re-use of an
  // outstanding buffer).

  if ( mapsArrived ) {
    // Second delivery.  PE 0 does not unpack.
    if ( CkMyPe() ) {
      PatchMap::Object()->unpack(msg->patchMapData);
      ComputeMap::Object()->unpack(msg->computeMapData);
    }
    delete msg;
    return;
  }

  // First delivery: mark arrival and forward.
  mapsArrived = true;

  // Destinations: PEs 2*me+1 and 2*me+2 when they exist, plus this PE.
  int pids[3];
  int npid = 0;
  const int firstChild = 2 * CkMyPe() + 1;
  for ( int child = firstChild; child < firstChild + 2; ++child ) {
    if ( child < CkNumPes() ) {
      pids[npid] = child;
      ++npid;
    }
  }
  pids[npid] = CkMyPe();  // always send the message to ourselves
  ++npid;

  CProxy_WorkDistrib(thisgroup).saveMaps(msg,npid,pids);
}


//----------------------------------------------------------------------
void WorkDistrib::patchMapInit(void)
{
  PatchMap *patchMap = PatchMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *params = node->simParameters;
  Lattice lattice = params->lattice;

  BigReal patchSize = params->patchDimension;

  ScaledPosition xmin, xmax;
  ScaledPosition sysDim, sysMin;

  DebugM(3,"Mapping patches\n");
  // Need to use full box for FMA to match NAMD 1.X results.
  if ( params->FMAOn ) {
    node->pdb->find_extremes(&(xmin.x),&(xmax.x),lattice.a_r());
    node->pdb->find_extremes(&(xmin.y),&(xmax.y),lattice.b_r());
    node->pdb->find_extremes(&(xmin.z),&(xmax.z),lattice.c_r());
  // Otherwise, this allows a small number of stray atoms.
  } else {
    node->pdb->find_extremes(&(xmin.x),&(xmax.x),lattice.a_r(),0.9);
    node->pdb->find_extremes(&(xmin.y),&(xmax.y),lattice.b_r(),0.9);
    node->pdb->find_extremes(&(xmin.z),&(xmax.z),lattice.c_r(),0.9);
  }

  BigReal origin_shift;
  origin_shift = lattice.a_r() * lattice.origin();
  xmin.x -= origin_shift;
  xmax.x -= origin_shift;
  origin_shift = lattice.b_r() * lattice.origin();
  xmin.y -= origin_shift;
  xmax.y -= origin_shift;
  origin_shift = lattice.c_r() * lattice.origin();
  xmin.z -= origin_shift;
  xmax.z -= origin_shift;

  patchMap->initialize(xmin,xmax,lattice,patchSize,
				params->twoAwayX ? 2 : 1,
				params->twoAwayY ? 2 : 1,
				params->twoAwayZ ? 2 : 1);

}

//----------------------------------------------------------------------
// Chooses a patch-to-node assignment strategy from the node and patch
// counts, runs it, and then checks that the atoms held by the existing
// patches add up to the molecule's atom count.
void WorkDistrib::assignNodeToPatch()
{
  int method=1;  // 1 selects recursive bisection when nodes < patches

  PatchMap *patchMap = PatchMap::Object();
  int nNodes = Node::Object()->numNodes();

  if ( nNodes > patchMap->numPatches() ) {
    assignPatchesBitReversal();
  } else if ( nNodes == patchMap->numPatches() ) {
    assignPatchesRoundRobin();
  } else if ( method == 1 ) {
    assignPatchesRecursiveBisection();
  } else {
    assignPatchesToLowestLoadNode();
  }

  // Per-node atom tally (only used by disabled diagnostics) and total.
  int *atomsOnNode = new int[nNodes];
  int n;
  for (n = 0; n < nNodes; ++n) {
    atomsOnNode[n] = 0;
  }

  int totalAtoms = 0;
  for (int pid = 0; pid < patchMap->numPatches(); ++pid)
  {
    if ( ! patchMap->patch(pid) ) {
      continue;  // no patch object for this ID here
    }
    totalAtoms += patchMap->patch(pid)->getNumAtoms();
    atomsOnNode[patchMap->node(pid)] += patchMap->patch(pid)->getNumAtoms();
  }

  if ( totalAtoms != Node::Object()->molecule->numAtoms ) {
    NAMD_die("Incorrect atom count in WorkDistrib::assignNodeToPatch\n");
  }

  delete [] atomsOnNode;

  //  PatchMap::Object()->printPatchMap();
}

//----------------------------------------------------------------------
// void WorkDistrib::assignPatchesSlices() 
// {
//   int pid; 
//   int assignedNode = 0;
//   PatchMap *patchMap = PatchMap::Object();
//   Node *node = CLocalBranch(Node, CpvAccess(BOCclass_group).node);

//   int *numAtoms = new int[node->numNodes()];
//   for (int i=0; i<node->numNodes(); i++) {
//     numAtoms[i] = 0;
//   }

//   // Assign patch to node with least atoms assigned.
//   for(pid=0; pid < patchMap->numPatches(); pid++) {
//     assignedNode = 0;
//     for (i=1; i < node->numNodes(); i++) {
//       if (numAtoms[i] < numAtoms[assignedNode]) assignedNode = i;
//     }
//     patchMap->assignNode(pid, assignedNode);
//     numAtoms[assignedNode] += patchMap->patch(pid)->getNumAtoms();

//     /*
//     iout << iINFO << "Patch (" << pid << ") has " 
//       << patchMap->patch(pid)->getNumAtoms() 
//       << " atoms:  Assigned to Node(" << assignedNode << ")\n" 
//       << endi;
//     */
//   }

//   delete[] numAtoms;
// }

//----------------------------------------------------------------------
// Greedy assignment: each patch, in ID order, goes to the node with the
// smallest accumulated load (lowest node index wins ties).  A patch adds
// its atom count plus one to the chosen node's load.  The tentative
// assignment is then finalized by sortNodesAndAssign().
void WorkDistrib::assignPatchesToLowestLoadNode() 
{
  PatchMap *patchMap = PatchMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  int *load = new int[node->numNodes()];
  int *assignedNodes = new int[patchMap->numPatches()];

  int n;
  for (n = 0; n < node->numNodes(); ++n) {
    load[n] = 0;
  }

  for (int pid = 0; pid < patchMap->numPatches(); ++pid) {
    // Linear scan for the least loaded node.
    int best = 0;
    for (n = 1; n < node->numNodes(); ++n) {
      if ( load[n] < load[best] ) {
        best = n;
      }
    }
    assignedNodes[pid] = best;
    load[best] += patchMap->patch(pid)->getNumAtoms() + 1;
  }

  delete[] load;
  sortNodesAndAssign(assignedNodes);
  delete[] assignedNodes;
}

//----------------------------------------------------------------------
// Used when there are more nodes than patches.  Generates node numbers in
// bit-reversed order (skipping node 0), keeps the first numPatches of
// them, sorts those, and passes them to sortNodesAndAssign().
void WorkDistrib::assignPatchesBitReversal() 
{
  PatchMap *patchMap = PatchMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  const int ncpus = node->numNodes();
  const int npatches = patchMap->numPatches();
  if ( ncpus <= npatches )
    NAMD_bug("WorkDistrib::assignPatchesBitReversal called improperly");

  // Smallest power of two that is >= ncpus, and its bit count.
  int npow2 = 1;
  int nbits = 0;
  while ( npow2 < ncpus ) {
    npow2 *= 2;
    nbits += 1;
  }

  // Build the bit reversal sequence.
  SortableResizeArray<int> seq(ncpus);
  // avoid using node 0 (reverse of 0 is 0 so start at 1)
  int counter = 1;
  for ( int slot = 0; slot < (ncpus-1); ++slot ) {
    int reversed;
    // Advance the counter until its bit reversal is a valid node number.
    do {
      reversed = 0;
      int lowBit = 1;
      int highBit = npow2 / 2;
      for ( int b = 0; b < nbits; ++b ) {
        reversed += highBit * ( ( counter / lowBit ) % 2 );
        lowBit *= 2;
        highBit /= 2;
      }
      ++counter;
    } while ( reversed >= ncpus );
    seq[slot] = reversed;
  }

  // Keep one entry per patch and sort them.
  seq.resize(npatches);
  seq.sort();

  sortNodesAndAssign(seq.begin());
}

//----------------------------------------------------------------------
// Per-node accumulator of patch grid indices, compared by the average
// index along a, then b, then c (averages are computed in float as
// total / npatches).
struct nodesort {
  int node;      // node number; -1 until set
  int a_total;   // sum of a indices of the node's patches
  int b_total;   // sum of b indices of the node's patches
  int c_total;   // sum of c indices of the node's patches
  int npatches;  // number of patches accumulated
  nodesort() : node(-1),a_total(0),b_total(0),c_total(0),npatches(0) { ; }

  // Nonzero when all three averages are equal.
  int operator==(const nodesort &o) const {
    const float mine = (float)npatches;
    const float theirs = (float)o.npatches;
    if ( ((float)a_total)/mine != ((float)o.a_total)/theirs ) return 0;
    if ( ((float)b_total)/mine != ((float)o.b_total)/theirs ) return 0;
    if ( ((float)c_total)/mine != ((float)o.c_total)/theirs ) return 0;
    return 1;
  }

  // Nonzero when this node's averages come first lexicographically.
  int operator<(const nodesort &o) const {
    const float mine = (float)npatches;
    const float theirs = (float)o.npatches;

    const float a1 = ((float)a_total)/mine;
    const float a2 = ((float)o.a_total)/theirs;
    if ( a1 != a2 ) return ( a1 < a2 );

    const float b1 = ((float)b_total)/mine;
    const float b2 = ((float)o.b_total)/theirs;
    if ( b1 != b2 ) return ( b1 < b2 );

    const float c1 = ((float)c_total)/mine;
    const float c2 = ((float)o.c_total)/theirs;
    return ( c1 < c2 );
  }
};

void WorkDistrib::sortNodesAndAssign(int *assignedNode) {
  int i, pid; 
  PatchMap *patchMap = PatchMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  int nnodes = node->numNodes();
  int npatches = patchMap->numPatches();

  ResizeArray<nodesort> allnodes(nnodes);
  for ( i=0; i < nnodes; ++i ) {
    allnodes[i].node = i;
  }
  for ( pid=0; pid<npatches; ++pid ) {
    // iout << pid << " " << assignedNode[pid] << "\n" << endi;
    allnodes[assignedNode[pid]].npatches++;
    allnodes[assignedNode[pid]].a_total += patchMap->index_a(pid);
    allnodes[assignedNode[pid]].b_total += patchMap->index_b(pid);
    allnodes[assignedNode[pid]].c_total += patchMap->index_c(pid);
  }
  SortableResizeArray<nodesort> usednodes(nnodes);
  usednodes.resize(0);
  for ( i=0; i < nnodes; ++i ) {
    if ( allnodes[i].npatches ) usednodes.add(allnodes[i]);
  }
  usednodes.sort();
  int nused = usednodes.size();
  int i2 = nused/2;
  for ( i=0; i < nnodes; ++i ) {
    if ( allnodes[i].npatches ) allnodes[usednodes[i2++].node].node = i;
    if ( i2 == nused ) i2 = 0;
  }

  for ( pid=0; pid<npatches; ++pid ) {
    // iout << pid << " " <<  allnodes[assignedNode[pid]].node << "\n" << endi;
    patchMap->assignNode(pid, allnodes[assignedNode[pid]].node);
  }
}

//----------------------------------------------------------------------
// Tentatively assigns patch p to node (p mod numNodes) and finalizes the
// result through sortNodesAndAssign().
void WorkDistrib::assignPatchesRoundRobin() 
{
  PatchMap *patchMap = PatchMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  int *assignedNode = new int[patchMap->numPatches()];

  int pid = 0;
  while ( pid < patchMap->numPatches() ) {
    assignedNode[pid] = pid % node->numNodes();
    ++pid;
  }

  sortNodesAndAssign(assignedNode);
  delete [] assignedNode;
}

//----------------------------------------------------------------------
void WorkDistrib::assignPatchesRecursiveBisection() 
{
  PatchMap *patchMap = PatchMap::Object();
  int *assignedNode = new int[patchMap->numPatches()];
  int numNodes = Node::Object()->numNodes();
  int usedNodes = numNodes;
  if ( numNodes > 64 ) usedNodes -= 1;
  RecBisection recBisec(usedNodes,PatchMap::Object());
  if ( recBisec.partition(assignedNode) ) {
    if ( usedNodes != numNodes ) {
      for ( int i=0; i<patchMap->numPatches(); ++i ) {
        assignedNode[i] += 1;
      }
    }
    sortNodesAndAssign(assignedNode);
  } else {
    iout << iWARN 
	 << "WorkDistrib: Recursive bisection fails,"
	 << "invoking least-load algorithm\n";
    assignPatchesToLowestLoadNode();
  }
  delete [] assignedNode;
}

//----------------------------------------------------------------------
void WorkDistrib::mapComputes(void)
{
  PatchMap *patchMap = PatchMap::Object();
  ComputeMap *computeMap = ComputeMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  DebugM(3,"Mapping computes\n");

  // We need to allocate computes for self, 1 and 2 away pairs for
  // electrostatics, and 1 angleForce for each node.  Then I might
  // throw in a few extras, in case I forget some.

#define MAX_SELF_PARTITIONS 100
#define MAX_PAIR_PARTITIONS 10

/*
  int numPotentialCids =
	patchMap->numPatches() *
		(13 * MAX_PAIR_PARTITIONS + MAX_SELF_PARTITIONS + 10) +
	node->numNodes() * 20;
*/
  int numPotentialCids =
	patchMap->numPatches() *
		(13 * node->simParameters->maxPairPart + 
		node->simParameters->maxSelfPart + 10) +
	node->numNodes() * 20;
  // iout << iINFO << "numPotentialCids: " << numPotentialCids << "\n" << endi;
  computeMap->allocateCids(numPotentialCids);

  // Handle full electrostatics
  if ( node->simParameters->fullDirectOn )
    mapComputeHomePatches(computeFullDirectType);
  if ( node->simParameters->FMAOn )
#ifdef DPMTA
    mapComputeHomePatches(computeDPMTAType);
#else
    NAMD_die("This binary does not include DPMTA (FMA).");
#endif
  if ( node->simParameters->PMEOn ) {
#ifdef DPME
    if ( node->simParameters->useDPME )
      mapComputeHomePatches(computeDPMEType);
    else {
      mapComputeHomePatches(computePmeType);
    }
#else
    mapComputeHomePatches(computePmeType);
#endif
  }

  if ( node->simParameters->globalForcesOn ) {
    DebugM(2,"adding ComputeGlobal\n");
    mapComputeHomePatches(computeGlobalType);
  }

  if ( node->simParameters->extForcesOn )
    mapComputeHomePatches(computeExtType);

  mapComputeNonbonded();

  // If we're doing true pair interactions, no need for bonded terms.
  // But if we're doing within-group interactions, we do need them.
  if ( !node->simParameters->pairInteractionOn || 
      node->simParameters->pairInteractionSelf) { 
    mapComputeHomePatches(computeBondsType);
    mapComputeHomePatches(computeAnglesType);
    mapComputeHomePatches(computeDihedralsType);
    mapComputeHomePatches(computeImpropersType);
    mapComputePatch(computeSelfBondsType);
    mapComputePatch(computeSelfAnglesType);
    mapComputePatch(computeSelfDihedralsType);
    mapComputePatch(computeSelfImpropersType);
  }


  if ( node->simParameters->eFieldOn )
    mapComputePatch(computeEFieldType);
  if ( node->simParameters->stirOn )
    mapComputePatch(computeStirType);
  if ( node->simParameters->sphericalBCOn )
    mapComputePatch(computeSphericalBCType);
  if ( node->simParameters->cylindricalBCOn )
    mapComputePatch(computeCylindricalBCType);
  if ( node->simParameters->constraintsOn )
    mapComputePatch(computeRestraintsType);
  if ( node->simParameters->consForceOn )
    mapComputePatch(computeConsForceType);
  if ( node->simParameters->consTorqueOn )
    mapComputePatch(computeConsTorqueType);
}

//----------------------------------------------------------------------
// Stores one compute of the given type on every node that owns at least
// one patch, then links every patch with the compute of its own node.
void WorkDistrib::mapComputeHomePatches(ComputeType type)
{
  PatchMap *patchMap = PatchMap::Object();
  ComputeMap *computeMap = ComputeMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  const int numNodes = node->numNodes();
  const int numPatches = patchMap->numPatches();

  // cidOfNode[n] is only set for nodes that have patches; those are the
  // only entries looked up below.
  ComputeID *cidOfNode = new ComputeID[numNodes];
  for (int n = 0; n < numNodes; ++n) {
    if ( ! patchMap->numPatchesOnNode(n) ) {
      continue;
    }
    cidOfNode[n] = computeMap->storeCompute(n,numPatches,type);
  }

  for (PatchID pid = 0; pid < numPatches; pid++)
  {
    patchMap->newCid(pid, cidOfNode[patchMap->node(pid)]);
    computeMap->newPid(cidOfNode[patchMap->node(pid)], pid);
  }

  delete [] cidOfNode;
}

//----------------------------------------------------------------------
// Stores one single-patch compute of the given type per patch, placed on
// the patch's node, and links the compute and the patch to each other.
void WorkDistrib::mapComputePatch(ComputeType type)
{
  PatchMap *patchMap = PatchMap::Object();
  ComputeMap *computeMap = ComputeMap::Object();

  for (PatchID pid = 0; pid < patchMap->numPatches(); pid++)
  {
    const ComputeID newCompute =
        computeMap->storeCompute(patchMap->node(pid),1,type);
    computeMap->newPid(newCompute,pid);
    patchMap->newCid(pid,newCompute);
  }

}


//----------------------------------------------------------------------
void WorkDistrib::mapComputeNonbonded(void)
{
  // For each patch, create 1 electrostatic object for self-interaction.
  // Then create 1 for each 1-away and 2-away neighbor which has a larger
  // pid.

  PatchMap *patchMap = PatchMap::Object();
  ComputeMap *computeMap = ComputeMap::Object();
  CProxy_Node nd(CpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  PatchID oneAway[PatchMap::MaxOneOrTwoAway];
  PatchID oneAwayTrans[PatchMap::MaxOneOrTwoAway];

  PatchID i;
  ComputeID cid;
  int numNeighbors;
  int j;

  for(i=0; i<patchMap->numPatches(); i++) // do the self 
  {
    int64 numAtoms = patchMap->patch(i)->getNumAtoms();  // avoid overflow
    int numPartitions;
    if (node->simParameters->numAtomsSelf == 0) {
      numPartitions = 1 + (numAtoms > 50) + (numAtoms*numAtoms)/50000;
    }
    else {
      numPartitions = (int) (
        numAtoms*numAtoms / (double)(node->simParameters->numAtomsSelf*node->simParameters->numAtomsSelf) + 0.5 );
      if (numPartitions < 1) numPartitions = 1;
    }
    if ( numPartitions > node->simParameters->maxSelfPart )
			numPartitions = node->simParameters->maxSelfPart;
    // self-interaction
    DebugM(4,"Mapping " << numPartitions << " ComputeNonbondedSelf objects for patch " << i << "\n");
//    iout <<"Self numPartitions = " <<numPartitions <<" numAtoms " <<numAtoms <<endl;
    for(int partition=0; partition < numPartitions; partition++)
    {
      cid=computeMap->storeCompute(patchMap->node(i),1,
				   computeNonbondedSelfType,
				   partition,numPartitions);
      computeMap->newPid(cid,i);
      patchMap->newCid(i,cid);
    }
  }

  for(int p1=0; p1 <patchMap->numPatches(); p1++) // do the pairs
  {
    // this only returns half of neighbors, which is what we want
    numNeighbors=patchMap->oneOrTwoAwayNeighbors(p1,oneAway,oneAwayTrans);
    for(j=0;j<numNeighbors;j++)
    {
	int p2 = oneAway[j];
	int64 numAtoms1 = patchMap->patch(p1)->getNumAtoms();
	int64 numAtoms2 = patchMap->patch(p2)->getNumAtoms();
	const int distance =
 	  ( patchMap->index_a(p1) == patchMap->index_a(p2) ? 0 : 1 ) +
 	  ( patchMap->index_b(p1) == patchMap->index_b(p2) ? 0 : 1 ) +
 	  ( patchMap->index_c(p1) == patchMap->index_c(p2) ? 0 : 1 );
        int numPartitions;
	int divide = 0;
        if (distance <= 1) {
	  divide = node->simParameters->numAtomsPair;
	} else {
	  divide = node->simParameters->numAtomsPair2;
	}
	if (divide == 0) {
          numPartitions = 1 + (numAtoms1*numAtoms2 > 2500) + (numAtoms1*numAtoms2)/100000;
	}
	else {
          numPartitions = (int) (
		numAtoms1*numAtoms2/(double)(divide*divide) + 0.5 );
          if ( numPartitions < 1 ) numPartitions = 1;
	}
        if ( numPartitions > node->simParameters->maxPairPart )
			numPartitions = node->simParameters->maxPairPart;
//	if ( numPartitions > 1 ) iout << "Mapping " << numPartitions << " ComputeNonbondedPair objects for patches " << p1 << "(" << numAtoms1 << ") and " << p2 << "(" << numAtoms2 << ")\n" << endi;
	for(int partition=0; partition < numPartitions; partition++)
	{
	  cid=computeMap->storeCompute(
		patchMap->node(patchMap->downstream2(p1,p2)),
		2,computeNonbondedPairType,partition,numPartitions);
	  computeMap->newPid(cid,p1);
	  computeMap->newPid(cid,p2,oneAwayTrans[j]);
	  patchMap->newCid(p1,cid);
	  patchMap->newCid(p2,cid);
        }
    }
  }

}

//----------------------------------------------------------------------
// Schedules a compute's work on the local PE.  The compute's own
// LocalWorkMsg is given an integer priority and sent to the WorkDistrib
// entry method that matches the compute type.  computePmeType is the
// exception: its doWork() is called directly instead of being enqueued.
void WorkDistrib::messageEnqueueWork(Compute *compute) {
  LocalWorkMsg *msg = compute->localWorkMsg;
  CkSetQueueing(msg, CK_QUEUEING_IFIFO);
  int seq = compute->sequence();

  // Priority is 128 plus the compute's priority; for a non-negative
  // sequence number, (seq mod 256) * 256 is added as well.
  if ( seq < 0 ) {
    *((int*) CkPriorityPtr(msg)) = 128 + compute->priority();
  } else {
    *((int*) CkPriorityPtr(msg)) = 128 + (seq %256) * 256 + compute->priority();
  }

  msg->compute = compute; // pointer is valid since send is to local Pe
  int type = compute->type();

  // Every send below targets CkMyPe(); the two preprocessor branches are
  // the proxy call syntaxes for different Charm++ versions.
  CProxy_WorkDistrib wdProxy(CpvAccess(BOCclass_group).workDistrib);
  switch ( type ) {
  case computeBondsType:
  case computeSelfBondsType:
#if CHARM_VERSION > 050402
    wdProxy[CkMyPe()].enqueueBonds(msg);
#else
    wdProxy.enqueueBonds(msg,CkMyPe());
#endif
    break;
  case computeAnglesType:
  case computeSelfAnglesType:
#if CHARM_VERSION > 050402
    wdProxy[CkMyPe()].enqueueAngles(msg);
#else
    wdProxy.enqueueAngles(msg,CkMyPe());
#endif
    break;
  case computeDihedralsType:
  case computeSelfDihedralsType:
#if CHARM_VERSION > 050402
    wdProxy[CkMyPe()].enqueueDihedrals(msg);
#else
    wdProxy.enqueueDihedrals(msg,CkMyPe());
#endif
    break;
  case computeImpropersType:
  case computeSelfImpropersType:
#if CHARM_VERSION > 050402
    wdProxy[CkMyPe()].enqueueImpropers(msg);
#else
    wdProxy.enqueueImpropers(msg,CkMyPe());
#endif
    break;
  case computeNonbondedSelfType:
    // Alternate between two entry methods by sequence parity.
    // NOTE(review): seq may be negative here (see the priority code
    // above); a negative odd seq presumably yields -1 and falls into the
    // NAMD_bug default - confirm whether that is intended.
    switch ( seq % 2 ) {
    case 0:
#if CHARM_VERSION > 050402
      wdProxy[CkMyPe()].enqueueSelfA(msg);
#else
      wdProxy.enqueueSelfA(msg,CkMyPe());
#endif
      break;
    case 1:
#if CHARM_VERSION > 050402
      wdProxy[CkMyPe()].enqueueSelfB(msg);
#else
      wdProxy.enqueueSelfB(msg,CkMyPe());
#endif
      break;
    default:
      NAMD_bug("WorkDistrib::messageEnqueueSelf case statement error!");
    }
    break;
  case computeNonbondedPairType:
    // Same parity split for pair computes.
    // NOTE(review): with "seq % 2" the "case 2" branch below looks
    // unreachable - confirm whether "% 3" was intended or the case is
    // simply left over.
    switch ( seq % 2 ) {
    case 0:
#if CHARM_VERSION > 050402
      wdProxy[CkMyPe()].enqueueWorkA(msg);
#else 
      wdProxy.enqueueWorkA(msg,CkMyPe());
#endif
      break;
    case 1:
#if CHARM_VERSION > 050402
      wdProxy[CkMyPe()].enqueueWorkB(msg);
#else
      wdProxy.enqueueWorkB(msg,CkMyPe());
#endif
      break;
    case 2:
#if CHARM_VERSION > 050402
      wdProxy[CkMyPe()].enqueueWorkC(msg);
#else
      wdProxy.enqueueWorkC(msg,CkMyPe());
#endif
      break;
    default:
      NAMD_bug("WorkDistrib::messageEnqueueWork case statement error!");
    }
    break;
  case computePmeType:
    // The enqueuePme() path is compiled out; the work runs immediately.
#if 0
#if CHARM_VERSION > 050402
    wdProxy[CkMyPe()].enqueuePme(msg);
#else
    wdProxy.enqueuePme(msg,CkMyPe());
#endif
#else
    msg->compute->doWork();
#endif
    break;
  default:
    // Every other compute type goes through the generic entry method.
#if CHARM_VERSION > 050402
    wdProxy[CkMyPe()].enqueueWork(msg);
#else
    wdProxy.enqueueWork(msg,CkMyPe());
#endif
  }
}

// Default work entry: runs the compute carried by the message.
// Once doWork() returns, the compute is expected to still hold this
// very message as its localWorkMsg; otherwise NAMD_bug is raised.
void WorkDistrib::enqueueWork(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeBondsType and
// computeSelfBondsType.  Runs the compute, then checks that the
// compute's localWorkMsg is still this message.
void WorkDistrib::enqueueBonds(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeAnglesType and
// computeSelfAnglesType.  Runs the compute, then checks that the
// compute's localWorkMsg is still this message.
void WorkDistrib::enqueueAngles(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeDihedralsType and
// computeSelfDihedralsType.  Runs the compute, then checks that the
// compute's localWorkMsg is still this message.
void WorkDistrib::enqueueDihedrals(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeImpropersType and
// computeSelfImpropersType.  Runs the compute, then checks that the
// compute's localWorkMsg is still this message.
void WorkDistrib::enqueueImpropers(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry for PME computes.  Runs the compute, then checks that the
// compute's localWorkMsg is still this message.
// NOTE(review): the dispatcher's proxy call to this entry is compiled
// out (#if 0) in favor of a direct doWork() call -- confirm whether
// anything else still sends to it.
void WorkDistrib::enqueuePme(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeNonbondedSelfType
// when seq % 2 is 0.  Runs the compute, then checks that the compute's
// localWorkMsg is still this message.
void WorkDistrib::enqueueSelfA(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeNonbondedSelfType
// when seq % 2 is 1.  Runs the compute, then checks that the compute's
// localWorkMsg is still this message.
void WorkDistrib::enqueueSelfB(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeNonbondedPairType
// when seq % 2 is 0.  Runs the compute, then checks that the compute's
// localWorkMsg is still this message.
void WorkDistrib::enqueueWorkA(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Work entry selected by the dispatcher for computeNonbondedPairType
// when seq % 2 is 1.  Runs the compute, then checks that the compute's
// localWorkMsg is still this message.
void WorkDistrib::enqueueWorkB(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

// Third nonbonded-pair work entry.  Runs the compute, then checks that
// the compute's localWorkMsg is still this message.
// NOTE(review): the only dispatch to this entry visible in this file is
// "case 2" of a switch on ( seq % 2 ), which looks unreachable --
// confirm whether this entry is still needed.
void WorkDistrib::enqueueWorkC(LocalWorkMsg *msg) {
  msg->compute->doWork();
  const bool recycled = ( msg->compute->localWorkMsg == msg );
  if ( ! recycled ) {
    NAMD_bug("WorkDistrib LocalWorkMsg recycling failed!");
  }
}

//**********************************************************************
//
//			FUNCTION velocities_from_PDB
//
//   INPUTS:
//	filename - name of the PDB file to read in
//      v - Array of vectors to populate
//	totalAtoms - number of atoms the PDB file must contain
//
//	This function reads in a set of initial velocities from a
//      PDB file.  It places the velocities into the array of Vectors
//      passed to it, multiplying every component by PDBVELINVFACTOR.
//      NAMD_die is called if the atom count found in the file differs
//      from totalAtoms.
//
//***********************************************************************/

void WorkDistrib::velocities_from_PDB(char *filename, 
				      Vector *v, int totalAtoms)
{
  PDB *v_pdb;		//  PDB info from velocity PDB
  int i;

  //  Read the PDB
  v_pdb = new PDB(filename);
  if ( v_pdb == NULL )
  {
    //  Report the class this function actually belongs to
    NAMD_die("memory allocation failed in WorkDistrib::velocities_from_PDB");
  }

  //  Make sure the number of velocities read in matches
  //  the number of atoms we have
  if (v_pdb->num_atoms() != totalAtoms)
  {
    char err_msg[129];

    //  Bounded formatting so the message can never overrun err_msg
    snprintf(err_msg, sizeof(err_msg),
	     "FOUND %d COORDINATES IN VELOCITY PDB!!",
	     v_pdb->num_atoms());

    NAMD_die(err_msg);
  }

  //  The velocities are stored in the PDB coordinate fields, so
  //  fetch them through the position accessor and then rescale
  //  each component
  v_pdb->get_all_positions(v);

  for (i=0; i<totalAtoms; i++)
  {
    v[i].x *= PDBVELINVFACTOR;
    v[i].y *= PDBVELINVFACTOR;
    v[i].z *= PDBVELINVFACTOR;
  }

  delete v_pdb;
}
//		END OF FUNCTION velocities_from_PDB

//**********************************************************************
//
// 			FUNCTION velocities_from_binfile
//
//    INPUTS:
// 	fname - File name to read velocities from
//	vels - Array of velocity vectors to populate
//	n - Number of atoms in system
//					
//	This function reads in the velocities in binary format.  The
//     binary format is used to preserve accuracy between restarts
//     of namd.
//
//**********************************************************************

void WorkDistrib::velocities_from_binfile(char *fname, Vector *vels, int n)
{
  //  All of the work is delegated to read_binary_file; presumably it
  //  fills vels with n vectors from fname -- confirm against its
  //  definition.
  read_binary_file(fname,vels,n);
}
//               END OF FUNCTION velocities_from_binfile

//**********************************************************************
//
//			FUNCTION random_velocities
//
//   INPUTS:
//	Temp - Temperature to achieve
//	structure - Molecule supplying the mass of each atom
//	v - array of vectors to populate
//	totalAtoms - number of atoms (entries of v) to fill in
//
//	This function assigns a random velocity distribution to a
//   simulation to achieve a desired initial temperature.  The method
//   used here was stolen from the program X-PLOR.
//
//	Atoms whose mass is not positive are given zero velocity, since
//   sqrt(kbT/mass) is not finite for them.  They still consume the
//   same number of random draws as any other atom, so the velocities
//   generated for the remaining atoms do not depend on their presence.
//
//**********************************************************************

void WorkDistrib::random_velocities(BigReal Temp,Molecule *structure,
				    Vector *v, int totalAtoms)
{
  int i, j, axis;	//  Loop counters
  BigReal kbT;		//  Boltzman constant * Temp
  BigReal kbToverM;	//  sqrt(Kb*Temp/Mass)
  BigReal deviate[3];	//  Random deviates for x, y and z (-6.0 to 6.0)
  Random vel_random(Node::Object()->simParameters->randomSeed);

  kbT = Temp*BOLTZMAN;

  //  Loop through all the atoms and assign velocities in
  //  the x, y and z directions for each one
  for (i=0; i<totalAtoms; i++)
  {
    const BigReal mass = structure->atommass(i);

    //  Guard against massless (or otherwise non-positive mass)
    //  particles, which would otherwise produce inf/NaN velocities
    if ( mass > 0. )
    {
      kbToverM = sqrt(kbT/mass);
    }
    else
    {
      kbToverM = 0.;
    }

    //  The following comment was stolen from X-PLOR where
    //  the following section of code was adapted from.
    
    //  This section generates a Gaussian random
    //  deviate of 0.0 mean and standard deviation RFD for
    //  each of the three spatial dimensions.
    //  The algorithm is a "sum of uniform deviates algorithm"
    //  which may be found in Abramowitz and Stegun,
    //  "Handbook of Mathematical Functions", pg 952.
    //  Draw order is x, then y, then z, twelve draws each.
    for (axis=0; axis<3; axis++)
    {
      BigReal randnum = 0.0;

      for (j=0; j<12; j++)
      {
        randnum += vel_random.uniform();
      }

      randnum -= 6.0;

      deviate[axis] = randnum;
    }

    v[i].x = deviate[0]*kbToverM;
    v[i].y = deviate[1]*kbToverM;
    v[i].z = deviate[2]*kbToverM;
  }
}
/*			END OF FUNCTION random_velocities		*/

//**********************************************************************
//
//			FUNCTION remove_com_motion
//
//   INPUTS:
//	vel - Array of initial velocity vectors (modified in place)
//	structure - Molecule supplying the mass of each atom
//	n - Number of atoms (entries of vel)
//
//	This function removes the center of mass motion from a molecule
//   by subtracting the mass-weighted mean velocity from every atom.
//
//**********************************************************************

void WorkDistrib::remove_com_motion(Vector *vel, Molecule *structure, int n)
{
  Vector comVelocity(0,0,0);	//  Net momentum, later divided by mass
  BigReal massSum=0;		//  Total mass of system
  int atom;			//  Loop counter

  //  First pass: accumulate the net momentum and the total mass
  for (atom=0; atom<n; atom++)
  {
    BigReal m = structure->atommass(atom);
    massSum += m;
    comVelocity += m * vel[atom];
  }

  //  Momentum over mass gives the center of mass velocity
  comVelocity /= massSum;

  iout << iINFO << "REMOVING COM VELOCITY "
	<< ( PDBVELFACTOR * comVelocity ) << "\n" << endi;

  //  Second pass: shift every velocity by the center of mass velocity
  for (atom=0; atom<n; atom++)
  {
    vel[atom] -= comVelocity;
  }

}
/*			END OF FUNCTION remove_com_motion		*/

/* getNumComputeGlobals returns the total number of global compute
   objects that will be set up.  This is the smaller of the processor
   count reported by CkNumPes() and the number of patches in the
   PatchMap. */
int WorkDistrib::getNumComputeGlobals() {
  const int peCount = CkNumPes();
  const int patchCount = (PatchMap::Object())->numPatches();
  return ( peCount > patchCount ) ? patchCount : peCount;
}

#include "WorkDistrib.def.h"

