namd/doxygen/NamdHybridLB_8C_source.html

 /*****************************************************************************

  * $Source: /home/cvs/namd/cvsroot/namd2/src/NamdHybridLB.C,v $

  * $Author: jim $

  * $Date: 2017/03/30 20:06:17 $

  * $Revision: 1.40 $

  *****************************************************************************/


 #if !defined(WIN32) || defined(__CYGWIN__)

 #include <unistd.h>

 #endif

 #include <fcntl.h>


 #include "InfoStream.h"

 #include "NamdHybridLB.h"

 #include "Node.h"

 #include "PatchMap.h"

 #include "ComputeMap.h"

 #include "LdbCoordinator.h"


 // Message built by each level-1 group in GrpLevelStrategy() and sent to
 // PE 0, where splitComputes() combines the summaries of all groups.
 // cid[] and load[] are the variable-length parts; both are sized with the
 // same count when the message is allocated (new(i_split,i_split)).
 class SplitComputesMsg : public CMessage_SplitComputesMsg {

 public:

   double maxUnsplit;       // largest load among computes whose numPartitions() is 0

   double averageLoad;      // group total load divided by numPesAvailable

   double avgCompute;       // mean load of this group's moveable computes

   double maxCompute;       // largest single compute load in this group

   int maxComputeId;        // compute id of that largest compute, or -1 if none

   int nMoveableComputes;   // number of moveable computes returned by buildData()

   int numPesAvailable;     // number of PEs in the group flagged as available

   int n;                   // number of valid entries in cid[] and load[]

   int *cid;                // compute ids that may be partitioned

   float *load;             // measured load of cid[i]

 };


 #include "NamdHybridLB.def.h"


 // #define DUMP_LDBDATA 1

 // #define LOAD_LDBDATA 1


 // Defined outside this file; buildData() calls it with an absolute PE number.
 extern int isPmeProcessor(int);

 #ifdef MEM_OPT_VERSION

 // Only declared in memory-optimized builds; buildData() uses it to mark
 // output PEs as unavailable when ldbUnloadOutputPEs is set.
 extern int isOutputProcessor(int);

 #endif

 // Load array defined in NamdCentLB.C
 // Indexed by absolute PE number; allocated on PE 0 in CreateNamdHybridLB()
 // and filled in UpdateLocalLBInfo().

 extern double *cpuloads;


 /*
  * Creates the NamdHybridLB group using a freshly obtained load-balancer
  * ticket and, on PE 0 only, allocates and zeroes the global cpuloads array
  * if it does not exist yet.
  */
 void CreateNamdHybridLB() {
   const int ticket = LdbInfra::Object()->getLoadbalancerTicket();
   CProxy_NamdHybridLB::ckNew(CkLBOptions(ticket));

   // The per-PE load table (used with the proxy spanning tree) lives on
   // PE 0 and is created at most once.
   if (CkMyPe() != 0 || cpuloads != NULL) return;

   cpuloads = new double[CkNumPes()];
   CmiAssert(cpuloads != NULL);
   for (int pe = 0; pe < CkNumPes(); ++pe) {
     cpuloads[pe] = 0.0;
   }
 }


 /*
  * Constructor.  Replaces the tree built by HybridBaseLB with a two-level
  * tree when all PEs fit in one group (CkNumPes() <= hybridGroupSize) and a
  * three-level tree otherwise, creates the central and dummy balancers, and
  * resets every bookkeeping member.
  */
 NamdHybridLB::NamdHybridLB(const CkLBOptions& opt): HybridBaseLB(opt)
 {
   lbname = (char *)"NamdHybridLB";

   // Throw away the base-class tree before building our own.
   delete tree;
   const SimParameters* params = Node::Object()->simParameters;
   const bool singleGroup = (CkNumPes() <= params->hybridGroupSize);
   if (singleGroup) {
     // similar to centralized load balancing
     tree = new TwoLevelTree;
   } else {
     tree = new ThreeLevelTree(params->hybridGroupSize);
     initTree();
     // can only do shrink strategy on levels > 1
     statsStrategy = SHRINK_NULL;
   }

   // Proxy to this group, used for all entry-method calls below.
   thisProxy = CProxy_NamdHybridLB(thisgroup);

   // Helper balancers: central strategy and dummy strategy.
   centralLB = AllocateNamdCentLB();
   dummyLB = AllocateNamdDummyLB();

   // Work arrays start empty; GrpLevelStrategy() allocates them on demand.
   from_procs = NULL;
   computeArray = NULL;
   patchArray = NULL;
   processorArray = NULL;

   // Counters and flags for the PE-0 rendezvous logic.
   updateCount = 0;
   splitCount = 0;
   splitComputesMsgs = 0;
   updateFlag = false;
   collectFlag = false;
 }


 /*
  * Returns true exactly when the LdbCoordinator is currently taking load
  * balancing data; the step argument is not consulted.
  */
 bool NamdHybridLB::QueryBalanceNow(int _step){
   return LdbCoordinator::Object()->takingLdbData ? true : false;
 }


 /*
  * Always returns false.  The disabled branch below would request a dump
  * during load-balancing cycles 1 and 2.
  */
 bool NamdHybridLB::QueryDumpData() {
 #if 0
   const int cycle = LdbCoordinator::Object()->ldbCycleNum;
   if (cycle == 1 || cycle == 2) return true;
 #endif
   return false;
 }


 // Disabled code: placeholder vector strategy.  It only prints a message and
 // returns an LBVectorMigrateMsg with zero moves for the current level.
 #if 0


 LBVectorMigrateMsg* NamdHybridLB::VectorStrategy(LDStats* stats){

   CkPrintf("[%d] Using Vector Strategy to balance the load\n",CkMyPe());

   // allocate the message with both variable-size arguments set to 0
   LBVectorMigrateMsg* msg = new(0,0) LBVectorMigrateMsg;

   msg->n_moves = 0;

   msg->level = currentLevel;

   return msg;

 }

 #endif


 /*
  * Runs the load balancing strategy.
  *
  * At level 1 the group strategy is executed and its result (moves plus the
  * per-PE loads of this group) is forwarded to PE 0 through
  * UpdateLocalLBInfo().  At any other level the dummy balancer is run and a
  * migrate message is created from the stats.
  */
 CLBMigrateMsg* NamdHybridLB::Strategy(LDStats* stats)
 {
   if (currentLevel != 1) {
     dummyLB->work(stats);
     return createMigrateMsg(stats);
   }

   // level 1: real work is done by GrpLevelStrategy()
   LevelData *lData = levelData[currentLevel];
   CLBMigrateMsg *migrateMsg = GrpLevelStrategy(stats);

   // GrpLevelStrategy() has set startPE, endPE and peLoads for this group.
   const int nMoves = migrateMsg->n_moves;
   const int nGroupPes = endPE - startPE + 1;

   // copy moves and loads into the message destined for PE 0
   LocalLBInfoMsg *infoMsg = new(migrateMsg->n_moves, nGroupPes) LocalLBInfoMsg;
   infoMsg->n_moves = nMoves;
   infoMsg->startPE = startPE;
   infoMsg->endPE = endPE;
   for (int m = 0; m < nMoves; ++m) {
     infoMsg->moves[m] = migrateMsg->moves[m];
   }
   for (int p = 0; p < nGroupPes; ++p) {
     infoMsg->cpuloads[p] = peLoads[p];
   }

   // peLoads was allocated by GrpLevelStrategy() and is no longer needed
   delete [] peLoads;

   thisProxy[0].UpdateLocalLBInfo(infoMsg);
   return migrateMsg;
 }


 /*
  * Receives one group's balancing result (Strategy() sends it to PE 0).
  * Applies the group's moves to the ComputeMap, stores its PE loads in the
  * global cpuloads array, and counts the report.  Once every child has
  * reported and the collect information is also ready, the backed-up
  * CollectInfo call is issued.  The message is deleted here.
  */
 void NamdHybridLB::UpdateLocalLBInfo(LocalLBInfoMsg *msg){
   // number of children expected to report
   const int expectedReports = tree->numNodes(currentLevel);

   // record the new placement of every compute that has a destination
   ComputeMap *cmap = ComputeMap::Object();
   for (int m = 0; m < msg->n_moves; ++m) {
     if (msg->moves[m].to_pe == -1) continue;
     cmap->setNewNode(LdbIdField(msg->moves[m].obj.id, 0), msg->moves[m].to_pe);
   }

   // msg->cpuloads is relative to startPE, the global array is absolute
   const int firstPe = msg->startPE;
   const int lastPe = msg->endPE;
   for (int pe = firstPe; pe <= lastPe; ++pe) {
     cpuloads[pe] = msg->cpuloads[pe - firstPe];
   }

   // have all children sent their update?
   if (++updateCount == expectedReports) {
     updateCount = 0;
     updateFlag = true;
   }

   // fire the deferred CollectInfo once both conditions hold
   if (updateFlag && collectFlag) {
     updateFlag = false;
     collectFlag = false;
     thisProxy[parent_backup].CollectInfo(loc_backup, n_backup, fromlevel_backup);
   }

   delete msg;
 }


 /*
  * Collects one SplitComputesMsg per level-1 group (GrpLevelStrategy() sends
  * them to PE 0).  When the last one arrives, the group summaries are merged,
  * statistics are printed, and -- if any group listed computes to split --
  * a partition count is set in the ComputeMap for each listed compute.
  * All collected messages are deleted once processed.
  */
 void NamdHybridLB::splitComputes(SplitComputesMsg *msg) {
   const int children = tree->numNodes(1);

   // the slot array is created on first use and kept for later rounds
   if ( ! splitComputesMsgs ) {
     splitComputesMsgs = new SplitComputesMsg*[children];
   }

   splitComputesMsgs[splitCount++] = msg;
   if ( splitCount != children ) return;   // still waiting for other groups
   splitCount = 0;

   const SimParameters* simParams = Node::Object()->simParameters;
   ComputeMap *computeMap = ComputeMap::Object();

   // merge the per-group summaries; averages are weighted by their counts
   double maxUnsplit = 0.;
   double averageLoad = 0.;
   double avgCompute = 0.;
   double maxCompute = 0.;
   int maxComputeId = -1;
   int nMoveableComputes = 0;
   int numPesAvailable = 0;
   int nToSplit = 0;

   for ( int g = 0; g < children; ++g ) {
     const SplitComputesMsg *part = splitComputesMsgs[g];
     if ( part->maxUnsplit > maxUnsplit ) {
       maxUnsplit = part->maxUnsplit;
     }
     if ( part->maxCompute > maxCompute ) {
       maxCompute = part->maxCompute;
       maxComputeId = part->maxComputeId;
     }
     averageLoad += part->averageLoad * part->numPesAvailable;
     numPesAvailable += part->numPesAvailable;
     avgCompute += part->avgCompute * part->nMoveableComputes;
     nMoveableComputes += part->nMoveableComputes;
     nToSplit += part->n;
   }

   averageLoad /= numPesAvailable;
   if ( nMoveableComputes ) {
     avgCompute /= nMoveableComputes;
   } else {
     avgCompute = 0.;
   }

   CkPrintf("LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
            maxComputeId, maxCompute, 100. * maxCompute / averageLoad, averageLoad);
   CkPrintf("LDB: Average compute %f is %.1f%% of average load %f\n",
            avgCompute, 100. * avgCompute / averageLoad, averageLoad);

   // nothing to partition: just release the messages
   if ( nToSplit == 0 ) {
     for ( int g = 0; g < children; ++g ) {
       delete splitComputesMsgs[g];
     }
     return;
   }

   // partitions are stored as char but mostly limited by
   // high load noise at low outer-loop iteration counts
   int maxParts = 10;
 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
   // split LCPO compute very small, else CUDA compute is delayed
   if (simParams->LCPOOn) {
     maxParts = 20;
   }
 #endif

   // target load per partition: a tenth of the average PE load, but at least
   // twice the average compute; an explicit relative grainsize overrides both
   double targetLoad = averageLoad / 10.;
   if ( targetLoad < 2. * avgCompute ) targetLoad = 2. * avgCompute;
   if ( simParams->ldbRelativeGrainsize > 0. ) {
     targetLoad = averageLoad * simParams->ldbRelativeGrainsize;
   }
   CkPrintf("LDB: Partitioning computes with target load %f\n", targetLoad);

   int totalAddedParts = 0;
   for ( int g = 0; g < children; ++g ) {
     SplitComputesMsg *part = splitComputesMsgs[g];
     for ( int k = 0; k < part->n; ++k ) {
       // clamp the partition count to [1, maxParts]
       int nparts = (int) ceil(part->load[k] / targetLoad);
       if ( nparts > maxParts ) nparts = maxParts;
       if ( nparts < 1 ) nparts = 1;
       computeMap->setNewNumPartitions(part->cid[k], nparts);
       totalAddedParts += nparts - 1;
     }
     delete part;
   }

   CkPrintf("LDB: Increased migratable compute count from %d to %d\n",
            nMoveableComputes, nMoveableComputes + totalAddedParts);
   CkPrintf("LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
 }


 /*
  * Group-level (level 1) load balancing for the PEs described by stats.
  *
  * In order, this function:
  *   1. allocates the work arrays if needed and fills them via buildData()
  *      (which calls stats->clear() before returning);
  *   2. computes this group's summary: average/maximum compute load and
  *      average load per available PE;
  *   3. sends a SplitComputesMsg with that summary to PE 0; at step() == 1
  *      the message also lists every compute whose numPartitions() is
  *      non-zero, together with its load;
  *   4. runs the strategy selected by simParams->ldbStrategy, except at
  *      step() == 1 where computes are left on their old processor;
  *   5. builds the migration list, records each move in the ComputeMap,
  *      publishes peLoads/startPE/endPE for Strategy(), and frees the
  *      work arrays.
  *
  * Dies via NAMD_die() if no PE in the group is available.
  *
  * Returns the message produced by createMigrateMsg().
  */
 CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
   int numProcessors = stats->nprocs();  // number of processors at group level
   int numPatches = PatchMap::Object()->numPatches();
   ComputeMap *computeMap = ComputeMap::Object();
   const int numGroupComputes = stats->n_migrateobjs;
   const SimParameters* simParams = Node::Object()->simParameters;

   if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
   // these data structures are global and need to be distributed
   if ( ! patchArray ) patchArray = new patchInfo[numPatches];
   if ( ! computeArray ) computeArray = new computeInfo[numGroupComputes];
   if ( ! from_procs ) from_procs = new int[numGroupComputes];

   int nMoveableComputes = buildData(stats);
   CmiAssert(nMoveableComputes <= numGroupComputes);

 #if LDB_DEBUG
 #define DUMP_LDBDATA 1
 #define LOAD_LDBDATA 1
 #endif

 #if DUMP_LDBDATA
   dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
 #elif LOAD_LDBDATA
   loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
   // CkExit();
 #endif

   // ---- group summary -----------------------------------------------------
   double averageLoad = 0.;
   double avgCompute;
   double maxCompute;
   int maxComputeId;
   int numPesAvailable;
   {
    int i;
    double total = 0.;
    maxCompute = 0.;
    int maxi = -1;
    for (i=0; i<nMoveableComputes; i++) {
       double load = computeArray[i].load;
       total += load;
       if ( load > maxCompute ) { maxCompute = load;  maxi = i; }
    }
    // A group may own no moveable computes.  Dividing by zero here would
    // give NaN, and splitComputes() multiplies this value by the (zero)
    // count, which is still NaN and would poison the global average.
    avgCompute = nMoveableComputes ? total / nMoveableComputes : 0.;
    maxComputeId = maxi < 0 ? -1 : LdbIdField(computeArray[maxi].handle.id, 0);

    // add the background load of every available PE to the compute total
    int P = stats->nprocs();
    numPesAvailable = 0;
    for (i=0; i<P; i++) {
       if (processorArray[i].available) {
         ++numPesAvailable;
         total += processorArray[i].backgroundLoad;
       }
    }
    if (numPesAvailable == 0)
      NAMD_die("No processors available for load balancing!\n");

    averageLoad = total/numPesAvailable;
   }

   // ---- first pass at step 1: count splittable computes --------------------
   int i_split = 0;
   double maxUnsplit = 0.;

   if ( step() == 1 ) {
     for (int i=0; i<nMoveableComputes; i++) {
       const int cid = LdbIdField(computeArray[i].handle.id, 0);
       if ( computeMap->numPartitions(cid) == 0 ) {
         // not partitionable: only track the largest such load
         const double load = computeArray[i].load;
         if ( load > maxUnsplit ) maxUnsplit = load;
         continue;
       }
       ++i_split;
     }
   }

   // ---- report to PE 0 ------------------------------------------------------
   {
     // both variable-length arrays (cid, load) get i_split entries
     SplitComputesMsg *msg = new(i_split,i_split) SplitComputesMsg;
     msg->maxUnsplit = maxUnsplit;
     msg->averageLoad = averageLoad;
     msg->avgCompute = avgCompute;
     msg->maxCompute = maxCompute;
     msg->maxComputeId = maxComputeId;
     msg->nMoveableComputes = nMoveableComputes;
     msg->numPesAvailable = numPesAvailable;
     msg->n = i_split;

     if ( step() == 1 ) {
       // second pass: fill the arrays in the same order as the count above,
       // and pin every compute to its old processor (no strategy runs now)
       i_split = 0;
       for (int i=0; i<nMoveableComputes; i++) {
         computeArray[i].processor = computeArray[i].oldProcessor;
         const int cid = LdbIdField(computeArray[i].handle.id, 0);
         if ( computeMap->numPartitions(cid) == 0 ) {
           continue;
         }
         msg->cid[i_split] = cid;
         msg->load[i_split] = computeArray[i].load;
         ++i_split;
       }
     }

     thisProxy[0].splitComputes(msg);
   }

   // ---- run the selected strategy ------------------------------------------
   if ( step() == 1 ) {
     // compute splitting only
   } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
     if (step() < 4)
       TorusLB(computeArray, patchArray, processorArray,
                   nMoveableComputes, numPatches, numProcessors);
     else
       RefineTorusLB(computeArray, patchArray, processorArray,
                   nMoveableComputes, numPatches, numProcessors, 1);
   } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
     TorusLB(computeArray, patchArray, processorArray,
                   nMoveableComputes, numPatches, numProcessors);
   } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
     RefineTorusLB(computeArray, patchArray, processorArray,
                   nMoveableComputes, numPatches, numProcessors, 1);
   } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
     NAMD_die("Old load balancer strategy is not compatible with hybrid balancer.");
     // NOTE(review): the calls below are presumably unreachable because
     // NAMD_die() is not expected to return -- confirm before removing.
     if (step() < 4)
       Alg7(computeArray, patchArray, processorArray,
                   nMoveableComputes, numPatches, numProcessors);
     else
       RefineOnly(computeArray, patchArray, processorArray,
                   nMoveableComputes, numPatches, numProcessors);
   }

 #if LDB_DEBUG && USE_TOPOMAP
   TopoManager tmgr;
   int pe1, pe2, pe3, hops=0;
   /* This is double counting the hops
   for(int i=0; i<nMoveableComputes; i++)
   {
     pe1 = computeArray[i].processor;
     pe2 = patchArray[computeArray[i].patch1].processor;
     pe3 = patchArray[computeArray[i].patch2].processor;
     hops += tmgr.getHopsBetweenRanks(pe1, pe2);
     if(computeArray[i].patch1 != computeArray[i].patch2)
       hops += tmgr.getHopsBetweenRanks(pe1, pe3);
   }*/
   for (int i=0; i<numPatches; i++)  {
     //int num = patchArray[i].proxiesOn.numElements();
     pe1 = patchArray[i].processor;
     Iterator nextProc;
     processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
     while (p) {
       pe2 = p->Id;
       hops += tmgr.getHopsBetweenRanks(pe1, pe2);
       p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
     }
   }
   CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
 #endif

 #if DUMP_LDBDATA
   dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
 #elif LOAD_LDBDATA
   dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
   // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
   // CkExit();
 #endif

   // For error checking:
   // Count up computes, to see if somebody doesn't have any computes
   int i;
 #if 0
   int* computeCount = new int[numProcessors];
   for(i=0; i<numProcessors; i++)
     computeCount[i]=0;
   for(i=0; i<nMoveableComputes; i++)
     computeCount[computeArray[i].processor]++;
   for(i=0; i<numProcessors; i++) {
     if (computeCount[i]==0)
       iout << iINFO <<"Warning: Processor " << i
            << " has NO moveable computes.\n" << endi;
   }
   delete [] computeCount;
 #endif

   // ---- build the migration list -------------------------------------------
   // from_procs[] holds group-relative indices, computeArray[].processor
   // holds absolute PE numbers, hence the stats->procs[0].pe offset.
   CkVec<MigrateInfo *> migrateInfo;
   for(i=0;i<nMoveableComputes;i++) {
     if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
       /* CkPrintf("[%d] Obj %d migrating from %d (%d) to %d\n",
                      CkMyPe(),computeArray[i].handle.id.id[0],
                          from_procs[i], computeArray[i].oldProcessor, computeArray[i].processor); */
       MigrateInfo *migrateMe = new MigrateInfo;
       migrateMe->obj = computeArray[i].handle;
       //migrateMe->from_pe = computeArray[i].oldProcessor;
       // an index equal to numProcessors marks an object from outside the group
       int frompe = from_procs[i];
       if (frompe == numProcessors)
         frompe = -1;
       else
         frompe = frompe + stats->procs[0].pe;
       migrateMe->from_pe = frompe;
       migrateMe->to_pe = computeArray[i].processor;
       if (frompe == -1) {
           // don't know yet which processor this compute belongs to, but
           // inform receiver
         LDObjData obj;
         obj.handle = computeArray[i].handle;
         thisProxy[computeArray[i].processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
       }
       migrateInfo.push_back(migrateMe);

       // sneak in updates to ComputeMap
       //ERASE CkPrintf("%d setting %d to processor %d\n",CkMyPe(),computeArray[i].handle.id.id[0],computeArray[i].processor);
       computeMap->setNewNode(LdbIdField(computeArray[i].handle.id, 0),
                                 computeArray[i].processor);
     }
   }
   // CkPrintf("LOAD BALANCING READY %d\n",CkMyPe());

   LBMigrateMsg* msg;
   msg = createMigrateMsg(migrateInfo, numProcessors);

   // ---- publish per-PE loads for Strategy(), which deletes peLoads ----------
   peLoads = new double [numProcessors];
   startPE = processorArray[0].Id;
   endPE = processorArray[numProcessors-1].Id;
   // CkPrintf("[%d] numProcessors=%d, %d to %d\n",CkMyPe(),numProcessors,processorArray[0].Id,processorArray[numProcessors-1].Id);
   for (i=0; i<numProcessors; i++) {
         peLoads[i] = processorArray[i].load;
   }

   // ---- release the work arrays; they are re-allocated on the next call -----
   delete [] from_procs;
   delete [] processorArray;
   delete [] patchArray;
   delete [] computeArray;

   from_procs = NULL;
   processorArray = NULL;
   patchArray = NULL;
   computeArray = NULL;

   return msg;

 }


 /*
  * Debugging aid: writes the processor, patch and compute work arrays plus
  * the proxy sets to a text file named "<file>_<pe>.<step>".
  *
  *   file           file name prefix
  *   numProcessors  number of entries of processorArray to write
  *   numPatches     number of entries of patchArray to write
  *   numComputes    number of entries of computeArray to write
  *
  * If the file cannot be opened, the error is reported with perror() and
  * nothing is written.  A name longer than the local buffer is truncated.
  */
 void NamdHybridLB::dumpDataASCII(char *file, int numProcessors,
                                int numPatches, int numComputes)
 {
   char filename[128];
   // snprintf instead of sprintf: the prefix is caller supplied, so the
   // formatted name must not be allowed to overrun the fixed buffer.
   snprintf(filename, sizeof(filename), "%s_%d.%d", file, CkMyPe(), step());
   FILE* fp = fopen(filename,"w");
   if (fp == NULL){
      perror("dumpLDStatsASCII");
      return;
   }
   // CkPrintf("***** DUMP data to file: %s ***** \n", filename);

   // header: the three counts
   fprintf(fp,"%d %d %d\n",numProcessors,numPatches,numComputes);

   int i;
   // one line per processor
   for(i=0;i<numProcessors;i++) {
     processorInfo* p = processorArray + i;
     fprintf(fp,"%d %e %e %e %e\n",p->Id,p->load,p->backgroundLoad,p->computeLoad,p->idleTime);
   }

   // one line per patch
   for(i=0;i < numPatches; i++) {
     patchInfo* p = patchArray + i;
     fprintf(fp,"%d %e %d %d\n",p->Id,p->load,p->processor,p->numAtoms);
   }

   // one line per compute
   for(i=0; i < numComputes; i++) {
     computeInfo* c = computeArray + i;
     fprintf(fp,"%d %e %d %d %d %d",c->Id,c->load,c->patch1,c->patch2,
             c->processor,c->oldProcessor);
     fprintf(fp, "\n");
   }

   // dump patchSet: for each processor, the ids of the patches in its proxy set
   for (i=0; i< numProcessors; i++) {
       int num = processorArray[i].proxies.numElements();
       fprintf(fp, "%d %d: ", i, num);
       Iterator nextProxy;
       patchInfo *p = (patchInfo *)processorArray[i].proxies.
         iterator((Iterator *)&nextProxy);
       while (p) {
           fprintf(fp, "%d ", p->Id);
           p = (patchInfo *)processorArray[i].proxies.
             next((Iterator*)&nextProxy);
       }
       fprintf(fp, "\n");
   }

   // dump proxiesOn: for each patch, the ids of the processors holding a proxy
   for (i=0; i<numPatches; i++)  {
     int num = patchArray[i].proxiesOn.numElements();
     fprintf(fp, "%d %d: ", i, num);
       Iterator nextProc;
       processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.
         iterator((Iterator *)&nextProc);
       while (p) {
         fprintf(fp, "%d ", p->Id);
         p = (processorInfo *)patchArray[i].proxiesOn.
           next((Iterator*)&nextProc);
       }
       fprintf(fp, "\n");
   }

   fclose(fp);
   //CkExit();
 }


 /*
  * Fills processorArray, patchArray, computeArray and from_procs from the
  * load database of this group.
  *
  *   - processorArray[i] describes stats->procs[i]; Id is the absolute PE
  *     number, backgroundLoad is bg_walltime scaled by the PME / home-patch /
  *     generic background factor from SimParameters.
  *   - PEs are marked unavailable according to ldbUnloadZero, ldbUnloadOne,
  *     ldbUnloadPME and (MEM_OPT_VERSION only) ldbUnloadOutputPEs.
  *   - patchArray covers all patches; proxy sets are linked only for proxy
  *     PEs that fall inside this group's PE range.
  *   - Non-NAMD objects, patches and bonded computes add their wall time to
  *     the background load of their PE; migratable objects with non-zero
  *     wall time become entries of computeArray.
  *
  * The arrays must already be allocated by the caller.  stats->clear() is
  * called before returning.
  *
  * Returns the number of entries written to computeArray.
  */
 int NamdHybridLB::buildData(LDStats* stats) {
   int n_pes = stats->nprocs();

   PatchMap* patchMap = PatchMap::Object();
   ComputeMap* computeMap = ComputeMap::Object();
   const SimParameters* simParams = Node::Object()->simParameters;

   BigReal bgfactor = simParams->ldbBackgroundScaling;
   BigReal pmebgfactor = simParams->ldbPMEBackgroundScaling;
   BigReal homebgfactor = simParams->ldbHomeBackgroundScaling;
   int pmeOn = simParams->PMEOn;
   int unLoadPme = simParams->ldbUnloadPME;
   int pmeBarrier = simParams->PMEBarrier;
   int unLoadZero = simParams->ldbUnloadZero;
   int unLoadOne = simParams->ldbUnloadOne;
   int unLoadIO= simParams->ldbUnloadOutputPEs;
   // traversing the list of processors and getting their load information
   int i, pe_no;
   for (i=0; i<n_pes; ++i) {
     pe_no = stats->procs[i].pe;

     // BACKUP processorArray[i].Id = i;
     processorArray[i].Id = pe_no;               // absolute pe number
     processorArray[i].available = true;
     // BACKUP if ( pmeOn && isPmeProcessor(i) )
     if ( pmeOn && isPmeProcessor(pe_no) ) {
       processorArray[i].backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
     // BACKUP } else if (patchMap->numPatchesOnNode(i) > 0) {
     } else if (patchMap->numPatchesOnNode(pe_no) > 0) {
       processorArray[i].backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
     } else {
       processorArray[i].backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
     }
     processorArray[i].idleTime = stats->procs[i].idletime;
     processorArray[i].load = processorArray[i].computeLoad = 0.0;
   }

   // If I am group zero, then offload processor 0 and 1 in my group
   if(stats->procs[0].pe == 0) {
     if(unLoadZero) processorArray[0].available = false;
     // processorArray has n_pes entries; a one-PE group has no index 1,
     // so the write must be skipped there.
     if(unLoadOne && n_pes > 1) processorArray[1].available = false;
   }

   // if all pes are Pme, disable this flag
   if (pmeOn && unLoadPme) {
     for (i=0; i<n_pes; i++) {
       if(!isPmeProcessor(stats->procs[i].pe))  break;
     }
     if (i == n_pes) {
       iout << iINFO << "Turned off unLoadPme flag!\n"  << endi;
       unLoadPme = 0;
     }
   }

   if (pmeOn && unLoadPme) {
     for (i=0; i<n_pes; i++) {
       if ((pmeBarrier && i==0) || isPmeProcessor(stats->procs[i].pe))
         processorArray[i].available = false;
     }
   }

   // if all pes are output, disable this flag
 #ifdef MEM_OPT_VERSION
   if (unLoadIO) {
       if (simParams->numoutputprocs == n_pes) {
           iout << iINFO << "Turned off unLoadIO flag!\n"  << endi;
           unLoadIO = 0;
       }
   }
   if (unLoadIO){
       for (i=0; i<n_pes; i++) {
           if (isOutputProcessor(stats->procs[i].pe))
               processorArray[i].available = false;
       }
   }
 #endif

   // need to go over all patches to get all required proxies
   int numPatches = patchMap->numPatches();
   int totalLocalProxies = 0;
   int totalProxies = 0;
   for ( int pid=0; pid<numPatches; ++pid ) {
         int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];

         patchArray[pid].Id = pid;
         patchArray[pid].numAtoms = 0;
         patchArray[pid].processor = patchMap->node(pid);

         const int numProxies =
 #if 0 // USE_TOPOMAP - this function needs to be there for the hybrid case
         requiredProxiesOnProcGrid(pid,neighborNodes);
 #else
         requiredProxies(pid, neighborNodes);
 #endif

         // only proxies on PEs inside this group's PE range are linked
         int numLocalProxies = 0;
         for (int k=0; k<numProxies; k++) {
                 if( (neighborNodes[k] >= stats->procs[0].pe) && (neighborNodes[k] <= stats->procs[n_pes-1].pe) ){
                         ++numLocalProxies;
                         int index = neighborNodes[k] - stats->procs[0].pe;
                         processorArray[index].proxies.unchecked_insert(&patchArray[pid]);
                         patchArray[pid].proxiesOn.unchecked_insert(&processorArray[index]);
                 }
         }
 #if 0
         if ( numLocalProxies ) {
             CkPrintf("LDB Pe %d patch %d has %d local of %d total proxies\n",
                 CkMyPe(), pid, numLocalProxies, numProxies);
         }
 #endif
         totalLocalProxies += numLocalProxies;
         totalProxies += numProxies;
   }
 #if 0
   CkPrintf("LDB Pe %d has %d local of %d total proxies\n",
                 CkMyPe(), totalLocalProxies, totalProxies);
 #endif

   int nMoveableComputes=0;

   int j;

   // this loop goes over only the objects in this group
   const auto nObjs = stats->objData.size();
   for(j=0; j < nObjs; j++) {
         const LDObjData &this_obj = stats->objData[j];
         // group-relative index of the PE the object currently lives on
         int frompe = stats->from_proc[j];

         // filter out non-NAMD managed objects (like PME array)
         // NOTE(review): frompe is used as an index without a range check in
         // the three background-load branches below -- presumably these
         // objects are always local to the group; confirm.
         if (this_obj.omID().id.idx != 1) {
                 // CmiAssert(frompe>=0 && frompe<n_pes);
                 // CkPrintf("non-NAMD object %d on pe %d with walltime %lf\n",
                 // this_obj.id().id[0], frompe + stats->procs[0].pe, this_obj.wallTime);
                 processorArray[frompe].backgroundLoad += this_obj.wallTime;
                 continue;
         }

         if (LdbIdField(this_obj.id(), 1) == PATCH_TYPE) { // Its a patch
                 // handled above to get required proxies from all patches
                 processorArray[frompe].backgroundLoad += this_obj.wallTime;
         } else if (LdbIdField(this_obj.id(), 1) == BONDED_TYPE) { // Its a bonded compute
                 processorArray[frompe].backgroundLoad += this_obj.wallTime;
         } else if (this_obj.migratable && this_obj.wallTime != 0.) { // Its a compute

                 const int cid = LdbIdField(this_obj.id(), 0);
                 const int p0 = computeMap->pid(cid,0);

                 // For self-interactions, just return the same pid twice
                 int p1;
                 if (computeMap->numPids(cid) > 1)
                         p1 = computeMap->pid(cid,1);
                 else
                         p1 = p0;

                 computeArray[nMoveableComputes].Id = cid;
                 //BACKUP computeArray[nMoveableComputes].oldProcessor = stats->from_proc[j];
                 if (frompe >= n_pes) {  // from outside
 CkPrintf("assigning random old processor...this looks broken\n");
                   computeArray[nMoveableComputes].oldProcessor = CrnRand()%n_pes + stats->procs[0].pe;     // random
                 }
                 else {
                   computeArray[nMoveableComputes].oldProcessor = frompe + stats->procs[0].pe;
                 }
                 // from_procs keeps the raw (possibly out-of-group) index
                 from_procs[nMoveableComputes] = frompe;

                 //BACKUP2 index = stats->from_proc[j] - stats->procs[0].pe;
                 //BACKUP processorArray[stats->from_proc[j]].computeLoad += this_obj.wallTime;
                 // charge the compute's load to its (absolute) old processor
                 int index = computeArray[nMoveableComputes].oldProcessor - stats->procs[0].pe;
                 processorArray[index].computeLoad += this_obj.wallTime;
                 computeArray[nMoveableComputes].processor = -1;
                 computeArray[nMoveableComputes].patch1 = p0;
                 computeArray[nMoveableComputes].patch2 = p1;
                 computeArray[nMoveableComputes].handle = this_obj.handle;
                 computeArray[nMoveableComputes].load = this_obj.wallTime;
                 nMoveableComputes++;
         }
   }

   // total load per PE = background + moveable compute load
   for (i=0; i<n_pes; i++) {
     processorArray[i].load = processorArray[i].backgroundLoad + processorArray[i].computeLoad;
   }
   stats->clear();
   return nMoveableComputes;
 }


 /*
  * Lists the nodes that should hold a proxy of patch `id`.
  *
  * First every distinct base node of the patch itself and of its downstream
  * neighbors, other than the patch's own node, is recorded.  Then extra
  * proxies are placed on nodes without patches around the patch's node.
  *
  *   id             patch whose proxies are requested
  *   neighborNodes  output array receiving the distinct node numbers
  *
  * Returns the number of entries written to neighborNodes.
  */
 int NamdHybridLB::requiredProxies(PatchID id, int neighborNodes[])
 {
   PatchMap* pmap = PatchMap::Object();
   const int homeNode = pmap->node(id);
   int nProxyNodes = 0;

   // Appends `node` to neighborNodes unless it is already present.
   auto addIfNew = [&](int node) {
     int k = 0;
     while ( k < nProxyNodes && neighborNodes[k] != node ) ++k;
     if ( k == nProxyNodes ) {
       neighborNodes[nProxyNodes] = node;
       nProxyNodes++;
     }
   };

   // slot 0 is the patch itself, the rest are its downstream neighbors
   PatchID neighbors[1 + PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
   neighbors[0] = id;
   const int numNeighbors = 1 + pmap->downstreamNeighbors(id,neighbors+1);
   for ( int n = 0; n < numNeighbors; ++n ) {
     const int baseNode = pmap->basenode(neighbors[n]);
     if ( baseNode != homeNode ) {
       addIfNew(baseNode);
     }
   }

   // Distribute initial default proxies across empty processors.
   // This shouldn't be necessary, but may constrain the load balancer
   // and avoid placing too many proxies on a single processor.  -JCP

   // This code needs to be turned off when the creation of ST is
   // shifted to the load balancers -ASB

 #if 1
   const int numNodes = CkNumPes();
   const int numPatches = pmap->numPatches();
   const int emptyNodes = numNodes - numPatches;
   if ( emptyNodes > numPatches ) {
     // more empty nodes than patches: spread several proxies per patch
     int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
     const int maxNodesPerPatch = PatchMap::MaxOneAway + PatchMap::MaxTwoAway;
     if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;

     // walk upward from the home node while the nodes have no patches
     int proxyNode = (homeNode + 1) % numNodes;
     while ( nProxyNodes < nodesPerPatch &&
             ! pmap->numPatchesOnNode(proxyNode) ) {
       if ( proxyNode != homeNode ) {
         addIfNew(proxyNode);
       }
       proxyNode = (proxyNode + 1) % numNodes;
     }

     // then walk downward from the home node in the same way
     proxyNode = (homeNode - 1 + numNodes) % numNodes;
     while ( nProxyNodes < nodesPerPatch &&
             ! pmap->numPatchesOnNode(proxyNode) ) {
       if ( proxyNode != homeNode ) {
         addIfNew(proxyNode);
       }
       proxyNode = (proxyNode - 1 + numNodes) % numNodes;
     }

     // finally scan all nodes once, taking any node without patches
     proxyNode = (homeNode + 1) % numNodes;
     int visited = 0;
     while ( nProxyNodes < nodesPerPatch ) {
       if ( ! pmap->numPatchesOnNode(proxyNode) && proxyNode != homeNode ) {
         addIfNew(proxyNode);
       }
       proxyNode = (proxyNode + 1) % numNodes;
       visited++;
       if ( visited == numNodes ) break;   // we looped all
     }
   } else {
     // few empty nodes: only consider the two immediate neighbors
     const int below = homeNode - 1;
     if ( below >= 0 && ! pmap->numPatchesOnNode(below) ) {
       if ( below != homeNode ) {
         addIfNew(below);
       }
     }
     const int above = homeNode + 1;
     if ( above < numNodes && ! pmap->numPatchesOnNode(above) ) {
       if ( above != homeNode ) {
         addIfNew(above);
       }
     }
   }
 #endif

   return nProxyNodes;
 }


Node::Object
static Node * Object()
Definition: Node.h:86

load
BlockLoad::TempStorage load
Definition: CudaTileListKernel.cu:691

NamdHybridLB::UpdateLocalLBInfo
void UpdateLocalLBInfo(LocalLBInfoMsg *msg)
Definition: NamdHybridLB.C:178

PatchMap::MaxOneAway
Definition: PatchMap.h:46

AllocateNamdDummyLB
NamdDummyLB * AllocateNamdDummyLB()
Definition: NamdDummyLB.C:20

SplitComputesMsg::cid
int * cid
Definition: NamdHybridLB.C:30

computeInfo
Definition: elements.h:20

computeInfo::patch1
int patch1
Definition: elements.h:23

iINFO
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81

BONDED_TYPE
represents bonded compute
Definition: LdbCoordinator.h:69

AllocateNamdCentLB
NamdCentLB * AllocateNamdCentLB()
Definition: NamdCentLB.C:38

Alg7
Definition: Alg7.h:13

SimParameters::numoutputprocs
int numoutputprocs
Definition: SimParameters.h:1039

ComputeMap::setNewNumPartitions
void setNewNumPartitions(ComputeID cid, char numPartitions)
Definition: ComputeMap.h:144

SimParameters::ldbRelativeGrainsize
BigReal ldbRelativeGrainsize
Definition: SimParameters.h:149

SplitComputesMsg::n
int n
Definition: NamdHybridLB.C:29

ComputeMap::numComputes
int numComputes(void)
Definition: ComputeMap.h:101

SimParameters::ldbUnloadPME
Bool ldbUnloadPME
Definition: SimParameters.h:179

SplitComputesMsg::maxCompute
double maxCompute
Definition: NamdHybridLB.C:25

SimParameters
Definition: SimParameters.h:100

PatchMap::Object
static PatchMap * Object()
Definition: PatchMap.h:27

cpuloads
double * cpuloads
Definition: NamdCentLB.C:24

LocalLBInfoMsg::endPE
int endPE
Definition: NamdHybridLB.h:35

IRSet::numElements
int numElements()
Definition: Set.C:144

RefineOnly
Definition: RefineOnly.h:13

Node::simParameters
SimParameters * simParameters
Definition: Node.h:178

processorInfo::proxies
LargeIRSet proxies
Definition: elements.h:46

Node.h

LDBSTRAT_REFINEONLY
#define LDBSTRAT_REFINEONLY
Definition: SimParameters.h:67

SimParameters::ldbUnloadOutputPEs
Bool ldbUnloadOutputPEs
Definition: SimParameters.h:182

InfoRecord::Id
int Id
Definition: elements.h:16

InfoStream.h

endi
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54

SimParameters::PMEBarrier
Bool PMEBarrier
Definition: SimParameters.h:782

RefineTorusLB
Definition: RefineTorusLB.h:13

LocalLBInfoMsg::n_moves
int n_moves
Definition: NamdHybridLB.h:33

computeInfo::processor
int processor
Definition: elements.h:24

iout
#define iout
Definition: InfoStream.h:51

computeInfo::oldProcessor
int oldProcessor
Definition: elements.h:25

averageLoad
static double averageLoad
Definition: ProxyMgr.C:696

SplitComputesMsg::maxComputeId
int maxComputeId
Definition: NamdHybridLB.C:26

LocalLBInfoMsg::startPE
int startPE
Definition: NamdHybridLB.h:34

PatchMap::basenode
int basenode(int pid) const
Definition: PatchMap.h:117

NamdHybridLB::NamdHybridLB
NamdHybridLB(const CkLBOptions &opt)
Constructor taking a CkLBOptions argument.
Definition: NamdHybridLB.C:65

LDBSTRAT_DEFAULT
#define LDBSTRAT_DEFAULT
Definition: SimParameters.h:65

SimParameters::hybridGroupSize
int hybridGroupSize
Definition: SimParameters.h:145

ComputeMap.h

isPmeProcessor
int isPmeProcessor(int)
Definition: ComputePme.C:604

ComputeMap
Definition: ComputeMap.h:85

SimParameters::ldbHomeBackgroundScaling
BigReal ldbHomeBackgroundScaling
Definition: SimParameters.h:148

PatchMap::MaxTwoAway
Definition: PatchMap.h:45

next
static Units next(Units u)
Definition: ParseOptions.C:48

LocalLBInfoMsg::cpuloads
double * cpuloads
Definition: NamdHybridLB.h:37

NamdHybridLB::splitComputes
void splitComputes(SplitComputesMsg *)
Definition: NamdHybridLB.C:219

SimParameters::PMEOn
Bool PMEOn
Definition: SimParameters.h:771

CreateNamdHybridLB
void CreateNamdHybridLB()
Definition: NamdHybridLB.C:49

LDBSTRAT_OLD
#define LDBSTRAT_OLD
Definition: SimParameters.h:68

processorInfo::idleTime
double idleTime
Definition: elements.h:40

computeInfo::patch2
int patch2
Definition: elements.h:23

SplitComputesMsg::averageLoad
double averageLoad
Definition: NamdHybridLB.C:23

LocalLBInfoMsg
Definition: NamdHybridLB.h:31

ComputeMap::numPartitions
int numPartitions(ComputeID cid)
Definition: ComputeMap.C:135

IF_NEW_NODE
#define IF_NEW_NODE

PatchID
int PatchID
Definition: NamdTypes.h:182

NamdHybridLB.h

numPatches
const int numPatches
Definition: CudaComputeNonbondedKernel.cu:254

SplitComputesMsg::nMoveableComputes
int nMoveableComputes
Definition: NamdHybridLB.C:27

ComputeMap::setNewNode
void setNewNode(ComputeID cid, NodeID node)
Definition: ComputeMap.h:120

LdbIdField
const int & LdbIdField(const LdbId &id, const int index)
Definition: LdbCoordinator.h:46

patchInfo::numAtoms
int numAtoms
Definition: elements.h:32

NAMD_die
void NAMD_die(const char *err_msg)
Definition: common.C:85

LdbCoordinator::Object
static LdbCoordinator * Object()
Definition: LdbCoordinator.h:93

patchInfo
Definition: elements.h:29

SimParameters::ldbBackgroundScaling
BigReal ldbBackgroundScaling
Definition: SimParameters.h:146

SplitComputesMsg::load
float * load
Definition: NamdHybridLB.C:31

PATCH_TYPE
represents a patch
Definition: LdbCoordinator.h:68

InfoRecord::load
double load
Definition: elements.h:15

Iterator
Definition: Set.h:19

LdbCoordinator.h

LocalLBInfoMsg::moves
MigrateInfo * moves
Definition: NamdHybridLB.h:36

LDBSTRAT_COMPREHENSIVE
#define LDBSTRAT_COMPREHENSIVE
Definition: SimParameters.h:66

PatchMap::downstreamNeighbors
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714

SplitComputesMsg::maxUnsplit
double maxUnsplit
Definition: NamdHybridLB.C:22

computeInfo::handle
LDObjHandle handle
Definition: elements.h:26

simParams
#define simParams
Definition: Output.C:127

PatchMap::numPatches
int numPatches(void) const
Definition: PatchMap.h:59

PatchMap::node
int node(int pid) const
Definition: PatchMap.h:114

NamdDummyLB::work
void work(LDStats *stats)
Definition: NamdDummyLB.C:44

patchInfo::proxiesOn
IRSet proxiesOn
Definition: elements.h:33

ComputeMap::Object
static ComputeMap * Object()
Definition: ComputeMap.h:89

SimParameters::ldbPMEBackgroundScaling
BigReal ldbPMEBackgroundScaling
Definition: SimParameters.h:147

SplitComputesMsg::avgCompute
double avgCompute
Definition: NamdHybridLB.C:24

processorInfo::computeLoad
double computeLoad
Definition: elements.h:41

ComputeMap::numPids
int numPids(ComputeID cid)
Definition: ComputeMap.C:103

PatchMap.h

PatchMap::numPatchesOnNode
int numPatchesOnNode(int node)
Definition: PatchMap.h:60

IRSet::unchecked_insert
void unchecked_insert(InfoRecord *)
Definition: Set.C:32

patchInfo::processor
int processor
Definition: elements.h:31

ComputeMap::pid
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:109

SimParameters::LCPOOn
Bool LCPOOn
Definition: SimParameters.h:532

PatchMap
Definition: PatchMap.h:23

isOutputProcessor
int isOutputProcessor(int pe)
Definition: ParallelIOMgr.C:352

SimParameters::ldbUnloadOne
Bool ldbUnloadOne
Definition: SimParameters.h:181

SimParameters::ldbUnloadZero
Bool ldbUnloadZero
Definition: SimParameters.h:180

SplitComputesMsg::numPesAvailable
int numPesAvailable
Definition: NamdHybridLB.C:28

processorInfo::backgroundLoad
double backgroundLoad
Definition: elements.h:39

SimParameters::ldbStrategy
int ldbStrategy
Definition: SimParameters.h:139

processorInfo::available
bool available
Definition: elements.h:44

SplitComputesMsg
Definition: NamdHybridLB.C:20

BigReal
double BigReal
Definition: common.h:114

processorInfo
Definition: elements.h:36

TorusLB
Definition: TorusLB.h:14