Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

LdbCoordinator.C

Go to the documentation of this file.
00001 
00007 /*****************************************************************************
00008  * $Source: /home/cvs/namd/cvsroot/namd2/src/LdbCoordinator.C,v $
00009  * $Author: gzheng $
00010  * $Date: 2012/05/18 07:33:48 $
00011  * $Revision: 1.116 $
00012  *****************************************************************************/
00013 
00014 #include <stdlib.h>
00015 
00016 #include "InfoStream.h"
00017 #include "NamdCentLB.h"
00018 #include "NamdHybridLB.h"
00019 #include "NamdDummyLB.h"
00020 #include "NamdNborLB.h"
00021 
00022 #include "HomePatch.h"
00023 #include "LdbCoordinator.decl.h"
00024 #include "LdbCoordinator.h"
00025 #include "NamdTypes.h"
00026 #include "Node.h"
00027 #include "SimParameters.h"
00028 #include "PatchMap.inl"
00029 #include "ComputeMap.h"
00030 //#define DEBUGM
00031 #define MIN_DEBUG_LEVEL 3
00032 #include "Debug.h"
00033 #include "Controller.h"
00034 #include "Sequencer.h"
00035 #include "RefineOnly.h"
00036 #include "ComputeMgr.h"
00037 #include "Compute.h"
00038 #include "packmsg.h"
00039 #include "Sync.h"
00040 
00041 #include "elements.h"
00042 #include "ComputeMgr.decl.h"
00043 
00044 #define DEBUG_LEVEL 4
00045 
00046 #if CONVERSE_VERSION_ELAN
00047 extern "C" void enableBlockingReceives();
00048 extern "C" void disableBlockingReceives();
00049 #endif
00050 
00051 void LdbCoordinator_initproc() {
00052   // Set the load balancing period (in seconds).  Without this the
00053   // load balancing framework will hang until 1 second has passed
00054   // since the last load balancing, causing hiccups in very fast runs.
00055   // This is duplicated below for older versions, but putting it here
00056   // also fixes the first load balance.
00057   LBSetPeriod(1.0e-5);
00058 }
00059 
00060 void LdbCoordinator::staticMigrateFn(LDObjHandle handle, int dest)
00061 {
00062    LdbCoordinator *ldbCoordinator = (LdbCoordinator *)LDOMUserData(handle.omhandle);
00063    ldbCoordinator->Migrate(handle,dest);
00064 }
00065 
00066 void LdbCoordinator::Migrate(LDObjHandle handle, int dest)
00067 {
00068   LdbMigrateMsg* msg = new LdbMigrateMsg;
00069   msg->handle = handle;
00070   msg->from = CkMyPe();
00071   msg->to = dest;
00072   if ( msg->to != CkMyPe() ) {
00073     CProxy_LdbCoordinator ldbProxy(thisgroup);
00074     ldbProxy[CkMyPe()].RecvMigrate(msg);
00075   } else {
00076     ExpectMigrate(msg);
00077   }
00078 }
00079 
00080 void LdbCoordinator::staticStatsFn(LDOMHandle h, int state)
00081 {
00082   CkPrintf("I'm supposed to set stats\n");
00083 }
00084 
00085 void LdbCoordinator::staticQueryEstLoadFn(LDOMHandle h)
00086 {
00087   CkPrintf("I'm supposed to query load\n");
00088 }
00089 
00090 void LdbCoordinator::staticReceiveAtSync(void* data)
00091 {
00092 
00093 #if CONVERSE_VERSION_ELAN
00094     //disableBlockingReceives();
00095 #endif
00096 
00097   ((LdbCoordinator*)data)->ReceiveAtSync();
00098 }
00099 
00100 void LdbCoordinator::ReceiveAtSync()
00101 {
00102   theLbdb->RegisteringObjects(myHandle);
00103 }
00104 
00105 void LdbCoordinator::staticResumeFromSync(void* data)
00106 {
00107   ((LdbCoordinator*)data)->ResumeFromSync();
00108 }
00109 
00110 void LdbCoordinator::ResumeFromSync()
00111 {
00112   theLbdb->DoneRegisteringObjects(myHandle);
00113   CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
00114   contribute(NULL, 0, CkReduction::random, cb);
00115 }
00116 
00117 LdbCoordinator::LdbCoordinator()
00118 {
00119   if (CkpvAccess(LdbCoordinator_instance) == NULL) {
00120     CkpvAccess(LdbCoordinator_instance) = this;
00121   } else {
00122     iout << iFILE << iERROR << iPE 
00123          << "LdbCoordinator instanced twice on same node!" << endi;
00124     CkExit();
00125   }
00126   
00127 #if 0
00128   // Create a load balancer
00129   if (CkMyPe() == 0) {
00130     //   CreateCentralLB();
00131     CreateNamdCentLB();
00132     //   CreateNamdNborLB();
00133   }
00134 #endif
00135 
00136   ldbCycleNum = 1;
00137   takingLdbData = 1;
00138   totalStepsDone = 0;
00139   nLocalComputes = nLocalPatches = 0;
00140   patchNAtoms = (int *) NULL;
00141   sequencerThreads = (Sequencer **) NULL;
00142   ldbStatsFP = NULL;
00143   computeArray = NULL;
00144   patchArray = NULL;
00145   processorArray = NULL;
00146 
00147   // Register self as an object manager for new charm++ balancer framework
00148   theLbdb = LBDatabase::Object(); 
00149 
00150   // Set the load balancing period (in seconds).  Without this the
00151   // load balancing framework will hang until 1 second has passed
00152   // since the last load balancing, causing hiccups in very fast runs.
00153   // Unfortunately, the clock is already set for the first load
00154   // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
00155   // For newer versions this is handled in initproc above.
00156 
00157   theLbdb->SetLBPeriod(1.0e-5);
00158 
00159   myOMid.id.idx = 1;
00160   LDCallbacks cb = { (LDMigrateFn)staticMigrateFn,
00161                      (LDStatsFn)staticStatsFn,
00162                      (LDQueryEstLoadFn)staticQueryEstLoadFn
00163                    };
00164   myHandle = theLbdb->RegisterOM(myOMid,(void*)this,cb);
00165 
00166   // Add myself as a local barrier receiver, so I know when I might
00167   // be registering objects.
00168   theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
00169                                    (void*)this);;
00170 
00171   // Also, add a local barrier client, to trigger load balancing
00172   ldBarrierHandle = theLbdb->
00173     AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
00174                           (void*)this);
00175   migrateMsgs = 0; // linked list
00176   numComputes = 0;
00177   reg_all_objs = 1;
00178 }
00179 
00180 LdbCoordinator::~LdbCoordinator(void)
00181 {
00182   delete [] patchNAtoms;
00183   delete [] sequencerThreads;
00184   if (CkMyPe() == 0)
00185   {
00186     delete [] computeArray;
00187     delete [] patchArray;
00188     delete [] processorArray;
00189   }
00190   if (ldbStatsFP)
00191     fclose(ldbStatsFP);
00192 
00193 }
00194 
00195 void LdbCoordinator::createLoadBalancer()
00196 {
00197   const SimParameters *simParams = Node::Object()->simParameters;
00198 
00199   // Create hierarchical or centralized load balancers
00200   // Currently centralized is the default
00201   if (simParams->ldBalancer == LDBAL_CENTRALIZED) {
00202     CkPrintf("LDB: Central LB being created...\n");
00203     CreateNamdCentLB();
00204   } else if (simParams->ldBalancer == LDBAL_HYBRID) {
00205     CkPrintf("LDB: Hybrid LB being created...\n");
00206     CreateNamdHybridLB();
00207   }
00208 }
00209 
00210 void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit)
00211 {
00212   const SimParameters *simParams = Node::Object()->simParameters;
00213 
00214 #if 0
00215   static int lbcreated = 0; // XXX static variables are unsafe for SMP
00216   // PE0 first time Create a load balancer
00217   if (CkMyPe() == 0 && !lbcreated) {
00218     if (simParams->ldbStrategy == LDBSTRAT_ALGNBOR) 
00219       CreateNamdNborLB();
00220     else {
00221       //   CreateCentralLB();
00222       CreateNamdCentLB();
00223     }
00224     lbcreated = 1;
00225   }
00226 #endif
00227 
00228   //  DebugM(10,"stepsPerLdbCycle initialized\n");
00229   stepsPerLdbCycle = simParams->ldbPeriod;
00230   firstLdbStep = simParams->firstLdbStep;
00231   int lastLdbStep = simParams->lastLdbStep;
00232   int stepsPerCycle = simParams->stepsPerCycle;
00233 
00234   computeMap = cMap;
00235   patchMap = pMap;
00236 
00237   // Set the number of received messages correctly for node 0
00238 
00239   nStatsMessagesExpected = Node::Object()->numNodes();
00240   nStatsMessagesReceived = 0;
00241 
00242   if (patchNAtoms) 
00243     delete [] patchNAtoms;  // Depends on delete NULL to do nothing
00244   nPatches = patchMap->numPatches();
00245   patchNAtoms = new int[nPatches];
00246 
00247   typedef Sequencer *seqPtr;
00248 
00249   if ( ! reinit ) {
00250     delete [] sequencerThreads;  // Depends on delete NULL to do nothing
00251     sequencerThreads = new seqPtr[nPatches];
00252   }
00253 
00254   nLocalPatches=0;
00255 
00256   int i;
00257   for(i=0;i<nPatches;i++)
00258   {
00259     if (patchMap->node(i) == Node::Object()->myid())
00260     {
00261       nLocalPatches++;
00262       patchNAtoms[i]=0;
00263     } else {
00264       patchNAtoms[i]=-1;
00265     }
00266     if ( ! reinit ) sequencerThreads[i]=NULL;
00267   }
00268   if ( ! reinit ) controllerThread = NULL;
00269   if (nLocalPatches != patchMap->numHomePatches())
00270     NAMD_die("Disaggreement in patchMap data.\n");
00271  
00272   const int oldNumComputes = numComputes;
00273   nLocalComputes = 0;
00274   numComputes = computeMap->numComputes();
00275 
00276   for(i=0;i<numComputes;i++)  {
00277     if ( (computeMap->node(i) == Node::Object()->myid())
00278          && ( 0
00279 #ifndef NAMD_CUDA
00280               || (computeMap->type(i) == computeNonbondedSelfType)
00281               || (computeMap->type(i) == computeNonbondedPairType)
00282 #endif
00283               || (computeMap->type(i) == computeLCPOType)
00284               || (computeMap->type(i) == computeSelfExclsType)
00285               || (computeMap->type(i) == computeSelfBondsType)
00286               || (computeMap->type(i) == computeSelfAnglesType)
00287               || (computeMap->type(i) == computeSelfDihedralsType)
00288               || (computeMap->type(i) == computeSelfImpropersType)
00289               || (computeMap->type(i) == computeSelfTholeType)
00290               || (computeMap->type(i) == computeSelfAnisoType)
00291               || (computeMap->type(i) == computeSelfCrosstermsType)
00292 
00293                  || (computeMap->type(i) == computeBondsType)
00294                  || (computeMap->type(i) == computeExclsType)
00295                  || (computeMap->type(i) == computeAnglesType)
00296                  || (computeMap->type(i) == computeDihedralsType)
00297                  || (computeMap->type(i) == computeImpropersType)
00298                  || (computeMap->type(i) == computeTholeType)
00299                  || (computeMap->type(i) == computeAnisoType)
00300                  || (computeMap->type(i) == computeCrosstermsType)
00301         ) ) {
00302       nLocalComputes++;
00303     }
00304   }
00305   
00306   // New LB frameworks registration
00307 
00308   // Allocate data structure to save incoming migrations.  Processor
00309   // zero will get all migrations
00310 
00311   // If this is the first time through, we need it register patches
00312   if (ldbCycleNum == reg_all_objs) {
00313     if ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) {
00314       reg_all_objs = 3;
00315     }
00316     // Tell the lbdb that I'm registering objects, until I'm done
00317     // registering them.
00318     theLbdb->RegisteringObjects(myHandle);
00319     
00320    if ( ldbCycleNum == 1 ) {
00321     patchHandles = new LDObjHandle[nLocalPatches];
00322     int patch_count=0;
00323     int i;
00324     for(i=0;i<nPatches;i++)
00325       if (patchMap->node(i) == Node::Object()->myid()) {
00326         LDObjid elemID;
00327         elemID.id[0] = i;
00328         elemID.id[1] = elemID.id[2] = elemID.id[3] = -2;
00329 
00330         if (patch_count >= nLocalPatches) {
00331           iout << iFILE << iERROR << iPE 
00332                << "LdbCoordinator found too many local patches!" << endi;
00333           CkExit();
00334         }
00335         HomePatch *p = patchMap->homePatch(i);
00336         p->ldObjHandle = 
00337         patchHandles[patch_count] 
00338           = theLbdb->RegisterObj(myHandle,elemID,0,0);
00339         patch_count++;
00340 
00341       }
00342    }
00343   
00344     if ( numComputes > oldNumComputes ) {
00345       // Register computes
00346       for(i=oldNumComputes; i<numComputes; i++)  {
00347         if ( computeMap->node(i) == Node::Object()->myid())
00348         {
00349           if ( 0
00350 #ifndef NAMD_CUDA
00351                   || (computeMap->type(i) == computeNonbondedSelfType)
00352                   || (computeMap->type(i) == computeNonbondedPairType)
00353 #endif
00354                   || (computeMap->type(i) == computeLCPOType)
00355                   || (computeMap->type(i) == computeSelfExclsType)
00356                   || (computeMap->type(i) == computeSelfBondsType)
00357                   || (computeMap->type(i) == computeSelfAnglesType)
00358                   || (computeMap->type(i) == computeSelfDihedralsType)
00359                   || (computeMap->type(i) == computeSelfImpropersType)
00360                   || (computeMap->type(i) == computeSelfTholeType)
00361                   || (computeMap->type(i) == computeSelfAnisoType)
00362                   || (computeMap->type(i) == computeSelfCrosstermsType)
00363                 )  {
00364           // Register the object with the load balancer
00365           // Store the depended patch IDs in the rest of the element ID
00366           LDObjid elemID;
00367           elemID.id[0] = i;
00368         
00369           if (computeMap->numPids(i) > 2)
00370             elemID.id[3] = computeMap->pid(i,2);
00371           else elemID.id[3] = -1;
00372 
00373           if (computeMap->numPids(i) > 1)
00374             elemID.id[2] =  computeMap->pid(i,1);
00375           else elemID.id[2] = -1;
00376 
00377           if (computeMap->numPids(i) > 0)
00378             elemID.id[1] =  computeMap->pid(i,0);
00379           else elemID.id[1] = -1;
00380 
00381           Compute *c = computeMap->compute(i);
00382           if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
00383 
00384           c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1);
00385           }
00386           else if ( (computeMap->type(i) == computeBondsType)
00387                  || (computeMap->type(i) == computeExclsType)
00388                  || (computeMap->type(i) == computeAnglesType)
00389                  || (computeMap->type(i) == computeDihedralsType)
00390                  || (computeMap->type(i) == computeImpropersType)
00391                  || (computeMap->type(i) == computeTholeType)
00392                  || (computeMap->type(i) == computeAnisoType)
00393                  || (computeMap->type(i) == computeCrosstermsType)
00394                ) {
00395           // Register the object with the load balancer
00396           // Store the depended patch IDs in the rest of the element ID
00397           LDObjid elemID;
00398           elemID.id[0] = i;
00399         
00400           elemID.id[1] = elemID.id[2] = elemID.id[3] = -3;
00401 
00402           Compute *c = computeMap->compute(i);
00403           if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
00404 
00405           c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0);
00406           }
00407         }
00408       }
00409     }
00410     theLbdb->DoneRegisteringObjects(myHandle);
00411   }
00412 
00413   // process saved migration messages, if any
00414   while ( migrateMsgs ) {
00415     LdbMigrateMsg *m = migrateMsgs;
00416     migrateMsgs = m->next;
00417     Compute *c = computeMap->compute(m->handle.id.id[0]);
00418     if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2");
00419     c->ldObjHandle = m->handle;
00420     delete m;
00421   }
00422 
00423   // Fixup to take care of the extra timestep at startup
00424   // This is pretty ugly here, but it makes the count correct
00425   
00426   // iout << "LDB Cycle Num: " << ldbCycleNum << "\n";
00427 
00428  if ( simParams->ldBalancer == LDBAL_CENTRALIZED ) {
00429   if (ldbCycleNum == 1 || ldbCycleNum == 3) {
00430     numStepsToRun = stepsPerCycle;
00431     totalStepsDone += numStepsToRun;
00432     takingLdbData = 0;
00433     theLbdb->CollectStatsOff();
00434   } else if (ldbCycleNum == 2 || ldbCycleNum == 4) {
00435     numStepsToRun = firstLdbStep - stepsPerCycle;
00436     while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle;
00437     totalStepsDone += numStepsToRun;
00438     takingLdbData = 1;
00439     theLbdb->CollectStatsOn();
00440   } else if ( (ldbCycleNum <= 6) || !takingLdbData )
00441   {
00442     totalStepsDone += firstLdbStep;
00443     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00444       numStepsToRun = -1;
00445       takingLdbData = 0;
00446       theLbdb->CollectStatsOff();
00447     } else {
00448       numStepsToRun = firstLdbStep;
00449       takingLdbData = 1;
00450       theLbdb->CollectStatsOn();
00451     }
00452   }
00453   else 
00454   {
00455     totalStepsDone += stepsPerLdbCycle - firstLdbStep;
00456     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00457       numStepsToRun = -1;
00458       takingLdbData = 0;
00459       theLbdb->CollectStatsOff();
00460     } else {
00461       numStepsToRun = stepsPerLdbCycle - firstLdbStep;
00462       takingLdbData = 0;
00463       theLbdb->CollectStatsOff();
00464     }
00465   }
00466  } else {
00467   if (ldbCycleNum==1)
00468   {
00469     totalStepsDone += firstLdbStep;
00470     numStepsToRun = firstLdbStep;
00471     takingLdbData = 0;
00472     theLbdb->CollectStatsOff();
00473   }
00474   else if ( (ldbCycleNum <= 4) || !takingLdbData )
00475   {
00476     totalStepsDone += firstLdbStep;
00477     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00478       numStepsToRun = -1;
00479       takingLdbData = 0;
00480       theLbdb->CollectStatsOff();
00481     } else {
00482       numStepsToRun = firstLdbStep;
00483       takingLdbData = 1;
00484       theLbdb->CollectStatsOn();
00485     }
00486   }
00487   else 
00488   {
00489     totalStepsDone += stepsPerLdbCycle - firstLdbStep;
00490     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00491       numStepsToRun = -1;
00492       takingLdbData = 0;
00493       theLbdb->CollectStatsOff();
00494     } else {
00495       numStepsToRun = stepsPerLdbCycle - firstLdbStep;
00496       takingLdbData = 0;
00497       theLbdb->CollectStatsOff();
00498     }
00499   }
00500  }
00501 
00502 /*-----------------------------------------------------------------------------*
00503  * --------------------------------------------------------------------------- *
00504  * Comments inserted by Abhinav to clarify relation between ldbCycleNum,       *
00505  * load balancing step numbers (printed by the step() function) and            *
00506  * tracing of the steps                                                        *
00507  * --------------------------------------------------------------------------- *
00508  * If trace is turned off in the beginning, then tracing is turned on          *
00509  * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can       *
00510  * be adjusted by specifying firstLdbStep and ldbPeriod which are set by       *
00511  * default to 5*stepspercycle and 200*stepspercycle if not specified.          *
00512  *                                                                             *
00513  * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the             *
00514  * following timeline (for these particular numbers):                          *
00515  *                                                                             *
00516  * Tracing         :  <------ off ------><------------- on -----------><-- off *
00517  * Ldb Step() No   :              1     2     3        4      5       6      7 *
00518  * Iteration Steps : 00====20====40====60====80======160====180=====260====280 *
00519  * ldbCycleNum     :  1     2     3     4     5        6      7       8      9 *
00520  * Instrumention   :          Inst  Inst  Inst           Inst            Inst  *
00521  * LDB Strategy    :              TLB  RLB   RLB            RLB            RLB *
00522  *                                                                             *
00523  * TLB = TorusLB                                                               *
00524  * RLB = RefineTorusLB                                                         *
00525  * Inst = Instrumentation Phase (no real load balancing)                       *
00526  * --------------------------------------------------------------------------- *
00527  *-----------------------------------------------------------------------------*
00528  */
00529 #if 0 //replaced by traceBarrier at Controller and Sequencer
00530   if (traceAvailable()) {
00531     static int specialTracing = 0; // XXX static variables are unsafe for SMP
00532     if (ldbCycleNum == 1 && traceIsOn() == 0)  specialTracing = 1;
00533     if (specialTracing) {
00534       if (ldbCycleNum == 4) traceBegin();
00535       if (ldbCycleNum == 8) traceEnd();
00536     }
00537   }
00538 #endif
00539   
00540   nPatchesReported = 0;
00541   nPatchesExpected = nLocalPatches;
00542   nComputesReported = 0;
00543   nComputesExpected = nLocalComputes * numStepsToRun;
00544   controllerReported = 0;
00545   controllerExpected = ! CkMyPe();
00546 
00547   if (CkMyPe() == 0)
00548   {
00549     if (computeArray == NULL)
00550       computeArray = new computeInfo[numComputes];
00551     if (patchArray == NULL)
00552       patchArray = new patchInfo[nPatches];
00553     if (processorArray == NULL)
00554       processorArray = new processorInfo[CkNumPes()];
00555   }
00556     
00557   theLbdb->ClearLoads();
00558 }
00559 
00560 void LdbCoordinator::patchLoad(PatchID id, int nAtoms, int /* timestep */)
00561 {
00562   CmiAssert( id >=0 && id < nPatches);
00563   if (patchNAtoms[id] != -1) {
00564     patchNAtoms[id] = nAtoms;
00565     nPatchesReported++;
00566   } else {
00567     DebugM(10, "::patchLoad() Unexpected patch reporting in\n");
00568   }
00569 }
00570 
00571 void LdbCoordinator::rebalance(Sequencer *seq, PatchID pid)
00572 {
00573   if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
00574     return;
00575 
00576   sequencerThreads[pid] = seq;
00577   seq->suspend();
00578 }
00579 
00580 void LdbCoordinator::rebalance(Controller *c)
00581 {
00582   if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
00583     return;
00584 
00585   iout << "LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() << "\n" << endi;
00586   DebugM(3, "Controller reached load balance barrier.\n");
00587   controllerReported = 1;
00588   controllerThread = c;
00589 
00590   CProxy_LdbCoordinator(thisgroup).barrier();
00591 
00592   CthSuspend();
00593 }
00594 
00595 void LdbCoordinator::barrier(void)
00596 {
00597   if ( (nPatchesReported != nPatchesExpected) 
00598        || (nComputesReported != nComputesExpected)
00599        || (controllerReported != controllerExpected) )
00600   {
00601     NAMD_bug("Load balancer received wrong number of events.\n");
00602   }
00603 
00604   theLbdb->AtLocalBarrier(ldBarrierHandle);
00605 }
00606 
00607 void LdbCoordinator::nodeDone(CkReductionMsg *msg)
00608 {
00609   delete msg;
00610 
00611   iout << "LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() << "\n" << endi;
00612   if ( takingLdbData ) {
00613       ExecuteMigrations();
00614   } else {
00615       updateComputesReady();
00616   }
00617 }
00618 
00619 void LdbCoordinator::ExecuteMigrations(void)
00620 {
00621  // computeMgr->updateComputes() call only on Node(0) i.e. right here
00622   // This will barrier for all Nodes - (i.e. Computes must be
00623   // here and with proxies before anyone can start up
00624 
00625   CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
00626   ComputeMgr *computeMgr = cm.ckLocalBranch();
00627   computeMgr->updateComputes(CkIndex_LdbCoordinator::
00628                              updateComputesReady(),thisgroup);
00629 }
00630 
00631 void LdbCoordinator::RecvMigrate(LdbMigrateMsg* m)
00632 {
00633   // This method receives the migration from the framework,
00634   // unregisters it, and sends it to the destination PE
00635 
00636   if ( m->to != CkMyPe() ) {
00637     theLbdb->UnregisterObj(m->handle);
00638 
00639     CProxy_LdbCoordinator  ldbProxy(thisgroup);
00640     ldbProxy[m->to].ExpectMigrate(m);
00641   } else {
00642     ExpectMigrate(m);
00643   }
00644 }
00645 
00646 void LdbCoordinator::ExpectMigrate(LdbMigrateMsg* m)
00647 {
00648   if ( m->from != CkMyPe() ) {
00649     m->handle = theLbdb->RegisterObj(myHandle,m->handle.id,0,1);
00650     theLbdb->Migrated(m->handle);
00651   }
00652 
00653   m->next = migrateMsgs;
00654   migrateMsgs = m;
00655 }
00656 
00657 void LdbCoordinator::updateComputesReady() {
00658   DebugM(3,"updateComputesReady()\n");
00659 
00660   CProxy_LdbCoordinator(thisgroup).resume();
00661   CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
00662 }
00663 
00664 void LdbCoordinator::resume(void)
00665 {
00666   DebugM(3,"resume()\n");
00667   //  printLocalLdbReport();
00668 
00669   ldbCycleNum++;
00670   initialize(PatchMap::Object(),ComputeMap::Object(),1);
00671 
00672   Sync::Object()->openSync();
00673 }
00674 
00675 void LdbCoordinator::resumeReady(CkQdMsg *msg) {
00676 
00677   iout << "LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() << "\n" << endi;
00678   DebugM(3,"resumeReady()\n");
00679   delete msg;
00680 
00681   CProxy_LdbCoordinator(thisgroup).resume2();
00682 }
00683 
00684 void LdbCoordinator::resume2(void)
00685 {
00686   DebugM(3,"resume2()\n");
00687 
00688 #if CONVERSE_VERSION_ELAN
00689   //  enableBlockingReceives();
00690 #endif
00691 
00692   awakenSequencers();
00693 }
00694 
00695 void LdbCoordinator::awakenSequencers()
00696 {
00697   if (controllerThread)
00698   {
00699     controllerThread->awaken();
00700     controllerThread = NULL;
00701   }
00702   for(int i=0; i < patchMap->numPatches(); i++)
00703   {
00704     if (sequencerThreads[i])
00705     {
00706       sequencerThreads[i]->awaken();
00707     }
00708     sequencerThreads[i]= NULL;
00709   }
00710 }
00711 
00712 // Figure out which proxies we will definitely create on other
00713 // nodes, without regard for non-bonded computes.  This code is swiped
00714 // from ProxyMgr, and changes there probable need to be propagated here.
00715 
00716 int LdbCoordinator::requiredProxies(PatchID id, int neighborNodes[])
00717 {
00718   PatchID neighbors[1 + PatchMap::MaxOneAway];
00719   neighbors[0] = id;
00720   int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
00721 
00722   int nProxyNodes = 0;
00723   int myNode = patchMap->node(id);
00724   for ( int i = 0; i < numNeighbors; ++i ) {
00725     const int proxyNode = patchMap->basenode(neighbors[i]);
00726     if ( proxyNode != myNode ) {
00727       int j;
00728       for ( j = 0; j < nProxyNodes; ++j ) {
00729         if ( neighborNodes[j] == proxyNode ) break;
00730       }
00731       if ( j == nProxyNodes ) {
00732         neighborNodes[nProxyNodes] = proxyNode;
00733         nProxyNodes++;
00734       }
00735     }
00736   }
00737   return nProxyNodes;
00738 }
00739 
00740 void LdbCoordinator::printLocalLdbReport(void)
00741 {
00742   char outputBuf[255];
00743   char *curLoc;
00744 
00745   CkPrintf("%d:Patch report:\n",CkMyPe());
00746   
00747   curLoc = outputBuf;
00748   int i,j=0;
00749   for(i=0; i<patchMap->numPatches(); i++)
00750   {
00751     if (patchNAtoms[i] != -1)
00752     {
00753       curLoc += sprintf(curLoc,"%5d: %5d ",i,patchNAtoms[i]);
00754       j++;
00755     } 
00756     if (((j % 4) == 0) && j)
00757     {
00758       curLoc = outputBuf;
00759       CkPrintf("[%d]%s\n",CkMyPe(),outputBuf);
00760       j=0;
00761     }
00762   }
00763 
00764   CkPrintf("%d:Compute report:\n",CkMyPe());
00765   
00766   curLoc = outputBuf;
00767   j=0;
00768 }
00769 
00770 void LdbCoordinator::printRequiredProxies(PatchID id, FILE *fp)
00771 {
00772   // Check all two-away neighbors.
00773   // This is really just one-away neighbors, since 
00774   // two-away always returns zero: RKB
00775   int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
00776   const int nProxyNodes = requiredProxies(id,neighborNodes);
00777 
00778   fprintf(fp,"%4d ",nProxyNodes);
00779 
00780   for(int i=0;i<nProxyNodes;i++)
00781     fprintf(fp,"%4d ",neighborNodes[i]);
00782 }
00783 
00784 #include "LdbCoordinator.def.h"

Generated on Fri May 25 04:07:15 2012 for NAMD by  doxygen 1.3.9.1