LdbCoordinator.C

Go to the documentation of this file.
00001 
00007 /*****************************************************************************
00008  * $Source: /home/cvs/namd/cvsroot/namd2/src/LdbCoordinator.C,v $
00009  * $Author: jim $
00010  * $Date: 2017/03/30 20:06:17 $
00011  * $Revision: 1.128 $
00012  *****************************************************************************/
00013 
00014 #include <stdlib.h>
00015 
00016 #include "InfoStream.h"
00017 #include "NamdCentLB.h"
00018 #include "NamdHybridLB.h"
00019 #include "NamdDummyLB.h"
00020 #include "NamdNborLB.h"
00021 
00022 #include "HomePatch.h"
00023 #include "LdbCoordinator.decl.h"
00024 #include "LdbCoordinator.h"
00025 #include "NamdTypes.h"
00026 #include "Node.h"
00027 #include "SimParameters.h"
00028 #include "PatchMap.inl"
00029 #include "ComputeMap.h"
00030 #include "ComputeNonbondedMICKernel.h"
00031 //#define DEBUGM
00032 #define MIN_DEBUG_LEVEL 3
00033 #include "Debug.h"
00034 #include "Controller.h"
00035 #include "Sequencer.h"
00036 #include "RefineOnly.h"
00037 #include "ComputeMgr.h"
00038 #include "Compute.h"
00039 #include "packmsg.h"
00040 #include "Sync.h"
00041 
00042 #include "elements.h"
00043 #include "ComputeMgr.decl.h"
00044 
00045 #define DEBUG_LEVEL 4
00046 
00047 #if CONVERSE_VERSION_ELAN
00048 extern "C" void enableBlockingReceives();
00049 extern "C" void disableBlockingReceives();
00050 #endif
00051 
00052 void LdbCoordinator_initproc() {
00053   // Set the load balancing period (in seconds).  Without this the
00054   // load balancing framework will hang until 1 second has passed
00055   // since the last load balancing, causing hiccups in very fast runs.
00056   // This is duplicated below for older versions, but putting it here
00057   // also fixes the first load balance.
00058   LBSetPeriod(1.0e-5);
00059 }
00060 
00061 void LdbCoordinator::staticMigrateFn(LDObjHandle handle, int dest)
00062 {
00063    LdbCoordinator *ldbCoordinator = (LdbCoordinator *)LDOMUserData(handle.omhandle);
00064    ldbCoordinator->Migrate(handle,dest);
00065 }
00066 
00067 void LdbCoordinator::Migrate(LDObjHandle handle, int dest)
00068 {
00069   LdbMigrateMsg* msg = new LdbMigrateMsg;
00070   msg->handle = handle;
00071   msg->from = CkMyPe();
00072   msg->to = dest;
00073   if ( msg->to != CkMyPe() ) {
00074     CProxy_LdbCoordinator ldbProxy(thisgroup);
00075     ldbProxy[CkMyPe()].RecvMigrate(msg);
00076   } else {
00077     ExpectMigrate(msg);
00078   }
00079 }
00080 
00081 void LdbCoordinator::staticStatsFn(LDOMHandle h, int state)
00082 {
00083   CkPrintf("I'm supposed to set stats\n");
00084 }
00085 
00086 void LdbCoordinator::staticQueryEstLoadFn(LDOMHandle h)
00087 {
00088   CkPrintf("I'm supposed to query load\n");
00089 }
00090 
00091 void LdbCoordinator::staticReceiveAtSync(void* data)
00092 {
00093 
00094 #if CONVERSE_VERSION_ELAN
00095     //disableBlockingReceives();
00096 #endif
00097 
00098   ((LdbCoordinator*)data)->ReceiveAtSync();
00099 }
00100 
00101 void LdbCoordinator::ReceiveAtSync()
00102 {
00103   theLbdb->RegisteringObjects(myHandle);
00104 }
00105 
00106 void LdbCoordinator::staticResumeFromSync(void* data)
00107 {
00108   ((LdbCoordinator*)data)->ResumeFromSync();
00109 }
00110 
00111 void LdbCoordinator::ResumeFromSync()
00112 {
00113   theLbdb->DoneRegisteringObjects(myHandle);
00114   CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
00115   contribute(0, NULL, CkReduction::random, cb);
00116 }
00117 
00118 LdbCoordinator::LdbCoordinator()
00119 {
00120   if (CkpvAccess(LdbCoordinator_instance) == NULL) {
00121     CkpvAccess(LdbCoordinator_instance) = this;
00122   } else {
00123     NAMD_bug("LdbCoordinator instanced twice on same node!");
00124   }
00125   
00126 #if 0
00127   // Create a load balancer
00128   if (CkMyPe() == 0) {
00129     //   CreateCentralLB();
00130     CreateNamdCentLB();
00131     //   CreateNamdNborLB();
00132   }
00133 #endif
00134 
00135   collPes = 0;
00136   ldbCycleNum = 1;
00137   takingLdbData = 1;
00138   totalStepsDone = 0;
00139   nLocalComputes = nLocalPatches = 0;
00140   patchNAtoms = (int *) NULL;
00141   sequencerThreads = (Sequencer **) NULL;
00142   ldbStatsFP = NULL;
00143   computeArray = NULL;
00144   patchArray = NULL;
00145   processorArray = NULL;
00146 
00147   // Register self as an object manager for new charm++ balancer framework
00148   theLbdb = LBDatabase::Object(); 
00149 
00150   // Set the load balancing period (in seconds).  Without this the
00151   // load balancing framework will hang until 1 second has passed
00152   // since the last load balancing, causing hiccups in very fast runs.
00153   // Unfortunately, the clock is already set for the first load
00154   // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
00155   // For newer versions this is handled in initproc above.
00156 
00157   theLbdb->SetLBPeriod(1.0e-5);
00158 
00159   myOMid.id.idx = 1;
00160   LDCallbacks cb = { (LDMigrateFn)staticMigrateFn,
00161                      (LDStatsFn)staticStatsFn,
00162                      (LDQueryEstLoadFn)staticQueryEstLoadFn
00163                    };
00164   myHandle = theLbdb->RegisterOM(myOMid,(void*)this,cb);
00165 
00166   // Add myself as a local barrier receiver, so I know when I might
00167   // be registering objects.
00168   theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
00169                                    (void*)this);;
00170 
00171   // Also, add a local barrier client, to trigger load balancing
00172   ldBarrierHandle = theLbdb->
00173     AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
00174                           (void*)this);
00175   migrateMsgs = 0; // linked list
00176   numComputes = 0;
00177   reg_all_objs = 1;
00178 }
00179 
00180 LdbCoordinator::~LdbCoordinator(void)
00181 {
00182   delete [] patchNAtoms;
00183   delete [] sequencerThreads;
00184   if (CkMyPe() == 0)
00185   {
00186     delete [] computeArray;
00187     delete [] patchArray;
00188     delete [] processorArray;
00189   }
00190   if (ldbStatsFP)
00191     fclose(ldbStatsFP);
00192 
00193 }
00194 
00195 void LdbCoordinator::createLoadBalancer()
00196 {
00197   const SimParameters *simParams = Node::Object()->simParameters;
00198 
00199   // Create hierarchical or centralized load balancers
00200   // Currently centralized is the default
00201   if (simParams->ldBalancer == LDBAL_CENTRALIZED) {
00202     CkPrintf("LDB: Central LB being created...\n");
00203     CreateNamdCentLB();
00204   } else if (simParams->ldBalancer == LDBAL_HYBRID) {
00205     CkPrintf("LDB: Hybrid LB being created...\n");
00206     CreateNamdHybridLB();
00207   }
00208 }
00209 
// Set up (reinit == 0) or refresh (reinit != 0) the coordinator's state for
// the current ldbCycleNum.  In order, this:
//   1. caches the maps and the load balancing parameters from simParameters,
//   2. rebuilds patchNAtoms and counts the patches homed on this node,
//   3. counts the computes on this node whose type is load balanced,
//   4. on a registration cycle, registers patches/computes with theLbdb,
//   5. copies handles from queued migration messages onto their computes,
//   6. picks numStepsToRun / takingLdbData and switches stats collection,
//   7. resets the "reported"/"expected" counters checked by barrier(),
//   8. on PE 0, allocates the info arrays if they do not exist yet.
// Parameters:
//   pMap   - patch map, stored in patchMap
//   cMap   - compute map, stored in computeMap
//   reinit - zero: sequencerThreads is reallocated and, like
//            controllerThread, cleared; nonzero: both are left untouched
00210 void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit)
00211 {
00212   const SimParameters *simParams = Node::Object()->simParameters;
00213 
00214 #if 0
00215   static int lbcreated = 0; // XXX static variables are unsafe for SMP
00216   // PE0 first time Create a load balancer
00217   if (CkMyPe() == 0 && !lbcreated) {
00218     if (simParams->ldbStrategy == LDBSTRAT_ALGNBOR) 
00219       CreateNamdNborLB();
00220     else {
00221       //   CreateCentralLB();
00222       CreateNamdCentLB();
00223     }
00224     lbcreated = 1;
00225   }
00226 #endif
00227 
00228   //  DebugM(10,"stepsPerLdbCycle initialized\n");
00229   stepsPerLdbCycle = simParams->ldbPeriod;
00230   firstLdbStep = simParams->firstLdbStep;
00231   int lastLdbStep = simParams->lastLdbStep;
00232   int stepsPerCycle = simParams->stepsPerCycle;
00233 
00234   computeMap = cMap;
00235   patchMap = pMap;
00236 
00237   // Set the number of received messages correctly for node 0
00238 
00239   nStatsMessagesExpected = Node::Object()->numNodes();
00240   nStatsMessagesReceived = 0;
00241 
        // patchNAtoms is rebuilt on every call, sized to the current patch count.
00242   if (patchNAtoms) 
00243     delete [] patchNAtoms;  // Depends on delete NULL to do nothing
00244   nPatches = patchMap->numPatches();
00245   patchNAtoms = new int[nPatches];
00246 
00247   typedef Sequencer *seqPtr;
00248 
00249   if ( ! reinit ) {
00250     delete [] sequencerThreads;  // Depends on delete NULL to do nothing
00251     sequencerThreads = new seqPtr[nPatches];
00252   }
00253 
00254   nLocalPatches=0;
00255 
        // Mark each patch: 0 atoms so far if it is homed on this node, -1
        // otherwise (patchLoad() ignores reports for entries that are -1).
00256   int i;
00257   for(i=0;i<nPatches;i++)
00258   {
00259     if (patchMap->node(i) == Node::Object()->myid())
00260     {
00261       nLocalPatches++;
00262       patchNAtoms[i]=0;
00263     } else {
00264       patchNAtoms[i]=-1;
00265     }
00266     if ( ! reinit ) sequencerThreads[i]=NULL;
00267   }
00268   if ( ! reinit ) controllerThread = NULL;
00269   if (nLocalPatches != patchMap->numHomePatches())
00270     NAMD_die("Disaggreement in patchMap data.\n");
00271  
        // Remember the previous compute count so that only computes added since
        // the last call are registered below.
00272   const int oldNumComputes = numComputes;
00273   nLocalComputes = 0;
00274   numComputes = computeMap->numComputes();
00275 
        // Count the computes mapped to this node whose type appears in the list
        // below.  Which nonbonded and bonded types are included depends on the
        // NAMD_CUDA / NAMD_MIC / BONDED_CUDA build options and, for bonded CUDA,
        // on the bits of simParams->bondedCUDA.
00276   for(i=0;i<numComputes;i++)  {
00277     if ( (computeMap->node(i) == Node::Object()->myid())
00278          && ( 0
00279               #if (defined(NAMD_CUDA) || defined(NAMD_MIC))
00280                 #if defined(NAMD_MIC)
00281                   || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
00282                   || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
00283                 #endif
00284               #else
00285               || (computeMap->type(i) == computeNonbondedSelfType)
00286               || (computeMap->type(i) == computeNonbondedPairType)
00287 #endif
00288 #if defined(NAMD_CUDA) && defined(BONDED_CUDA)
00289         || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & 1))
00290         || (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & 1))
00291         || (computeMap->type(i) == computeSelfAnglesType && !(simParams->bondedCUDA & 2))
00292         || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & 2))
00293         || (computeMap->type(i) == computeSelfDihedralsType && !(simParams->bondedCUDA & 4))
00294         || (computeMap->type(i) == computeDihedralsType && !(simParams->bondedCUDA & 4))
00295         || (computeMap->type(i) == computeSelfImpropersType && !(simParams->bondedCUDA & 8))
00296         || (computeMap->type(i) == computeImpropersType && !(simParams->bondedCUDA & 8))
00297         || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & 16))
00298         || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & 16))
00299         || (computeMap->type(i) == computeSelfCrosstermsType && !(simParams->bondedCUDA & 32))
00300         || (computeMap->type(i) == computeCrosstermsType && !(simParams->bondedCUDA & 32))
00301 #else
00302         || (computeMap->type(i) == computeSelfBondsType)
00303         || (computeMap->type(i) == computeBondsType)
00304         || (computeMap->type(i) == computeSelfAnglesType)
00305         || (computeMap->type(i) == computeAnglesType)
00306         || (computeMap->type(i) == computeSelfDihedralsType)
00307         || (computeMap->type(i) == computeDihedralsType)
00308         || (computeMap->type(i) == computeSelfImpropersType)
00309         || (computeMap->type(i) == computeImpropersType)
00310         || (computeMap->type(i) == computeSelfExclsType)
00311         || (computeMap->type(i) == computeExclsType)
00312         || (computeMap->type(i) == computeSelfCrosstermsType)
00313         || (computeMap->type(i) == computeCrosstermsType)
00314 #endif
00315               || (computeMap->type(i) == computeLCPOType)
00316               || (computeMap->type(i) == computeSelfTholeType)
00317               || (computeMap->type(i) == computeSelfAnisoType)
00318 
00319                  || (computeMap->type(i) == computeTholeType)
00320                  || (computeMap->type(i) == computeAnisoType)
00321               // JLai
00322                  || (computeMap->type(i) == computeGromacsPairType)
00323                  || (computeMap->type(i) == computeSelfGromacsPairType)
00324         ) ) {
00325       nLocalComputes++;
00326     }
00327   }
00328   
00329   // New LB frameworks registration
00330 
00331   // Allocate data structure to save incoming migrations.  Processor
00332   // zero will get all migrations
00333 
00334   // If this is the first time through, we need to register patches
        // reg_all_objs is 1 after construction and is set to 3 just below, so
        // this branch runs when ldbCycleNum is 1 and again when it is 3.
00335   if (ldbCycleNum == reg_all_objs) {
00336     if ( 1 ) { // ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) {
00337       reg_all_objs = 3;
00338     }
00339     // Tell the lbdb that I'm registering objects, until I'm done
00340     // registering them.
00341     theLbdb->RegisteringObjects(myHandle);
00342     
         // Patches are registered only on the very first cycle.  Each local
         // patch gets an LdbId of (patch index, PATCH_TYPE), and the resulting
         // handle is stored both in patchHandles and on the HomePatch.
00343    if ( ldbCycleNum == 1 ) {
00344     patchHandles = new LDObjHandle[nLocalPatches];
00345     int patch_count=0;
00346     int i;
00347     for(i=0;i<nPatches;i++)
00348       if (patchMap->node(i) == Node::Object()->myid()) {
00349 
00350         LdbId elemID;
00351         LdbIdField(elemID, 0) = i;
00352         LdbIdField(elemID, 1) = PATCH_TYPE;
00353 
00354         if (patch_count >= nLocalPatches) {
00355     NAMD_bug("LdbCoordinator found too many local patches!");
00356         }
00357         HomePatch *p = patchMap->homePatch(i);
00358         p->ldObjHandle = 
00359         patchHandles[patch_count] 
00360           = theLbdb->RegisterObj(myHandle,elemID,0,0);
00361         patch_count++;
00362 
00363       }
00364    }
00365   
          // Register only the computes added since the previous call (indices
          // oldNumComputes .. numComputes-1) that live on this node.
00366     if ( numComputes > oldNumComputes ) {
00367       // Register computes
00368       for(i=oldNumComputes; i<numComputes; i++)  {
00369         if ( computeMap->node(i) == Node::Object()->myid())
00370         {
                // First group: nonbonded, "self" bonded, LCPO and self
                // Thole/Aniso/GromacsPair types.  These are registered with a
                // final RegisterObj argument of 1.
                // NOTE(review): that argument is presumably the "migratable"
                // flag -- confirm against the LBDatabase API.
00371           if ( 0
00372                #if (defined(NAMD_CUDA) || defined(NAMD_MIC))
00373                  #if defined(NAMD_MIC)
00374                    || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
00375                    || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
00376                  #endif
00377                #else
00378                   || (computeMap->type(i) == computeNonbondedSelfType)
00379                   || (computeMap->type(i) == computeNonbondedPairType)
00380                #endif
00381 #if defined(NAMD_CUDA) && defined(BONDED_CUDA)
00382             || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & 1))
00383             || (computeMap->type(i) == computeSelfAnglesType && !(simParams->bondedCUDA & 2))
00384             || (computeMap->type(i) == computeSelfDihedralsType && !(simParams->bondedCUDA & 4))
00385             || (computeMap->type(i) == computeSelfImpropersType && !(simParams->bondedCUDA & 8))
00386             || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & 16))
00387             || (computeMap->type(i) == computeSelfCrosstermsType && !(simParams->bondedCUDA & 32))
00388 #else
00389             || (computeMap->type(i) == computeSelfBondsType)
00390             || (computeMap->type(i) == computeSelfAnglesType)
00391             || (computeMap->type(i) == computeSelfDihedralsType)
00392             || (computeMap->type(i) == computeSelfImpropersType)
00393             || (computeMap->type(i) == computeSelfExclsType)
00394             || (computeMap->type(i) == computeSelfCrosstermsType)
00395 #endif
00396                   || (computeMap->type(i) == computeLCPOType)
00397                   || (computeMap->type(i) == computeSelfTholeType)
00398                   || (computeMap->type(i) == computeSelfAnisoType)
00399                // JLai
00400                   || (computeMap->type(i) == computeSelfGromacsPairType)
00401                // End of JLai
00402                 )  {
00403           // Register the object with the load balancer
00404           // Store the depended patch IDs in the rest of the element ID
00405           LdbId elemID;
00406           LdbIdField(elemID, 0) = i;
00407 
                // Field 1 holds the compute's first patch ID when it has one,
                // otherwise the NONBONDED_OR_SELF_TYPE marker.
00408           if (computeMap->numPids(i) > 0)
00409             LdbIdField(elemID, 1) =  computeMap->pid(i,0);
00410           else LdbIdField(elemID, 1) = NONBONDED_OR_SELF_TYPE;
00411 
00412           Compute *c = computeMap->compute(i);
00413           if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
00414 
00415           c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1);
00416           }
                // Second group: the non-"self" bonded, Thole, Aniso and
                // GromacsPair types.  These are tagged BONDED_TYPE and
                // registered with a final RegisterObj argument of 0.
00417           else if ( 
00418 #if defined(NAMD_CUDA) && defined(BONDED_CUDA)
00419                     (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & 1))
00420                  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & 2))
00421                  || (computeMap->type(i) == computeDihedralsType && !(simParams->bondedCUDA & 4))
00422                  || (computeMap->type(i) == computeImpropersType && !(simParams->bondedCUDA & 8))
00423                  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & 16))
00424                  || (computeMap->type(i) == computeCrosstermsType && !(simParams->bondedCUDA & 32))
00425 #else
00426                     (computeMap->type(i) == computeBondsType)
00427                  || (computeMap->type(i) == computeAnglesType)
00428                  || (computeMap->type(i) == computeDihedralsType)
00429                  || (computeMap->type(i) == computeImpropersType)
00430                  || (computeMap->type(i) == computeExclsType)
00431                  || (computeMap->type(i) == computeCrosstermsType)
00432 #endif
00433                  || (computeMap->type(i) == computeTholeType)
00434                  || (computeMap->type(i) == computeAnisoType)
00435                  // JLai
00436                  || (computeMap->type(i) == computeGromacsPairType)
00437                  // End of JLai
00438                ) {
00439           // Register the object with the load balancer
00440           // Store the depended patch IDs in the rest of the element ID
00441           LdbId elemID;
00442           LdbIdField(elemID, 0) = i;
00443         
00444           LdbIdField(elemID, 1) = BONDED_TYPE;
00445 
00446           Compute *c = computeMap->compute(i);
00447           if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
00448 
00449           c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0);
00450           }
00451         }
00452       }
00453     }
00454     theLbdb->DoneRegisteringObjects(myHandle);
00455   }
00456 
00457   // process saved migration messages, if any
        // (queued by ExpectMigrate(): the handle saved in each message is copied
        // onto the compute named by field 0 of its ID, then the message is freed)
00458   while ( migrateMsgs ) {
00459     LdbMigrateMsg *m = migrateMsgs;
00460     migrateMsgs = m->next;
00461     Compute *c = computeMap->compute(LdbIdField(m->handle.id, 0));
00462     if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2");
00463     c->ldObjHandle = m->handle;
00464     delete m;
00465   }
00466 
00467   // Fixup to take care of the extra timestep at startup
00468   // This is pretty ugly here, but it makes the count correct
00469   
00470   // iout << "LDB Cycle Num: " << ldbCycleNum << "\n";
00471 
       // Choose the length of this cycle and whether stats are collected.
       // The condition is the constant 1, so only the first schedule below is
       // ever used; the else-branch schedule is unreachable.
       //   cycles 1, 3 : stepsPerCycle steps, stats off
       //   cycles 2, 4 : firstLdbStep - stepsPerCycle steps (raised by
       //                 stepsPerCycle until positive), stats on
       //   cycles 5, 6, or previous cycle had stats off : firstLdbStep steps,
       //                 stats on
       //   otherwise   : stepsPerLdbCycle - firstLdbStep steps, stats off
       // In the last two cases, once totalStepsDone passes lastLdbStep (when it
       // is not -1), numStepsToRun is set to -1 and stats stay off.
00472  if ( 1 ) { // ( simParams->ldBalancer == LDBAL_CENTRALIZED ) {
00473   if (ldbCycleNum == 1 || ldbCycleNum == 3) {
00474     numStepsToRun = stepsPerCycle;
00475     totalStepsDone += numStepsToRun;
00476     takingLdbData = 0;
00477     theLbdb->CollectStatsOff();
00478   } else if (ldbCycleNum == 2 || ldbCycleNum == 4) {
00479     numStepsToRun = firstLdbStep - stepsPerCycle;
00480     while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle;
00481     totalStepsDone += numStepsToRun;
00482     takingLdbData = 1;
00483     theLbdb->CollectStatsOn();
00484   } else if ( (ldbCycleNum <= 6) || !takingLdbData )
00485   {
00486     totalStepsDone += firstLdbStep;
00487     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00488       numStepsToRun = -1;
00489       takingLdbData = 0;
00490       theLbdb->CollectStatsOff();
00491     } else {
00492       numStepsToRun = firstLdbStep;
00493       takingLdbData = 1;
00494       theLbdb->CollectStatsOn();
00495     }
00496   }
00497   else 
00498   {
00499     totalStepsDone += stepsPerLdbCycle - firstLdbStep;
00500     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00501       numStepsToRun = -1;
00502       takingLdbData = 0;
00503       theLbdb->CollectStatsOff();
00504     } else {
00505       numStepsToRun = stepsPerLdbCycle - firstLdbStep;
00506       takingLdbData = 0;
00507       theLbdb->CollectStatsOff();
00508     }
00509   }
00510  } else {
00511   if (ldbCycleNum==1)
00512   {
00513     totalStepsDone += firstLdbStep;
00514     numStepsToRun = firstLdbStep;
00515     takingLdbData = 0;
00516     theLbdb->CollectStatsOff();
00517   }
00518   else if ( (ldbCycleNum <= 4) || !takingLdbData )
00519   {
00520     totalStepsDone += firstLdbStep;
00521     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00522       numStepsToRun = -1;
00523       takingLdbData = 0;
00524       theLbdb->CollectStatsOff();
00525     } else {
00526       numStepsToRun = firstLdbStep;
00527       takingLdbData = 1;
00528       theLbdb->CollectStatsOn();
00529     }
00530   }
00531   else 
00532   {
00533     totalStepsDone += stepsPerLdbCycle - firstLdbStep;
00534     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00535       numStepsToRun = -1;
00536       takingLdbData = 0;
00537       theLbdb->CollectStatsOff();
00538     } else {
00539       numStepsToRun = stepsPerLdbCycle - firstLdbStep;
00540       takingLdbData = 0;
00541       theLbdb->CollectStatsOff();
00542     }
00543   }
00544  }
00545 
00546 /*-----------------------------------------------------------------------------*
00547  * --------------------------------------------------------------------------- *
00548  * Comments inserted by Abhinav to clarify relation between ldbCycleNum,       *
00549  * load balancing step numbers (printed by the step() function) and            *
00550  * tracing of the steps                                                        *
00551  * --------------------------------------------------------------------------- *
00552  * If trace is turned off in the beginning, then tracing is turned on          *
00553  * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can       *
00554  * be adjusted by specifying firstLdbStep and ldbPeriod which are set by       *
00555  * default to 5*stepspercycle and 200*stepspercycle if not specified.          *
00556  *                                                                             *
00557  * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the             *
00558  * following timeline (for these particular numbers):                          *
00559  *                                                                             *
00560  * Tracing         :  <------ off ------><------------- on -----------><-- off *
00561  * Ldb Step() No   :              1     2     3        4      5       6      7 *
00562  * Iteration Steps : 00====20====40====60====80======160====180=====260====280 *
00563  * ldbCycleNum     :  1     2     3     4     5        6      7       8      9 *
00564  * Instrumention   :          Inst  Inst  Inst           Inst            Inst  *
00565  * LDB Strategy    :              TLB  RLB   RLB            RLB            RLB *
00566  *                                                                             *
00567  * TLB = TorusLB                                                               *
00568  * RLB = RefineTorusLB                                                         *
00569  * Inst = Instrumentation Phase (no real load balancing)                       *
00570  * --------------------------------------------------------------------------- *
00571  *-----------------------------------------------------------------------------*
00572  */
00573 #if 0 //replaced by traceBarrier at Controller and Sequencer
00574   if (traceAvailable()) {
00575     static int specialTracing = 0; // XXX static variables are unsafe for SMP
00576     if (ldbCycleNum == 1 && traceIsOn() == 0)  specialTracing = 1;
00577     if (specialTracing) {
00578       if (ldbCycleNum == 4) traceBegin();
00579       if (ldbCycleNum == 8) traceEnd();
00580     }
00581   }
00582 #endif
00583 
        // Reset the counters that barrier() compares.  Every counted local
        // compute is expected to report once per step; the controller is only
        // expected on PE 0 (! CkMyPe() is 1 there and 0 elsewhere).
00584   nPatchesReported = 0;
00585   nPatchesExpected = nLocalPatches;
00586   nComputesReported = 0;
00587   nComputesExpected = nLocalComputes * numStepsToRun;
00588   controllerReported = 0;
00589   controllerExpected = ! CkMyPe();
00590 
00591   if (simParams->multigratorOn) {
00592     // Add the number of pressure cycles into nComputesExpected:
00593     // Pressure cycle is done when !(step % simParams->multigratorPressureFreq) = true
00594     // step = Current step
00595     int step = totalStepsDone - numStepsToRun;
00596     int freq = simParams->multigratorPressureFreq;
00597     // dstep = Number of steps we have to take until next pressure cycle
00598     int dstep = 0;
00599     if ((step % freq) != 0) dstep = freq - (step % freq);
00600     step += dstep;
00601     if (step < totalStepsDone) {
00602       int numPressureCycles = 1 + ((totalStepsDone-step-1)/freq);
00603       if (step==0) numPressureCycles--;
00604       // if (CkMyPe()==2) fprintf(stderr, "step %d totalStepsDone %d numPressureCycles %d\n",
00605       //   step, totalStepsDone, numPressureCycles);
00606       nComputesExpected += 2*nLocalComputes*numPressureCycles;
00607     }
00608   }
00609 
        // PE 0 only: allocate the info arrays on first use.
        // NOTE(review): they are sized from the current numComputes / nPatches
        // and are not reallocated here if those counts grow later -- confirm
        // that users of these arrays cope with that.
00610   if (CkMyPe() == 0)
00611   {
00612     if (computeArray == NULL)
00613       computeArray = new computeInfo[numComputes];
00614     if (patchArray == NULL)
00615       patchArray = new patchInfo[nPatches];
00616     if (processorArray == NULL)
00617       processorArray = new processorInfo[CkNumPes()];
00618   }
00619     
00620   theLbdb->ClearLoads();
00621 }
00622 
00623 void LdbCoordinator::patchLoad(PatchID id, int nAtoms, int /* timestep */)
00624 {
00625   CmiAssert( id >=0 && id < nPatches);
00626   if (patchNAtoms[id] != -1) {
00627     patchNAtoms[id] = nAtoms;
00628     nPatchesReported++;
00629   } else {
00630     DebugM(10, "::patchLoad() Unexpected patch reporting in\n");
00631   }
00632 }
00633 
00634 void LdbCoordinator::rebalance(Sequencer *seq, PatchID pid)
00635 {
00636   if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
00637     return;
00638 
00639   sequencerThreads[pid] = seq;
00640   seq->suspend();
00641 }
00642 
00643 void LdbCoordinator::rebalance(Controller *c)
00644 {
00645   if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
00646     return;
00647 
00648   iout << "LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() << "\n" << endi;
00649   DebugM(3, "Controller reached load balance barrier.\n");
00650   controllerReported = 1;
00651   controllerThread = c;
00652 
00653   CProxy_LdbCoordinator(thisgroup).barrier();
00654 
00655   CthSuspend();
00656 }
00657 
00658 void LdbCoordinator::barrier(void)
00659 {
00660   if ( (nPatchesReported != nPatchesExpected) 
00661        || (nComputesReported != nComputesExpected)
00662        || (controllerReported != controllerExpected) )
00663   {
00664     NAMD_bug("Load balancer received wrong number of events.\n");
00665   }
00666 
00667   theLbdb->AtLocalBarrier(ldBarrierHandle);
00668 }
00669 
00670 void LdbCoordinator::nodeDone(CkReductionMsg *msg)
00671 {
00672   delete msg;
00673 
00674   iout << "LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() << "\n" << endi;
00675   if ( takingLdbData ) {
00676       ExecuteMigrations();
00677   } else {
00678       updateComputesReady();
00679   }
00680 }
00681 
00682 void LdbCoordinator::ExecuteMigrations(void)
00683 {
00684  // computeMgr->updateComputes() call only on Node(0) i.e. right here
00685   // This will barrier for all Nodes - (i.e. Computes must be
00686   // here and with proxies before anyone can start up
00687 
00688   CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
00689   ComputeMgr *computeMgr = cm.ckLocalBranch();
00690   computeMgr->updateComputes(CkIndex_LdbCoordinator::
00691                              updateComputesReady(),thisgroup);
00692 }
00693 
00694 void LdbCoordinator::RecvMigrate(LdbMigrateMsg* m)
00695 {
00696   // This method receives the migration from the framework,
00697   // unregisters it, and sends it to the destination PE
00698 
00699   if ( m->to != CkMyPe() ) {
00700     theLbdb->UnregisterObj(m->handle);
00701 
00702     CProxy_LdbCoordinator  ldbProxy(thisgroup);
00703     ldbProxy[m->to].ExpectMigrate(m);
00704   } else {
00705     ExpectMigrate(m);
00706   }
00707 }
00708 
00709 void LdbCoordinator::ExpectMigrate(LdbMigrateMsg* m)
00710 {
00711   if ( m->from != CkMyPe() ) {
00712     m->handle = theLbdb->RegisterObj(myHandle,m->handle.id,0,1);
00713     theLbdb->Migrated(m->handle);
00714   }
00715 
00716   m->next = migrateMsgs;
00717   migrateMsgs = m;
00718 }
00719 
00720 void LdbCoordinator::updateComputesReady() {
00721   DebugM(3,"updateComputesReady()\n");
00722 
00723   CProxy_LdbCoordinator(thisgroup).resume();
00724   CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
00725 }
00726 
00727 void LdbCoordinator::resume(void)
00728 {
00729   DebugM(3,"resume()\n");
00730   //  printLocalLdbReport();
00731 
00732   ldbCycleNum++;
00733   initialize(PatchMap::Object(),ComputeMap::Object(),1);
00734 
00735   Sync::Object()->openSync();
00736 }
00737 
00738 void LdbCoordinator::resumeReady(CkQdMsg *msg) {
00739 
00740   iout << "LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() << "\n" << endi;
00741   DebugM(3,"resumeReady()\n");
00742   delete msg;
00743 
00744   CProxy_LdbCoordinator(thisgroup).resume2();
00745 }
00746 
00747 void LdbCoordinator::resume2(void)
00748 {
00749   DebugM(3,"resume2()\n");
00750 
00751 #if CONVERSE_VERSION_ELAN
00752   //  enableBlockingReceives();
00753 #endif
00754 
00755   awakenSequencers();
00756 }
00757 
00758 void LdbCoordinator::awakenSequencers()
00759 {
00760   if (controllerThread)
00761   {
00762     controllerThread->awaken();
00763     controllerThread = NULL;
00764   }
00765   for(int i=0; i < patchMap->numPatches(); i++)
00766   {
00767     if (sequencerThreads[i])
00768     {
00769       sequencerThreads[i]->awaken();
00770     }
00771     sequencerThreads[i]= NULL;
00772   }
00773 }
00774 
00775 // Figure out which proxies we will definitely create on other
00776 // nodes, without regard for non-bonded computes.  This code is swiped
00777 // from ProxyMgr, and changes there probably need to be propagated here.
00778 
00779 int LdbCoordinator::requiredProxies(PatchID id, int neighborNodes[])
00780 {
00781   PatchID neighbors[1 + PatchMap::MaxOneAway];
00782   neighbors[0] = id;
00783   int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
00784 
00785   int nProxyNodes = 0;
00786   int myNode = patchMap->node(id);
00787   for ( int i = 0; i < numNeighbors; ++i ) {
00788     const int proxyNode = patchMap->basenode(neighbors[i]);
00789     if ( proxyNode != myNode ) {
00790       int j;
00791       for ( j = 0; j < nProxyNodes; ++j ) {
00792         if ( neighborNodes[j] == proxyNode ) break;
00793       }
00794       if ( j == nProxyNodes ) {
00795         neighborNodes[nProxyNodes] = proxyNode;
00796         nProxyNodes++;
00797       }
00798     }
00799   }
00800   return nProxyNodes;
00801 }
00802 
00803 void LdbCoordinator::printLocalLdbReport(void)
00804 {
00805   char outputBuf[255];
00806   char *curLoc;
00807 
00808   CkPrintf("%d:Patch report:\n",CkMyPe());
00809   
00810   curLoc = outputBuf;
00811   int i,j=0;
00812   for(i=0; i<patchMap->numPatches(); i++)
00813   {
00814     if (patchNAtoms[i] != -1)
00815     {
00816       curLoc += sprintf(curLoc,"%5d: %5d ",i,patchNAtoms[i]);
00817       j++;
00818     } 
00819     if (((j % 4) == 0) && j)
00820     {
00821       curLoc = outputBuf;
00822       CkPrintf("[%d]%s\n",CkMyPe(),outputBuf);
00823       j=0;
00824     }
00825   }
00826 
00827   CkPrintf("%d:Compute report:\n",CkMyPe());
00828   
00829   curLoc = outputBuf;
00830   j=0;
00831 }
00832 
00833 void LdbCoordinator::printRequiredProxies(PatchID id, FILE *fp)
00834 {
00835   // Check all two-away neighbors.
00836   // This is really just one-away neighbors, since 
00837   // two-away always returns zero: RKB
00838   int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
00839   const int nProxyNodes = requiredProxies(id,neighborNodes);
00840 
00841   fprintf(fp,"%4d ",nProxyNodes);
00842 
00843   for(int i=0;i<nProxyNodes;i++)
00844     fprintf(fp,"%4d ",neighborNodes[i]);
00845 }
00846 
00847 void LdbCoordinator::sendCollectLoads(CollectLoadsMsg *msg) {
00848   CProxy_LdbCoordinator(thisgroup)[0].collectLoads(msg);
00849 }
00850 
00851 void LdbCoordinator::collectLoads(CollectLoadsMsg *msg) {
00852   // CkPrintf("LdbCoordinator::collectLoads recv %d-%d\n", msg->firstPe, msg->lastPe);
00853   if ( collPes == 0 ) {
00854     reverted = 0;
00855     initTotalProxies = 0;
00856     finalTotalProxies = 0;
00857     initMaxPeProxies = 0;
00858     finalMaxPeProxies = 0;
00859     initMaxPatchProxies = 0;
00860     finalMaxPatchProxies = 0;
00861     initTime = 0;
00862     finalTime = 0;
00863     initMemory = 0;
00864     finalMemory = 0;
00865     initAvgPeLoad = 0;
00866     finalAvgPeLoad = 0;
00867     initMaxPeLoad = 0;
00868     finalMaxPeLoad = 0;
00869   }
00870   int numPes = msg->lastPe - msg->firstPe + 1;
00871   collPes += numPes;
00872 #define COLL_MAX(F) if ( msg->F > F ) F = msg->F;
00873 #define COLL_AVG(F) F += msg->F * (double) numPes / (double) CkNumPes();
00874 #define COLL_SUM(F) F += msg->F;
00875   COLL_SUM(reverted)
00876   COLL_SUM(initTotalProxies)
00877   COLL_SUM(finalTotalProxies)
00878   COLL_MAX(initMaxPeProxies)
00879   COLL_MAX(finalMaxPeProxies)
00880   COLL_MAX(initMaxPatchProxies)
00881   COLL_MAX(finalMaxPatchProxies)
00882   if ( (msg->finalTime - msg->initTime) > (finalTime - initTime) ) {
00883     initTime = msg->initTime;
00884     finalTime = msg->finalTime;
00885   }
00886   COLL_MAX(initMemory)
00887   COLL_MAX(finalMemory)
00888   COLL_AVG(initAvgPeLoad)
00889   COLL_AVG(finalAvgPeLoad)
00890   COLL_MAX(initMaxPeLoad)
00891   COLL_MAX(finalMaxPeLoad)
00892 
00893   if ( collPes == CkNumPes() ) {
00894     collPes = 0;
00895     iout << "LDB: TIME " << initTime << " LOAD: AVG " << initAvgPeLoad
00896       << " MAX " << initMaxPeLoad << "  PROXIES: TOTAL " << initTotalProxies << " MAXPE " <<
00897       initMaxPeProxies << " MAXPATCH " << initMaxPatchProxies << " " << "None"
00898       << " MEM: " << initMemory << " MB\n";
00899     if ( reverted ) iout << "LDB: Reverting to original mapping on " << reverted << " balancers\n";
00900     iout << "LDB: TIME " << finalTime << " LOAD: AVG " << finalAvgPeLoad
00901       << " MAX " << finalMaxPeLoad << "  PROXIES: TOTAL " << finalTotalProxies << " MAXPE " <<
00902       finalMaxPeProxies << " MAXPATCH " << finalMaxPatchProxies << " " << msg->strategyName
00903       << " MEM: " << finalMemory << " MB\n";
00904     iout << endi;
00905     fflush(stdout);
00906   }
00907 
00908   delete msg;
00909 }
00910 
00911 #include "LdbCoordinator.def.h"

Generated on Sun Aug 19 01:17:15 2018 for NAMD by  doxygen 1.4.7