LdbCoordinator.C

Go to the documentation of this file.
00001 
00007 /*****************************************************************************
00008  * $Source: /home/cvs/namd/cvsroot/namd2/src/LdbCoordinator.C,v $
00009  * $Author: jim $
00010  * $Date: 2017/03/30 20:06:17 $
00011  * $Revision: 1.128 $
00012  *****************************************************************************/
00013 
00014 #include <stdlib.h>
00015 
00016 #include "InfoStream.h"
00017 #include "NamdCentLB.h"
00018 #include "NamdHybridLB.h"
00019 #include "NamdDummyLB.h"
00020 #include "NamdNborLB.h"
00021 
00022 #include "HomePatch.h"
00023 #include "LdbCoordinator.decl.h"
00024 #include "LdbCoordinator.h"
00025 #include "NamdTypes.h"
00026 #include "Node.h"
00027 #include "SimParameters.h"
00028 #include "PatchMap.inl"
00029 #include "ComputeMap.h"
00030 #include "ComputeNonbondedMICKernel.h"
00031 //#define DEBUGM
00032 #define MIN_DEBUG_LEVEL 3
00033 #include "Debug.h"
00034 #include "Controller.h"
00035 #include "Sequencer.h"
00036 #include "RefineOnly.h"
00037 #include "ComputeMgr.h"
00038 #include "Compute.h"
00039 #include "packmsg.h"
00040 #include "Sync.h"
00041 
00042 #include "elements.h"
00043 #include "ComputeMgr.decl.h"
00044 
00045 #define DEBUG_LEVEL 4
00046 
00047 #if CONVERSE_VERSION_ELAN
00048 extern "C" void enableBlockingReceives();
00049 extern "C" void disableBlockingReceives();
00050 #endif
00051 
00052 void LdbCoordinator_initproc() {
00053   // Set the load balancing period (in seconds).  Without this the
00054   // load balancing framework will hang until 1 second has passed
00055   // since the last load balancing, causing hiccups in very fast runs.
00056   // This is duplicated below for older versions, but putting it here
00057   // also fixes the first load balance.
00058   LBSetPeriod(1.0e-5);
00059 }
00060 
00061 void LdbCoordinator::staticMigrateFn(LDObjHandle handle, int dest)
00062 {
00063    LdbCoordinator *ldbCoordinator = (LdbCoordinator *)LDOMUserData(handle.omhandle);
00064    ldbCoordinator->Migrate(handle,dest);
00065 }
00066 
00067 void LdbCoordinator::Migrate(LDObjHandle handle, int dest)
00068 {
00069   LdbMigrateMsg* msg = new LdbMigrateMsg;
00070   msg->handle = handle;
00071   msg->from = CkMyPe();
00072   msg->to = dest;
00073   if ( msg->to != CkMyPe() ) {
00074     CProxy_LdbCoordinator ldbProxy(thisgroup);
00075     ldbProxy[CkMyPe()].RecvMigrate(msg);
00076   } else {
00077     ExpectMigrate(msg);
00078   }
00079 }
00080 
00081 void LdbCoordinator::staticStatsFn(LDOMHandle h, int state)
00082 {
00083   CkPrintf("I'm supposed to set stats\n");
00084 }
00085 
00086 void LdbCoordinator::staticQueryEstLoadFn(LDOMHandle h)
00087 {
00088   CkPrintf("I'm supposed to query load\n");
00089 }
00090 
00091 void LdbCoordinator::staticReceiveAtSync(void* data)
00092 {
00093 
00094 #if CONVERSE_VERSION_ELAN
00095     //disableBlockingReceives();
00096 #endif
00097 
00098   ((LdbCoordinator*)data)->ReceiveAtSync();
00099 }
00100 
00101 void LdbCoordinator::ReceiveAtSync()
00102 {
00103   theLbdb->RegisteringObjects(myHandle);
00104 }
00105 
00106 void LdbCoordinator::staticResumeFromSync(void* data)
00107 {
00108   ((LdbCoordinator*)data)->ResumeFromSync();
00109 }
00110 
00111 void LdbCoordinator::ResumeFromSync()
00112 {
00113   theLbdb->DoneRegisteringObjects(myHandle);
00114   CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
00115   contribute(0, NULL, CkReduction::random, cb);
00116 }
00117 
00118 LdbCoordinator::LdbCoordinator()
00119 {
00120   if (CkpvAccess(LdbCoordinator_instance) == NULL) {
00121     CkpvAccess(LdbCoordinator_instance) = this;
00122   } else {
00123     NAMD_bug("LdbCoordinator instanced twice on same node!");
00124   }
00125   
00126 #if 0
00127   // Create a load balancer
00128   if (CkMyPe() == 0) {
00129     //   CreateCentralLB();
00130     CreateNamdCentLB();
00131     //   CreateNamdNborLB();
00132   }
00133 #endif
00134 
00135   collPes = 0;
00136   ldbCycleNum = 1;
00137   takingLdbData = 1;
00138   totalStepsDone = 0;
00139   nLocalComputes = nLocalPatches = 0;
00140   patchNAtoms = (int *) NULL;
00141   sequencerThreads = (Sequencer **) NULL;
00142   ldbStatsFP = NULL;
00143   computeArray = NULL;
00144   patchArray = NULL;
00145   processorArray = NULL;
00146 
00147   // Register self as an object manager for new charm++ balancer framework
00148   theLbdb = LBDatabase::Object(); 
00149 
00150   // Set the load balancing period (in seconds).  Without this the
00151   // load balancing framework will hang until 1 second has passed
00152   // since the last load balancing, causing hiccups in very fast runs.
00153   // Unfortunately, the clock is already set for the first load
00154   // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
00155   // For newer versions this is handled in initproc above.
00156 
00157   theLbdb->SetLBPeriod(1.0e-5);
00158 
00159   myOMid.id.idx = 1;
00160   LDCallbacks cb = { (LDMigrateFn)staticMigrateFn,
00161                      (LDStatsFn)staticStatsFn,
00162                      (LDQueryEstLoadFn)staticQueryEstLoadFn
00163                    };
00164   myHandle = theLbdb->RegisterOM(myOMid,(void*)this,cb);
00165 
00166   // Add myself as a local barrier receiver, so I know when I might
00167   // be registering objects.
00168   theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
00169                                    (void*)this);;
00170 
00171   // Also, add a local barrier client, to trigger load balancing
00172   ldBarrierHandle = theLbdb->
00173     AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
00174                           (void*)this);
00175   migrateMsgs = 0; // linked list
00176   numComputes = 0;
00177   reg_all_objs = 1;
00178 }
00179 
00180 LdbCoordinator::~LdbCoordinator(void)
00181 {
00182   delete [] patchNAtoms;
00183   delete [] sequencerThreads;
00184   if (CkMyPe() == 0)
00185   {
00186     delete [] computeArray;
00187     delete [] patchArray;
00188     delete [] processorArray;
00189   }
00190   if (ldbStatsFP)
00191     fclose(ldbStatsFP);
00192 
00193 }
00194 
00195 void LdbCoordinator::createLoadBalancer()
00196 {
00197   const SimParameters *simParams = Node::Object()->simParameters;
00198 
00199   // Create hierarchical or centralized load balancers
00200   // Currently centralized is the default
00201   if (simParams->ldBalancer == LDBAL_CENTRALIZED) {
00202     CkPrintf("LDB: Central LB being created...\n");
00203     CreateNamdCentLB();
00204   } else if (simParams->ldBalancer == LDBAL_HYBRID) {
00205     CkPrintf("LDB: Hybrid LB being created...\n");
00206     CreateNamdHybridLB();
00207   }
00208 }
00209 
00210 void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit)
00211 {
00212   const SimParameters *simParams = Node::Object()->simParameters;
00213 
00214 #if 0
00215   static int lbcreated = 0; // XXX static variables are unsafe for SMP
00216   // PE0 first time Create a load balancer
00217   if (CkMyPe() == 0 && !lbcreated) {
00218     if (simParams->ldbStrategy == LDBSTRAT_ALGNBOR) 
00219       CreateNamdNborLB();
00220     else {
00221       //   CreateCentralLB();
00222       CreateNamdCentLB();
00223     }
00224     lbcreated = 1;
00225   }
00226 #endif
00227 
00228   //  DebugM(10,"stepsPerLdbCycle initialized\n");
00229   stepsPerLdbCycle = simParams->ldbPeriod;
00230   firstLdbStep = simParams->firstLdbStep;
00231   int lastLdbStep = simParams->lastLdbStep;
00232   int stepsPerCycle = simParams->stepsPerCycle;
00233 
00234   computeMap = cMap;
00235   patchMap = pMap;
00236 
00237   // Set the number of received messages correctly for node 0
00238 
00239   nStatsMessagesExpected = Node::Object()->numNodes();
00240   nStatsMessagesReceived = 0;
00241 
00242   if (patchNAtoms) 
00243     delete [] patchNAtoms;  // Depends on delete NULL to do nothing
00244   nPatches = patchMap->numPatches();
00245   patchNAtoms = new int[nPatches];
00246 
00247   typedef Sequencer *seqPtr;
00248 
00249   if ( ! reinit ) {
00250     delete [] sequencerThreads;  // Depends on delete NULL to do nothing
00251     sequencerThreads = new seqPtr[nPatches];
00252   }
00253 
00254   nLocalPatches=0;
00255 
00256   int i;
00257   for(i=0;i<nPatches;i++)
00258   {
00259     if (patchMap->node(i) == Node::Object()->myid())
00260     {
00261       nLocalPatches++;
00262       patchNAtoms[i]=0;
00263     } else {
00264       patchNAtoms[i]=-1;
00265     }
00266     if ( ! reinit ) sequencerThreads[i]=NULL;
00267   }
00268   if ( ! reinit ) controllerThread = NULL;
00269   if (nLocalPatches != patchMap->numHomePatches())
00270     NAMD_die("Disaggreement in patchMap data.\n");
00271  
00272   const int oldNumComputes = numComputes;
00273   nLocalComputes = 0;
00274   numComputes = computeMap->numComputes();
00275 
00276   for(i=0;i<numComputes;i++)  {
00277     if ( (computeMap->node(i) == Node::Object()->myid())
00278          && ( 0
00279               #if (defined(NAMD_CUDA) || defined(NAMD_MIC))
00280                 #if defined(NAMD_MIC)
00281                   || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
00282                   || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
00283                 #endif
00284               #else
00285               || (computeMap->type(i) == computeNonbondedSelfType)
00286               || (computeMap->type(i) == computeNonbondedPairType)
00287 #endif
00288 #if defined(NAMD_CUDA) && defined(BONDED_CUDA)
00289         || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & 1))
00290         || (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & 1))
00291         || (computeMap->type(i) == computeSelfAnglesType && !(simParams->bondedCUDA & 2))
00292         || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & 2))
00293         || (computeMap->type(i) == computeSelfDihedralsType && !(simParams->bondedCUDA & 4))
00294         || (computeMap->type(i) == computeDihedralsType && !(simParams->bondedCUDA & 4))
00295         || (computeMap->type(i) == computeSelfImpropersType && !(simParams->bondedCUDA & 8))
00296         || (computeMap->type(i) == computeImpropersType && !(simParams->bondedCUDA & 8))
00297         || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & 16))
00298         || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & 16))
00299         || (computeMap->type(i) == computeSelfCrosstermsType && !(simParams->bondedCUDA & 32))
00300         || (computeMap->type(i) == computeCrosstermsType && !(simParams->bondedCUDA & 32))
00301 #else
00302         || (computeMap->type(i) == computeSelfBondsType)
00303         || (computeMap->type(i) == computeBondsType)
00304         || (computeMap->type(i) == computeSelfAnglesType)
00305         || (computeMap->type(i) == computeAnglesType)
00306         || (computeMap->type(i) == computeSelfDihedralsType)
00307         || (computeMap->type(i) == computeDihedralsType)
00308         || (computeMap->type(i) == computeSelfImpropersType)
00309         || (computeMap->type(i) == computeImpropersType)
00310         || (computeMap->type(i) == computeSelfExclsType)
00311         || (computeMap->type(i) == computeExclsType)
00312         || (computeMap->type(i) == computeSelfCrosstermsType)
00313         || (computeMap->type(i) == computeCrosstermsType)
00314 #endif
00315               || (computeMap->type(i) == computeLCPOType)
00316               || (computeMap->type(i) == computeSelfTholeType)
00317               || (computeMap->type(i) == computeSelfAnisoType)
00318 
00319                  || (computeMap->type(i) == computeTholeType)
00320                  || (computeMap->type(i) == computeAnisoType)
00321               // JLai
00322                  || (computeMap->type(i) == computeGromacsPairType)
00323                  || (computeMap->type(i) == computeSelfGromacsPairType)
00324         ) ) {
00325       nLocalComputes++;
00326     }
00327   }
00328   
00329   // New LB frameworks registration
00330 
00331   // Allocate data structure to save incoming migrations.  Processor
00332   // zero will get all migrations
00333 
00334   // If this is the first time through, we need it register patches
00335   if (ldbCycleNum == reg_all_objs) {
00336     if ( 1 ) { // ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) {
00337       reg_all_objs = 3;
00338     }
00339     // Tell the lbdb that I'm registering objects, until I'm done
00340     // registering them.
00341     theLbdb->RegisteringObjects(myHandle);
00342     
00343    if ( ldbCycleNum == 1 ) {
00344     patchHandles = new LDObjHandle[nLocalPatches];
00345     int patch_count=0;
00346     int i;
00347     for(i=0;i<nPatches;i++)
00348       if (patchMap->node(i) == Node::Object()->myid()) {
00349         LDObjid elemID;
00350         elemID.id[0] = i;
00351         elemID.id[1] = elemID.id[2] = elemID.id[3] = -2;
00352 
00353         if (patch_count >= nLocalPatches) {
00354     NAMD_bug("LdbCoordinator found too many local patches!");
00355         }
00356         HomePatch *p = patchMap->homePatch(i);
00357         p->ldObjHandle = 
00358         patchHandles[patch_count] 
00359           = theLbdb->RegisterObj(myHandle,elemID,0,0);
00360         patch_count++;
00361 
00362       }
00363    }
00364   
00365     if ( numComputes > oldNumComputes ) {
00366       // Register computes
00367       for(i=oldNumComputes; i<numComputes; i++)  {
00368         if ( computeMap->node(i) == Node::Object()->myid())
00369         {
00370           if ( 0
00371                #if (defined(NAMD_CUDA) || defined(NAMD_MIC))
00372                  #if defined(NAMD_MIC)
00373                    || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
00374                    || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
00375                  #endif
00376                #else
00377                   || (computeMap->type(i) == computeNonbondedSelfType)
00378                   || (computeMap->type(i) == computeNonbondedPairType)
00379                #endif
00380 #if defined(NAMD_CUDA) && defined(BONDED_CUDA)
00381             || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & 1))
00382             || (computeMap->type(i) == computeSelfAnglesType && !(simParams->bondedCUDA & 2))
00383             || (computeMap->type(i) == computeSelfDihedralsType && !(simParams->bondedCUDA & 4))
00384             || (computeMap->type(i) == computeSelfImpropersType && !(simParams->bondedCUDA & 8))
00385             || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & 16))
00386             || (computeMap->type(i) == computeSelfCrosstermsType && !(simParams->bondedCUDA & 32))
00387 #else
00388             || (computeMap->type(i) == computeSelfBondsType)
00389             || (computeMap->type(i) == computeSelfAnglesType)
00390             || (computeMap->type(i) == computeSelfDihedralsType)
00391             || (computeMap->type(i) == computeSelfImpropersType)
00392             || (computeMap->type(i) == computeSelfExclsType)
00393             || (computeMap->type(i) == computeSelfCrosstermsType)
00394 #endif
00395                   || (computeMap->type(i) == computeLCPOType)
00396                   || (computeMap->type(i) == computeSelfTholeType)
00397                   || (computeMap->type(i) == computeSelfAnisoType)
00398                // JLai
00399                   || (computeMap->type(i) == computeSelfGromacsPairType)
00400                // End of JLai
00401                 )  {
00402           // Register the object with the load balancer
00403           // Store the depended patch IDs in the rest of the element ID
00404           LDObjid elemID;
00405           elemID.id[0] = i;
00406         
00407           if (computeMap->numPids(i) > 2)
00408             elemID.id[3] = computeMap->pid(i,2);
00409           else elemID.id[3] = -1;
00410 
00411           if (computeMap->numPids(i) > 1)
00412             elemID.id[2] =  computeMap->pid(i,1);
00413           else elemID.id[2] = -1;
00414 
00415           if (computeMap->numPids(i) > 0)
00416             elemID.id[1] =  computeMap->pid(i,0);
00417           else elemID.id[1] = -1;
00418 
00419           Compute *c = computeMap->compute(i);
00420           if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
00421 
00422           c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1);
00423           }
00424           else if ( 
00425 #if defined(NAMD_CUDA) && defined(BONDED_CUDA)
00426                     (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & 1))
00427                  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & 2))
00428                  || (computeMap->type(i) == computeDihedralsType && !(simParams->bondedCUDA & 4))
00429                  || (computeMap->type(i) == computeImpropersType && !(simParams->bondedCUDA & 8))
00430                  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & 16))
00431                  || (computeMap->type(i) == computeCrosstermsType && !(simParams->bondedCUDA & 32))
00432 #else
00433                     (computeMap->type(i) == computeBondsType)
00434                  || (computeMap->type(i) == computeAnglesType)
00435                  || (computeMap->type(i) == computeDihedralsType)
00436                  || (computeMap->type(i) == computeImpropersType)
00437                  || (computeMap->type(i) == computeExclsType)
00438                  || (computeMap->type(i) == computeCrosstermsType)
00439 #endif
00440                  || (computeMap->type(i) == computeTholeType)
00441                  || (computeMap->type(i) == computeAnisoType)
00442                  // JLai
00443                  || (computeMap->type(i) == computeGromacsPairType)
00444                  // End of JLai
00445                ) {
00446           // Register the object with the load balancer
00447           // Store the depended patch IDs in the rest of the element ID
00448           LDObjid elemID;
00449           elemID.id[0] = i;
00450         
00451           elemID.id[1] = elemID.id[2] = elemID.id[3] = -3;
00452 
00453           Compute *c = computeMap->compute(i);
00454           if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
00455 
00456           c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0);
00457           }
00458         }
00459       }
00460     }
00461     theLbdb->DoneRegisteringObjects(myHandle);
00462   }
00463 
00464   // process saved migration messages, if any
00465   while ( migrateMsgs ) {
00466     LdbMigrateMsg *m = migrateMsgs;
00467     migrateMsgs = m->next;
00468     Compute *c = computeMap->compute(m->handle.id.id[0]);
00469     if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2");
00470     c->ldObjHandle = m->handle;
00471     delete m;
00472   }
00473 
00474   // Fixup to take care of the extra timestep at startup
00475   // This is pretty ugly here, but it makes the count correct
00476   
00477   // iout << "LDB Cycle Num: " << ldbCycleNum << "\n";
00478 
00479  if ( 1 ) { // ( simParams->ldBalancer == LDBAL_CENTRALIZED ) {
00480   if (ldbCycleNum == 1 || ldbCycleNum == 3) {
00481     numStepsToRun = stepsPerCycle;
00482     totalStepsDone += numStepsToRun;
00483     takingLdbData = 0;
00484     theLbdb->CollectStatsOff();
00485   } else if (ldbCycleNum == 2 || ldbCycleNum == 4) {
00486     numStepsToRun = firstLdbStep - stepsPerCycle;
00487     while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle;
00488     totalStepsDone += numStepsToRun;
00489     takingLdbData = 1;
00490     theLbdb->CollectStatsOn();
00491   } else if ( (ldbCycleNum <= 6) || !takingLdbData )
00492   {
00493     totalStepsDone += firstLdbStep;
00494     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00495       numStepsToRun = -1;
00496       takingLdbData = 0;
00497       theLbdb->CollectStatsOff();
00498     } else {
00499       numStepsToRun = firstLdbStep;
00500       takingLdbData = 1;
00501       theLbdb->CollectStatsOn();
00502     }
00503   }
00504   else 
00505   {
00506     totalStepsDone += stepsPerLdbCycle - firstLdbStep;
00507     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00508       numStepsToRun = -1;
00509       takingLdbData = 0;
00510       theLbdb->CollectStatsOff();
00511     } else {
00512       numStepsToRun = stepsPerLdbCycle - firstLdbStep;
00513       takingLdbData = 0;
00514       theLbdb->CollectStatsOff();
00515     }
00516   }
00517  } else {
00518   if (ldbCycleNum==1)
00519   {
00520     totalStepsDone += firstLdbStep;
00521     numStepsToRun = firstLdbStep;
00522     takingLdbData = 0;
00523     theLbdb->CollectStatsOff();
00524   }
00525   else if ( (ldbCycleNum <= 4) || !takingLdbData )
00526   {
00527     totalStepsDone += firstLdbStep;
00528     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00529       numStepsToRun = -1;
00530       takingLdbData = 0;
00531       theLbdb->CollectStatsOff();
00532     } else {
00533       numStepsToRun = firstLdbStep;
00534       takingLdbData = 1;
00535       theLbdb->CollectStatsOn();
00536     }
00537   }
00538   else 
00539   {
00540     totalStepsDone += stepsPerLdbCycle - firstLdbStep;
00541     if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
00542       numStepsToRun = -1;
00543       takingLdbData = 0;
00544       theLbdb->CollectStatsOff();
00545     } else {
00546       numStepsToRun = stepsPerLdbCycle - firstLdbStep;
00547       takingLdbData = 0;
00548       theLbdb->CollectStatsOff();
00549     }
00550   }
00551  }
00552 
00553 /*-----------------------------------------------------------------------------*
00554  * --------------------------------------------------------------------------- *
00555  * Comments inserted by Abhinav to clarify relation between ldbCycleNum,       *
00556  * load balancing step numbers (printed by the step() function) and            *
00557  * tracing of the steps                                                        *
00558  * --------------------------------------------------------------------------- *
00559  * If trace is turned off in the beginning, then tracing is turned on          *
00560  * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can       *
00561  * be adjusted by specifying firstLdbStep and ldbPeriod which are set by       *
00562  * default to 5*stepspercycle and 200*stepspercycle if not specified.          *
00563  *                                                                             *
00564  * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the             *
00565  * following timeline (for these particular numbers):                          *
00566  *                                                                             *
00567  * Tracing         :  <------ off ------><------------- on -----------><-- off *
00568  * Ldb Step() No   :              1     2     3        4      5       6      7 *
00569  * Iteration Steps : 00====20====40====60====80======160====180=====260====280 *
00570  * ldbCycleNum     :  1     2     3     4     5        6      7       8      9 *
00571  * Instrumention   :          Inst  Inst  Inst           Inst            Inst  *
00572  * LDB Strategy    :              TLB  RLB   RLB            RLB            RLB *
00573  *                                                                             *
00574  * TLB = TorusLB                                                               *
00575  * RLB = RefineTorusLB                                                         *
00576  * Inst = Instrumentation Phase (no real load balancing)                       *
00577  * --------------------------------------------------------------------------- *
00578  *-----------------------------------------------------------------------------*
00579  */
00580 #if 0 //replaced by traceBarrier at Controller and Sequencer
00581   if (traceAvailable()) {
00582     static int specialTracing = 0; // XXX static variables are unsafe for SMP
00583     if (ldbCycleNum == 1 && traceIsOn() == 0)  specialTracing = 1;
00584     if (specialTracing) {
00585       if (ldbCycleNum == 4) traceBegin();
00586       if (ldbCycleNum == 8) traceEnd();
00587     }
00588   }
00589 #endif
00590 
00591   nPatchesReported = 0;
00592   nPatchesExpected = nLocalPatches;
00593   nComputesReported = 0;
00594   nComputesExpected = nLocalComputes * numStepsToRun;
00595   controllerReported = 0;
00596   controllerExpected = ! CkMyPe();
00597 
00598   if (simParams->multigratorOn) {
00599     // Add the number of pressure cycles into nComputesExpected:
00600     // Pressure cycle is done when !(step % simParams->multigratorPressureFreq) = true
00601     // step = Current step
00602     int step = totalStepsDone - numStepsToRun;
00603     int freq = simParams->multigratorPressureFreq;
00604     // dstep = Number of steps we have to take until next pressure cycle
00605     int dstep = 0;
00606     if ((step % freq) != 0) dstep = freq - (step % freq);
00607     step += dstep;
00608     if (step < totalStepsDone) {
00609       int numPressureCycles = 1 + ((totalStepsDone-step-1)/freq);
00610       if (step==0) numPressureCycles--;
00611       // if (CkMyPe()==2) fprintf(stderr, "step %d totalStepsDone %d numPressureCycles %d\n",
00612       //   step, totalStepsDone, numPressureCycles);
00613       nComputesExpected += 2*nLocalComputes*numPressureCycles;
00614     }
00615   }
00616 
00617   if (CkMyPe() == 0)
00618   {
00619     if (computeArray == NULL)
00620       computeArray = new computeInfo[numComputes];
00621     if (patchArray == NULL)
00622       patchArray = new patchInfo[nPatches];
00623     if (processorArray == NULL)
00624       processorArray = new processorInfo[CkNumPes()];
00625   }
00626     
00627   theLbdb->ClearLoads();
00628 }
00629 
00630 void LdbCoordinator::patchLoad(PatchID id, int nAtoms, int /* timestep */)
00631 {
00632   CmiAssert( id >=0 && id < nPatches);
00633   if (patchNAtoms[id] != -1) {
00634     patchNAtoms[id] = nAtoms;
00635     nPatchesReported++;
00636   } else {
00637     DebugM(10, "::patchLoad() Unexpected patch reporting in\n");
00638   }
00639 }
00640 
00641 void LdbCoordinator::rebalance(Sequencer *seq, PatchID pid)
00642 {
00643   if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
00644     return;
00645 
00646   sequencerThreads[pid] = seq;
00647   seq->suspend();
00648 }
00649 
00650 void LdbCoordinator::rebalance(Controller *c)
00651 {
00652   if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
00653     return;
00654 
00655   iout << "LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() << "\n" << endi;
00656   DebugM(3, "Controller reached load balance barrier.\n");
00657   controllerReported = 1;
00658   controllerThread = c;
00659 
00660   CProxy_LdbCoordinator(thisgroup).barrier();
00661 
00662   CthSuspend();
00663 }
00664 
00665 void LdbCoordinator::barrier(void)
00666 {
00667   if ( (nPatchesReported != nPatchesExpected) 
00668        || (nComputesReported != nComputesExpected)
00669        || (controllerReported != controllerExpected) )
00670   {
00671     NAMD_bug("Load balancer received wrong number of events.\n");
00672   }
00673 
00674   theLbdb->AtLocalBarrier(ldBarrierHandle);
00675 }
00676 
00677 void LdbCoordinator::nodeDone(CkReductionMsg *msg)
00678 {
00679   delete msg;
00680 
00681   iout << "LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() << "\n" << endi;
00682   if ( takingLdbData ) {
00683       ExecuteMigrations();
00684   } else {
00685       updateComputesReady();
00686   }
00687 }
00688 
00689 void LdbCoordinator::ExecuteMigrations(void)
00690 {
00691  // computeMgr->updateComputes() call only on Node(0) i.e. right here
00692   // This will barrier for all Nodes - (i.e. Computes must be
00693   // here and with proxies before anyone can start up
00694 
00695   CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
00696   ComputeMgr *computeMgr = cm.ckLocalBranch();
00697   computeMgr->updateComputes(CkIndex_LdbCoordinator::
00698                              updateComputesReady(),thisgroup);
00699 }
00700 
00701 void LdbCoordinator::RecvMigrate(LdbMigrateMsg* m)
00702 {
00703   // This method receives the migration from the framework,
00704   // unregisters it, and sends it to the destination PE
00705 
00706   if ( m->to != CkMyPe() ) {
00707     theLbdb->UnregisterObj(m->handle);
00708 
00709     CProxy_LdbCoordinator  ldbProxy(thisgroup);
00710     ldbProxy[m->to].ExpectMigrate(m);
00711   } else {
00712     ExpectMigrate(m);
00713   }
00714 }
00715 
00716 void LdbCoordinator::ExpectMigrate(LdbMigrateMsg* m)
00717 {
00718   if ( m->from != CkMyPe() ) {
00719     m->handle = theLbdb->RegisterObj(myHandle,m->handle.id,0,1);
00720     theLbdb->Migrated(m->handle);
00721   }
00722 
00723   m->next = migrateMsgs;
00724   migrateMsgs = m;
00725 }
00726 
00727 void LdbCoordinator::updateComputesReady() {
00728   DebugM(3,"updateComputesReady()\n");
00729 
00730   CProxy_LdbCoordinator(thisgroup).resume();
00731   CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
00732 }
00733 
00734 void LdbCoordinator::resume(void)
00735 {
00736   DebugM(3,"resume()\n");
00737   //  printLocalLdbReport();
00738 
00739   ldbCycleNum++;
00740   initialize(PatchMap::Object(),ComputeMap::Object(),1);
00741 
00742   Sync::Object()->openSync();
00743 }
00744 
00745 void LdbCoordinator::resumeReady(CkQdMsg *msg) {
00746 
00747   iout << "LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() << "\n" << endi;
00748   DebugM(3,"resumeReady()\n");
00749   delete msg;
00750 
00751   CProxy_LdbCoordinator(thisgroup).resume2();
00752 }
00753 
00754 void LdbCoordinator::resume2(void)
00755 {
00756   DebugM(3,"resume2()\n");
00757 
00758 #if CONVERSE_VERSION_ELAN
00759   //  enableBlockingReceives();
00760 #endif
00761 
00762   awakenSequencers();
00763 }
00764 
00765 void LdbCoordinator::awakenSequencers()
00766 {
00767   if (controllerThread)
00768   {
00769     controllerThread->awaken();
00770     controllerThread = NULL;
00771   }
00772   for(int i=0; i < patchMap->numPatches(); i++)
00773   {
00774     if (sequencerThreads[i])
00775     {
00776       sequencerThreads[i]->awaken();
00777     }
00778     sequencerThreads[i]= NULL;
00779   }
00780 }
00781 
00782 // Figure out which proxies we will definitely create on other
00783 // nodes, without regard for non-bonded computes.  This code is swiped
00784 // from ProxyMgr, and changes there probable need to be propagated here.
00785 
00786 int LdbCoordinator::requiredProxies(PatchID id, int neighborNodes[])
00787 {
00788   PatchID neighbors[1 + PatchMap::MaxOneAway];
00789   neighbors[0] = id;
00790   int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
00791 
00792   int nProxyNodes = 0;
00793   int myNode = patchMap->node(id);
00794   for ( int i = 0; i < numNeighbors; ++i ) {
00795     const int proxyNode = patchMap->basenode(neighbors[i]);
00796     if ( proxyNode != myNode ) {
00797       int j;
00798       for ( j = 0; j < nProxyNodes; ++j ) {
00799         if ( neighborNodes[j] == proxyNode ) break;
00800       }
00801       if ( j == nProxyNodes ) {
00802         neighborNodes[nProxyNodes] = proxyNode;
00803         nProxyNodes++;
00804       }
00805     }
00806   }
00807   return nProxyNodes;
00808 }
00809 
00810 void LdbCoordinator::printLocalLdbReport(void)
00811 {
00812   char outputBuf[255];
00813   char *curLoc;
00814 
00815   CkPrintf("%d:Patch report:\n",CkMyPe());
00816   
00817   curLoc = outputBuf;
00818   int i,j=0;
00819   for(i=0; i<patchMap->numPatches(); i++)
00820   {
00821     if (patchNAtoms[i] != -1)
00822     {
00823       curLoc += sprintf(curLoc,"%5d: %5d ",i,patchNAtoms[i]);
00824       j++;
00825     } 
00826     if (((j % 4) == 0) && j)
00827     {
00828       curLoc = outputBuf;
00829       CkPrintf("[%d]%s\n",CkMyPe(),outputBuf);
00830       j=0;
00831     }
00832   }
00833 
00834   CkPrintf("%d:Compute report:\n",CkMyPe());
00835   
00836   curLoc = outputBuf;
00837   j=0;
00838 }
00839 
00840 void LdbCoordinator::printRequiredProxies(PatchID id, FILE *fp)
00841 {
00842   // Check all two-away neighbors.
00843   // This is really just one-away neighbors, since 
00844   // two-away always returns zero: RKB
00845   int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
00846   const int nProxyNodes = requiredProxies(id,neighborNodes);
00847 
00848   fprintf(fp,"%4d ",nProxyNodes);
00849 
00850   for(int i=0;i<nProxyNodes;i++)
00851     fprintf(fp,"%4d ",neighborNodes[i]);
00852 }
00853 
00854 void LdbCoordinator::sendCollectLoads(CollectLoadsMsg *msg) {
00855   CProxy_LdbCoordinator(thisgroup)[0].collectLoads(msg);
00856 }
00857 
00858 void LdbCoordinator::collectLoads(CollectLoadsMsg *msg) {
00859   // CkPrintf("LdbCoordinator::collectLoads recv %d-%d\n", msg->firstPe, msg->lastPe);
00860   if ( collPes == 0 ) {
00861     reverted = 0;
00862     initTotalProxies = 0;
00863     finalTotalProxies = 0;
00864     initMaxPeProxies = 0;
00865     finalMaxPeProxies = 0;
00866     initMaxPatchProxies = 0;
00867     finalMaxPatchProxies = 0;
00868     initTime = 0;
00869     finalTime = 0;
00870     initMemory = 0;
00871     finalMemory = 0;
00872     initAvgPeLoad = 0;
00873     finalAvgPeLoad = 0;
00874     initMaxPeLoad = 0;
00875     finalMaxPeLoad = 0;
00876   }
00877   int numPes = msg->lastPe - msg->firstPe + 1;
00878   collPes += numPes;
00879 #define COLL_MAX(F) if ( msg->F > F ) F = msg->F;
00880 #define COLL_AVG(F) F += msg->F * (double) numPes / (double) CkNumPes();
00881 #define COLL_SUM(F) F += msg->F;
00882   COLL_SUM(reverted)
00883   COLL_SUM(initTotalProxies)
00884   COLL_SUM(finalTotalProxies)
00885   COLL_MAX(initMaxPeProxies)
00886   COLL_MAX(finalMaxPeProxies)
00887   COLL_MAX(initMaxPatchProxies)
00888   COLL_MAX(finalMaxPatchProxies)
00889   if ( (msg->finalTime - msg->initTime) > (finalTime - initTime) ) {
00890     initTime = msg->initTime;
00891     finalTime = msg->finalTime;
00892   }
00893   COLL_MAX(initMemory)
00894   COLL_MAX(finalMemory)
00895   COLL_AVG(initAvgPeLoad)
00896   COLL_AVG(finalAvgPeLoad)
00897   COLL_MAX(initMaxPeLoad)
00898   COLL_MAX(finalMaxPeLoad)
00899 
00900   if ( collPes == CkNumPes() ) {
00901     collPes = 0;
00902     iout << "LDB: TIME " << initTime << " LOAD: AVG " << initAvgPeLoad
00903       << " MAX " << initMaxPeLoad << "  PROXIES: TOTAL " << initTotalProxies << " MAXPE " <<
00904       initMaxPeProxies << " MAXPATCH " << initMaxPatchProxies << " " << "None"
00905       << " MEM: " << initMemory << " MB\n";
00906     if ( reverted ) iout << "LDB: Reverting to original mapping on " << reverted << " balancers\n";
00907     iout << "LDB: TIME " << finalTime << " LOAD: AVG " << finalAvgPeLoad
00908       << " MAX " << finalMaxPeLoad << "  PROXIES: TOTAL " << finalTotalProxies << " MAXPE " <<
00909       finalMaxPeProxies << " MAXPATCH " << finalMaxPatchProxies << " " << msg->strategyName
00910       << " MEM: " << finalMemory << " MB\n";
00911     iout << endi;
00912     fflush(stdout);
00913   }
00914 
00915   delete msg;
00916 }
00917 
00918 #include "LdbCoordinator.def.h"

Generated on Tue Sep 19 01:17:12 2017 for NAMD by  doxygen 1.4.7