NAMD
LdbCoordinator.C
Go to the documentation of this file.
1 
7 /*****************************************************************************
8  * $Source: /home/cvs/namd/cvsroot/namd2/src/LdbCoordinator.C,v $
9  * $Author: jim $
10  * $Date: 2017/03/30 20:06:17 $
11  * $Revision: 1.128 $
12  *****************************************************************************/
13 
14 #include <stdlib.h>
15 
16 #include "InfoStream.h"
17 #include "NamdCentLB.h"
18 #include "NamdHybridLB.h"
19 #include "NamdDummyLB.h"
20 
21 #include "HomePatch.h"
22 #include "LdbCoordinator.decl.h"
23 #include "LdbCoordinator.h"
24 #include "NamdTypes.h"
25 #include "Node.h"
26 #include "SimParameters.h"
27 #include "PatchMap.inl"
28 #include "ComputeMap.h"
30 //#define DEBUGM
31 #define MIN_DEBUG_LEVEL 3
32 #include "Debug.h"
33 #include "Controller.h"
34 #include "Sequencer.h"
35 #include "RefineOnly.h"
36 #include "ComputeMgr.h"
37 #include "Compute.h"
38 #include "packmsg.h"
39 #include "Sync.h"
40 
41 #include "elements.h"
42 #include "ComputeMgr.decl.h"
43 
44 #define DEBUG_LEVEL 4
45 
46 #if CONVERSE_VERSION_ELAN
47 extern "C" void enableBlockingReceives();
48 extern "C" void disableBlockingReceives();
49 #endif
50 
52  // Set the load balancing period (in seconds). Without this the
53  // load balancing framework will hang until 1 second has passed
54  // since the last load balancing, causing hiccups in very fast runs.
55  // This is duplicated below for older versions, but putting it here
56  // also fixes the first load balance.
57 #ifndef LB_MANAGER_VERSION
58  LBSetPeriod(1.0e-5);
59 #endif
60 }
61 
62 void LdbCoordinator::staticMigrateFn(LDObjHandle handle, int dest)
63 {
64  LdbCoordinator::Object()->Migrate(handle,dest);
65 }
66 
67 void LdbCoordinator::Migrate(LDObjHandle handle, int dest)
68 {
69  LdbMigrateMsg* msg = new LdbMigrateMsg;
70  msg->handle = handle;
71  msg->from = CkMyPe();
72  msg->to = dest;
73  if ( msg->to != CkMyPe() ) {
74  CProxy_LdbCoordinator ldbProxy(thisgroup);
75  ldbProxy[CkMyPe()].RecvMigrate(msg);
76  } else {
77  ExpectMigrate(msg);
78  }
79 }
80 
81 void LdbCoordinator::staticStatsFn(LDOMHandle h, int state)
82 {
83  CkPrintf("I'm supposed to set stats\n");
84 }
85 
87 {
88  CkPrintf("I'm supposed to query load\n");
89 }
90 
92 {
93 
94 #if CONVERSE_VERSION_ELAN
95  //disableBlockingReceives();
96 #endif
97 
98  ((LdbCoordinator*)data)->AtSyncBarrierReached();
99 }
100 
102 {
103  theLbdb->RegisteringObjects(myHandle);
104 }
105 
107 {
108  ((LdbCoordinator*)data)->ResumeFromSync();
109 }
110 
112 {
113  theLbdb->DoneRegisteringObjects(myHandle);
114  CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
115  contribute(0, NULL, CkReduction::random, cb);
116 }
117 
119 {
120  if (CkpvAccess(LdbCoordinator_instance) == NULL) {
121  CkpvAccess(LdbCoordinator_instance) = this;
122  } else {
123  NAMD_bug("LdbCoordinator instanced twice on same node!");
124  }
125 
126  collPes = 0;
127  ldbCycleNum = 1;
128  takingLdbData = 1;
129  totalStepsDone = 0;
131  patchNAtoms = (int *) NULL;
132  sequencerThreads = (Sequencer **) NULL;
133  ldbStatsFP = NULL;
134  computeArray = NULL;
135  patchArray = NULL;
136  processorArray = NULL;
137 
138  // Register self as an object manager for new charm++ balancer framework
139  theLbdb = LdbInfra::Object();
140 
141  // Set the load balancing period (in seconds). Without this the
142  // load balancing framework will hang until 1 second has passed
143  // since the last load balancing, causing hiccups in very fast runs.
144  // Unfortunately, the clock is already set for the first load
145  // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
146  // For newer versions this is handled in initproc above.
147 
148 #ifndef LB_MANAGER_VERSION
149  theLbdb->SetLBPeriod(1.0e-5);
150 #endif
151 
152  myOMid.id.idx = 1;
153  LDCallbacks cb = { (LDMigrateFn)staticMigrateFn,
154  (LDStatsFn)staticStatsFn,
155  (LDQueryEstLoadFn)staticQueryEstLoadFn
156  };
157  myHandle = theLbdb->RegisterOM(myOMid,nullptr,cb);
158 
159 #ifdef LB_MANAGER_VERSION
160  // Add myself as a local barrier receiver, so I know when I might
161  // be registering objects.
162  theLbdb->AddLocalBarrierReceiver(this, &LdbCoordinator::AtSyncBarrierReached);
163 
164  // Also, add a local barrier client, to trigger load balancing
166  AddLocalBarrierClient(this, &LdbCoordinator::ResumeFromSync);
167 #else
168  // Add myself as a local barrier receiver, so I know when I might
169  // be registering objects.
170  theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
171  (void*)this);;
172 
173  // Also, add a local barrier client, to trigger load balancing
175  AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
176  (void*)this);
177 #endif
178  migrateMsgs = 0; // linked list
179  numComputes = 0;
180  reg_all_objs = 1;
181 }
182 
184 {
185  delete [] patchNAtoms;
186  delete [] sequencerThreads;
187  if (CkMyPe() == 0)
188  {
189  delete [] computeArray;
190  delete [] patchArray;
191  delete [] processorArray;
192  }
193  if (ldbStatsFP)
194  fclose(ldbStatsFP);
195 
196 }
197 
199 {
201 
202  // Create hierarchical or centralized load balancers
203  // Currently centralized is the default
204  if (simParams->ldBalancer == LDBAL_CENTRALIZED) {
205  CkPrintf("LDB: Central LB being created...\n");
207  } else if (simParams->ldBalancer == LDBAL_HYBRID) {
208  CkPrintf("LDB: Hybrid LB being created...\n");
210  }
211 }
212 
213 void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit)
214 {
216 
217  // DebugM(10,"stepsPerLdbCycle initialized\n");
218  stepsPerLdbCycle = simParams->ldbPeriod;
219  firstLdbStep = simParams->firstLdbStep;
220  int lastLdbStep = simParams->lastLdbStep;
221  int stepsPerCycle = simParams->stepsPerCycle;
222 
223  computeMap = cMap;
224  patchMap = pMap;
225 
226  // Set the number of received messages correctly for node 0
227 
230 
231  if (patchNAtoms)
232  delete [] patchNAtoms; // Depends on delete NULL to do nothing
234  patchNAtoms = new int[nPatches];
235 
236  typedef Sequencer *seqPtr;
237 
238  if ( ! reinit ) {
239  delete [] sequencerThreads; // Depends on delete NULL to do nothing
240  sequencerThreads = new seqPtr[nPatches];
241  }
242 
243  nLocalPatches=0;
244 
245  int i;
246  for(i=0;i<nPatches;i++)
247  {
248  if (patchMap->node(i) == Node::Object()->myid())
249  {
250  nLocalPatches++;
251  patchNAtoms[i]=0;
252  } else {
253  patchNAtoms[i]=-1;
254  }
255  if ( ! reinit ) sequencerThreads[i]=NULL;
256  }
257  if ( ! reinit ) controllerThread = NULL;
259  NAMD_die("Disaggreement in patchMap data.\n");
260 
261  const int oldNumComputes = numComputes;
262  nLocalComputes = 0;
264 
265  for(i=0;i<numComputes;i++) {
266  if ( (computeMap->node(i) == Node::Object()->myid())
267  && ( 0
268  #if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
269  #if defined(NAMD_MIC)
270  || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
271  || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
272  #endif
273  #else
276 #endif
277 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
278  || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & 1))
279  || (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & 1))
280  || (computeMap->type(i) == computeSelfAnglesType && !(simParams->bondedCUDA & 2))
281  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & 2))
282  || (computeMap->type(i) == computeSelfDihedralsType && !(simParams->bondedCUDA & 4))
283  || (computeMap->type(i) == computeDihedralsType && !(simParams->bondedCUDA & 4))
284  || (computeMap->type(i) == computeSelfImpropersType && !(simParams->bondedCUDA & 8))
285  || (computeMap->type(i) == computeImpropersType && !(simParams->bondedCUDA & 8))
286  || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & 16))
287  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & 16))
288  || (computeMap->type(i) == computeSelfCrosstermsType && !(simParams->bondedCUDA & 32))
289  || (computeMap->type(i) == computeCrosstermsType && !(simParams->bondedCUDA & 32))
290 #else
292  || (computeMap->type(i) == computeBondsType)
294  || (computeMap->type(i) == computeAnglesType)
300  || (computeMap->type(i) == computeExclsType)
303 #endif
304  || (computeMap->type(i) == computeLCPOType)
307 
308  || (computeMap->type(i) == computeTholeType)
309  || (computeMap->type(i) == computeAnisoType)
310  // JLai
313  ) ) {
314  nLocalComputes++;
315  }
316  }
317 
318  // New LB frameworks registration
319 
320  // Allocate data structure to save incoming migrations. Processor
321  // zero will get all migrations
322 
323  // If this is the first time through, we need to register patches
324  if (ldbCycleNum == reg_all_objs) {
325  if ( 1 ) { // ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) {
326  reg_all_objs = 3;
327  }
328  // Tell the lbdb that I'm registering objects, until I'm done
329  // registering them.
330  theLbdb->RegisteringObjects(myHandle);
331 
332  if ( ldbCycleNum == 1 ) {
333  patchHandles = new LDObjHandle[nLocalPatches];
334  int patch_count=0;
335  int i;
336  for(i=0;i<nPatches;i++)
337  if (patchMap->node(i) == Node::Object()->myid()) {
338 
339  LdbId elemID;
340  LdbIdField(elemID, 0) = i;
341  LdbIdField(elemID, 1) = PATCH_TYPE;
342 
343  if (patch_count >= nLocalPatches) {
344  NAMD_bug("LdbCoordinator found too many local patches!");
345  }
346  HomePatch *p = patchMap->homePatch(i);
347  p->ldObjHandle =
348  patchHandles[patch_count]
349  = theLbdb->RegisterObj(myHandle,elemID,0,0);
350  patch_count++;
351 
352  }
353  }
354 
355  if ( numComputes > oldNumComputes ) {
356  // Register computes
357  for(i=oldNumComputes; i<numComputes; i++) {
358  if ( computeMap->node(i) == Node::Object()->myid())
359  {
360  if ( 0
361  #if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
362  #if defined(NAMD_MIC)
363  || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
364  || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
365  #endif
366  #else
369  #endif
370 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
371  || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & 1))
372  || (computeMap->type(i) == computeSelfAnglesType && !(simParams->bondedCUDA & 2))
373  || (computeMap->type(i) == computeSelfDihedralsType && !(simParams->bondedCUDA & 4))
374  || (computeMap->type(i) == computeSelfImpropersType && !(simParams->bondedCUDA & 8))
375  || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & 16))
376  || (computeMap->type(i) == computeSelfCrosstermsType && !(simParams->bondedCUDA & 32))
377 #else
384 #endif
385  || (computeMap->type(i) == computeLCPOType)
388  // JLai
390  // End of JLai
391  ) {
392  // Register the object with the load balancer
393  // Store the depended patch IDs in the rest of the element ID
394  LdbId elemID;
395  LdbIdField(elemID, 0) = i;
396 
397  if (computeMap->numPids(i) > 0)
398  LdbIdField(elemID, 1) = computeMap->pid(i,0);
399  else LdbIdField(elemID, 1) = NONBONDED_OR_SELF_TYPE;
400 
401  Compute *c = computeMap->compute(i);
402  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
403 
404  c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1);
405  }
406  else if (
407 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
408  (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & 1))
409  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & 2))
410  || (computeMap->type(i) == computeDihedralsType && !(simParams->bondedCUDA & 4))
411  || (computeMap->type(i) == computeImpropersType && !(simParams->bondedCUDA & 8))
412  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & 16))
413  || (computeMap->type(i) == computeCrosstermsType && !(simParams->bondedCUDA & 32))
414 #else
416  || (computeMap->type(i) == computeAnglesType)
419  || (computeMap->type(i) == computeExclsType)
421 #endif
422  || (computeMap->type(i) == computeTholeType)
423  || (computeMap->type(i) == computeAnisoType)
424  // JLai
426  // End of JLai
427  ) {
428  // Register the object with the load balancer
429  // Store the depended patch IDs in the rest of the element ID
430  LdbId elemID;
431  LdbIdField(elemID, 0) = i;
432 
433  LdbIdField(elemID, 1) = BONDED_TYPE;
434 
435  Compute *c = computeMap->compute(i);
436  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
437 
438  c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0);
439  }
440  }
441  }
442  }
443  theLbdb->DoneRegisteringObjects(myHandle);
444  }
445 
446  // process saved migration messages, if any
447  while ( migrateMsgs ) {
449  migrateMsgs = m->next;
450  Compute *c = computeMap->compute(LdbIdField(m->handle.id, 0));
451  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2");
452  c->ldObjHandle = m->handle;
453  delete m;
454  }
455 
456  // Fixup to take care of the extra timestep at startup
457  // This is pretty ugly here, but it makes the count correct
458 
459  // iout << "LDB Cycle Num: " << ldbCycleNum << "\n";
460 
461  if ( 1 ) { // ( simParams->ldBalancer == LDBAL_CENTRALIZED ) {
462  if (ldbCycleNum == 1 || ldbCycleNum == 3) {
463  numStepsToRun = stepsPerCycle;
465  takingLdbData = 0;
466  theLbdb->CollectStatsOff();
467  } else if (ldbCycleNum == 2 || ldbCycleNum == 4) {
468  numStepsToRun = firstLdbStep - stepsPerCycle;
469  while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle;
471  takingLdbData = 1;
472  theLbdb->CollectStatsOn();
473  } else if ( (ldbCycleNum <= 6) || !takingLdbData )
474  {
476  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
477  numStepsToRun = -1;
478  takingLdbData = 0;
479  theLbdb->CollectStatsOff();
480  } else {
482  takingLdbData = 1;
483  theLbdb->CollectStatsOn();
484  }
485  }
486  else
487  {
489  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
490  numStepsToRun = -1;
491  takingLdbData = 0;
492  theLbdb->CollectStatsOff();
493  } else {
495  takingLdbData = 0;
496  theLbdb->CollectStatsOff();
497  }
498  }
499  } else {
500  if (ldbCycleNum==1)
501  {
504  takingLdbData = 0;
505  theLbdb->CollectStatsOff();
506  }
507  else if ( (ldbCycleNum <= 4) || !takingLdbData )
508  {
510  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
511  numStepsToRun = -1;
512  takingLdbData = 0;
513  theLbdb->CollectStatsOff();
514  } else {
516  takingLdbData = 1;
517  theLbdb->CollectStatsOn();
518  }
519  }
520  else
521  {
523  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
524  numStepsToRun = -1;
525  takingLdbData = 0;
526  theLbdb->CollectStatsOff();
527  } else {
529  takingLdbData = 0;
530  theLbdb->CollectStatsOff();
531  }
532  }
533  }
534 
535 /*-----------------------------------------------------------------------------*
536  * --------------------------------------------------------------------------- *
537  * Comments inserted by Abhinav to clarify relation between ldbCycleNum, *
538  * load balancing step numbers (printed by the step() function) and *
539  * tracing of the steps *
540  * --------------------------------------------------------------------------- *
541  * If trace is turned off in the beginning, then tracing is turned on *
542  * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can *
543  * be adjusted by specifying firstLdbStep and ldbPeriod which are set by *
544  * default to 5*stepspercycle and 200*stepspercycle if not specified. *
545  * *
546  * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the *
547  * following timeline (for these particular numbers): *
548  * *
549  * Tracing : <------ off ------><------------- on -----------><-- off *
550  * Ldb Step() No : 1 2 3 4 5 6 7 *
551  * Iteration Steps : 00====20====40====60====80======160====180=====260====280 *
552  * ldbCycleNum : 1 2 3 4 5 6 7 8 9 *
553  * Instrumention : Inst Inst Inst Inst Inst *
554  * LDB Strategy : TLB RLB RLB RLB RLB *
555  * *
556  * TLB = TorusLB *
557  * RLB = RefineTorusLB *
558  * Inst = Instrumentation Phase (no real load balancing) *
559  * --------------------------------------------------------------------------- *
560  *-----------------------------------------------------------------------------*
561  */
562 #if 0 //replaced by traceBarrier at Controller and Sequencer
563  if (traceAvailable()) {
564  static int specialTracing = 0; // XXX static variables are unsafe for SMP
565  if (ldbCycleNum == 1 && traceIsOn() == 0) specialTracing = 1;
566  if (specialTracing) {
567  if (ldbCycleNum == 4) traceBegin();
568  if (ldbCycleNum == 8) traceEnd();
569  }
570  }
571 #endif
572 
573  nPatchesReported = 0;
575  nComputesReported = 0;
577  controllerReported = 0;
578  controllerExpected = ! CkMyPe();
579 
580  if (simParams->multigratorOn) {
581  // Add the number of pressure cycles into nComputesExpected:
582  // Pressure cycle is done when !(step % simParams->multigratorPressureFreq) = true
583  // step = Current step
584  int step = totalStepsDone - numStepsToRun;
585  int freq = simParams->multigratorPressureFreq;
586  // dstep = Number of steps we have to take until next pressure cycle
587  int dstep = 0;
588  if ((step % freq) != 0) dstep = freq - (step % freq);
589  step += dstep;
590  if (step < totalStepsDone) {
591  int numPressureCycles = 1 + ((totalStepsDone-step-1)/freq);
592  if (step==0) numPressureCycles--;
593  // if (CkMyPe()==2) fprintf(stderr, "step %d totalStepsDone %d numPressureCycles %d\n",
594  // step, totalStepsDone, numPressureCycles);
595  nComputesExpected += 2*nLocalComputes*numPressureCycles;
596  }
597  }
598 
599  if (CkMyPe() == 0)
600  {
601  if (computeArray == NULL)
603  if (patchArray == NULL)
605  if (processorArray == NULL)
606  processorArray = new processorInfo[CkNumPes()];
607  }
608 
609  theLbdb->ClearLoads();
610 }
611 
612 void LdbCoordinator::patchLoad(PatchID id, int nAtoms, int /* timestep */)
613 {
614  CmiAssert( id >=0 && id < nPatches);
615  if (patchNAtoms[id] != -1) {
616  patchNAtoms[id] = nAtoms;
618  } else {
619  DebugM(10, "::patchLoad() Unexpected patch reporting in\n");
620  }
621 }
622 
624 {
625  if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
626  return;
627 
628  sequencerThreads[pid] = seq;
629  seq->suspend();
630 }
631 
633 {
634  if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
635  return;
636 
637  iout << "LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() << "\n" << endi;
638  DebugM(3, "Controller reached load balance barrier.\n");
639  controllerReported = 1;
640  controllerThread = c;
641 
642  CProxy_LdbCoordinator(thisgroup).barrier();
643 
644  CthSuspend();
645 }
646 
648 {
652  {
653  NAMD_bug("Load balancer received wrong number of events.\n");
654  }
655  theLbdb->AtLocalBarrier(ldBarrierHandle);
656 }
657 
658 void LdbCoordinator::nodeDone(CkReductionMsg *msg)
659 {
660  delete msg;
661 
662  iout << "LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() << "\n" << endi;
663  if ( takingLdbData ) {
665  } else {
667  }
668 }
669 
671 {
672  // computeMgr->updateComputes() call only on Node(0) i.e. right here
673  // This will barrier for all Nodes - (i.e. Computes must be
674  // here and with proxies before anyone can start up
675 
676  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
677  ComputeMgr *computeMgr = cm.ckLocalBranch();
678  computeMgr->updateComputes(CkIndex_LdbCoordinator::
679  updateComputesReady(),thisgroup);
680 }
681 
683 {
684  // This method receives the migration from the framework,
685  // unregisters it, and sends it to the destination PE
686 
687  if ( m->to != CkMyPe() ) {
688  theLbdb->UnregisterObj(m->handle);
689 
690  CProxy_LdbCoordinator ldbProxy(thisgroup);
691  ldbProxy[m->to].ExpectMigrate(m);
692  } else {
693  ExpectMigrate(m);
694  }
695 }
696 
698 {
699  if ( m->from != CkMyPe() ) {
700  m->handle = theLbdb->RegisterObj(myHandle,m->handle.id,0,1);
701  theLbdb->Migrated(m->handle);
702  }
703 
704  m->next = migrateMsgs;
705  migrateMsgs = m;
706 }
707 
709  DebugM(3,"updateComputesReady()\n");
710 
711  CProxy_LdbCoordinator(thisgroup).resume();
712  CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
713 }
714 
716 {
717  DebugM(3,"resume()\n");
718  // printLocalLdbReport();
719 
720  ldbCycleNum++;
722 
723  Sync::Object()->openSync();
724 }
725 
726 void LdbCoordinator::resumeReady(CkQdMsg *msg) {
727 
728  iout << "LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() << "\n" << endi;
729  DebugM(3,"resumeReady()\n");
730  delete msg;
731 
732  CProxy_LdbCoordinator(thisgroup).resume2();
733 }
734 
736 {
737  DebugM(3,"resume2()\n");
738 
739 #if CONVERSE_VERSION_ELAN
740  // enableBlockingReceives();
741 #endif
742 
744 }
745 
747 {
748  if (controllerThread)
749  {
751  controllerThread = NULL;
752  }
753  for(int i=0; i < patchMap->numPatches(); i++)
754  {
755  if (sequencerThreads[i])
756  {
757  sequencerThreads[i]->awaken();
758  }
759  sequencerThreads[i]= NULL;
760  }
761 }
762 
763 // Figure out which proxies we will definitely create on other
764 // nodes, without regard for non-bonded computes. This code is swiped
765 // from ProxyMgr, and changes there probably need to be propagated here.
766 
767 int LdbCoordinator::requiredProxies(PatchID id, int neighborNodes[])
768 {
769  PatchID neighbors[1 + PatchMap::MaxOneAway];
770  neighbors[0] = id;
771  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
772 
773  int nProxyNodes = 0;
774  int myNode = patchMap->node(id);
775  for ( int i = 0; i < numNeighbors; ++i ) {
776  const int proxyNode = patchMap->basenode(neighbors[i]);
777  if ( proxyNode != myNode ) {
778  int j;
779  for ( j = 0; j < nProxyNodes; ++j ) {
780  if ( neighborNodes[j] == proxyNode ) break;
781  }
782  if ( j == nProxyNodes ) {
783  neighborNodes[nProxyNodes] = proxyNode;
784  nProxyNodes++;
785  }
786  }
787  }
788  return nProxyNodes;
789 }
790 
792 {
793  char outputBuf[255];
794  char *curLoc;
795 
796  CkPrintf("%d:Patch report:\n",CkMyPe());
797 
798  curLoc = outputBuf;
799  int i,j=0;
800  for(i=0; i<patchMap->numPatches(); i++)
801  {
802  if (patchNAtoms[i] != -1)
803  {
804  curLoc += sprintf(curLoc,"%5d: %5d ",i,patchNAtoms[i]);
805  j++;
806  }
807  if (((j % 4) == 0) && j)
808  {
809  curLoc = outputBuf;
810  CkPrintf("[%d]%s\n",CkMyPe(),outputBuf);
811  j=0;
812  }
813  }
814 
815  CkPrintf("%d:Compute report:\n",CkMyPe());
816 
817  curLoc = outputBuf;
818  j=0;
819 }
820 
822 {
823  // Check all two-away neighbors.
824  // This is really just one-away neighbors, since
825  // two-away always returns zero: RKB
826  int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
827  const int nProxyNodes = requiredProxies(id,neighborNodes);
828 
829  fprintf(fp,"%4d ",nProxyNodes);
830 
831  for(int i=0;i<nProxyNodes;i++)
832  fprintf(fp,"%4d ",neighborNodes[i]);
833 }
834 
836  CProxy_LdbCoordinator(thisgroup)[0].collectLoads(msg);
837 }
838 
840  // CkPrintf("LdbCoordinator::collectLoads recv %d-%d\n", msg->firstPe, msg->lastPe);
841  if ( collPes == 0 ) {
842  reverted = 0;
843  initTotalProxies = 0;
844  finalTotalProxies = 0;
845  initMaxPeProxies = 0;
846  finalMaxPeProxies = 0;
847  initMaxPatchProxies = 0;
848  finalMaxPatchProxies = 0;
849  initTime = 0;
850  finalTime = 0;
851  initMemory = 0;
852  finalMemory = 0;
853  initAvgPeLoad = 0;
854  finalAvgPeLoad = 0;
855  initMaxPeLoad = 0;
856  finalMaxPeLoad = 0;
857  }
858  int numPes = msg->lastPe - msg->firstPe + 1;
859  collPes += numPes;
860 #define COLL_MAX(F) if ( msg->F > F ) F = msg->F;
861 #define COLL_AVG(F) F += msg->F * (double) numPes / (double) CkNumPes();
862 #define COLL_SUM(F) F += msg->F;
863  COLL_SUM(reverted)
864  COLL_SUM(initTotalProxies)
865  COLL_SUM(finalTotalProxies)
866  COLL_MAX(initMaxPeProxies)
867  COLL_MAX(finalMaxPeProxies)
868  COLL_MAX(initMaxPatchProxies)
869  COLL_MAX(finalMaxPatchProxies)
870  if ( (msg->finalTime - msg->initTime) > (finalTime - initTime) ) {
871  initTime = msg->initTime;
872  finalTime = msg->finalTime;
873  }
874  COLL_MAX(initMemory)
875  COLL_MAX(finalMemory)
876  COLL_AVG(initAvgPeLoad)
877  COLL_AVG(finalAvgPeLoad)
878  COLL_MAX(initMaxPeLoad)
879  COLL_MAX(finalMaxPeLoad)
880 
881  if ( collPes == CkNumPes() ) {
882  collPes = 0;
883  iout << "LDB: TIME " << initTime << " LOAD: AVG " << initAvgPeLoad
884  << " MAX " << initMaxPeLoad << " PROXIES: TOTAL " << initTotalProxies << " MAXPE " <<
885  initMaxPeProxies << " MAXPATCH " << initMaxPatchProxies << " " << "None"
886  << " MEM: " << initMemory << " MB\n";
887  if ( reverted ) iout << "LDB: Reverting to original mapping on " << reverted << " balancers\n";
888  iout << "LDB: TIME " << finalTime << " LOAD: AVG " << finalAvgPeLoad
889  << " MAX " << finalMaxPeLoad << " PROXIES: TOTAL " << finalTotalProxies << " MAXPE " <<
890  finalMaxPeProxies << " MAXPATCH " << finalMaxPatchProxies << " " << msg->strategyName
891  << " MEM: " << finalMemory << " MB\n";
892  iout << endi;
893  fflush(stdout);
894  }
895 
896  delete msg;
897 }
898 
899 #include "LdbCoordinator.def.h"
static Node * Object()
Definition: Node.h:86
#define LDBAL_HYBRID
Definition: SimParameters.h:63
void barrier(void)
void sendCollectLoads(CollectLoadsMsg *)
void LdbCoordinator_initproc()
represents bonded compute
Controller * controllerThread
int numComputes(void)
Definition: ComputeMap.h:101
void collectLoads(CollectLoadsMsg *)
void resumeReady(CkQdMsg *msg)
static PatchMap * Object()
Definition: PatchMap.h:27
Sequencer ** sequencerThreads
static __thread ComputeMgr * computeMgr
SimParameters * simParameters
Definition: Node.h:178
LDObjHandle * patchHandles
void updateComputesReady()
void resume(void)
#define COLL_AVG(F)
void AtSyncBarrierReached(void)
#define DebugM(x, y)
Definition: Debug.h:59
void createLoadBalancer()
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
void Migrate(LDObjHandle handle, int dest)
LDObjHandle ldObjHandle
Definition: Compute.h:44
#define iout
Definition: InfoStream.h:51
void printRequiredProxies(PatchID id, FILE *fp)
HomePatch * homePatch(PatchID pid)
Definition: PatchMap.h:240
int basenode(int pid) const
Definition: PatchMap.h:117
LDObjHandle ldObjHandle
Definition: HomePatch.h:479
represents nonbonded or self compute
void openSync()
Definition: Sync.C:63
void awakenSequencers(void)
void patchLoad(PatchID id, int nAtoms, int timestep)
void ResumeFromSync(void)
char strategyName[16]
ComputeMap * computeMap
int numNodes()
Definition: Node.h:189
PatchMap * patchMap
void initialize(PatchMap *pmap, ComputeMap *cmap, int reinit=0)
void awaken(void)
Definition: Sequencer.h:29
#define LDBAL_NONE
Definition: SimParameters.h:61
LdbMigrateMsg * migrateMsgs
void CreateNamdHybridLB()
Definition: NamdHybridLB.C:49
LDOMHandle myHandle
void NAMD_bug(const char *err_msg)
Definition: common.C:129
ComputeType type(ComputeID cid)
Definition: ComputeMap.C:120
void CreateNamdCentLB()
Definition: NamdCentLB.C:26
void awaken(void)
Definition: Controller.h:45
void rebalance(Sequencer *seq, PatchID id)
static Sync * Object()
Definition: Sync.h:50
#define COLL_SUM(F)
int PatchID
Definition: NamdTypes.h:182
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
Definition: common.C:85
static LdbCoordinator * Object()
static void staticQueryEstLoadFn(LDOMHandle h)
void ExpectMigrate(LdbMigrateMsg *)
void nodeDone(CkReductionMsg *)
#define LDBAL_CENTRALIZED
Definition: SimParameters.h:62
static void staticReceiveAtSync(void *data)
represents a patch
void suspend(void)
Definition: Sequencer.C:136
int myid()
Definition: Node.h:188
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714
LDObjid LdbId
#define simParams
Definition: Output.C:127
int numPatches(void) const
Definition: PatchMap.h:59
int node(int pid) const
Definition: PatchMap.h:114
int numHomePatches(void)
Definition: PatchMap.C:432
Compute * compute(ComputeID cid)
Definition: ComputeMap.h:171
static ComputeMap * Object()
Definition: ComputeMap.h:89
LdbInfra * theLbdb
void printLocalLdbReport(void)
computeInfo * computeArray
int requiredProxies(PatchID id, int[])
int multigratorPressureFreq
int node(ComputeID cid)
Definition: ComputeMap.h:106
int numPids(ComputeID cid)
Definition: ComputeMap.C:103
#define COLL_MAX(F)
static void staticMigrateFn(LDObjHandle handle, int dest)
LdbMigrateMsg * next
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:109
void resume2(void)
LDObjHandle handle
patchInfo * patchArray
LDBarrierClient ldBarrierHandle
static void staticResumeFromSync(void *data)
static void staticStatsFn(LDOMHandle h, int state)
void updateComputes(int, CkGroupID)
Definition: ComputeMgr.C:142
processorInfo * processorArray
void ExecuteMigrations(void)
void RecvMigrate(LdbMigrateMsg *)