NAMD
LdbCoordinator.C
Go to the documentation of this file.
1 
7 /*****************************************************************************
8  * $Source: /home/cvs/namd/cvsroot/namd2/src/LdbCoordinator.C,v $
9  * $Author: jim $
10  * $Date: 2017/03/30 20:06:17 $
11  * $Revision: 1.128 $
12  *****************************************************************************/
13 
14 #include <stdlib.h>
15 
16 #include "InfoStream.h"
17 #include "NamdCentLB.h"
18 #include "NamdHybridLB.h"
19 #include "NamdDummyLB.h"
20 
21 #include "HomePatch.h"
22 #include "LdbCoordinator.decl.h"
23 #include "LdbCoordinator.h"
24 #include "NamdTypes.h"
25 #include "Node.h"
26 #include "SimParameters.h"
27 #include "PatchMap.inl"
28 #include "ComputeMap.h"
29 #include "common.h"
30 //#define DEBUGM
31 #define MIN_DEBUG_LEVEL 3
32 #include "Debug.h"
33 #include "Controller.h"
34 #include "Sequencer.h"
35 #include "RefineOnly.h"
36 #include "ComputeMgr.h"
37 #include "Compute.h"
38 #include "packmsg.h"
39 #include "Sync.h"
40 
41 #include "elements.h"
42 #include "ComputeMgr.decl.h"
43 
44 #define DEBUG_LEVEL 4
45 
46 #if CONVERSE_VERSION_ELAN
47 extern "C" void enableBlockingReceives();
48 extern "C" void disableBlockingReceives();
49 #endif
50 
52  // Set the load balancing period (in seconds). Without this the
53  // load balancing framework will hang until 1 second has passed
54  // since the last load balancing, causing hiccups in very fast runs.
55  // This is duplicated below for older versions, but putting it here
56  // also fixes the first load balance.
57 #ifndef LB_MANAGER_VERSION
58  LBSetPeriod(1.0e-5);
59 #endif
60 }
61 
62 void LdbCoordinator::staticMigrateFn(LDObjHandle handle, int dest)
63 {
64  LdbCoordinator::Object()->Migrate(handle,dest);
65 }
66 
67 void LdbCoordinator::Migrate(LDObjHandle handle, int dest)
68 {
69  LdbMigrateMsg* msg = new LdbMigrateMsg;
70  msg->handle = handle;
71  msg->from = CkMyPe();
72  msg->to = dest;
73  if ( msg->to != CkMyPe() ) {
74  CProxy_LdbCoordinator ldbProxy(thisgroup);
75  ldbProxy[CkMyPe()].RecvMigrate(msg);
76  } else {
77  ExpectMigrate(msg);
78  }
79 }
80 
81 void LdbCoordinator::staticStatsFn(LDOMHandle h, int state)
82 {
83  CkPrintf("I'm supposed to set stats\n");
84 }
85 
87 {
88  CkPrintf("I'm supposed to query load\n");
89 }
90 
92 {
93 
94 #if CONVERSE_VERSION_ELAN
95  //disableBlockingReceives();
96 #endif
97 
98  ((LdbCoordinator*)data)->AtSyncBarrierReached();
99 }
100 
102 {
103  theLbdb->RegisteringObjects(myHandle);
104 }
105 
107 {
108  ((LdbCoordinator*)data)->ResumeFromSync();
109 }
110 
112 {
113  theLbdb->DoneRegisteringObjects(myHandle);
114  CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
115  contribute(0, NULL, CkReduction::random, cb);
116 }
117 
119 {
120  if (CkpvAccess(LdbCoordinator_instance) == NULL) {
121  CkpvAccess(LdbCoordinator_instance) = this;
122  } else {
123  NAMD_bug("LdbCoordinator instanced twice on same node!");
124  }
125 
126  collPes = 0;
127  ldbCycleNum = 1;
128  takingLdbData = 1;
129  totalStepsDone = 0;
131  patchNAtoms = (int *) NULL;
132  sequencerThreads = (Sequencer **) NULL;
133  ldbStatsFP = NULL;
134  computeArray = NULL;
135  patchArray = NULL;
136  processorArray = NULL;
137 
138  // Register self as an object manager for new charm++ balancer framework
139  theLbdb = LdbInfra::Object();
140 
141  // Set the load balancing period (in seconds). Without this the
142  // load balancing framework will hang until 1 second has passed
143  // since the last load balancing, causing hiccups in very fast runs.
144  // Unfortunately, the clock is already set for the first load
145  // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
146  // For newer versions this is handled in initproc above.
147 
148 #ifndef LB_MANAGER_VERSION
149  theLbdb->SetLBPeriod(1.0e-5);
150 #endif
151 
152  myOMid.id.idx = 1;
153  LDCallbacks cb = { (LDMigrateFn)staticMigrateFn,
154  (LDStatsFn)staticStatsFn,
155  (LDQueryEstLoadFn)staticQueryEstLoadFn
156  };
157  myHandle = theLbdb->RegisterOM(myOMid,nullptr,cb);
158 
159 #ifdef LB_MANAGER_VERSION
160  // Add myself as a local barrier receiver, so I know when I might
161  // be registering objects.
162  theLbdb->AddLocalBarrierReceiver(this, &LdbCoordinator::AtSyncBarrierReached);
163 
164  // Also, add a local barrier client, to trigger load balancing
166  AddLocalBarrierClient(this, &LdbCoordinator::ResumeFromSync);
167 #else
168  // Add myself as a local barrier receiver, so I know when I might
169  // be registering objects.
170  theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
171  (void*)this);;
172 
173  // Also, add a local barrier client, to trigger load balancing
175  AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
176  (void*)this);
177 #endif
178  migrateMsgs = 0; // linked list
179  numComputes = 0;
180  reg_all_objs = 1;
181 }
182 
184 {
185  delete [] patchNAtoms;
186  delete [] sequencerThreads;
187  if (CkMyPe() == 0)
188  {
189  delete [] computeArray;
190  delete [] patchArray;
191  delete [] processorArray;
192  }
193  if (ldbStatsFP)
194  fclose(ldbStatsFP);
195 
196 }
197 
199 {
201 
202  // Create hierarchical or centralized load balancers
203  // Currently centralized is the default
204  if (simParams->ldBalancer == LDBAL_CENTRALIZED) {
205  CkPrintf("LDB: Central LB being created...\n");
207  } else if (simParams->ldBalancer == LDBAL_HYBRID) {
208  CkPrintf("LDB: Hybrid LB being created...\n");
210  }
211 }
212 
213 void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit)
214 {
216 
217  // DebugM(10,"stepsPerLdbCycle initialized\n");
218  stepsPerLdbCycle = simParams->ldbPeriod;
219  firstLdbStep = simParams->firstLdbStep;
220  int lastLdbStep = simParams->lastLdbStep;
221  int stepsPerCycle = simParams->stepsPerCycle;
222 
223  computeMap = cMap;
224  patchMap = pMap;
225 
226  // Set the number of received messages correctly for node 0
227 
230 
231  if (patchNAtoms)
232  delete [] patchNAtoms; // Depends on delete NULL to do nothing
234  patchNAtoms = new int[nPatches];
235 
236  typedef Sequencer *seqPtr;
237 
238  if ( ! reinit ) {
239  delete [] sequencerThreads; // Depends on delete NULL to do nothing
240  sequencerThreads = new seqPtr[nPatches];
241  }
242 
243  nLocalPatches=0;
244 
245  int i;
246  for(i=0;i<nPatches;i++)
247  {
248  if (patchMap->node(i) == Node::Object()->myid())
249  {
250  nLocalPatches++;
251  patchNAtoms[i]=0;
252  } else {
253  patchNAtoms[i]=-1;
254  }
255  if ( ! reinit ) sequencerThreads[i]=NULL;
256  }
257  if ( ! reinit ) controllerThread = NULL;
259  NAMD_die("Disaggreement in patchMap data.\n");
260 
261  const int oldNumComputes = numComputes;
262  nLocalComputes = 0;
264 
265  for(i=0;i<numComputes;i++) {
266  if ( (computeMap->node(i) == Node::Object()->myid())
267  && ( 0
268  #if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
269  #if defined(NAMD_MIC)
270  || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
271  || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
272  #endif
273  #else
276 #endif
277 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
278  || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
279  || (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
281  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANGLES))
286  || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
287  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
290  || (computeMap->type(i) == computeSelfTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
291  || (computeMap->type(i) == computeTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
292  || (computeMap->type(i) == computeSelfAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
293  || (computeMap->type(i) == computeAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
294 #else
296  || (computeMap->type(i) == computeBondsType)
298  || (computeMap->type(i) == computeAnglesType)
304  || (computeMap->type(i) == computeExclsType)
308  || (computeMap->type(i) == computeTholeType)
310  || (computeMap->type(i) == computeAnisoType)
311 #endif
314  || (computeMap->type(i) == computeLCPOType)
315  // JLai
318  ) ) {
319  nLocalComputes++;
320  }
321  }
322 
323  // New LB frameworks registration
324 
325  // Allocate data structure to save incoming migrations. Processor
326  // zero will get all migrations
327 
328 // If this is the first time through, we need to register patches
329  if (ldbCycleNum == reg_all_objs) {
330  if ( 1 ) { // ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) {
331  reg_all_objs = 3;
332  }
333  // Tell the lbdb that I'm registering objects, until I'm done
334  // registering them.
335  theLbdb->RegisteringObjects(myHandle);
336 
337  if ( ldbCycleNum == 1 ) {
338  patchHandles = new LDObjHandle[nLocalPatches];
339  int patch_count=0;
340  int i;
341  for(i=0;i<nPatches;i++)
342  if (patchMap->node(i) == Node::Object()->myid()) {
343 
344  LdbId elemID;
345  LdbIdField(elemID, 0) = i;
346  LdbIdField(elemID, 1) = PATCH_TYPE;
347 
348  if (patch_count >= nLocalPatches) {
349  NAMD_bug("LdbCoordinator found too many local patches!");
350  }
351  HomePatch *p = patchMap->homePatch(i);
352  p->ldObjHandle =
353  patchHandles[patch_count]
354  = theLbdb->RegisterObj(myHandle,elemID,0,0);
355  patch_count++;
356 
357  }
358  }
359 
360  if ( numComputes > oldNumComputes ) {
361  // Register computes
362  for(i=oldNumComputes; i<numComputes; i++) {
363  if ( computeMap->node(i) == Node::Object()->myid())
364  {
365  if ( 0
366  #if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
367  #if defined(NAMD_MIC)
368  || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
369  || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
370  #endif
371  #else
374  #endif
375 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
376  || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
380  || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
382  || (computeMap->type(i) == computeSelfTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
383  || (computeMap->type(i) == computeSelfAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
384 #else
393 #endif
394  || (computeMap->type(i) == computeLCPOType)
396  // JLai
398  // End of JLai
399  ) {
400  // Register the object with the load balancer
401  // Store the depended patch IDs in the rest of the element ID
402  LdbId elemID;
403  LdbIdField(elemID, 0) = i;
404 
405  if (computeMap->numPids(i) > 0)
406  LdbIdField(elemID, 1) = computeMap->pid(i,0);
407  else LdbIdField(elemID, 1) = NONBONDED_OR_SELF_TYPE;
408 
409  Compute *c = computeMap->compute(i);
410  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
411 
412  c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1);
413  }
414  else if (
415 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
416  (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
417  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANGLES))
420  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
422  || (computeMap->type(i) == computeTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
423  || (computeMap->type(i) == computeAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
424 #else
426  || (computeMap->type(i) == computeAnglesType)
429  || (computeMap->type(i) == computeExclsType)
431  || (computeMap->type(i) == computeTholeType)
432  || (computeMap->type(i) == computeAnisoType)
433 #endif
435  // JLai
437  // End of JLai
438  ) {
439  // Register the object with the load balancer
440  // Store the depended patch IDs in the rest of the element ID
441  LdbId elemID;
442  LdbIdField(elemID, 0) = i;
443 
444  LdbIdField(elemID, 1) = BONDED_TYPE;
445 
446  Compute *c = computeMap->compute(i);
447  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
448 
449  c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0);
450  }
451  }
452  }
453  }
454  theLbdb->DoneRegisteringObjects(myHandle);
455  }
456 
457  // process saved migration messages, if any
458  while ( migrateMsgs ) {
460  migrateMsgs = m->next;
461  Compute *c = computeMap->compute(LdbIdField(m->handle.id, 0));
462  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2");
463  c->ldObjHandle = m->handle;
464  delete m;
465  }
466 
467  // Fixup to take care of the extra timestep at startup
468  // This is pretty ugly here, but it makes the count correct
469 
470  // iout << "LDB Cycle Num: " << ldbCycleNum << "\n";
471 
472  if ( 1 ) { // ( simParams->ldBalancer == LDBAL_CENTRALIZED ) {
473  if (ldbCycleNum == 1 || ldbCycleNum == 3) {
474  numStepsToRun = stepsPerCycle;
476  takingLdbData = 0;
477  theLbdb->CollectStatsOff();
478  } else if (ldbCycleNum == 2 || ldbCycleNum == 4) {
479  numStepsToRun = firstLdbStep - stepsPerCycle;
480  while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle;
482  takingLdbData = 1;
483  theLbdb->CollectStatsOn();
484  } else if ( (ldbCycleNum <= 6) || !takingLdbData )
485  {
487  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
488  numStepsToRun = -1;
489  takingLdbData = 0;
490  theLbdb->CollectStatsOff();
491  } else {
493  takingLdbData = 1;
494  theLbdb->CollectStatsOn();
495  }
496  }
497  else
498  {
500  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
501  numStepsToRun = -1;
502  takingLdbData = 0;
503  theLbdb->CollectStatsOff();
504  } else {
506  takingLdbData = 0;
507  theLbdb->CollectStatsOff();
508  }
509  }
510  } else {
511  if (ldbCycleNum==1)
512  {
515  takingLdbData = 0;
516  theLbdb->CollectStatsOff();
517  }
518  else if ( (ldbCycleNum <= 4) || !takingLdbData )
519  {
521  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
522  numStepsToRun = -1;
523  takingLdbData = 0;
524  theLbdb->CollectStatsOff();
525  } else {
527  takingLdbData = 1;
528  theLbdb->CollectStatsOn();
529  }
530  }
531  else
532  {
534  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
535  numStepsToRun = -1;
536  takingLdbData = 0;
537  theLbdb->CollectStatsOff();
538  } else {
540  takingLdbData = 0;
541  theLbdb->CollectStatsOff();
542  }
543  }
544  }
545 
546 /*-----------------------------------------------------------------------------*
547  * --------------------------------------------------------------------------- *
548  * Comments inserted by Abhinav to clarify relation between ldbCycleNum, *
549  * load balancing step numbers (printed by the step() function) and *
550  * tracing of the steps *
551  * --------------------------------------------------------------------------- *
552  * If trace is turned off in the beginning, then tracing is turned on *
553  * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can *
554  * be adjusted by specifying firstLdbStep and ldbPeriod which are set by *
555  * default to 5*stepspercycle and 200*stepspercycle if not specified. *
556  * *
557  * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the *
558  * following timeline (for these particular numbers): *
559  * *
560  * Tracing : <------ off ------><------------- on -----------><-- off *
561  * Ldb Step() No : 1 2 3 4 5 6 7 *
562  * Iteration Steps : 00====20====40====60====80======160====180=====260====280 *
563  * ldbCycleNum : 1 2 3 4 5 6 7 8 9 *
564  * Instrumention : Inst Inst Inst Inst Inst *
565  * LDB Strategy : TLB RLB RLB RLB RLB *
566  * *
567  * TLB = TorusLB *
568  * RLB = RefineTorusLB *
569  * Inst = Instrumentation Phase (no real load balancing) *
570  * --------------------------------------------------------------------------- *
571  *-----------------------------------------------------------------------------*
572  */
573 #if 0 //replaced by traceBarrier at Controller and Sequencer
574  if (traceAvailable()) {
575  static int specialTracing = 0; // XXX static variables are unsafe for SMP
576  if (ldbCycleNum == 1 && traceIsOn() == 0) specialTracing = 1;
577  if (specialTracing) {
578  if (ldbCycleNum == 4) traceBegin();
579  if (ldbCycleNum == 8) traceEnd();
580  }
581  }
582 #endif
583 
584  nPatchesReported = 0;
586  nComputesReported = 0;
588  controllerReported = 0;
589  controllerExpected = ! CkMyPe();
590 
591  if (simParams->multigratorOn) {
592  // Add the number of pressure cycles into nComputesExpected:
593  // Pressure cycle is done when !(step % simParams->multigratorPressureFreq) = true
594  // step = Current step
595  int step = totalStepsDone - numStepsToRun;
596  int freq = simParams->multigratorPressureFreq;
597  // dstep = Number of steps we have to take until next pressure cycle
598  int dstep = 0;
599  if ((step % freq) != 0) dstep = freq - (step % freq);
600  step += dstep;
601  if (step < totalStepsDone) {
602  int numPressureCycles = 1 + ((totalStepsDone-step-1)/freq);
603  if (step==0) numPressureCycles--;
604  // if (CkMyPe()==2) fprintf(stderr, "step %d totalStepsDone %d numPressureCycles %d\n",
605  // step, totalStepsDone, numPressureCycles);
606  nComputesExpected += 2*nLocalComputes*numPressureCycles;
607  }
608  }
609 
610  if (CkMyPe() == 0)
611  {
612  if (computeArray == NULL)
614  if (patchArray == NULL)
616  if (processorArray == NULL)
617  processorArray = new processorInfo[CkNumPes()];
618  }
619 
620  theLbdb->ClearLoads();
621 }
622 
623 void LdbCoordinator::patchLoad(PatchID id, int nAtoms, int /* timestep */)
624 {
625  CmiAssert( id >=0 && id < nPatches);
626  if (patchNAtoms[id] != -1) {
627  patchNAtoms[id] = nAtoms;
629  } else {
630  DebugM(10, "::patchLoad() Unexpected patch reporting in\n");
631  }
632 }
633 
635 {
636  if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
637  return;
638 
639  sequencerThreads[pid] = seq;
640  seq->suspend();
641 }
642 
644 {
645  if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
646  return;
647 
648  iout << "LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() << "\n" << endi;
649  DebugM(3, "Controller reached load balance barrier.\n");
650  controllerReported = 1;
651  controllerThread = c;
652 
653  CProxy_LdbCoordinator(thisgroup).barrier();
654 
655  CthSuspend();
656 }
657 
659 {
660  // NOTE: I don't know why the IMD barrier introduced in
661  // 93c41c99f195945cc66c923428e5ff29348cb6b9 would cause
662  // LdbCoordinator::rebalance(Sequencer *seq, PatchID pid) and
663  // LdbCoordinator::rebalance(Controller *c) to be called out-of-order
664  // but bypassing these checks seems fine since they
665  // are introduced in b011a94e650be5124f04fd06bb51a7c536b2e61c.
666  // Before b011a94e650be5124f04fd06bb51a7c536b2e61c failing these
667  // checks does not trigger NAMD_bug().
671  {
672  const auto* simParams = Node::Object()->simParameters;
673  if (!(simParams->IMDon && simParams->IMDversion == IMDversion_t::IMDv3)) {
674  NAMD_bug("Load balancer received wrong number of events.\n");
675  }
676  }
677  theLbdb->AtLocalBarrier(ldBarrierHandle);
678 }
679 
680 void LdbCoordinator::nodeDone(CkReductionMsg *msg)
681 {
682  delete msg;
683 
684  iout << "LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() << "\n" << endi;
685  if ( takingLdbData ) {
687  } else {
689  }
690 }
691 
693 {
694  // computeMgr->updateComputes() call only on Node(0) i.e. right here
695  // This will barrier for all Nodes - (i.e. Computes must be
696  // here and with proxies before anyone can start up
697 
698  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
699  ComputeMgr *computeMgr = cm.ckLocalBranch();
700  computeMgr->updateComputes(CkIndex_LdbCoordinator::
701  updateComputesReady(),thisgroup);
702 }
703 
705 {
706  // This method receives the migration from the framework,
707  // unregisters it, and sends it to the destination PE
708 
709  if ( m->to != CkMyPe() ) {
710  theLbdb->UnregisterObj(m->handle);
711 
712  CProxy_LdbCoordinator ldbProxy(thisgroup);
713  ldbProxy[m->to].ExpectMigrate(m);
714  } else {
715  ExpectMigrate(m);
716  }
717 }
718 
720 {
721  if ( m->from != CkMyPe() ) {
722  m->handle = theLbdb->RegisterObj(myHandle,m->handle.id,0,1);
723  theLbdb->Migrated(m->handle);
724  }
725 
726  m->next = migrateMsgs;
727  migrateMsgs = m;
728 }
729 
731  DebugM(3,"updateComputesReady()\n");
732 
733  CProxy_LdbCoordinator(thisgroup).resume();
734  CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
735 }
736 
738 {
739  DebugM(3,"resume()\n");
740  // printLocalLdbReport();
741 
742  ldbCycleNum++;
744 
745  Sync::Object()->openSync();
746 }
747 
748 void LdbCoordinator::resumeReady(CkQdMsg *msg) {
749 
750  iout << "LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() << "\n" << endi;
751  DebugM(3,"resumeReady()\n");
752  delete msg;
753 
754  CProxy_LdbCoordinator(thisgroup).resume2();
755 }
756 
758 {
759  DebugM(3,"resume2()\n");
760 
761 #if CONVERSE_VERSION_ELAN
762  // enableBlockingReceives();
763 #endif
764 
766 }
767 
769 {
770  if (controllerThread)
771  {
773  controllerThread = NULL;
774  }
775  for(int i=0; i < patchMap->numPatches(); i++)
776  {
777  if (sequencerThreads[i])
778  {
779  sequencerThreads[i]->awaken();
780  }
781  sequencerThreads[i]= NULL;
782  }
783 }
784 
785 // Figure out which proxies we will definitely create on other
786 // nodes, without regard for non-bonded computes. This code is swiped
787 // from ProxyMgr, and changes there probably need to be propagated here.
788 
789 int LdbCoordinator::requiredProxies(PatchID id, int neighborNodes[])
790 {
791  PatchID neighbors[1 + PatchMap::MaxOneAway];
792  neighbors[0] = id;
793  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
794 
795  int nProxyNodes = 0;
796  int myNode = patchMap->node(id);
797  for ( int i = 0; i < numNeighbors; ++i ) {
798  const int proxyNode = patchMap->basenode(neighbors[i]);
799  if ( proxyNode != myNode ) {
800  int j;
801  for ( j = 0; j < nProxyNodes; ++j ) {
802  if ( neighborNodes[j] == proxyNode ) break;
803  }
804  if ( j == nProxyNodes ) {
805  neighborNodes[nProxyNodes] = proxyNode;
806  nProxyNodes++;
807  }
808  }
809  }
810  return nProxyNodes;
811 }
812 
814 {
815  char outputBuf[255];
816  char *curLoc;
817 
818  CkPrintf("%d:Patch report:\n",CkMyPe());
819 
820  curLoc = outputBuf;
821  int i,j=0;
822  for(i=0; i<patchMap->numPatches(); i++)
823  {
824  if (patchNAtoms[i] != -1)
825  {
826  curLoc += sprintf(curLoc,"%5d: %5d ",i,patchNAtoms[i]);
827  j++;
828  }
829  if (((j % 4) == 0) && j)
830  {
831  curLoc = outputBuf;
832  CkPrintf("[%d]%s\n",CkMyPe(),outputBuf);
833  j=0;
834  }
835  }
836 
837  CkPrintf("%d:Compute report:\n",CkMyPe());
838 
839  curLoc = outputBuf;
840  j=0;
841 }
842 
844 {
845  // Check all two-away neighbors.
846  // This is really just one-away neighbors, since
847  // two-away always returns zero: RKB
848  int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
849  const int nProxyNodes = requiredProxies(id,neighborNodes);
850 
851  fprintf(fp,"%4d ",nProxyNodes);
852 
853  for(int i=0;i<nProxyNodes;i++)
854  fprintf(fp,"%4d ",neighborNodes[i]);
855 }
856 
858  CProxy_LdbCoordinator(thisgroup)[0].collectLoads(msg);
859 }
860 
862  // CkPrintf("LdbCoordinator::collectLoads recv %d-%d\n", msg->firstPe, msg->lastPe);
863  if ( collPes == 0 ) {
864  reverted = 0;
865  initTotalProxies = 0;
866  finalTotalProxies = 0;
867  initMaxPeProxies = 0;
868  finalMaxPeProxies = 0;
869  initMaxPatchProxies = 0;
870  finalMaxPatchProxies = 0;
871  initTime = 0;
872  finalTime = 0;
873  initMemory = 0;
874  finalMemory = 0;
875  initAvgPeLoad = 0;
876  finalAvgPeLoad = 0;
877  initMaxPeLoad = 0;
878  finalMaxPeLoad = 0;
879  }
880  int numPes = msg->lastPe - msg->firstPe + 1;
881  collPes += numPes;
882 #define COLL_MAX(F) if ( msg->F > F ) F = msg->F;
883 #define COLL_AVG(F) F += msg->F * (double) numPes / (double) CkNumPes();
884 #define COLL_SUM(F) F += msg->F;
885  COLL_SUM(reverted)
886  COLL_SUM(initTotalProxies)
887  COLL_SUM(finalTotalProxies)
888  COLL_MAX(initMaxPeProxies)
889  COLL_MAX(finalMaxPeProxies)
890  COLL_MAX(initMaxPatchProxies)
891  COLL_MAX(finalMaxPatchProxies)
892  if ( (msg->finalTime - msg->initTime) > (finalTime - initTime) ) {
893  initTime = msg->initTime;
894  finalTime = msg->finalTime;
895  }
896  COLL_MAX(initMemory)
897  COLL_MAX(finalMemory)
898  COLL_AVG(initAvgPeLoad)
899  COLL_AVG(finalAvgPeLoad)
900  COLL_MAX(initMaxPeLoad)
901  COLL_MAX(finalMaxPeLoad)
902 
903  if ( collPes == CkNumPes() ) {
904  collPes = 0;
905  iout << "LDB: TIME " << initTime << " LOAD: AVG " << initAvgPeLoad
906  << " MAX " << initMaxPeLoad << " PROXIES: TOTAL " << initTotalProxies << " MAXPE " <<
907  initMaxPeProxies << " MAXPATCH " << initMaxPatchProxies << " " << "None"
908  << " MEM: " << initMemory << " MB\n";
909  if ( reverted ) iout << "LDB: Reverting to original mapping on " << reverted << " balancers\n";
910  iout << "LDB: TIME " << finalTime << " LOAD: AVG " << finalAvgPeLoad
911  << " MAX " << finalMaxPeLoad << " PROXIES: TOTAL " << finalTotalProxies << " MAXPE " <<
912  finalMaxPeProxies << " MAXPATCH " << finalMaxPatchProxies << " " << msg->strategyName
913  << " MEM: " << finalMemory << " MB\n";
914  iout << endi;
915  fflush(stdout);
916  }
917 
918  delete msg;
919 }
920 
921 #include "LdbCoordinator.def.h"
static Node * Object()
Definition: Node.h:86
#define LDBAL_HYBRID
Definition: SimParameters.h:66
int requiredProxies(PatchID id, int [])
void barrier(void)
void sendCollectLoads(CollectLoadsMsg *)
#define NAMD_BONDEDGPU_IMPROPERS
void LdbCoordinator_initproc()
#define NAMD_BONDEDGPU_CROSSTERMS
Controller * controllerThread
int numComputes(void)
Definition: ComputeMap.h:103
void collectLoads(CollectLoadsMsg *)
#define NAMD_BONDEDGPU_ANISOS
Definition: common.h:275
void resumeReady(CkQdMsg *msg)
static PatchMap * Object()
Definition: PatchMap.h:27
#define NAMD_BONDEDGPU_ANGLES
Sequencer ** sequencerThreads
#define NAMD_BONDEDGPU_THOLES
SimParameters * simParameters
Definition: Node.h:181
LDObjHandle * patchHandles
void updateComputesReady()
void resume(void)
#define COLL_AVG(F)
void AtSyncBarrierReached(void)
#define DebugM(x, y)
Definition: Debug.h:75
represents a patch
void createLoadBalancer()
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
represents nonbonded or self compute
void Migrate(LDObjHandle handle, int dest)
LDObjHandle ldObjHandle
Definition: Compute.h:44
#define iout
Definition: InfoStream.h:51
#define NAMD_BONDEDGPU_DIHEDRALS
void printRequiredProxies(PatchID id, FILE *fp)
HomePatch * homePatch(PatchID pid)
Definition: PatchMap.h:249
LDObjHandle ldObjHandle
Definition: HomePatch.h:554
void openSync()
Definition: Sync.C:63
void awakenSequencers(void)
void patchLoad(PatchID id, int nAtoms, int timestep)
void ResumeFromSync(void)
char strategyName[16]
ComputeMap * computeMap
int numNodes()
Definition: Node.h:192
PatchMap * patchMap
void initialize(PatchMap *pmap, ComputeMap *cmap, int reinit=0)
int numPatches(void) const
Definition: PatchMap.h:59
void awaken(void)
Definition: Sequencer.h:55
#define LDBAL_NONE
Definition: SimParameters.h:64
LdbMigrateMsg * migrateMsgs
void CreateNamdHybridLB()
Definition: NamdHybridLB.C:49
LDOMHandle myHandle
void NAMD_bug(const char *err_msg)
Definition: common.C:195
ComputeType type(ComputeID cid)
Definition: ComputeMap.C:118
void CreateNamdCentLB()
Definition: NamdCentLB.C:26
void awaken(void)
Definition: Controller.C:371
void rebalance(Sequencer *seq, PatchID id)
#define NAMD_BONDEDGPU_EXCLS
static Sync * Object()
Definition: Sync.h:52
#define COLL_SUM(F)
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
Definition: common.C:147
static LdbCoordinator * Object()
static void staticQueryEstLoadFn(LDOMHandle h)
void ExpectMigrate(LdbMigrateMsg *)
void nodeDone(CkReductionMsg *)
#define LDBAL_CENTRALIZED
Definition: SimParameters.h:65
static void staticReceiveAtSync(void *data)
int basenode(int pid) const
Definition: PatchMap.h:117
void suspend(void)
Definition: Sequencer.C:279
int myid()
Definition: Node.h:191
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714
LDObjid LdbId
#define simParams
Definition: Output.C:131
int numHomePatches(void)
Definition: PatchMap.C:432
Compute * compute(ComputeID cid)
Definition: ComputeMap.h:173
static ComputeMap * Object()
Definition: ComputeMap.h:91
LdbInfra * theLbdb
void printLocalLdbReport(void)
computeInfo * computeArray
int node(ComputeID cid)
Definition: ComputeMap.h:108
int numPids(ComputeID cid)
Definition: ComputeMap.C:101
#define COLL_MAX(F)
represents bonded compute
static void staticMigrateFn(LDObjHandle handle, int dest)
LdbMigrateMsg * next
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:107
void resume2(void)
LDObjHandle handle
patchInfo * patchArray
int node(int pid) const
Definition: PatchMap.h:114
LDBarrierClient ldBarrierHandle
static void staticResumeFromSync(void *data)
int32 PatchID
Definition: NamdTypes.h:287
static void staticStatsFn(LDOMHandle h, int state)
void updateComputes(int, CkGroupID)
Definition: ComputeMgr.C:142
processorInfo * processorArray
void ExecuteMigrations(void)
#define NAMD_BONDEDGPU_BONDS
void RecvMigrate(LdbMigrateMsg *)