NAMD
LdbCoordinator.C
Go to the documentation of this file.
1 
7 /*****************************************************************************
8  * $Source: /home/cvs/namd/cvsroot/namd2/src/LdbCoordinator.C,v $
9  * $Author: jim $
10  * $Date: 2017/03/30 20:06:17 $
11  * $Revision: 1.128 $
12  *****************************************************************************/
13 
14 #include <stdlib.h>
15 
16 #include "InfoStream.h"
17 #include "NamdCentLB.h"
18 #include "NamdHybridLB.h"
19 #include "NamdDummyLB.h"
20 
21 #include "HomePatch.h"
22 #include "LdbCoordinator.decl.h"
23 #include "LdbCoordinator.h"
24 #include "NamdTypes.h"
25 #include "Node.h"
26 #include "SimParameters.h"
27 #include "PatchMap.inl"
28 #include "ComputeMap.h"
29 #include "common.h"
30 //#define DEBUGM
31 #define MIN_DEBUG_LEVEL 3
32 #include "Debug.h"
33 #include "Controller.h"
34 #include "Sequencer.h"
35 #include "RefineOnly.h"
36 #include "ComputeMgr.h"
37 #include "Compute.h"
38 #include "packmsg.h"
39 #include "Sync.h"
40 
41 #include "elements.h"
42 #include "ComputeMgr.decl.h"
43 
44 #define DEBUG_LEVEL 4
45 
46 #if CONVERSE_VERSION_ELAN
47 extern "C" void enableBlockingReceives();
48 extern "C" void disableBlockingReceives();
49 #endif
50 
52  // Set the load balancing period (in seconds). Without this the
53  // load balancing framework will hang until 1 second has passed
54  // since the last load balancing, causing hiccups in very fast runs.
55  // This is duplicated below for older versions, but putting it here
56  // also fixes the first load balance.
57 #ifndef LB_MANAGER_VERSION
58  LBSetPeriod(1.0e-5);
59 #endif
60 }
61 
62 void LdbCoordinator::staticMigrateFn(LDObjHandle handle, int dest)
63 {
64  LdbCoordinator::Object()->Migrate(handle,dest);
65 }
66 
67 void LdbCoordinator::Migrate(LDObjHandle handle, int dest)
68 {
69  LdbMigrateMsg* msg = new LdbMigrateMsg;
70  msg->handle = handle;
71  msg->from = CkMyPe();
72  msg->to = dest;
73  if ( msg->to != CkMyPe() ) {
74  CProxy_LdbCoordinator ldbProxy(thisgroup);
75  ldbProxy[CkMyPe()].RecvMigrate(msg);
76  } else {
77  ExpectMigrate(msg);
78  }
79 }
80 
81 void LdbCoordinator::staticStatsFn(LDOMHandle h, int state)
82 {
83  CkPrintf("I'm supposed to set stats\n");
84 }
85 
87 {
88  CkPrintf("I'm supposed to query load\n");
89 }
90 
92 {
93 
94 #if CONVERSE_VERSION_ELAN
95  //disableBlockingReceives();
96 #endif
97 
98  ((LdbCoordinator*)data)->AtSyncBarrierReached();
99 }
100 
102 {
103  theLbdb->RegisteringObjects(myHandle);
104 }
105 
107 {
108  ((LdbCoordinator*)data)->ResumeFromSync();
109 }
110 
112 {
113  theLbdb->DoneRegisteringObjects(myHandle);
114  CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
115  contribute(0, NULL, CkReduction::random, cb);
116 }
117 
119 {
120  if (CkpvAccess(LdbCoordinator_instance) == NULL) {
121  CkpvAccess(LdbCoordinator_instance) = this;
122  } else {
123  NAMD_bug("LdbCoordinator instanced twice on same node!");
124  }
125 
126  collPes = 0;
127  ldbCycleNum = 1;
128  takingLdbData = 1;
129  totalStepsDone = 0;
131  patchNAtoms = (int *) NULL;
132  sequencerThreads = (Sequencer **) NULL;
133  ldbStatsFP = NULL;
134  computeArray = NULL;
135  patchArray = NULL;
136  processorArray = NULL;
137 
138  // Register self as an object manager for new charm++ balancer framework
139  theLbdb = LdbInfra::Object();
140 
141  // Set the load balancing period (in seconds). Without this the
142  // load balancing framework will hang until 1 second has passed
143  // since the last load balancing, causing hiccups in very fast runs.
144  // Unfortunately, the clock is already set for the first load
145  // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
146  // For newer versions this is handled in initproc above.
147 
148 #ifndef LB_MANAGER_VERSION
149  theLbdb->SetLBPeriod(1.0e-5);
150 #endif
151 
152  myOMid.id.idx = 1;
153  LDCallbacks cb = { (LDMigrateFn)staticMigrateFn,
154  (LDStatsFn)staticStatsFn,
155  (LDQueryEstLoadFn)staticQueryEstLoadFn
156  };
157  myHandle = theLbdb->RegisterOM(myOMid,nullptr,cb);
158 
159 #ifdef LB_MANAGER_VERSION
160  // Add myself as a local barrier receiver, so I know when I might
161  // be registering objects.
162  theLbdb->AddLocalBarrierReceiver(this, &LdbCoordinator::AtSyncBarrierReached);
163 
164  // Also, add a local barrier client, to trigger load balancing
166  AddLocalBarrierClient(this, &LdbCoordinator::ResumeFromSync);
167 #else
168  // Add myself as a local barrier receiver, so I know when I might
169  // be registering objects.
170  theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
171  (void*)this);;
172 
173  // Also, add a local barrier client, to trigger load balancing
175  AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
176  (void*)this);
177 #endif
178  migrateMsgs = 0; // linked list
179  numComputes = 0;
180  reg_all_objs = 1;
181 }
182 
184 {
185  delete [] patchNAtoms;
186  delete [] sequencerThreads;
187  if (CkMyPe() == 0)
188  {
189  delete [] computeArray;
190  delete [] patchArray;
191  delete [] processorArray;
192  }
193  if (ldbStatsFP)
194  fclose(ldbStatsFP);
195 
196 }
197 
199 {
201 
202  // Create hierarchical or centralized load balancers
203  // Currently centralized is the default
204  if (simParams->ldBalancer == LDBAL_CENTRALIZED) {
205  CkPrintf("LDB: Central LB being created...\n");
207  } else if (simParams->ldBalancer == LDBAL_HYBRID) {
208  CkPrintf("LDB: Hybrid LB being created...\n");
210  }
211 }
212 
// Prepare the coordinator for the next load-balancing cycle.
//
// Parameters:
//   pMap   - patch map, stored in patchMap
//   cMap   - compute map, stored in computeMap
//   reinit - when zero, sequencerThreads is reallocated and cleared and
//            controllerThread is reset; when non-zero they are left alone
//
// Visible steps, in order:
//   1. read ldbPeriod, firstLdbStep, lastLdbStep and stepsPerCycle from
//      simParams;
//   2. reallocate patchNAtoms and count the patches mapped to this node;
//   3. count the local computes whose type is in the conditional list;
//   4. when ldbCycleNum == reg_all_objs, register objects with theLbdb
//      (patches only on cycle 1, computes with index >= oldNumComputes);
//   5. apply the handles carried by any saved migration messages;
//   6. pick takingLdbData and switch stats collection on/off from ldbCycleNum;
//   7. reset the report counters, allocate processorArray on PE 0 if it is
//      still NULL, and call theLbdb->ClearLoads().
//
// NOTE(review): this listing skips some of its own line numbers (e.g. 215,
// 233, 258, 263), so a few statements -- such as the assignments of
// nPatches and numComputes -- are not visible here; confirm against the
// original file before relying on this description.
213 void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit)
214 {
216 
217  // DebugM(10,"stepsPerLdbCycle initialized\n");
218  stepsPerLdbCycle = simParams->ldbPeriod;
219  firstLdbStep = simParams->firstLdbStep;
220  int lastLdbStep = simParams->lastLdbStep;
221  int stepsPerCycle = simParams->stepsPerCycle;
222 
223  computeMap = cMap;
224  patchMap = pMap;
225 
226  // Set the number of received messages correctly for node 0
227 
230 
231  if (patchNAtoms)
232  delete [] patchNAtoms; // Depends on delete NULL to do nothing
234  patchNAtoms = new int[nPatches];
235 
236  typedef Sequencer *seqPtr;
237 
238  if ( ! reinit ) {
239  delete [] sequencerThreads; // Depends on delete NULL to do nothing
240  sequencerThreads = new seqPtr[nPatches];
241  }
242 
243  nLocalPatches=0;
244 
  // patchNAtoms[i] starts at 0 for patches mapped to this node and at -1
  // for all others; patchLoad() treats -1 as "not expected to report".
245  int i;
246  for(i=0;i<nPatches;i++)
247  {
248  if (patchMap->node(i) == Node::Object()->myid())
249  {
250  nLocalPatches++;
251  patchNAtoms[i]=0;
252  } else {
253  patchNAtoms[i]=-1;
254  }
255  if ( ! reinit ) sequencerThreads[i]=NULL;
256  }
257  if ( ! reinit ) controllerThread = NULL;
259  NAMD_die("Disaggreement in patchMap data.\n");
260 
  // Remember the previous compute count so that only computes with a
  // larger index are registered below.
261  const int oldNumComputes = numComputes;
262  nLocalComputes = 0;
264 
265  for(i=0;i<numComputes;i++) {
266  if ( (computeMap->node(i) == Node::Object()->myid())
267  && ( 0
268  #if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
269  #if defined(NAMD_MIC)
270  || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
271  || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
272  #endif
273  #else
276 #endif
277 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
278  || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
279  || (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
281  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANGLES))
286  || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
287  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
290  || (computeMap->type(i) == computeSelfTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
291  || (computeMap->type(i) == computeTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
292  || (computeMap->type(i) == computeSelfAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
293  || (computeMap->type(i) == computeAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
296 #else
298  || (computeMap->type(i) == computeBondsType)
300  || (computeMap->type(i) == computeAnglesType)
306  || (computeMap->type(i) == computeExclsType)
310  || (computeMap->type(i) == computeTholeType)
312  || (computeMap->type(i) == computeAnisoType)
315 #endif
316  || (computeMap->type(i) == computeLCPOType)
317  // JLai
320  ) ) {
321  nLocalComputes++;
322  }
323  }
324 
325  // New LB frameworks registration
326 
327  // Allocate data structure to save incoming migrations. Processor
328  // zero will get all migrations
329 
330  // If this is the first time through, we need to register patches
331  if (ldbCycleNum == reg_all_objs) {
332  if ( 1 ) { // ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) {
  // After this pass the registration block runs again only when
  // ldbCycleNum equals 3.
333  reg_all_objs = 3;
334  }
335  // Tell the lbdb that I'm registering objects, until I'm done
336  // registering them.
337  theLbdb->RegisteringObjects(myHandle);
338 
339  if ( ldbCycleNum == 1 ) {
340  patchHandles = new LDObjHandle[nLocalPatches];
341  int patch_count=0;
342  int i;
343  for(i=0;i<nPatches;i++)
344  if (patchMap->node(i) == Node::Object()->myid()) {
345 
346  LdbId elemID;
347  LdbIdField(elemID, 0) = i;
348  LdbIdField(elemID, 1) = PATCH_TYPE;
349 
350  if (patch_count >= nLocalPatches) {
351  NAMD_bug("LdbCoordinator found too many local patches!");
352  }
353  HomePatch *p = patchMap->homePatch(i);
354  p->ldObjHandle =
355  patchHandles[patch_count]
356  = theLbdb->RegisterObj(myHandle,elemID,0,0);
357  patch_count++;
358 
359  }
360  }
361 
362  if ( numComputes > oldNumComputes ) {
363  // Register computes
364  for(i=oldNumComputes; i<numComputes; i++) {
365  if ( computeMap->node(i) == Node::Object()->myid())
366  {
367  if ( 0
368  #if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
369  #if defined(NAMD_MIC)
370  || ((computeMap->type(i) == computeNonbondedSelfType) && (computeMap->directToDevice(i) == 0))
371  || ((computeMap->type(i) == computeNonbondedPairType) && (computeMap->directToDevice(i) == 0))
372  #endif
373  #else
376  #endif
377 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
378  || (computeMap->type(i) == computeSelfBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
382  || (computeMap->type(i) == computeSelfExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
384  || (computeMap->type(i) == computeSelfTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
385  || (computeMap->type(i) == computeSelfAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
387 #else
397 #endif
398  || (computeMap->type(i) == computeLCPOType)
399  // JLai
401  // End of JLai
402  ) {
403  // Register the object with the load balancer
404  // Store the depended patch IDs in the rest of the element ID
405  LdbId elemID;
406  LdbIdField(elemID, 0) = i;
407 
408  if (computeMap->numPids(i) > 0)
409  LdbIdField(elemID, 1) = computeMap->pid(i,0);
410  else LdbIdField(elemID, 1) = NONBONDED_OR_SELF_TYPE;
411 
412  Compute *c = computeMap->compute(i);
413  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
414 
  // NOTE(review): the last argument is 1 here but 0 for patches and
  // for the bonded branch below -- presumably a "migratable" flag;
  // confirm against the RegisterObj declaration.
415  c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1);
416  }
417  else if (
418 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
419  (computeMap->type(i) == computeBondsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_BONDS))
420  || (computeMap->type(i) == computeAnglesType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANGLES))
423  || (computeMap->type(i) == computeExclsType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_EXCLS))
425  || (computeMap->type(i) == computeTholeType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_THOLES))
426  || (computeMap->type(i) == computeAnisoType && !(simParams->bondedCUDA & NAMD_BONDEDGPU_ANISOS))
428 #else
430  || (computeMap->type(i) == computeAnglesType)
433  || (computeMap->type(i) == computeExclsType)
435  || (computeMap->type(i) == computeTholeType)
436  || (computeMap->type(i) == computeAnisoType)
438 #endif
439  // JLai
441  // End of JLai
442  ) {
443  // Register the object with the load balancer
444  // Store the depended patch IDs in the rest of the element ID
445  LdbId elemID;
446  LdbIdField(elemID, 0) = i;
447 
448  LdbIdField(elemID, 1) = BONDED_TYPE;
449 
450  Compute *c = computeMap->compute(i);
451  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer");
452 
453  c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0);
454  }
455  }
456  }
457  }
458  theLbdb->DoneRegisteringObjects(myHandle);
459  }
460 
461  // process saved migration messages, if any
462  while ( migrateMsgs ) {
464  migrateMsgs = m->next;
465  Compute *c = computeMap->compute(LdbIdField(m->handle.id, 0));
466  if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2");
467  c->ldObjHandle = m->handle;
468  delete m;
469  }
470 
471  // Fixup to take care of the extra timestep at startup
472  // This is pretty ugly here, but it makes the count correct
473 
474  // iout << "LDB Cycle Num: " << ldbCycleNum << "\n";
475 
  // The condition is hard-wired to 1, so the else branch further down is
  // never taken as written.
476  if ( 1 ) { // ( simParams->ldBalancer == LDBAL_CENTRALIZED ) {
477  if (ldbCycleNum == 1 || ldbCycleNum == 3) {
478  numStepsToRun = stepsPerCycle;
480  takingLdbData = 0;
481  theLbdb->CollectStatsOff();
482  } else if (ldbCycleNum == 2 || ldbCycleNum == 4) {
483  numStepsToRun = firstLdbStep - stepsPerCycle;
  // Keep the step count positive by adding whole cycles.
484  while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle;
486  takingLdbData = 1;
487  theLbdb->CollectStatsOn();
488  } else if ( (ldbCycleNum <= 6) || !takingLdbData )
489  {
491  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
492  numStepsToRun = -1;
493  takingLdbData = 0;
494  theLbdb->CollectStatsOff();
495  } else {
497  takingLdbData = 1;
498  theLbdb->CollectStatsOn();
499  }
500  }
501  else
502  {
504  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
505  numStepsToRun = -1;
506  takingLdbData = 0;
507  theLbdb->CollectStatsOff();
508  } else {
510  takingLdbData = 0;
511  theLbdb->CollectStatsOff();
512  }
513  }
514  } else {
515  if (ldbCycleNum==1)
516  {
519  takingLdbData = 0;
520  theLbdb->CollectStatsOff();
521  }
522  else if ( (ldbCycleNum <= 4) || !takingLdbData )
523  {
525  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
526  numStepsToRun = -1;
527  takingLdbData = 0;
528  theLbdb->CollectStatsOff();
529  } else {
531  takingLdbData = 1;
532  theLbdb->CollectStatsOn();
533  }
534  }
535  else
536  {
538  if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) {
539  numStepsToRun = -1;
540  takingLdbData = 0;
541  theLbdb->CollectStatsOff();
542  } else {
544  takingLdbData = 0;
545  theLbdb->CollectStatsOff();
546  }
547  }
548  }
549 
550 /*-----------------------------------------------------------------------------*
551  * --------------------------------------------------------------------------- *
552  * Comments inserted by Abhinav to clarify relation between ldbCycleNum, *
553  * load balancing step numbers (printed by the step() function) and *
554  * tracing of the steps *
555  * --------------------------------------------------------------------------- *
556  * If trace is turned off in the beginning, then tracing is turned on *
557  * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can *
558  * be adjusted by specifying firstLdbStep and ldbPeriod which are set by *
559  * default to 5*stepspercycle and 200*stepspercycle if not specified. *
560  * *
561  * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the *
562  * following timeline (for these particular numbers): *
563  * *
564  * Tracing : <------ off ------><------------- on -----------><-- off *
565  * Ldb Step() No : 1 2 3 4 5 6 7 *
566  * Iteration Steps : 00====20====40====60====80======160====180=====260====280 *
567  * ldbCycleNum : 1 2 3 4 5 6 7 8 9 *
568  * Instrumention : Inst Inst Inst Inst Inst *
569  * LDB Strategy : TLB RLB RLB RLB RLB *
570  * *
571  * TLB = TorusLB *
572  * RLB = RefineTorusLB *
573  * Inst = Instrumentation Phase (no real load balancing) *
574  * --------------------------------------------------------------------------- *
575  *-----------------------------------------------------------------------------*
576  */
577 #if 0 //replaced by traceBarrier at Controller and Sequencer
578  if (traceAvailable()) {
579  static int specialTracing = 0; // XXX static variables are unsafe for SMP
580  if (ldbCycleNum == 1 && traceIsOn() == 0) specialTracing = 1;
581  if (specialTracing) {
582  if (ldbCycleNum == 4) traceBegin();
583  if (ldbCycleNum == 8) traceEnd();
584  }
585  }
586 #endif
587 
588  nPatchesReported = 0;
590  nComputesReported = 0;
592  controllerReported = 0;
  // Evaluates to 1 only on PE 0.
593  controllerExpected = ! CkMyPe();
594 
595  if (simParams->multigratorOn) {
596  // Add the number of pressure cycles into nComputesExpected:
597  // Pressure cycle is done when !(step % simParams->multigratorPressureFreq) = true
598  // step = Current step
599  int step = totalStepsDone - numStepsToRun;
600  int freq = simParams->multigratorPressureFreq;
601  // dstep = Number of steps we have to take until next pressure cycle
602  int dstep = 0;
603  if ((step % freq) != 0) dstep = freq - (step % freq);
604  step += dstep;
605  if (step < totalStepsDone) {
606  int numPressureCycles = 1 + ((totalStepsDone-step-1)/freq);
607  if (step==0) numPressureCycles--;
608  // if (CkMyPe()==2) fprintf(stderr, "step %d totalStepsDone %d numPressureCycles %d\n",
609  // step, totalStepsDone, numPressureCycles);
610  nComputesExpected += 2*nLocalComputes*numPressureCycles;
611  }
612  }
613 
614  if (CkMyPe() == 0)
615  {
616  if (computeArray == NULL)
618  if (patchArray == NULL)
620  if (processorArray == NULL)
621  processorArray = new processorInfo[CkNumPes()];
622  }
623 
624  theLbdb->ClearLoads();
625 }
626 
// Record the atom count reported for patch `id`.
//
// Parameters:
//   id     - patch index; asserted to lie in [0, nPatches)
//   nAtoms - atom count stored in patchNAtoms[id]
//   (third parameter, the timestep, is unnamed and unused)
//
// A stored value of -1 marks a patch that initialize() did not map to this
// node; a report for such a patch is not stored and only produces a debug
// message.
// NOTE(review): this listing skips its line 632 inside the first branch, so
// a statement may be missing there -- confirm against the original file.
627 void LdbCoordinator::patchLoad(PatchID id, int nAtoms, int /* timestep */)
628 {
629  CmiAssert( id >=0 && id < nPatches);
630  if (patchNAtoms[id] != -1) {
631  patchNAtoms[id] = nAtoms;
633  } else {
634  DebugM(10, "::patchLoad() Unexpected patch reporting in\n");
635  }
636 }
637 
639 {
640  if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
641  return;
642 
643  sequencerThreads[pid] = seq;
644  seq->suspend();
645 }
646 
648 {
649  if (Node::Object()->simParameters->ldBalancer == LDBAL_NONE)
650  return;
651 
652  iout << "LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() << "\n" << endi;
653  DebugM(3, "Controller reached load balance barrier.\n");
654  controllerReported = 1;
655  controllerThread = c;
656 
657  CProxy_LdbCoordinator(thisgroup).barrier();
658 
659  CthSuspend();
660 }
661 
663 {
664  // NOTE: I don't know why the IMD barrier introduced in
665  // 93c41c99f195945cc66c923428e5ff29348cb6b9 would cause
666  // LdbCoordinator::rebalance(Sequencer *seq, PatchID pid) and
667  // LdbCoordinator::rebalance(Controller *c) to be called out-of-order
668  // but bypassing these checks seems fine since they
669  // are introduced in b011a94e650be5124f04fd06bb51a7c536b2e61c.
670  // Before b011a94e650be5124f04fd06bb51a7c536b2e61c failing these
671  // checks does not trigger NAMD_bug().
675  {
676  const auto* simParams = Node::Object()->simParameters;
677  if (!(simParams->IMDon && simParams->IMDversion == IMDversion_t::IMDv3)) {
678  NAMD_bug("Load balancer received wrong number of events.\n");
679  }
680  }
681  theLbdb->AtLocalBarrier(ldBarrierHandle);
682 }
683 
// Reduction target reached when load balancing has finished; it is named
// as the CkCallback entry passed to contribute() elsewhere in this file.
//
// Frees the reduction message, prints the end-of-load-balancing banner
// with the current wall time, then branches on takingLdbData.
// NOTE(review): both branch bodies are empty in this listing (it skips its
// lines 690 and 692), so the actions taken in each case are not visible
// here -- confirm against the original file.
684 void LdbCoordinator::nodeDone(CkReductionMsg *msg)
685 {
686  delete msg;
687 
688  iout << "LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() << "\n" << endi;
689  if ( takingLdbData ) {
691  } else {
693  }
694 }
695 
697 {
698  // computeMgr->updateComputes() call only on Node(0) i.e. right here
699  // This will barrier for all Nodes - (i.e. Computes must be
700  // here and with proxies before anyone can start up
701 
702  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
703  ComputeMgr *computeMgr = cm.ckLocalBranch();
704  computeMgr->updateComputes(CkIndex_LdbCoordinator::
705  updateComputesReady(),thisgroup);
706 }
707 
709 {
710  // This method receives the migration from the framework,
711  // unregisters it, and sends it to the destination PE
712 
713  if ( m->to != CkMyPe() ) {
714  theLbdb->UnregisterObj(m->handle);
715 
716  CProxy_LdbCoordinator ldbProxy(thisgroup);
717  ldbProxy[m->to].ExpectMigrate(m);
718  } else {
719  ExpectMigrate(m);
720  }
721 }
722 
724 {
725  if ( m->from != CkMyPe() ) {
726  m->handle = theLbdb->RegisterObj(myHandle,m->handle.id,0,1);
727  theLbdb->Migrated(m->handle);
728  }
729 
730  m->next = migrateMsgs;
731  migrateMsgs = m;
732 }
733 
735  DebugM(3,"updateComputesReady()\n");
736 
737  CProxy_LdbCoordinator(thisgroup).resume();
738  CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
739 }
740 
742 {
743  DebugM(3,"resume()\n");
744  // printLocalLdbReport();
745 
746  ldbCycleNum++;
748 
749  Sync::Object()->openSync();
750 }
751 
752 void LdbCoordinator::resumeReady(CkQdMsg *msg) {
753 
754  iout << "LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() << "\n" << endi;
755  DebugM(3,"resumeReady()\n");
756  delete msg;
757 
758  CProxy_LdbCoordinator(thisgroup).resume2();
759 }
760 
762 {
763  DebugM(3,"resume2()\n");
764 
765 #if CONVERSE_VERSION_ELAN
766  // enableBlockingReceives();
767 #endif
768 
770 }
771 
773 {
774  if (controllerThread)
775  {
777  controllerThread = NULL;
778  }
779  for(int i=0; i < patchMap->numPatches(); i++)
780  {
781  if (sequencerThreads[i])
782  {
783  sequencerThreads[i]->awaken();
784  }
785  sequencerThreads[i]= NULL;
786  }
787 }
788 
789 // Figure out which proxies we will definitely create on other
790 // nodes, without regard for non-bonded computes. This code is swiped
791 // from ProxyMgr, and changes there probably need to be propagated here.
792 
793 int LdbCoordinator::requiredProxies(PatchID id, int neighborNodes[])
794 {
795  PatchID neighbors[1 + PatchMap::MaxOneAway];
796  neighbors[0] = id;
797  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
798 
799  int nProxyNodes = 0;
800  int myNode = patchMap->node(id);
801  for ( int i = 0; i < numNeighbors; ++i ) {
802  const int proxyNode = patchMap->basenode(neighbors[i]);
803  if ( proxyNode != myNode ) {
804  int j;
805  for ( j = 0; j < nProxyNodes; ++j ) {
806  if ( neighborNodes[j] == proxyNode ) break;
807  }
808  if ( j == nProxyNodes ) {
809  neighborNodes[nProxyNodes] = proxyNode;
810  nProxyNodes++;
811  }
812  }
813  }
814  return nProxyNodes;
815 }
816 
818 {
819  char outputBuf[255];
820  char *curLoc;
821 
822  CkPrintf("%d:Patch report:\n",CkMyPe());
823 
824  curLoc = outputBuf;
825  int i,j=0;
826  for(i=0; i<patchMap->numPatches(); i++)
827  {
828  if (patchNAtoms[i] != -1)
829  {
830  curLoc += sprintf(curLoc,"%5d: %5d ",i,patchNAtoms[i]);
831  j++;
832  }
833  if (((j % 4) == 0) && j)
834  {
835  curLoc = outputBuf;
836  CkPrintf("[%d]%s\n",CkMyPe(),outputBuf);
837  j=0;
838  }
839  }
840 
841  CkPrintf("%d:Compute report:\n",CkMyPe());
842 
843  curLoc = outputBuf;
844  j=0;
845 }
846 
848 {
849  // Check all two-away neighbors.
850  // This is really just one-away neighbors, since
851  // two-away always returns zero: RKB
852  int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
853  const int nProxyNodes = requiredProxies(id,neighborNodes);
854 
855  fprintf(fp,"%4d ",nProxyNodes);
856 
857  for(int i=0;i<nProxyNodes;i++)
858  fprintf(fp,"%4d ",neighborNodes[i]);
859 }
860 
862  CProxy_LdbCoordinator(thisgroup)[0].collectLoads(msg);
863 }
864 
866  // CkPrintf("LdbCoordinator::collectLoads recv %d-%d\n", msg->firstPe, msg->lastPe);
867  if ( collPes == 0 ) {
868  reverted = 0;
869  initTotalProxies = 0;
870  finalTotalProxies = 0;
871  initMaxPeProxies = 0;
872  finalMaxPeProxies = 0;
873  initMaxPatchProxies = 0;
874  finalMaxPatchProxies = 0;
875  initTime = 0;
876  finalTime = 0;
877  initMemory = 0;
878  finalMemory = 0;
879  initAvgPeLoad = 0;
880  finalAvgPeLoad = 0;
881  initMaxPeLoad = 0;
882  finalMaxPeLoad = 0;
883  }
884  int numPes = msg->lastPe - msg->firstPe + 1;
885  collPes += numPes;
886 #define COLL_MAX(F) if ( msg->F > F ) F = msg->F;
887 #define COLL_AVG(F) F += msg->F * (double) numPes / (double) CkNumPes();
888 #define COLL_SUM(F) F += msg->F;
889  COLL_SUM(reverted)
890  COLL_SUM(initTotalProxies)
891  COLL_SUM(finalTotalProxies)
892  COLL_MAX(initMaxPeProxies)
893  COLL_MAX(finalMaxPeProxies)
894  COLL_MAX(initMaxPatchProxies)
895  COLL_MAX(finalMaxPatchProxies)
896  if ( (msg->finalTime - msg->initTime) > (finalTime - initTime) ) {
897  initTime = msg->initTime;
898  finalTime = msg->finalTime;
899  }
900  COLL_MAX(initMemory)
901  COLL_MAX(finalMemory)
902  COLL_AVG(initAvgPeLoad)
903  COLL_AVG(finalAvgPeLoad)
904  COLL_MAX(initMaxPeLoad)
905  COLL_MAX(finalMaxPeLoad)
906 
907  if ( collPes == CkNumPes() ) {
908  collPes = 0;
909  iout << "LDB: TIME " << initTime << " LOAD: AVG " << initAvgPeLoad
910  << " MAX " << initMaxPeLoad << " PROXIES: TOTAL " << initTotalProxies << " MAXPE " <<
911  initMaxPeProxies << " MAXPATCH " << initMaxPatchProxies << " " << "None"
912  << " MEM: " << initMemory << " MB\n";
913  if ( reverted ) iout << "LDB: Reverting to original mapping on " << reverted << " balancers\n";
914  iout << "LDB: TIME " << finalTime << " LOAD: AVG " << finalAvgPeLoad
915  << " MAX " << finalMaxPeLoad << " PROXIES: TOTAL " << finalTotalProxies << " MAXPE " <<
916  finalMaxPeProxies << " MAXPATCH " << finalMaxPatchProxies << " " << msg->strategyName
917  << " MEM: " << finalMemory << " MB\n";
918  iout << endi;
919  fflush(stdout);
920  }
921 
922  delete msg;
923 }
924 
925 #include "LdbCoordinator.def.h"
static Node * Object()
Definition: Node.h:86
#define LDBAL_HYBRID
Definition: SimParameters.h:66
int requiredProxies(PatchID id, int [])
void barrier(void)
void sendCollectLoads(CollectLoadsMsg *)
#define NAMD_BONDEDGPU_IMPROPERS
void LdbCoordinator_initproc()
#define NAMD_BONDEDGPU_CROSSTERMS
Controller * controllerThread
int numComputes(void)
Definition: ComputeMap.h:103
void collectLoads(CollectLoadsMsg *)
#define NAMD_BONDEDGPU_ANISOS
Definition: common.h:275
void resumeReady(CkQdMsg *msg)
static PatchMap * Object()
Definition: PatchMap.h:27
#define NAMD_BONDEDGPU_ONEFOURENBTHOLES
#define NAMD_BONDEDGPU_ANGLES
Sequencer ** sequencerThreads
#define NAMD_BONDEDGPU_THOLES
SimParameters * simParameters
Definition: Node.h:181
LDObjHandle * patchHandles
void updateComputesReady()
void resume(void)
#define COLL_AVG(F)
void AtSyncBarrierReached(void)
#define DebugM(x, y)
Definition: Debug.h:75
represents a patch
void createLoadBalancer()
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
represents nonbonded or self compute
void Migrate(LDObjHandle handle, int dest)
LDObjHandle ldObjHandle
Definition: Compute.h:44
#define iout
Definition: InfoStream.h:51
#define NAMD_BONDEDGPU_DIHEDRALS
void printRequiredProxies(PatchID id, FILE *fp)
HomePatch * homePatch(PatchID pid)
Definition: PatchMap.h:249
LDObjHandle ldObjHandle
Definition: HomePatch.h:554
void openSync()
Definition: Sync.C:63
void awakenSequencers(void)
void patchLoad(PatchID id, int nAtoms, int timestep)
void ResumeFromSync(void)
char strategyName[16]
ComputeMap * computeMap
int numNodes()
Definition: Node.h:192
PatchMap * patchMap
void initialize(PatchMap *pmap, ComputeMap *cmap, int reinit=0)
int numPatches(void) const
Definition: PatchMap.h:59
void awaken(void)
Definition: Sequencer.h:55
#define LDBAL_NONE
Definition: SimParameters.h:64
LdbMigrateMsg * migrateMsgs
void CreateNamdHybridLB()
Definition: NamdHybridLB.C:49
LDOMHandle myHandle
void NAMD_bug(const char *err_msg)
Definition: common.C:195
ComputeType type(ComputeID cid)
Definition: ComputeMap.C:118
void CreateNamdCentLB()
Definition: NamdCentLB.C:26
void awaken(void)
Definition: Controller.C:371
void rebalance(Sequencer *seq, PatchID id)
#define NAMD_BONDEDGPU_EXCLS
static Sync * Object()
Definition: Sync.h:52
#define COLL_SUM(F)
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
Definition: common.C:147
static LdbCoordinator * Object()
static void staticQueryEstLoadFn(LDOMHandle h)
void ExpectMigrate(LdbMigrateMsg *)
void nodeDone(CkReductionMsg *)
#define LDBAL_CENTRALIZED
Definition: SimParameters.h:65
static void staticReceiveAtSync(void *data)
int basenode(int pid) const
Definition: PatchMap.h:117
void suspend(void)
Definition: Sequencer.C:279
int myid()
Definition: Node.h:191
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714
LDObjid LdbId
#define simParams
Definition: Output.C:131
int numHomePatches(void)
Definition: PatchMap.C:432
Compute * compute(ComputeID cid)
Definition: ComputeMap.h:173
static ComputeMap * Object()
Definition: ComputeMap.h:91
LdbInfra * theLbdb
void printLocalLdbReport(void)
computeInfo * computeArray
int node(ComputeID cid)
Definition: ComputeMap.h:108
int numPids(ComputeID cid)
Definition: ComputeMap.C:101
#define COLL_MAX(F)
represents bonded compute
static void staticMigrateFn(LDObjHandle handle, int dest)
LdbMigrateMsg * next
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:107
void resume2(void)
LDObjHandle handle
patchInfo * patchArray
int node(int pid) const
Definition: PatchMap.h:114
LDBarrierClient ldBarrierHandle
static void staticResumeFromSync(void *data)
int32 PatchID
Definition: NamdTypes.h:287
static void staticStatsFn(LDOMHandle h, int state)
void updateComputes(int, CkGroupID)
Definition: ComputeMgr.C:142
processorInfo * processorArray
void ExecuteMigrations(void)
#define NAMD_BONDEDGPU_BONDS
void RecvMigrate(LdbMigrateMsg *)