22 #include "LdbCoordinator.decl.h"
31 #define MIN_DEBUG_LEVEL 3
42 #include "ComputeMgr.decl.h"
46 #if CONVERSE_VERSION_ELAN
47 extern "C" void enableBlockingReceives();
48 extern "C" void disableBlockingReceives();
57 #ifndef LB_MANAGER_VERSION
73 if ( msg->
to != CkMyPe() ) {
74 CProxy_LdbCoordinator ldbProxy(thisgroup);
75 ldbProxy[CkMyPe()].RecvMigrate(msg);
83 CkPrintf(
"I'm supposed to set stats\n");
88 CkPrintf(
"I'm supposed to query load\n");
94 #if CONVERSE_VERSION_ELAN
114 CkCallback cb(CkIndex_LdbCoordinator::nodeDone(NULL), 0, thisgroup);
115 contribute(0, NULL, CkReduction::random, cb);
120 if (CkpvAccess(LdbCoordinator_instance) == NULL) {
121 CkpvAccess(LdbCoordinator_instance) =
this;
123 NAMD_bug(
"LdbCoordinator instanced twice on same node!");
148 #ifndef LB_MANAGER_VERSION
159 #ifdef LB_MANAGER_VERSION
205 CkPrintf(
"LDB: Central LB being created...\n");
208 CkPrintf(
"LDB: Hybrid LB being created...\n");
259 NAMD_die(
"Disaggreement in patchMap data.\n");
268 #
if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
269 #if defined(NAMD_MIC)
277 #
if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
344 NAMD_bug(
"LdbCoordinator found too many local patches!");
355 if ( numComputes > oldNumComputes ) {
361 #
if (defined(NAMD_CUDA) || defined(NAMD_HIP) || defined(NAMD_MIC))
362 #if defined(NAMD_MIC)
370 #
if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
402 if ( ! c )
NAMD_bug(
"LdbCoordinator::initialize() null compute pointer");
407 #
if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(BONDED_CUDA)
436 if ( ! c )
NAMD_bug(
"LdbCoordinator::initialize() null compute pointer");
451 if ( ! c )
NAMD_bug(
"LdbCoordinator::initialize() null compute pointer 2");
562 #if 0 //replaced by traceBarrier at Controller and Sequencer
563 if (traceAvailable()) {
564 static int specialTracing = 0;
565 if (
ldbCycleNum == 1 && traceIsOn() == 0) specialTracing = 1;
566 if (specialTracing) {
588 if ((step % freq) != 0) dstep = freq - (step % freq);
592 if (step==0) numPressureCycles--;
614 CmiAssert(
id >=0 &&
id <
nPatches);
619 DebugM(10,
"::patchLoad() Unexpected patch reporting in\n");
637 iout <<
"LDB: ============= START OF LOAD BALANCING ============== " << CmiWallTimer() <<
"\n" <<
endi;
638 DebugM(3,
"Controller reached load balance barrier.\n");
642 CProxy_LdbCoordinator(thisgroup).barrier();
653 NAMD_bug(
"Load balancer received wrong number of events.\n");
662 iout <<
"LDB: ============== END OF LOAD BALANCING =============== " << CmiWallTimer() <<
"\n" <<
endi;
676 CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).
computeMgr);
687 if ( m->
to != CkMyPe() ) {
690 CProxy_LdbCoordinator ldbProxy(thisgroup);
691 ldbProxy[m->
to].ExpectMigrate(m);
699 if ( m->
from != CkMyPe() ) {
709 DebugM(3,
"updateComputesReady()\n");
711 CProxy_LdbCoordinator(thisgroup).resume();
712 CkStartQD(CkIndex_LdbCoordinator::resumeReady((CkQdMsg*)0),&thishandle);
728 iout <<
"LDB: =============== DONE WITH MIGRATION ================ " << CmiWallTimer() <<
"\n" <<
endi;
729 DebugM(3,
"resumeReady()\n");
732 CProxy_LdbCoordinator(thisgroup).resume2();
739 #if CONVERSE_VERSION_ELAN
775 for (
int i = 0; i < numNeighbors; ++i ) {
777 if ( proxyNode != myNode ) {
779 for ( j = 0; j < nProxyNodes; ++j ) {
780 if ( neighborNodes[j] == proxyNode )
break;
782 if ( j == nProxyNodes ) {
783 neighborNodes[nProxyNodes] = proxyNode;
796 CkPrintf(
"%d:Patch report:\n",CkMyPe());
804 curLoc += sprintf(curLoc,
"%5d: %5d ",i,
patchNAtoms[i]);
807 if (((j % 4) == 0) && j)
810 CkPrintf(
"[%d]%s\n",CkMyPe(),outputBuf);
815 CkPrintf(
"%d:Compute report:\n",CkMyPe());
829 fprintf(fp,
"%4d ",nProxyNodes);
831 for(
int i=0;i<nProxyNodes;i++)
832 fprintf(fp,
"%4d ",neighborNodes[i]);
836 CProxy_LdbCoordinator(thisgroup)[0].collectLoads(msg);
841 if ( collPes == 0 ) {
843 initTotalProxies = 0;
844 finalTotalProxies = 0;
845 initMaxPeProxies = 0;
846 finalMaxPeProxies = 0;
847 initMaxPatchProxies = 0;
848 finalMaxPatchProxies = 0;
860 #define COLL_MAX(F) if ( msg->F > F ) F = msg->F;
861 #define COLL_AVG(F) F += msg->F * (double) numPes / (double) CkNumPes();
862 #define COLL_SUM(F) F += msg->F;
881 if ( collPes == CkNumPes() ) {
883 iout <<
"LDB: TIME " << initTime <<
" LOAD: AVG " << initAvgPeLoad
884 <<
" MAX " << initMaxPeLoad <<
" PROXIES: TOTAL " << initTotalProxies <<
" MAXPE " <<
885 initMaxPeProxies <<
" MAXPATCH " << initMaxPatchProxies <<
" " <<
"None"
886 <<
" MEM: " << initMemory <<
" MB\n";
887 if ( reverted )
iout <<
"LDB: Reverting to original mapping on " << reverted <<
" balancers\n";
888 iout <<
"LDB: TIME " << finalTime <<
" LOAD: AVG " << finalAvgPeLoad
889 <<
" MAX " << finalMaxPeLoad <<
" PROXIES: TOTAL " << finalTotalProxies <<
" MAXPE " <<
890 finalMaxPeProxies <<
" MAXPATCH " << finalMaxPatchProxies <<
" " << msg->
strategyName
891 <<
" MEM: " << finalMemory <<
" MB\n";
899 #include "LdbCoordinator.def.h"
void sendCollectLoads(CollectLoadsMsg *)
void LdbCoordinator_initproc()
represents bonded compute
Controller * controllerThread
void collectLoads(CollectLoadsMsg *)
void resumeReady(CkQdMsg *msg)
static PatchMap * Object()
Sequencer ** sequencerThreads
static __thread ComputeMgr * computeMgr
SimParameters * simParameters
int nStatsMessagesExpected
LDObjHandle * patchHandles
void updateComputesReady()
void AtSyncBarrierReached(void)
void createLoadBalancer()
std::ostream & endi(std::ostream &s)
void Migrate(LDObjHandle handle, int dest)
void printRequiredProxies(PatchID id, FILE *fp)
HomePatch * homePatch(PatchID pid)
int basenode(int pid) const
represents nonbonded or self compute
void awakenSequencers(void)
void patchLoad(PatchID id, int nAtoms, int timestep)
void ResumeFromSync(void)
void initialize(PatchMap *pmap, ComputeMap *cmap, int reinit=0)
LdbMigrateMsg * migrateMsgs
void CreateNamdHybridLB()
void NAMD_bug(const char *err_msg)
ComputeType type(ComputeID cid)
void rebalance(Sequencer *seq, PatchID id)
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
static LdbCoordinator * Object()
static void staticQueryEstLoadFn(LDOMHandle h)
void ExpectMigrate(LdbMigrateMsg *)
void nodeDone(CkReductionMsg *)
#define LDBAL_CENTRALIZED
static void staticReceiveAtSync(void *data)
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
int numPatches(void) const
Compute * compute(ComputeID cid)
static ComputeMap * Object()
void printLocalLdbReport(void)
computeInfo * computeArray
int requiredProxies(PatchID id, int[])
int multigratorPressureFreq
int nStatsMessagesReceived
int numPids(ComputeID cid)
static void staticMigrateFn(LDObjHandle handle, int dest)
int pid(ComputeID cid, int i)
LDBarrierClient ldBarrierHandle
static void staticResumeFromSync(void *data)
static void staticStatsFn(LDOMHandle h, int state)
void updateComputes(int, CkGroupID)
processorInfo * processorArray
void ExecuteMigrations(void)
void RecvMigrate(LdbMigrateMsg *)