8 #if !defined(WIN32) || defined(__CYGWIN__) 34 #include "NamdHybridLB.def.h" 40 #ifdef MEM_OPT_VERSION 50 int seqno = LdbInfra::Object()->getLoadbalancerTicket();
51 CProxy_NamdHybridLB::ckNew(CkLBOptions(seqno));
55 if (CkMyPe() == 0 &&
cpuloads == NULL) {
58 for (
int i=0; i<CkNumPes(); i++)
cpuloads[i] = 0.0;
68 lbname = (
char *)
"NamdHybridLB";
72 if (CkNumPes() <=
simParams->hybridGroupSize) {
73 tree =
new TwoLevelTree;
76 tree =
new ThreeLevelTree(
simParams->hybridGroupSize);
79 statsStrategy = SHRINK_NULL;
83 thisProxy = CProxy_NamdHybridLB(thisgroup);
95 processorArray = NULL;
98 splitComputesMsgs = 0;
109 bool NamdHybridLB::QueryBalanceNow(
int _step){
117 bool NamdHybridLB::QueryDumpData() {
131 LBVectorMigrateMsg* NamdHybridLB::VectorStrategy(LDStats* stats){
132 CkPrintf(
"[%d] Using Vector Strategy to balance the load\n",CkMyPe());
133 LBVectorMigrateMsg* msg =
new(0,0) LBVectorMigrateMsg;
135 msg->level = currentLevel;
143 CLBMigrateMsg* NamdHybridLB::Strategy(LDStats* stats)
149 if(currentLevel == 1){
150 LevelData *lData = levelData[currentLevel];
153 msg = GrpLevelStrategy(stats);
157 newMsg->
n_moves = msg->n_moves;
159 newMsg->
endPE = endPE;
160 for(i=0; i<msg->n_moves; i++){
161 newMsg->
moves[i] = msg->moves[i];
163 for(i=0; i<endPE-startPE+1; i++){
167 thisProxy[0].UpdateLocalLBInfo(newMsg);
170 dummyLB->
work(stats);
171 return createMigrateMsg(stats);
183 children = tree->numNodes(currentLevel);
191 if (msg->
moves[i].to_pe != -1)
197 for(i=msg->
startPE; i<=msg->endPE; i++){
203 if(updateCount == children){
210 if(updateFlag && collectFlag){
213 thisProxy[parent_backup].CollectInfo(loc_backup, n_backup, fromlevel_backup);
220 const int children = tree->numNodes(1);
222 if ( ! splitComputesMsgs ) {
226 splitComputesMsgs[splitCount] = msg;
228 if ( ++splitCount == children ) {
234 double maxUnsplit = 0.;
236 double avgCompute = 0.;
237 double maxCompute = 0.;
238 int maxComputeId = -1;
239 int nMoveableComputes = 0;
240 int numPesAvailable = 0;
243 for (
int j=0; j < children; ++j ) {
255 if ( nMoveableComputes ) avgCompute /= nMoveableComputes;
else avgCompute = 0.;
257 CkPrintf(
"LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
259 CkPrintf(
"LDB: Average compute %f is %.1f%% of average load %f\n",
263 for (
int j=0; j < children; ++j ) {
264 delete splitComputesMsgs[j];
270 #if defined(NAMD_CUDA) || defined(NAMD_HIP) 276 int totalAddedParts = 0;
278 if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
279 if (
simParams->ldbRelativeGrainsize > 0. ) {
282 CkPrintf(
"LDB: Partitioning computes with target load %f\n", maxCompute);
284 for (
int j=0; j < children; ++j ) {
286 for (
int i=0; i < msg->
n; ++i) {
287 int nparts = (int) ceil(msg->
load[i] / maxCompute);
288 if ( nparts > maxParts ) nparts = maxParts;
289 if ( nparts < 1 ) nparts = 1;
291 totalAddedParts += nparts - 1;
296 CkPrintf(
"LDB: Increased migratable compute count from %d to %d\n",
297 nMoveableComputes,nMoveableComputes+totalAddedParts);
298 CkPrintf(
"LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
308 CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
309 int numProcessors = stats->nprocs();
313 const int numGroupComputes = stats->n_migrateobjs;
316 if ( ! processorArray ) processorArray =
new processorInfo[numProcessors];
318 if ( ! patchArray ) patchArray =
new patchInfo[numPatches];
319 if ( ! computeArray ) computeArray =
new computeInfo[numGroupComputes];
320 if ( ! from_procs ) from_procs =
new int[numGroupComputes];
322 int nMoveableComputes = buildData(stats);
323 CmiAssert(nMoveableComputes <= numGroupComputes);
327 #define DUMP_LDBDATA 1 328 #define LOAD_LDBDATA 1 332 dumpDataASCII(
"ldbd_before", numProcessors, numPatches, nMoveableComputes);
334 loadDataASCII(
"ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
348 for (i=0; i<nMoveableComputes; i++) {
349 double load = computeArray[i].
load;
351 if ( load > maxCompute ) { maxCompute = load; maxi = i; }
353 avgCompute = total / nMoveableComputes;
354 maxComputeId = maxi < 0 ? -1 :
LdbIdField(computeArray[maxi].handle.id, 0);
356 int P = stats->nprocs();
358 for (i=0; i<P; i++) {
359 if (processorArray[i].available) {
364 if (numPesAvailable == 0)
365 NAMD_die(
"No processors available for load balancing!\n");
371 double maxUnsplit = 0.;
374 for (
int i=0; i<nMoveableComputes; i++) {
375 const int cid =
LdbIdField(computeArray[i].handle.id, 0);
377 const double load = computeArray[i].
load;
378 if ( load > maxUnsplit ) maxUnsplit = load;
398 for (
int i=0; i<nMoveableComputes; i++) {
400 const int cid =
LdbIdField(computeArray[i].handle.id, 0);
404 msg->
cid[i_split] = cid;
405 msg->
load[i_split] = computeArray[i].
load;
410 thisProxy[0].splitComputes(msg);
417 TorusLB(computeArray, patchArray, processorArray,
418 nMoveableComputes, numPatches, numProcessors);
421 nMoveableComputes, numPatches, numProcessors, 1);
423 TorusLB(computeArray, patchArray, processorArray,
424 nMoveableComputes, numPatches, numProcessors);
427 nMoveableComputes, numPatches, numProcessors, 1);
429 NAMD_die(
"Old load balancer strategy is not compatible with hybrid balancer.");
431 Alg7(computeArray, patchArray, processorArray,
432 nMoveableComputes, numPatches, numProcessors);
434 RefineOnly(computeArray, patchArray, processorArray,
435 nMoveableComputes, numPatches, numProcessors);
438 #if LDB_DEBUG && USE_TOPOMAP 440 int pe1, pe2, pe3, hops=0;
451 for (
int i=0; i<numPatches; i++) {
458 hops += tmgr.getHopsBetweenRanks(pe1, pe2);
462 CkPrintf(
"Load Balancing: Number of Hops: %d\n", hops);
466 dumpDataASCII(
"ldbd_after", numProcessors, numPatches, nMoveableComputes);
468 dumpDataASCII(
"ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
477 int* computeCount =
new int[numProcessors];
478 for(i=0; i<numProcessors; i++)
480 for(i=0; i<nMoveableComputes; i++)
481 computeCount[computeArray[i].processor]++;
482 for(i=0; i<numProcessors; i++) {
483 if (computeCount[i]==0)
484 iout <<
iINFO <<
"Warning: Processor " << i
485 <<
" has NO moveable computes.\n" <<
endi;
487 delete [] computeCount;
490 CkVec<MigrateInfo *> migrateInfo;
491 for(i=0;i<nMoveableComputes;i++) {
492 if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
496 MigrateInfo *migrateMe =
new MigrateInfo;
497 migrateMe->obj = computeArray[i].
handle;
499 int frompe = from_procs[i];
500 if (frompe == numProcessors)
503 frompe = frompe + stats->procs[0].pe;
504 migrateMe->from_pe = frompe;
505 migrateMe->to_pe = computeArray[i].
processor;
510 obj.handle = computeArray[i].
handle;
511 thisProxy[computeArray[i].
processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
513 migrateInfo.push_back(migrateMe);
518 computeArray[i].processor);
524 msg = createMigrateMsg(migrateInfo, numProcessors);
526 peLoads =
new double [numProcessors];
527 startPE = processorArray[0].
Id;
528 endPE = processorArray[numProcessors-1].
Id;
530 for (i=0; i<numProcessors; i++) {
531 peLoads[i] = processorArray[i].
load;
535 delete [] from_procs;
536 delete [] processorArray;
537 delete [] patchArray;
538 delete [] computeArray;
541 processorArray = NULL;
549 void NamdHybridLB::dumpDataASCII(
char *file,
int numProcessors,
550 int numPatches,
int numComputes)
553 sprintf(filename,
"%s_%d.%d", file, CkMyPe(), step());
554 FILE* fp = fopen(filename,
"w");
556 perror(
"dumpLDStatsASCII");
560 fprintf(fp,
"%d %d %d\n",numProcessors,numPatches,numComputes);
563 for(i=0;i<numProcessors;i++) {
568 for(i=0;i < numPatches; i++) {
573 for(i=0; i < numComputes; i++) {
581 for (i=0; i< numProcessors; i++) {
583 fprintf(fp,
"%d %d: ", i, num);
588 fprintf(fp,
"%d ", p->
Id);
589 p = (
patchInfo *)processorArray[i].proxies.
595 for (i=0; i<numPatches; i++) {
597 fprintf(fp,
"%d %d: ", i, num);
602 fprintf(fp,
"%d ", p->
Id);
617 int NamdHybridLB::buildData(LDStats* stats) {
618 int n_pes = stats->nprocs();
630 int unLoadZero =
simParams->ldbUnloadZero;
632 int unLoadIO=
simParams->ldbUnloadOutputPEs;
635 for (i=0; i<n_pes; ++i) {
636 pe_no = stats->procs[i].pe;
639 processorArray[i].
Id = pe_no;
643 processorArray[i].
backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
646 processorArray[i].
backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
648 processorArray[i].
backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
650 processorArray[i].
idleTime = stats->procs[i].idletime;
655 if(stats->procs[0].pe == 0) {
656 if(unLoadZero) processorArray[0].
available =
false;
657 if(unLoadOne) processorArray[1].
available =
false;
661 if (pmeOn && unLoadPme) {
662 for (i=0; i<n_pes; i++) {
671 if (pmeOn && unLoadPme) {
672 for (i=0; i<n_pes; i++) {
674 processorArray[i].available =
false;
679 #ifdef MEM_OPT_VERSION 681 if (
simParams->numoutputprocs == n_pes) {
687 for (i=0; i<n_pes; i++) {
689 processorArray[i].available =
false;
696 int totalLocalProxies = 0;
697 int totalProxies = 0;
698 for (
int pid=0; pid<numPatches; ++pid ) {
701 patchArray[pid].
Id = pid;
705 const int numProxies =
706 #if 0 // USE_TOPOMAP - this function needs to be there for the hybrid case 707 requiredProxiesOnProcGrid(pid,neighborNodes);
709 requiredProxies(pid, neighborNodes);
712 int numLocalProxies = 0;
713 for (
int k=0; k<numProxies; k++) {
714 if( (neighborNodes[k] >= stats->procs[0].pe) && (neighborNodes[k] <= stats->procs[n_pes-1].pe) ){
716 int index = neighborNodes[k] - stats->procs[0].pe;
722 if ( numLocalProxies ) {
723 CkPrintf(
"LDB Pe %d patch %d has %d local of %d total proxies\n",
724 CkMyPe(), pid, numLocalProxies, numProxies);
727 totalLocalProxies += numLocalProxies;
728 totalProxies += numProxies;
731 CkPrintf(
"LDB Pe %d has %d local of %d total proxies\n",
732 CkMyPe(), totalLocalProxies, totalProxies);
735 int nMoveableComputes=0;
741 const auto nObjs = stats->objData.size();
742 for(j=0; j < nObjs; j++) {
743 const LDObjData &this_obj = stats->objData[j];
744 int frompe = stats->from_proc[j];
747 if (this_obj.omID().id.idx != 1) {
760 }
else if (this_obj.migratable && this_obj.wallTime != 0.) {
763 const int p0 = computeMap->
pid(cid,0);
767 if (computeMap->
numPids(cid) > 1)
768 p1 = computeMap->
pid(cid,1);
770 computeArray[nMoveableComputes].
Id = cid;
772 if (frompe >= n_pes) {
773 CkPrintf(
"assigning random old processor...this looks broken\n");
774 computeArray[nMoveableComputes].
oldProcessor = CrnRand()%n_pes + stats->procs[0].pe;
777 computeArray[nMoveableComputes].
oldProcessor = frompe + stats->procs[0].pe;
779 from_procs[nMoveableComputes] = frompe;
783 int index = computeArray[nMoveableComputes].
oldProcessor - stats->procs[0].pe;
784 processorArray[index].
computeLoad += this_obj.wallTime;
785 computeArray[nMoveableComputes].
processor = -1;
786 computeArray[nMoveableComputes].
patch1 = p0;
787 computeArray[nMoveableComputes].
patch2 = p1;
788 computeArray[nMoveableComputes].
handle = this_obj.handle;
789 computeArray[nMoveableComputes].
load = this_obj.wallTime;
794 for (i=0; i<n_pes; i++) {
798 return nMoveableComputes;
802 int NamdHybridLB::requiredProxies(
PatchID id,
int neighborNodes[])
805 int myNode = patchMap->
node(
id);
808 #define IF_NEW_NODE \ 810 for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \ 811 if ( j == nProxyNodes ) 816 for (
int i = 0; i < numNeighbors; ++i ) {
817 const int proxyNode = patchMap->
basenode(neighbors[i]);
818 if ( proxyNode != myNode ) {
820 neighborNodes[nProxyNodes] = proxyNode;
834 int numNodes = CkNumPes();
836 int emptyNodes = numNodes - numPatches;
837 if ( emptyNodes > numPatches ) {
838 int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
840 if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
841 int proxyNode = (myNode + 1) % numNodes;
842 while ( nProxyNodes < nodesPerPatch &&
844 if ( proxyNode != myNode ) {
846 neighborNodes[nProxyNodes] = proxyNode;
850 proxyNode = (proxyNode + 1) % numNodes;
852 proxyNode = (myNode - 1 + numNodes) % numNodes;
853 while ( nProxyNodes < nodesPerPatch &&
855 if ( proxyNode != myNode ) {
857 neighborNodes[nProxyNodes] = proxyNode;
861 proxyNode = (proxyNode - 1 + numNodes) % numNodes;
863 proxyNode = (myNode + 1) % numNodes;
865 while ( nProxyNodes < nodesPerPatch ) {
868 neighborNodes[nProxyNodes] = proxyNode;
872 proxyNode = (proxyNode + 1) % numNodes;
873 count ++;
if (count == numNodes)
break;
876 int proxyNode = myNode - 1;
878 if ( proxyNode != myNode ) {
880 neighborNodes[nProxyNodes] = proxyNode;
885 proxyNode = myNode + 1;
886 if ( proxyNode < numNodes && ! patchMap->numPatchesOnNode(proxyNode) ) {
887 if ( proxyNode != myNode ) {
889 neighborNodes[nProxyNodes] = proxyNode;
void UpdateLocalLBInfo(LocalLBInfoMsg *msg)
NamdDummyLB * AllocateNamdDummyLB()
std::ostream & iINFO(std::ostream &s)
represents bonded compute
NamdCentLB * AllocateNamdCentLB()
void setNewNumPartitions(ComputeID cid, char numPartitions)
static PatchMap * Object()
SimParameters * simParameters
#define LDBSTRAT_REFINEONLY
std::ostream & endi(std::ostream &s)
static double averageLoad
NamdHybridLB(const CkLBOptions &opt)
Default constructor.
static Units next(Units u)
void splitComputes(SplitComputesMsg *)
int numPatches(void) const
void CreateNamdHybridLB()
int numPartitions(ComputeID cid)
void setNewNode(ComputeID cid, NodeID node)
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
static LdbCoordinator * Object()
int basenode(int pid) const
#define LDBSTRAT_COMPREHENSIVE
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
void work(LDStats *stats)
static ComputeMap * Object()
int numPids(ComputeID cid)
int numPatchesOnNode(int node)
void unchecked_insert(InfoRecord *)
int pid(ComputeID cid, int i)
int isOutputProcessor(int pe)