#if !defined(WIN32) || defined(__CYGWIN__)

#include "NamdCentLB.def.h"

  int seqno = LdbInfra::Object()->getLoadbalancerTicket();
  loadbalancer = CProxy_NamdCentLB::ckNew(CkLBOptions(seqno));

  if (CkMyRank() == 0 && cpuloads == NULL) {
    cpuloads = new double[CkNumPes()];   // allocate before zero-filling
    for (int i=0; i<CkNumPes(); i++)
      cpuloads[i] = 0.0;
  }

  return new NamdCentLB((CkMigrateMessage*)NULL);
bool NamdCentLB::QueryBalanceNow(int _step)

bool NamdCentLB::QueryDumpData()
  int numProcessors = stats->nprocs();

  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
  if ( ! computeArray ) computeArray = new computeInfo[numComputes];

  int nMoveableComputes = buildData(stats);
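  // Debug hooks follow: with DUMP_LDBDATA / LOAD_LDBDATA enabled, the
  // balancer input is written to (or re-read from) an ASCII snapshot via
  // dumpDataASCII / loadDataASCII, which appears intended for replaying a
  // balancing step offline.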
#define DUMP_LDBDATA 1
#define LOAD_LDBDATA 1

  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
  double avgCompute = 0.;
  if ( nMoveableComputes ) {
    double total = 0.;      // running sum of moveable compute loads
    double maxCompute = 0.;
    int maxi = 0;           // index of the most expensive compute
    for (i=0; i<nMoveableComputes; i++) {
      double load = computeArray[i].load;
      total += load;
      if ( load > maxCompute ) { maxCompute = load; maxi = i; }
    }
    avgCompute = total / nMoveableComputes;
    int P = stats->nprocs();
    int numPesAvailable = 0;
    for (i=0; i<P; i++) {
      if (processorArray[i].available) {
        ++numPesAvailable;
      }
    }

    if (numPesAvailable == 0)
      NAMD_die("No processors available for load balancing!\n");
    CkPrintf("LDB: Largest compute %d load %f is %.1f%% of average load %f\n",

    CkPrintf("LDB: Average compute %f is %.1f%% of average load %f\n",
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

    int totalAddedParts = 0;

    if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
    if ( simParams->ldbRelativeGrainsize > 0. ) {

    CkPrintf("LDB: Partitioning computes with target load %f\n", maxCompute);
    double maxUnsplit = 0.;
    for (int i=0; i<nMoveableComputes; i++) {
      const int cid = LdbIdField(computeArray[i].handle.id, 0);
      const double load = computeArray[i].load;
      if ( load > maxUnsplit ) maxUnsplit = load;
      int nparts = (int) ceil(load / maxCompute);
      if ( nparts > maxParts ) nparts = maxParts;
      if ( nparts < 1 ) nparts = 1;
      if ( 0 && nparts > 1 ) {
        CkPrintf("LDB: Partitioning compute %d with load %f by %d\n",
      totalAddedParts += nparts - 1;
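      // Illustrative sketch of the partition-count rule in the loop above
      // (hypothetical helper, shown here only as a commented example):
      //
      //   static int partsFor(double load, double maxCompute, int maxParts) {
      //     int nparts = (int) ceil(load / maxCompute);
      //     if ( nparts > maxParts ) nparts = maxParts;
      //     if ( nparts < 1 ) nparts = 1;
      //     return nparts;          // e.g. partsFor(5.2, 2.0, 10) == 3
      //   }
      //
      // Each compute therefore contributes nparts - 1 additional migratable
      // parts toward totalAddedParts.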
    CkPrintf("LDB: Increased migratable compute count from %d to %d\n",
             nMoveableComputes, nMoveableComputes+totalAddedParts);
    CkPrintf("LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
    TorusLB(computeArray, patchArray, processorArray,
            nMoveableComputes, numPatches, numProcessors);

    RefineTorusLB(computeArray, patchArray, processorArray,
            nMoveableComputes, numPatches, numProcessors, 1);

    TorusLB(computeArray, patchArray, processorArray,
            nMoveableComputes, numPatches, numProcessors);

    RefineTorusLB(computeArray, patchArray, processorArray,
            nMoveableComputes, numPatches, numProcessors, 1);

    Alg7(computeArray, patchArray, processorArray,
         nMoveableComputes, numPatches, numProcessors);

    RefineOnly(computeArray, patchArray, processorArray,
               nMoveableComputes, numPatches, numProcessors);
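    // Strategy dispatch (hedged summary of this excerpt): depending on the
    // load-balancing strategy in effect and the balancing step, either the
    // torus-aware placement (TorusLB), its refinement pass (RefineTorusLB,
    // note the extra trailing argument), or the older Alg7 / RefineOnly pair
    // is invoked on the same compute/patch/processor arrays.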
#if LDB_DEBUG && USE_TOPOMAP

    int pe1, pe2, pe3, hops=0;

    for (int i=0; i<numPatches; i++) {
      hops += tmgr.getHopsBetweenRanks(pe1, pe2);

    CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
    dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
    dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
    int* computeCount = new int[numProcessors];
    for(i=0; i<numProcessors; i++)
      computeCount[i] = 0;
    for(i=0; i<nMoveableComputes; i++)
      computeCount[computeArray[i].processor]++;
    for(i=0; i<numProcessors; i++) {
      if (computeCount[i]==0)
        iout << iINFO << "Warning: Processor " << i
             << " has NO moveable computes.\n" << endi;
    }
    delete [] computeCount;
    std::vector<MigrateInfo *> migrateInfo;
    for(i=0;i<nMoveableComputes;i++) {
      if (computeArray[i].processor != computeArray[i].oldProcessor) {
        MigrateInfo *migrateMe = new MigrateInfo;
        migrateMe->obj = computeArray[i].handle;
        migrateMe->from_pe = computeArray[i].oldProcessor;
        migrateMe->to_pe = computeArray[i].processor;
        migrateInfo.push_back(migrateMe);
    const int migrate_count = migrateInfo.size();

    CLBMigrateMsg* msg = new(migrate_count,CkNumPes(),CkNumPes(),0) CLBMigrateMsg;

    msg->n_moves = migrate_count;
    for(i=0; i < migrate_count; i++) {
      MigrateInfo* item = migrateInfo[i];
      msg->moves[i] = *item;
      delete item;
      migrateInfo[i] = nullptr;
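      // Each MigrateInfo copied into the variable-sized CLBMigrateMsg records
      // the Charm++ object handle plus its source and destination PEs; the
      // message, sized for migrate_count moves, is what the central load
      // balancer framework acts on (hedged reading of this excerpt).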
    for (i=0; i<numProcessors; i++) {

    delete [] processorArray;
    delete [] patchArray;
    delete [] computeArray;

    processorArray = NULL;
void NamdCentLB::dumpDataASCII(char *file, int numProcessors,
                               int numPatches, int numComputes)

  sprintf(filename, "%s.%d", file, step());
  FILE* fp = fopen(filename, "w");
    perror("dumpLDStatsASCII");
  CkPrintf("***** DUMP data to file: %s ***** \n", filename);
  fprintf(fp, "%d %d %d\n", numProcessors, numPatches, numComputes);
  for(i=0;i<numProcessors;i++) {
  for(i=0;i < numPatches; i++) {
  for(i=0; i < numComputes; i++) {

  for (i=0; i< numProcessors; i++) {
    fprintf(fp, "%d %d: ", i, num);
      fprintf(fp, "%d ", p->Id);
      p = (patchInfo *)processorArray[i].proxies.

  for (i=0; i<numPatches; i++) {
    fprintf(fp, "%d %d: ", i, num);
      fprintf(fp, "%d ", p->Id);
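  // Dump format, as far as this excerpt shows: a single header line
  //   <numProcessors> <numPatches> <numComputes>
  // then one record per processor, patch and compute, followed by lists of
  // the form
  //   <index> <count>: <id> <id> ...
  // for the patches/proxies associated with each processor and each patch.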
void NamdCentLB::loadDataASCII(char *file, int &numProcessors,
                               int &numPatches, int &numComputes)

  sprintf(filename, "%s", file);
  CkPrintf("***** Load ascii data from file: %s ***** \n", filename);
  FILE* fp = fopen(filename, "r");
    perror("loadDataASCII");

  fscanf(fp, "%d %d %d", &numProcessors, &numPatches, &numComputes);
  printf("numProcs: %d numPatches: %d numComputes: %d\n",
         numProcessors, numPatches, numComputes);
  delete [] processorArray;
  delete [] patchArray;
  delete [] computeArray;

  for(i=0;i<numProcessors;i++) {
    if (p->Id != i) CmiAbort("Reading processorArray error!");

  for(i=0;i < numPatches; i++) {
      CmiAbort("Reading patchArray error!");

  for(i=0; i < numComputes; i++) {
      CmiAbort("Reading computeArray error!");

  for (i=0; i< numProcessors; i++) {
    fscanf(fp, "%d %d: ", &curp, &num);
      CmiAbort("Reading patchsSet error!");
    for (int j=0; j<num; j++) {

  for (i=0; i<numPatches; i++) {
    fscanf(fp, "%d %d: ", &curp, &num);
      CmiAbort("Reading proxiesOn error!");
    for (int j=0; j<num; j++) {
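  // loadDataASCII mirrors dumpDataASCII: it re-reads the header, the
  // per-entity records, and the "<index> <count>: ..." lists from the same
  // ASCII snapshot, aborting via CmiAbort on any mismatch.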
#ifdef MEM_OPT_VERSION

#if defined(NAMD_MIC)
extern int isMICProcessor(int);
int NamdCentLB::buildData(LDStats* stats)

  int n_pes = stats->nprocs();

  int unLoadZero = simParams->ldbUnloadZero;
  int unLoadIO = simParams->ldbUnloadOutputPEs;

  for (i=0; i<n_pes; ++i) {
    processorArray[i].Id = i;
      processorArray[i].backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
      processorArray[i].backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
      processorArray[i].backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
    processorArray[i].idleTime = stats->procs[i].idletime;
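    // Background-load bookkeeping (hedged reading): each PE's measured
    // bg_walltime is scaled by one of several factors (pmebgfactor,
    // homebgfactor, or the generic bgfactor), presumably chosen by whether
    // the PE carries PME work, home patches, or neither, and its idle time
    // is recorded alongside.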
  if (unLoadZero) processorArray[0].available = false;
  if (unLoadOne) processorArray[1].available = false;

  if (pmeOn && unLoadPme) {
    for (i=0; i<n_pes; i++) {

  if (pmeOn && unLoadPme) {
    for (i=0; i<n_pes; i++) {
        processorArray[i].available = false;
#ifdef MEM_OPT_VERSION
  if (simParams->numoutputprocs == n_pes) {
    for (i=0; i<n_pes; i++) {

#if defined(NAMD_MIC)
  for (i = 0; i < n_pes; i++) {
    if (isMICProcessor(i) != 0) { processorArray[i].available = false; }
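  // Availability filtering, summarized from the flags above: PE 0 (and
  // optionally PE 1), PME PEs, output PEs and MIC device PEs can each be
  // marked unavailable so the strategies never place migratable computes
  // on them.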
  int nMoveableComputes = 0;
  int nIdleComputes = 0;

  const auto nObjs = stats->objData.size();
  for (j=0; j < nObjs; j++) {
    const LDObjData &this_obj = stats->objData[j];
    int frompe = stats->from_proc[j];

    if (this_obj.omID().id.idx != 1) {
      processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;

      patchArray[pid].Id = pid;
      patchArray[pid].processor = stats->from_proc[j];
      const int numProxies =
        requiredProxiesOnProcGrid(pid, neighborNodes);
        requiredProxies(pid, neighborNodes);
      nProxies += numProxies;

      for (int k=0; k<numProxies; k++) {
      processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
      processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
    } else if (this_obj.migratable) {
      if ( this_obj.wallTime == 0. ) {

      const int p0 = computeMap->pid(cid,0);
      if (computeMap->numPids(cid) > 1)
        p1 = computeMap->pid(cid,1);

      computeArray[nMoveableComputes].Id = cid;
      computeArray[nMoveableComputes].oldProcessor = stats->from_proc[j];
      processorArray[stats->from_proc[j]].computeLoad += this_obj.wallTime;
      computeArray[nMoveableComputes].processor = -1;
      computeArray[nMoveableComputes].patch1 = p0;
      computeArray[nMoveableComputes].patch2 = p1;
      computeArray[nMoveableComputes].handle = this_obj.handle;
      computeArray[nMoveableComputes].load = this_obj.wallTime;

      processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
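      // Each migratable compute becomes one computeInfo record: the compute
      // id, the PE it currently runs on (oldProcessor), processor = -1
      // meaning "not yet placed", its one or two attached patches, the
      // Charm++ object handle, and the measured wall-time load that the
      // strategies balance.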
  CkPrintf("LDB: %d computes have load of zero\n", nIdleComputes);
  for (i=0; i<n_pes; i++) {

  return nMoveableComputes;
int NamdCentLB::requiredProxies(PatchID id, int neighborNodes[])

  int myNode = patchMap->node(id);

#define IF_NEW_NODE \
    for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
    if ( j == nProxyNodes )

  for ( int i = 0; i < numNeighbors; ++i ) {
    const int proxyNode = patchMap->basenode(neighbors[i]);
    if ( proxyNode != myNode ) {
      neighborNodes[nProxyNodes] = proxyNode;
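      // IF_NEW_NODE (defined above) is a small dedup idiom: it scans the
      // neighborNodes[] collected so far and lets the following statement run
      // only when proxyNode has not been recorded yet, so each node gets at
      // most one proxy entry for this patch.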
  int numPes = CkNumPes();

  int emptyNodes = numPes - numPatches;
  if ( emptyNodes > numPatches ) {
    int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
    if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
    int proxyNode = (myNode + 1) % numPes;
    while ( nProxyNodes < nodesPerPatch &&
      if ( proxyNode != myNode ) {
        neighborNodes[nProxyNodes] = proxyNode;
      proxyNode = (proxyNode + 1) % numPes;

    proxyNode = (myNode - 1 + numPes) % numPes;
    while ( nProxyNodes < nodesPerPatch &&
      if ( proxyNode != myNode ) {
        neighborNodes[nProxyNodes] = proxyNode;
      proxyNode = (proxyNode - 1 + numPes) % numPes;

    proxyNode = (myNode + 1) % numPes;
    while ( nProxyNodes < nodesPerPatch ) {
        neighborNodes[nProxyNodes] = proxyNode;
      proxyNode = (proxyNode + 1) % numPes;
      count++;
      if (count == numPes) break;
    int proxyNode = myNode - 1;
    if ( proxyNode != myNode ) {
      neighborNodes[nProxyNodes] = proxyNode;

    proxyNode = myNode + 1;
    if ( proxyNode < numPes && ! patchMap->numPatchesOnNode(proxyNode) ) {
      if ( proxyNode != myNode ) {
        neighborNodes[nProxyNodes] = proxyNode;
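  // Proxy spreading, as this excerpt suggests: when there are more PEs than
  // patches, extra proxies for each patch are pushed onto otherwise empty
  // nodes by walking outward from myNode in both directions, capped at
  // nodesPerPatch; otherwise only the immediate neighbors myNode - 1 and
  // myNode + 1 are considered, and only if they host no patches.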
int NamdCentLB::requiredProxiesOnProcGrid(PatchID id, int neighborNodes[])

  enum proxyHere { No, Yes };
  int numPes = CkNumPes();
  proxyHere *proxyNodes = new proxyHere[numPes];

  int xsize = 0, ysize = 0, zsize = 0, tsize = 0;
  int my_x = 0, my_y = 0, my_z = 0, my_t = 0;

  int myNode = patchMap->node(id);

  xsize = tmgr.getDimNX();
  ysize = tmgr.getDimNY();
  zsize = tmgr.getDimNZ();
  tsize = tmgr.getDimNT();

  tmgr.rankToCoordinates(myNode, my_x, my_y, my_z, my_t);

  if(xsize * ysize * zsize * tsize != CkNumPes()) {
    delete [] proxyNodes;
    return requiredProxies(id, neighborNodes);
  }

  for ( i = 0; i < numPes; ++i )
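  // The loop above presumably initializes every proxyNodes[] entry to No;
  // earlier, when the torus dimensions reported by the topology manager do
  // not multiply out to the total PE count, the routine falls back to the
  // generic requiredProxies() path instead of grid-aware placement.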
  bool smallFlag = false;
  double pnodes = CkNumPes();

  smallFlag = (patchMap->numPatches() > pnodes) ? 1 : 0;

  for ( i = 1; i < numNeighbors; ++i )
    int proxyNode = patchMap->basenode(neighbors[i]);
    if (proxyNode != myNode)
      if (proxyNodes[proxyNode] == No)
        proxyNodes[proxyNode] = Yes;
        neighborNodes[nProxyNodes] = proxyNode;

  delete [] proxyNodes;

  int emptyNodes = numPes - numPatches;
    int nodesPerPatch = nProxyNodes + 4 * (emptyNodes-1) / numPatches + 1;
  int proxy_x = 0, proxy_y = 0, proxy_z = 0;

  for(k=-1; k <= 1; k++) {
    proxy_z = (my_z + k + zsize) % zsize;
    for(j=-1; j <= 1; j++) {
      proxy_y = (my_y + j + ysize) % ysize;
      for(i = -1; i <= 1; i++) {
        proxy_x = (my_x + i + xsize) % xsize;
        for(l = 0; l < tsize; l++) {
          if(i == 0 && j == 0 && k == 0 && l == 0)

          proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
             proxyNodes[proxyNode] == No) {
            proxyNodes[proxyNode] = Yes;
            neighborNodes[nProxyNodes] = proxyNode;

        if(nProxyNodes >= nodesPerPatch ||
      if(nProxyNodes >= nodesPerPatch ||
    if(nProxyNodes >= nodesPerPatch ||
  if(nProxyNodes >= nodesPerPatch ||
  for(k=-2; k <= 2; k+=2) {
    proxy_z = (my_z + k + zsize) % zsize;
    for(j=-2; j <= 2; j+=2) {
      proxy_y = (my_y + j + ysize) % ysize;
      for(i = -2; i <= 2; i+=2) {
        proxy_x = (my_x + i + xsize) % xsize;
        for(l = 0; l < tsize; l++) {
          if(i == 0 && j == 0 && k == 0 && l == 0)

          proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
             proxyNodes[proxyNode] == No) {
            proxyNodes[proxyNode] = Yes;
            neighborNodes[nProxyNodes] = proxyNode;

        if(nProxyNodes >= nodesPerPatch ||
      if(nProxyNodes >= nodesPerPatch ||
    if(nProxyNodes >= nodesPerPatch ||
  if(nProxyNodes >= nodesPerPatch ||
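  // The two sweeps above visit the torus neighborhood of this patch's node at
  // offsets of +/-1 and then +/-2 in x, y and z (and every t-rank at each
  // coordinate), recording each node that does not yet hold a proxy until the
  // nodesPerPatch budget is exhausted.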
    proxy_y = (my_y + 2) % ysize;
    proxy_x = my_x % xsize;
    proxy_z = my_z % zsize;

    proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
    if(proxyNodes[proxyNode] == No) {
      proxyNodes[proxyNode] = Yes;
      neighborNodes[nProxyNodes] = proxyNode;

    proxy_y = (my_y - 2 + ysize) % ysize;
    proxy_x = my_x % xsize;
    proxy_z = my_z % zsize;

    proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
    if(proxyNodes[proxyNode] == No) {
      proxyNodes[proxyNode] = Yes;
      neighborNodes[nProxyNodes] = proxyNode;

    proxy_y = my_y % ysize;
    proxy_x = my_x % xsize;
    proxy_z = (my_z + 2) % zsize;

    proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
    if(proxyNodes[proxyNode] == No) {
      proxyNodes[proxyNode] = Yes;
      neighborNodes[nProxyNodes] = proxyNode;

    proxy_y = my_y % ysize;
    proxy_x = my_x % xsize;
    proxy_z = (my_z - 2 + zsize) % zsize;

    proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
    if(proxyNodes[proxyNode] == No) {
      proxyNodes[proxyNode] = Yes;
      neighborNodes[nProxyNodes] = proxyNode;

    proxy_y = my_y % ysize;
    proxy_x = (my_x + 2) % xsize;
    proxy_z = my_z % zsize;

    proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
    if(proxyNodes[proxyNode] == No) {
      proxyNodes[proxyNode] = Yes;
      neighborNodes[nProxyNodes] = proxyNode;

    proxy_y = my_y % ysize;
    proxy_x = (my_x - 2 + xsize) % xsize;
    proxy_z = my_z % zsize;

    proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
    if(proxyNodes[proxyNode] == No) {
      proxyNodes[proxyNode] = Yes;
      neighborNodes[nProxyNodes] = proxyNode;
  delete [] proxyNodes;