8 #if !defined(WIN32) || defined(__CYGWIN__)
34 #include "NamdHybridLB.def.h"
40 #ifdef MEM_OPT_VERSION
50 int seqno = LdbInfra::Object()->getLoadbalancerTicket();
51 CProxy_NamdHybridLB::ckNew(CkLBOptions(seqno));
55 if (CkMyPe() == 0 &&
cpuloads == NULL) {
58 for (
int i=0; i<CkNumPes(); i++)
cpuloads[i] = 0.0;
68 lbname = (
char *)
"NamdHybridLB";
73 tree =
new TwoLevelTree;
79 statsStrategy = SHRINK_NULL;
83 thisProxy = CProxy_NamdHybridLB(thisgroup);
95 processorArray = NULL;
98 splitComputesMsgs = 0;
109 bool NamdHybridLB::QueryBalanceNow(
int _step){
117 bool NamdHybridLB::QueryDumpData() {
131 LBVectorMigrateMsg* NamdHybridLB::VectorStrategy(LDStats* stats){
132 CkPrintf(
"[%d] Using Vector Strategy to balance the load\n",CkMyPe());
133 LBVectorMigrateMsg* msg =
new(0,0) LBVectorMigrateMsg;
135 msg->level = currentLevel;
143 CLBMigrateMsg* NamdHybridLB::Strategy(LDStats* stats)
149 if(currentLevel == 1){
150 LevelData *lData = levelData[currentLevel];
153 msg = GrpLevelStrategy(stats);
157 newMsg->
n_moves = msg->n_moves;
159 newMsg->
endPE = endPE;
160 for(i=0; i<msg->n_moves; i++){
161 newMsg->
moves[i] = msg->moves[i];
163 for(i=0; i<endPE-startPE+1; i++){
167 thisProxy[0].UpdateLocalLBInfo(newMsg);
170 dummyLB->
work(stats);
171 return createMigrateMsg(stats);
183 children = tree->numNodes(currentLevel);
191 if (msg->
moves[i].to_pe != -1)
197 for(i=msg->
startPE; i<=msg->endPE; i++){
203 if(updateCount == children){
210 if(updateFlag && collectFlag){
213 thisProxy[parent_backup].CollectInfo(loc_backup, n_backup, fromlevel_backup);
220 const int children = tree->numNodes(1);
222 if ( ! splitComputesMsgs ) {
226 splitComputesMsgs[splitCount] = msg;
228 if ( ++splitCount == children ) {
234 double maxUnsplit = 0.;
236 double avgCompute = 0.;
237 double maxCompute = 0.;
238 int maxComputeId = -1;
239 int nMoveableComputes = 0;
240 int numPesAvailable = 0;
243 for (
int j=0; j < children; ++j ) {
254 averageLoad /= numPesAvailable;
255 if ( nMoveableComputes ) avgCompute /= nMoveableComputes;
else avgCompute = 0.;
257 CkPrintf(
"LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
258 maxComputeId, maxCompute, 100. * maxCompute / averageLoad, averageLoad);
259 CkPrintf(
"LDB: Average compute %f is %.1f%% of average load %f\n",
260 avgCompute, 100. * avgCompute / averageLoad, averageLoad);
263 for (
int j=0; j < children; ++j ) {
264 delete splitComputesMsgs[j];
270 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
276 int totalAddedParts = 0;
277 maxCompute = averageLoad / 10.;
278 if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
282 CkPrintf(
"LDB: Partitioning computes with target load %f\n", maxCompute);
284 for (
int j=0; j < children; ++j ) {
286 for (
int i=0; i < msg->
n; ++i) {
287 int nparts = (int) ceil(msg->
load[i] / maxCompute);
288 if ( nparts > maxParts ) nparts = maxParts;
289 if ( nparts < 1 ) nparts = 1;
291 totalAddedParts += nparts - 1;
296 CkPrintf(
"LDB: Increased migratable compute count from %d to %d\n",
297 nMoveableComputes,nMoveableComputes+totalAddedParts);
298 CkPrintf(
"LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
308 CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
309 int numProcessors = stats->nprocs();
313 const int numGroupComputes = stats->n_migrateobjs;
316 if ( ! processorArray ) processorArray =
new processorInfo[numProcessors];
319 if ( ! computeArray ) computeArray =
new computeInfo[numGroupComputes];
320 if ( ! from_procs ) from_procs =
new int[numGroupComputes];
322 int nMoveableComputes = buildData(stats);
323 CmiAssert(nMoveableComputes <= numGroupComputes);
327 #define DUMP_LDBDATA 1
328 #define LOAD_LDBDATA 1
332 dumpDataASCII(
"ldbd_before", numProcessors, numPatches, nMoveableComputes);
334 loadDataASCII(
"ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
348 for (i=0; i<nMoveableComputes; i++) {
351 if ( load > maxCompute ) { maxCompute =
load; maxi = i; }
353 avgCompute = total / nMoveableComputes;
354 maxComputeId = maxi < 0 ? -1 :
LdbIdField(computeArray[maxi].handle.id, 0);
356 int P = stats->nprocs();
358 for (i=0; i<P; i++) {
359 if (processorArray[i].available) {
364 if (numPesAvailable == 0)
365 NAMD_die(
"No processors available for load balancing!\n");
367 averageLoad = total/numPesAvailable;
371 double maxUnsplit = 0.;
374 for (
int i=0; i<nMoveableComputes; i++) {
375 const int cid =
LdbIdField(computeArray[i].handle.id, 0);
377 const double load = computeArray[i].
load;
378 if ( load > maxUnsplit ) maxUnsplit =
load;
398 for (
int i=0; i<nMoveableComputes; i++) {
400 const int cid =
LdbIdField(computeArray[i].handle.id, 0);
404 msg->
cid[i_split] = cid;
405 msg->
load[i_split] = computeArray[i].
load;
410 thisProxy[0].splitComputes(msg);
417 TorusLB(computeArray, patchArray, processorArray,
418 nMoveableComputes, numPatches, numProcessors);
421 nMoveableComputes, numPatches, numProcessors, 1);
423 TorusLB(computeArray, patchArray, processorArray,
424 nMoveableComputes, numPatches, numProcessors);
427 nMoveableComputes, numPatches, numProcessors, 1);
429 NAMD_die(
"Old load balancer strategy is not compatible with hybrid balancer.");
431 Alg7(computeArray, patchArray, processorArray,
432 nMoveableComputes, numPatches, numProcessors);
434 RefineOnly(computeArray, patchArray, processorArray,
435 nMoveableComputes, numPatches, numProcessors);
438 #if LDB_DEBUG && USE_TOPOMAP
440 int pe1, pe2, pe3, hops=0;
458 hops += tmgr.getHopsBetweenRanks(pe1, pe2);
462 CkPrintf(
"Load Balancing: Number of Hops: %d\n", hops);
466 dumpDataASCII(
"ldbd_after", numProcessors, numPatches, nMoveableComputes);
468 dumpDataASCII(
"ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
477 int* computeCount =
new int[numProcessors];
478 for(i=0; i<numProcessors; i++)
480 for(i=0; i<nMoveableComputes; i++)
481 computeCount[computeArray[i].processor]++;
482 for(i=0; i<numProcessors; i++) {
483 if (computeCount[i]==0)
484 iout <<
iINFO <<
"Warning: Processor " << i
485 <<
" has NO moveable computes.\n" <<
endi;
487 delete [] computeCount;
490 CkVec<MigrateInfo *> migrateInfo;
491 for(i=0;i<nMoveableComputes;i++) {
492 if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
496 MigrateInfo *migrateMe =
new MigrateInfo;
497 migrateMe->obj = computeArray[i].
handle;
499 int frompe = from_procs[i];
500 if (frompe == numProcessors)
503 frompe = frompe + stats->procs[0].pe;
504 migrateMe->from_pe = frompe;
505 migrateMe->to_pe = computeArray[i].
processor;
510 obj.handle = computeArray[i].
handle;
511 thisProxy[computeArray[i].
processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
513 migrateInfo.push_back(migrateMe);
518 computeArray[i].processor);
524 msg = createMigrateMsg(migrateInfo, numProcessors);
526 peLoads =
new double [numProcessors];
527 startPE = processorArray[0].
Id;
528 endPE = processorArray[numProcessors-1].
Id;
530 for (i=0; i<numProcessors; i++) {
531 peLoads[i] = processorArray[i].
load;
535 delete [] from_procs;
536 delete [] processorArray;
537 delete [] patchArray;
538 delete [] computeArray;
541 processorArray = NULL;
549 void NamdHybridLB::dumpDataASCII(
char *file,
int numProcessors,
550 int numPatches,
int numComputes)
553 sprintf(filename,
"%s_%d.%d", file, CkMyPe(), step());
554 FILE* fp = fopen(filename,
"w");
556 perror(
"dumpLDStatsASCII");
560 fprintf(fp,
"%d %d %d\n",numProcessors,numPatches,numComputes);
563 for(i=0;i<numProcessors;i++) {
573 for(i=0; i < numComputes; i++) {
581 for (i=0; i< numProcessors; i++) {
583 fprintf(fp,
"%d %d: ", i, num);
588 fprintf(fp,
"%d ", p->
Id);
589 p = (
patchInfo *)processorArray[i].proxies.
597 fprintf(fp,
"%d %d: ", i, num);
602 fprintf(fp,
"%d ", p->
Id);
617 int NamdHybridLB::buildData(LDStats* stats) {
618 int n_pes = stats->nprocs();
627 int pmeOn = simParams->
PMEOn;
635 for (i=0; i<n_pes; ++i) {
636 pe_no = stats->procs[i].pe;
639 processorArray[i].
Id = pe_no;
643 processorArray[i].
backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
646 processorArray[i].
backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
648 processorArray[i].
backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
650 processorArray[i].
idleTime = stats->procs[i].idletime;
655 if(stats->procs[0].pe == 0) {
656 if(unLoadZero) processorArray[0].
available =
false;
657 if(unLoadOne) processorArray[1].
available =
false;
661 if (pmeOn && unLoadPme) {
662 for (i=0; i<n_pes; i++) {
671 if (pmeOn && unLoadPme) {
672 for (i=0; i<n_pes; i++) {
674 processorArray[i].available =
false;
679 #ifdef MEM_OPT_VERSION
687 for (i=0; i<n_pes; i++) {
689 processorArray[i].available =
false;
696 int totalLocalProxies = 0;
697 int totalProxies = 0;
701 patchArray[pid].
Id = pid;
705 const int numProxies =
706 #if 0 // USE_TOPOMAP - this function needs to be there for the hybrid case
707 requiredProxiesOnProcGrid(pid,neighborNodes);
709 requiredProxies(pid, neighborNodes);
712 int numLocalProxies = 0;
713 for (
int k=0; k<numProxies; k++) {
714 if( (neighborNodes[k] >= stats->procs[0].pe) && (neighborNodes[k] <= stats->procs[n_pes-1].pe) ){
716 int index = neighborNodes[k] - stats->procs[0].pe;
722 if ( numLocalProxies ) {
723 CkPrintf(
"LDB Pe %d patch %d has %d local of %d total proxies\n",
724 CkMyPe(), pid, numLocalProxies, numProxies);
727 totalLocalProxies += numLocalProxies;
728 totalProxies += numProxies;
731 CkPrintf(
"LDB Pe %d has %d local of %d total proxies\n",
732 CkMyPe(), totalLocalProxies, totalProxies);
735 int nMoveableComputes=0;
741 const auto nObjs = stats->objData.size();
742 for(j=0; j < nObjs; j++) {
743 const LDObjData &this_obj = stats->objData[j];
744 int frompe = stats->from_proc[j];
747 if (this_obj.omID().id.idx != 1) {
760 }
else if (this_obj.migratable && this_obj.wallTime != 0.) {
763 const int p0 = computeMap->
pid(cid,0);
767 if (computeMap->
numPids(cid) > 1)
768 p1 = computeMap->
pid(cid,1);
770 computeArray[nMoveableComputes].
Id = cid;
772 if (frompe >= n_pes) {
773 CkPrintf(
"assigning random old processor...this looks broken\n");
774 computeArray[nMoveableComputes].
oldProcessor = CrnRand()%n_pes + stats->procs[0].pe;
777 computeArray[nMoveableComputes].
oldProcessor = frompe + stats->procs[0].pe;
779 from_procs[nMoveableComputes] = frompe;
783 int index = computeArray[nMoveableComputes].
oldProcessor - stats->procs[0].pe;
784 processorArray[index].
computeLoad += this_obj.wallTime;
785 computeArray[nMoveableComputes].
processor = -1;
786 computeArray[nMoveableComputes].
patch1 = p0;
787 computeArray[nMoveableComputes].
patch2 = p1;
788 computeArray[nMoveableComputes].
handle = this_obj.handle;
789 computeArray[nMoveableComputes].
load = this_obj.wallTime;
794 for (i=0; i<n_pes; i++) {
798 return nMoveableComputes;
802 int NamdHybridLB::requiredProxies(
PatchID id,
int neighborNodes[])
805 int myNode = patchMap->
node(
id);
808 #define IF_NEW_NODE \
810 for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
811 if ( j == nProxyNodes )
816 for (
int i = 0; i < numNeighbors; ++i ) {
817 const int proxyNode = patchMap->
basenode(neighbors[i]);
818 if ( proxyNode != myNode ) {
820 neighborNodes[nProxyNodes] = proxyNode;
834 int numNodes = CkNumPes();
837 if ( emptyNodes > numPatches ) {
838 int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
840 if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
841 int proxyNode = (myNode + 1) % numNodes;
842 while ( nProxyNodes < nodesPerPatch &&
844 if ( proxyNode != myNode ) {
846 neighborNodes[nProxyNodes] = proxyNode;
850 proxyNode = (proxyNode + 1) % numNodes;
852 proxyNode = (myNode - 1 + numNodes) % numNodes;
853 while ( nProxyNodes < nodesPerPatch &&
855 if ( proxyNode != myNode ) {
857 neighborNodes[nProxyNodes] = proxyNode;
861 proxyNode = (proxyNode - 1 + numNodes) % numNodes;
863 proxyNode = (myNode + 1) % numNodes;
865 while ( nProxyNodes < nodesPerPatch ) {
868 neighborNodes[nProxyNodes] = proxyNode;
872 proxyNode = (proxyNode + 1) % numNodes;
873 count ++;
if (count == numNodes)
break;
876 int proxyNode = myNode - 1;
878 if ( proxyNode != myNode ) {
880 neighborNodes[nProxyNodes] = proxyNode;
885 proxyNode = myNode + 1;
886 if ( proxyNode < numNodes && ! patchMap->numPatchesOnNode(proxyNode) ) {
887 if ( proxyNode != myNode ) {
889 neighborNodes[nProxyNodes] = proxyNode;
BlockLoad::TempStorage load
void UpdateLocalLBInfo(LocalLBInfoMsg *msg)
NamdDummyLB * AllocateNamdDummyLB()
std::ostream & iINFO(std::ostream &s)
represents bonded compute
NamdCentLB * AllocateNamdCentLB()
void setNewNumPartitions(ComputeID cid, char numPartitions)
BigReal ldbRelativeGrainsize
static PatchMap * Object()
SimParameters * simParameters
#define LDBSTRAT_REFINEONLY
std::ostream & endi(std::ostream &s)
static double averageLoad
int basenode(int pid) const
NamdHybridLB(const CkLBOptions &opt)
Default constructor.
BigReal ldbHomeBackgroundScaling
static Units next(Units u)
void splitComputes(SplitComputesMsg *)
void CreateNamdHybridLB()
int numPartitions(ComputeID cid)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void setNewNode(ComputeID cid, NodeID node)
const int & LdbIdField(const LdbId &id, const int index)
void NAMD_die(const char *err_msg)
static LdbCoordinator * Object()
BigReal ldbBackgroundScaling
#define LDBSTRAT_COMPREHENSIVE
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
int numPatches(void) const
void work(LDStats *stats)
static ComputeMap * Object()
BigReal ldbPMEBackgroundScaling
int numPids(ComputeID cid)
int numPatchesOnNode(int node)
void unchecked_insert(InfoRecord *)
int pid(ComputeID cid, int i)
int isOutputProcessor(int pe)