Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

NamdCentLB.C

Go to the documentation of this file.
00001 /*****************************************************************************
00002  * $Source: /home/cvs/namd/cvsroot/namd2/src/NamdCentLB.C,v $
00003  * $Author: jlai7 $
00004  * $Date: 2012/11/27 21:13:18 $
00005  * $Revision: 1.118 $
00006  *****************************************************************************/
00007 
00008 #if !defined(WIN32) || defined(__CYGWIN__)
00009 #include <unistd.h>
00010 #endif
00011 #include <fcntl.h>
00012 
00013 #include "InfoStream.h"
00014 #include "NamdCentLB.h"
00015 #include "NamdCentLB.def.h"
00016 #include "Node.h"
00017 #include "PatchMap.h"
00018 #include "ComputeMap.h"
00019 #include "LdbCoordinator.h"
00020 
00021 // #define DUMP_LDBDATA 1
00022 // #define LOAD_LDBDATA 1
00023 
00024 double *cpuloads = NULL;
00025 
00026 void CreateNamdCentLB() {
00027   // CkPrintf("[%d] creating NamdCentLB %d\n",CkMyPe(),loadbalancer);
00028   loadbalancer = CProxy_NamdCentLB::ckNew();
00029   // CkPrintf("[%d] created NamdCentLB %d\n",CkMyPe(),loadbalancer);
00030   if (CkMyRank() == 0 && cpuloads == NULL) {    
00031     cpuloads = new double[CkNumPes()];
00032     CmiAssert(cpuloads != NULL);
00033     for (int i=0; i<CkNumPes(); i++) cpuloads[i] = 0.0;
00034   }
00035 }
00036 
00037 NamdCentLB *AllocateNamdCentLB() {
00038   return new NamdCentLB((CkMigrateMessage*)NULL);
00039 }
00040 
00044 NamdCentLB::NamdCentLB(CkMigrateMessage *msg): CentralLB(msg) {
00045   processorArray = 0;
00046   patchArray = 0;
00047   computeArray = 0;
00048 } 
00049 
00050 NamdCentLB::NamdCentLB(): CentralLB(CkLBOptions(-1))
00051 {
00052   //  if (CkMyPe()==0)
00053   //   CkPrintf("[%d] NamdCentLB created\n",CkMyPe());
00054   processorArray = 0;
00055   patchArray = 0;
00056   computeArray = 0;
00057 }
00058 
00059 /*
00060 NamdCentLB::~NamdCentLB()
00061 {
00062   delete [] processorArray;
00063   delete [] patchArray;
00064   delete [] computeArray;
00065 }
00066 */
00067 
00068 CmiBool NamdCentLB::QueryBalanceNow(int _step)
00069 {
00070   //  CkPrintf("[%d] Balancing on step %d\n",CkMyPe(),_step);
00071   if ( LdbCoordinator::Object()->takingLdbData ) {
00072     return CmiTrue;
00073   } else {
00074     return CmiFalse;
00075   }
00076 }
00077 
00078 CmiBool NamdCentLB::QueryDumpData()
00079 {
00080 #if 0
00081   if (LdbCoordinator::Object()->ldbCycleNum == 1)  return CmiTrue;
00082   if (LdbCoordinator::Object()->ldbCycleNum == 2)  return CmiTrue;
00083 #endif
00084   return CmiFalse;
00085 }
00086 
00087 #if CHARM_VERSION > 60301
00088 CLBMigrateMsg* NamdCentLB::Strategy(LDStats* stats)
00089 #else
00090 // ignore n_pes in the function below
00091 CLBMigrateMsg* NamdCentLB::Strategy(LDStats* stats, int n_pes)
00092 #endif
00093 {
00094   //  CkPrintf("LDB: All statistics received at %f, %f\n",
00095   //  CmiTimer(),CmiWallTimer());
00096 
00097 #if CHARM_VERSION > 60301
00098   int numProcessors = stats->nprocs();
00099 #else
00100   int numProcessors = stats->count;
00101 #endif
00102   int numPatches = PatchMap::Object()->numPatches();
00103   ComputeMap *computeMap = ComputeMap::Object();
00104   const int numComputes = computeMap->numComputes();
00105   const SimParameters* simParams = Node::Object()->simParameters;
00106 
00107   // these sizes should never change
00108   if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
00109   if ( ! patchArray ) patchArray = new patchInfo[numPatches];
00110   if ( ! computeArray ) computeArray = new computeInfo[numComputes];
00111 
00112   int nMoveableComputes = buildData(stats);
00113 
00114 #if LDB_DEBUG
00115 #define DUMP_LDBDATA 1
00116 #define LOAD_LDBDATA 1
00117 #endif
00118 
00119 #if DUMP_LDBDATA 
00120   dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
00121 #elif LOAD_LDBDATA
00122   loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
00123   // CkExit();
00124 #endif
00125 
00126   double averageLoad = 0.;
00127   double avgCompute;
00128   {
00129    int i;
00130    double total = 0.;
00131    double maxCompute = 0.;
00132    int maxi = 0;
00133    for (i=0; i<nMoveableComputes; i++) {
00134       double load = computeArray[i].load;
00135       total += load;
00136       if ( load > maxCompute ) { maxCompute = load;  maxi = i; }
00137    }
00138    avgCompute = total / nMoveableComputes;
00139 
00140 #if CHARM_VERSION > 60301
00141     int P = stats->nprocs();
00142 #else
00143     int P = stats->count;
00144 #endif
00145    int numPesAvailable = 0;
00146    for (i=0; i<P; i++) {
00147       if (processorArray[i].available) {
00148         ++numPesAvailable;
00149         total += processorArray[i].backgroundLoad;
00150       }
00151    }
00152    if (numPesAvailable == 0)
00153      NAMD_die("No processors available for load balancing!\n");
00154 
00155    averageLoad = total/numPesAvailable;
00156    CkPrintf("LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
00157             computeArray[maxi].handle.id.id[0],
00158             maxCompute, 100. * maxCompute / averageLoad, averageLoad);
00159    CkPrintf("LDB: Average compute %f is %.1f%% of average load %f\n",
00160             avgCompute, 100. * avgCompute / averageLoad, averageLoad);
00161   }
00162 
00163   if ( step() == 1 ) {
00164     // compute splitting only
00165     // partitions are stored as char but mostly limited by
00166     // high load noise at low outer-loop iteration counts
00167     int maxParts = 10;
00168 #ifdef NAMD_CUDA
00169 //split LCPO compute very small, else CUDA compute is delayed
00170     if (simParams->LCPOOn) {
00171       maxParts = 20;
00172     }
00173 #endif
00174     int totalAddedParts = 0;
00175     double maxCompute = averageLoad / 10.;
00176     if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
00177     if ( simParams->ldbRelativeGrainsize > 0. ) {
00178       maxCompute = averageLoad * simParams->ldbRelativeGrainsize;
00179     }
00180     CkPrintf("LDB: Partitioning computes with target load %f\n", maxCompute);
00181     double maxUnsplit = 0.;
00182     for (int i=0; i<nMoveableComputes; i++) {
00183       computeArray[i].processor = computeArray[i].oldProcessor;
00184       const int cid = computeArray[i].handle.id.id[0];
00185       const double load = computeArray[i].load;
00186       if ( computeMap->numPartitions(cid) == 0 ) {
00187         if ( load > maxUnsplit ) maxUnsplit = load;
00188         continue;
00189       }
00190       int nparts = (int) ceil(load / maxCompute);
00191       if ( nparts > maxParts ) nparts = maxParts;
00192       if ( nparts < 1 ) nparts = 1;
00193       if ( nparts > 1 ) {
00194         CkPrintf("LDB: Partitioning compute %d with load %f by %d\n",
00195                   cid, load, nparts);
00196       }
00197       computeMap->setNewNumPartitions(cid,nparts);
00198       totalAddedParts += nparts - 1;
00199     }
00200     CkPrintf("LDB: Increased migratable compute count from %d to %d\n",
00201               nMoveableComputes,nMoveableComputes+totalAddedParts);
00202     CkPrintf("LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
00203   } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
00204     if (step() < 4)
00205       TorusLB(computeArray, patchArray, processorArray,
00206                   nMoveableComputes, numPatches, numProcessors);
00207     else
00208       RefineTorusLB(computeArray, patchArray, processorArray,
00209                   nMoveableComputes, numPatches, numProcessors, 1);
00210   } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
00211     TorusLB(computeArray, patchArray, processorArray,
00212                   nMoveableComputes, numPatches, numProcessors);
00213   } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
00214     RefineTorusLB(computeArray, patchArray, processorArray,
00215                   nMoveableComputes, numPatches, numProcessors, 1);
00216   } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
00217     if (step() < 4)
00218       Alg7(computeArray, patchArray, processorArray,
00219                   nMoveableComputes, numPatches, numProcessors);
00220     else
00221       RefineOnly(computeArray, patchArray, processorArray, 
00222                   nMoveableComputes, numPatches, numProcessors);
00223   }
00224 
00225 #if LDB_DEBUG && USE_TOPOMAP
00226   TopoManager tmgr;
00227   int pe1, pe2, pe3, hops=0;
00228   /* This is double counting the hops
00229   for(int i=0; i<nMoveableComputes; i++)
00230   {
00231     pe1 = computeArray[i].processor;
00232     pe2 = patchArray[computeArray[i].patch1].processor;
00233     pe3 = patchArray[computeArray[i].patch2].processor;
00234     hops += tmgr.getHopsBetweenRanks(pe1, pe2);
00235     if(computeArray[i].patch1 != computeArray[i].patch2)
00236       hops += tmgr.getHopsBetweenRanks(pe1, pe3);  
00237   }*/
00238   for (int i=0; i<numPatches; i++)  {
00239     //int num = patchArray[i].proxiesOn.numElements();
00240     pe1 = patchArray[i].processor;
00241     Iterator nextProc;
00242     processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
00243     while (p) {
00244       pe2 = p->Id;
00245       hops += tmgr.getHopsBetweenRanks(pe1, pe2);
00246       p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
00247     }
00248   }
00249   CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
00250 #endif
00251 
00252 #if DUMP_LDBDATA
00253   dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
00254 #elif LOAD_LDBDATA
00255   dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
00256   // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
00257   // CkExit();
00258 #endif
00259 
00260   // For error checking:
00261   // Count up computes, to see if somebody doesn't have any computes
00262   int i;
00263 #if 0
00264   int* computeCount = new int[numProcessors];
00265   for(i=0; i<numProcessors; i++)
00266     computeCount[i]=0;
00267   for(i=0; i<nMoveableComputes; i++)
00268     computeCount[computeArray[i].processor]++;
00269   for(i=0; i<numProcessors; i++) {
00270     if (computeCount[i]==0)
00271       iout << iINFO <<"Warning: Processor " << i 
00272            << " has NO moveable computes.\n" << endi;
00273   }
00274   delete [] computeCount;
00275 #endif
00276   
00277   CkVec<MigrateInfo *> migrateInfo;
00278   for(i=0;i<nMoveableComputes;i++) {
00279     if (computeArray[i].processor != computeArray[i].oldProcessor) {
00280       //      CkPrintf("[%d] Obj %d migrating from %d to %d\n",
00281       //               CkMyPe(),computeArray[i].handle.id.id[0],
00282       //               computeArray[i].processor,computeArray[i].oldProcessor);
00283       MigrateInfo *migrateMe = new MigrateInfo;
00284       migrateMe->obj = computeArray[i].handle;
00285       migrateMe->from_pe = computeArray[i].oldProcessor;
00286       migrateMe->to_pe = computeArray[i].processor;
00287       migrateInfo.insertAtEnd(migrateMe);
00288 
00289       // sneak in updates to ComputeMap
00290       computeMap->setNewNode(computeArray[i].handle.id.id[0],
00291                                 computeArray[i].processor);
00292     }
00293   }
00294   
00295   int migrate_count=migrateInfo.length();
00296   // CkPrintf("NamdCentLB migrating %d elements\n",migrate_count);
00297   CLBMigrateMsg* msg = new(migrate_count,CkNumPes(),CkNumPes(),0) CLBMigrateMsg;
00298 
00299   msg->n_moves = migrate_count;
00300   for(i=0; i < migrate_count; i++) {
00301     MigrateInfo* item = migrateInfo[i];
00302     msg->moves[i] = *item;
00303     delete item;
00304     migrateInfo[i] = 0;
00305   }
00306 
00307   for (i=0; i<numProcessors; i++) {
00308     cpuloads[i] = processorArray[i].load;
00309   }
00310 
00311   delete [] processorArray;
00312   delete [] patchArray;
00313   delete [] computeArray;
00314 
00315   processorArray = NULL;
00316   patchArray = NULL;
00317   computeArray = NULL;
00318   
00319   return msg;
00320 };
00321 
00322 #ifndef WIN32
00323 
00324 void NamdCentLB::dumpDataASCII(char *file, int numProcessors,
00325                                int numPatches, int numComputes)
00326 {
00327   char filename[128];
00328   sprintf(filename, "%s.%d", file, step());
00329   FILE* fp = fopen(filename,"w");
00330   if (fp == NULL){
00331      perror("dumpLDStatsASCII");
00332      return;
00333   }
00334   CkPrintf("***** DUMP data to file: %s ***** \n", filename);
00335   fprintf(fp,"%d %d %d\n",numProcessors,numPatches,numComputes);
00336 
00337   int i;
00338   for(i=0;i<numProcessors;i++) {
00339     processorInfo* p = processorArray + i;
00340     fprintf(fp,"%d %e %e %e %e\n",p->Id,p->load,p->backgroundLoad,p->computeLoad,p->idleTime);
00341   }
00342 
00343   for(i=0;i < numPatches; i++) {
00344     patchInfo* p = patchArray + i;
00345     fprintf(fp,"%d %e %d %d\n",p->Id,p->load,p->processor,p->numAtoms);
00346   }
00347     
00348   for(i=0; i < numComputes; i++) {
00349     computeInfo* c = computeArray + i;
00350     fprintf(fp,"%d %e %d %d %d %d",c->Id,c->load,c->patch1,c->patch2,
00351             c->processor,c->oldProcessor);
00352     fprintf(fp, "\n");
00353   }
00354 
00355   // dump patchSet
00356   for (i=0; i< numProcessors; i++) {
00357       int num = processorArray[i].proxies.numElements();
00358       fprintf(fp, "%d %d: ", i, num);
00359       Iterator nextProxy;
00360       patchInfo *p = (patchInfo *)processorArray[i].proxies.
00361         iterator((Iterator *)&nextProxy);
00362       while (p) {
00363           fprintf(fp, "%d ", p->Id);
00364           p = (patchInfo *)processorArray[i].proxies.
00365             next((Iterator*)&nextProxy);
00366       }
00367       fprintf(fp, "\n");
00368   }
00369   // dump proxiesOn
00370   for (i=0; i<numPatches; i++)  {
00371     int num = patchArray[i].proxiesOn.numElements();
00372     fprintf(fp, "%d %d: ", i, num);
00373       Iterator nextProc;
00374       processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.
00375         iterator((Iterator *)&nextProc);
00376       while (p) {
00377         fprintf(fp, "%d ", p->Id);
00378         p = (processorInfo *)patchArray[i].proxiesOn.
00379           next((Iterator*)&nextProc);
00380       }
00381       fprintf(fp, "\n");
00382   }
00383 
00384   fclose(fp);
00385   //CkExit();
00386 }
00387 
00388 void NamdCentLB::loadDataASCII(char *file, int &numProcessors,
00389                                int &numPatches, int &numComputes)
00390 {
00391   char filename[128];
00392   //sprintf(filename, "%s.%d", file, step());
00393   sprintf(filename, "%s", file);
00394 
00395   CkPrintf("***** Load ascii data from file: %s ***** \n", filename);
00396 
00397   FILE* fp = fopen(filename, "r");
00398   if (fp == NULL){
00399      perror("loadDataASCII");
00400      return;
00401   }
00402 
00403   fscanf(fp,"%d %d %d",&numProcessors,&numPatches,&numComputes);
00404 
00405   printf("numProcs: %d numPatches: %d numComputes: %d\n", numProcessors,numPatches, numComputes);
00406 
00407   delete [] processorArray;
00408   delete [] patchArray;
00409   delete [] computeArray;
00410   processorArray = new processorInfo[numProcessors];
00411   patchArray = new patchInfo[numPatches];
00412   computeArray = new computeInfo[numComputes];
00413 
00414   int i;
00415   for(i=0;i<numProcessors;i++) {
00416     processorInfo* p = processorArray + i;
00417     fscanf(fp,"%d %le %le %le", &p->Id, &p->load, &p->backgroundLoad, &p->computeLoad);
00418     fscanf(fp,"%le\n", &p->idleTime);
00419     if (p->Id != i) CmiAbort("Reading processorArray error!");
00420 //    p->backgroundLoad = 0.0;
00421   }
00422 
00423   for(i=0;i < numPatches; i++) {
00424     patchInfo* p = patchArray + i;
00425     fscanf(fp,"%d %le %d %d\n",&p->Id,&p->load,&p->processor,&p->numAtoms);
00426     if (p->Id != i || p->processor > numProcessors || p->processor < 0) 
00427       CmiAbort("Reading patchArray error!");
00428   }
00429     
00430   for(i=0; i < numComputes; i++) {
00431     computeInfo* c = computeArray + i;
00432     fscanf(fp,"%d %le %d %d %d %d",&c->Id,&c->load,&c->patch1,&c->patch2,
00433             &c->processor,&c->oldProcessor);
00434 
00435     if (c->patch1 < 0 || c->patch1 > numPatches || c->patch2 < 0 || c->patch2 > numPatches)
00436       CmiAbort("Reading computeArray error!");
00437   // printf("%d %e %d %d %d %d\n", c->Id,c->load,c->patch1,c->patch2,c->processor,c->oldProcessor);
00438   }
00439 
00440   // dump patchSet
00441   for (i=0; i< numProcessors; i++) {
00442       int num, curp;
00443       fscanf(fp,"%d %d: ",&curp, &num);
00444       if(curp != i)
00445         CmiAbort("Reading patchsSet error!");
00446       for (int j=0; j<num; j++) {
00447           int id;
00448           fscanf(fp,"%d",&id);
00449           processorArray[i].proxies.unchecked_insert(&patchArray[id]);
00450       }
00451   }
00452   // dump proxiesOn
00453   for (i=0; i<numPatches; i++)  {
00454       int num, curp;
00455       fscanf(fp,"%d %d: ",&curp, &num);
00456       if(curp != i)
00457         CmiAbort("Reading proxiesOn error!");
00458       for (int j=0; j<num; j++) {
00459           int id;
00460           fscanf(fp,"%d",&id);
00461           patchArray[i].proxiesOn.insert(&processorArray[id]);
00462       }
00463   }
00464 
00465   fclose(fp);
00466 }
00467 #endif
00468 
00469 extern int isPmeProcessor(int); 
00470 #ifdef MEM_OPT_VERSION
00471 extern int isOutputProcessor(int); 
00472 #endif
00473 
00474 int NamdCentLB::buildData(LDStats* stats)
00475 {
00476 #if CHARM_VERSION > 60301
00477   int n_pes = stats->nprocs();
00478 #else
00479   int n_pes = stats->count;
00480 #endif
00481 
00482   PatchMap* patchMap = PatchMap::Object();
00483   ComputeMap* computeMap = ComputeMap::Object();
00484   const SimParameters* simParams = Node::Object()->simParameters;
00485 
00486   BigReal bgfactor = simParams->ldbBackgroundScaling;
00487   BigReal pmebgfactor = simParams->ldbPMEBackgroundScaling;
00488   BigReal homebgfactor = simParams->ldbHomeBackgroundScaling;
00489   int pmeOn = simParams->PMEOn;
00490   int unLoadPme = simParams->ldbUnloadPME;
00491   int pmeBarrier = simParams->PMEBarrier;
00492   int unLoadZero = simParams->ldbUnloadZero;
00493   int unLoadOne = simParams->ldbUnloadOne;
00494   int unLoadIO= simParams->ldbUnloadOutputPEs;
00495   int i;
00496   for (i=0; i<n_pes; ++i) {
00497     processorArray[i].Id = i;
00498     processorArray[i].available = CmiTrue;
00499     if ( pmeOn && isPmeProcessor(i) ) {
00500       processorArray[i].backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
00501     } else if (patchMap->numPatchesOnNode(i) > 0) {
00502       processorArray[i].backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
00503     } else {
00504       processorArray[i].backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
00505     }
00506     processorArray[i].idleTime = stats->procs[i].idletime;
00507     processorArray[i].load = processorArray[i].computeLoad = 0.0;
00508   }
00509 
00510 /* *********** this code is defunct *****************
00511 #if 0
00512   double bgfactor = 1.0 + 1.0 * CkNumPes()/1000.0;
00513   if ( bgfactor > 2.0 ) bgfactor = 2.0;
00514   iout << iINFO << "Scaling background load by " << bgfactor << ".\n" << endi;
00515   int i;
00516   for (i=0; i<n_pes; i++) {
00517     processorArray[i].Id = i;
00518     processorArray[i].backgroundLoad = bgfactor * stats[i].bg_walltime;
00519   }
00520 
00521   double bg_weight = 0.7;
00522 
00523   int i;
00524   for (i=0; i<n_pes; i++) {
00525     processorArray[i].Id = i;
00526     if (patchMap->numPatchesOnNode(i) > 0)
00527       processorArray[i].backgroundLoad = bg_weight * stats->procs[i].bg_walltime;
00528     else 
00529       processorArray[i].backgroundLoad = stats[i].bg_walltime;
00530   }
00531   
00532   //Modification to reduce the coputeload on PME processors
00533   const SimParameters* simParams = Node::Object()->simParameters;  
00534   
00535   // CkPrintf("BACKGROUND LOAD\n");
00536   if(simParams->PMEOn) {
00537     double bgfactor = 1.0 + 1.0 * CkNumPes()/1000.0;
00538     if ( bgfactor > 2.0 ) bgfactor = 2.0;
00539     for (i=0; i<n_pes; i++) {
00540       // CkPrintf("BG[%d] =  %5.5lf,", i, processorArray[i].backgroundLoad);
00541       if(isPmeProcessor(i)) {
00542         processorArray[i].backgroundLoad *= bgfactor;
00543       }
00544       // CkPrintf("%5.5lf;  ", processorArray[i].backgroundLoad);
00545     }
00546   }
00547   // CkPrintf("\n");
00548 #endif  
00549 *********** end of defunct code *********** */
00550 
00551   if (unLoadZero) processorArray[0].available = CmiFalse;
00552   if (unLoadOne) processorArray[1].available = CmiFalse;
00553 
00554   // if all pes are Pme, disable this flag
00555   if (pmeOn && unLoadPme) {
00556     for (i=0; i<n_pes; i++) {
00557       if (!isPmeProcessor(i))  break;
00558     }
00559     if (i == n_pes) {
00560       iout << iINFO << "Turned off unLoadPme flag!\n"  << endi;
00561       unLoadPme = 0;
00562     }
00563   }
00564   
00565   if (pmeOn && unLoadPme) {
00566     for (i=0; i<n_pes; i++) {
00567       if ((pmeBarrier && i==0) || isPmeProcessor(i)) 
00568         processorArray[i].available = CmiFalse;
00569     }
00570   }
00571   // if all pes are output, disable this flag
00572 #ifdef MEM_OPT_VERSION
00573 
00574   if (unLoadIO) {
00575       if (simParams->numoutputprocs == n_pes) {
00576           iout << iINFO << "Turned off unLoadIO flag!\n"  << endi;
00577           unLoadIO = 0;
00578       }
00579   }
00580   if (unLoadIO){
00581     iout << iINFO << "Testing for output processors!\n"  << endi;
00582       for (i=0; i<n_pes; i++) {
00583           if (isOutputProcessor(stats->procs[i].pe)) 
00584             {
00585               //              iout << iINFO << "Removed output PE "<< stats->procs[i].pe <<" from available list!\n"  << endi;
00586               processorArray[i].available = CmiFalse;
00587             }
00588           else
00589             {
00590               //              iout << iINFO << "Nonoutput PE "<< stats->procs[i].pe <<" is in available list!\n"  << endi;
00591             }
00592       }
00593   }
00594 #endif
00595 
00596   int nMoveableComputes=0;
00597   int nProxies = 0;             // total number of estimated proxies
00598   int nIdleComputes = 0;
00599 
00600   int j;
00601   for (j=0; j < stats->n_objs; j++) {
00602       const LDObjData &this_obj = stats->objData[j];
00603       int frompe = stats->from_proc[j];
00604 
00605       // filter out non-NAMD managed objects (like PME array)
00606       if (this_obj.omID().id.idx != 1) {
00607         // CkPrintf("non-NAMD object %d on pe %d with walltime %lf\n",
00608         // this_obj.id().id[0], stats->from_proc[j], this_obj.wallTime);
00609         processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
00610         continue;
00611       }
00612 
00613       if (this_obj.id().id[1] == -2) { // Its a patch
00614         const int pid = this_obj.id().id[0];
00615         int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
00616 
00617         patchArray[pid].Id = pid;
00618         patchArray[pid].numAtoms = 0;
00619         patchArray[pid].processor = stats->from_proc[j];
00620         const int numProxies = 
00621 #if USE_TOPOMAP
00622         requiredProxiesOnProcGrid(pid,neighborNodes);
00623 #else
00624         requiredProxies(pid, neighborNodes);
00625 #endif
00626 
00627         nProxies += numProxies;
00628 
00629         for (int k=0; k<numProxies; k++) {
00630           processorArray[neighborNodes[k]].proxies.unchecked_insert(&patchArray[pid]);
00631           patchArray[pid].proxiesOn.unchecked_insert(&processorArray[neighborNodes[k]]);
00632         }
00633         processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
00634       } else if (this_obj.id().id[1] == -3) { // Its a bonded compute
00635         processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
00636       } else if (this_obj.migratable) { // Its a compute
00637        if ( this_obj.wallTime == 0. ) { // don't migrate idle computes
00638          ++nIdleComputes;
00639        } else {
00640         const int cid = this_obj.id().id[0];
00641         const int p0 = computeMap->pid(cid,0);
00642 
00643         // For self-interactions, just return the same pid twice
00644         int p1;
00645         if (computeMap->numPids(cid) > 1)
00646           p1 = computeMap->pid(cid,1);
00647         else p1 = p0;
00648         computeArray[nMoveableComputes].Id = cid;
00649         computeArray[nMoveableComputes].oldProcessor = stats->from_proc[j];
00650         processorArray[stats->from_proc[j]].computeLoad += this_obj.wallTime;
00651         computeArray[nMoveableComputes].processor = -1;
00652         computeArray[nMoveableComputes].patch1 = p0;
00653         computeArray[nMoveableComputes].patch2 = p1;
00654         computeArray[nMoveableComputes].handle = this_obj.handle;
00655         computeArray[nMoveableComputes].load = this_obj.wallTime;
00656         nMoveableComputes++;
00657        }
00658       } else {
00659         processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
00660       }
00661     }
00662 
00663    if ( nIdleComputes )
00664      CkPrintf("LDB: %d computes have load of zero\n", nIdleComputes);
00665 
00666 /* *********** this code is defunct *****************
00667 #if 0
00668   int averageProxy = nProxies / n_pes;
00669   CkPrintf("total proxies: %d, avervage: %d\n", nProxies, averageProxy);
00670   for (i=0; i<n_pes; i++) {
00671     // too many proxies on this node, weight the background load
00672     int proxies = processorArray[i].proxies.numElements();
00673     if (proxies > averageProxy) {
00674       double factor = 1.0*(proxies-averageProxy)/nProxies;
00675       processorArray[i].backgroundLoad *= (1.0 + factor);
00676       CkPrintf("On [%d]: too many proxies: %d, increased bg load by %f\n", i, nProxies, factor);
00677     }
00678   }
00679 #endif
00680 *********** end of defunct code *********** */
00681 
00682   for (i=0; i<n_pes; i++) {
00683     processorArray[i].load = processorArray[i].backgroundLoad + processorArray[i].computeLoad;
00684   }
00685   stats->clear();
00686   return nMoveableComputes;
00687 }
00688 
00689 // Figure out which proxies we will definitely create on other
00690 // nodes, without regard for non-bonded computes.  This code is swiped
00691 // from ProxyMgr, and changes there probably need to be propagated here.
00692 
00693 int NamdCentLB::requiredProxies(PatchID id, int neighborNodes[])
00694 {
00695   PatchMap* patchMap = PatchMap::Object();
00696   int myNode = patchMap->node(id);
00697   int nProxyNodes = 0;
00698 
00699 #define IF_NEW_NODE \
00700     int j; \
00701     for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
00702     if ( j == nProxyNodes )
00703 
00704   PatchID neighbors[1 + PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
00705   neighbors[0] = id;
00706   int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
00707   for ( int i = 0; i < numNeighbors; ++i ) {
00708     const int proxyNode = patchMap->basenode(neighbors[i]);
00709     if ( proxyNode != myNode ) {
00710       IF_NEW_NODE {
00711         neighborNodes[nProxyNodes] = proxyNode;
00712         nProxyNodes++;
00713       }
00714     }
00715   }
00716 
00717   // Distribute initial default proxies across empty processors.
00718   // This shouldn't be necessary, but may constrain the load balancer
00719   // and avoid placing too many proxies on a single processor.  -JCP
00720 
00721   // This code needs to be turned off when the creation of ST is
00722   // shifted to the load balancers -ASB
00723 
00724 #if 1
00725   int numPes = CkNumPes();
00726   int numPatches = patchMap->numPatches();
00727   int emptyNodes = numPes - numPatches;
00728   if ( emptyNodes > numPatches ) {
00729     int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
00730     int maxNodesPerPatch = PatchMap::MaxOneAway + PatchMap::MaxTwoAway;
00731     if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
00732     int proxyNode = (myNode + 1) % numPes;
00733     while ( nProxyNodes < nodesPerPatch &&
00734                         ! patchMap->numPatchesOnNode(proxyNode) ) {
00735       if ( proxyNode != myNode ) {
00736         IF_NEW_NODE {
00737           neighborNodes[nProxyNodes] = proxyNode;
00738           nProxyNodes++;
00739         }
00740       }
00741       proxyNode = (proxyNode + 1) % numPes;
00742     }
00743     proxyNode = (myNode - 1 + numPes) % numPes;
00744     while ( nProxyNodes < nodesPerPatch &&
00745                         ! patchMap->numPatchesOnNode(proxyNode) ) {
00746       if ( proxyNode != myNode ) {
00747         IF_NEW_NODE {
00748           neighborNodes[nProxyNodes] = proxyNode;
00749           nProxyNodes++;
00750         }
00751       }
00752       proxyNode = (proxyNode - 1 + numPes) % numPes;
00753     }
00754     proxyNode = (myNode + 1) % numPes;
00755     int count = 0;
00756     while ( nProxyNodes < nodesPerPatch ) {
00757       if ( ! patchMap->numPatchesOnNode(proxyNode) && proxyNode != myNode ) {
00758         IF_NEW_NODE {
00759           neighborNodes[nProxyNodes] = proxyNode;
00760           nProxyNodes++;
00761         }
00762       }
00763       proxyNode = (proxyNode + 1) % numPes;
00764       count ++; if (count == numPes) break;   // we looped all
00765     }
00766   } else {
00767     int proxyNode = myNode - 1;
00768     if ( proxyNode >= 0 && ! patchMap->numPatchesOnNode(proxyNode) ) {
00769       if ( proxyNode != myNode ) {
00770         IF_NEW_NODE {
00771           neighborNodes[nProxyNodes] = proxyNode;
00772           nProxyNodes++;
00773         }
00774       }
00775     }
00776     proxyNode = myNode + 1;
00777     if ( proxyNode < numPes && ! patchMap->numPatchesOnNode(proxyNode) ) {
00778       if ( proxyNode != myNode ) {
00779         IF_NEW_NODE {
00780           neighborNodes[nProxyNodes] = proxyNode;
00781           nProxyNodes++;
00782         }
00783       }
00784     }
00785   }
00786 #endif
00787 
00788   return nProxyNodes;
00789 }
00790 
00791 #if USE_TOPOMAP 
00792 // Figure out which proxies we will definitely create on other nodes,
00793 // without regard for non-bonded computes.  This code is swiped from
00794 // ProxyMgr, and changes there probably need to be propagated here.
00795 // The proxies are placed on nearby processors on the 3d-grid along
00796 // the X, Y, Z and T dimensions
00797 
00798 int NamdCentLB::requiredProxiesOnProcGrid(PatchID id, int neighborNodes[])
00799 {
00800   enum proxyHere { No, Yes };
00801   int numPes = CkNumPes();
00802   proxyHere *proxyNodes = new proxyHere[numPes];
00803   int nProxyNodes;
00804   int i, j, k, l;
00805 
00806   int xsize = 0, ysize = 0, zsize = 0, tsize = 0;
00807   int my_x = 0, my_y = 0, my_z = 0, my_t = 0;
00808 
00809   PatchMap* patchMap = PatchMap::Object();
00810   int myNode = patchMap->node(id);
00811     
00812   TopoManager tmgr;
00813   xsize = tmgr.getDimNX();
00814   ysize = tmgr.getDimNY();
00815   zsize = tmgr.getDimNZ();
00816   tsize = tmgr.getDimNT();
00817   
00818   tmgr.rankToCoordinates(myNode, my_x, my_y, my_z, my_t);
00819   
00820   if(xsize * ysize * zsize * tsize != CkNumPes()) {
00821     delete [] proxyNodes;
00822     return requiredProxies(id, neighborNodes);
00823   }  
00824 
00825   // Note all home patches.
00826   for ( i = 0; i < numPes; ++i )
00827   {
00828     proxyNodes[i] = No;
00829   }
00830   nProxyNodes = 0;
00831 
00832   // Check all two-away neighbors.
00833   // This is really just one-away neighbors, since 
00834   // two-away always returns zero: RKB
00835   PatchID neighbors[1 + PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
00836 
00837   // Assign a proxy to all your neighbors. But dont increment counter
00838   // because these have to be there anyway.
00839   neighbors[0] = id;  
00840   int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
00841   
00842   // Small Flag chooses between different loadbalancing schemes.
00843   // Small Flag == true, patches are close to each other
00844   // false, patches are far from each other
00845   CmiBool smallFlag = CmiFalse;
00846   double pnodes = CkNumPes();
00847   pnodes *= 0.25;    
00848   smallFlag = (patchMap->numPatches() > pnodes )?1:0;
00849 
00850   //If there are lot of patches its likely they will all be neighbors, 
00851   //so all we need to do is to place proxies on downstream patches.
00852   //if (smallFlag) {
00853   for ( i = 1; i < numNeighbors; ++i )
00854     {
00855       int proxyNode = patchMap->basenode(neighbors[i]);
00856       
00857       if (proxyNode != myNode)
00858         if (proxyNodes[proxyNode] == No)
00859           {
00860             proxyNodes[proxyNode] = Yes;
00861             neighborNodes[nProxyNodes] = proxyNode;
00862             nProxyNodes++;
00863           }
00864     }
00865   //}
00866  
00867   if (step() > 2) {
00868     delete [] proxyNodes;
00869     return nProxyNodes;
00870   }
00871  
00872   // Place numPesPerPatch proxies on the 3d torus neighbors of a processor
00873 
00874   int numPatches = patchMap->numPatches();
00875   int emptyNodes = numPes - numPatches;
00876   //if ( emptyNodes > numPatches ) {
00877   
00878   int nodesPerPatch = nProxyNodes + 4 * (emptyNodes-1) / numPatches + 1;
00879   int proxyNode = 0 ;
00880   int proxy_x=0, proxy_y=0, proxy_z=0;
00881   
00882   //Choose from the 26 neighbors of mynode.
00883   //CkAssert(nodesPerPatch - nProxyNodes <= 26);  
00884   //Too few patches otherwise, try twoaway?
00885   
00886   for(k=-1; k<= 1; k++) {
00887     proxy_z = (my_z + k + zsize) % zsize;
00888     for(j=-1; j <= 1; j++) {
00889       proxy_y = (my_y + j + ysize) % ysize;
00890       for(i = -1; i <= 1; i++) {
00891         proxy_x = (my_x + i + xsize) % xsize;
00892         for(l = 0; l < tsize; l++) {
00893           if(i == 0 && j == 0 && k == 0 && l == 0)
00894             continue;
00895 
00896           proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
00897 
00898           if((! patchMap->numPatchesOnNode(proxyNode) || !smallFlag) &&
00899              proxyNodes[proxyNode] == No) {
00900             proxyNodes[proxyNode] = Yes;
00901             neighborNodes[nProxyNodes] = proxyNode;
00902             nProxyNodes++;
00903           }
00904           
00905           if(nProxyNodes >= nodesPerPatch || 
00906              nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00907             break;
00908         } // end for
00909 
00910         if(nProxyNodes >= nodesPerPatch || 
00911            nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00912           break;
00913       } // end for
00914       
00915       if(nProxyNodes >= nodesPerPatch || 
00916          nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00917         break;    
00918     } // end for
00919 
00920     if(nProxyNodes >= nodesPerPatch || 
00921        nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00922       break;      
00923   } // end for
00924 
00925 #if 1
00926   if(!smallFlag) {
00927     for(k=-2; k<= 2; k+=2) {
00928       proxy_z = (my_z + k + zsize) % zsize;
00929       for(j=-2; j <= 2; j+=2) {
00930         proxy_y = (my_y + j + ysize) % ysize;
00931         for(i = -2; i <= 2; i+=2) {
00932           proxy_x = (my_x + i + xsize) % xsize;
00933           for(l = 0; l < tsize; l++) {
00934             if(i == 0 && j == 0 && k == 0 && l == 0)
00935               continue;
00936           
00937             proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
00938           
00939             if((! patchMap->numPatchesOnNode(proxyNode) || !smallFlag) &&
00940                proxyNodes[proxyNode] == No) {
00941               proxyNodes[proxyNode] = Yes;
00942               neighborNodes[nProxyNodes] = proxyNode;
00943               nProxyNodes++;
00944             }
00945             
00946             if(nProxyNodes >= nodesPerPatch || 
00947                nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00948               break;
00949           } // end for
00950 
00951           if(nProxyNodes >= nodesPerPatch || 
00952              nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00953             break;
00954         } // end for
00955         
00956         if(nProxyNodes >= nodesPerPatch || 
00957            nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00958           break;          
00959       } // end for
00960 
00961       if(nProxyNodes >= nodesPerPatch || 
00962          nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
00963         break;    
00964     } // end for
00965   }
00966 
00967 #else
00968   #if 0
00969   const SimParameters* params = Node::Object()->simParameters;
00970 
00971   if(!smallFlag) {
00972     //Add two-away proxies
00973     if(patchMap->numaway_a() == 2) {
00974       proxy_y = (my_y + 2) % ysize;
00975       proxy_x = my_x  % xsize;
00976       proxy_z = my_z  % zsize;
00977       
00978       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
00979       if(proxyNodes[proxyNode] == No) {
00980         proxyNodes[proxyNode] = Yes;
00981         neighborNodes[nProxyNodes] = proxyNode;
00982       nProxyNodes++;
00983       }
00984       
00985       proxy_y = (my_y - 2 + ysize) % ysize;
00986       proxy_x = my_x  % xsize;
00987       proxy_z = my_z % zsize;
00988       
00989       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
00990       if(proxyNodes[proxyNode] == No) {
00991         proxyNodes[proxyNode] = Yes;
00992         neighborNodes[nProxyNodes] = proxyNode;
00993         nProxyNodes++;
00994       }
00995     }
00996     
00997     //Add two away proxies
00998     if(patchMap->numaway_b() == 2) {
00999       proxy_y = my_y  % ysize;
01000       proxy_x = my_x  % xsize;
01001       proxy_z = (my_z + 2) % zsize;
01002       
01003       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
01004       if(proxyNodes[proxyNode] == No) {
01005         proxyNodes[proxyNode] = Yes;
01006         neighborNodes[nProxyNodes] = proxyNode;
01007         nProxyNodes++;
01008       }
01009       
01010       proxy_y = my_y  % ysize;
01011       proxy_x = my_x  % xsize;
01012       proxy_z = (my_z - 2 + zsize) % zsize;
01013       
01014       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
01015       if(proxyNodes[proxyNode] == No) {
01016         proxyNodes[proxyNode] = Yes;
01017         neighborNodes[nProxyNodes] = proxyNode;
01018         nProxyNodes++;
01019       }
01020     }
01021     
01022     //Add two away proxies
01023     if(patchMap->numaway_c() == 2) {
01024       proxy_y = my_y  % ysize;
01025       proxy_x = (my_x + 2) % xsize;
01026       proxy_z = my_z  % zsize;
01027       
01028       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
01029       if(proxyNodes[proxyNode] == No) {
01030         proxyNodes[proxyNode] = Yes;
01031         neighborNodes[nProxyNodes] = proxyNode;
01032       nProxyNodes++;
01033       }
01034       
01035       proxy_y = my_y  % ysize;
01036       proxy_x = (my_x  - 2 + xsize) % xsize;
01037       proxy_z = my_z % zsize;
01038       
01039       proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
01040       if(proxyNodes[proxyNode] == No) {
01041         proxyNodes[proxyNode] = Yes;
01042         neighborNodes[nProxyNodes] = proxyNode;
01043         nProxyNodes++;
01044       }
01045     }
01046   }
01047   #endif
01048 #endif
01049   
01050   // CkPrintf("Returning %d proxies\n", nProxyNodes);
01051 
01052   delete [] proxyNodes;
01053   return nProxyNodes;
01054 }
01055 
01056 #endif

Generated on Sat May 18 04:07:17 2013 for NAMD by  doxygen 1.3.9.1