NAMD
NamdCentLB.C
Go to the documentation of this file.
1 /*****************************************************************************
2  * $Source: /home/cvs/namd/cvsroot/namd2/src/NamdCentLB.C,v $
3  * $Author: jim $
4  * $Date: 2017/03/30 20:06:17 $
5  * $Revision: 1.125 $
6  *****************************************************************************/
7 
8 #if !defined(WIN32) || defined(__CYGWIN__)
9 #include <unistd.h>
10 #endif
11 #include <fcntl.h>
12 
13 #include "InfoStream.h"
14 #include "NamdCentLB.h"
15 #include "NamdCentLB.def.h"
16 #include "Node.h"
17 #include "PatchMap.h"
18 #include "ComputeMap.h"
19 #include "LdbCoordinator.h"
20 
21 // #define DUMP_LDBDATA 1
22 // #define LOAD_LDBDATA 1
23 
// Per-PE load table shared with the rest of the program. It starts out
// unallocated; the balancer-creation code in this file sizes it to CkNumPes()
// and zero-fills it, and Strategy() stores each processor's final load in it.
double *cpuloads = nullptr;
25 
27  // CkPrintf("[%d] creating NamdCentLB %d\n",CkMyPe(),loadbalancer);
28  int seqno = LdbInfra::Object()->getLoadbalancerTicket();
29  loadbalancer = CProxy_NamdCentLB::ckNew(CkLBOptions(seqno));
30  // CkPrintf("[%d] created NamdCentLB %d\n",CkMyPe(),loadbalancer);
31  if (CkMyRank() == 0 && cpuloads == NULL) {
32  cpuloads = new double[CkNumPes()];
33  CmiAssert(cpuloads != NULL);
34  for (int i=0; i<CkNumPes(); i++) cpuloads[i] = 0.0;
35  }
36 }
37 
39  return new NamdCentLB((CkMigrateMessage*)NULL);
40 }
41 
45 NamdCentLB::NamdCentLB(CkMigrateMessage *msg): CentralLB(msg) {
46  processorArray = 0;
47  patchArray = 0;
48  computeArray = 0;
49 }
50 
51 NamdCentLB::NamdCentLB(const CkLBOptions& opt): CentralLB(opt)
52 {
53  // if (CkMyPe()==0)
54  // CkPrintf("[%d] NamdCentLB created\n",CkMyPe());
55  processorArray = 0;
56  patchArray = 0;
57  computeArray = 0;
58 }
59 
60 /*
61 NamdCentLB::~NamdCentLB()
62 {
63  delete [] processorArray;
64  delete [] patchArray;
65  delete [] computeArray;
66 }
67 */
68 
69 bool NamdCentLB::QueryBalanceNow(int _step)
70 {
71  // CkPrintf("[%d] Balancing on step %d\n",CkMyPe(),_step);
72  if ( LdbCoordinator::Object()->takingLdbData ) {
73  return true;
74  } else {
75  return false;
76  }
77 }
78 
79 bool NamdCentLB::QueryDumpData()
80 {
81 #if 0
82  if (LdbCoordinator::Object()->ldbCycleNum == 1) return true;
83  if (LdbCoordinator::Object()->ldbCycleNum == 2) return true;
84 #endif
85  return false;
86 }
87 
// Central load-balancing entry point.
//
// Allocates the processor/patch/compute tables if needed, fills them from
// `stats` via buildData(), then either assigns partition counts to computes
// (when step() == 1) or runs the placement algorithm selected by
// simParams->ldbStrategy. Every compute whose processor differs from its old
// processor is recorded in the returned CLBMigrateMsg and in the ComputeMap.
// Before returning, the per-processor loads are copied into cpuloads and the
// three tables are deleted and reset to NULL.
//
// NOTE(review): numPatches and simParams are used below but their
// declarations are not visible in this listing (listing lines 94 and 97 are
// absent) -- confirm against the real file.
88 CLBMigrateMsg* NamdCentLB::Strategy(LDStats* stats)
89 {
90  // CkPrintf("LDB: All statistics received at %f, %f\n",
91  // CmiTimer(),CmiWallTimer());
92 
93  int numProcessors = stats->nprocs();
95  ComputeMap *computeMap = ComputeMap::Object();
96  const int numComputes = computeMap->numComputes();
98 
99  // these sizes should never change
100  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
101  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
102  if ( ! computeArray ) computeArray = new computeInfo[numComputes];
103 
104  int nMoveableComputes = buildData(stats);
105 
106 #if LDB_DEBUG
107 #define DUMP_LDBDATA 1
108 #define LOAD_LDBDATA 1
109 #endif
110 
111 #if DUMP_LDBDATA
112  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
113 #elif LOAD_LDBDATA
114  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
115  // CkExit();
116 #endif
117 
 // Load summary: `total` first accumulates the load of every moveable
 // compute, then the background load of each available processor is added,
 // so averageLoad is the mean total load per available processor.
118  double averageLoad = 0.;
119  double avgCompute = 0.;
120  if ( nMoveableComputes ) {
121  int i;
122  double total = 0.;
123  double maxCompute = 0.;
124  int maxi = 0;
125  for (i=0; i<nMoveableComputes; i++) {
126  double load = computeArray[i].load;
127  total += load;
128  if ( load > maxCompute ) { maxCompute = load; maxi = i; }
129  }
130  avgCompute = total / nMoveableComputes;
131 
132  int P = stats->nprocs();
133  int numPesAvailable = 0;
134  for (i=0; i<P; i++) {
135  if (processorArray[i].available) {
136  ++numPesAvailable;
137  total += processorArray[i].backgroundLoad;
138  }
139  }
140  if (numPesAvailable == 0)
141  NAMD_die("No processors available for load balancing!\n");
142 
143  averageLoad = total/numPesAvailable;
144  CkPrintf("LDB: Largest compute %d load %f is %.1f%% of average load %f\n",
145  LdbIdField(computeArray[maxi].handle.id, 0),
146  maxCompute, 100. * maxCompute / averageLoad, averageLoad);
147  CkPrintf("LDB: Average compute %f is %.1f%% of average load %f\n",
148  avgCompute, 100. * avgCompute / averageLoad, averageLoad);
149  }
150 
151  if ( step() == 1 ) {
152  // compute splitting only
153  // partitions are stored as char but mostly limited by
154  // high load noise at low outer-loop iteration counts
155  int maxParts = 10;
156 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
157 //split LCPO compute very small, else CUDA compute is delayed
158  if (simParams->LCPOOn) {
159  maxParts = 20;
160  }
161 #endif
162  int totalAddedParts = 0;
 // Target load per partition: a tenth of the average load, but at least
 // twice the average compute; ldbRelativeGrainsize (if positive) overrides both.
163  double maxCompute = averageLoad / 10.;
164  if ( maxCompute < 2. * avgCompute ) maxCompute = 2. * avgCompute;
165  if ( simParams->ldbRelativeGrainsize > 0. ) {
166  maxCompute = averageLoad * simParams->ldbRelativeGrainsize;
167  }
168  CkPrintf("LDB: Partitioning computes with target load %f\n", maxCompute);
169  double maxUnsplit = 0.;
170  for (int i=0; i<nMoveableComputes; i++) {
 // No migration happens on this step: every compute stays where it was.
171  computeArray[i].processor = computeArray[i].oldProcessor;
172  const int cid = LdbIdField(computeArray[i].handle.id, 0);
173  const double load = computeArray[i].load;
174  if ( computeMap->numPartitions(cid) == 0 ) {
175  if ( load > maxUnsplit ) maxUnsplit = load;
176  continue;
177  }
 // Partition count is clamped to the range [1, maxParts].
178  int nparts = (int) ceil(load / maxCompute);
179  if ( nparts > maxParts ) nparts = maxParts;
180  if ( nparts < 1 ) nparts = 1;
181  if ( 0 && nparts > 1 ) {
182  CkPrintf("LDB: Partitioning compute %d with load %f by %d\n",
183  cid, load, nparts);
184  }
185  computeMap->setNewNumPartitions(cid,nparts);
186  totalAddedParts += nparts - 1;
187  }
188  CkPrintf("LDB: Increased migratable compute count from %d to %d\n",
189  nMoveableComputes,nMoveableComputes+totalAddedParts);
190  CkPrintf("LDB: Largest unpartitionable compute is %f\n", maxUnsplit);
191  } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
192  if (step() < 4)
193  TorusLB(computeArray, patchArray, processorArray,
194  nMoveableComputes, numPatches, numProcessors);
195  else
196  RefineTorusLB(computeArray, patchArray, processorArray,
197  nMoveableComputes, numPatches, numProcessors, 1);
198  } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
199  TorusLB(computeArray, patchArray, processorArray,
200  nMoveableComputes, numPatches, numProcessors);
201  } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
202  RefineTorusLB(computeArray, patchArray, processorArray,
203  nMoveableComputes, numPatches, numProcessors, 1);
204  } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
205  if (step() < 4)
206  Alg7(computeArray, patchArray, processorArray,
207  nMoveableComputes, numPatches, numProcessors);
208  else
209  RefineOnly(computeArray, patchArray, processorArray,
210  nMoveableComputes, numPatches, numProcessors);
211  }
212 
213 #if LDB_DEBUG && USE_TOPOMAP
214  TopoManager tmgr;
215  int pe1, pe2, pe3, hops=0;
216  /* This is double counting the hops
217  for(int i=0; i<nMoveableComputes; i++)
218  {
219  pe1 = computeArray[i].processor;
220  pe2 = patchArray[computeArray[i].patch1].processor;
221  pe3 = patchArray[computeArray[i].patch2].processor;
222  hops += tmgr.getHopsBetweenRanks(pe1, pe2);
223  if(computeArray[i].patch1 != computeArray[i].patch2)
224  hops += tmgr.getHopsBetweenRanks(pe1, pe3);
225  }*/
226  for (int i=0; i<numPatches; i++) {
227  //int num = patchArray[i].proxiesOn.numElements();
228  pe1 = patchArray[i].processor;
229  Iterator nextProc;
230  processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
231  while (p) {
232  pe2 = p->Id;
233  hops += tmgr.getHopsBetweenRanks(pe1, pe2);
234  p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
235  }
236  }
237  CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
238 #endif
239 
240 #if DUMP_LDBDATA
241  dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
242 #elif LOAD_LDBDATA
243  dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
244  // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
245  // CkExit();
246 #endif
247 
248  // For error checking:
249  // Count up computes, to see if somebody doesn't have any computes
250  int i;
251 #if 0
252  int* computeCount = new int[numProcessors];
253  for(i=0; i<numProcessors; i++)
254  computeCount[i]=0;
255  for(i=0; i<nMoveableComputes; i++)
256  computeCount[computeArray[i].processor]++;
257  for(i=0; i<numProcessors; i++) {
258  if (computeCount[i]==0)
259  iout << iINFO <<"Warning: Processor " << i
260  << " has NO moveable computes.\n" << endi;
261  }
262  delete [] computeCount;
263 #endif
264 
 // Collect one MigrateInfo per compute whose processor changed.
265  std::vector<MigrateInfo *> migrateInfo;
266  for(i=0;i<nMoveableComputes;i++) {
267  if (computeArray[i].processor != computeArray[i].oldProcessor) {
268  // CkPrintf("[%d] Obj %d migrating from %d to %d\n",
269  // CkMyPe(),computeArray[i].handle.id.id[0],
270  // computeArray[i].processor,computeArray[i].oldProcessor);
271  MigrateInfo *migrateMe = new MigrateInfo;
272  migrateMe->obj = computeArray[i].handle;
273  migrateMe->from_pe = computeArray[i].oldProcessor;
274  migrateMe->to_pe = computeArray[i].processor;
275  migrateInfo.push_back(migrateMe);
276 
277  // sneak in updates to ComputeMap
278  computeMap->setNewNode(LdbIdField(computeArray[i].handle.id, 0),
279  computeArray[i].processor);
280  }
281  }
282 
283  const int migrate_count=migrateInfo.size();
284  // CkPrintf("NamdCentLB migrating %d elements\n",migrate_count);
285  CLBMigrateMsg* msg = new(migrate_count,CkNumPes(),CkNumPes(),0) CLBMigrateMsg;
286 
 // Copy each record into the message and free the temporary heap copy.
287  msg->n_moves = migrate_count;
288  for(i=0; i < migrate_count; i++) {
289  MigrateInfo* item = migrateInfo[i];
290  msg->moves[i] = *item;
291  delete item;
292  migrateInfo[i] = nullptr;
293  }
294 
 // Publish the final per-processor loads through the global cpuloads array.
295  for (i=0; i<numProcessors; i++) {
296  cpuloads[i] = processorArray[i].load;
297  }
298 
 // The tables are rebuilt from scratch on the next call (see the allocation
 // at the top of this function), so release them here.
299  delete [] processorArray;
300  delete [] patchArray;
301  delete [] computeArray;
302 
303  processorArray = NULL;
304  patchArray = NULL;
305  computeArray = NULL;
306 
307  return msg;
308 };
309 
310 #ifndef WIN32
311 
312 void NamdCentLB::dumpDataASCII(char *file, int numProcessors,
313  int numPatches, int numComputes)
314 {
315  char filename[128];
316  sprintf(filename, "%s.%d", file, step());
317  FILE* fp = fopen(filename,"w");
318  if (fp == NULL){
319  perror("dumpLDStatsASCII");
320  return;
321  }
322  CkPrintf("***** DUMP data to file: %s ***** \n", filename);
323  fprintf(fp,"%d %d %d\n",numProcessors,numPatches,numComputes);
324 
325  int i;
326  for(i=0;i<numProcessors;i++) {
327  processorInfo* p = processorArray + i;
328  fprintf(fp,"%d %e %e %e %e\n",p->Id,p->load,p->backgroundLoad,p->computeLoad,p->idleTime);
329  }
330 
331  for(i=0;i < numPatches; i++) {
332  patchInfo* p = patchArray + i;
333  fprintf(fp,"%d %e %d %d\n",p->Id,p->load,p->processor,p->numAtoms);
334  }
335 
336  for(i=0; i < numComputes; i++) {
337  computeInfo* c = computeArray + i;
338  fprintf(fp,"%d %e %d %d %d %d",c->Id,c->load,c->patch1,c->patch2,
339  c->processor,c->oldProcessor);
340  fprintf(fp, "\n");
341  }
342 
343  // dump patchSet
344  for (i=0; i< numProcessors; i++) {
345  int num = processorArray[i].proxies.numElements();
346  fprintf(fp, "%d %d: ", i, num);
347  Iterator nextProxy;
348  patchInfo *p = (patchInfo *)processorArray[i].proxies.
349  iterator((Iterator *)&nextProxy);
350  while (p) {
351  fprintf(fp, "%d ", p->Id);
352  p = (patchInfo *)processorArray[i].proxies.
353  next((Iterator*)&nextProxy);
354  }
355  fprintf(fp, "\n");
356  }
357  // dump proxiesOn
358  for (i=0; i<numPatches; i++) {
359  int num = patchArray[i].proxiesOn.numElements();
360  fprintf(fp, "%d %d: ", i, num);
361  Iterator nextProc;
362  processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.
363  iterator((Iterator *)&nextProc);
364  while (p) {
365  fprintf(fp, "%d ", p->Id);
366  p = (processorInfo *)patchArray[i].proxiesOn.
367  next((Iterator*)&nextProc);
368  }
369  fprintf(fp, "\n");
370  }
371 
372  fclose(fp);
373  //CkExit();
374 }
375 
376 void NamdCentLB::loadDataASCII(char *file, int &numProcessors,
377  int &numPatches, int &numComputes)
378 {
379  char filename[128];
380  //sprintf(filename, "%s.%d", file, step());
381  sprintf(filename, "%s", file);
382 
383  CkPrintf("***** Load ascii data from file: %s ***** \n", filename);
384 
385  FILE* fp = fopen(filename, "r");
386  if (fp == NULL){
387  perror("loadDataASCII");
388  return;
389  }
390 
391  fscanf(fp,"%d %d %d",&numProcessors,&numPatches,&numComputes);
392 
393  printf("numProcs: %d numPatches: %d numComputes: %d\n", numProcessors,numPatches, numComputes);
394 
395  delete [] processorArray;
396  delete [] patchArray;
397  delete [] computeArray;
398  processorArray = new processorInfo[numProcessors];
399  patchArray = new patchInfo[numPatches];
400  computeArray = new computeInfo[numComputes];
401 
402  int i;
403  for(i=0;i<numProcessors;i++) {
404  processorInfo* p = processorArray + i;
405  fscanf(fp,"%d %le %le %le", &p->Id, &p->load, &p->backgroundLoad, &p->computeLoad);
406  fscanf(fp,"%le\n", &p->idleTime);
407  if (p->Id != i) CmiAbort("Reading processorArray error!");
408 // p->backgroundLoad = 0.0;
409  }
410 
411  for(i=0;i < numPatches; i++) {
412  patchInfo* p = patchArray + i;
413  fscanf(fp,"%d %le %d %d\n",&p->Id,&p->load,&p->processor,&p->numAtoms);
414  if (p->Id != i || p->processor > numProcessors || p->processor < 0)
415  CmiAbort("Reading patchArray error!");
416  }
417 
418  for(i=0; i < numComputes; i++) {
419  computeInfo* c = computeArray + i;
420  fscanf(fp,"%d %le %d %d %d %d",&c->Id,&c->load,&c->patch1,&c->patch2,
421  &c->processor,&c->oldProcessor);
422 
423  if (c->patch1 < 0 || c->patch1 > numPatches || c->patch2 < 0 || c->patch2 > numPatches)
424  CmiAbort("Reading computeArray error!");
425  // printf("%d %e %d %d %d %d\n", c->Id,c->load,c->patch1,c->patch2,c->processor,c->oldProcessor);
426  }
427 
428  // dump patchSet
429  for (i=0; i< numProcessors; i++) {
430  int num, curp;
431  fscanf(fp,"%d %d: ",&curp, &num);
432  if(curp != i)
433  CmiAbort("Reading patchsSet error!");
434  for (int j=0; j<num; j++) {
435  int id;
436  fscanf(fp,"%d",&id);
437  processorArray[i].proxies.unchecked_insert(&patchArray[id]);
438  }
439  }
440  // dump proxiesOn
441  for (i=0; i<numPatches; i++) {
442  int num, curp;
443  fscanf(fp,"%d %d: ",&curp, &num);
444  if(curp != i)
445  CmiAbort("Reading proxiesOn error!");
446  for (int j=0; j<num; j++) {
447  int id;
448  fscanf(fp,"%d",&id);
449  patchArray[i].proxiesOn.insert(&processorArray[id]);
450  }
451  }
452 
453  fclose(fp);
454 }
455 #endif
456 
457 extern int isPmeProcessor(int);
458 #ifdef MEM_OPT_VERSION
459 extern int isOutputProcessor(int);
460 #endif
461 #if defined(NAMD_MIC)
462 extern int isMICProcessor(int);
463 #endif
464 
// Fills processorArray, patchArray and computeArray from the collected
// load-balancing statistics and returns the number of moveable computes
// (the number of computeArray entries written).
//
// Per processor: records its Id, idle time and a background load scaled by a
// factor that depends on whether it is a PME processor, hosts patches, or
// neither; then marks processors unavailable according to the ldbUnload*
// options. Per object in stats->objData: the wall time is either added to the
// owning processor's background load or, for a migratable compute with
// non-zero wall time, recorded as a computeArray entry. stats->clear() is
// called before returning.
//
// NOTE(review): simParams is used below but its declaration is not visible in
// this listing (listing line 471 is absent) -- confirm against the real file.
465 int NamdCentLB::buildData(LDStats* stats)
466 {
467  int n_pes = stats->nprocs();
468 
469  PatchMap* patchMap = PatchMap::Object();
470  ComputeMap* computeMap = ComputeMap::Object();
472 
473  BigReal bgfactor = simParams->ldbBackgroundScaling;
474  BigReal pmebgfactor = simParams->ldbPMEBackgroundScaling;
475  BigReal homebgfactor = simParams->ldbHomeBackgroundScaling;
476  int pmeOn = simParams->PMEOn;
477  int unLoadPme = simParams->ldbUnloadPME;
478  int pmeBarrier = simParams->PMEBarrier;
479  int unLoadZero = simParams->ldbUnloadZero;
480  int unLoadOne = simParams->ldbUnloadOne;
481  int unLoadIO= simParams->ldbUnloadOutputPEs;
482  int i;
 // Initialise every processor entry; the background scaling factor is chosen
 // in priority order: PME processor, processor with patches, anything else.
483  for (i=0; i<n_pes; ++i) {
484  processorArray[i].Id = i;
485  processorArray[i].available = true;
486  if ( pmeOn && isPmeProcessor(i) ) {
487  processorArray[i].backgroundLoad = pmebgfactor * stats->procs[i].bg_walltime;
488  } else if (patchMap->numPatchesOnNode(i) > 0) {
489  processorArray[i].backgroundLoad = homebgfactor * stats->procs[i].bg_walltime;
490  } else {
491  processorArray[i].backgroundLoad = bgfactor * stats->procs[i].bg_walltime;
492  }
493  processorArray[i].idleTime = stats->procs[i].idletime;
494  processorArray[i].load = processorArray[i].computeLoad = 0.0;
495  }
496 
497 /* *********** this code is defunct *****************
498 #if 0
499  double bgfactor = 1.0 + 1.0 * CkNumPes()/1000.0;
500  if ( bgfactor > 2.0 ) bgfactor = 2.0;
501  iout << iINFO << "Scaling background load by " << bgfactor << ".\n" << endi;
502  int i;
503  for (i=0; i<n_pes; i++) {
504  processorArray[i].Id = i;
505  processorArray[i].backgroundLoad = bgfactor * stats[i].bg_walltime;
506  }
507 
508  double bg_weight = 0.7;
509 
510  int i;
511  for (i=0; i<n_pes; i++) {
512  processorArray[i].Id = i;
513  if (patchMap->numPatchesOnNode(i) > 0)
514  processorArray[i].backgroundLoad = bg_weight * stats->procs[i].bg_walltime;
515  else
516  processorArray[i].backgroundLoad = stats[i].bg_walltime;
517  }
518 
519  //Modification to reduce the coputeload on PME processors
520  const SimParameters* simParams = Node::Object()->simParameters;
521 
522  // CkPrintf("BACKGROUND LOAD\n");
523  if(simParams->PMEOn) {
524  double bgfactor = 1.0 + 1.0 * CkNumPes()/1000.0;
525  if ( bgfactor > 2.0 ) bgfactor = 2.0;
526  for (i=0; i<n_pes; i++) {
527  // CkPrintf("BG[%d] = %5.5lf,", i, processorArray[i].backgroundLoad);
528  if(isPmeProcessor(i)) {
529  processorArray[i].backgroundLoad *= bgfactor;
530  }
531  // CkPrintf("%5.5lf; ", processorArray[i].backgroundLoad);
532  }
533  }
534  // CkPrintf("\n");
535 #endif
536 *********** end of defunct code *********** */
537 
 // NOTE(review): indices 0 and 1 are written without checking n_pes; this
 // presumably relies on these options only being set on runs with enough
 // processors -- confirm.
538  if (unLoadZero) processorArray[0].available = false;
539  if (unLoadOne) processorArray[1].available = false;
540 
541  // if all pes are Pme, disable this flag
542  if (pmeOn && unLoadPme) {
543  for (i=0; i<n_pes; i++) {
544  if (!isPmeProcessor(i)) break;
545  }
546  if (i == n_pes) {
547  iout << iINFO << "Turned off unLoadPme flag!\n" << endi;
548  unLoadPme = 0;
549  }
550  }
551 
552  if (pmeOn && unLoadPme) {
553  for (i=0; i<n_pes; i++) {
554  if ((pmeBarrier && i==0) || isPmeProcessor(i))
555  processorArray[i].available = false;
556  }
557  }
558  // if all pes are output, disable this flag
559 #ifdef MEM_OPT_VERSION
560 
561  if (unLoadIO) {
562  if (simParams->numoutputprocs == n_pes) {
563  iout << iINFO << "Turned off unLoadIO flag!\n" << endi;
564  unLoadIO = 0;
565  }
566  }
567  if (unLoadIO){
568  iout << iINFO << "Testing for output processors!\n" << endi;
569  for (i=0; i<n_pes; i++) {
570  if (isOutputProcessor(stats->procs[i].pe))
571  {
572  // iout << iINFO << "Removed output PE "<< stats->procs[i].pe <<" from available list!\n" << endi;
573  processorArray[i].available = false;
574  }
575  else
576  {
577  // iout << iINFO << "Nonoutput PE "<< stats->procs[i].pe <<" is in available list!\n" << endi;
578  }
579  }
580  }
581 #endif
582 
583  // Unload PEs driving MIC devices, if need be
584  #if defined(NAMD_MIC)
585  if (simParams->mic_unloadMICPEs != 0) {
586  for (i = 0; i < n_pes; i++) {
587  if (isMICProcessor(i) != 0) { processorArray[i].available = false; }
588  }
589  }
590  #endif
591 
592  int nMoveableComputes=0;
593  int nProxies = 0; // total number of estimated proxies
594  int nIdleComputes = 0;
595 
 // Classify every recorded object. Each object's wall time ends up either in
 // its processor's backgroundLoad or, for non-idle migratable computes, in
 // computeLoad together with a new computeArray entry.
596  int j;
597  const auto nObjs = stats->objData.size();
598  for (j=0; j < nObjs; j++) {
599  const LDObjData &this_obj = stats->objData[j];
 // frompe is not referenced again in this function; the code below reads
 // stats->from_proc[j] directly.
600  int frompe = stats->from_proc[j];
601 
602  // filter out non-NAMD managed objects (like PME array)
603  if (this_obj.omID().id.idx != 1) {
604  // CkPrintf("non-NAMD object %d on pe %d with walltime %lf\n",
605  // this_obj.id().id[0], stats->from_proc[j], this_obj.wallTime);
606  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
607  continue;
608  }
609 
610  if (LdbIdField(this_obj.id(), 1) == PATCH_TYPE) { // Its a patch
611  const int pid = LdbIdField(this_obj.id(), 0);
612  int neighborNodes[PatchMap::MaxOneAway + PatchMap::MaxTwoAway];
613 
614  patchArray[pid].Id = pid;
615  patchArray[pid].numAtoms = 0;
616  patchArray[pid].processor = stats->from_proc[j];
617  const int numProxies =
618 #if USE_TOPOMAP
619  requiredProxiesOnProcGrid(pid,neighborNodes);
620 #else
621  requiredProxies(pid, neighborNodes);
622 #endif
623 
624  nProxies += numProxies;
625 
 // Record the proxy relation in both directions: processor -> patch and
 // patch -> processor.
626  for (int k=0; k<numProxies; k++) {
627  processorArray[neighborNodes[k]].proxies.unchecked_insert(&patchArray[pid]);
628  patchArray[pid].proxiesOn.unchecked_insert(&processorArray[neighborNodes[k]]);
629  }
630  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
631  } else if (LdbIdField(this_obj.id(), 1) == BONDED_TYPE) { // Its a bonded compute
632  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
633  } else if (this_obj.migratable) { // Its a compute
634  if ( this_obj.wallTime == 0. ) { // don't migrate idle computes
635  ++nIdleComputes;
636  } else {
637  const int cid = LdbIdField(this_obj.id(), 0);
638  const int p0 = computeMap->pid(cid,0);
639 
640  // For self-interactions, just return the same pid twice
641  int p1;
642  if (computeMap->numPids(cid) > 1)
643  p1 = computeMap->pid(cid,1);
644  else p1 = p0;
645  computeArray[nMoveableComputes].Id = cid;
646  computeArray[nMoveableComputes].oldProcessor = stats->from_proc[j];
647  processorArray[stats->from_proc[j]].computeLoad += this_obj.wallTime;
 // -1 marks the compute as not yet placed.
648  computeArray[nMoveableComputes].processor = -1;
649  computeArray[nMoveableComputes].patch1 = p0;
650  computeArray[nMoveableComputes].patch2 = p1;
651  computeArray[nMoveableComputes].handle = this_obj.handle;
652  computeArray[nMoveableComputes].load = this_obj.wallTime;
653  nMoveableComputes++;
654  }
655  } else {
656  processorArray[stats->from_proc[j]].backgroundLoad += this_obj.wallTime;
657  }
658  }
659 
660  if ( nIdleComputes )
661  CkPrintf("LDB: %d computes have load of zero\n", nIdleComputes);
662 
663 /* *********** this code is defunct *****************
664 #if 0
665  int averageProxy = nProxies / n_pes;
666  CkPrintf("total proxies: %d, avervage: %d\n", nProxies, averageProxy);
667  for (i=0; i<n_pes; i++) {
668  // too many proxies on this node, weight the background load
669  int proxies = processorArray[i].proxies.numElements();
670  if (proxies > averageProxy) {
671  double factor = 1.0*(proxies-averageProxy)/nProxies;
672  processorArray[i].backgroundLoad *= (1.0 + factor);
673  CkPrintf("On [%d]: too many proxies: %d, increased bg load by %f\n", i, nProxies, factor);
674  }
675  }
676 #endif
677 *********** end of defunct code *********** */
678 
 // Total load per processor is background plus compute load.
679  for (i=0; i<n_pes; i++) {
680  processorArray[i].load = processorArray[i].backgroundLoad + processorArray[i].computeLoad;
681  }
682  stats->clear();
683  return nMoveableComputes;
684 }
685 
686 // Figure out which proxies we will definitely create on other
687 // nodes, without regard for non-bonded computes. This code is swiped
688 // from ProxyMgr, and changes there probably need to be propagated here.
689 
// Writes into neighborNodes the distinct nodes, other than the patch's own
// node, on which patch `id` is expected to have a proxy, and returns how many
// entries were written.
//
// id             patch whose proxies are being estimated.
// neighborNodes  output array; the caller in this file sizes it as
//                PatchMap::MaxOneAway + PatchMap::MaxTwoAway entries.
//
// NOTE(review): `neighbors` is used below but its declaration is not visible
// in this listing (listing line 701 is absent) -- confirm against the real file.
690 int NamdCentLB::requiredProxies(PatchID id, int neighborNodes[])
691 {
692  PatchMap* patchMap = PatchMap::Object();
693  int myNode = patchMap->node(id);
694  int nProxyNodes = 0;
695 
 // IF_NEW_NODE expands to a linear search of neighborNodes[0..nProxyNodes)
 // for proxyNode followed by an `if` that is taken only when proxyNode was
 // not found. It declares `j`, so each use must sit in its own scope.
696 #define IF_NEW_NODE \
697  int j; \
698  for ( j=0; j<nProxyNodes && neighborNodes[j] != proxyNode; ++j ); \
699  if ( j == nProxyNodes )
700 
 // Base nodes of the patch itself and of its downstream neighbors.
702  neighbors[0] = id;
703  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
704  for ( int i = 0; i < numNeighbors; ++i ) {
705  const int proxyNode = patchMap->basenode(neighbors[i]);
706  if ( proxyNode != myNode ) {
707  IF_NEW_NODE {
708  neighborNodes[nProxyNodes] = proxyNode;
709  nProxyNodes++;
710  }
711  }
712  }
713 
714  // Distribute initial default proxies across empty processors.
715  // This shouldn't be necessary, but may constrain the load balancer
716  // and avoid placing too many proxies on a single processor. -JCP
717 
718  // This code needs to be turned off when the creation of ST is
719  // shifted to the load balancers -ASB
720 
721 #if 1
722  int numPes = CkNumPes();
723  int numPatches = patchMap->numPatches();
724  int emptyNodes = numPes - numPatches;
725  if ( emptyNodes > numPatches ) {
 // Target count per patch, capped at the size of the output array.
726  int nodesPerPatch = nProxyNodes + 1 + (emptyNodes-1) / numPatches;
727  int maxNodesPerPatch = PatchMap::MaxOneAway + PatchMap::MaxTwoAway;
728  if ( nodesPerPatch > maxNodesPerPatch ) nodesPerPatch = maxNodesPerPatch;
 // Walk forward from myNode over consecutive nodes that hold no patches.
729  int proxyNode = (myNode + 1) % numPes;
730  while ( nProxyNodes < nodesPerPatch &&
731  ! patchMap->numPatchesOnNode(proxyNode) ) {
732  if ( proxyNode != myNode ) {
733  IF_NEW_NODE {
734  neighborNodes[nProxyNodes] = proxyNode;
735  nProxyNodes++;
736  }
737  }
738  proxyNode = (proxyNode + 1) % numPes;
739  }
 // Then walk backward from myNode in the same way.
740  proxyNode = (myNode - 1 + numPes) % numPes;
741  while ( nProxyNodes < nodesPerPatch &&
742  ! patchMap->numPatchesOnNode(proxyNode) ) {
743  if ( proxyNode != myNode ) {
744  IF_NEW_NODE {
745  neighborNodes[nProxyNodes] = proxyNode;
746  nProxyNodes++;
747  }
748  }
749  proxyNode = (proxyNode - 1 + numPes) % numPes;
750  }
 // Finally scan all nodes once, skipping those that hold patches, until the
 // target is reached or every node has been visited.
751  proxyNode = (myNode + 1) % numPes;
752  int count = 0;
753  while ( nProxyNodes < nodesPerPatch ) {
754  if ( ! patchMap->numPatchesOnNode(proxyNode) && proxyNode != myNode ) {
755  IF_NEW_NODE {
756  neighborNodes[nProxyNodes] = proxyNode;
757  nProxyNodes++;
758  }
759  }
760  proxyNode = (proxyNode + 1) % numPes;
761  count ++; if (count == numPes) break; // we looped all
762  }
763  } else {
 // Few empty nodes: only the immediate predecessor and successor of myNode
 // are considered, without wrap-around.
764  int proxyNode = myNode - 1;
765  if ( proxyNode >= 0 && ! patchMap->numPatchesOnNode(proxyNode) ) {
766  if ( proxyNode != myNode ) {
767  IF_NEW_NODE {
768  neighborNodes[nProxyNodes] = proxyNode;
769  nProxyNodes++;
770  }
771  }
772  }
773  proxyNode = myNode + 1;
774  if ( proxyNode < numPes && ! patchMap->numPatchesOnNode(proxyNode) ) {
775  if ( proxyNode != myNode ) {
776  IF_NEW_NODE {
777  neighborNodes[nProxyNodes] = proxyNode;
778  nProxyNodes++;
779  }
780  }
781  }
782  }
783 #endif
784 
785  return nProxyNodes;
786 }
787 
788 #if USE_TOPOMAP
789 // Figure out which proxies we will definitely create on other nodes,
790 // without regard for non-bonded computes. This code is swiped from
791 // ProxyMgr, and changes there probably need to be propagated here.
792 // The proxies are placed on nearby processors on the 3d-grid along
793 // the X, Y, Z and T dimensions
794 
// Topology-aware counterpart of requiredProxies(): writes into neighborNodes
// the nodes on which patch `id` is expected to have a proxy and returns how
// many entries were written.
//
// Falls back to requiredProxies() when the TopoManager dimensions do not
// multiply out to CkNumPes(). Otherwise the base nodes of the downstream
// neighbor patches are added first; when step() > 2 that is the whole result.
// On earlier steps further nodes are taken from the grid positions at offsets
// -1/0/+1 (and, when smallFlag is false, -2/0/+2) around the patch's node.
//
// NOTE(review): `neighbors` is used below but its declaration is not visible
// in this listing (listing line 832 is absent) -- confirm against the real file.
795 int NamdCentLB::requiredProxiesOnProcGrid(PatchID id, int neighborNodes[])
796 {
797  enum proxyHere { No, Yes };
798  int numPes = CkNumPes();
 // One flag per PE so that a node is never added to neighborNodes twice.
799  proxyHere *proxyNodes = new proxyHere[numPes];
800  int nProxyNodes;
801  int i, j, k, l;
802 
803  int xsize = 0, ysize = 0, zsize = 0, tsize = 0;
804  int my_x = 0, my_y = 0, my_z = 0, my_t = 0;
805 
806  PatchMap* patchMap = PatchMap::Object();
807  int myNode = patchMap->node(id);
808 
809  TopoManager tmgr;
810  xsize = tmgr.getDimNX();
811  ysize = tmgr.getDimNY();
812  zsize = tmgr.getDimNZ();
813  tsize = tmgr.getDimNT();
814 
815  tmgr.rankToCoordinates(myNode, my_x, my_y, my_z, my_t);
816 
 // The grid does not cover all PEs: use the topology-agnostic version.
817  if(xsize * ysize * zsize * tsize != CkNumPes()) {
818  delete [] proxyNodes;
819  return requiredProxies(id, neighborNodes);
820  }
821 
822  // Note all home patches.
823  for ( i = 0; i < numPes; ++i )
824  {
825  proxyNodes[i] = No;
826  }
827  nProxyNodes = 0;
828 
829  // Check all two-away neighbors.
830  // This is really just one-away neighbors, since
831  // two-away always returns zero: RKB
833 
834  // Assign a proxy to all your neighbors. But dont increment counter
835  // because these have to be there anyway.
836  neighbors[0] = id;
837  int numNeighbors = 1 + patchMap->downstreamNeighbors(id,neighbors+1);
838 
839  // Small Flag chooses between different loadbalancing schemes.
840  // Small Flag == true, patches are close to each other
841  // false, patches are far from each other
 // smallFlag is true when the patch count exceeds a quarter of the PE count.
842  bool smallFlag = false;
843  double pnodes = CkNumPes();
844  pnodes *= 0.25;
845  smallFlag = (patchMap->numPatches() > pnodes )?1:0;
846 
847  //If there are lot of patches its likely they will all be neighbors,
848  //so all we need to do is to place proxies on downstream patches.
849  //if (smallFlag) {
 // Starts at 1: neighbors[0] is the patch itself.
850  for ( i = 1; i < numNeighbors; ++i )
851  {
852  int proxyNode = patchMap->basenode(neighbors[i]);
853 
854  if (proxyNode != myNode)
855  if (proxyNodes[proxyNode] == No)
856  {
857  proxyNodes[proxyNode] = Yes;
858  neighborNodes[nProxyNodes] = proxyNode;
859  nProxyNodes++;
860  }
861  }
862  //}
863 
 // After the first steps only the downstream-neighbor nodes are reported.
864  if (step() > 2) {
865  delete [] proxyNodes;
866  return nProxyNodes;
867  }
868 
869  // Place numPesPerPatch proxies on the 3d torus neighbors of a processor
870 
871  int numPatches = patchMap->numPatches();
872  int emptyNodes = numPes - numPatches;
873  //if ( emptyNodes > numPatches ) {
874 
875  int nodesPerPatch = nProxyNodes + 4 * (emptyNodes-1) / numPatches + 1;
876  int proxyNode = 0 ;
877  int proxy_x=0, proxy_y=0, proxy_z=0;
878 
879  //Choose from the 26 neighbors of mynode.
880  //CkAssert(nodesPerPatch - nProxyNodes <= 26);
881  //Too few patches otherwise, try twoaway?
882 
 // Visit grid positions at offsets -1..1 in z, y, x (with wrap-around) and
 // every t value. The same limit test is repeated after each loop level so
 // that reaching nodesPerPatch or the array capacity leaves all four loops.
883  for(k=-1; k<= 1; k++) {
884  proxy_z = (my_z + k + zsize) % zsize;
885  for(j=-1; j <= 1; j++) {
886  proxy_y = (my_y + j + ysize) % ysize;
887  for(i = -1; i <= 1; i++) {
888  proxy_x = (my_x + i + xsize) % xsize;
889  for(l = 0; l < tsize; l++) {
890  if(i == 0 && j == 0 && k == 0 && l == 0)
891  continue;
892 
893  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
894 
895  if((! patchMap->numPatchesOnNode(proxyNode) || !smallFlag) &&
896  proxyNodes[proxyNode] == No) {
897  proxyNodes[proxyNode] = Yes;
898  neighborNodes[nProxyNodes] = proxyNode;
899  nProxyNodes++;
900  }
901 
902  if(nProxyNodes >= nodesPerPatch ||
903  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
904  break;
905  } // end for
906 
907  if(nProxyNodes >= nodesPerPatch ||
908  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
909  break;
910  } // end for
911 
912  if(nProxyNodes >= nodesPerPatch ||
913  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
914  break;
915  } // end for
916 
917  if(nProxyNodes >= nodesPerPatch ||
918  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
919  break;
920  } // end for
921 
922 #if 1
 // Same scan at offsets -2, 0, +2, performed only when smallFlag is false.
923  if(!smallFlag) {
924  for(k=-2; k<= 2; k+=2) {
925  proxy_z = (my_z + k + zsize) % zsize;
926  for(j=-2; j <= 2; j+=2) {
927  proxy_y = (my_y + j + ysize) % ysize;
928  for(i = -2; i <= 2; i+=2) {
929  proxy_x = (my_x + i + xsize) % xsize;
930  for(l = 0; l < tsize; l++) {
931  if(i == 0 && j == 0 && k == 0 && l == 0)
932  continue;
933 
934  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z, l);
935 
936  if((! patchMap->numPatchesOnNode(proxyNode) || !smallFlag) &&
937  proxyNodes[proxyNode] == No) {
938  proxyNodes[proxyNode] = Yes;
939  neighborNodes[nProxyNodes] = proxyNode;
940  nProxyNodes++;
941  }
942 
943  if(nProxyNodes >= nodesPerPatch ||
944  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
945  break;
946  } // end for
947 
948  if(nProxyNodes >= nodesPerPatch ||
949  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
950  break;
951  } // end for
952 
953  if(nProxyNodes >= nodesPerPatch ||
954  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
955  break;
956  } // end for
957 
958  if(nProxyNodes >= nodesPerPatch ||
959  nProxyNodes >= PatchMap::MaxOneAway + PatchMap::MaxTwoAway)
960  break;
961  } // end for
962  }
963 
964 #else
 // Never compiled: this branch is the #else of "#if 1" and its contents are
 // additionally wrapped in "#if 0".
965  #if 0
966  const SimParameters* params = Node::Object()->simParameters;
967 
968  if(!smallFlag) {
969  //Add two-away proxies
970  if(patchMap->numaway_a() == 2) {
971  proxy_y = (my_y + 2) % ysize;
972  proxy_x = my_x % xsize;
973  proxy_z = my_z % zsize;
974 
975  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
976  if(proxyNodes[proxyNode] == No) {
977  proxyNodes[proxyNode] = Yes;
978  neighborNodes[nProxyNodes] = proxyNode;
979  nProxyNodes++;
980  }
981 
982  proxy_y = (my_y - 2 + ysize) % ysize;
983  proxy_x = my_x % xsize;
984  proxy_z = my_z % zsize;
985 
986  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
987  if(proxyNodes[proxyNode] == No) {
988  proxyNodes[proxyNode] = Yes;
989  neighborNodes[nProxyNodes] = proxyNode;
990  nProxyNodes++;
991  }
992  }
993 
994  //Add two away proxies
995  if(patchMap->numaway_b() == 2) {
996  proxy_y = my_y % ysize;
997  proxy_x = my_x % xsize;
998  proxy_z = (my_z + 2) % zsize;
999 
1000  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1001  if(proxyNodes[proxyNode] == No) {
1002  proxyNodes[proxyNode] = Yes;
1003  neighborNodes[nProxyNodes] = proxyNode;
1004  nProxyNodes++;
1005  }
1006 
1007  proxy_y = my_y % ysize;
1008  proxy_x = my_x % xsize;
1009  proxy_z = (my_z - 2 + zsize) % zsize;
1010 
1011  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1012  if(proxyNodes[proxyNode] == No) {
1013  proxyNodes[proxyNode] = Yes;
1014  neighborNodes[nProxyNodes] = proxyNode;
1015  nProxyNodes++;
1016  }
1017  }
1018 
1019  //Add two away proxies
1020  if(patchMap->numaway_c() == 2) {
1021  proxy_y = my_y % ysize;
1022  proxy_x = (my_x + 2) % xsize;
1023  proxy_z = my_z % zsize;
1024 
1025  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1026  if(proxyNodes[proxyNode] == No) {
1027  proxyNodes[proxyNode] = Yes;
1028  neighborNodes[nProxyNodes] = proxyNode;
1029  nProxyNodes++;
1030  }
1031 
1032  proxy_y = my_y % ysize;
1033  proxy_x = (my_x - 2 + xsize) % xsize;
1034  proxy_z = my_z % zsize;
1035 
1036  proxyNode = tmgr.coordinatesToRank(proxy_x, proxy_y, proxy_z);
1037  if(proxyNodes[proxyNode] == No) {
1038  proxyNodes[proxyNode] = Yes;
1039  neighborNodes[nProxyNodes] = proxyNode;
1040  nProxyNodes++;
1041  }
1042  }
1043  }
1044  #endif
1045 #endif
1046 
1047  // CkPrintf("Returning %d proxies\n", nProxyNodes);
1048 
1049  delete [] proxyNodes;
1050  return nProxyNodes;
1051 }
1052 
1053 #endif
static Node * Object()
Definition: Node.h:86
BlockLoad::TempStorage load
int patch1
Definition: elements.h:23
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
#define IF_NEW_NODE
represents bonded compute
NamdCentLB * AllocateNamdCentLB()
Definition: NamdCentLB.C:38
Definition: Alg7.h:13
void setNewNumPartitions(ComputeID cid, char numPartitions)
Definition: ComputeMap.h:144
BigReal ldbRelativeGrainsize
int numComputes(void)
Definition: ComputeMap.h:101
static PatchMap * Object()
Definition: PatchMap.h:27
double * cpuloads
Definition: NamdCentLB.C:24
int numElements()
Definition: Set.C:144
SimParameters * simParameters
Definition: Node.h:178
LargeIRSet proxies
Definition: elements.h:46
#define LDBSTRAT_REFINEONLY
Definition: SimParameters.h:67
Bool ldbUnloadOutputPEs
int Id
Definition: elements.h:16
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
CLBMigrateMsg * Strategy(LDStats *stats)
Definition: NamdCentLB.C:88
int isMICProcessor(int pe)
Definition: ComputeMgr.C:1881
int processor
Definition: elements.h:24
#define iout
Definition: InfoStream.h:51
int oldProcessor
Definition: elements.h:25
static double averageLoad
Definition: ProxyMgr.C:696
int numaway_b(void) const
Definition: PatchMap.h:69
void insert(InfoRecord *)
Definition: Set.C:49
int basenode(int pid) const
Definition: PatchMap.h:117
#define LDBSTRAT_DEFAULT
Definition: SimParameters.h:65
int isPmeProcessor(int)
Definition: ComputePme.C:604
BigReal ldbHomeBackgroundScaling
static Units next(Units u)
Definition: ParseOptions.C:48
#define LDBSTRAT_OLD
Definition: SimParameters.h:68
double idleTime
Definition: elements.h:40
int patch2
Definition: elements.h:23
void CreateNamdCentLB()
Definition: NamdCentLB.C:26
int numPartitions(ComputeID cid)
Definition: ComputeMap.C:135
int PatchID
Definition: NamdTypes.h:182
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void setNewNode(ComputeID cid, NodeID node)
Definition: ComputeMap.h:120
const int & LdbIdField(const LdbId &id, const int index)
int numAtoms
Definition: elements.h:32
void NAMD_die(const char *err_msg)
Definition: common.C:85
static LdbCoordinator * Object()
BigReal ldbBackgroundScaling
represents a patch
double load
Definition: elements.h:15
Definition: Set.h:19
#define LDBSTRAT_COMPREHENSIVE
Definition: SimParameters.h:66
int downstreamNeighbors(int pid, PatchID *neighbor_ids)
Definition: PatchMap.C:714
LDObjHandle handle
Definition: elements.h:26
#define simParams
Definition: Output.C:127
int numPatches(void) const
Definition: PatchMap.h:59
int node(int pid) const
Definition: PatchMap.h:114
IRSet proxiesOn
Definition: elements.h:33
static ComputeMap * Object()
Definition: ComputeMap.h:89
BigReal ldbPMEBackgroundScaling
double computeLoad
Definition: elements.h:41
int numPids(ComputeID cid)
Definition: ComputeMap.C:103
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
void unchecked_insert(InfoRecord *)
Definition: Set.C:32
int numaway_c(void) const
Definition: PatchMap.h:70
NamdCentLB(const CkLBOptions &opt)
Definition: NamdCentLB.C:51
int processor
Definition: elements.h:31
int pid(ComputeID cid, int i)
Definition: ComputeMap.C:109
int isOutputProcessor(int pe)
int numaway_a(void) const
Definition: PatchMap.h:68
double backgroundLoad
Definition: elements.h:39
bool available
Definition: elements.h:44
double BigReal
Definition: common.h:114