NAMD
TorusLB.C
/*****************************************************************************
 * $Source: /home/cvs/namd/cvsroot/namd2/src/TorusLB.C,v $
 * $Author: jim $
 * $Date: 2013/08/30 18:18:20 $
 * $Revision: 1.35 $
 *****************************************************************************/

#include "TorusLB.h"
#include "ProxyMgr.h"
#define SHRINK_INNER_BRICK 1

TorusLB::TorusLB(computeInfo *cs, patchInfo *pas, processorInfo *pes, int ncs,
  int npas, int npes) : RefineTorusLB(cs, pas, pes, ncs, npas, npes, 0)
{
  strategyName = "TorusLB";
  strategy();
  //if ( computeMax() <= origMaxLoad ) {
  //  binaryRefine();
  //  printLoads();
  //}
  // CREATE THE SPANNING TREE IN THE LOAD BALANCER
  //if(proxySendSpanning || proxyRecvSpanning)
  //  createSpanningTree();
}

TorusLB::~TorusLB() { }

void TorusLB::strategy() {
  int index;
  // compute the average load by (compute load + background load) / numPesAvailable
  computeAverage();
  // two heaps of self and pair computes
  makeTwoHeaps();

  const int beginGroup = processors[0].Id;
  const int endGroup = beginGroup + P;
#define INGROUP(PROC) ((PROC) >= beginGroup && (PROC) < endGroup)
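  // This balancer works on one contiguous group of PEs: processors[i] describes
  // global PE (beginGroup + i), and INGROUP() tests whether a global PE id
  // belongs to that group.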

  computeInfo *c;
  processorInfo *p, *minp;
  Iterator nextP;
  overLoad = 1.2;
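  // A PE is considered a viable receiver only while its projected load stays
  // below overLoad * averageLoad, i.e. at most 20% above the average.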

  for(int I=0; I<numComputes; I++) {

    c = (computeInfo *) computePairHeap->deleteMax();
    if ( ! c ) c = (computeInfo *) computeSelfHeap->deleteMax();

    if(!c) CkAbort("TorusLB: Compute Heap empty!\n");
    if(c->processor != -1) continue; // go to the next compute

    for(int j=0; j<6; j++) {
      bestPe[j] = 0;
      goodPe[j] = 0;
      badPe[j] = 0;
    }
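    // Candidate receivers collected by selectPes(), indexed 0..5 by how many of
    // this compute's home patches / proxies a PE already holds:
    //   bestPe = underloaded (and inside the patch "brick" when topology is used),
    //   goodPe = underloaded but outside the brick (topology builds only),
    //   badPe  = overloaded, kept only as a last resort.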

    // Look at pes which have the compute's patches

    // HYBRID check if processor is in local group
#define SELECT_REALPE(X) if INGROUP((X)) { \
      selectPes(&processors[(X) - beginGroup], c); \
    }
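    // SELECT_REALPE(X): consider the home PE X only if it belongs to this
    // balancer's group, since processors[] covers just that group.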

    const int realPe1 = patches[c->patch1].processor;
    SELECT_REALPE(realPe1)

    const int realPe2 = patches[c->patch2].processor;
    if ( realPe2 != realPe1 ) {
      SELECT_REALPE(realPe2)
    }

    // Try the processors which have the patches' proxies
    p = (processorInfo *)(patches[c->patch1].proxiesOn.iterator((Iterator *)&nextP));
    while(p) {                                              // patch 1
      if INGROUP(p->Id) selectPes(p, c);
      p = (processorInfo *)(patches[c->patch1].proxiesOn.next((Iterator *)&nextP));
    }

    p = (processorInfo *)(patches[c->patch2].proxiesOn.iterator((Iterator *)&nextP));
    while(p) {                                              // patch 2
      if INGROUP(p->Id) selectPes(p, c);
      p = (processorInfo *)(patches[c->patch2].proxiesOn.next((Iterator *)&nextP));
    }

    // see if we have found a processor to place the compute on
    p = 0;
    if((p = bestPe[5])
#if USE_TOPOMAP
       || (p = goodPe[5])
#endif
       || (p = bestPe[4])
#if USE_TOPOMAP
       || (p = goodPe[4])
#endif
       || (p = bestPe[3])
#if USE_TOPOMAP
       || (p = goodPe[3])
#endif
       || (p = bestPe[1])
#if USE_TOPOMAP
       || (p = goodPe[1])
#endif
       || (p = bestPe[2])
#if USE_TOPOMAP
       || (p = goodPe[2])
#endif
       || (p = bestPe[0])
#if USE_TOPOMAP
       || (p = goodPe[0])
#endif
      ) {
      assign(c, p);
      continue;
    }
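    // Preference order above: bestPe[5] (two proxies), then [4] (patch + proxy),
    // [3] (both home patches), [1] (one home patch), [2] (one proxy), [0] (no
    // locality); the goodPe[] (outside-the-brick) candidates are interleaved
    // only when topology information is compiled in.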

    // Try all pes on the nodes of the home patches
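    // (the logical SMP nodes hosting realPe1/realPe2; a PE there qualifies only
    // if it is available, inside this group, and keeps the projected load under
    // overLoad * averageLoad)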
    if ( CmiNumNodes() > 1 ) {  // else not useful
      double minLoad = overLoad * averageLoad;
      minp = 0;
      int realNode1 = CmiNodeOf(realPe1);
      int nodeSize = CmiNodeSize(realNode1);
      if ( nodeSize > 1 ) {  // else did it already
        int firstpe = CmiNodeFirst(realNode1);
        for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) {
          if INGROUP(rpe) {
            p = &processors[rpe - beginGroup];
            if ( p->available && ( p->load + c->load < minLoad ) ) {
              minLoad = p->load + c->load;
              minp = p;
            }
          }
        }
      }
      if ( realPe2 != realPe1 ) {
        int realNode2 = CmiNodeOf(realPe2);
        if ( realNode2 != realNode1 ) {  // else did it already
          nodeSize = CmiNodeSize(realNode2);
          if ( nodeSize > 1 ) {
            int firstpe = CmiNodeFirst(realNode2);
            for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) {
              if INGROUP(rpe) {
                p = &processors[rpe - beginGroup];
                if ( p->available && ( p->load + c->load < minLoad ) ) {
                  minLoad = p->load + c->load;
                  minp = p;
                }
              }
            }
          }
        }
      }
      if(minp) {
        assign(c, minp);
        continue;
      }
    }

    // Try all pes on the physical nodes of the home patches
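    // (same search over the PEs sharing the physical nodes, obtained with
    // CmiGetPesOnPhysicalNode; only worthwhile when some physical node hosts
    // more than one logical node)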
    if ( ( CmiNumPhysicalNodes() > 1 ) &&
         ( CmiNumPhysicalNodes() < CmiNumNodes() ) ) {  // else not useful
      double minLoad = overLoad * averageLoad;
      minp = 0;
      int realNode1 = CmiPhysicalNodeID(realPe1);
      int *rpelist;
      int nodeSize;
      CmiGetPesOnPhysicalNode(realNode1, &rpelist, &nodeSize);
      if ( nodeSize > 1 ) {  // else did it already
        for ( int ipe = 0; ipe < nodeSize; ++ipe ) {
          int rpe = rpelist[ipe];
          if INGROUP(rpe) {
            p = &processors[rpe - beginGroup];
            if ( p->available && ( p->load + c->load < minLoad ) ) {
              minLoad = p->load + c->load;
              minp = p;
            }
          }
        }
      }
      if ( realPe2 != realPe1 ) {
        int realNode2 = CmiPhysicalNodeID(realPe2);
        if ( realNode2 != realNode1 ) {  // else did it already
          CmiGetPesOnPhysicalNode(realNode2, &rpelist, &nodeSize);
          if ( nodeSize > 1 ) {  // else did it already
            for ( int ipe = 0; ipe < nodeSize; ++ipe ) {
              int rpe = rpelist[ipe];
              if INGROUP(rpe) {
                p = &processors[rpe - beginGroup];
                if ( p->available && ( p->load + c->load < minLoad ) ) {
                  minLoad = p->load + c->load;
                  minp = p;
                }
              }
            }
          }
        }
      }
      if(minp) {
        assign(c, minp);
        continue;
      }
    }

    int found = 0;
#if USE_TOPOMAP
    // If no processor found, go through the whole list in a topological fashion
    // first try the inner brick
    int p1, p2, pe, x1, x2, xm, xM, y1, y2, ym, yM, z1, z2, zm, zM, t1, t2;
    int dimNX, dimNY, dimNZ, dimNT;
    double minLoad;
    p1 = patches[c->patch1].processor;
    p2 = patches[c->patch2].processor;

    tmgr.rankToCoordinates(p1, x1, y1, z1, t1);
    tmgr.rankToCoordinates(p2, x2, y2, z2, t2);
    dimNX = tmgr.getDimNX();
    dimNY = tmgr.getDimNY();
    dimNZ = tmgr.getDimNZ();
    dimNT = tmgr.getDimNT();

    brickDim(x1, x2, dimNX, xm, xM);
    brickDim(y1, y2, dimNY, ym, yM);
    brickDim(z1, z2, dimNZ, zm, zM);

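    // xm..xM, ym..yM, zm..zM now bound the smallest torus-aligned "brick"
    // containing both home-patch PEs; the ranges may run past a dimension,
    // hence the %dimN* wrap when converting coordinates back to ranks below.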
    // to shrink the inner brick by some hops
#if 0
    xm=xm+SHRINK_INNER_BRICK;
    ym=ym+SHRINK_INNER_BRICK;
    zm=zm+SHRINK_INNER_BRICK;

    xM=xM-SHRINK_INNER_BRICK;
    yM=yM-SHRINK_INNER_BRICK;
    zM=zM-SHRINK_INNER_BRICK;
#endif

    // first go over the processors inside the brick and choose the least
    // loaded one that stays under the overload threshold
    p = 0; minp = 0;
    minLoad = overLoad * averageLoad;
    for(int i=xm; i<=xM; i++)
      for(int j=ym; j<=yM; j++)
        for(int k=zm; k<=zM; k++)
          for(int l=0; l<dimNT; l++)
          {
            pe = tmgr.coordinatesToRank(i%dimNX, j%dimNY, k%dimNZ, l);
            if ( ! INGROUP(pe) ) continue;
            p = &processors[pe - beginGroup];
            if(c->load + p->load < minLoad) {
              minLoad = c->load + p->load;
              minp = p;
              found = 1;
            }
          }

    // if no success, then go through the remaining torus (outer brick)
    // and pick the first underloaded one
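    // The three sweeps below together cover the torus outside the inner brick
    // (x outside the brick; then x inside but y outside; then x and y inside
    // but z outside).  Any PE that keeps the load under the threshold is
    // accepted; note the break only exits the innermost t loop.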
    minLoad = overLoad * averageLoad;
    if(found == 0) {
      p = 0; minp = 0;
      for(int i=xM+1; i<xm+dimNX; i++)
        for(int j=0; j<dimNY; j++)
          for(int k=0; k<dimNZ; k++)
            for(int l=0; l<dimNT; l++)
            {
              pe = tmgr.coordinatesToRank(i%dimNX, j%dimNY, k%dimNZ, l);
              if ( ! INGROUP(pe) ) continue;
              p = &processors[pe - beginGroup];
              if(c->load + p->load < minLoad) {
                minp = p;
                found = 1; break;
              }
            }
    }

    if(found == 0) {
      for(int j=yM+1; j<ym+dimNY; j++)
        for(int i=xm; i<=xM; i++)
          for(int k=0; k<dimNZ; k++)
            for(int l=0; l<dimNT; l++)
            {
              pe = tmgr.coordinatesToRank(i%dimNX, j%dimNY, k%dimNZ, l);
              if ( ! INGROUP(pe) ) continue;
              p = &processors[pe - beginGroup];
              if(c->load + p->load < minLoad) {
                minp = p;
                found = 1; break;
              }
            }
    }

    if(found == 0) {
      for(int k=zM+1; k<zm+dimNZ; k++)
        for(int i=xm; i<=xM; i++)
          for(int j=ym; j<=yM; j++)
            for(int l=0; l<dimNT; l++)
            {
              pe = tmgr.coordinatesToRank(i%dimNX, j%dimNY, k%dimNZ, l);
              if ( ! INGROUP(pe) ) continue;
              p = &processors[pe - beginGroup];
              if(c->load + p->load < minLoad) {
                minp = p;
                found = 1; break;
              }
            }
    }

    if(found == 1) {
      assign(c, minp);
      continue;
    }

#endif /* USE_TOPOMAP */

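    // Still nothing: score every PE of this group through selectPes() by
    // walking the pes heap, then retry the same preference cascade.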
    if(found == 0) {
      heapIterator nextp;
      processorInfo *p = (processorInfo *)(pes->iterator((heapIterator *) &nextp));
      while (p) {
        selectPes(p, c);
        p = (processorInfo *)(pes->next(&nextp));
      }
      p = 0;
      if((p = bestPe[5])
#if USE_TOPOMAP
         || (p = goodPe[5])
#endif
         || (p = bestPe[4])
#if USE_TOPOMAP
         || (p = goodPe[4])
#endif
         || (p = bestPe[3])
#if USE_TOPOMAP
         || (p = goodPe[3])
#endif
         || (p = bestPe[1])
#if USE_TOPOMAP
         || (p = goodPe[1])
#endif
         || (p = bestPe[2])
#if USE_TOPOMAP
         || (p = goodPe[2])
#endif
         || (p = bestPe[0])
#if USE_TOPOMAP
         || (p = goodPe[0])
#endif
        ) {
        assign(c, p);
        found = 1;
        continue;
      }
    }

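    // Last resort: accept an overloaded candidate recorded in badPe[] so the
    // compute is still placed somewhere.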
    if(found == 0) {
      p = 0;
      if((p = badPe[5])
         || (p = badPe[4])
         || (p = badPe[3])
         || (p = badPe[1])
         || (p = badPe[2])
         || (p = badPe[0])) {
        assign(c, p);
        found = 1;
        continue;
      }
    }

    if(found == 0) {
      CkPrintf("TorusLB: No receiver found average %f overload %f\n", averageLoad, overLoad);
      CkAbort("TorusLB: No receiver found\n");
    }

  } // end of computes for-loop

  printLoads(3);
}

void TorusLB::selectPes(processorInfo *p, computeInfo *c) {
  if (p->available == false)
    return;

  // find the position in bestPe/goodPe to place this pair
  // HP HP HP HP HP HP
  // 02 11 20 01 10 00
  //  5  4  3  2  1  0
  int numPatches, numProxies, /* badForComm, */ index;
  numAvailable(c, p, &numPatches, &numProxies, 0 /* &badForComm */);
  int numEither = numPatches + numProxies;
  index = (numEither*(numEither+1))/2 + numProxies;
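  // i.e. (home patches H, proxies P) -> index:
  //   (0,0)->0  (1,0)->1  (0,1)->2  (2,0)->3  (1,1)->4  (0,2)->5
  // Within the same total coverage, more proxies gives a higher index.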

#if USE_TOPOMAP
  int x, y, z, t;
  int p1, p2, pe, x1, x2, xm, xM, y1, y2, ym, yM, z1, z2, zm, zM, t1, t2;
  int dimNX, dimNY, dimNZ, dimNT;
  double minLoad;
  p1 = patches[c->patch1].processor;
  p2 = patches[c->patch2].processor;

  tmgr.rankToCoordinates(p1, x1, y1, z1, t1);
  tmgr.rankToCoordinates(p2, x2, y2, z2, t2);
  dimNX = tmgr.getDimNX();
  dimNY = tmgr.getDimNY();
  dimNZ = tmgr.getDimNZ();
  dimNT = tmgr.getDimNT();

  brickDim(x1, x2, dimNX, xm, xM);
  brickDim(y1, y2, dimNY, ym, yM);
  brickDim(z1, z2, dimNZ, zm, zM);
#endif

  if (p->load + c->load < overLoad * averageLoad) {
    // replace only if the new processor is underloaded
#if USE_TOPOMAP
    tmgr.rankToCoordinates(p->Id, x, y, z, t);
    int wB = withinBrick(x, y, z, xm, xM, dimNX, ym, yM, dimNY, zm, zM, dimNZ);
    if (wB) {
      // if within the brick formed by the patch processors
#endif
      // or the non-topology case
      processorInfo* &oldp = bestPe[index];
      if (!(oldp) || p->load < oldp->load )
        oldp = p;
#if USE_TOPOMAP
    } else {
      // if outside the brick formed by the patch processors
      processorInfo* &oldp = goodPe[index];
      double loadDiff = 0.0;

      if (!(oldp)) {
        // replace if there is no processor at that position
        oldp = p;
      }
      else {
        // get the load difference if the processor exists
        loadDiff = oldp->load - p->load;
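        // Replace the stored candidate if it is more than 0.4 heavier, or if it
        // is heavier at all and the new PE is fewer total hops away from the
        // two home-patch PEs.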
        if ((loadDiff > 0.4) || (loadDiff > 0.0 && (tmgr.getHopsBetweenRanks(p->Id, p1) +
            tmgr.getHopsBetweenRanks(p->Id, p2) < tmgr.getHopsBetweenRanks(oldp->Id, p1) +
            tmgr.getHopsBetweenRanks(oldp->Id, p2)))) {
          // if weights are similar, look at the hops
          oldp = p;
        }
      }
    }
#endif
  } else {
    // for the first placement, we must find a processor to place
    // the compute on, so choose a bad processor if necessary
    processorInfo* &oldp = badPe[index];
    if (!(oldp) || p->load < oldp->load )
      oldp = p;
  }
}