Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00009 #include <fftw.h>
00010 #include <rfftw.h>
00011 #else
00012 #include <sfftw.h>
00013 #include <srfftw.h>
00014 #endif
00015 #endif
00016 
00017 #include "InfoStream.h"
00018 #include "Node.h"
00019 #include "PatchMap.h"
00020 #include "PatchMap.inl"
00021 #include "AtomMap.h"
00022 #include "ComputePme.h"
00023 #include "ComputePmeMgr.decl.h"
00024 #include "PmeRealSpace.h"
00025 #include "PmeKSpace.h"
00026 #include "ComputeNonbondedUtil.h"
00027 #include "PatchMgr.h"
00028 #include "Molecule.h"
00029 #include "ReductionMgr.h"
00030 #include "ComputeMgr.h"
00031 #include "ComputeMgr.decl.h"
00032 // #define DEBUGM
00033 #define MIN_DEBUG_LEVEL 3
00034 #include "Debug.h"
00035 #include "SimParameters.h"
00036 #include "WorkDistrib.h"
00037 #include "varsizemsg.h"
00038 #include "Random.h"
00039 #include "Priorities.h"
00040 
00041 // commlib has been observed to cause hangs when starting load balancing
00042 // #define USE_COMM_LIB 1
00043 #ifdef USE_COMM_LIB
00044 #include "EachToManyMulticastStrategy.h"
00045 #endif
00046 
00047 #ifndef SQRT_PI
00048 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00049 #endif
00050 
00051 char *pencilPMEProcessors;
00052 
00053 
00054 class PmeGridMsg : public CMessage_PmeGridMsg {
00055 public:
00056 
00057   int sourceNode;
00058   int sequence;
00059   int hasData;
00060   Lattice lattice;
00061   PmeReduction *evir;
00062   int start;
00063   int len;
00064   int zlistlen;
00065   int *zlist;
00066   char *fgrid;
00067   float *qgrid;
00068 };
00069 
00070 class PmeTransMsg : public CMessage_PmeTransMsg {
00071 public:
00072 
00073   int sourceNode;
00074   int sequence;
00075   int hasData;
00076   Lattice lattice;
00077   int x_start;
00078   int nx;
00079   float *qgrid;
00080 };
00081 
00082 
00083 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00084 public:
00085 
00086   int sourceNode;
00087   int has_evir;
00088   PmeReduction *evir;
00089   int y_start;
00090   int ny;
00091   float *qgrid;
00092 
00093 };
00094 
00095 
00096 // use this idiom since messages don't have copy constructors
00097 struct PmePencilInitMsgData {
00098   PmeGrid grid;
00099   int xBlocks, yBlocks, zBlocks;
00100   CProxy_PmeXPencil xPencil;
00101   CProxy_PmeYPencil yPencil;
00102   CProxy_PmeZPencil zPencil;
00103   CProxy_ComputePmeMgr pmeProxy;
00104 };
00105 
00106 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00107 public:
00108    PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00109    PmePencilInitMsgData data;
00110 };
00111 
00112 
00113 struct LocalPmeInfo {
00114   int nx, x_start;
00115   int ny_after_transpose, y_start_after_transpose;
00116 };
00117 
00118 
00119 //Assigns gridPeMap and transPeMap to the same set of processors.
00120 void generatePmePeList(int *peMap, int numPes){
00121   // decide which pes to use by bit reversal
00122   int i;
00123   int ncpus = CkNumPes();
00124   
00125   // find next highest power of two
00126   int npow2 = 1;  int nbits = 0;
00127   while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00128   
00129   // build bit reversal sequence
00130   SortableResizeArray<int> seq(ncpus);
00131   i = 0;
00132   for ( int icpu=0; icpu<ncpus; ++icpu ) {
00133     int ri;
00134     for ( ri = ncpus; ri >= ncpus; ++i ) {
00135       ri = 0;
00136       int pow2 = 1;
00137       int rpow2 = npow2 / 2;
00138       for ( int j=0; j<nbits; ++j ) {
00139         ri += rpow2 * ( ( i / pow2 ) % 2 );
00140         pow2 *= 2;  rpow2 /= 2;
00141       }
00142     }
00143     seq[icpu] = ri;
00144   }
00145   
00146   // extract and sort PME locations
00147   for ( i=0; i<numPes; ++i ) {
00148     seq[i] = seq[ncpus - numPes + i];
00149   }
00150   seq.resize(numPes);
00151   seq.sort();
00152   
00153   for ( i=0; i<numPes; ++i ) 
00154       peMap[i] = seq[i];
00155 
00156   //peMap[0] = 0;
00157 }
00158 
00159 //Assigns gridPeMap and transPeMap to different set of processors.
00160 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00161   // decide which pes to use by bit reversal
00162   int i;
00163   int ncpus = CkNumPes();
00164   
00165   // find next highest power of two
00166   int npow2 = 1;  int nbits = 0;
00167   while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00168   
00169   // build bit reversal sequence
00170   SortableResizeArray<int> seq(ncpus);
00171   SortableResizeArray<int> seq2(ncpus);
00172   i = 0;
00173   for ( int icpu=0; icpu<ncpus; ++icpu ) {
00174     int ri;
00175     for ( ri = ncpus; ri >= ncpus; ++i ) {
00176       ri = 0;
00177       int pow2 = 1;
00178       int rpow2 = npow2 / 2;
00179       for ( int j=0; j<nbits; ++j ) {
00180         ri += rpow2 * ( ( i / pow2 ) % 2 );
00181         pow2 *= 2;  rpow2 /= 2;
00182       }
00183     }
00184     seq[icpu] = ri;
00185     seq2[icpu] = ri;
00186   }
00187   
00188   // extract and sort PME locations
00189   for ( i=0; i<numGridPes; ++i ) {
00190     seq[i] = seq[ncpus - numGridPes + i];
00191   }
00192   seq.resize(numGridPes);
00193   seq.sort();
00194   int firstTransPe = ncpus - numGridPes - numTransPes;
00195   if ( firstTransPe < 0 ) {
00196     firstTransPe = 0;
00197     // 0 should be first in list, skip if possible
00198     if ( ncpus > numTransPes ) firstTransPe = 1;
00199   }
00200   for ( i=0; i<numTransPes; ++i ) {
00201     seq2[i] = seq2[firstTransPe + i];
00202   }
00203   seq2.resize(numTransPes);
00204   seq2.sort();
00205   
00206   for ( i=0; i<numGridPes; ++i ) 
00207     gridPeMap[i] = seq[i];
00208 
00209   for ( i=0; i<numTransPes; ++i ) 
00210     transPeMap[i] = seq2[i];
00211 }
00212 
00213 #if USE_TOPOMAP 
00214 //Topology aware PME allocation
00215 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00216                              int nbpes=0);
00217 #endif
00218 
00219 class ComputePmeMgr : public BOCclass {
00220 public:
00221   friend class ComputePme;
00222   ComputePmeMgr();
00223   ~ComputePmeMgr();
00224 
00225   void initialize(CkQdMsg*);
00226   void initialize_pencils(CkQdMsg*);
00227   void activate_pencils(CkQdMsg*);
00228   void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00229 
00230   void sendGrid(void);
00231   void recvGrid(PmeGridMsg *);
00232   void gridCalc1(void);
00233   void sendTransBarrier(void);
00234   void sendTrans(void);
00235   void recvTrans(PmeTransMsg *);
00236   void gridCalc2(void);
00237   void sendUntrans(void);
00238   void recvUntrans(PmeUntransMsg *);
00239   void gridCalc3(void);
00240   void sendUngrid(void);
00241   void recvUngrid(PmeGridMsg *);
00242   void ungridCalc(void);
00243 
00244   void setCompute(ComputePme *c) { pmeCompute = c; c->setMgr(this); }
00245 
00246   //Tells if the current processor is a PME processor or not. Called by NamdCentralLB
00247   int isPmeProcessor(int p);  
00248 
00249 #ifdef NAMD_FFTW
00250   static CmiNodeLock fftw_plan_lock;
00251 #endif
00252 
00253 private:
00254   CProxy_ComputePmeMgr pmeProxy;
00255   CProxy_ComputePmeMgr pmeProxyDir;
00256   ComputePme *pmeCompute;
00257   PmeGrid myGrid;
00258   Lattice lattice;
00259   PmeKSpace *myKSpace;
00260   float *qgrid;
00261   float *kgrid;
00262 
00263 #ifdef NAMD_FFTW
00264   fftw_plan forward_plan_x, backward_plan_x;
00265   rfftwnd_plan forward_plan_yz, backward_plan_yz;
00266   fftw_complex *work;
00267 #else
00268   float *work;
00269 #endif
00270 
00271   int fepOn, thermInt, lesOn, lesFactor, pairOn, selfOn, numGrids;
00272   int decouple;
00273   BigReal fepElecLambdaStart;
00274   BigReal elecLambdaUp;
00275   BigReal elecLambdaDown;
00276 
00277   
00278   LocalPmeInfo *localInfo;
00279   int qgrid_size;
00280   int qgrid_start;
00281   int qgrid_len;
00282   int fgrid_start;
00283   int fgrid_len;
00284 
00285   int numSources;
00286   int numGridPes;
00287   int numTransPes;
00288   int numDestRecipPes;
00289   int myGridPe;
00290   int myTransPe;
00291   int *gridPeMap;
00292   int *transPeMap;
00293   int *recipPeDest;
00294   int *gridPeOrder;
00295   int *transPeOrder;
00296   char *isPmeFlag;
00297   int grid_count;
00298   int trans_count;
00299   int untrans_count;
00300   int ungrid_count;
00301   PmeGridMsg **gridmsg_reuse;
00302   PmeReduction recip_evir[PME_MAX_EVALS];
00303   PmeReduction recip_evir2[PME_MAX_EVALS];
00304 
00305   int sequence;  // used for priorities
00306   int useBarrier;
00307   int sendTransBarrier_received;
00308 
00309   int usePencils;
00310   int xBlocks, yBlocks, zBlocks;
00311   CProxy_PmeXPencil xPencil;
00312   CProxy_PmeYPencil yPencil;
00313   CProxy_PmeZPencil zPencil;
00314   char *pencilActive;
00315   int numPencilsActive;
00316 };
00317 
00318 #ifdef NAMD_FFTW
00319   CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00320 #endif
00321 
00322 int isPmeProcessor(int p){ 
00323   return CProxy_ComputePmeMgr::ckLocalBranch(CkpvAccess(BOCclass_group).computePmeMgr)->isPmeProcessor(p);
00324 }
00325 
00326 int ComputePmeMgr::isPmeProcessor(int p){ 
00327   return ( usePencils ? pencilPMEProcessors[p] : isPmeFlag[p] );
00328 }
00329 
00330 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
00331                                  pmeProxyDir(thisgroup), pmeCompute(0) {
00332 
00333   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00334 
00335 #ifdef USE_COMM_LIB
00336   ComlibDelegateProxy(&pmeProxy);
00337 #endif
00338 
00339 #ifdef NAMD_FFTW
00340   if ( CmiMyRank() == 0 ) {
00341     fftw_plan_lock = CmiCreateLock();
00342   }
00343 #endif
00344 
00345   myKSpace = 0;
00346   localInfo = new LocalPmeInfo[CkNumPes()];
00347   gridPeMap = new int[CkNumPes()];
00348   transPeMap = new int[CkNumPes()];
00349   recipPeDest = new int[CkNumPes()];
00350   gridPeOrder = new int[CkNumPes()];
00351   transPeOrder = new int[CkNumPes()];
00352   isPmeFlag = new char[CkNumPes()];
00353   kgrid = 0;
00354   work = 0;
00355   grid_count = 0;
00356   trans_count = 0;
00357   untrans_count = 0;
00358   ungrid_count = 0;
00359   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00360   useBarrier = 0;
00361   sendTransBarrier_received = 0;
00362   usePencils = 0;
00363 }
00364 
00365 
00366 void ComputePmeMgr::recvArrays(
00367         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00368   xPencil = x;  yPencil = y;  zPencil = z;
00369 }
00370 
00371 
00372 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00373   delete msg;
00374 
00375   SimParameters *simParams = Node::Object()->simParameters;
00376   PatchMap *patchMap = PatchMap::Object();
00377 
00378   fepOn = simParams->fepOn;
00379   thermInt = simParams->thermInt;
00380   decouple = (fepOn || thermInt) && (simParams->decouple);
00381   fepElecLambdaStart = (fepOn || thermInt) ? simParams->fepElecLambdaStart : 0;
00382   if (fepOn || thermInt) {
00383     numGrids = 2;
00384     if (decouple) numGrids += 2;
00385     if (fepElecLambdaStart || thermInt) numGrids ++;
00386   }
00387   else numGrids = 1;
00388   lesOn = simParams->lesOn;
00389   useBarrier = simParams->PMEBarrier;
00390   if ( lesOn ) {
00391     lesFactor = simParams->lesFactor;
00392     numGrids = lesFactor;
00393   }
00394   selfOn = 0;
00395   pairOn = simParams->pairInteractionOn;
00396   if ( pairOn ) {
00397     selfOn = simParams->pairInteractionSelf;
00398     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00399     numGrids = selfOn ? 1 : 3;
00400   }
00401 
00402   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00403   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00404   else {
00405     int dimx = simParams->PMEGridSizeX;
00406     int dimy = simParams->PMEGridSizeY;
00407     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00408     if ( maxslabs > CkNumPes() ) maxslabs = CkNumPes();
00409     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00410                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00411     if ( maxpencils > CkNumPes() ) maxpencils = CkNumPes();
00412     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00413     else usePencils = 0;
00414   }
00415 
00416   if ( usePencils ) {
00417     if ( simParams->PMEPencils > 1 ) {
00418       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00419     } else {
00420       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00421                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00422       if ( nb2 > CkNumPes() ) nb2 = CkNumPes();
00423       int nb = (int) sqrt((float)nb2);
00424       xBlocks = zBlocks = nb;
00425       yBlocks = nb2 / nb;
00426     }
00427 
00428     int dimx = simParams->PMEGridSizeX;
00429     int bx = 1 + ( dimx - 1 ) / xBlocks;
00430     xBlocks = 1 + ( dimx - 1 ) / bx;
00431 
00432     int dimy = simParams->PMEGridSizeY;
00433     int by = 1 + ( dimy - 1 ) / yBlocks;
00434     yBlocks = 1 + ( dimy - 1 ) / by;
00435 
00436     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00437     int bz = 1 + ( dimz - 1 ) / zBlocks;
00438     zBlocks = 1 + ( dimz - 1 ) / bz;
00439 
00440     if ( ! CkMyPe() ) {
00441       iout << iINFO << "PME using " << xBlocks << " x " <<
00442         yBlocks << " x " << zBlocks <<
00443         " pencil grid for FFT and reciprocal sum.\n" << endi;
00444     }
00445   } else { // usePencils
00446 
00447   {  // decide how many pes to use for reciprocal sum
00448 
00449     // rules based on work available
00450     int minslices = simParams->PMEMinSlices;
00451     int dimx = simParams->PMEGridSizeX;
00452     int nrpx = ( dimx + minslices - 1 ) / minslices;
00453     int dimy = simParams->PMEGridSizeY;
00454     int nrpy = ( dimy + minslices - 1 ) / minslices;
00455 
00456     // rules based on processors available
00457     int nrpp = CkNumPes();
00458     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00459     if ( nrpp < nrpx ) nrpx = nrpp;
00460     if ( nrpp < nrpy ) nrpy = nrpp;
00461 
00462     // user override
00463     int nrps = simParams->PMEProcessors;
00464     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00465     if ( nrps > 0 ) nrpx = nrps;
00466     if ( nrps > 0 ) nrpy = nrps;
00467 
00468     // make sure there aren't any totally empty processors
00469     int bx = ( dimx + nrpx - 1 ) / nrpx;
00470     nrpx = ( dimx + bx - 1 ) / bx;
00471     int by = ( dimy + nrpy - 1 ) / nrpy;
00472     nrpy = ( dimy + by - 1 ) / by;
00473     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00474       NAMD_bug("Error in selecting number of PME processors.");
00475     if ( by != ( dimy + nrpy - 1 ) / nrpy )
00476       NAMD_bug("Error in selecting number of PME processors.");
00477 
00478     numGridPes = nrpx;
00479     numTransPes = nrpy;
00480   }
00481   if ( ! CkMyPe() ) {
00482     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
00483       " processors for FFT and reciprocal sum.\n" << endi;
00484   }
00485   { // generate random orderings for grid and trans messages
00486     int i;
00487     for ( i = 0; i < numGridPes; ++i ) {
00488       gridPeOrder[i] = i;
00489     }
00490     for ( i = 0; i < numTransPes; ++i ) {
00491       transPeOrder[i] = i;
00492     }
00493     Random rand(CkMyPe());
00494     rand.reorder(gridPeOrder,numGridPes);
00495     rand.reorder(transPeOrder,numTransPes);
00496   }
00497 
00498   int sum_npes = numTransPes + numGridPes;
00499   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
00500 
00501 #if USE_TOPOMAP
00502   PatchMap * pmap = PatchMap::Object();
00503   
00504   int patch_pes = pmap->numNodesWithPatches();
00505   TopoManager tmgr;
00506   if(tmgr.hasMultipleProcsPerNode())
00507     patch_pes *= 2;
00508  
00509   bool done = false;
00510 #ifndef USE_COMM_LIB  
00511   if(CkNumPes() > 2*sum_npes + patch_pes) {    
00512     done = generateBGLORBPmePeList(transPeMap, numTransPes);
00513     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
00514   }
00515   else 
00516 #endif
00517     if(CkNumPes() > 2 *max_npes + patch_pes) {
00518       done = generateBGLORBPmePeList(transPeMap, max_npes);
00519       gridPeMap = transPeMap;
00520     }
00521 
00522   if (!done)
00523 #endif
00524     {
00525       //generatePmePeList(transPeMap, max_npes);
00526       //gridPeMap = transPeMap;
00527       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
00528     }
00529   
00530 #ifdef USE_COMM_LIB  
00531   if(CkMyPe() == 0) {
00532       ComlibInstanceHandle cinst1 = CkCreateComlibInstance();
00533       EachToManyMulticastStrategy *strat = new 
00534           EachToManyMulticastStrategy(USE_DIRECT, numGridPes, 
00535                                       gridPeMap, numTransPes, transPeMap);
00536       cinst1.setStrategy(strat);
00537       
00538       ComlibInstanceHandle cinst2 = CkCreateComlibInstance();
00539       strat = new EachToManyMulticastStrategy(USE_DIRECT, numTransPes, transPeMap
00540                                               , numGridPes, gridPeMap);
00541       cinst2.setStrategy(strat);
00542       ComlibDoneCreating();
00543   }
00544 #endif
00545 
00546   myGridPe = -1;
00547   int i = 0;
00548   for ( i=0; i<CkNumPes(); ++i )
00549     isPmeFlag[i] = 0;
00550   for ( i=0; i<numGridPes; ++i ) {
00551     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
00552     isPmeFlag[gridPeMap[i]] |= 1;
00553   }
00554   myTransPe = -1;
00555   for ( i=0; i<numTransPes; ++i ) {
00556     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
00557     isPmeFlag[transPeMap[i]] |= 2;
00558   }
00559   
00560   if ( ! CkMyPe() ) {
00561     iout << iINFO << "PME GRID LOCATIONS:";
00562     int i;
00563     for ( i=0; i<numGridPes && i<10; ++i ) {
00564       iout << " " << gridPeMap[i];
00565     }
00566     if ( i < numGridPes ) iout << " ...";
00567     iout << "\n" << endi;
00568     iout << iINFO << "PME TRANS LOCATIONS:";
00569     for ( i=0; i<numTransPes && i<10; ++i ) {
00570       iout << " " << transPeMap[i];
00571     }
00572     if ( i < numTransPes ) iout << " ...";
00573     iout << "\n" << endi;
00574   }
00575 
00576   } // ! usePencils
00577 
00578   myGrid.K1 = simParams->PMEGridSizeX;
00579   myGrid.K2 = simParams->PMEGridSizeY;
00580   myGrid.K3 = simParams->PMEGridSizeZ;
00581   myGrid.order = simParams->PMEInterpOrder;
00582   myGrid.dim2 = myGrid.K2;
00583   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
00584 
00585   if ( ! usePencils ) {
00586     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
00587     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
00588     myGrid.block3 = myGrid.dim3 / 2;  // complex
00589   }
00590 
00591   if ( usePencils ) {
00592     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
00593     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
00594     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
00595 
00596     if ( CkMyPe() == 0 ) {
00597       int basepe = 0;  int npe = CkNumPes();
00598       if ( npe > xBlocks*yBlocks &&
00599                 npe > xBlocks*zBlocks &&
00600                 npe > yBlocks*zBlocks ) {
00601         // avoid node 0
00602         ++basepe;
00603         --npe;
00604       }
00605 
00606       zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
00607       yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
00608       xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
00609       
00610       // decide which pes to use by bit reversal and patch use
00611       int i;
00612       int ncpus = CkNumPes();
00613   
00614       // find next highest power of two
00615       int npow2 = 1;  int nbits = 0;
00616       while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00617   
00618       // build bit reversal sequence
00619       SortableResizeArray<int> patches, nopatches, pmeprocs;
00620       PatchMap *pmap = PatchMap::Object();
00621       i = 0;
00622       for ( int icpu=0; icpu<ncpus; ++icpu ) {
00623         int ri;
00624         for ( ri = ncpus; ri >= ncpus; ++i ) {
00625           ri = 0;
00626           int pow2 = 1;
00627           int rpow2 = npow2 / 2;
00628           for ( int j=0; j<nbits; ++j ) {
00629             ri += rpow2 * ( ( i / pow2 ) % 2 );
00630             pow2 *= 2;  rpow2 /= 2;
00631           }
00632         }
00633         // seq[icpu] = ri;
00634         if ( ri ) { // keep 0 for special case
00635           if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
00636           else nopatches.add(ri);
00637         }
00638       }
00639 
00640       // only use zero if it eliminates overloading or has patches
00641       int useZero = 0;
00642       int npens = xBlocks*yBlocks;
00643       if ( npens % ncpus == 0 ) useZero = 1;
00644       if ( npens == nopatches.size() + 1 ) useZero = 1;
00645       npens += xBlocks*zBlocks;
00646       if ( npens % ncpus == 0 ) useZero = 1;
00647       if ( npens == nopatches.size() + 1 ) useZero = 1;
00648       npens += yBlocks*zBlocks;
00649       if ( npens % ncpus == 0 ) useZero = 1;
00650       if ( npens == nopatches.size() + 1 ) useZero = 1;
00651 
00652       // add nopatches then patches in reversed order
00653       for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
00654       if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00655       for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
00656       if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00657   
00658       int pe = 0;
00659       int npes = pmeprocs.size();
00660       SortableResizeArray<int> zprocs(xBlocks*yBlocks);
00661       for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
00662       zprocs.sort();
00663       SortableResizeArray<int> yprocs(xBlocks*zBlocks);
00664       for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
00665       yprocs.sort();
00666       SortableResizeArray<int> xprocs(yBlocks*zBlocks);
00667       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
00668       xprocs.sort();
00669 
00670       pencilPMEProcessors = new char [CkNumPes()];
00671       memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00672 
00673       int x,y,z;
00674 
00675       iout << iINFO << "PME Z PENCIL LOCATIONS:";
00676       for ( i=0; i<zprocs.size() && i<10; ++i ) {
00677         iout << " " << zprocs[i];
00678       }
00679       if ( i < zprocs.size() ) iout << " ...";
00680       iout << "\n" << endi;
00681 
00682       for (pe=0, x = 0; x < xBlocks; ++x)
00683         for (y = 0; y < yBlocks; ++y, ++pe ) {
00684           zPencil(x,y,0).insert(zprocs[pe]);
00685           pencilPMEProcessors[zprocs[pe]] = 1;
00686         }
00687       zPencil.doneInserting();
00688       
00689       iout << iINFO << "PME Y PENCIL LOCATIONS:";
00690       for ( i=0; i<yprocs.size() && i<10; ++i ) {
00691         iout << " " << yprocs[i];
00692       }
00693       if ( i < yprocs.size() ) iout << " ...";
00694       iout << "\n" << endi;
00695 
00696       for (pe=0, z = 0; z < zBlocks; ++z )
00697         for (x = 0; x < xBlocks; ++x, ++pe ) {
00698           yPencil(x,0,z).insert(yprocs[pe]);
00699           pencilPMEProcessors[yprocs[pe]] = 1;
00700         }
00701       yPencil.doneInserting();
00702       
00703       iout << iINFO << "PME X PENCIL LOCATIONS:";
00704       for ( i=0; i<xprocs.size() && i<10; ++i ) {
00705         iout << " " << xprocs[i];
00706       }
00707       if ( i < xprocs.size() ) iout << " ...";
00708       iout << "\n" << endi;
00709 
00710       for (pe=0, y = 0; y < yBlocks; ++y )      
00711         for (z = 0; z < zBlocks; ++z, ++pe ) {
00712           xPencil(0,y,z).insert(xprocs[pe]);
00713           pencilPMEProcessors[xprocs[pe]] = 1;
00714         }
00715       xPencil.doneInserting();      
00716         
00717       pmeProxy.recvArrays(xPencil,yPencil,zPencil);
00718       PmePencilInitMsgData msgdata;
00719       msgdata.grid = myGrid;
00720       msgdata.xBlocks = xBlocks;
00721       msgdata.yBlocks = yBlocks;
00722       msgdata.zBlocks = zBlocks;
00723       msgdata.xPencil = xPencil;
00724       msgdata.yPencil = yPencil;
00725       msgdata.zPencil = zPencil;
00726       msgdata.pmeProxy = pmeProxyDir;
00727       xPencil.init(new PmePencilInitMsg(msgdata));
00728       yPencil.init(new PmePencilInitMsg(msgdata));
00729       zPencil.init(new PmePencilInitMsg(msgdata));
00730     }
00731     return;  // continue in initialize_pencils() at next startup stage
00732   }
00733 
00734 
00735   int pe;
00736   int nx = 0;
00737   for ( pe = 0; pe < numGridPes; ++pe ) {
00738     localInfo[pe].x_start = nx;
00739     nx += myGrid.block1;
00740     if ( nx > myGrid.K1 ) nx = myGrid.K1;
00741     localInfo[pe].nx = nx - localInfo[pe].x_start;
00742   }
00743   int ny = 0;
00744   for ( pe = 0; pe < numTransPes; ++pe ) {
00745     localInfo[pe].y_start_after_transpose = ny;
00746     ny += myGrid.block2;
00747     if ( ny > myGrid.K2 ) ny = myGrid.K2;
00748     localInfo[pe].ny_after_transpose =
00749                         ny - localInfo[pe].y_start_after_transpose;
00750   }
00751 
00752   {  // decide how many pes this node exchanges charges with
00753 
00754   PatchMap *patchMap = PatchMap::Object();
00755   Lattice lattice = simParams->lattice;
00756   BigReal sysdima = lattice.a_r().unit() * lattice.a();
00757   BigReal cutoff = simParams->cutoff;
00758   BigReal patchdim = simParams->patchDimension;
00759   int numPatches = patchMap->numPatches();
00760   int numNodes = CkNumPes();
00761   int *source_flags = new int[numNodes];
00762   int node;
00763   for ( node=0; node<numNodes; ++node ) {
00764     source_flags[node] = 0;
00765     recipPeDest[node] = 0;
00766   }
00767 
00768   // // make sure that we don't get ahead of ourselves on this node
00769   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
00770   //   source_flags[CkMyPe()] = 1;
00771   //   recipPeDest[myRecipPe] = 1;
00772   // }
00773 
00774   for ( int pid=0; pid < numPatches; ++pid ) {
00775     int pnode = patchMap->node(pid);
00776     BigReal minx = patchMap->min_a(pid);
00777     BigReal maxx = patchMap->max_a(pid);
00778     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
00779     // min1 (max1) is smallest (largest) grid line for this patch
00780     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
00781     int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
00782     for ( int i=min1; i<=max1; ++i ) {
00783       int ix = i;
00784       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
00785       while ( ix < 0 ) ix += myGrid.K1;
00786       // set source_flags[pnode] if this patch sends to our node
00787       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
00788            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
00789         source_flags[pnode] = 1;
00790       }
00791       // set dest_flags[] for node that our patch sends to
00792       if ( pnode == CkMyPe() ) {
00793         recipPeDest[ix / myGrid.block1] = 1;
00794       }
00795     }
00796   }
00797 
00798   numSources = 0;
00799   numDestRecipPes = 0;
00800   for ( node=0; node<numNodes; ++node ) {
00801     if ( source_flags[node] ) ++numSources;
00802     if ( recipPeDest[node] ) ++numDestRecipPes;
00803   }
00804 
00805 #if 0
00806   if ( numSources ) {
00807     iout << iINFO << "PME " << CkMyPe() << " sources:";
00808     for ( node=0; node<numNodes; ++node ) {
00809       if ( source_flags[node] ) iout << " " << node;
00810     }
00811     iout << "\n" << endi;
00812   }
00813 #endif
00814 
00815   delete [] source_flags;
00816 
00817   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
00818   //           CkMyPe(), numSources, numDestRecipPes);
00819 
00820   }  // decide how many pes this node exchanges charges with (end)
00821 
00822   ungrid_count = numDestRecipPes;
00823 
00824   sendTransBarrier_received = 0;
00825 
00826   if ( myGridPe < 0 && myTransPe < 0 ) return;
00827   // the following only for nodes doing reciprocal sum
00828 
00829   if ( myTransPe >= 0 ) {
00830       int k2_start = localInfo[myTransPe].y_start_after_transpose;
00831       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
00832       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
00833   }
00834 
00835   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
00836   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
00837   if ( local_size < local_size_2 ) local_size = local_size_2;
00838   qgrid = new float[local_size*numGrids];
00839   if ( numGridPes > 1 || numTransPes > 1 ) {
00840     kgrid = new float[local_size*numGrids];
00841   } else {
00842     kgrid = qgrid;
00843   }
00844   qgrid_size = local_size;
00845 
00846   if ( myGridPe >= 0 ) {
00847   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
00848   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
00849   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
00850   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
00851   }
00852 
00853   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
00854 
00855 #ifdef NAMD_FFTW
00856   CmiLock(fftw_plan_lock);
00857 
00858   work = new fftw_complex[n[0]];
00859 
00860   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
00861   if ( myGridPe >= 0 ) {
00862   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
00863         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00864         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
00865   }
00866   if ( ! CkMyPe() ) iout << " 2..." << endi;
00867   if ( myTransPe >= 0 ) {
00868   forward_plan_x = fftw_create_plan_specific(n[0], FFTW_REAL_TO_COMPLEX,
00869         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00870         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
00871         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
00872   }
00873   if ( ! CkMyPe() ) iout << " 3..." << endi;
00874   if ( myTransPe >= 0 ) {
00875   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_COMPLEX_TO_REAL,
00876         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00877         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
00878         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
00879   }
00880   if ( ! CkMyPe() ) iout << " 4..." << endi;
00881   if ( myGridPe >= 0 ) {
00882   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
00883         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00884         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
00885   }
00886   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
00887 
00888   CmiUnlock(fftw_plan_lock);
00889 #else
00890   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00891 #endif
00892 
00893   if ( myGridPe >= 0 && numSources == 0 )
00894                 NAMD_bug("PME grid elements exist without sources.");
00895   grid_count = numSources;
00896   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
00897   trans_count = numGridPes;
00898 }
00899 
00900 
00901 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
00902   delete msg;
00903   if ( ! usePencils ) return;
00904 
00905   SimParameters *simParams = Node::Object()->simParameters;
00906 
00907   PatchMap *patchMap = PatchMap::Object();
00908   Lattice lattice = simParams->lattice;
00909   BigReal sysdima = lattice.a_r().unit() * lattice.a();
00910   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
00911   BigReal cutoff = simParams->cutoff;
00912   BigReal patchdim = simParams->patchDimension;
00913   int numPatches = patchMap->numPatches();
00914 
00915   pencilActive = new char[xBlocks*yBlocks];
00916   for ( int i=0; i<xBlocks; ++i ) {
00917     for ( int j=0; j<yBlocks; ++j ) {
00918       pencilActive[i*yBlocks+j] = 0;
00919     }
00920   }
00921 
00922   for ( int pid=0; pid < numPatches; ++pid ) {
00923     int pnode = patchMap->node(pid);
00924     if ( pnode != CkMyPe() ) continue;
00925 
00926     BigReal minx = patchMap->min_a(pid);
00927     BigReal maxx = patchMap->max_a(pid);
00928     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
00929     // min1 (max1) is smallest (largest) grid line for this patch
00930     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
00931     int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
00932 
00933     BigReal miny = patchMap->min_b(pid);
00934     BigReal maxy = patchMap->max_b(pid);
00935     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
00936     // min2 (max2) is smallest (largest) grid line for this patch
00937     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) - myGrid.order + 1;
00938     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb)));
00939 
00940     for ( int i=min1; i<=max1; ++i ) {
00941       int ix = i;
00942       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
00943       while ( ix < 0 ) ix += myGrid.K1;
00944       for ( int j=min2; j<=max2; ++j ) {
00945         int jy = j;
00946         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
00947         while ( jy < 0 ) jy += myGrid.K2;
00948         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
00949       }
00950     }
00951   }
00952 
00953   numPencilsActive = 0;
00954   for ( int i=0; i<xBlocks; ++i ) {
00955     for ( int j=0; j<yBlocks; ++j ) {
00956       if ( pencilActive[i*yBlocks+j] ) {
00957         ++numPencilsActive;
00958         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
00959       }
00960     }
00961   }
00962   //if ( numPencilsActive ) {
00963   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
00964   //}
00965 
00966   ungrid_count = numPencilsActive;
00967 }
00968 
00969 
00970 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
00971   if ( ! usePencils ) return;
00972   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
00973 }
00974 
00975 
00976 ComputePmeMgr::~ComputePmeMgr() {
00977 
00978 #ifdef NAMD_FFTW
00979   if ( CmiMyRank() == 0 ) {
00980     CmiDestroyLock(fftw_plan_lock);
00981   }
00982 #endif
00983 
00984   delete myKSpace;
00985   delete [] localInfo;
00986   delete [] gridPeMap;
00987   delete [] transPeMap;
00988   delete [] recipPeDest;
00989   delete [] gridPeOrder;
00990   delete [] transPeOrder;
00991   delete [] isPmeFlag;
00992   delete [] qgrid;
00993   if ( kgrid != qgrid ) delete [] kgrid;
00994   delete [] work;
00995   delete [] gridmsg_reuse;
00996 }
00997 
00998 void ComputePmeMgr::sendGrid(void) {
00999   pmeCompute->sendData(numGridPes,gridPeOrder,recipPeDest,gridPeMap);
01000 }
01001 
01002 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01003   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01004   if ( grid_count == 0 ) {
01005     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01006   }
01007   if ( grid_count == numSources ) {
01008     lattice = msg->lattice;
01009     sequence = msg->sequence;
01010   }
01011 
01012   int zdim = myGrid.dim3;
01013   int zlistlen = msg->zlistlen;
01014   int *zlist = msg->zlist;
01015   float *qmsg = msg->qgrid;
01016   for ( int g=0; g<numGrids; ++g ) {
01017     char *f = msg->fgrid + fgrid_len * g;
01018     float *q = qgrid + qgrid_size * g;
01019     for ( int i=0; i<fgrid_len; ++i ) {
01020       if ( f[i] ) {
01021         for ( int k=0; k<zlistlen; ++k ) {
01022           q[zlist[k]] += *(qmsg++);
01023         }
01024       }
01025       q += zdim;
01026     }
01027   }
01028 
01029   gridmsg_reuse[numSources-grid_count] = msg;
01030   --grid_count;
01031 
01032   if ( grid_count == 0 ) {
01033 #if CHARM_VERSION > 050402
01034     pmeProxyDir[CkMyPe()].gridCalc1();
01035     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01036 #else
01037     pmeProxyDir.gridCalc1(CkMyPe());
01038     if ( useBarrier ) pmeProxyDir.sendTransBarrier(0);
01039 #endif
01040   }
01041 }
01042 
01043 void ComputePmeMgr::gridCalc1(void) {
01044   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01045 
01046 #ifdef NAMD_FFTW
01047   for ( int g=0; g<numGrids; ++g ) {
01048     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01049         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01050   }
01051 #endif
01052 
01053 #if CHARM_VERSION > 050402
01054   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01055 #else
01056   if ( ! useBarrier ) pmeProxyDir.sendTrans(CkMyPe());
01057 #endif
01058 }
01059 
01060 void ComputePmeMgr::sendTransBarrier(void) {
01061   sendTransBarrier_received += 1;
01062   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01063   if ( sendTransBarrier_received < numGridPes ) return;
01064   sendTransBarrier_received = 0;
01065   for ( int i=0; i<numGridPes; ++i ) {
01066 #if CHARM_VERSION > 050402
01067     pmeProxyDir[gridPeMap[i]].sendTrans();
01068 #else
01069     pmeProxyDir.sendTrans(gridPeMap[i]);
01070 #endif
01071   }
01072 }
01073 
01074 void ComputePmeMgr::sendTrans(void) {
01075   // CkPrintf("sendTrans on %d\n",myTransPe);
01076 
01077   // send data for transpose
01078   int zdim = myGrid.dim3;
01079   int nx = localInfo[myGridPe].nx;
01080   int x_start = localInfo[myGridPe].x_start;
01081   int slicelen = myGrid.K2 * zdim;
01082 
01083 #ifdef USE_COMM_LIB
01084   ComlibInstanceHandle cinst1 = CkGetComlibInstance(0);
01085   cinst1.beginIteration();
01086 #endif
01087 
01088 #if CMK_BLUEGENEL
01089   CmiNetworkProgressAfter (0);
01090 #endif
01091 
01092   for (int j=0; j<numTransPes; j++) {
01093     int pe = transPeOrder[j];  // different order on each node
01094     LocalPmeInfo &li = localInfo[pe];
01095     int cpylen = li.ny_after_transpose * zdim;
01096     PmeTransMsg *newmsg = new (nx * cpylen * numGrids,
01097                                 PRIORITY_SIZE) PmeTransMsg;
01098     newmsg->sourceNode = myGridPe;
01099     newmsg->lattice = lattice;
01100     newmsg->x_start = x_start;
01101     newmsg->nx = nx;
01102     for ( int g=0; g<numGrids; ++g ) {
01103       float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01104       float *qmsg = newmsg->qgrid + nx * cpylen * g;
01105       for ( int x = 0; x < nx; ++x ) {
01106         CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01107         q += slicelen;
01108         qmsg += cpylen;
01109       }
01110     }
01111     newmsg->sequence = sequence;
01112     SET_PRIORITY(newmsg,sequence,PME_TRANS_PRIORITY)
01113 #if CHARM_VERSION > 050402
01114     pmeProxy[transPeMap[pe]].recvTrans(newmsg);
01115 #else
01116     pmeProxy.recvTrans(newmsg,transPeMap[pe]);
01117 #endif
01118   }
01119  
01120   untrans_count = numTransPes;
01121 
01122 #ifdef USE_COMM_LIB
01123   cinst1.endIteration();
01124 #endif  
01125 }
01126 
01127 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
01128   // CkPrintf("recvTrans on Pe(%d)\n",CkMyPe());
01129   if ( trans_count == numGridPes ) {
01130     lattice = msg->lattice;
01131     sequence = msg->sequence;
01132   }
01133 
01134   int zdim = myGrid.dim3;
01135   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01136   int ny = localInfo[myTransPe].ny_after_transpose;
01137   int x_start = msg->x_start;
01138   int nx = msg->nx;
01139   for ( int g=0; g<numGrids; ++g ) {
01140     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
01141         (void*)(msg->qgrid + nx*ny*zdim*g), nx*ny*zdim*sizeof(float));
01142   }
01143 
01144   delete msg;
01145   --trans_count;
01146 
01147   if ( trans_count == 0 ) {
01148 #if CHARM_VERSION > 050402
01149     pmeProxyDir[CkMyPe()].gridCalc2();
01150 #else
01151     pmeProxyDir.gridCalc2(CkMyPe());
01152 #endif
01153   }
01154 }
01155 
01156 void ComputePmeMgr::gridCalc2(void) {
01157   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
01158 
01159 #if CMK_BLUEGENEL
01160   CmiNetworkProgressAfter (0);
01161 #endif
01162 
01163   int zdim = myGrid.dim3;
01164   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01165   int ny = localInfo[myTransPe].ny_after_transpose;
01166 
01167   for ( int g=0; g<numGrids; ++g ) {
01168     // finish forward FFT (x dimension)
01169 #ifdef NAMD_FFTW
01170     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01171         ny * zdim / 2, 1, work, 1, 0);
01172 #endif
01173 
01174     // reciprocal space portion of PME
01175     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01176     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
01177                         lattice, ewaldcof, &(recip_evir2[g][1]));
01178     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
01179 
01180     // start backward FFT (x dimension)
01181 #ifdef NAMD_FFTW
01182     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01183         ny * zdim / 2, 1, work, 1, 0);
01184 #endif
01185   }
01186 
01187 #if CHARM_VERSION > 050402
01188   pmeProxyDir[CkMyPe()].sendUntrans();
01189 #else
01190   pmeProxyDir.sendUntrans(CkMyPe());
01191 #endif
01192 }
01193 
01194 void ComputePmeMgr::sendUntrans(void) {
01195 
01196   int zdim = myGrid.dim3;
01197   int y_start = localInfo[myTransPe].y_start_after_transpose;
01198   int ny = localInfo[myTransPe].ny_after_transpose;
01199 
01200 #ifdef USE_COMM_LIB
01201   ComlibInstanceHandle cinst2 = CkGetComlibInstance(1); 
01202   cinst2.beginIteration();
01203 #endif  
01204 
01205 #if CMK_BLUEGENEL
01206   CmiNetworkProgressAfter (0);
01207 #endif
01208 
01209   // send data for reverse transpose
01210   for (int j=0; j<numGridPes; j++) {
01211     int pe = gridPeOrder[j];  // different order on each node
01212     LocalPmeInfo &li = localInfo[pe];
01213     int x_start =li.x_start;
01214     int nx = li.nx;
01215     PmeUntransMsg *newmsg = new (nx*ny*zdim*numGrids,numGrids,
01216                                 PRIORITY_SIZE) PmeUntransMsg;
01217     newmsg->sourceNode = myTransPe;
01218     newmsg->y_start = y_start;
01219     newmsg->ny = ny;
01220     for ( int g=0; g<numGrids; ++g ) {
01221       if ( j == 0 ) {  // only need these once
01222         newmsg->evir[g] = recip_evir2[g];
01223       } else {
01224         newmsg->evir[g] = 0.;
01225       }
01226       CmiMemcpy((void*)(newmsg->qgrid+nx*ny*zdim*g),
01227                 (void*)(kgrid + qgrid_size*g + x_start*ny*zdim),
01228                 nx*ny*zdim*sizeof(float));
01229     }
01230     SET_PRIORITY(newmsg,sequence,PME_UNTRANS_PRIORITY)
01231 #if CHARM_VERSION > 050402
01232     pmeProxy[gridPeMap[pe]].recvUntrans(newmsg);
01233 #else
01234     pmeProxy.recvUntrans(newmsg,gridPeMap[pe]);
01235 #endif
01236   }
01237 
01238 #ifdef USE_COMM_LIB
01239   cinst2.endIteration();
01240 #endif  
01241 
01242   trans_count = numGridPes;
01243 }
01244 
01245 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
01246   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
01247   if ( untrans_count == numTransPes ) {
01248     for ( int g=0; g<numGrids; ++g ) {
01249       recip_evir[g] = 0.;
01250     }
01251   }
01252 
01253 #if CMK_BLUEGENEL
01254   CmiNetworkProgressAfter (0);
01255 #endif
01256 
01257   int g;
01258   for ( g=0; g<numGrids; ++g ) {
01259     recip_evir[g] += msg->evir[g];
01260   }
01261 
01262   int zdim = myGrid.dim3;
01263   // int x_start = localInfo[myGridPe].x_start;
01264   int nx = localInfo[myGridPe].nx;
01265   int y_start = msg->y_start;
01266   int ny = msg->ny;
01267   int slicelen = myGrid.K2 * zdim;
01268   int cpylen = ny * zdim;
01269   for ( g=0; g<numGrids; ++g ) {
01270     float *q = qgrid + qgrid_size * g + y_start * zdim;
01271     float *qmsg = msg->qgrid + nx * cpylen * g;
01272     for ( int x = 0; x < nx; ++x ) {
01273       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
01274       q += slicelen;
01275       qmsg += cpylen;
01276     }
01277   }
01278 
01279   delete msg;
01280   --untrans_count;
01281 
01282   if ( untrans_count == 0 ) {
01283 #if CHARM_VERSION > 050402
01284     pmeProxyDir[CkMyPe()].gridCalc3();
01285 #else
01286     pmeProxyDir.gridCalc3(CkMyPe());
01287 #endif
01288   }
01289 }
01290 
01291 void ComputePmeMgr::gridCalc3(void) {
01292   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
01293 
01294   // finish backward FFT
01295 #ifdef NAMD_FFTW
01296   for ( int g=0; g<numGrids; ++g ) {
01297     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
01298         (fftw_complex *) (qgrid + qgrid_size * g),
01299         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
01300   }
01301 #endif
01302 
01303 #if CHARM_VERSION > 050402
01304   pmeProxyDir[CkMyPe()].sendUngrid();
01305 #else
01306   pmeProxyDir.sendUngrid(CkMyPe());
01307 #endif
01308 }
01309 
01310 void ComputePmeMgr::sendUngrid(void) {
01311 
01312   for ( int j=0; j<numSources; ++j ) {
01313     // int msglen = qgrid_len;
01314     PmeGridMsg *newmsg = gridmsg_reuse[j];
01315     int pe = newmsg->sourceNode;
01316     if ( j == 0 ) {  // only need these once
01317       for ( int g=0; g<numGrids; ++g ) {
01318         newmsg->evir[g] = recip_evir[g];
01319       }
01320     } else {
01321       for ( int g=0; g<numGrids; ++g ) {
01322         newmsg->evir[g] = 0.;
01323       }
01324     }
01325     int zdim = myGrid.dim3;
01326     int flen = newmsg->len;
01327     int fstart = newmsg->start;
01328     int zlistlen = newmsg->zlistlen;
01329     int *zlist = newmsg->zlist;
01330     float *qmsg = newmsg->qgrid;
01331     for ( int g=0; g<numGrids; ++g ) {
01332       char *f = newmsg->fgrid + fgrid_len * g;
01333       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
01334       for ( int i=0; i<flen; ++i ) {
01335         if ( f[i] ) {
01336           for ( int k=0; k<zlistlen; ++k ) {
01337             *(qmsg++) = q[zlist[k]];
01338           }
01339         }
01340         q += zdim;
01341       }
01342     }
01343     newmsg->sourceNode = myGridPe;
01344 
01345     SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
01346 #if CHARM_VERSION > 050402
01347     pmeProxyDir[pe].recvUngrid(newmsg);
01348 #else
01349     pmeProxyDir.recvUngrid(newmsg,pe);
01350 #endif
01351   }
01352   grid_count = numSources;
01353   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01354 }
01355 
01356 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
01357   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
01358   if ( ungrid_count == 0 ) {
01359     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
01360   }
01361 
01362   if ( usePencils ) pmeCompute->copyPencils(msg);
01363   else pmeCompute->copyResults(msg);
01364   delete msg;
01365   --ungrid_count;
01366 
01367   if ( ungrid_count == 0 ) {
01368 #if CHARM_VERSION > 050402
01369     pmeProxyDir[CkMyPe()].ungridCalc();
01370 #else
01371     pmeProxyDir.ungridCalc(CkMyPe());
01372 #endif
01373   }
01374 }
01375 
01376 void ComputePmeMgr::ungridCalc(void) {
01377   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
01378 
01379   pmeCompute->ungridForces();
01380 
01381   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
01382 }
01383 
01384 
01385 static void scale_coordinates(PmeParticle p[], int N, Lattice lattice, PmeGrid grid) {
01386   Vector origin = lattice.origin();
01387   Vector recip1 = lattice.a_r();
01388   Vector recip2 = lattice.b_r();
01389   Vector recip3 = lattice.c_r();
01390   double ox = origin.x;
01391   double oy = origin.y;
01392   double oz = origin.z;
01393   double r1x = recip1.x;
01394   double r1y = recip1.y;
01395   double r1z = recip1.z;
01396   double r2x = recip2.x;
01397   double r2y = recip2.y;
01398   double r2z = recip2.z;
01399   double r3x = recip3.x;
01400   double r3y = recip3.y;
01401   double r3z = recip3.z;
01402   int K1 = grid.K1;
01403   int K2 = grid.K2;
01404   int K3 = grid.K3;
01405 
01406   for (int i=0; i<N; i++) {
01407     double px = p[i].x - ox;
01408     double py = p[i].y - oy;
01409     double pz = p[i].z - oz;
01410     double sx = px*r1x + py*r1y + pz*r1z;
01411     double sy = px*r2x + py*r2y + pz*r2z;
01412     double sz = px*r3x + py*r3y + pz*r3z;
01413     p[i].x = K1 * ( sx - floor(sx) );
01414     p[i].y = K2 * ( sy - floor(sy) );
01415     p[i].z = K3 * ( sz - floor(sz) );
01416     //  Check for rare rounding condition where K * ( 1 - epsilon ) == K
01417     //  which was observed with g++ on Intel x86 architecture.
01418     if ( p[i].x == K1 ) p[i].x = 0;
01419     if ( p[i].y == K2 ) p[i].y = 0;
01420     if ( p[i].z == K3 ) p[i].z = 0;
01421   }
01422 }
01423 
01424 static void scale_forces(Vector f[], int N, Lattice lattice) {
01425   Vector recip1 = lattice.a_r();
01426   Vector recip2 = lattice.b_r();
01427   Vector recip3 = lattice.c_r();
01428   double r1x = recip1.x;
01429   double r1y = recip1.y;
01430   double r1z = recip1.z;
01431   double r2x = recip2.x;
01432   double r2y = recip2.y;
01433   double r2z = recip2.z;
01434   double r3x = recip3.x;
01435   double r3y = recip3.y;
01436   double r3z = recip3.z;
01437 
01438   for (int i=0; i<N; i++) {
01439     double f1 = f[i].x;
01440     double f2 = f[i].y;
01441     double f3 = f[i].z;
01442     f[i].x = f1*r1x + f2*r2x + f3*r3x;
01443     f[i].y = f1*r1y + f2*r2y + f3*r3y;
01444     f[i].z = f1*r1z + f2*r2z + f3*r3z;
01445   }
01446 }
01447  
01448 
01449 ComputePme::ComputePme(ComputeID c) :
01450   ComputeHomePatches(c)
01451 {
01452   DebugM(4,"ComputePme created.\n");
01453 
01454   CProxy_ComputePmeMgr::ckLocalBranch(
01455         CkpvAccess(BOCclass_group).computePmeMgr)->setCompute(this);
01456 
01457   useAvgPositions = 1;
01458 
01459   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
01460 
01461   SimParameters *simParams = Node::Object()->simParameters;
01462 
01463   fepOn = simParams->fepOn;
01464   thermInt = simParams->thermInt;
01465   decouple = (fepOn || thermInt) && (simParams->decouple);
01466   fepElecLambdaStart = (fepOn || thermInt) ? simParams->fepElecLambdaStart : 0;
01467             
01468   if (fepOn || thermInt) {
01469     numGrids = 2;
01470     if (decouple) numGrids += 2;
01471     if (fepElecLambdaStart || thermInt) numGrids ++;
01472   }
01473   else numGrids = 1;
01474   lesOn = simParams->lesOn;
01475   if ( lesOn ) {
01476     lesFactor = simParams->lesFactor;
01477     numGrids = lesFactor;
01478   }
01479   selfOn = 0;
01480   pairOn = simParams->pairInteractionOn;
01481   if ( pairOn ) {
01482     selfOn = simParams->pairInteractionSelf;
01483     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
01484     numGrids = selfOn ? 1 : 3;
01485   }
01486 
01487   myGrid.K1 = simParams->PMEGridSizeX;
01488   myGrid.K2 = simParams->PMEGridSizeY;
01489   myGrid.K3 = simParams->PMEGridSizeZ;
01490   myGrid.order = simParams->PMEInterpOrder;
01491   myGrid.dim2 = myGrid.K2;
01492   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01493   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
01494   fsize = myGrid.K1 * myGrid.dim2;
01495   q_arr = new double*[fsize*numGrids];
01496   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(double*) );
01497   f_arr = new char[fsize*numGrids];
01498   fz_arr = new char[myGrid.K3];
01499 }
01500 
01501 ComputePme::~ComputePme()
01502 {
01503   for (int i=0; i<fsize*numGrids; ++i) {
01504     if ( q_arr[i] ) {
01505       delete [] q_arr[i];
01506     }
01507   }
01508   delete [] q_arr;
01509   delete [] f_arr;
01510   delete [] fz_arr;
01511 }
01512 
01513 void ComputePme::doWork()
01514 {
01515   DebugM(4,"Entering ComputePme::doWork().\n");
01516 
01517 #ifdef TRACE_COMPUTE_OBJECTS
01518     double traceObjStartTime = CmiWallTimer();
01519 #endif
01520 
01521   ResizeArrayIter<PatchElem> ap(patchList);
01522 
01523   // Skip computations if nothing to do.
01524   if ( ! patchList[0].p->flags.doFullElectrostatics )
01525   {
01526     for (ap = ap.begin(); ap != ap.end(); ap++) {
01527       CompAtom *x = (*ap).positionBox->open();
01528       Results *r = (*ap).forceBox->open();
01529       (*ap).positionBox->close(&x);
01530       (*ap).forceBox->close(&r);
01531     }
01532     reduction->submit();
01533     return;
01534   }
01535 
01536   // allocate storage
01537   numLocalAtoms = 0;
01538   for (ap = ap.begin(); ap != ap.end(); ap++) {
01539     numLocalAtoms += (*ap).p->getNumAtoms();
01540   }
01541 
01542   Lattice lattice = patchList[0].p->flags.lattice;
01543 
01544   localData = new PmeParticle[numLocalAtoms*(numGrids+
01545                                         ((numGrids>1 || selfOn)?1:0))];
01546   localPartition = new char[numLocalAtoms];
01547 
01548   int g;
01549   for ( g=0; g<numGrids; ++g ) {
01550     localGridData[g] = localData + numLocalAtoms*(g+1);
01551   }
01552 
01553   // get positions and charges
01554   PmeParticle * data_ptr = localData;
01555   char * part_ptr = localPartition;
01556   const BigReal coloumb_sqrt = sqrt( COLOUMB * ComputeNonbondedUtil::scaling
01557                                 * ComputeNonbondedUtil::dielectric_1 );
01558 
01559   for (ap = ap.begin(); ap != ap.end(); ap++) {
01560 #ifdef NETWORK_PROGRESS
01561     CmiNetworkProgress();
01562 #endif
01563 
01564     CompAtom *x = (*ap).positionBox->open();
01565     if ( patchList[0].p->flags.doMolly ) {
01566       (*ap).positionBox->close(&x);
01567       x = (*ap).avgPositionBox->open();
01568     }
01569     int numAtoms = (*ap).p->getNumAtoms();
01570 
01571     for(int i=0; i<numAtoms; ++i)
01572     {
01573       data_ptr->x = x[i].position.x;
01574       data_ptr->y = x[i].position.y;
01575       data_ptr->z = x[i].position.z;
01576       data_ptr->cg = coloumb_sqrt * x[i].charge;
01577       ++data_ptr;
01578       *part_ptr = x[i].partition;
01579       ++part_ptr;
01580     }
01581 
01582     if ( patchList[0].p->flags.doMolly ) { (*ap).avgPositionBox->close(&x); }
01583     else { (*ap).positionBox->close(&x); }
01584   }
01585 
01586   // copy to other grids if needed
01587   if ( ((fepOn || thermInt) && (!decouple)) || lesOn ) {
01588     for ( g=0; g<numGrids; ++g ) {
01589 #ifdef NETWORK_PROGRESS
01590       CmiNetworkProgress();
01591 #endif
01592 
01593       PmeParticle *lgd = localGridData[g];
01594       int nga = 0;
01595       for(int i=0; i<numLocalAtoms; ++i) {
01596         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
01597           // for FEP/TI: grid 0 gets non-alch + partition 1;
01598           // grid 1 gets non-alch + partition 2;
01599           // grid 2 (only if called for with numGrids=3) gets only non-alch
01600           lgd[nga++] = localData[i];
01601         }
01602       }
0160