Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 //#define MANUAL_DEBUG_FFTW3 1
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012 // fftw2 doesn't have these defined
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024 
00025 #include <vector>
00026 #include <algorithm>
00027 using namespace std;
00028 
00029 #include "InfoStream.h"
00030 #include "Node.h"
00031 #include "PatchMap.h"
00032 #include "PatchMap.inl"
00033 #include "AtomMap.h"
00034 #include "ComputePme.h"
00035 #include "ComputePmeMgr.decl.h"
00036 #include "PmeBase.inl"
00037 #include "PmeRealSpace.h"
00038 #include "PmeKSpace.h"
00039 #include "ComputeNonbondedUtil.h"
00040 #include "PatchMgr.h"
00041 #include "Molecule.h"
00042 #include "ReductionMgr.h"
00043 #include "ComputeMgr.h"
00044 #include "ComputeMgr.decl.h"
00045 // #define DEBUGM
00046 #define MIN_DEBUG_LEVEL 3
00047 #include "Debug.h"
00048 #include "SimParameters.h"
00049 #include "WorkDistrib.h"
00050 #include "varsizemsg.h"
00051 #include "Random.h"
00052 #include "ckhashtable.h"
00053 #include "Priorities.h"
00054 
00055 #include "ComputeMoa.h"
00056 #include "ComputeMoaMgr.decl.h" 
00057 
00058 //#define     USE_RANDOM_TOPO         1
00059 
00060 //#define USE_TOPO_SFC                    1
00061 //#define     USE_CKLOOP                1
00062 //#include "TopoManager.h"
00063 
00064 #ifndef SQRT_PI
00065 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00066 #endif
00067 
00068 #if CMK_PERSISTENT_COMM 
00069 #define USE_PERSISTENT      1
00070 #endif
00071 
00072 #if USE_PERSISTENT
00073 #define Z_PERSIST 1
00074 #define Y_PERSIST 1
00075 #define X_PERSIST 1
00076 #endif
00077 
00078 //#define USE_NODE_PAR_RECEIVE    1
00079 
00080 char *pencilPMEProcessors;
00081 
00082 class PmeAckMsg : public CMessage_PmeAckMsg {
00083 };
00084 
00085 class PmeGridMsg : public CMessage_PmeGridMsg {
00086 public:
00087 
00088   int sourceNode;
00089   int sequence;
00090   int hasData;
00091   Lattice lattice;
00092   PmeReduction *evir;
00093   int start;
00094   int len;
00095   int zlistlen;
00096   int *zlist;
00097   char *fgrid;
00098   float *qgrid;
00099   CkArrayIndex3D destElem;
00100 };
00101 
00102 class PmeTransMsg : public CMessage_PmeTransMsg {
00103 public:
00104 
00105   int sourceNode;
00106   int sequence;
00107   int hasData;
00108   Lattice lattice;
00109   int x_start;
00110   int nx;
00111   float *qgrid;
00112   CkArrayIndex3D destElem;
00113 };
00114 
00115 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
00116 public:
00117   PmeTransMsg *msg;
00118   int *count;
00119   CmiNodeLock lock;
00120 };
00121 
00122 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00123 public:
00124 
00125   int sourceNode;
00126   int has_evir;
00127   PmeReduction *evir;
00128   int y_start;
00129   int ny;
00130   float *qgrid;
00131   CkArrayIndex3D destElem;
00132 };
00133 
00134 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
00135 public:
00136   PmeUntransMsg *msg;
00137   int *count;
00138   CmiNodeLock lock;
00139 };
00140 
00141 class PmePencilMap : public CBase_PmePencilMap {
00142 public:
00143   PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00144     : ia(i_a), ib(i_b), nb(n_b),
00145       size(n), data(newcopyint(n,d)) {
00146   }
00147   virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00148     //Return an ``arrayHdl'', given some information about the array
00149     return 0;
00150   }
00151   virtual int procNum(int, const CkArrayIndex &i) {
00152     //Return the home processor number for this element of this array
00153     return data[ i.data()[ia] * nb + i.data()[ib] ];
00154   }
00155   virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00156     int mype = CkMyPe();
00157     for ( int i=0; i < size; ++i ) {
00158       if ( data[i] == mype ) {
00159         CkArrayIndex3D ai(0,0,0);
00160         ai.data()[ia] = i / nb;
00161         ai.data()[ib] = i % nb;
00162         if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00163         if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00164         mgr->insertInitial(ai,msg);
00165         msg = 0;
00166       }
00167     }
00168     mgr->doneInserting();
00169     if ( msg ) CkFreeMsg(msg);
00170   }
00171 private:
00172   const int ia, ib, nb, size;
00173   const int* const data;
00174   static int* newcopyint(int n, int *d) {
00175     int *newd = new int[n];
00176     memcpy(newd, d, n*sizeof(int));
00177     return newd;
00178   }
00179 };
00180 
00181 // use this idiom since messages don't have copy constructors
00182 struct PmePencilInitMsgData {
00183   PmeGrid grid;
00184   int xBlocks, yBlocks, zBlocks;
00185   CProxy_PmeXPencil xPencil;
00186   CProxy_PmeYPencil yPencil;
00187   CProxy_PmeZPencil zPencil;
00188   CProxy_ComputePmeMgr pmeProxy;
00189   CProxy_NodePmeMgr pmeNodeProxy;
00190   CProxy_PmePencilMap xm;
00191   CProxy_PmePencilMap ym;
00192   CProxy_PmePencilMap zm;
00193 };
00194 
00195 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00196 public:
00197    PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00198    PmePencilInitMsgData data;
00199 };
00200 
00201 
00202 struct LocalPmeInfo {
00203   int nx, x_start;
00204   int ny_after_transpose, y_start_after_transpose;
00205 };
00206 
00207 struct NodePmeInfo {
00208   int npe, pe_start, real_node;
00209 };
00210 
00211 
00212 //Assigns gridPeMap and transPeMap to different set of processors.
00213 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00214   int ncpus = CkNumPes();
00215   
00216   for ( int i=0; i<numGridPes; ++i ) {
00217     gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
00218   }
00219   std::sort(gridPeMap,gridPeMap+numGridPes);
00220   int firstTransPe = ncpus - numGridPes - numTransPes;
00221   if ( firstTransPe < 0 ) {
00222     firstTransPe = 0;
00223     // 0 should be first in list, skip if possible
00224     if ( ncpus > numTransPes ) firstTransPe = 1;
00225   }
00226   for ( int i=0; i<numTransPes; ++i ) {
00227     transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
00228   }
00229   std::sort(transPeMap,transPeMap+numTransPes);
00230 }
00231 
00232 #if USE_TOPOMAP 
00233 //Topology aware PME allocation
00234 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00235                              int nbpes=0);
00236 #endif
00237 
00238 
00239 int compare_bit_reversed(int a, int b) {
00240   int d = a ^ b;
00241   int c = 1;
00242   if ( d ) while ( ! (d & c) ) {
00243     c = c << 1;
00244   }
00245   return (a & c) - (b & c);
00246 }
00247 
00248 inline bool less_than_bit_reversed(int a, int b) {
00249   int d = a ^ b;
00250   int c = 1;
00251   if ( d ) while ( ! (d & c) ) {
00252     c = c << 1;
00253   }
00254   return d && (b & c);
00255 }
00256 
00257 struct sortop_bit_reversed {
00258   inline bool operator() (int a, int b) const {
00259     return less_than_bit_reversed(a,b);
00260   }
00261 };
00262 
00263 struct ijpair {
00264   int i,j;
00265   ijpair() {;}
00266   ijpair(int I, int J) : i(I), j(J) {;}
00267 };
00268 
00269 struct ijpair_sortop_bit_reversed {
00270   inline bool operator() (const ijpair &a, const ijpair &b) const {
00271     return ( less_than_bit_reversed(a.i,b.i)
00272              || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
00273   }
00274 };
00275 
00276 class ComputePmeMgr : public BOCclass {
00277 public:
00278   friend class ComputePme;
00279   friend class NodePmeMgr;
00280   ComputePmeMgr();
00281   ~ComputePmeMgr();
00282 
00283   void initialize(CkQdMsg*);
00284   void initialize_pencils(CkQdMsg*);
00285   void activate_pencils(CkQdMsg*);
00286   void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00287   void initialize_computes();
00288 
00289   void sendData(Lattice &, int sequence);
00290   void sendPencils(Lattice &, int sequence);
00291   void recvGrid(PmeGridMsg *);
00292   void gridCalc1(void);
00293   void sendTransBarrier(void);
00294   void sendTrans(void);
00295   void fwdSharedTrans(PmeTransMsg *);
00296   void recvSharedTrans(PmeSharedTransMsg *);
00297   void recvTrans(PmeTransMsg *);
00298   void procTrans(PmeTransMsg *);
00299   void gridCalc2(void);
00300   #ifdef OPENATOM_VERSION
00301   void gridCalc2Moa(void);
00302   #endif // OPENATOM_VERSION
00303   void gridCalc2R(void);
00304   void fwdSharedUntrans(PmeUntransMsg *);
00305   void recvSharedUntrans(PmeSharedUntransMsg *);
00306   void sendUntrans(void);
00307   void recvUntrans(PmeUntransMsg *);
00308   void procUntrans(PmeUntransMsg *);
00309   void gridCalc3(void);
00310   void sendUngrid(void);
00311   void recvUngrid(PmeGridMsg *);
00312   void recvAck(PmeAckMsg *);
00313   void copyResults(PmeGridMsg *);
00314   void copyPencils(PmeGridMsg *);
00315   void ungridCalc(void);
00316   void submitReductions();
00317 
00318 #if USE_PERSISTENT
00319   void setup_recvgrid_persistent();
00320 #endif
00321 
00322   //Tells if the current processor is a PME processor or not. Called by NamdCentralLB
00323   int isPmeProcessor(int p);  
00324 
00325   static CmiNodeLock fftw_plan_lock;
00326 
00327 private:
00328 
00329 #if USE_PERSISTENT
00330   PersistentHandle   *recvGrid_handle;
00331 #endif
00332 
00333   CProxy_ComputePmeMgr pmeProxy;
00334   CProxy_ComputePmeMgr pmeProxyDir;
00335   CProxy_NodePmeMgr pmeNodeProxy;
00336   
00337   void addCompute(ComputePme *c) {
00338     if ( ! pmeComputes.size() ) initialize_computes();
00339     pmeComputes.add(c);
00340     c->setMgr(this);
00341   }
00342 
00343   ResizeArray<ComputePme*> pmeComputes;
00344   ResizeArray<ComputePme*> heldComputes;
00345   PmeGrid myGrid;
00346   Lattice lattice;
00347   PmeKSpace *myKSpace;
00348   float *qgrid;
00349   float *kgrid;
00350 
00351 #ifdef NAMD_FFTW
00352 #ifdef NAMD_FFTW_3
00353   fftwf_plan *forward_plan_x, *backward_plan_x;
00354   fftwf_plan *forward_plan_yz, *backward_plan_yz;
00355   fftwf_complex *work;
00356 #else
00357   fftw_plan forward_plan_x, backward_plan_x;
00358   rfftwnd_plan forward_plan_yz, backward_plan_yz;
00359   fftw_complex *work;
00360 #endif
00361 #else
00362   float *work;
00363 #endif
00364 
00365   int qsize, fsize, bsize;
00366   int alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
00367   int alchDecouple;
00368   BigReal alchElecLambdaStart;
00369   BigReal elecLambdaUp;
00370   BigReal elecLambdaDown;
00371 
00372   float **q_arr;
00373   float **q_list;
00374   int q_count;
00375   char *f_arr;
00376   char *fz_arr;
00377   PmeReduction evir[PME_MAX_EVALS];
00378   SubmitReduction *reduction;
00379   SubmitReduction *amd_reduction;
00380 
00381   int noWorkCount;
00382   int doWorkCount;
00383   int ungridForcesCount;
00384   
00385   LocalPmeInfo *localInfo;
00386   NodePmeInfo *gridNodeInfo;
00387   NodePmeInfo *transNodeInfo;
00388   int qgrid_size;
00389   int qgrid_start;
00390   int qgrid_len;
00391   int fgrid_start;
00392   int fgrid_len;
00393 
00394   int numSources;
00395   int numGridPes;
00396   int numTransPes;
00397   int numGridNodes;
00398   int numTransNodes;
00399   int numDestRecipPes;
00400   int myGridPe, myGridNode;
00401   int myTransPe, myTransNode;
00402   int *gridPeMap;
00403   int *transPeMap;
00404   int *recipPeDest;
00405   int *gridPeOrder;
00406   int *gridNodeOrder;
00407   int *transNodeOrder;
00408   char *isPmeFlag;
00409   int grid_count;
00410   int trans_count;
00411   int untrans_count;
00412   int ungrid_count;
00413   PmeGridMsg **gridmsg_reuse;
00414   PmeReduction recip_evir[PME_MAX_EVALS];
00415   PmeReduction recip_evir2[PME_MAX_EVALS];
00416 
00417   int compute_sequence;  // set from patch computes, used for priorities
00418   int grid_sequence;  // set from grid messages, used for priorities
00419   int useBarrier;
00420   int sendTransBarrier_received;
00421 
00422   int usePencils;
00423   int xBlocks, yBlocks, zBlocks;
00424   CProxy_PmeXPencil xPencil;
00425   CProxy_PmeYPencil yPencil;
00426   CProxy_PmeZPencil zPencil;
00427   char *pencilActive;
00428   ijpair *activePencils;
00429   int numPencilsActive;
00430   int strayChargeErrors;
00431 };
00432 
00433   CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00434 
00435 int isPmeProcessor(int p){ 
00436   return CProxy_ComputePmeMgr::ckLocalBranch(CkpvAccess(BOCclass_group).computePmeMgr)->isPmeProcessor(p);
00437 }
00438 
00439 int ComputePmeMgr::isPmeProcessor(int p){ 
00440   SimParameters *simParams = Node::Object()->simParameters;
00441   return ( usePencils || (simParams->PMEPencils>0) ? pencilPMEProcessors[p] : isPmeFlag[p] );
00442 }
00443 
00444 class NodePmeMgr : public CBase_NodePmeMgr {
00445 public:
00446   friend class ComputePmeMgr;
00447   NodePmeMgr();
00448   ~NodePmeMgr();
00449   void initialize();
00450   void recvTrans(PmeTransMsg *);
00451   void recvUntrans(PmeUntransMsg *);
00452   void registerXPencil(CkArrayIndex3D, PmeXPencil *);
00453   void registerYPencil(CkArrayIndex3D, PmeYPencil *);
00454   void registerZPencil(CkArrayIndex3D, PmeZPencil *);
00455   void recvXTrans(PmeTransMsg *);
00456   void recvYTrans(PmeTransMsg *);
00457   void recvYUntrans(PmeUntransMsg *);
00458   void recvZGrid(PmeGridMsg *);
00459   void recvZUntrans(PmeUntransMsg *);
00460 
00461   void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
00462       xm=_xm; ym=_ym; zm=_zm;
00463   }
00464   CProxy_PmePencilMap xm;
00465   CProxy_PmePencilMap ym;
00466   CProxy_PmePencilMap zm;
00467 
00468 private:
00469   CProxy_ComputePmeMgr mgrProxy;
00470   ComputePmeMgr *mgrObject;
00471   ComputePmeMgr **mgrObjects;
00472   CProxy_PmeXPencil xPencil;
00473   CProxy_PmeYPencil yPencil;
00474   CProxy_PmeZPencil zPencil;
00475   CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
00476   CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
00477   CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;  
00478 };
00479 
00480 NodePmeMgr::NodePmeMgr() {
00481   mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
00482 }
00483 
00484 NodePmeMgr::~NodePmeMgr() {
00485   delete [] mgrObjects;
00486 }
00487 
00488 void NodePmeMgr::initialize() {
00489   CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
00490   mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
00491   if ( CkMyRank() == 0 ) {
00492     mgrProxy = proxy;
00493     mgrObject = proxy.ckLocalBranch();
00494   }
00495 }
00496 
00497 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
00498   mgrObject->fwdSharedTrans(msg);
00499 }
00500 
00501 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
00502   mgrObject->fwdSharedUntrans(msg);
00503 }
00504 
00505 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
00506 {
00507   CmiLock(ComputePmeMgr::fftw_plan_lock);
00508   xPencilObj.put(idx)=obj;
00509   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00510 }
00511 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
00512 {
00513   CmiLock(ComputePmeMgr::fftw_plan_lock);
00514   yPencilObj.put(idx)=obj;
00515   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00516 }
00517 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
00518 {
00519   CmiLock(ComputePmeMgr::fftw_plan_lock);
00520   zPencilObj.put(idx)=obj;
00521   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00522 }
00523 
00524 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
00525                                  pmeProxyDir(thisgroup) {
00526 
00527   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00528   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00529 
00530   pmeNodeProxy.ckLocalBranch()->initialize();
00531 
00532   if ( CmiMyRank() == 0 ) {
00533     fftw_plan_lock = CmiCreateLock();
00534   }
00535 
00536   myKSpace = 0;
00537   kgrid = 0;
00538   work = 0;
00539   grid_count = 0;
00540   trans_count = 0;
00541   untrans_count = 0;
00542   ungrid_count = 0;
00543   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00544   useBarrier = 0;
00545   sendTransBarrier_received = 0;
00546   usePencils = 0;
00547 }
00548 
00549 
00550 void ComputePmeMgr::recvArrays(
00551         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00552   xPencil = x;  yPencil = y;  zPencil = z;
00553   
00554     if(CmiMyRank()==0)
00555     {
00556       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00557       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00558       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00559     }
00560 }
00561 
00562 #if USE_TOPO_SFC
00563  struct Coord
00564   {
00565     int x, y, z;
00566     Coord(): x(0), y(0), z(0) {}
00567     Coord(int a, int b, int c): x(a), y(b), z(c) {}
00568   };
00569   extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00570 
00571   void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00572   {
00573      SortableResizeArray<int> newprocs(procs.size());
00574      int num = 0;
00575      for (int i=0; i<result.size(); i++) {
00576        Coord &c = result[i];
00577        for (int j=0; j<procs.size(); j++) {
00578          int pe = procs[j];
00579          int x,y,z,t;
00580          tmgr.rankToCoordinates(pe, x, y, z, t);    
00581          if (x==c.x && y==c.y && z==c.z)
00582            newprocs[num++] = pe;
00583        }
00584      } 
00585      CmiAssert(newprocs.size() == procs.size());
00586      procs = newprocs;
00587   }
00588 
00589   int find_level_grid(int x) 
00590   {
00591      int a = sqrt(x);
00592      int b;
00593      for (; a>0; a--) {
00594        if (x%a == 0) break;
00595      }
00596      if (a==1) a = x;
00597      b = x/a;
00598      //return a>b?a:b;
00599      return b;
00600   }
00601   CmiNodeLock tmgr_lock;
00602 #endif
00603 
00604 void Pme_init()
00605 {
00606 #if USE_TOPO_SFC
00607   if (CkMyRank() == 0) 
00608     tmgr_lock = CmiCreateLock();
00609 #endif
00610 }
00611 
00612 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00613   delete msg;
00614 
00615   localInfo = new LocalPmeInfo[CkNumPes()];
00616   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00617   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00618   gridPeMap = new int[CkNumPes()];
00619   transPeMap = new int[CkNumPes()];
00620   recipPeDest = new int[CkNumPes()];
00621   gridPeOrder = new int[CkNumPes()];
00622   gridNodeOrder = new int[CkNumNodes()];
00623   transNodeOrder = new int[CkNumNodes()];
00624   isPmeFlag = new char[CkNumPes()];  
00625 
00626   SimParameters *simParams = Node::Object()->simParameters;
00627   PatchMap *patchMap = PatchMap::Object();
00628 
00629   alchFepOn = simParams->alchFepOn;
00630   alchThermIntOn = simParams->alchThermIntOn;
00631   alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
00632   alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? 
00633     simParams->alchElecLambdaStart : 0;
00634   if (alchFepOn || alchThermIntOn) {
00635     numGrids = 2;
00636     if (alchDecouple) numGrids += 2;
00637     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00638   }
00639   else numGrids = 1;
00640   lesOn = simParams->lesOn;
00641   useBarrier = simParams->PMEBarrier;
00642   if ( lesOn ) {
00643     lesFactor = simParams->lesFactor;
00644     numGrids = lesFactor;
00645   }
00646   selfOn = 0;
00647   pairOn = simParams->pairInteractionOn;
00648   if ( pairOn ) {
00649     selfOn = simParams->pairInteractionSelf;
00650     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00651     numGrids = selfOn ? 1 : 3;
00652   }
00653 
00654   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00655   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00656   else {
00657     int nrps = simParams->PMEProcessors;
00658     if ( nrps <= 0 ) nrps = CkNumPes();
00659     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00660     int dimx = simParams->PMEGridSizeX;
00661     int dimy = simParams->PMEGridSizeY;
00662     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00663     if ( maxslabs > nrps ) maxslabs = nrps;
00664     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00665                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00666     if ( maxpencils > nrps ) maxpencils = nrps;
00667     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00668     else usePencils = 0;
00669   }
00670 
00671   if ( usePencils ) {
00672     int nrps = simParams->PMEProcessors;
00673     if ( nrps <= 0 ) nrps = CkNumPes();
00674     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00675     if ( simParams->PMEPencils > 1 &&
00676          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00677       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00678     } else {
00679       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00680                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00681       if ( nb2 > nrps ) nb2 = nrps;
00682       if ( nb2 < 1 ) nb2 = 1;
00683       int nb = (int) sqrt((float)nb2);
00684       if ( nb < 1 ) nb = 1;
00685       xBlocks = zBlocks = nb;
00686       yBlocks = nb2 / nb;
00687     }
00688 
00689     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00690     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00691     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00692 
00693     int dimx = simParams->PMEGridSizeX;
00694     int bx = 1 + ( dimx - 1 ) / xBlocks;
00695     xBlocks = 1 + ( dimx - 1 ) / bx;
00696 
00697     int dimy = simParams->PMEGridSizeY;
00698     int by = 1 + ( dimy - 1 ) / yBlocks;
00699     yBlocks = 1 + ( dimy - 1 ) / by;
00700 
00701     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00702     int bz = 1 + ( dimz - 1 ) / zBlocks;
00703     zBlocks = 1 + ( dimz - 1 ) / bz;
00704 
00705     if ( xBlocks * yBlocks > CkNumPes() ) {
00706       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00707     }
00708     if ( xBlocks * zBlocks > CkNumPes() ) {
00709       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00710     }
00711     if ( yBlocks * zBlocks > CkNumPes() ) {
00712       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00713     }
00714 
00715     if ( ! CkMyPe() ) {
00716       iout << iINFO << "PME using " << xBlocks << " x " <<
00717         yBlocks << " x " << zBlocks <<
00718         " pencil grid for FFT and reciprocal sum.\n" << endi;
00719     }
00720   } else { // usePencils
00721 
00722   {  // decide how many pes to use for reciprocal sum
00723 
00724     // rules based on work available
00725     int minslices = simParams->PMEMinSlices;
00726     int dimx = simParams->PMEGridSizeX;
00727     int nrpx = ( dimx + minslices - 1 ) / minslices;
00728     int dimy = simParams->PMEGridSizeY;
00729     int nrpy = ( dimy + minslices - 1 ) / minslices;
00730 
00731     // rules based on processors available
00732     int nrpp = CkNumPes();
00733     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00734     if ( nrpp < nrpx ) nrpx = nrpp;
00735     if ( nrpp < nrpy ) nrpy = nrpp;
00736 
00737     // user override
00738     int nrps = simParams->PMEProcessors;
00739     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00740     if ( nrps > 0 ) nrpx = nrps;
00741     if ( nrps > 0 ) nrpy = nrps;
00742 
00743     // make sure there aren't any totally empty processors
00744     int bx = ( dimx + nrpx - 1 ) / nrpx;
00745     nrpx = ( dimx + bx - 1 ) / bx;
00746     int by = ( dimy + nrpy - 1 ) / nrpy;
00747     nrpy = ( dimy + by - 1 ) / by;
00748     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00749       NAMD_bug("Error in selecting number of PME processors.");
00750     if ( by != ( dimy + nrpy - 1 ) / nrpy )
00751       NAMD_bug("Error in selecting number of PME processors.");
00752 
00753     numGridPes = nrpx;
00754     numTransPes = nrpy;
00755   }
00756   if ( ! CkMyPe() ) {
00757     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
00758       " processors for FFT and reciprocal sum.\n" << endi;
00759   }
00760 
00761   int sum_npes = numTransPes + numGridPes;
00762   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
00763 
00764 #if 0 // USE_TOPOMAP
00765   /* This code is being disabled permanently for slab PME on Blue Gene machines */
00766   PatchMap * pmap = PatchMap::Object();
00767   
00768   int patch_pes = pmap->numNodesWithPatches();
00769   TopoManager tmgr;
00770   if(tmgr.hasMultipleProcsPerNode())
00771     patch_pes *= 2;
00772 
00773   bool done = false;
00774   if(CkNumPes() > 2*sum_npes + patch_pes) {    
00775     done = generateBGLORBPmePeList(transPeMap, numTransPes);
00776     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
00777   }
00778   else 
00779     if(CkNumPes() > 2 *max_npes + patch_pes) {
00780       done = generateBGLORBPmePeList(transPeMap, max_npes);
00781       gridPeMap = transPeMap;
00782     }
00783 
00784   if (!done)
00785 #endif
00786     {
00787       //generatePmePeList(transPeMap, max_npes);
00788       //gridPeMap = transPeMap;
00789       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
00790     }
00791   
00792   if ( ! CkMyPe() ) {
00793     iout << iINFO << "PME GRID LOCATIONS:";
00794     int i;
00795     for ( i=0; i<numGridPes && i<10; ++i ) {
00796       iout << " " << gridPeMap[i];
00797     }
00798     if ( i < numGridPes ) iout << " ...";
00799     iout << "\n" << endi;
00800     iout << iINFO << "PME TRANS LOCATIONS:";
00801     for ( i=0; i<numTransPes && i<10; ++i ) {
00802       iout << " " << transPeMap[i];
00803     }
00804     if ( i < numTransPes ) iout << " ...";
00805     iout << "\n" << endi;
00806   }
00807 
00808   // sort based on nodes and physical nodes
00809   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
00810 
00811   myGridPe = -1;
00812   myGridNode = -1;
00813   int i = 0;
00814   for ( i=0; i<CkNumPes(); ++i )
00815     isPmeFlag[i] = 0;
00816   int node = -1;
00817   int real_node = -1;
00818   for ( i=0; i<numGridPes; ++i ) {
00819     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
00820     isPmeFlag[gridPeMap[i]] |= 1;
00821     int real_node_i = CkNodeOf(gridPeMap[i]);
00822     if ( real_node_i == real_node ) {
00823       gridNodeInfo[node].npe += 1;
00824     } else {
00825       real_node = real_node_i;
00826       ++node;
00827       gridNodeInfo[node].real_node = real_node;
00828       gridNodeInfo[node].pe_start = i;
00829       gridNodeInfo[node].npe = 1;
00830     }
00831     if ( CkMyNode() == real_node_i ) myGridNode = node;
00832   }
00833   numGridNodes = node + 1;
00834   myTransPe = -1;
00835   myTransNode = -1;
00836   node = -1;
00837   real_node = -1;
00838   for ( i=0; i<numTransPes; ++i ) {
00839     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
00840     isPmeFlag[transPeMap[i]] |= 2;
00841     int real_node_i = CkNodeOf(transPeMap[i]);
00842     if ( real_node_i == real_node ) {
00843       transNodeInfo[node].npe += 1;
00844     } else {
00845       real_node = real_node_i;
00846       ++node;
00847       transNodeInfo[node].real_node = real_node;
00848       transNodeInfo[node].pe_start = i;
00849       transNodeInfo[node].npe = 1;
00850     }
00851     if ( CkMyNode() == real_node_i ) myTransNode = node;
00852   }
00853   numTransNodes = node + 1;
00854 
00855   if ( ! CkMyPe() ) {
00856     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
00857          << numTransNodes << " TRANS NODES\n" << endi;
00858   }
00859 
00860   { // generate random orderings for grid and trans messages
00861     int i;
00862     for ( i = 0; i < numGridPes; ++i ) {
00863       gridPeOrder[i] = i;
00864     }
00865     Random rand(CkMyPe());
00866     if ( myGridPe < 0 ) {
00867       rand.reorder(gridPeOrder,numGridPes);
00868     } else {  // self last
00869       gridPeOrder[myGridPe] = numGridPes-1;
00870       gridPeOrder[numGridPes-1] = myGridPe;
00871       rand.reorder(gridPeOrder,numGridPes-1);
00872     } 
00873     for ( i = 0; i < numGridNodes; ++i ) {
00874       gridNodeOrder[i] = i;
00875     }
00876     if ( myGridNode < 0 ) {
00877       rand.reorder(gridNodeOrder,numGridNodes);
00878     } else {  // self last
00879       gridNodeOrder[myGridNode] = numGridNodes-1;
00880       gridNodeOrder[numGridNodes-1] = myGridNode;
00881       rand.reorder(gridNodeOrder,numGridNodes-1);
00882     }
00883     for ( i = 0; i < numTransNodes; ++i ) {
00884       transNodeOrder[i] = i;
00885     }
00886     if ( myTransNode < 0 ) {
00887       rand.reorder(transNodeOrder,numTransNodes);
00888     } else {  // self last
00889       transNodeOrder[myTransNode] = numTransNodes-1;
00890       transNodeOrder[numTransNodes-1] = myTransNode;
00891       rand.reorder(transNodeOrder,numTransNodes-1);
00892     }
00893   }
00894   
00895   } // ! usePencils
00896 
00897   myGrid.K1 = simParams->PMEGridSizeX;
00898   myGrid.K2 = simParams->PMEGridSizeY;
00899   myGrid.K3 = simParams->PMEGridSizeZ;
00900   myGrid.order = simParams->PMEInterpOrder;
00901   myGrid.dim2 = myGrid.K2;
00902   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
00903 
00904   if ( ! usePencils ) {
00905     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
00906     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
00907     myGrid.block3 = myGrid.dim3 / 2;  // complex
00908   }
00909 
00910   if ( usePencils ) {
00911     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
00912     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
00913     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
00914 
00915 
00916       int pe = 0;
00917       int x,y,z;
00918 
00919                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
00920                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
00921                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
00922                 {
00923                 int basepe = 0;  int npe = CkNumPes();
00924                         if ( npe > xBlocks*yBlocks &&
00925                                 npe > xBlocks*zBlocks &&
00926                                 npe > yBlocks*zBlocks ) {
00927                         // avoid node 0
00928                         ++basepe;
00929                         --npe;
00930                 }
00931 
00932       
00933                 // decide which pes to use by bit reversal and patch use
00934                 int i;
00935                 int ncpus = CkNumPes();
00936                 SortableResizeArray<int> patches, nopatches, pmeprocs;
00937                 PatchMap *pmap = PatchMap::Object();
00938                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
00939                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
00940                         if ( ri ) { // keep 0 for special case
00941                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
00942                                 else nopatches.add(ri);
00943                         }
00944                 }
00945 
00946 #if USE_RANDOM_TOPO
00947             Random rand(CkMyPe());
00948             int *tmp = new int[patches.size()];
00949             int nn = patches.size();
00950             for (i=0;i<nn;i++)  tmp[i] = patches[i];
00951             rand.reorder(tmp, nn);
00952             patches.resize(0);
00953             for (i=0;i<nn;i++)  patches.add(tmp[i]);
00954             delete [] tmp;
00955             tmp = new int[nopatches.size()];
00956             nn = nopatches.size();
00957             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
00958             rand.reorder(tmp, nn);
00959             nopatches.resize(0);
00960             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
00961             delete [] tmp;
00962 #endif
00963 
00964                 // only use zero if it eliminates overloading or has patches
00965                 int useZero = 0;
00966                 int npens = xBlocks*yBlocks;
00967                 if ( npens % ncpus == 0 ) useZero = 1;
00968                 if ( npens == nopatches.size() + 1 ) useZero = 1;
00969                 npens += xBlocks*zBlocks;
00970                 if ( npens % ncpus == 0 ) useZero = 1;
00971                 if ( npens == nopatches.size() + 1 ) useZero = 1;
00972                 npens += yBlocks*zBlocks;
00973                 if ( npens % ncpus == 0 ) useZero = 1;
00974                 if ( npens == nopatches.size() + 1 ) useZero = 1;
00975 
00976                 // add nopatches then patches in reversed order
00977                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
00978                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00979                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
00980                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00981   
00982                 int npes = pmeprocs.size();
00983                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
00984                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
00985 #if !USE_RANDOM_TOPO
00986                 zprocs.sort();
00987 #endif
00988                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
00989                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
00990 #if !USE_RANDOM_TOPO
00991                 yprocs.sort();
00992 #endif
00993       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
00994       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
00995 #if !USE_RANDOM_TOPO
00996       xprocs.sort();
00997 #endif
00998 
00999 #if USE_TOPO_SFC
01000   CmiLock(tmgr_lock);
01001   //{
01002   TopoManager tmgr;
01003   int xdim = tmgr.getDimNX();
01004   int ydim = tmgr.getDimNY();
01005   int zdim = tmgr.getDimNZ();
01006   int xdim1 = find_level_grid(xdim);
01007   int ydim1 = find_level_grid(ydim);
01008   int zdim1 = find_level_grid(zdim);
01009   if(CkMyPe() == 0)
01010       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01011 
01012   vector<Coord> result;
01013   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01014   sort_sfc(xprocs, tmgr, result);
01015   sort_sfc(yprocs, tmgr, result);
01016   sort_sfc(zprocs, tmgr, result);
01017   //}
01018   CmiUnlock(tmgr_lock);
01019 #endif
01020 
01021       pencilPMEProcessors = new char [CkNumPes()];
01022       memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
01023 
01024 
01025                 if(CkMyPe() == 0){  
01026               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01027           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01028 #if USE_TOPO_SFC
01029               int x,y,z,t;
01030               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01031               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01032 #else
01033               iout << " " << zprocs[i];
01034 #endif
01035           }
01036           if ( i < zprocs.size() ) iout << " ...";
01037               iout << "\n" << endi;
01038                 }
01039 
01040       for (pe=0, x = 0; x < xBlocks; ++x)
01041         for (y = 0; y < yBlocks; ++y, ++pe ) {
01042           pencilPMEProcessors[zprocs[pe]] = 1;
01043         }
01044      
01045                 if(CkMyPe() == 0){  
01046               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01047           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01048 #if USE_TOPO_SFC
01049               int x,y,z,t;
01050               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01051               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01052 #else
01053               iout << " " << yprocs[i];
01054 #endif
01055           }
01056           if ( i < yprocs.size() ) iout << " ...";
01057               iout << "\n" << endi;
01058                 }
01059 
01060       for (pe=0, z = 0; z < zBlocks; ++z )
01061         for (x = 0; x < xBlocks; ++x, ++pe ) {
01062           pencilPMEProcessors[yprocs[pe]] = 1;
01063         }
01064     
01065                 if(CkMyPe() == 0){  
01066                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01067                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01068 #if USE_TOPO_SFC
01069                 int x,y,z,t;
01070                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01071                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01072 #else
01073                 iout << " " << xprocs[i];
01074 #endif
01075             }
01076                 if ( i < xprocs.size() ) iout << " ...";
01077                 iout << "\n" << endi;
01078                 }
01079 
01080       for (pe=0, y = 0; y < yBlocks; ++y )      
01081         for (z = 0; z < zBlocks; ++z, ++pe ) {
01082           pencilPMEProcessors[xprocs[pe]] = 1;
01083         }
01084         
01085     }
01086 
01087 
01088         // creating the pencil arrays
01089         if ( CkMyPe() == 0 ){
01090 #if !USE_RANDOM_TOPO
01091         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01092         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01093         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01094         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01095 #endif
01096 #if 1
01097         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01098         CProxy_PmePencilMap ym;
01099         if ( simParams->PMEPencilsYLayout )
01100           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01101         else
01102           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01103         CProxy_PmePencilMap xm;
01104         if ( simParams->PMEPencilsXLayout )
01105           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01106         else
01107           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01108         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01109         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01110         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01111         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01112 #if CHARM_VERSION > 60301
01113         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01114         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01115         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01116 #endif
01117         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01118         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01119         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01120 #else
01121         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01122         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01123         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01124 
01125                 for (pe=0, x = 0; x < xBlocks; ++x)
01126                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01127                                 zPencil(x,y,0).insert(zprocs[pe]);
01128                         }
01129         zPencil.doneInserting();
01130 
01131                 for (pe=0, x = 0; x < xBlocks; ++x)
01132                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01133                                 yPencil(x,0,z).insert(yprocs[pe]);
01134                         }
01135         yPencil.doneInserting();
01136 
01137 
01138                 for (pe=0, y = 0; y < yBlocks; ++y )    
01139                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01140                                 xPencil(0,y,z).insert(xprocs[pe]);
01141                         }
01142                 xPencil.doneInserting();     
01143 #endif
01144 
01145                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01146                 PmePencilInitMsgData msgdata;
01147                 msgdata.grid = myGrid;
01148                 msgdata.xBlocks = xBlocks;
01149                 msgdata.yBlocks = yBlocks;
01150                 msgdata.zBlocks = zBlocks;
01151                 msgdata.xPencil = xPencil;
01152                 msgdata.yPencil = yPencil;
01153                 msgdata.zPencil = zPencil;
01154                 msgdata.pmeProxy = pmeProxyDir;
01155         msgdata.pmeNodeProxy = pmeNodeProxy;
01156         msgdata.xm = xm;
01157         msgdata.ym = ym;
01158         msgdata.zm = zm;
01159                 xPencil.init(new PmePencilInitMsg(msgdata));
01160                 yPencil.init(new PmePencilInitMsg(msgdata));
01161                 zPencil.init(new PmePencilInitMsg(msgdata));
01162         }
01163 
01164     return;  // continue in initialize_pencils() at next startup stage
01165   }
01166 
01167 
01168   int pe;
01169   int nx = 0;
01170   for ( pe = 0; pe < numGridPes; ++pe ) {
01171     localInfo[pe].x_start = nx;
01172     nx += myGrid.block1;
01173     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01174     localInfo[pe].nx = nx - localInfo[pe].x_start;
01175   }
01176   int ny = 0;
01177   for ( pe = 0; pe < numTransPes; ++pe ) {
01178     localInfo[pe].y_start_after_transpose = ny;
01179     ny += myGrid.block2;
01180     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01181     localInfo[pe].ny_after_transpose =
01182                         ny - localInfo[pe].y_start_after_transpose;
01183   }
01184 
01185   {  // decide how many pes this node exchanges charges with
01186 
01187   PatchMap *patchMap = PatchMap::Object();
01188   Lattice lattice = simParams->lattice;
01189   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01190   BigReal cutoff = simParams->cutoff;
01191   BigReal patchdim = simParams->patchDimension;
01192   int numPatches = patchMap->numPatches();
01193   int numNodes = CkNumPes();
01194   int *source_flags = new int[numNodes];
01195   int node;
01196   for ( node=0; node<numNodes; ++node ) {
01197     source_flags[node] = 0;
01198     recipPeDest[node] = 0;
01199   }
01200 
01201   // // make sure that we don't get ahead of ourselves on this node
01202   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01203   //   source_flags[CkMyPe()] = 1;
01204   //   recipPeDest[myRecipPe] = 1;
01205   // }
01206 
01207   for ( int pid=0; pid < numPatches; ++pid ) {
01208     int pnode = patchMap->node(pid);
01209     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01210     BigReal minx = patchMap->min_a(pid);
01211     BigReal maxx = patchMap->max_a(pid);
01212     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01213     // min1 (max1) is smallest (largest) grid line for this patch
01214     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01215     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01216     for ( int i=min1; i<=max1; ++i ) {
01217       int ix = i;
01218       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01219       while ( ix < 0 ) ix += myGrid.K1;
01220       // set source_flags[pnode] if this patch sends to our node
01221       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01222            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01223         source_flags[pnode] = 1;
01224       }
01225       // set dest_flags[] for node that our patch sends to
01226       if ( pnode == CkMyPe() ) {
01227         recipPeDest[ix / myGrid.block1] = 1;
01228       }
01229     }
01230   }
01231 
01232   int numSourcesSamePhysicalNode = 0;
01233   numSources = 0;
01234   numDestRecipPes = 0;
01235   for ( node=0; node<numNodes; ++node ) {
01236     if ( source_flags[node] ) ++numSources;
01237     if ( recipPeDest[node] ) ++numDestRecipPes;
01238     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01239   }
01240 
01241 #if 0
01242   if ( numSources ) {
01243     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01244             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01245     iout << iINFO << "PME " << CkMyPe() << " sources:";
01246     for ( node=0; node<numNodes; ++node ) {
01247       if ( source_flags[node] ) iout << " " << node;
01248     }
01249     iout << "\n" << endi;
01250   }
01251 #endif
01252 
01253   delete [] source_flags;
01254 
01255   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01256   //           CkMyPe(), numSources, numDestRecipPes);
01257 
01258   }  // decide how many pes this node exchanges charges with (end)
01259 
01260   ungrid_count = numDestRecipPes;
01261 
01262   sendTransBarrier_received = 0;
01263 
01264   if ( myGridPe < 0 && myTransPe < 0 ) return;
01265   // the following only for nodes doing reciprocal sum
01266 
01267   if ( myTransPe >= 0 ) {
01268       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01269       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01270       #ifdef OPENATOM_VERSION
01271       if ( simParams->openatomOn ) { 
01272         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01273         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01274       } else {
01275         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01276       }
01277       #else  // OPENATOM_VERSION
01278       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01279       #endif // OPENATOM_VERSION
01280   }
01281 
01282   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01283   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01284   if ( local_size < local_size_2 ) local_size = local_size_2;
01285   qgrid = new float[local_size*numGrids];
01286   if ( numGridPes > 1 || numTransPes > 1 ) {
01287     kgrid = new float[local_size*numGrids];
01288   } else {
01289     kgrid = qgrid;
01290   }
01291   qgrid_size = local_size;
01292 
01293   if ( myGridPe >= 0 ) {
01294   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01295   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01296   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01297   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01298   }
01299 
01300   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01301 #ifdef NAMD_FFTW
01302   CmiLock(fftw_plan_lock);
01303 #ifdef NAMD_FFTW_3
01304   work = new fftwf_complex[n[0]];
01305   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01306   if ( myGridPe >= 0 ) {
01307     forward_plan_yz=new fftwf_plan[numGrids];
01308     backward_plan_yz=new fftwf_plan[numGrids];
01309   }
01310   if ( myTransPe >= 0 ) {
01311     forward_plan_x=new fftwf_plan[numGrids];
01312     backward_plan_x=new fftwf_plan[numGrids];
01313   }
01314   /* need one plan per grid */
01315   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01316   if ( myGridPe >= 0 ) {
01317     for( int g=0; g<numGrids; g++)
01318       {
01319         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01320                                                      localInfo[myGridPe].nx,
01321                                                      qgrid + qgrid_size * g,
01322                                                      NULL,
01323                                                      1,
01324                                                      myGrid.dim2 * myGrid.dim3,
01325                                                      (fftwf_complex *) 
01326                                                      qgrid + qgrid_size * g,
01327                                                      NULL,
01328                                                      1,
01329                                                      myGrid.dim2 * (myGrid.dim3/2),
01330                                                      fftwFlags);
01331       }
01332   }
01333   int zdim = myGrid.dim3;
01334   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01335   if ( ! CkMyPe() ) iout << " 2..." << endi;
01336   if ( myTransPe >= 0 ) {
01337     for( int g=0; g<numGrids; g++)
01338       {
01339 
01340         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01341                                                 (fftwf_complex *)
01342                                                 (kgrid+qgrid_size*g),
01343                                                 NULL,
01344                                                 xStride,
01345                                                 1,
01346                                                 (fftwf_complex *)
01347                                                 (kgrid+qgrid_size*g),
01348                                                 NULL,
01349                                                 xStride,
01350                                                 1,
01351                                                 FFTW_FORWARD,fftwFlags);
01352         
01353       }
01354   }
01355   if ( ! CkMyPe() ) iout << " 3..." << endi;
01356   if ( myTransPe >= 0 ) {
01357     for( int g=0; g<numGrids; g++)
01358       {
01359         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01360                                                  (fftwf_complex *)
01361                                                  (kgrid+qgrid_size*g),
01362                                                  NULL,
01363                                                  xStride,
01364                                                  1,
01365                                                  (fftwf_complex *)
01366                                                  (kgrid+qgrid_size*g),
01367                                                  NULL,
01368                                                  xStride,
01369                                                  1,
01370                                                  FFTW_BACKWARD, fftwFlags);
01371 
01372       }
01373   }
01374   if ( ! CkMyPe() ) iout << " 4..." << endi;
01375   if ( myGridPe >= 0 ) {
01376     for( int g=0; g<numGrids; g++)
01377       {
01378         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01379                                                       localInfo[myGridPe].nx,
01380                                                       (fftwf_complex *)
01381                                                       qgrid + qgrid_size * g,
01382                                                       NULL,
01383                                                       1,
01384                                                       myGrid.dim2*(myGrid.dim3/2),
01385                                                       qgrid + qgrid_size * g,
01386                                                       NULL,
01387                                                       1,
01388                                                       myGrid.dim2 * myGrid.dim3,
01389                                                       fftwFlags);
01390       }
01391   }
01392   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01393 
01394 #else
01395   work = new fftw_complex[n[0]];
01396 
01397   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01398   if ( myGridPe >= 0 ) {
01399   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01400         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01401         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01402   }
01403   if ( ! CkMyPe() ) iout << " 2..." << endi;
01404   if ( myTransPe >= 0 ) {
01405       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01406         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01407         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01408         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01409   }
01410   if ( ! CkMyPe() ) iout << " 3..." << endi;
01411   if ( myTransPe >= 0 ) {
01412   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01413         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01414         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01415         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01416   }
01417   if ( ! CkMyPe() ) iout << " 4..." << endi;
01418   if ( myGridPe >= 0 ) {
01419   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01420         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01421         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01422   }
01423   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01424 #endif
01425   CmiUnlock(fftw_plan_lock);
01426 #else
01427   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01428 #endif
01429 
01430   if ( myGridPe >= 0 && numSources == 0 )
01431                 NAMD_bug("PME grid elements exist without sources.");
01432   grid_count = numSources;
01433   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01434   trans_count = numGridPes;
01435 }
01436 
01437 
01438 
01439 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
01440   delete msg;
01441   if ( ! usePencils ) return;
01442 
01443   SimParameters *simParams = Node::Object()->simParameters;
01444 
01445   PatchMap *patchMap = PatchMap::Object();
01446   Lattice lattice = simParams->lattice;
01447   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01448   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01449   BigReal cutoff = simParams->cutoff;
01450   BigReal patchdim = simParams->patchDimension;
01451   int numPatches = patchMap->numPatches();
01452 
01453   pencilActive = new char[xBlocks*yBlocks];
01454   for ( int i=0; i<xBlocks; ++i ) {
01455     for ( int j=0; j<yBlocks; ++j ) {
01456       pencilActive[i*yBlocks+j] = 0;
01457     }
01458   }
01459 
01460   for ( int pid=0; pid < numPatches; ++pid ) {
01461     int pnode = patchMap->node(pid);
01462     if ( pnode != CkMyPe() ) continue;
01463 
01464     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01465     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01466 
01467     BigReal minx = patchMap->min_a(pid);
01468     BigReal maxx = patchMap->max_a(pid);
01469     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01470     // min1 (max1) is smallest (largest) grid line for this patch
01471     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01472     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01473 
01474     BigReal miny = patchMap->min_b(pid);
01475     BigReal maxy = patchMap->max_b(pid);
01476     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01477     // min2 (max2) is smallest (largest) grid line for this patch
01478     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01479     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01480 
01481     for ( int i=min1; i<=max1; ++i ) {
01482       int ix = i;
01483       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01484       while ( ix < 0 ) ix += myGrid.K1;
01485       for ( int j=min2; j<=max2; ++j ) {
01486         int jy = j;
01487         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01488         while ( jy < 0 ) jy += myGrid.K2;
01489         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01490       }
01491     }
01492   }
01493 
01494   numPencilsActive = 0;
01495   for ( int i=0; i<xBlocks; ++i ) {
01496     for ( int j=0; j<yBlocks; ++j ) {
01497       if ( pencilActive[i*yBlocks+j] ) {
01498         ++numPencilsActive;
01499         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01500       }
01501     }
01502   }
01503   activePencils = new ijpair[numPencilsActive];
01504   numPencilsActive = 0;
01505   for ( int i=0; i<xBlocks; ++i ) {
01506     for ( int j=0; j<yBlocks; ++j ) {
01507       if ( pencilActive[i*yBlocks+j] ) {
01508         activePencils[numPencilsActive++] = ijpair(i,j);
01509       }
01510     }
01511   }
01512   if ( simParams->PMESendOrder ) {
01513     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01514   } else {
01515     Random rand(CkMyPe());
01516     rand.reorder(activePencils,numPencilsActive);
01517   }
01518   //if ( numPencilsActive ) {
01519   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01520   //}
01521 
01522   ungrid_count = numPencilsActive;
01523 }
01524 
01525 
01526 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01527   if ( ! usePencils ) return;
01528   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01529 }
01530 
01531 
01532 ComputePmeMgr::~ComputePmeMgr() {
01533 
01534   if ( CmiMyRank() == 0 ) {
01535     CmiDestroyLock(fftw_plan_lock);
01536   }
01537 
01538   delete myKSpace;
01539   delete [] localInfo;
01540   delete [] gridNodeInfo;
01541   delete [] transNodeInfo;
01542   delete [] gridPeMap;
01543   delete [] transPeMap;
01544   delete [] recipPeDest;
01545   delete [] gridPeOrder;
01546   delete [] gridNodeOrder;
01547   delete [] transNodeOrder;
01548   delete [] isPmeFlag;
01549   delete [] qgrid;
01550   if ( kgrid != qgrid ) delete [] kgrid;
01551   delete [] work;
01552   delete [] gridmsg_reuse;
01553 
01554   for (int i=0; i<fsize*numGrids; ++i) {
01555     if ( q_arr[i] ) {
01556       delete [] q_arr[i];
01557     }
01558   }
01559   delete [] q_arr;
01560   delete [] q_list;
01561   delete [] f_arr;
01562   delete [] fz_arr;
01563 }
01564 
01565 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01566   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01567   if ( grid_count == 0 ) {
01568     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01569   }
01570   if ( grid_count == numSources ) {
01571     lattice = msg->lattice;
01572     grid_sequence = msg->sequence;
01573   }
01574 
01575   int zdim = myGrid.dim3;
01576   int zlistlen = msg->zlistlen;
01577   int *zlist = msg->zlist;
01578   float *qmsg = msg->qgrid;
01579   for ( int g=0; g<numGrids; ++g ) {
01580     char *f = msg->fgrid + fgrid_len * g;
01581     float *q = qgrid + qgrid_size * g;
01582     for ( int i=0; i<fgrid_len; ++i ) {
01583       if ( f[i] ) {
01584         for ( int k=0; k<zlistlen; ++k ) {
01585           q[zlist[k]] += *(qmsg++);
01586         }
01587       }
01588       q += zdim;
01589     }
01590   }
01591 
01592   gridmsg_reuse[numSources-grid_count] = msg;
01593   --grid_count;
01594 
01595   if ( grid_count == 0 ) {
01596     pmeProxyDir[CkMyPe()].gridCalc1();
01597     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01598   }
01599 }
01600 #ifdef MANUAL_DEBUG_FFTW3
01601 
01602 /* utility functions for manual debugging */
01603 void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
01604 {
01605 
01606   char fmt[1000];
01607   char filename[1000];
01608   strncpy(fmt,infilename,999);
01609   strncat(fmt,"_%d.out",999);
01610   sprintf(filename,fmt, pe);
01611   FILE *loutfile = fopen(filename, "w");
01612 #ifdef PAIRCALC_TEST_DUMP
01613   fprintf(loutfile,"%d\n",ydim);
01614 #endif
01615   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01616   for(int i=0;i<xdim;i++)
01617     for(int j=0;j<ydim;j++)
01618       for(int k=0;k<zdim;k++)
01619         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01620   fclose(loutfile);
01621 
01622 }
01623 
01624 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01625 {
01626   char fmt[1000];
01627   char filename[1000];
01628   strncpy(fmt,infilename,999);
01629   strncat(fmt,"_%d_%d_%d.out",999);
01630   sprintf(filename,fmt, x,y,z);
01631   FILE *loutfile = fopen(filename, "w");
01632   CkAssert(loutfile!=NULL);
01633   CkPrintf("opened %s for dump\n",filename);
01634   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01635   for(int i=0;i<xdim;i++)
01636     for(int j=0;j<ydim;j++)
01637       for(int k=0;k<zdim;k++)
01638         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01639   fclose(loutfile);
01640 }
01641 
01642 #endif
01643 
01644 void ComputePmeMgr::gridCalc1(void) {
01645   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01646 
01647 #ifdef NAMD_FFTW
01648   for ( int g=0; g<numGrids; ++g ) {
01649 #ifdef NAMD_FFTW_3
01650     fftwf_execute(forward_plan_yz[g]);
01651 #else
01652     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01653         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01654 #endif
01655 
01656   }
01657 #endif
01658 
01659   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01660 }
01661 
01662 void ComputePmeMgr::sendTransBarrier(void) {
01663   sendTransBarrier_received += 1;
01664   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01665   if ( sendTransBarrier_received < numGridPes ) return;
01666   sendTransBarrier_received = 0;
01667   for ( int i=0; i<numGridPes; ++i ) {
01668     pmeProxyDir[gridPeMap[i]].sendTrans();
01669   }
01670 }
01671 
01672 void ComputePmeMgr::sendTrans(void) {
01673   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01674 
01675   // send data for transpose
01676   int zdim = myGrid.dim3;
01677   int nx = localInfo[myGridPe].nx;
01678   int x_start = localInfo[myGridPe].x_start;
01679   int slicelen = myGrid.K2 * zdim;
01680 
01681   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01682 
01683 #if CMK_BLUEGENEL
01684   CmiNetworkProgressAfter (0);
01685 #endif
01686 
01687   for (int j=0; j<numTransNodes; j++) {
01688     int node = transNodeOrder[j];  // different order on each node
01689     int pe = transNodeInfo[node].pe_start;
01690     int npe = transNodeInfo[node].npe;
01691     int totlen = 0;
01692     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01693       LocalPmeInfo &li = localInfo[pe];
01694       int cpylen = li.ny_after_transpose * zdim;
01695       totlen += cpylen;
01696     }
01697     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01698                                 PRIORITY_SIZE) PmeTransMsg;
01699     newmsg->sourceNode = myGridPe;
01700     newmsg->lattice = lattice;
01701     newmsg->x_start = x_start;
01702     newmsg->nx = nx;
01703     for ( int g=0; g<numGrids; ++g ) {
01704       float *qmsg = newmsg->qgrid + nx * totlen * g;
01705       pe = transNodeInfo[node].pe_start;
01706       for (int i=0; i<npe; ++i, ++pe) {
01707         LocalPmeInfo &li = localInfo[pe];
01708         int cpylen = li.ny_after_transpose * zdim;
01709         if ( node == myTransNode ) {
01710           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01711           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01712         }
01713         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01714         for ( int x = 0; x < nx; ++x ) {
01715           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01716           q += slicelen;
01717           qmsg += cpylen;
01718         }
01719       }
01720     }
01721     newmsg->sequence = grid_sequence;
01722     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
01723     if ( node == myTransNode ) newmsg->nx = 0;
01724     if ( npe > 1 ) {
01725       if ( node == myTransNode ) fwdSharedTrans(newmsg);
01726       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
01727     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
01728   }
01729  
01730   untrans_count = numTransPes;
01731 
01732 }
01733 
01734 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
01735   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
01736   int pe = transNodeInfo[myTransNode].pe_start;
01737   int npe = transNodeInfo[myTransNode].npe;
01738   CmiNodeLock lock = CmiCreateLock();
01739   int *count = new int; *count = npe;
01740   for (int i=0; i<npe; ++i, ++pe) {
01741     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
01742     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
01743     shmsg->msg = msg;
01744     shmsg->count = count;
01745     shmsg->lock = lock;
01746     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
01747   }
01748 }
01749 
01750 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
01751   procTrans(msg->msg);
01752   CmiLock(msg->lock);
01753   int count = --(*msg->count);
01754   CmiUnlock(msg->lock);
01755   if ( count == 0 ) {
01756     CmiDestroyLock(msg->lock);
01757     delete msg->count;
01758     delete msg->msg;
01759   }
01760   delete msg;
01761 }
01762 
01763 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
01764   procTrans(msg);
01765   delete msg;
01766 }
01767 
01768 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
01769   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
01770   if ( trans_count == numGridPes ) {
01771     lattice = msg->lattice;
01772     grid_sequence = msg->sequence;
01773   }
01774 
01775  if ( msg->nx ) {
01776   int zdim = myGrid.dim3;
01777   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
01778   int first_pe = nodeInfo.pe_start;
01779   int last_pe = first_pe+nodeInfo.npe-1;
01780   int y_skip = localInfo[myTransPe].y_start_after_transpose
01781              - localInfo[first_pe].y_start_after_transpose;
01782   int ny_msg = localInfo[last_pe].y_start_after_transpose
01783              + localInfo[last_pe].ny_after_transpose
01784              - localInfo[first_pe].y_start_after_transpose;
01785   int ny = localInfo[myTransPe].ny_after_transpose;
01786   int x_start = msg->x_start;
01787   int nx = msg->nx;
01788   for ( int g=0; g<numGrids; ++g ) {
01789     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
01790         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
01791         nx*ny*zdim*sizeof(float));
01792   }
01793  }
01794 
01795   --trans_count;
01796 
01797   if ( trans_count == 0 ) {
01798     pmeProxyDir[CkMyPe()].gridCalc2();
01799   }
01800 }
01801 
01802 void ComputePmeMgr::gridCalc2(void) {
01803   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
01804 
01805 #if CMK_BLUEGENEL
01806   CmiNetworkProgressAfter (0);
01807 #endif
01808 
01809   int zdim = myGrid.dim3;
01810   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01811   int ny = localInfo[myTransPe].ny_after_transpose;
01812 
01813   for ( int g=0; g<numGrids; ++g ) {
01814     // finish forward FFT (x dimension)
01815 #ifdef NAMD_FFTW
01816 #ifdef NAMD_FFTW_3
01817     fftwf_execute(forward_plan_x[g]);
01818 #else
01819     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01820         ny * zdim / 2, 1, work, 1, 0);
01821 #endif
01822 #endif
01823   }
01824 
01825 #ifdef OPENATOM_VERSION
01826     if ( ! simParams -> openatomOn ) { 
01827 #endif // OPENATOM_VERSION
01828       gridCalc2R();
01829 #ifdef OPENATOM_VERSION
01830     } else {
01831       gridCalc2Moa();
01832     }
01833 #endif // OPENATOM_VERSION
01834 }
01835 
01836 #ifdef OPENATOM_VERSION
01837 void ComputePmeMgr::gridCalc2Moa(void) {
01838 
01839   int zdim = myGrid.dim3;
01840   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01841   int ny = localInfo[myTransPe].ny_after_transpose;
01842 
01843   SimParameters *simParams = Node::Object()->simParameters;
01844 
01845   CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01846 
01847   for ( int g=0; g<numGrids; ++g ) {
01848     #ifdef OPENATOM_VERSION_DEBUG 
01849     CkPrintf("Sending recQ on processor %d \n", CkMyPe());
01850     for ( int i=0; i<=(ny * zdim / 2); ++i) 
01851     {
01852       CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
01853     }
01854     #endif // OPENATOM_VERSION_DEBUG
01855 //     mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
01856     CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
01857     moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
01858   }
01859 }
01860 #endif // OPENATOM_VERSION
01861 
01862 void ComputePmeMgr::gridCalc2R(void) {
01863 
01864   int zdim = myGrid.dim3;
01865   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01866   int ny = localInfo[myTransPe].ny_after_transpose;
01867 
01868   for ( int g=0; g<numGrids; ++g ) {
01869     // reciprocal space portion of PME
01870     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01871     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
01872                         lattice, ewaldcof, &(recip_evir2[g][1]));
01873     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
01874 
01875     // start backward FFT (x dimension)
01876 
01877 #ifdef NAMD_FFTW
01878 #ifdef NAMD_FFTW_3
01879     fftwf_execute(backward_plan_x[g]);
01880 #else
01881     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01882         ny * zdim / 2, 1, work, 1, 0);
01883 #endif
01884 #endif
01885   }
01886   
01887   pmeProxyDir[CkMyPe()].sendUntrans();
01888 }
01889 
01890 void ComputePmeMgr::sendUntrans(void) {
01891 
01892   int zdim = myGrid.dim3;
01893   int y_start = localInfo[myTransPe].y_start_after_transpose;
01894   int ny = localInfo[myTransPe].ny_after_transpose;
01895   int slicelen = myGrid.K2 * zdim;
01896 
01897   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01898 
01899 #if CMK_BLUEGENEL
01900   CmiNetworkProgressAfter (0);
01901 #endif
01902 
01903   // send data for reverse transpose
01904   for (int j=0; j<numGridNodes; j++) {
01905     int node = gridNodeOrder[j];  // different order on each node
01906     int pe = gridNodeInfo[node].pe_start;
01907     int npe = gridNodeInfo[node].npe;
01908     int totlen = 0;
01909     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
01910       LocalPmeInfo &li = localInfo[pe];
01911       int cpylen = li.nx * zdim;
01912       totlen += cpylen;
01913     }
01914     PmeUntransMsg *newmsg = new (numGrids, ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
01915     newmsg->sourceNode = myTransPe;
01916     newmsg->y_start = y_start;
01917     newmsg->ny = ny;
01918     for ( int g=0; g<numGrids; ++g ) {
01919       if ( j == 0 ) {  // only need these once
01920         newmsg->evir[g] = recip_evir2[g];
01921       } else {
01922         newmsg->evir[g] = 0.;
01923       }
01924       float *qmsg = newmsg->qgrid + ny * totlen * g;
01925       pe = gridNodeInfo[node].pe_start;
01926       for (int i=0; i<npe; ++i, ++pe) {
01927         LocalPmeInfo &li = localInfo[pe];
01928         if ( node == myGridNode ) {
01929           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
01930           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
01931           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
01932           int cpylen = ny * zdim;
01933           for ( int x = 0; x < li.nx; ++x ) {
01934             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01935             q += cpylen;
01936             qmsg += slicelen;
01937           }
01938         } else {
01939           CmiMemcpy((void*)qmsg,
01940                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
01941                 li.nx*ny*zdim*sizeof(float));
01942           qmsg += li.nx*ny*zdim;
01943         }
01944       }
01945     }
01946     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
01947     if ( node == myGridNode ) newmsg->ny = 0;
01948     if ( npe > 1 ) {
01949       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
01950       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
01951     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
01952   }
01953 
01954   trans_count = numGridPes;
01955 }
01956 
01957 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
01958   int pe = gridNodeInfo[myGridNode].pe_start;
01959   int npe = gridNodeInfo[myGridNode].npe;
01960   CmiNodeLock lock = CmiCreateLock();
01961   int *count = new int; *count = npe;
01962   for (int i=0; i<npe; ++i, ++pe) {
01963     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
01964     shmsg->msg = msg;
01965     shmsg->count = count;
01966     shmsg->lock = lock;
01967     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
01968   }
01969 }
01970 
01971 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
01972   procUntrans(msg->msg);
01973   CmiLock(msg->lock);
01974   int count = --(*msg->count);
01975   CmiUnlock(msg->lock);
01976   if ( count == 0 ) {
01977     CmiDestroyLock(msg->lock);
01978     delete msg->count;
01979     delete msg->msg;
01980   }
01981   delete msg;
01982 }
01983 
01984 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
01985   procUntrans(msg);
01986   delete msg;
01987 }
01988 
01989 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
01990   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
01991   if ( untrans_count == numTransPes ) {
01992     for ( int g=0; g<numGrids; ++g ) {
01993       recip_evir[g] = 0.;
01994     }
01995   }
01996 
01997 #if CMK_BLUEGENEL
01998   CmiNetworkProgressAfter (0);
01999 #endif
02000 
02001   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02002   int first_pe = nodeInfo.pe_start;
02003   int g;
02004   if ( myGridPe == first_pe ) for ( g=0; g<numGrids; ++g ) {
02005     recip_evir[g] += msg->evir[g];
02006   }
02007 
02008  if ( msg->ny ) {
02009   int zdim = myGrid.dim3;
02010   int last_pe = first_pe+nodeInfo.npe-1;
02011   int x_skip = localInfo[myGridPe].x_start
02012              - localInfo[first_pe].x_start;
02013   int nx_msg = localInfo[last_pe].x_start
02014              + localInfo[last_pe].nx
02015              - localInfo[first_pe].x_start;
02016   int nx = localInfo[myGridPe].nx;
02017   int y_start = msg->y_start;
02018   int ny = msg->ny;
02019   int slicelen = myGrid.K2 * zdim;
02020   int cpylen = ny * zdim;
02021   for ( g=0; g<numGrids; ++g ) {
02022     float *q = qgrid + qgrid_size * g + y_start * zdim;
02023     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02024     for ( int x = 0; x < nx; ++x ) {
02025       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02026       q += slicelen;
02027       qmsg += cpylen;
02028     }
02029   }
02030  }
02031 
02032   --untrans_count;
02033 
02034   if ( untrans_count == 0 ) {
02035     pmeProxyDir[CkMyPe()].gridCalc3();
02036   }
02037 }
02038 
02039 void ComputePmeMgr::gridCalc3(void) {
02040   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02041 
02042   // finish backward FFT
02043 #ifdef NAMD_FFTW
02044 
02045   for ( int g=0; g<numGrids; ++g ) {
02046 #ifdef NAMD_FFTW_3
02047     fftwf_execute(backward_plan_yz[g]);
02048 #else
02049     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02050         (fftw_complex *) (qgrid + qgrid_size * g),
02051         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02052 #endif
02053   }
02054 
02055 #endif
02056 
02057   pmeProxyDir[CkMyPe()].sendUngrid();
02058 }
02059 
02060 void ComputePmeMgr::sendUngrid(void) {
02061 
02062   for ( int j=0; j<numSources; ++j ) {
02063     // int msglen = qgrid_len;
02064     PmeGridMsg *newmsg = gridmsg_reuse[j];
02065     int pe = newmsg->sourceNode;
02066     if ( j == 0 ) {  // only need these once
02067       for ( int g=0; g<numGrids; ++g ) {
02068         newmsg->evir[g] = recip_evir[g];
02069       }
02070     } else {
02071       for ( int g=0; g<numGrids; ++g ) {
02072         newmsg->evir[g] = 0.;
02073       }
02074     }
02075     int zdim = myGrid.dim3;
02076     int flen = newmsg->len;
02077     int fstart = newmsg->start;
02078     int zlistlen = newmsg->zlistlen;
02079     int *zlist = newmsg->zlist;
02080     float *qmsg = newmsg->qgrid;
02081     for ( int g=0; g<numGrids; ++g ) {
02082       char *f = newmsg->fgrid + fgrid_len * g;
02083       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02084       for ( int i=0; i<flen; ++i ) {
02085         if ( f[i] ) {
02086           for ( int k=0; k<zlistlen; ++k ) {
02087             *(qmsg++) = q[zlist[k]];
02088           }
02089         }
02090         q += zdim;
02091       }
02092     }
02093     newmsg->sourceNode = myGridPe;
02094 
02095     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02096     CmiEnableUrgentSend(1);
02097     pmeProxyDir[pe].recvUngrid(newmsg);
02098     CmiEnableUrgentSend(0);
02099   }
02100   grid_count = numSources;
02101   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02102 }
02103 
02104 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
02105   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02106   if ( ungrid_count == 0 ) {
02107     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02108   }
02109 
02110   if ( usePencils ) copyPencils(msg);
02111   else copyResults(msg);
02112   delete msg;
02113   recvAck(0);
02114 }
02115 
02116 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
02117   if ( msg ) delete msg;
02118   --ungrid_count;
02119 
02120   if ( ungrid_count == 0 ) {
02121     pmeProxyDir[CkMyPe()].ungridCalc();
02122   }
02123 }
02124 
02125 void ComputePmeMgr::ungridCalc(void) {
02126   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02127 
02128   // for (int j=0; j<myGrid.order-1; ++j) {
02129   //   fz_arr[myGrid.K3+j] = fz_arr[j];
02130   // }
02131   for (int i=0; i<q_count; ++i) {
02132     for (int j=0; j<myGrid.order-1; ++j) {
02133       q_list[i][myGrid.K3+j] = q_list[i][j];
02134     }
02135   }
02136 
02137   ungridForcesCount = pmeComputes.size();
02138 
02139   for ( int i=0; i<pmeComputes.size(); ++i ) {
02140     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02141     // pmeComputes[i]->ungridForces();
02142   }
02143   // submitReductions();  // must follow all ungridForces()
02144 
02145   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02146 }
02147 
02148 
02149 ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
02150 {
02151   DebugM(4,"ComputePme created.\n");
02152   basePriority = PME_PRIORITY;
02153   setNumPatches(1);
02154 
02155   CProxy_ComputePmeMgr::ckLocalBranch(
02156         CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
02157 
02158   SimParameters *simParams = Node::Object()->simParameters;
02159 
02160   alchFepOn = simParams->alchFepOn;
02161   alchThermIntOn = simParams->alchThermIntOn;
02162   alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
02163   alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? 
02164     simParams->alchElecLambdaStart : 0;
02165             
02166   if (alchFepOn || alchThermIntOn) {
02167     numGrids = 2;
02168     if (alchDecouple) numGrids += 2;
02169     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
02170   }
02171   else numGrids = 1;
02172   lesOn = simParams->lesOn;
02173   if ( lesOn ) {
02174     lesFactor = simParams->lesFactor;
02175     numGrids = lesFactor;
02176   }
02177   selfOn = 0;
02178   pairOn = simParams->pairInteractionOn;
02179   if ( pairOn ) {
02180     selfOn = simParams->pairInteractionSelf;
02181     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
02182     numGrids = selfOn ? 1 : 3;
02183   }
02184 
02185   myGrid.K1 = simParams->PMEGridSizeX;
02186   myGrid.K2 = simParams->PMEGridSizeY;
02187   myGrid.K3 = simParams->PMEGridSizeZ;
02188   myGrid.order = simParams->PMEInterpOrder;
02189   myGrid.dim2 = myGrid.K2;
02190   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
02191 
02192   for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
02193 }
02194 
02195 void ComputePme::initialize() {
02196   if (!(patch = PatchMap::Object()->patch(patchID))) {
02197     NAMD_bug("ComputePme used with unknown patch.");
02198   }
02199   positionBox = patch->registerPositionPickup(this);
02200   avgPositionBox = patch->registerAvgPositionPickup(this);
02201   forceBox = patch->registerForceDeposit(this);
02202 }
02203 
02204 void ComputePmeMgr::initialize_computes() {
02205 
02206   noWorkCount = 0;
02207   doWorkCount = 0;
02208   ungridForcesCount = 0;
02209 
02210   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02211 
02212   SimParameters *simParams = Node::Object()->simParameters;
02213 
02214   strayChargeErrors = 0;
02215 
02216   if (simParams->accelMDOn) {
02217      amd_reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_AMD);
02218   } else {
02219      amd_reduction = NULL;
02220   }
02221 
02222   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02223   fsize = myGrid.K1 * myGrid.dim2;
02224   q_arr = new float*[fsize*numGrids];
02225   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02226   q_list = new float*[fsize*numGrids];
02227   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02228   q_count = 0;
02229   f_arr = new char[fsize*numGrids];
02230   // memset to non-zero value has race condition on BlueGene/Q
02231   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02232   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02233   fz_arr = new char[myGrid.K3+myGrid.order-1];
02234 
02235   for ( int g=0; g<numGrids; ++g ) {
02236     char *f = f_arr + g*fsize;
02237     if ( usePencils ) {
02238       int K1 = myGrid.K1;
02239       int K2 = myGrid.K2;
02240       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02241       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02242       int dim2 = myGrid.dim2;
02243       for (int ap=0; ap<numPencilsActive; ++ap) {
02244         int ib = activePencils[ap].i;
02245         int jb = activePencils[ap].j;
02246         int ibegin = ib*block1;
02247         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02248         int jbegin = jb*block2;
02249         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02250         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02251         for ( int i=ibegin; i<iend; ++i ) {
02252           for ( int j=jbegin; j<jend; ++j ) {
02253             f[i*dim2+j] = 0;
02254           }
02255         }
02256       }
02257     } else {
02258       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02259       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02260       for (int pe=0; pe<numGridPes; pe++) {
02261         if ( ! recipPeDest[pe] ) continue;
02262         int start = pe * bsize;
02263         int len = bsize;
02264         if ( start >= qsize ) { start = 0; len = 0; }
02265         if ( start + len > qsize ) { len = qsize - start; }
02266         int zdim = myGrid.dim3;
02267         int fstart = start / zdim;
02268         int flen = len / zdim;
02269         memset(f + fstart, 0, flen*sizeof(char));
02270       }
02271     }
02272   }
02273 
02274 #if USE_PERSISTENT
02275   recvGrid_handle = NULL;
02276 #endif
02277 }
02278 
02279 ComputePme::~ComputePme()
02280 {
02281     for ( int g=0; g<numGrids; ++g ) delete myRealSpace[g];
02282 }
02283 
02284 #if USE_PERSISTENT 
02285 void ComputePmeMgr::setup_recvgrid_persistent() 
02286 {
02287     int K1 = myGrid.K1;
02288     int K2 = myGrid.K2;
02289     int dim2 = myGrid.dim2;
02290     int dim3 = myGrid.dim3;
02291     int block1 = myGrid.block1;
02292     int block2 = myGrid.block2;
02293 
02294     CkArray *zPencil_local = zPencil.ckLocalBranch();
02295     recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
02296     for (int ap=0; ap<numPencilsActive; ++ap) {
02297         int ib = activePencils[ap].i;
02298         int jb = activePencils[ap].j;
02299         int ibegin = ib*block1;
02300         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02301         int jbegin = jb*block2;
02302         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02303         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02304         // f is changing
02305         int fcount = 0;
02306         for ( int g=0; g<numGrids; ++g ) {
02307             char *f = f_arr + g*fsize;
02308             for ( int i=ibegin; i<iend; ++i ) {
02309                 for ( int j=jbegin; j<jend; ++j ) {
02310                     fcount += f[i*dim2+j];
02311                 }
02312             }
02313         }
02314         int zlistlen = 0;
02315         for ( int i=0; i<myGrid.K3; ++i ) {
02316             if ( fz_arr[i] ) ++zlistlen;
02317         }
02318         int hd = ( fcount? 1 : 0 );  // has data?
02319         int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
02320         int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
02321         int compress_size = sizeof(float)*hd*fcount*zlistlen;
02322         int size = compress_start +  compress_size  + PRIORITY_SIZE/8+6;
02323         recvGrid_handle[ap] =  CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
02324     }
02325 }
02326 #endif
02327 
02328 int ComputePme::noWork() {
02329 
02330   if ( patch->flags.doFullElectrostatics ) {
02331     if ( ! myMgr->ungridForcesCount ) return 0;  // work to do, enqueue as usual
02332     myMgr->heldComputes.add(this);
02333     return 1;  // don't enqueue yet
02334   }
02335 
02336   positionBox->skip();
02337   forceBox->skip();
02338 
02339   if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
02340     myMgr->noWorkCount = 0;
02341     myMgr->reduction->submit();
02342     if (myMgr->amd_reduction) myMgr->amd_reduction->submit();
02343   }
02344 
02345   return 1;  // no work for this step
02346 }
02347 
02348 void ComputePme::doWork()
02349 {
02350   DebugM(4,"Entering ComputePme::doWork().\n");
02351 
02352   if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
02353     basePriority = PME_PRIORITY;
02354     ungridForces();
02355     if ( ! --(myMgr->ungridForcesCount) ) myMgr->submitReductions();
02356     return;
02357   }
02358   basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
02359 
02360 #ifdef TRACE_COMPUTE_OBJECTS
02361     double traceObjStartTime = CmiWallTimer();
02362 #endif
02363 
02364   // allocate storage
02365   numLocalAtoms = patch->getNumAtoms();
02366 
02367   Lattice &lattice = patch->flags.lattice;
02368 
02369   localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
02370   localData = localData_alloc.begin();
02371   localPartition_alloc.resize(numLocalAtoms);
02372   localPartition = localPartition_alloc.begin();
02373 
02374   int g;
02375   for ( g=0; g<numGrids; ++g ) {
02376     localGridData[g] = localData + numLocalAtoms*(g+1);
02377   }
02378 
02379   // get positions and charges
02380   PmeParticle * data_ptr = localData;
02381   unsigned char * part_ptr = localPartition;
02382   const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
02383                                 * ComputeNonbondedUtil::dielectric_1 );
02384 
02385   {
02386     CompAtom *x = positionBox->open();
02387     // CompAtomExt *xExt = patch->getCompAtomExtInfo();
02388     if ( patch->flags.doMolly ) {
02389       positionBox->close(&x);
02390       x = avgPositionBox->open();
02391     }
02392     int numAtoms = patch->getNumAtoms();
02393 
02394     for(int i=0; i<numAtoms; ++i)
02395     {
02396       data_ptr->x = x[i].position.x;
02397       data_ptr->y = x[i].position.y;
02398       data_ptr->z = x[i].position.z;
02399       data_ptr->cg = coulomb_sqrt * x[i].charge;
02400       ++data_ptr;
02401       *part_ptr = x[i].partition;
02402       ++part_ptr;
02403     }
02404 
02405     if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
02406     else { positionBox->close(&x); }
02407   }
02408 
02409   // copy to other grids if needed
02410   if ( ((alchFepOn || alchThermIntOn) && (!alchDecouple)) || lesOn ) {
02411     for ( g=0; g<numGrids; ++g ) {
02412       PmeParticle *lgd = localGridData[g];
02413       int nga = 0;
02414       for(int i=0; i<numLocalAtoms; ++i) {
02415         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02416           // for FEP/TI: grid 0 gets non-alch + partition 1;
02417           // grid 1 gets non-alch + partition 2;
02418           // grid 2 (only if called for with numGrids=3) gets only non-alch
02419           lgd[nga++] = localData[i];
02420         }
02421       }
02422       numGridAtoms[g] = nga;
02423     }
02424   } else if ( (alchFepOn || alchThermIntOn) && alchDecouple) {
02425     // alchemical decoupling: four grids
02426     // g=0: partition 0 and partition 1
02427     // g=1: partition 0 and partition 2
02428     // g=2: only partition 1 atoms
02429     // g=3: only partition 2 atoms
02430     // plus one grid g=4, only partition 0, if numGrids=5
02431     for ( g=0; g<2; ++g ) {  // same as before for first 2
02432       PmeParticle *lgd = localGridData[g];
02433       int nga = 0;
02434       for(int i=0; i<numLocalAtoms; ++i) {
02435         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02436           lgd[nga++] = localData[i];
02437         }
02438       }
02439       numGridAtoms[g] = nga;
02440     }
02441     for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
02442       PmeParticle *lgd = localGridData[g];
02443       int nga = 0;
02444       for(int i=0; i<numLocalAtoms; ++i) {
02445         if ( localPartition[i] == (g-1) ) {
02446           lgd[nga++] = localData[i];
02447         }
02448       }
02449       numGridAtoms[g] = nga;
02450     }
02451     for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
02452       // numGrids=5 only if alchElecLambdaStart > 0
02453       PmeParticle *lgd = localGridData[g];
02454       int nga = 0;
02455       for(int i=0; i<numLocalAtoms; ++i) {
02456         if ( localPartition[i] == 0 ) {
02457           lgd[nga++] = localData[i];
02458         }
02459       }
02460       numGridAtoms[g] = nga;
02461     }
02462   } else if ( selfOn ) {
02463     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
02464     g = 0;
02465     PmeParticle *lgd = localGridData[g];
02466     int nga = 0;
02467     for(int i=0; i<numLocalAtoms; ++i) {
02468       if ( localPartition[i] == 1 ) {
02469         lgd[nga++] = localData[i];
02470       }
02471     }
02472     numGridAtoms[g] = nga;
02473   } else if ( pairOn ) {
02474     if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
02475     g = 0;
02476     PmeParticle *lgd = localGridData[g];
02477     int nga = 0;
02478     for(int i=0; i<numLocalAtoms; ++i) {
02479       if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02480         lgd[nga++] = localData[i];
02481       }
02482     }
02483     numGridAtoms[g] = nga;
02484     for ( g=1; g<3; ++g ) {
02485       PmeParticle *lgd = localGridData[g];
02486       int nga = 0;
02487       for(int i=0; i<numLocalAtoms; ++i) {
02488         if ( localPartition[i] == g ) {
02489           lgd[nga++] = localData[i];
02490         }
02491       }
02492       numGridAtoms[g] = nga;
02493     }
02494   } else {
02495     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
02496     localGridData[0] = localData;
02497     numGridAtoms[0] = numLocalAtoms;
02498   }
02499 
02500  if ( ! myMgr->doWorkCount ) {
02501   myMgr->doWorkCount = myMgr->pmeComputes.size();
02502 
02503   memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
02504 
02505   for (int i=0; i<myMgr->q_count; ++i) {
02506     memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
02507   }
02508 
02509   for ( g=0; g<numGrids; ++g ) {
02510     myMgr->evir[g] = 0;
02511   }
02512 
02513   myMgr->strayChargeErrors = 0;
02514 
02515   myMgr->compute_sequence = sequence();
02516  }
02517 
02518   if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
02519 
02520   int strayChargeErrors = 0;
02521 
02522   // calculate self energy
02523   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02524   for ( g=0; g<numGrids; ++g ) {
02525     BigReal selfEnergy = 0;
02526     data_ptr = localGridData[g];
02527     int i;
02528     for(i=0; i<numGridAtoms[g]; ++i)
02529     {
02530       selfEnergy += data_ptr->cg * data_ptr->cg;
02531       ++data_ptr;
02532     }
02533     selfEnergy *= -1. * ewaldcof / SQRT_PI;
02534     myMgr->evir[g][0] += selfEnergy;
02535 
02536     float **q = myMgr->q_arr + g*myMgr->fsize;
02537     char *f = myMgr->f_arr + g*myMgr->fsize;
02538 
02539     myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
02540     scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
02541     myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
02542   }
02543   myMgr->strayChargeErrors += strayChargeErrors;
02544 
02545 #ifdef TRACE_COMPUTE_OBJECTS
02546     traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
02547 #endif
02548 
02549  if ( --(myMgr->doWorkCount) == 0 ) {
02550   for (int j=0; j<myGrid.order-1; ++j) {
02551     myMgr->fz_arr[j] |= myMgr->fz_arr[myGrid.K3+j];
02552   }
02553   for (int i=0; i<myMgr->q_count; ++i) {
02554     for (int j=0; j<myGrid.order-1; ++j) {
02555       myMgr->q_list[i][j] += myMgr->q_list[i][myGrid.K3+j];
02556     }
02557   }
02558 
02559   if ( myMgr->usePencils ) {
02560     myMgr->sendPencils(lattice,sequence());
02561   } else {
02562     myMgr->sendData(lattice,sequence());
02563   }
02564  }
02565 
02566 }
02567 
02568 
02569 void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {
02570 
02571   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
02572 
02573 #if  USE_PERSISTENT
02574     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
02575 #endif
02576   int K1 = myGrid.K1;
02577   int K2 = myGrid.K2;
02578   int dim2 = myGrid.dim2;
02579   int dim3 = myGrid.dim3;
02580   int block1 = myGrid.block1;
02581   int block2 = myGrid.block2;
02582 
02583   // int savedMessages = 0;
02584   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
02585 
02586   for (int ap=0; ap<numPencilsActive; ++ap) {
02587     int ib = activePencils[ap].i;
02588     int jb = activePencils[ap].j;
02589     int ibegin = ib*block1;
02590     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02591     int jbegin = jb*block2;
02592     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02593     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02594 
02595     int fcount = 0;
02596     for ( int g=0; g<numGrids; ++g ) {
02597       char *f = f_arr + g*fsize;
02598       for ( int i=ibegin; i<iend; ++i ) {
02599        for ( int j=jbegin; j<jend; ++j ) {
02600         fcount += f[i*dim2+j];
02601        }
02602       }
02603     }
02604 
02605 #ifdef NETWORK_PROGRESS
02606     CmiNetworkProgress();
02607 #endif
02608 
02609     if ( ! pencilActive[ib*yBlocks+jb] )
02610       NAMD_bug("PME activePencils list inconsistent");
02611 
02612     int zlistlen = 0;
02613     for ( int i=0; i<myGrid.K3; ++i ) {
02614       if ( fz_arr[i] ) ++zlistlen;
02615     }
02616 
02617     int hd = ( fcount? 1 : 0 );  // has data?
02618     // if ( ! hd ) ++savedMessages;
02619 
02620     
02621     PmeGridMsg *msg = new ( hd*numGrids, hd*zlistlen, hd*flen,
02622         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
02623     msg->sourceNode = CkMyPe();
02624     msg->hasData = hd;
02625     msg->lattice = lattice;
02626    if ( hd ) {
02627 #if 0
02628     msg->start = fstart;
02629     msg->len = flen;
02630 #else
02631     msg->start = -1;   // obsolete?
02632     msg->len = -1;   // obsolete?
02633 #endif
02634     msg->zlistlen = zlistlen;
02635     int *zlist = msg->zlist;
02636     zlistlen = 0;
02637     for ( int i=0; i<myGrid.K3; ++i ) {
02638       if ( fz_arr[i] ) zlist[zlistlen++] = i;
02639     }
02640     char *fmsg = msg->fgrid;
02641     float *qmsg = msg->qgrid;
02642     for ( int g=0; g<numGrids; ++g ) {
02643       char *f = f_arr + g*fsize;
02644       float **q = q_arr + g*fsize;
02645       for ( int i=ibegin; i<iend; ++i ) {
02646        for ( int j=jbegin; j<jend; ++j ) {
02647         *(fmsg++) = f[i*dim2+j];
02648         if( f[i*dim2+j] ) {
02649           for ( int k=0; k<zlistlen; ++k ) {
02650             *(qmsg++) = q[i*dim2+j][zlist[k]];
02651           }
02652         }
02653        }
02654       }
02655     }
02656    }
02657 
02658     msg->sequence = compute_sequence;
02659     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
02660     CmiEnableUrgentSend(1);
02661 #if USE_NODE_PAR_RECEIVE
02662     msg->destElem=CkArrayIndex3D(ib,jb,0);
02663     CProxy_PmePencilMap lzm = npMgr->zm;
02664     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
02665     int destnode = CmiNodeOf(destproc);
02666     
02667 #if  0 
02668     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
02669 #endif
02670     pmeNodeProxy[destnode].recvZGrid(msg);
02671 #if 0 
02672     CmiUsePersistentHandle(NULL, 0);
02673 #endif
02674 #else
02675 #if 0 
02676     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
02677 #endif
02678     zPencil(ib,jb,0).recvGrid(msg);
02679 #if 0 
02680     CmiUsePersistentHandle(NULL, 0);
02681 #endif
02682 #endif
02683     CmiEnableUrgentSend(0);
02684   }
02685 
02686 
02687   if ( strayChargeErrors ) {
02688    strayChargeErrors = 0;
02689    iout << iERROR << "Stray PME grid charges detected: "
02690         << CkMyPe() << " sending to (x,y)";
02691    for (int ib=0; ib<xBlocks; ++ib) {
02692     for (int jb=0; jb<yBlocks; ++jb) {
02693      int ibegin = ib*block1;
02694      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02695      int jbegin = jb*block2;
02696      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02697      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02698 
02699      for ( int g=0; g<numGrids; ++g ) {
02700        char *f = f_arr + g*fsize;
02701        if ( ! pencilActive[ib*yBlocks+jb] ) {
02702            for ( int i=ibegin; i<iend; ++i ) {
02703             for ( int j=jbegin; j<jend; ++j ) {
02704              if ( f[i*dim2+j] == 3 ) {
02705                f[i*dim2+j] = 2;
02706                iout << " (" << i << "," << j << ")";
02707              }
02708             }
02709            }
02710        }
02711      }
02712     }
02713    }
02714    iout << "\n" << endi;
02715   }
02716 
02717   // if ( savedMessages ) {
02718   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
02719   // }
02720 
02721 }
02722 
02723 
02724 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
02725 
02726   int K1 = myGrid.K1;
02727   int K2 = myGrid.K2;
02728   int dim2 = myGrid.dim2;
02729   int dim3 = myGrid.dim3;
02730   int block1 = myGrid.block1;
02731   int block2 = myGrid.block2;
02732 
02733   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
02734   int ib = msg->sourceNode / yBlocks;
02735   int jb = msg->sourceNode % yBlocks;
02736 
02737   int ibegin = ib*block1;
02738   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02739   int jbegin = jb*block2;
02740   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02741 
02742   int zlistlen = msg->zlistlen;
02743   int *zlist = msg->zlist;
02744   float *qmsg = msg->qgrid;
02745   int g;
02746   for ( g=0; g<numGrids; ++g ) {
02747     evir[g] += msg->evir[g];
02748     char *f = f_arr + g*fsize;
02749     float **q = q_arr + g*fsize;
02750     for ( int i=ibegin; i<iend; ++i ) {
02751      for ( int j=jbegin; j<jend; ++j ) {
02752       if( f[i*dim2+j] ) {
02753         f[i*dim2+j] = 0;
02754         for ( int k=0; k<zlistlen; ++k ) {
02755           q[i*dim2+j][zlist[k]] = *(qmsg++);
02756         }
02757       }
02758      }
02759     }
02760   }
02761 }
02762 
02763 
02764 void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {
02765 
02766   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
02767 
02768   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
02769 
02770   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
02771   for (int j=0; j<numGridPes; j++) {
02772     int pe = gridPeOrder[j];  // different order
02773     if ( ! recipPeDest[pe] && ! strayChargeErrors ) continue;
02774     int start = pe * bsize;
02775     int len = bsize;
02776     if ( start >= qsize ) { start = 0; len = 0; }
02777     if ( start + len > qsize ) { len = qsize - start; }
02778     int zdim = myGrid.dim3;
02779     int fstart = start / zdim;
02780     int flen = len / zdim;
02781     int fcount = 0;
02782     int i;
02783 
02784     int g;
02785     for ( g=0; g<numGrids; ++g ) {
02786       char *f = f_arr + fstart + g*fsize;
02787       for ( i=0; i<flen; ++i ) {
02788         fcount += f[i];
02789       }
02790       if ( ! recipPeDest[pe] ) {
02791         int errfound = 0;
02792         for ( i=0; i<flen; ++i ) {
02793           if ( f[i] == 3 ) {
02794             errfound = 1;
02795             break;
02796           }
02797         }
02798         if ( errfound ) {
02799           iout << iERROR << "Stray PME grid charges detected: "
02800                 << CkMyPe() << " sending to " << gridPeMap[pe] << " for planes";
02801           int iz = -1;
02802           for ( i=0; i<flen; ++i ) {
02803             if ( f[i] == 3 ) {
02804               f[i] = 2;
02805               int jz = (i+fstart)/myGrid.K2;
02806               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
02807             }
02808           }
02809           iout << "\n" << endi;
02810         }
02811       }
02812     }
02813 
02814 #ifdef NETWORK_PROGRESS
02815     CmiNetworkProgress();
02816 #endif
02817 
02818     if ( ! recipPeDest[pe] ) continue;
02819 
02820     int zlistlen = 0;
02821     for ( i=0; i<myGrid.K3; ++i ) {
02822       if ( fz_arr[i] ) ++zlistlen;
02823     }
02824 
02825     PmeGridMsg *msg = new (numGrids, zlistlen, flen*numGrids,
02826                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
02827 
02828     msg->sourceNode = CkMyPe();
02829     msg->lattice = lattice;
02830     msg->start = fstart;
02831     msg->len = flen;
02832     msg->zlistlen = zlistlen;
02833     int *zlist = msg->zlist;
02834     zlistlen = 0;
02835     for ( i=0; i<myGrid.K3; ++i ) {
02836       if ( fz_arr[i] ) zlist[zlistlen++] = i;
02837     }
02838     float *qmsg = msg->qgrid;
02839     for ( g=0; g<numGrids; ++g ) {
02840       char *f = f_arr + fstart + g*fsize;
02841       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
02842       float **q = q_arr + fstart + g*fsize;
02843       for ( i=0; i<flen; ++i ) {
02844         if ( f[i] ) {
02845           for ( int k=0; k<zlistlen; ++k ) {
02846             *(qmsg++) = q[i][zlist[k]];
02847           }
02848         }
02849       }
02850     }
02851 
02852     msg->sequence = compute_sequence;
02853     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
02854     pmeProxy[gridPeMap[pe]].recvGrid(msg);
02855   }
02856  
02857   strayChargeErrors = 0;
02858 
02859 }
02860 
02861 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
02862 
02863   int zdim = myGrid.dim3;
02864   int flen = msg->len;
02865   int fstart = msg->start;
02866   int zlistlen = msg->zlistlen;
02867   int *zlist = msg->zlist;
02868   float *qmsg = msg->qgrid;
02869   int g;
02870   for ( g=0; g<numGrids; ++g ) {
02871     evir[g] += msg->evir[g];
02872     char *f = msg->fgrid + g*flen;
02873     float **q = q_arr + fstart + g*fsize;
02874     for ( int i=0; i<flen; ++i ) {
02875       if ( f[i] ) {
02876         f[i] = 0;
02877         for ( int k=0; k<zlistlen; ++k ) {
02878           q[i][zlist[k]] = *(qmsg++);
02879         }
02880       }
02881     }
02882   }
02883 }
02884 
02885 void ComputePme::ungridForces() {
02886 
02887     if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
02888  
02889     SimParameters *simParams = Node::Object()->simParameters;
02890 
02891     localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
02892     Vector *localResults = localResults_alloc.begin();
02893     Vector *gridResults;
02894     if ( alchFepOn || alchThermIntOn || lesOn || selfOn || pairOn ) {
02895       for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
02896       gridResults = localResults + numLocalAtoms;
02897     } else {
02898       gridResults = localResults;
02899     }
02900 
02901     Vector pairForce = 0.;
02902     Lattice &lattice = patch->flags.lattice;
02903     int g = 0;
02904     if(!simParams->commOnly) {
02905     for ( g=0; g<numGrids; ++g ) {
02906 #ifdef NETWORK_PROGRESS
02907       CmiNetworkProgress();
02908 #endif
02909 
02910       myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
02911       scale_forces(gridResults, numGridAtoms[g], lattice);
02912 
02913       if ( alchFepOn || alchThermIntOn ) {
02914         float scale = 1.;
02915         if(simParams->alchFepWhamOn)    {
02916                 if(simParams->alchFepElecOn)    {
02917             elecLambdaUp = simParams->alchElecLambda;
02918             elecLambdaDown = 1.0 - simParams->alchElecLambda;
02919                 }
02920                 else {
02921             elecLambdaUp = 0.0;
02922             elecLambdaDown = 1.0;
02923                 }
02924         }
02925         else            {
02926                 elecLambdaUp =  (simParams->alchLambda <= alchElecLambdaStart)? 0. : \
02927             (simParams->alchLambda - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02928                 elecLambdaDown =  ((1-simParams->alchLambda) <= alchElecLambdaStart)? 0. : ((1-simParams->alchLambda) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02929         }
02930 
02931         if ( g == 0 ) scale = elecLambdaUp;
02932         else if ( g == 1 ) scale = elecLambdaDown;
02933         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02934         if (alchDecouple) {
02935           if ( g == 2 ) scale = 1 - elecLambdaUp;
02936           else if ( g == 3 ) scale = 1 - elecLambdaDown;
02937           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02938         }
02939         int nga = 0;
02940         if (!alchDecouple) {
02941           for(int i=0; i<numLocalAtoms; ++i) {
02942             if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02943               // (g=2: only partition 0)
02944               localResults[i] += gridResults[nga++] * scale;
02945             }
02946           }
02947         }
02948         else {  // alchDecouple
02949           if ( g < 2 ) {
02950             for(int i=0; i<numLocalAtoms; ++i) {
02951               if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02952                 // g = 0: partition 0 or partition 1
02953                 // g = 1: partition 0 or partition 2
02954                 localResults[i] += gridResults[nga++] * scale;
02955               }
02956             }
02957           }
02958           else {
02959             for(int i=0; i<numLocalAtoms; ++i) {
02960               if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
02961                 // g = 2: partition 1 only
02962                 // g = 3: partition 2 only
02963                 // g = 4: partition 0 only
02964                 localResults[i] += gridResults[nga++] * scale;
02965               }
02966             }
02967           }
02968         }
02969       } else if ( lesOn ) {
02970         float scale = 1.;
02971         if ( alchFepOn ) {
02972                 if(simParams->alchFepWhamOn)    {
02973                         if(simParams->alchFepElecOn) {
02974                   if ( g == 0 ) scale = simParams->alchElecLambda;
02975                     else if ( g == 1 ) scale = 1. - simParams->alchElecLambda;
02976                         }
02977                         else {
02978                   if ( g == 0 ) scale = 0.0;
02979                     else if ( g == 1 ) scale = 1.0;
02980                         }
02981                 }
02982                 else    {
02983                   if ( g == 0 ) scale = simParams->alchLambda;
02984                   else if ( g == 1 ) scale = 1. - simParams->alchLambda;
02985                 }
02986         } else if ( lesOn ) {
02987           scale = 1.0 / (float)lesFactor;
02988         }
02989         int nga = 0;
02990         for(int i=0; i<numLocalAtoms; ++i) {
02991           if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02992             localResults[i] += gridResults[nga++] * scale;
02993           }
02994         }
02995       } else if ( selfOn ) {
02996         PmeParticle *lgd = localGridData[g];
02997         int nga = 0;
02998         for(int i=0; i<numLocalAtoms; ++i) {
02999           if ( localPartition[i] == 1 ) {
03000             pairForce += gridResults[nga];  // should add up to almost zero
03001             localResults[i] += gridResults[nga++];
03002           }
03003         }
03004       } else if ( pairOn ) {
03005         if ( g == 0 ) {
03006           int nga = 0;
03007           for(int i=0; i<numLocalAtoms; ++i) {
03008             if ( localPartition[i] == 1 ) {
03009               pairForce += gridResults[nga];
03010             }
03011             if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
03012               localResults[i] += gridResults[nga++];
03013             }
03014           }
03015         } else if ( g == 1 ) {
03016           int nga = 0;
03017           for(int i=0; i<numLocalAtoms; ++i) {
03018             if ( localPartition[i] == g ) {
03019               pairForce -= gridResults[nga];  // should add up to almost zero
03020               localResults[i] -= gridResults[nga++];
03021             }
03022           }
03023         } else {
03024           int nga = 0;
03025           for(int i=0; i<numLocalAtoms; ++i) {
03026             if ( localPartition[i] == g ) {
03027               localResults[i] -= gridResults[nga++];
03028             }
03029          }
03030         }
03031       }
03032     }
03033     }
03034 
03035     Vector *results_ptr = localResults;
03036     
03037     // add in forces
03038     {
03039       Results *r = forceBox->open();
03040       Force *f = r->f[Results::slow];
03041       int numAtoms = patch->getNumAtoms();
03042 
03043       if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
03044         for(int i=0; i<numAtoms; ++i) {
03045           f[i].x += results_ptr->x;
03046           f[i].y += results_ptr->y;
03047           f[i].z += results_ptr->z;
03048           ++results_ptr;
03049         }
03050       }
03051       forceBox->close(&r);
03052     }
03053 
03054     if ( pairOn || selfOn ) {
03055         ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
03056     }
03057 
03058 }
03059 
03060 void ComputePmeMgr::submitReductions() {
03061 
03062     for ( int g=0; g<numGrids; ++g ) {
03063       float scale = 1.;
03064       if ( alchFepOn || alchThermIntOn ) {
03065         if ( g == 0 ) scale = elecLambdaUp;
03066         else if ( g == 1 ) scale = elecLambdaDown;
03067         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03068         if (alchDecouple) {
03069           if ( g == 2 ) scale = 1-elecLambdaUp;
03070           else if ( g == 3 ) scale = 1-elecLambdaDown;
03071           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03072         }
03073       } else if ( lesOn ) {
03074         scale = 1.0 / lesFactor;
03075       } else if ( pairOn ) {
03076         scale = ( g == 0 ? 1. : -1. );
03077       }
03078       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03079       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03080       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03081       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03082       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03083       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03084       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03085       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03086       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03087       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03088 
03089       if (amd_reduction) {
03090       amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03091       amd_reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03092       amd_reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03093       amd_reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03094       amd_reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03095       amd_reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03096       amd_reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03097       amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03098       amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03099       amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03100       }
03101 
03102       float scale2 = 0.;
03103 
03104       //alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? simParams->alchElecLambdaStart : 0;
03105       SimParameters *simParams = Node::Object()->simParameters;
03106 
03107       if (alchFepOn) {
03108         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
03109         if(simParams->alchFepWhamOn)    {
03110                 if(simParams->alchFepElecOn) {
03111                 elecLambda2Up = simParams->alchElecLambda;
03112                   elecLambda2Down =  1.0 - simParams->alchElecLambda;
03113                 }
03114                 else {
03115                 elecLambda2Up = 0.0;
03116                   elecLambda2Down =  1.0;
03117                 }
03118         }
03119         else    {
03120                 elecLambda2Up =  (simParams->alchLambda2 <= alchElecLambdaStart)? 0. : \
03121                       (simParams->alchLambda2 - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03122                 elecLambda2Down =  ((1-simParams->alchLambda2) <= alchElecLambdaStart)? 0. : \
03123                       ((1-simParams->alchLambda2) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03124         }
03125         
03126         if ( g == 0 ) scale2 = elecLambda2Up;
03127         else if ( g == 1 ) scale2 = elecLambda2Down;
03128         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03129         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
03130         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
03131         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03132       }
03133       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
03134         if( g==0 )      scale2 = scale + 1.0;
03135         else if( g==1 ) scale2 = scale - 1.0;
03136         else if( g==2 ) scale2 = scale - 1.0;
03137         else if( g==3 ) scale2 = scale + 1.0;
03138       }
03139       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03140       if (amd_reduction) amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03141       
03142       if (alchThermIntOn) {
03143         
03144         // no decoupling:
03145         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
03146         // (interactions between all atoms [partition 0 OR partition 1], 
03147         // minus all [within partition 0])
03148         // U = elecLambdaUp * (U[0] - U[2])
03149         // dU/dl = U[0] - U[2];
03150         
03151         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
03152         // (interactions between all atoms [partition 0 OR partition 2], 
03153         // minus all [within partition 0])
03154         // U = elecLambdaDown * (U[1] - U[2])
03155         // dU/dl = U[1] - U[2];
03156 
03157         // alchDecouple:
03158         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
03159         // (interactions between all atoms [partition 0 OR partition 1]
03160         // minus all [within partition 1] minus all [within partition 0]
03161         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
03162         // dU/dl = U[0] - U[2] - U[4];
03163 
03164         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
03165         // (interactions between all atoms [partition 0 OR partition 2]
03166         // minus all [within partition 2] minus all [within partition 0]
03167         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
03168         // dU/dl = U[1] - U[3] - U[4];
03169         
03170         
03171         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
03172         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
03173         if (!alchDecouple) {
03174           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03175           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03176         }
03177         else {  // alchDecouple
03178           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03179           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03180           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03181           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03182         }
03183       }
03184     }
03185     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
03186     reduction->submit();
03187     if (amd_reduction) amd_reduction->submit();
03188 
03189 
03190   for ( int i=0; i<heldComputes.size(); ++i ) {
03191     WorkDistrib::messageEnqueueWork(heldComputes[i]);
03192   }
03193   heldComputes.resize(0);
03194 }
03195 
03196 #if USE_TOPOMAP 
03197 
03198 #define NPRIMES 8
03199 const static unsigned int NAMDPrimes[] = {
03200   3,
03201   5,
03202   7,
03203   11,
03204   13,
03205   17,
03206   19,
03207   23,  
03208   29,
03209   31,
03210   37,
03211   59,
03212   73,
03213   93,
03214   113,
03215   157,
03216   307,
03217   617,
03218   1217                  //This should b enough for 64K nodes of BGL. 
03219 };
03220 
03221 #include "RecBisection.h"
03222 
03223 /***-----------------------------------------------------**********
03224     The Orthogonal Recursive Bisection strategy, which allocates PME
03225     objects close to the patches they communicate, and at the same
03226     time spreads them around the grid 
03227 ****----------------------------------------------------------****/
03228 
03229 bool generateBGLORBPmePeList(int *pemap, int numPes, 
03230                              int *block_pes, int nbpes) {
03231 
03232   PatchMap *pmap = PatchMap::Object();
03233   int *pmemap = new int [CkNumPes()];
03234 
03235   if (pemap == NULL)
03236     return false;
03237 
03238   TopoManager tmgr;
03239 
03240   memset(pmemap, 0, sizeof(int) * CkNumPes());
03241 
03242   for(int count = 0; count < CkNumPes(); count++) {
03243     if(count < nbpes)
03244       pmemap[block_pes[count]] = 1;
03245     
03246     if(pmap->numPatchesOnNode(count)) {
03247       pmemap[count] = 1;
03248       
03249       //Assumes an XYZT mapping !!
03250       if(tmgr.hasMultipleProcsPerNode()) {
03251         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
03252       }
03253     }
03254   }
03255 
03256   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
03257     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
03258     return false;
03259 
03260   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03261   Node *node = nd.ckLocalBranch();
03262   SimParameters *simParams = node->simParameters;
03263 
03264   //first split PME processors into patch groups
03265 
03266   int xsize = 0, ysize = 0, zsize = 0;
03267 
03268   xsize = tmgr.getDimNX();
03269   ysize = tmgr.getDimNY();
03270   zsize = tmgr.getDimNZ();
03271   
03272   int nx = xsize, ny = ysize, nz = zsize;
03273   DimensionMap dm;
03274   
03275   dm.x = 0;
03276   dm.y = 1;
03277   dm.z = 2;
03278   
03279   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
03280 
03281   //group size processors have to be allocated to each YZ plane
03282   int group_size = numPes/nx;
03283   if(numPes % nx)
03284     group_size ++;
03285 
03286   int my_prime = NAMDPrimes[0];
03287   int density = (ny * nz)/group_size + 1;
03288   int count = 0;
03289   
03290   //Choose a suitable prime Number
03291   for(count = 0; count < NPRIMES; count ++) {
03292     //Find a prime just greater than the density
03293     if(density < NAMDPrimes[count]) {
03294       my_prime = NAMDPrimes[count];
03295       break;
03296     }      
03297   }
03298   
03299   if(count == NPRIMES)
03300     my_prime = NAMDPrimes[NPRIMES-1];
03301 
03302   //int gcount = numPes/2;
03303   int gcount = 0;
03304   int npme_pes = 0;
03305   
03306   int coord[3];
03307 
03308   for(int x = 0; x < nx; x++) {
03309     coord[0] = (x + nx/2)%nx;
03310     
03311     for(count=0; count < group_size && npme_pes < numPes; count++) {
03312       int dest = (count + 1) * my_prime;      
03313       dest = dest % (ny * nz);      
03314       
03315       coord[2] = dest / ny;
03316       coord[1] = dest - coord[2] * ny;
03317       
03318       //Locate where in the actual grid the processor is
03319       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
03320         coord[dm.z] * xsize* ysize;
03321       
03322       if(pmemap[destPe] == 0) {
03323         pemap[gcount++] = destPe;
03324         pmemap[destPe] = 1;
03325         
03326         if(tmgr.hasMultipleProcsPerNode())
03327           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
03328 
03329         npme_pes ++;
03330       }
03331       else {
03332         for(int pos = 1; pos < ny * nz; pos++) {
03333           
03334           coord[2] += pos / ny;
03335           coord[1] += pos % ny;
03336           
03337           coord[2] = coord[2] % nz;
03338           coord[1] = coord[1] % ny;       
03339           
03340           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
03341             coord[dm.z] * xsize * ysize;
03342           
03343           if(pmemap[newdest] == 0) {
03344             pemap[gcount++] = newdest;
03345             pmemap[newdest] = 1;
03346             
03347             if(tmgr.hasMultipleProcsPerNode())
03348               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
03349             
03350             npme_pes ++;
03351             break;
03352           }
03353         }
03354       }      
03355     }   
03356     
03357     if(gcount == numPes)
03358       gcount = 0;    
03359     
03360     if(npme_pes >= numPes)
03361       break;
03362   }
03363   
03364   delete [] pmemap;
03365   
03366   if(npme_pes != numPes)
03367     //NAMD_bug("ORB PME allocator failed\n");
03368     return false;
03369 
03370   return true;
03371 }
03372 
03373 #endif
03374 
03375 template <class T> class PmePencil : public T {
03376 public:
03377   PmePencil() {
03378     data = 0;
03379     work = 0;
03380     send_order = 0;
03381     needs_reply = 0;
03382 #if USE_PERSISTENT
03383     trans_handle = untrans_handle = ungrid_handle = NULL;
03384 #endif
03385   }
03386   ~PmePencil() {
03387 #ifdef NAMD_FFTW
03388     fftwf_free(data);
03389 #endif
03390     delete [] work;
03391     delete [] send_order;
03392     delete [] needs_reply;
03393   }
03394   void base_init(PmePencilInitMsg *msg) {
03395     imsg=0;
03396     imsgb=0;
03397     hasData=0;
03398     initdata = msg->data;
03399   }
03400   void order_init(int nBlocks) {
03401     send_order = new int[nBlocks];
03402     for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
03403     if ( Node::Object()->simParameters->PMESendOrder ) {
03404       std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
03405     } else {
03406       Random rand(CkMyPe());
03407       rand.reorder(send_order,nBlocks);
03408     }
03409     needs_reply = new int[nBlocks];
03410   }
03411   PmePencilInitMsgData initdata;
03412   Lattice lattice;
03413   PmeReduction evir;
03414   int sequence;  // used for priorities
03415   int imsg;  // used in sdag code
03416   int imsgb;  // Node par uses distinct counter for back path
03417   int hasData;  // used in message elimination
03418   float *data;
03419   float *work;
03420   int *send_order;
03421   int *needs_reply;
03422 #if USE_PERSISTENT
03423   PersistentHandle *trans_handle;
03424   PersistentHandle *untrans_handle;
03425   PersistentHandle *ungrid_handle;
03426 #endif
03427 };
03428 
03429 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
03430 public:
03431     PmeZPencil_SDAG_CODE
03432     PmeZPencil() { __sdag_init(); setMigratable(false); }
03433     PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); imsg=imsgb=0;}
03434         ~PmeZPencil() {
03435         #ifdef NAMD_FFTW
03436         #ifdef NAMD_FFTW_3
03437                 delete [] forward_plans;
03438                 delete [] backward_plans;
03439         #endif
03440         #endif
03441         }
03442     void fft_init();
03443     void recv_grid(const PmeGridMsg *);
03444     void forward_fft();
03445     void send_trans();
03446         void send_subset_trans(int fromIdx, int toIdx);
03447     void recv_untrans(const PmeUntransMsg *);
03448     void node_process_untrans(PmeUntransMsg *);
03449     void node_process_grid(PmeGridMsg *);
03450     void backward_fft();
03451         void send_ungrid(PmeGridMsg *);
03452         void send_all_ungrid();
03453         void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
03454 private:
03455     ResizeArray<PmeGridMsg *> grid_msgs;
03456 #ifdef NAMD_FFTW
03457 #ifdef NAMD_FFTW_3
03458     fftwf_plan forward_plan, backward_plan;
03459 
03460         //for ckloop usage
03461         int numPlans;
03462         fftwf_plan *forward_plans, *backward_plans;
03463 #else
03464     rfftwnd_plan forward_plan, backward_plan;
03465 #endif
03466 #endif
03467 
03468     int nx, ny;
03469 #if USE_PERSISTENT
03470     void setup_persistent() {
03471       int hd = 1;// ( hasData ? 1 : 0 );
03472       int zBlocks = initdata.zBlocks;
03473       int block3 = initdata.grid.block3;
03474       int dim3 = initdata.grid.dim3;
03475       CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03476       CmiAssert(yPencil_local);
03477       trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
03478       for ( int isend=0; isend<zBlocks; ++isend ) {
03479           int kb = send_order[isend];
03480           int nz1 = block3;
03481           if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
03482           int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
03483           int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8;
03484           int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
03485           int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
03486           trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03487       }
03488     }
03489     
03490     void setup_ungrid_persistent() 
03491     {
03492        ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
03493        for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
03494            int peer = grid_msgs[imsg]->sourceNode;
03495            //ungrid_handle[imsg] = CmiCreatePersistent(peer, 0); 
03496        }
03497     }
03498 #endif
03499 };
03500 
03501 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
03502 public:
03503     PmeYPencil_SDAG_CODE
03504     PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
03505     PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
03506     void fft_init();
03507     void recv_trans(const PmeTransMsg *);
03508     void forward_fft();
03509         void forward_subset_fft(int fromIdx, int toIdx);
03510     void send_trans();
03511         void send_subset_trans(int fromIdx, int toIdx);
03512     void recv_untrans(const PmeUntransMsg *);    
03513     void node_process_trans(PmeTransMsg *);
03514     void node_process_untrans(PmeUntransMsg *);
03515     void backward_fft();
03516         void backward_subset_fft(int fromIdx, int toIdx);
03517     void send_untrans();
03518     void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03519 private:
03520 #ifdef NAMD_FFTW
03521 #ifdef NAMD_FFTW_3
03522     fftwf_plan forward_plan, backward_plan;
03523 #else
03524     fftw_plan forward_plan, backward_plan;
03525 #endif
03526 #endif
03527 
03528     int nx, nz;
03529 #if USE_PERSISTENT
03530     void setup_persistent() {
03531       int yBlocks = initdata.yBlocks;
03532       int block2 = initdata.grid.block2;
03533       int K2 = initdata.grid.K2;
03534       int hd = 1;
03535       CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
03536       trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03537       for ( int isend=0; isend<yBlocks; ++isend ) {
03538           int jb = send_order[isend];
03539           int ny1 = block2;
03540           if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
03541           int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
03542           int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8;
03543           int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
03544           int compress_size = sizeof(float)*hd*nx*ny1*nz*2; 
03545           trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03546       }
03547 
03548       CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
03549       untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03550       int send_evir = 1;
03551       for ( int isend=0; isend<yBlocks; ++isend ) {
03552           int jb = send_order[isend];
03553           int ny1 = block2;
03554           if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
03555           int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
03556           int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof(PmeReduction)*send_evir +sizeof( envelope) + PRIORITY_SIZE/8;
03557           int compress_start = sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir + sizeof( envelope); 
03558           int compress_size = sizeof(float)*nx*ny1*nz*2;
03559           untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,  compress_start, compress_size, CMI_FLOATING);
03560           if ( send_evir ) {
03561               send_evir = 0;
03562           } 
03563       }
03564     }
03565 #endif
03566 };
03567 
03568 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
03569 public:
03570     PmeXPencil_SDAG_CODE
03571     PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); imsg=imsgb=0; }
03572     PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
03573         ~PmeXPencil() {
03574         #ifdef NAMD_FFTW
03575         #ifdef NAMD_FFTW_3
03576                 delete [] forward_plans;
03577                 delete [] backward_plans;
03578         #endif
03579         #endif
03580         }
03581     void fft_init();
03582     void recv_trans(const PmeTransMsg *);
03583     void forward_fft();
03584     void pme_kspace();
03585     void backward_fft();
03586     void send_untrans();
03587         void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03588     void node_process_trans(PmeTransMsg *);
03589 #ifdef NAMD_FFTW
03590 #ifdef NAMD_FFTW_3
03591     fftwf_plan forward_plan, backward_plan;
03592 
03593         int numPlans;
03594         fftwf_plan *forward_plans, *backward_plans;
03595 #else
03596     fftw_plan forward_plan, backward_plan;
03597 #endif
03598 #endif
03599     int ny, nz;
03600     PmeKSpace *myKSpace;
03601 #if USE_PERSISTENT
03602     void  setup_persistent() {
03603       int xBlocks = initdata.xBlocks;
03604       int block1 = initdata.grid.block1;
03605       int K1 = initdata.grid.K1;
03606       CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03607       untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
03608       int send_evir = 1;
03609       for ( int isend=0; isend<xBlocks; ++isend ) {
03610           int ib = send_order[isend];
03611           int nx1 = block1;
03612           if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
03613           int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
03614           int size = sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir +       
03615               sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8; 
03616           int compress_start = sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir + sizeof( envelope); 
03617           int compress_size = sizeof(float)*nx1*ny*nz*2;
03618           untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03619           if ( send_evir ) {
03620               send_evir = 0;
03621           }
03622 
03623       }
03624     }
03625 #endif
03626 
03627 };
03628 
03629 void PmeZPencil::fft_init() {
03630   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03631   Node *node = nd.ckLocalBranch();
03632   SimParameters *simParams = node->simParameters;
03633 
03634 #if USE_NODE_PAR_RECEIVE
03635   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
03636 #endif
03637 
03638   int K1 = initdata.grid.K1;
03639   int K2 = initdata.grid.K2;
03640   int K3 = initdata.grid.K3;
03641   int dim3 = initdata.grid.dim3;
03642   int block1 = initdata.grid.block1;
03643   int block2 = initdata.grid.block2;
03644 
03645   nx = block1;
03646   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03647   ny = block2;
03648   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03649 
03650 #ifdef NAMD_FFTW
03651   CmiLock(ComputePmeMgr::fftw_plan_lock);
03652 
03653   data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
03654   work = new float[dim3];
03655 
03656   order_init(initdata.zBlocks);
03657 
03658 #ifdef NAMD_FFTW_3
03659   /* need array of sizes for the how many */
03660 
03661   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
03662   int sizeLines=nx*ny;
03663   int planLineSizes[1];
03664   planLineSizes[0]=K3;
03665   int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
03666   int ndimHalf=ndim/2;
03667   forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
03668                                          (float *) data, NULL, 1, 
03669                                          ndim,
03670                                          (fftwf_complex *) data, NULL, 1,
03671                                          ndimHalf,
03672                                          fftwFlags);
03673 
03674   backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
03675                                           (fftwf_complex *) data, NULL, 1, 
03676                                           ndimHalf,
03677                                           (float *) data, NULL, 1, 
03678                                           ndim,
03679                                           fftwFlags);
03680 #if     USE_CKLOOP
03681   if(simParams->useCkLoop) {
03682           //How many FFT plans to be created? The grain-size issue!!.
03683           //Currently, I am choosing the min(nx, ny) to be coarse-grain
03684           numPlans = (nx<=ny?nx:ny);
03685           int howmany = sizeLines/numPlans;
03686           forward_plans = new fftwf_plan[numPlans];
03687           backward_plans = new fftwf_plan[numPlans];
03688           for(int i=0; i<numPlans; i++) {
03689                   int dimStride = i*ndim*howmany;
03690                   int dimHalfStride = i*ndimHalf*howmany;
03691                   forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
03692                                                                                                          ((float *)data)+dimStride, NULL, 1,
03693                                                                                                          ndim,
03694                                                                                                          ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03695                                                                                                          ndimHalf,
03696                                                                                                          fftwFlags);
03697 
03698                   backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
03699                                                                                                          ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03700                                                                                                          ndimHalf,
03701                                                                                                          ((float *)data)+dimStride, NULL, 1,
03702                                                                                                          ndim,
03703                                                                                                          fftwFlags);
03704           }
03705   }else 
03706 #endif 
03707   {
03708           forward_plans = NULL;
03709           backward_plans = NULL;
03710   }
03711 #else
03712   forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
03713         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03714         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03715   backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
03716         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03717         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03718 #endif
03719   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03720 #else
03721   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03722 #endif
03723 
03724 #if USE_NODE_PAR_RECEIVE
03725     evir = 0.;
03726     memset(data, 0, sizeof(float) * nx*ny*dim3);
03727 #endif
03728 }
03729 
03730 void PmeYPencil::fft_init() {
03731   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03732   Node *node = nd.ckLocalBranch();
03733   SimParameters *simParams = node->simParameters;
03734 
03735 #if USE_NODE_PAR_RECEIVE
03736   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
03737 #endif
03738 
03739   int K1 = initdata.grid.K1;
03740   int K2 = initdata.grid.K2;
03741   int dim2 = initdata.grid.dim2;
03742   int dim3 = initdata.grid.dim3;
03743   int block1 = initdata.grid.block1;
03744   int block3 = initdata.grid.block3;
03745 
03746   nx = block1;
03747   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03748   nz = block3;
03749   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03750 
03751 #ifdef NAMD_FFTW
03752   CmiLock(ComputePmeMgr::fftw_plan_lock);
03753 
03754   data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
03755   work = new float[2*K2];
03756 
03757   order_init(initdata.yBlocks);
03758 
03759 #ifdef NAMD_FFTW_3
03760   /* need array of sizes for the dimensions */
03761   /* ideally this should be implementable as a single multidimensional
03762    *  plan, but that has proven tricky to implement, so we maintain the
03763    *  loop of 1d plan executions. */
03764   int sizeLines=nz;
03765   int planLineSizes[1];
03766   planLineSizes[0]=K2;
03767   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
03768   forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
03769                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03770                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03771                                      FFTW_FORWARD, 
03772                                      fftwFlags);
03773   backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
03774                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03775                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03776                                      FFTW_BACKWARD, 
03777                                       fftwFlags);
03778   CkAssert(forward_plan != NULL);
03779   CkAssert(backward_plan != NULL);
03780 #else
03781   forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
03782         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03783         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03784         nz, (fftw_complex *) work, 1);
03785   backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
03786         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03787         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03788         nz, (fftw_complex *) work, 1);
03789 #endif
03790   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03791 #else
03792   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03793 #endif
03794 
03795 #if USE_NODE_PAR_RECEIVE
03796   evir = 0;
03797   CmiMemoryWriteFence();
03798 #endif
03799 }
03800 
03801 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
03802 {
03803   if ( msg->hasData ) hasData = 1;
03804   needs_reply[msg->sourceNode] = msg->hasData;
03805   recv_trans(msg);
03806   int limsg;
03807   CmiMemoryAtomicFetchAndInc(imsg,limsg);
03808   if(limsg+1 == initdata.yBlocks)
03809     {
03810       if ( hasData ) {
03811         forward_fft();
03812       }
03813       send_trans();
03814       if( ! hasData)
03815         {
03816           send_untrans(); //todo, what is up with the recvAck in SDAG version?
03817         }
03818       imsg=0;
03819       CmiMemoryWriteFence();
03820     }
03821 }
03822 
03823 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
03824 {
03825   recv_untrans(msg);
03826   int limsg;
03827   CmiMemoryAtomicFetchAndInc(imsgb,limsg);
03828   if(limsg+1 == initdata.yBlocks)
03829     {
03830       backward_fft();
03831       send_untrans();
03832       imsgb=0;
03833       CmiMemoryWriteFence();
03834     }
03835 }
03836 
03837 #define DEBUG_NODE_PAR_RECV 0
03838 
03839 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
03840   //  CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03841   PmeXPencil *target=xPencilObj.get(msg->destElem);
03842 #if DEBUG_NODE_PAR_RECV
03843   if(target == NULL)
03844     CkAbort("xpencil in recvXTrans not found, debug registeration");
03845 #endif  
03846     target->node_process_trans(msg);
03847   delete msg;
03848 }
03849 
03850 
03851 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
03852   //  CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03853   PmeYPencil *target=yPencilObj.get(msg->destElem);
03854 #if DEBUG_NODE_PAR_RECV
03855   if(target == NULL)
03856     CkAbort("ypencil in recvYTrans not found, debug registeration");
03857 #endif  
03858     target->node_process_trans(msg);
03859   delete msg;
03860  }
03861 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
03862   //  CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03863   PmeYPencil *target=yPencilObj.get(msg->destElem);
03864 #if DEBUG_NODE_PAR_RECV  
03865   if(target == NULL)
03866     CkAbort("ypencil in recvYUntrans not found, debug registeration");
03867 #endif  
03868     target->node_process_untrans(msg);
03869   delete msg;
03870  }
03871 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
03872   //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03873   PmeZPencil *target=zPencilObj.get(msg->destElem);
03874 #if DEBUG_NODE_PAR_RECV
03875   if(target == NULL)
03876     CkAbort("zpencil in recvZUntrans not found, debug registeration");
03877 #endif
03878   target->node_process_untrans(msg);
03879   delete msg;
03880 }
03881 
03882 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
03883   //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03884   PmeZPencil *target=zPencilObj.get(msg->destElem);
03885 #if DEBUG_NODE_PAR_RECV
03886   if(target == NULL){
03887     CkAbort("zpencil in recvZGrid not found, debug registeration");
03888   }
03889 #endif
03890   target->node_process_grid(msg); //msg is stored inside node_proces_grid
03891 }
03892 
03893 void PmeXPencil::fft_init() {
03894   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03895   Node *node = nd.ckLocalBranch();
03896   SimParameters *simParams = node->simParameters;
03897 #if USE_NODE_PAR_RECEIVE
03898   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
03899 #endif
03900 
03901   int K1 = initdata.grid.K1;
03902   int K2 = initdata.grid.K2;
03903   int dim3 = initdata.grid.dim3;
03904   int block2 = initdata.grid.block2;
03905   int block3 = initdata.grid.block3;
03906 
03907   ny = block2;
03908   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03909   nz = block3;
03910   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03911 
03912 #ifdef NAMD_FFTW
03913   CmiLock(ComputePmeMgr::fftw_plan_lock);
03914 
03915   data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
03916   work = new float[2*K1];
03917 
03918   order_init(initdata.xBlocks);
03919 
03920 #ifdef NAMD_FFTW_3
03921   /* need array of sizes for the how many */
03922   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
03923   int sizeLines=ny*nz;
03924   int planLineSizes[1];
03925   planLineSizes[0]=K1;
03926   forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03927                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03928                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03929                                    FFTW_FORWARD,
03930                                      fftwFlags);
03931   backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03932                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03933                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03934                                           FFTW_BACKWARD,
03935                                       fftwFlags);
03936 
03937 #if     USE_CKLOOP
03938   if(simParams->useCkLoop) {
03939           //How many FFT plans to be created? The grain-size issue!!.
03940           //Currently, I am choosing the min(nx, ny) to be coarse-grain
03941           numPlans = (ny<=nz?ny:nz);
03942           int howmany = sizeLines/numPlans;
03943           forward_plans = new fftwf_plan[numPlans];
03944           backward_plans = new fftwf_plan[numPlans];
03945           for(int i=0; i<numPlans; i++) {
03946                   int curStride = i*howmany;              
03947                   forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03948                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03949                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03950                                                                                                         FFTW_FORWARD,
03951                                                                                                          fftwFlags);
03952 
03953                   backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03954                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03955                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03956                                                                                                           FFTW_BACKWARD,
03957                                                                                                          fftwFlags);
03958           }
03959   }else
03960 #endif
03961   {
03962           forward_plans = NULL;
03963           backward_plans = NULL;
03964   }
03965 #else
03966   forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
03967         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03968         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03969         ny*nz, (fftw_complex *) work, 1);
03970   backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
03971         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03972         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03973         ny*nz, (fftw_complex *) work, 1);
03974 #endif
03975   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03976 #else
03977   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03978 #endif
03979 
03980   myKSpace = new PmeKSpace(initdata.grid,
03981                 thisIndex.y*block2, thisIndex.y*block2 + ny,
03982                 thisIndex.z*block3, thisIndex.z*block3 + nz);
03983 
03984 }
03985 
03986 // #define FFTCHECK   // run a grid of integers through the fft
03987 // #define ZEROCHECK  // check for suspicious zeros in fft
03988 
03989 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
03990 
03991   int dim3 = initdata.grid.dim3;
03992   if ( imsg == 0 ) {
03993     lattice = msg->lattice;
03994     sequence = msg->sequence;
03995 #if ! USE_NODE_PAR_RECEIVE
03996     memset(data, 0, sizeof(float)*nx*ny*dim3);
03997 #endif
03998   }
03999 
04000   if ( ! msg->hasData ) return;
04001 
04002   int zlistlen = msg->zlistlen;
04003   int *zlist = msg->zlist;
04004   char *fmsg = msg->fgrid;
04005   float *qmsg = msg->qgrid;
04006   float *d = data;
04007   int numGrids = 1;  // pencil FFT doesn't support multiple grids
04008   for ( int g=0; g<numGrids; ++g ) {
04009     for ( int i=0; i<nx; ++i ) {
04010      for ( int j=0; j<ny; ++j, d += dim3 ) {
04011       if( *(fmsg++) ) {
04012         for ( int k=0; k<zlistlen; ++k ) {
04013           d[zlist[k]] += *(qmsg++);
04014         }
04015       }
04016      }
04017     }
04018   }
04019 }
04020 
04021 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
04022 #ifdef NAMD_FFTW
04023 #ifdef NAMD_FFTW_3    
04024     fftwf_plan *plans = (fftwf_plan *)param;
04025     for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
04026 #endif
04027 #endif        
04028 }
04029 
04030 void PmeZPencil::forward_fft() {
04031   evir = 0.;
04032 #ifdef FFTCHECK
04033   int dim3 = initdata.grid.dim3;
04034   int K3 = initdata.grid.K3;
04035   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
04036   float *d = data;
04037   for ( int i=0; i<nx; ++i ) {
04038    for ( int j=0; j<ny; ++j, d += dim3 ) {
04039     for ( int k=0; k<dim3; ++k ) {
04040       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
04041     }
04042    }
04043   }
04044 #endif
04045 #ifdef NAMD_FFTW
04046 #ifdef MANUAL_DEBUG_FFTW3
04047   dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
04048 #endif
04049 #ifdef NAMD_FFTW_3
04050 #if     USE_CKLOOP
04051   int useCkLoop = Node::Object()->simParameters->useCkLoop;
04052   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT) {
04053           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
04054           //transform the above loop
04055           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
04056           return;
04057   }
04058 #endif
04059   fftwf_execute(forward_plan);
04060 #else
04061   rfftwnd_real_to_complex(forward_plan, nx*ny,
04062         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
04063 #endif
04064 #ifdef MANUAL_DEBUG_FFTW3
04065   dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
04066 #endif
04067 
04068 #endif
04069 #ifdef ZEROCHECK
04070   int dim3 = initdata.grid.dim3;
04071   int K3 = initdata.grid.K3;
04072   float *d = data;
04073   for ( int i=0; i<nx; ++i ) {
04074    for ( int j=0; j<ny; ++j, d += dim3 ) {
04075     for ( int k=0; k<dim3; ++k ) {
04076       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
04077         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
04078     }
04079    }
04080   }
04081 #endif
04082 }
04083 
04084 /* A single task for partitioned PmeZPencil::send_trans work */
04085 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
04086         PmeZPencil *zpencil = (PmeZPencil *)param;
04087         zpencil->send_subset_trans(first, last);        
04088 }
04089 
04090 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
04091         int zBlocks = initdata.zBlocks;
04092         int block3 = initdata.grid.block3;
04093         int dim3 = initdata.grid.dim3;
04094         for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
04095           int kb = send_order[isend];
04096           int nz = block3;
04097           if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
04098           int hd = ( hasData ? 1 : 0 );
04099           PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04100           msg->lattice = lattice;
04101           msg->sourceNode = thisIndex.y;
04102           msg->hasData = hasData;
04103           msg->nx = ny;
04104          if ( hasData ) {
04105           float *md = msg->qgrid;
04106           const float *d = data;
04107           for ( int i=0; i<nx; ++i ) {
04108            for ( int j=0; j<ny; ++j, d += dim3 ) {
04109                 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04110                   *(md++) = d[2*k];
04111                   *(md++) = d[2*k+1];
04112                 }
04113            }
04114           }
04115          }
04116           msg->sequence = sequence;
04117           SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
04118 
04119     CmiEnableUrgentSend(1);
04120 #if USE_NODE_PAR_RECEIVE
04121       msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04122 #if Y_PERSIST 
04123       CmiUsePersistentHandle(&trans_handle[isend], 1);
04124 #endif
04125       initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04126 #if Y_PERSIST 
04127       CmiUsePersistentHandle(NULL, 0);
04128 #endif    
04129 #else
04130 #if Y_PERSIST 
04131       CmiUsePersistentHandle(&trans_handle[isend], 1);
04132 #endif
04133       initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04134 #if Y_PERSIST 
04135       CmiUsePersistentHandle(NULL, 0);
04136 #endif    
04137 #endif
04138     CmiEnableUrgentSend(0);
04139     }
04140 }
04141 
04142 void PmeZPencil::send_trans() {
04143 #if USE_PERSISTENT
04144     if (trans_handle == NULL) setup_persistent();
04145 #endif
04146 #if     USE_CKLOOP
04147         Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04148         if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS) {
04155                 //send_subset_trans(0, initdata.zBlocks-1);
04156                 CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
04157                 return;
04158         }
04159 #endif
04160   int zBlocks = initdata.zBlocks;
04161   int block3 = initdata.grid.block3;
04162   int dim3 = initdata.grid.dim3;
04163   for ( int isend=0; isend<zBlocks; ++isend ) {
04164     int kb = send_order[isend];
04165     int nz = block3;
04166     if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
04167     int hd = ( hasData ? 1 : 0 );
04168     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04169     msg->lattice = lattice;
04170     msg->sourceNode = thisIndex.y;
04171     msg->hasData = hasData;
04172     msg->nx = ny;
04173    if ( hasData ) {
04174     float *md = msg->qgrid;
04175     const float *d = data;
04176     for ( int i=0; i<nx; ++i ) {
04177      for ( int j=0; j<ny; ++j, d += dim3 ) {
04178       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04179         *(md++) = d[2*k];
04180         *(md++) = d[2*k+1];
04181       }
04182      }
04183     }
04184    }
04185     msg->sequence = sequence;
04186     SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
04187 
04188     CmiEnableUrgentSend(1);
04189 #if USE_NODE_PAR_RECEIVE
04190     msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04191 #if Y_PERSIST 
04192     CmiUsePersistentHandle(&trans_handle[isend], 1);
04193 #endif
04194     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04195 #if Y_PERSIST 
04196     CmiUsePersistentHandle(NULL, 0);
04197 #endif    
04198 #else
04199 #if Y_PERSIST 
04200     CmiUsePersistentHandle(&trans_handle[isend], 1);
04201 #endif
04202     initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04203 #if Y_PERSIST 
04204     CmiUsePersistentHandle(NULL, 0);
04205 #endif    
04206 #endif
04207     CmiEnableUrgentSend(0);
04208   }
04209 }
04210 
04211 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
04212   if ( imsg == 0 ) {
04213     lattice = msg->lattice;
04214     sequence = msg->sequence;
04215   }
04216   int block2 = initdata.grid.block2;
04217   int K2 = initdata.grid.K2;
04218   int jb = msg->sourceNode;
04219   int ny = msg->nx;
04220  if ( msg->hasData ) {
04221   const float *md = msg->qgrid;
04222   float *d = data;
04223   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04224    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04225     for ( int k=0; k<nz; ++k ) {
04226 #ifdef ZEROCHECK
04227       if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
04228         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04229 #endif
04230       d[2*(j*nz+k)] = *(md++);
04231       d[2*(j*nz+k)+1] = *(md++);
04232     }
04233    }
04234   }
04235  } else {
04236   float *d = data;
04237   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04238    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04239     for ( int k=0; k<nz; ++k ) {
04240       d[2*(j*nz+k)] = 0;
04241       d[2*(j*nz+k)+1] = 0;
04242     }
04243    }
04244   }
04245  }
04246 }
04247 
04248 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
04249         PmeYPencil *ypencil = (PmeYPencil *)param;
04250         ypencil->forward_subset_fft(first, last);
04251 }
04252 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
04253 #ifdef NAMD_FFTW
04254 #ifdef NAMD_FFTW_3
04255         for(int i=fromIdx; i<=toIdx; i++){
04256                 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
04257                       * nz * initdata.grid.K2,  
04258                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04259         }
04260 #endif
04261 #endif
04262 }
04263 
04264 void PmeYPencil::forward_fft() {
04265     evir = 0.;
04266 #ifdef NAMD_FFTW
04267 #ifdef MANUAL_DEBUG_FFTW3
04268   dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04269 #endif
04270   
04271 #ifdef NAMD_FFTW_3
04272 #if     USE_CKLOOP
04273   int useCkLoop = Node::Object()->simParameters->useCkLoop;
04274   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT) {
04275           CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
04276           return;
04277   }
04278 #endif
04279   //the above is a transformation of the following loop using CkLoop
04280   for ( int i=0; i<nx; ++i ) {
04281     fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
04282                       * nz * initdata.grid.K2,  
04283                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04284   }
04285 #else
04286   for ( int i=0; i<nx; ++i ) {
04287     fftw(forward_plan, nz,
04288         ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04289         nz, 1, (fftw_complex *) work, 1, 0);
04290   }
04291 #endif
04292 #ifdef MANUAL_DEBUG_FFTW3
04293   dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04294 #endif
04295 
04296 #endif
04297 }
04298 
04299 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
04300         PmeYPencil *ypencil = (PmeYPencil *)param;
04301         ypencil->send_subset_trans(first, last);
04302 }
04303 
04304 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
04305         int yBlocks = initdata.yBlocks;
04306         int block2 = initdata.grid.block2;
04307         int K2 = initdata.grid.K2;
04308     for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
04309           int jb = send_order[isend];
04310           int ny = block2;
04311           if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04312           int hd = ( hasData ? 1 : 0 );
04313           PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04314           msg->lattice = lattice;
04315           msg->sourceNode = thisIndex.x;
04316           msg->hasData = hasData;
04317           msg->nx = nx;
04318          if ( hasData ) {
04319           float *md = msg->qgrid;
04320           const float *d = data;
04321           for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04322            for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04323                 for ( int k=0; k<nz; ++k ) {
04324                   *(md++) = d[2*(j*nz+k)];
04325                   *(md++) = d[2*(j*nz+k)+1];
04326   #ifdef ZEROCHECK
04327                   if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04328           thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04329   #endif
04330                 }
04331            }
04332           }
04333           if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04334           thisIndex.x, jb, thisIndex.z);
04335          }
04336           msg->sequence = sequence;
04337           SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04338       CmiEnableUrgentSend(1);
04339 #if USE_NODE_PAR_RECEIVE
04340       msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04341 #if X_PERSIST 
04342       CmiUsePersistentHandle(&trans_handle[isend], 1);
04343 #endif
04344       initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
04345 #if X_PERSIST 
04346       CmiUsePersistentHandle(NULL, 0);
04347 #endif
04348 #else      
04349 #if X_PERSIST 
04350       CmiUsePersistentHandle(&trans_handle[isend], 1);
04351 #endif
04352       initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04353 #if X_PERSIST 
04354       CmiUsePersistentHandle(NULL, 0);
04355 #endif
04356 #endif
04357       CmiEnableUrgentSend(0);
04358         }
04359 }
04360 
04361 void PmeYPencil::send_trans() {
04362 #if USE_PERSISTENT
04363     if (trans_handle == NULL) setup_persistent();
04364 #endif
04365 #if     USE_CKLOOP
04366         Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04367         if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS) {
04374                 //send_subset_trans(0, initdata.yBlocks-1);
04375                 CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
04376                 return;
04377         }
04378 #endif
04379   int yBlocks = initdata.yBlocks;
04380   int block2 = initdata.grid.block2;
04381   int K2 = initdata.grid.K2;
04382   for ( int isend=0; isend<yBlocks; ++isend ) {
04383     int jb = send_order[isend];
04384     int ny = block2;
04385     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04386     int hd = ( hasData ? 1 : 0 );
04387     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04388     msg->lattice = lattice;
04389     msg->sourceNode = thisIndex.x;
04390     msg->hasData = hasData;
04391     msg->nx = nx;
04392    if ( hasData ) {
04393     float *md = msg->qgrid;
04394     const float *d = data;
04395     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04396      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04397       for ( int k=0; k<nz; ++k ) {
04398         *(md++) = d[2*(j*nz+k)];
04399         *(md++) = d[2*(j*nz+k)+1];
04400 #ifdef ZEROCHECK
04401         if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04402         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04403 #endif
04404       }
04405      }
04406     }
04407     if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04408         thisIndex.x, jb, thisIndex.z);
04409    }
04410     msg->sequence = sequence;
04411     SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04412     CmiEnableUrgentSend(1);
04413 #if USE_NODE_PAR_RECEIVE
04414     msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04415 #if X_PERSIST 
04416         CmiUsePersistentHandle(&trans_handle[isend], 1);
04417 #endif
04418     initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
04419 #if X_PERSIST 
04420         CmiUsePersistentHandle(NULL, 0);
04421 #endif
04422 #else
04423 #if X_PERSIST 
04424         CmiUsePersistentHandle(&trans_handle[isend], 1);
04425 #endif
04426     initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04427 #if X_PERSIST 
04428         CmiUsePersistentHandle(NULL, 0);
04429 #endif
04430     
04431 #endif
04432     CmiEnableUrgentSend(0);
04433   }
04434 }
04435 
04436 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
04437 {
04438   if(msg->hasData) hasData=1;
04439   needs_reply[msg->sourceNode] = msg->hasData;
04440   recv_trans(msg);
04441   int limsg;
04442   CmiMemoryAtomicFetchAndInc(imsg,limsg);
04443   if(limsg+1 == initdata.xBlocks)
04444     {
04445       if(hasData){
04446         forward_fft();
04447         pme_kspace();
04448         backward_fft();
04449       }
04450       send_untrans();
04451       imsg=0;
04452       CmiMemoryWriteFence();
04453     }
04454 }
04455 
04456 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
04457   if ( imsg == 0 ) {
04458     lattice = msg->lattice;
04459     sequence = msg->sequence;
04460   }
04461   int block1 = initdata.grid.block1;
04462   int K1 = initdata.grid.K1;
04463   int ib = msg->sourceNode;
04464   int nx = msg->nx;
04465  if ( msg->hasData ) {
04466   const float *md = msg->qgrid;
04467   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04468    float *d = data + i*ny*nz*2;
04469    for ( int j=0; j<ny; ++j, d += nz*2 ) {
04470     for ( int k=0; k<nz; ++k ) {
04471 #ifdef ZEROCHECK
04472       if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
04473         ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
04474 #endif
04475       d[2*k] = *(md++);
04476       d[2*k+1] = *(md++);
04477     }
04478    }
04479   }
04480  } else {
04481   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04482    float *d = data + i*ny*nz*2;
04483    for ( int j=0; j<ny; ++j, d += nz*2 ) {
04484     for ( int k=0; k<nz; ++k ) {
04485       d[2*k] = 0;
04486       d[2*k+1] = 0;
04487     }
04488    }
04489   }
04490  }
04491 }
04492 
04493 void PmeXPencil::forward_fft() {
04494 #ifdef NAMD_FFTW
04495 
04496 #ifdef MANUAL_DEBUG_FFTW3
04497   dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04498 #endif
04499 
04500 #ifdef NAMD_FFTW_3
04501 #if     USE_CKLOOP
04502   int useCkLoop = Node::Object()->simParameters->useCkLoop;
04503   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT) {
04504           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
04505           //transform the above loop
04506           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
04507           return;
04508   }
04509 #endif
04510   fftwf_execute(forward_plan);
04511 #else
04512   fftw(forward_plan, ny*nz,
04513         ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04514 #endif
04515 #ifdef MANUAL_DEBUG_FFTW3
04516   dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04517 #endif
04518 
04519 #endif
04520 }
04521 
04522 void PmeXPencil::pme_kspace() {
04523 
04524   evir = 0.;
04525 
04526 #ifdef FFTCHECK
04527   return;
04528 #endif
04529 
04530   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
04531 
04532   int numGrids = 1;
04533   for ( int g=0; g<numGrids; ++g ) {
04534     evir[0] = myKSpace->compute_energy(data+0*g,
04535                 lattice, ewaldcof, &(evir[1]));
04536   }
04537   
04538 #if USE_NODE_PAR_RECEIVE
04539     CmiMemoryWriteFence();
04540 #endif
04541 }
04542 
04543 void PmeXPencil::backward_fft() {
04544 #ifdef NAMD_FFTW
04545 #ifdef MANUAL_DEBUG_FFTW3
04546   dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04547 #endif
04548 
04549 #ifdef NAMD_FFTW_3
04550 #if     USE_CKLOOP
04551   int useCkLoop = Node::Object()->simParameters->useCkLoop;
04552   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT) {
04553           //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
04554           //transform the above loop
04555           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
04556           return;
04557   }
04558 #endif
04559   fftwf_execute(backward_plan);
04560 #else
04561   fftw(backward_plan, ny*nz,
04562         ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04563 #endif
04564 #ifdef MANUAL_DEBUG_FFTW3
04565   dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04566 #endif
04567 #endif
04568 }
04569 
04570 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04571         int evirIdx = paraNum;
04572         PmeXPencil *xpencil = (PmeXPencil *)param;
04573         xpencil->send_subset_untrans(first, last, evirIdx);
04574 }
04575 
04576 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04577         int xBlocks = initdata.xBlocks;
04578         int block1 = initdata.grid.block1;      
04579         int K1 = initdata.grid.K1;
04580 
04581         int ackL=0, ackH=-1;
04582         int unL=0, unH=-1;
04583         int send_evir=0;
04584         if(fromIdx >= evirIdx+1) {
04585                 //send PmeUntransMsg with has_evir=0
04586                 unL = fromIdx;
04587                 unH = toIdx;            
04588         } else if(toIdx <= evirIdx-1) {
04589                 //send PmeAckMsg
04590                 ackL=fromIdx;
04591                 ackH=toIdx;             
04592         } else {
04593                 //partially send PmeAckMsg and partially send PmeUntransMsg
04594                 ackL=fromIdx;
04595                 ackH=evirIdx-1;
04596                 send_evir=1;
04597                 unL=evirIdx+1;
04598                 unH=toIdx;
04599         }
04600 
04601         for(int isend=ackL; isend<=ackH; isend++) {
04602                 //send PmeAckMsg
04603         CmiEnableUrgentSend(1);
04604                 int ib = send_order[isend];
04605                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04606                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04607                 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04608         CmiEnableUrgentSend(0);
04609     }
04610 
04611     CmiEnableUrgentSend(1);
04612         //send PmeUntransMsg with has_evir=1
04613         if(send_evir) {
04614                 int ib = send_order[evirIdx];
04615                 int nx = block1;
04616                 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04617                 PmeUntransMsg *msg = new (1,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;            
04618                 msg->evir[0] = evir;
04619                 msg->has_evir = 1;                              
04620                 msg->sourceNode = thisIndex.y;
04621                 msg->ny = ny;
04622                 float *md = msg->qgrid;
04623                 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04624                         float *d = data + i*ny*nz*2;
04625                         for ( int j=0; j<ny; ++j, d += nz*2 ) {
04626                                 for ( int k=0; k<nz; ++k ) {
04627                                         *(md++) = d[2*k];
04628                                         *(md++) = d[2*k+1];
04629                                 }
04630                         }
04631                 }
04632                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04633 #if USE_NODE_PAR_RECEIVE
04634         msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04635         initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04636 #else
04637         initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04638 #endif
04639          }
04640     CmiEnableUrgentSend(0);
04641         
04642         //send PmeUntransMsg with has_evir=0
04643         for(int isend=unL; isend<=unH; isend++) {
04644                 int ib = send_order[isend];
04645                 int nx = block1;
04646                 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04647                 PmeUntransMsg *msg = new (0,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04648                 msg->has_evir = 0;              
04649                 msg->sourceNode = thisIndex.y;
04650                 msg->ny = ny;
04651                 float *md = msg->qgrid;
04652                 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04653                         float *d = data + i*ny*nz*2;
04654                         for ( int j=0; j<ny; ++j, d += nz*2 ) {
04655                                 for ( int k=0; k<nz; ++k ) {
04656                                         *(md++) = d[2*k];
04657                                         *(md++) = d[2*k+1];
04658                                 }
04659                         }
04660                 }
04661                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04662         CmiEnableUrgentSend(1);
04663 #if USE_NODE_PAR_RECEIVE
04664         msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04665 #if Y_PERSIST 
04666         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04667 #endif
04668         initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04669 #if Y_PERSIST 
04670         CmiUsePersistentHandle(NULL, 0);
04671 #endif
04672 #else
04673 #if Y_PERSIST 
04674   //      CmiUsePersistentHandle(&untrans_handle[isend], 1);
04675 #endif
04676         initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04677 #if Y_PERSIST 
04678    //     CmiUsePersistentHandle(NULL, 0);
04679 #endif
04680 #endif
04681         CmiEnableUrgentSend(0);
04682         }
04683 }
04684 
04685 void PmeXPencil::send_untrans() {
04686 #if USE_PERSISTENT
04687   if (untrans_handle == NULL) setup_persistent();
04688 #endif
04689 #if     USE_CKLOOP
04690   Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04691   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS) {
04692                 int xBlocks = initdata.xBlocks;
04693                 int evirIdx = 0;
04694                 for ( int isend=0; isend<xBlocks; ++isend ) {
04695                         int ib = send_order[isend];
04696                         if (needs_reply[ib]) {
04697                                 evirIdx = isend;
04698                                 break;
04699                         }
04700                 }
04701 
04702                 //basically: 
04703                 //[0,evirIdx-1]->send PmeAckMsg
04704                 //evirIdx->send PmeUntransMsg with has_evir=1
04705                 //[evirIdx+1, xBlocks-1]->send PmeUntransMsg with has_evir=0
04706                 //send_subset_untrans(0, xBlocks-1, evirIdx);
04707 #if USE_NODE_PAR_RECEIVE
04708                 //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
04709                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
04710 #else
04711         //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
04712                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
04713 #endif        
04714                 return;
04715   }
04716 #endif
04717   int xBlocks = initdata.xBlocks;
04718   int block1 = initdata.grid.block1;
04719   int K1 = initdata.grid.K1;
04720   int send_evir = 1;
04721   for ( int isend=0; isend<xBlocks; ++isend ) {
04722     int ib = send_order[isend];
04723     if ( ! needs_reply[ib] ) {
04724       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04725       CmiEnableUrgentSend(1);
04726       SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04727       initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04728       CmiEnableUrgentSend(0);
04729       continue;
04730     }
04731     int nx = block1;
04732     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04733     PmeUntransMsg *msg = new (send_evir, nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04734     if ( send_evir ) {
04735       msg->evir[0] = evir;
04736       msg->has_evir = 1;
04737       send_evir = 0;
04738     } else {
04739       msg->has_evir = 0;
04740     }
04741     msg->sourceNode = thisIndex.y;
04742     msg->ny = ny;
04743     float *md = msg->qgrid;
04744     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04745      float *d = data + i*ny*nz*2;
04746      for ( int j=0; j<ny; ++j, d += nz*2 ) {
04747       for ( int k=0; k<nz; ++k ) {
04748         *(md++) = d[2*k];
04749         *(md++) = d[2*k+1];
04750       }
04751      }
04752     }
04753     SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04754 
04755     CmiEnableUrgentSend(1);
04756 #if USE_NODE_PAR_RECEIVE
04757     msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04758 #if Y_PERSIST 
04759     CmiUsePersistentHandle(&untrans_handle[isend], 1);
04760 #endif
04761     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04762 #if Y_PERSIST 
04763     CmiUsePersistentHandle(NULL, 0);
04764 #endif
04765 #else
04766 #if Y_PERSIST 
04767     CmiUsePersistentHandle(&untrans_handle[isend], 1);
04768 #endif
04769     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04770 #if Y_PERSIST 
04771     CmiUsePersistentHandle(NULL, 0);
04772 #endif
04773 #endif
04774     CmiEnableUrgentSend(0);
04775   }
04776 }
04777 
04778 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
04779   if ( msg->has_evir ) evir += msg->evir[0];
04780   int block2 = initdata.grid.block2;
04781   int K2 = initdata.grid.K2;
04782   int jb = msg->sourceNode;
04783   int ny = msg->ny;
04784   const float *md = msg->qgrid;
04785   float *d = data;
04786   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04787 #if CMK_BLUEGENEL
04788     CmiNetworkProgress();
04789 #endif   
04790     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04791       for ( int k=0; k<nz; ++k ) {
04792 #ifdef ZEROCHECK
04793         if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
04794                                     thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04795 #endif
04796         d[2*(j*nz+k)] = *(md++);
04797         d[2*(j*nz+k)+1] = *(md++);
04798       }
04799     }
04800   }
04801 }
04802 
04803 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
04804         PmeYPencil *ypencil = (PmeYPencil *)param;
04805         ypencil->backward_subset_fft(first, last);
04806 }
04807 
04808 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
04809 #ifdef NAMD_FFTW
04810 #ifdef NAMD_FFTW_3
04811         for(int i=fromIdx; i<=toIdx; i++){
04812                 fftwf_execute_dft(backward_plan,        
04813                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2,         
04814                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04815         }
04816 #endif
04817 #endif
04818 }
04819 
04820 void PmeYPencil::backward_fft() {
04821 #ifdef NAMD_FFTW
04822 #ifdef MANUAL_DEBUG_FFTW3
04823   dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04824 #endif
04825 
04826 #ifdef NAMD_FFTW_3
04827 #if     USE_CKLOOP
04828   int useCkLoop = Node::Object()->simParameters->useCkLoop;
04829   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT) {
04830           CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
04831           return;
04832   }
04833 #endif
04834   //the above is a transformation of the following loop using CkLoop
04835   for ( int i=0; i<nx; ++i ) {
04836 #if CMK_BLUEGENEL
04837         CmiNetworkProgress();
04838 #endif
04839     fftwf_execute_dft(backward_plan,    
04840                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
04841                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04842   }
04843 #else
04844         for ( int i=0; i<nx; ++i ) {
04845 #if CMK_BLUEGENEL
04846           CmiNetworkProgress();
04847 #endif
04848                 fftw(backward_plan, nz,
04849                 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04850                 nz, 1, (fftw_complex *) work, 1, 0);
04851         }
04852 #endif
04853 
04854 #ifdef MANUAL_DEBUG_FFTW3
04855   dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04856 #endif
04857 
04858 #endif
04859 }
04860 
04861 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04862         int evirIdx = paraNum;
04863         PmeYPencil *ypencil = (PmeYPencil *)param;
04864         ypencil->send_subset_untrans(first, last, evirIdx);
04865 }
04866 
04867 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04868         int yBlocks = initdata.yBlocks;
04869         int block2 = initdata.grid.block2;      
04870         int K2 = initdata.grid.K2;
04871 
04872         int ackL=0, ackH=-1;
04873         int unL=0, unH=-1;
04874         int send_evir=0;
04875         if(fromIdx >= evirIdx+1) {
04876                 //send PmeUntransMsg with has_evir=0
04877                 unL = fromIdx;
04878                 unH = toIdx;            
04879         } else if(toIdx <= evirIdx-1) {
04880                 //send PmeAckMsg
04881                 ackL=fromIdx;
04882                 ackH=toIdx;             
04883         } else {
04884                 //partially send PmeAckMsg and partially send PmeUntransMsg
04885                 ackL=fromIdx;
04886                 ackH=evirIdx-1;
04887                 send_evir=1;
04888                 unL=evirIdx+1;
04889                 unH=toIdx;
04890         }
04891 
04892         for(int isend=ackL; isend<=ackH; isend++) {
04893                 //send PmeAckMsg
04894         CmiEnableUrgentSend(1);
04895                 int jb = send_order[isend];
04896                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04897                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04898                 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
04899         CmiEnableUrgentSend(0);
04900         }
04901 
04902     CmiEnableUrgentSend(1);
04903         //send PmeUntransMsg with has_evir=1
04904         if(send_evir) {
04905                 int jb = send_order[evirIdx];
04906                 int ny = block2;
04907                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04908                 PmeUntransMsg *msg = new (1,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;            
04909                 msg->evir[0] = evir;
04910                 msg->has_evir = 1;                              
04911                 msg->sourceNode = thisIndex.z;
04912                 msg->ny = nz;
04913                 float *md = msg->qgrid;
04914                 const float *d = data;
04915                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04916                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04917                                 for ( int k=0; k<nz; ++k ) {
04918                                         *(md++) = d[2*(j*nz+k)];
04919                                         *(md++) = d[2*(j*nz+k)+1];
04920                                 }
04921                         }
04922                 }
04923                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04924 #if USE_NODE_PAR_RECEIVE
04925         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04926     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
04927         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04928 #else
04929         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04930 #endif
04931         }
04932 
04933     CmiEnableUrgentSend(0);
04934         //send PmeUntransMsg with has_evir=0
04935         for(int isend=unL; isend<=unH; isend++) {
04936                 int jb = send_order[isend];
04937                 int ny = block2;
04938                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04939                 PmeUntransMsg *msg = new (0,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04940                 msg->has_evir = 0;
04941                 msg->sourceNode = thisIndex.z;
04942                 msg->ny = nz;
04943                 float *md = msg->qgrid;
04944                 const float *d = data;
04945                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04946                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04947                                 for ( int k=0; k<nz; ++k ) {
04948                                         *(md++) = d[2*(j*nz+k)];
04949                                         *(md++) = d[2*(j*nz+k)+1];
04950                                 }
04951                         }
04952                 }
04953                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04954             CmiEnableUrgentSend(1);
04955 #if USE_NODE_PAR_RECEIVE
04956         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04957         //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
04958 #if Z_PERSIST 
04959         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04960 #endif
04961         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04962 #if Z_PERSIST 
04963         CmiUsePersistentHandle(NULL, 0);
04964 #endif
04965 #else
04966 #if Z_PERSIST 
04967         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04968 #endif
04969         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04970 #if Z_PERSIST 
04971         CmiUsePersistentHandle(NULL, 0);
04972 #endif
04973 #endif
04974     CmiEnableUrgentSend(0);
04975         }
04976 }
04977 
04978 void PmeYPencil::send_untrans() {
04979 #if USE_PERSISTENT
04980   if (untrans_handle == NULL) setup_persistent();
04981 #endif
04982 #if     USE_CKLOOP
04983   Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04984   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS) {
04985           int yBlocks = initdata.yBlocks;
04986           int evirIdx = 0;
04987           for ( int isend=0; isend<yBlocks; ++isend ) {
04988                   int jb = send_order[isend];
04989                   if (needs_reply[jb]) {
04990                           evirIdx = isend;
04991                           break;
04992                   }
04993           }
04994 
04995           //basically: 
04996           //[0,evirIdx-1]->send PmeAckMsg
04997           //evirIdx->send PmeUntransMsg with has_evir=1
04998           //[evirIdx+1, yBlocks-1]->send PmeUntransMsg with has_evir=0
04999           //send_subset_untrans(0, yBlocks-1, evirIdx);
05000 #if USE_NODE_PAR_RECEIVE      
05001           //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
05002           CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
05003       evir = 0.;
05004       CmiMemoryWriteFence();
05005 #else
05006       //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
05007           CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
05008 #endif
05009           return;
05010   }
05011 #endif
05012   int yBlocks = initdata.yBlocks;
05013   int block2 = initdata.grid.block2;
05014   int K2 = initdata.grid.K2;
05015   int send_evir = 1;
05016   for ( int isend=0; isend<yBlocks; ++isend ) {
05017     int jb = send_order[isend];
05018     if ( ! needs_reply[jb] ) {
05019       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
05020       CmiEnableUrgentSend(1);
05021       SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
05022       initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
05023       CmiEnableUrgentSend(0);
05024       continue;
05025     }
05026     int ny = block2;
05027     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
05028     PmeUntransMsg *msg = new (send_evir,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
05029     if ( send_evir ) {
05030       msg->evir[0] = evir;
05031       msg->has_evir = 1;
05032       send_evir = 0;
05033     } else {
05034       msg->has_evir = 0;
05035     }
05036     msg->sourceNode = thisIndex.z;
05037     msg->ny = nz;
05038     float *md = msg->qgrid;
05039     const float *d = data;
05040     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
05041      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
05042       for ( int k=0; k<nz; ++k ) {
05043         *(md++) = d[2*(j*nz+k)];
05044         *(md++) = d[2*(j*nz+k)+1];
05045       }
05046      }
05047     }
05048     SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
05049 
05050     CmiEnableUrgentSend(1);
05051 #if USE_NODE_PAR_RECEIVE
05052     msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
05053     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
05054 #if Z_PERSIST 
05055     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05056 #endif
05057     initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
05058 #if Z_PERSIST
05059     CmiUsePersistentHandle(NULL, 0);
05060 #endif
05061 #else
05062 #if Z_PERSIST 
05063     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05064 #endif
05065     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
05066 #if Z_PERSIST 
05067     CmiUsePersistentHandle(NULL, 0);
05068 #endif
05069 #endif    
05070     CmiEnableUrgentSend(0);
05071   }
05072   
05073 #if USE_NODE_PAR_RECEIVE
05074   evir = 0.;
05075   CmiMemoryWriteFence();
05076 #endif
05077 }
05078 
05079 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
05080 #if ! USE_NODE_PAR_RECEIVE
05081     if(imsg==0) evir=0.;
05082 #endif
05083 
05084   if ( msg->has_evir ) evir += msg->evir[0];
05085   int block3 = initdata.grid.block3;
05086   int dim3 = initdata.grid.dim3;
05087   int kb = msg->sourceNode;
05088   int nz = msg->ny;
05089   const float *md = msg->qgrid;
05090   float *d = data;
05091   for ( int i=0; i<nx; ++i ) {
05092 #if CMK_BLUEGENEL
05093     CmiNetworkProgress();
05094 #endif   
05095     for ( int j=0; j<ny; ++j, d += dim3 ) {
05096       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
05097 #ifdef ZEROCHECK
05098         if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
05099                                     thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
05100 #endif
05101         d[2*k] = *(md++);
05102         d[2*k+1] = *(md++);
05103       }
05104     }
05105   }
05106 }
05107 
05108 void PmeZPencil::backward_fft() {
05109 #ifdef NAMD_FFTW
05110 #ifdef MANUAL_DEBUG_FFTW3
05111   dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05112 #endif
05113 #ifdef NAMD_FFTW_3
05114 #if     USE_CKLOOP
05115   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05116   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT) {
05117           //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
05118           //transform the above loop
05119           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
05120           return;
05121   }
05122 #endif
05123   fftwf_execute(backward_plan);
05124 #else
05125   rfftwnd_complex_to_real(backward_plan, nx*ny,
05126             (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
05127 #endif
05128 #ifdef MANUAL_DEBUG_FFTW3
05129   dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05130 #endif
05131 
05132 #endif
05133   
05134 #if CMK_BLUEGENEL
05135   CmiNetworkProgress();
05136 #endif
05137 
05138 #ifdef FFTCHECK
05139   int dim3 = initdata.grid.dim3;
05140   int K1 = initdata.grid.K1;
05141   int K2 = initdata.grid.K2;
05142   int K3 = initdata.grid.K3;
05143   float scale = 1. / (1. * K1 * K2 * K3);
05144   float maxerr = 0.;
05145   float maxstd = 0.;
05146   int mi, mj, mk;  mi = mj = mk = -1;
05147   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05148   const float *d = data;
05149   for ( int i=0; i<nx; ++i ) {
05150    for ( int j=0; j<ny; ++j, d += dim3 ) {
05151     for ( int k=0; k<K3; ++k ) {
05152       float std = 10. * (10. * (10. * std_base + i) + j) + k;
05153       float err = scale * d[k] - std;
05154       if ( fabsf(err) > fabsf(maxerr) ) {
05155         maxerr = err;
05156         maxstd = std;
05157         mi = i;  mj = j;  mk = k;
05158       }
05159     }
05160    }
05161   }
05162   CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
05163                 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
05164 #endif
05165 
05166 }
05167 
05168 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
05169         //to take advantage of the interface which allows 3 user params at most.
05170         //under such situtation, no new parameter list needs to be created!! -Chao Mei
05171         int specialIdx = paraNum;
05172         PmeZPencil *zpencil = (PmeZPencil *)param;
05173         zpencil->send_subset_ungrid(first, last, specialIdx);
05174 }
05175 
05176 void PmeZPencil::send_all_ungrid() {
05177 /* 
05178 //Original code: the transformation is to first extract the msg 
05179 //idx that will has evir value set. -Chao Mei  
05180         int send_evir = 1;
05181         for (int imsg=0; imsg < grid_msgs.size(); ++imsg ) {
05182                 PmeGridMsg *msg = grid_msgs[imsg];
05183                 if ( msg->hasData ) {
05184                         if ( send_evir ) {
05185                                 msg->evir[0] = evir;
05186                                 send_evir = 0;
05187                         } else {
05188                                 msg->evir[0] = 0.;
05189                         }
05190                 }
05191                 send_ungrid(msg);
05192         }
05193 */
05194         int evirIdx = 0;
05195         for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
05196                 if(grid_msgs[imsg]->hasData) {
05197                         evirIdx = imsg;
05198                         break;
05199                 }
05200         }
05201 
05202 #if     USE_CKLOOP
05203         Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
05204         if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS) {
05205                 //????What's the best value for numChunks?????
05206 #if USE_NODE_PAR_RECEIVE        
05207                 //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1); //has to sync
05208                 CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
05209 #else
05210         //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 0); //not sync
05211                 CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0); //not sync
05212 #endif        
05213                 return;
05214         }
05215 #endif
05216         send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
05217 }
05218 
05219 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
05220         for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
05221                 PmeGridMsg *msg = grid_msgs[imsg];
05222                 if ( msg->hasData) {
05223                         if (imsg == specialIdx) {
05224                                 msg->evir[0] = evir;                            
05225                         } else {
05226                                 msg->evir[0] = 0.;
05227                         }
05228                 }
05229                 send_ungrid(msg);
05230         }
05231 }
05232 
05233 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
05234   int pe = msg->sourceNode;
05235   if ( ! msg->hasData ) {
05236     delete msg;
05237     PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
05238     SET_PRIORITY(ackmsg,sequence,PME_UNGRID_PRIORITY)
05239     CmiEnableUrgentSend(1);
05240     initdata.pmeProxy[pe].recvAck(ackmsg);
05241     CmiEnableUrgentSend(0);
05242     return;
05243   }
05244   msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
05245   int dim3 = initdata.grid.dim3;
05246   int zlistlen = msg->zlistlen;
05247   int *zlist = msg->zlist;
05248   char *fmsg = msg->fgrid;
05249   float *qmsg = msg->qgrid;
05250   float *d = data;
05251   int numGrids = 1;  // pencil FFT doesn't support multiple grids
05252   for ( int g=0; g<numGrids; ++g ) {
05253 #if CMK_BLUEGENEL
05254     CmiNetworkProgress();
05255 #endif    
05256     for ( int i=0; i<nx; ++i ) {
05257       for ( int j=0; j<ny; ++j, d += dim3 ) {
05258         if( *(fmsg++) ) {
05259           for ( int k=0; k<zlistlen; ++k ) {
05260             *(qmsg++) = d[zlist[k]];
05261           }
05262         }
05263       }
05264     }
05265   }
05266   SET_PRIORITY(msg,sequence,PME_UNGRID_PRIORITY)
05267     CmiEnableUrgentSend(1);
05268   initdata.pmeProxy[pe].recvUngrid(msg);
05269     CmiEnableUrgentSend(0);
05270 }
05271 
05272 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
05273 {
05274 #if USE_NODE_PAR_RECEIVE
05275   CmiLock(ComputePmeMgr::fftw_plan_lock);
05276   CmiMemoryReadFence();
05277 #endif
05278   recv_grid(msg);
05279   if(msg->hasData) hasData=msg->hasData;
05280   int limsg;
05281   CmiMemoryAtomicFetchAndInc(imsg,limsg);
05282   grid_msgs[limsg] = msg;
05283   //  CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);      
05284   if(limsg+1 == grid_msgs.size())
05285     {
05286 
05287       if (hasData)
05288         {
05289           forward_fft();
05290         }
05291       send_trans();
05292       imsg=0;
05293       CmiMemoryWriteFence();
05294       //      CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
05295     }
05296 #if USE_NODE_PAR_RECEIVE
05297   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05298   CmiMemoryWriteFence();
05299 #endif
05300 }
05301 
05302 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
05303 {
05304 #if USE_NODE_PAR_RECEIVE
05305   CmiLock(ComputePmeMgr::fftw_plan_lock);
05306   CmiMemoryReadFence();
05307 #endif    
05308   recv_untrans(msg);
05309   int limsg;
05310   CmiMemoryAtomicFetchAndInc(imsgb,limsg);
05311   if(limsg+1 == initdata.zBlocks)
05312     {
05313       if(hasData) // maybe this should be an assert
05314         {
05315           backward_fft();
05316         }
05317         
05318         send_all_ungrid();
05319     /*  int send_evir = 1;
05320       // TODO: this part should use Chao's output parallelization
05321       for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
05322         PmeGridMsg *omsg = grid_msgs[limsg];
05323         if ( omsg->hasData ) {
05324           if ( send_evir ) {
05325             omsg->evir[0] = evir;
05326             send_evir = 0;
05327           } else {
05328             omsg->evir[0] = 0.;
05329           }
05330         }
05331         send_ungrid(omsg);
05332       } */
05333       imsgb=0;
05334       evir = 0;
05335       memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3); 
05336       CmiMemoryWriteFence();
05337       //      CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
05338     }
05339 #if USE_NODE_PAR_RECEIVE
05340   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05341   CmiMemoryWriteFence();
05342 #endif
05343 }
05344 
05345 
05346 #include "ComputePmeMgr.def.h"
05347 

Generated on Sat May 25 04:07:16 2013 for NAMD by  doxygen 1.3.9.1