Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 //#define MANUAL_DEBUG_FFTW3 1
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012 // fftw2 doesn't have these defined
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024 
00025 #include <vector>
00026 using namespace std;
00027 
00028 #include "InfoStream.h"
00029 #include "Node.h"
00030 #include "PatchMap.h"
00031 #include "PatchMap.inl"
00032 #include "AtomMap.h"
00033 #include "ComputePme.h"
00034 #include "ComputePmeMgr.decl.h"
00035 #include "PmeRealSpace.h"
00036 #include "PmeKSpace.h"
00037 #include "ComputeNonbondedUtil.h"
00038 #include "PatchMgr.h"
00039 #include "Molecule.h"
00040 #include "ReductionMgr.h"
00041 #include "ComputeMgr.h"
00042 #include "ComputeMgr.decl.h"
00043 // #define DEBUGM
00044 #define MIN_DEBUG_LEVEL 3
00045 #include "Debug.h"
00046 #include "SimParameters.h"
00047 #include "WorkDistrib.h"
00048 #include "varsizemsg.h"
00049 #include "Random.h"
00050 #include "ckhashtable.h"
00051 #include "Priorities.h"
00052 
00053 #include "ComputeMoa.h"
00054 #include "ComputeMoaMgr.decl.h" 
00055 
00056 //#define     USE_RANDOM_TOPO         1
00057 
00058 //#define USE_TOPO_SFC                    1
00059 //#define     USE_NODEHELPER                1
00060 #include "TopoManager.h"
00061 
00062 #ifndef SQRT_PI
00063 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00064 #endif
00065 
00066 #if CMK_PERSISTENT_COMM
00067 #define USE_PERSISTENT      1
00068 #endif
00069 
00070 //#define USE_NODE_PAR_RECEIVE    1
00071 
00072 char *pencilPMEProcessors;
00073 
00074 class PmeAckMsg : public CMessage_PmeAckMsg {
00075 };
00076 
00077 class PmeGridMsg : public CMessage_PmeGridMsg {
00078 public:
00079 
00080   int sourceNode;
00081   int sequence;
00082   int hasData;
00083   Lattice lattice;
00084   PmeReduction *evir;
00085   int start;
00086   int len;
00087   int zlistlen;
00088   int *zlist;
00089   char *fgrid;
00090   float *qgrid;
00091   CkArrayIndex3D destElem;
00092 };
00093 
00094 class PmeTransMsg : public CMessage_PmeTransMsg {
00095 public:
00096 
00097   int sourceNode;
00098   int sequence;
00099   int hasData;
00100   Lattice lattice;
00101   int x_start;
00102   int nx;
00103   float *qgrid;
00104   CkArrayIndex3D destElem;
00105 };
00106 
00107 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
00108 public:
00109   PmeTransMsg *msg;
00110   int *count;
00111   CmiNodeLock lock;
00112 };
00113 
00114 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00115 public:
00116 
00117   int sourceNode;
00118   int has_evir;
00119   PmeReduction *evir;
00120   int y_start;
00121   int ny;
00122   float *qgrid;
00123   CkArrayIndex3D destElem;
00124 };
00125 
00126 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
00127 public:
00128   PmeUntransMsg *msg;
00129   int *count;
00130   CmiNodeLock lock;
00131 };
00132 
00133 class PmePencilMap : public CBase_PmePencilMap {
00134 public:
00135   PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00136     : ia(i_a), ib(i_b), nb(n_b),
00137       size(n), data(newcopyint(n,d)) {
00138   }
00139   virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00140     //Return an ``arrayHdl'', given some information about the array
00141     return 0;
00142   }
00143   virtual int procNum(int, const CkArrayIndex &i) {
00144     //Return the home processor number for this element of this array
00145     return data[ i.data()[ia] * nb + i.data()[ib] ];
00146   }
00147   virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00148     int mype = CkMyPe();
00149     for ( int i=0; i < size; ++i ) {
00150       if ( data[i] == mype ) {
00151         CkArrayIndex3D ai(0,0,0);
00152         ai.data()[ia] = i / nb;
00153         ai.data()[ib] = i % nb;
00154         if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00155         if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00156         mgr->insertInitial(ai,msg);
00157         msg = 0;
00158       }
00159     }
00160     mgr->doneInserting();
00161     if ( msg ) CkFreeMsg(msg);
00162   }
00163 private:
00164   const int ia, ib, nb, size;
00165   const int* const data;
00166   static int* newcopyint(int n, int *d) {
00167     int *newd = new int[n];
00168     memcpy(newd, d, n*sizeof(int));
00169     return newd;
00170   }
00171 };
00172 
00173 // use this idiom since messages don't have copy constructors
00174 struct PmePencilInitMsgData {
00175   PmeGrid grid;
00176   int xBlocks, yBlocks, zBlocks;
00177   CProxy_PmeXPencil xPencil;
00178   CProxy_PmeYPencil yPencil;
00179   CProxy_PmeZPencil zPencil;
00180   CProxy_ComputePmeMgr pmeProxy;
00181   CProxy_NodePmeMgr pmeNodeProxy;
00182   CProxy_PmePencilMap xm;
00183   CProxy_PmePencilMap ym;
00184   CProxy_PmePencilMap zm;
00185 };
00186 
00187 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00188 public:
00189    PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00190    PmePencilInitMsgData data;
00191 };
00192 
00193 
00194 struct LocalPmeInfo {
00195   int nx, x_start;
00196   int ny_after_transpose, y_start_after_transpose;
00197 };
00198 
00199 struct NodePmeInfo {
00200   int npe, pe_start, real_node;
00201 };
00202 
00203 
00204 //Assigns gridPeMap and transPeMap to the same set of processors.
00205 void generatePmePeList(int *peMap, int numPes){
00206   // decide which pes to use by bit reversal
00207   int i;
00208   int ncpus = CkNumPes();
00209   
00210   // find next highest power of two
00211   int npow2 = 1;  int nbits = 0;
00212   while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00213   
00214   // build bit reversal sequence
00215   SortableResizeArray<int> seq(ncpus);
00216   i = 0;
00217   for ( int icpu=0; icpu<ncpus; ++icpu ) {
00218     int ri;
00219     for ( ri = ncpus; ri >= ncpus; ++i ) {
00220       ri = 0;
00221       int pow2 = 1;
00222       int rpow2 = npow2 / 2;
00223       for ( int j=0; j<nbits; ++j ) {
00224         ri += rpow2 * ( ( i / pow2 ) % 2 );
00225         pow2 *= 2;  rpow2 /= 2;
00226       }
00227     }
00228     seq[icpu] = ri;
00229   }
00230   
00231   // extract and sort PME locations
00232   for ( i=0; i<numPes; ++i ) {
00233     seq[i] = seq[ncpus - numPes + i];
00234   }
00235   seq.resize(numPes);
00236   seq.sort();
00237   
00238   for ( i=0; i<numPes; ++i ) 
00239       peMap[i] = seq[i];
00240 
00241   //peMap[0] = 0;
00242 }
00243 
00244 //Assigns gridPeMap and transPeMap to different set of processors.
00245 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00246   // decide which pes to use by bit reversal
00247   int i;
00248   int ncpus = CkNumPes();
00249   
00250   // find next highest power of two
00251   int npow2 = 1;  int nbits = 0;
00252   while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00253   
00254   // build bit reversal sequence
00255   SortableResizeArray<int> seq(ncpus);
00256   SortableResizeArray<int> seq2(ncpus);
00257   i = 0;
00258   for ( int icpu=0; icpu<ncpus; ++icpu ) {
00259     int ri;
00260     for ( ri = ncpus; ri >= ncpus; ++i ) {
00261       ri = 0;
00262       int pow2 = 1;
00263       int rpow2 = npow2 / 2;
00264       for ( int j=0; j<nbits; ++j ) {
00265         ri += rpow2 * ( ( i / pow2 ) % 2 );
00266         pow2 *= 2;  rpow2 /= 2;
00267       }
00268     }
00269     seq[icpu] = ri;
00270     seq2[icpu] = ri;
00271   }
00272   
00273   // extract and sort PME locations
00274   for ( i=0; i<numGridPes; ++i ) {
00275     seq[i] = seq[ncpus - numGridPes + i];
00276   }
00277   seq.resize(numGridPes);
00278   seq.sort();
00279   int firstTransPe = ncpus - numGridPes - numTransPes;
00280   if ( firstTransPe < 0 ) {
00281     firstTransPe = 0;
00282     // 0 should be first in list, skip if possible
00283     if ( ncpus > numTransPes ) firstTransPe = 1;
00284   }
00285   for ( i=0; i<numTransPes; ++i ) {
00286     seq2[i] = seq2[firstTransPe + i];
00287   }
00288   seq2.resize(numTransPes);
00289   seq2.sort();
00290   
00291   for ( i=0; i<numGridPes; ++i ) 
00292     gridPeMap[i] = seq[i];
00293 
00294   for ( i=0; i<numTransPes; ++i ) 
00295     transPeMap[i] = seq2[i];
00296 }
00297 
00298 #if USE_TOPOMAP 
00299 //Topology aware PME allocation
00300 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00301                              int nbpes=0);
00302 #endif
00303 
00304 struct ijpair {
00305   int i,j;
00306   ijpair() {;}
00307   ijpair(int I, int J) : i(I), j(J) {;}
00308 };
00309 
00310 class ComputePmeMgr : public BOCclass {
00311 public:
00312   friend class ComputePme;
00313   friend class NodePmeMgr;
00314   ComputePmeMgr();
00315   ~ComputePmeMgr();
00316 
00317   void initialize(CkQdMsg*);
00318   void initialize_pencils(CkQdMsg*);
00319   void activate_pencils(CkQdMsg*);
00320   void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00321 
00322   void sendGrid(void);
00323   void recvGrid(PmeGridMsg *);
00324   void gridCalc1(void);
00325   void sendTransBarrier(void);
00326   void sendTrans(void);
00327   void fwdSharedTrans(PmeTransMsg *);
00328   void recvSharedTrans(PmeSharedTransMsg *);
00329   void recvTrans(PmeTransMsg *);
00330   void procTrans(PmeTransMsg *);
00331   void gridCalc2(void);
00332   #ifdef OPENATOM_VERSION
00333   void gridCalc2Moa(void);
00334   #endif // OPENATOM_VERSION
00335   void gridCalc2R(void);
00336   void fwdSharedUntrans(PmeUntransMsg *);
00337   void recvSharedUntrans(PmeSharedUntransMsg *);
00338   void sendUntrans(void);
00339   void recvUntrans(PmeUntransMsg *);
00340   void procUntrans(PmeUntransMsg *);
00341   void gridCalc3(void);
00342   void sendUngrid(void);
00343   void recvUngrid(PmeGridMsg *);
00344   void recvAck(PmeAckMsg *);
00345   void ungridCalc(void);
00346 
00347   void setCompute(ComputePme *c) { pmeCompute = c; c->setMgr(this); }
00348 
00349   //Tells if the current processor is a PME processor or not. Called by NamdCentralLB
00350   int isPmeProcessor(int p);  
00351 
00352 #ifdef NAMD_FFTW
00353   static CmiNodeLock fftw_plan_lock;
00354 #endif
00355 
00356 private:
00357   CProxy_ComputePmeMgr pmeProxy;
00358   CProxy_ComputePmeMgr pmeProxyDir;
00359   CProxy_NodePmeMgr pmeNodeProxy;
00360   
00361   ComputePme *pmeCompute;
00362   PmeGrid myGrid;
00363   Lattice lattice;
00364   PmeKSpace *myKSpace;
00365   float *qgrid;
00366   float *kgrid;
00367 
00368 #ifdef NAMD_FFTW
00369 #ifdef NAMD_FFTW_3
00370   fftwf_plan *forward_plan_x, *backward_plan_x;
00371   fftwf_plan *forward_plan_yz, *backward_plan_yz;
00372   fftwf_complex *work;
00373 #else
00374   fftw_plan forward_plan_x, backward_plan_x;
00375   rfftwnd_plan forward_plan_yz, backward_plan_yz;
00376   fftw_complex *work;
00377 #endif
00378 #else
00379   float *work;
00380 #endif
00381 
00382   int alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
00383   int alchDecouple;
00384   BigReal alchElecLambdaStart;
00385   BigReal elecLambdaUp;
00386   BigReal elecLambdaDown;
00387 
00388   
00389   LocalPmeInfo *localInfo;
00390   NodePmeInfo *gridNodeInfo;
00391   NodePmeInfo *transNodeInfo;
00392   int qgrid_size;
00393   int qgrid_start;
00394   int qgrid_len;
00395   int fgrid_start;
00396   int fgrid_len;
00397 
00398   int numSources;
00399   int numGridPes;
00400   int numTransPes;
00401   int numGridNodes;
00402   int numTransNodes;
00403   int numDestRecipPes;
00404   int myGridPe, myGridNode;
00405   int myTransPe, myTransNode;
00406   int *gridPeMap;
00407   int *transPeMap;
00408   int *recipPeDest;
00409   int *gridPeOrder;
00410   int *gridNodeOrder;
00411   int *transNodeOrder;
00412   char *isPmeFlag;
00413   int grid_count;
00414   int trans_count;
00415   int untrans_count;
00416   int ungrid_count;
00417   PmeGridMsg **gridmsg_reuse;
00418   PmeReduction recip_evir[PME_MAX_EVALS];
00419   PmeReduction recip_evir2[PME_MAX_EVALS];
00420 
00421   int sequence;  // used for priorities
00422   int useBarrier;
00423   int sendTransBarrier_received;
00424 
00425   int usePencils;
00426   int xBlocks, yBlocks, zBlocks;
00427   CProxy_PmeXPencil xPencil;
00428   CProxy_PmeYPencil yPencil;
00429   CProxy_PmeZPencil zPencil;
00430   char *pencilActive;
00431   ijpair *activePencils;
00432   int numPencilsActive;
00433 };
00434 
00435 #ifdef NAMD_FFTW
00436   CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00437 #endif
00438 
00439 int isPmeProcessor(int p){ 
00440   return CProxy_ComputePmeMgr::ckLocalBranch(CkpvAccess(BOCclass_group).computePmeMgr)->isPmeProcessor(p);
00441 }
00442 
00443 int ComputePmeMgr::isPmeProcessor(int p){ 
00444   return ( usePencils ? pencilPMEProcessors[p] : isPmeFlag[p] );
00445 }
00446 
00447 class NodePmeMgr : public CBase_NodePmeMgr {
00448 public:
00449   friend class ComputePmeMgr;
00450   NodePmeMgr();
00451   ~NodePmeMgr();
00452   void initialize();
00453   void recvTrans(PmeTransMsg *);
00454   void recvUntrans(PmeUntransMsg *);
00455   void registerXPencil(CkArrayIndex3D, PmeXPencil *);
00456   void registerYPencil(CkArrayIndex3D, PmeYPencil *);
00457   void registerZPencil(CkArrayIndex3D, PmeZPencil *);
00458   void recvXTrans(PmeTransMsg *);
00459   void recvYTrans(PmeTransMsg *);
00460   void recvYUntrans(PmeUntransMsg *);
00461   void recvZGrid(PmeGridMsg *);
00462   void recvZUntrans(PmeUntransMsg *);
00463 
00464   void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
00465       xm=_xm; ym=_ym; zm=_zm;
00466   }
00467   CProxy_PmePencilMap xm;
00468   CProxy_PmePencilMap ym;
00469   CProxy_PmePencilMap zm;
00470 
00471 private:
00472   CProxy_ComputePmeMgr mgrProxy;
00473   ComputePmeMgr *mgrObject;
00474   ComputePmeMgr **mgrObjects;
00475   CProxy_PmeXPencil xPencil;
00476   CProxy_PmeYPencil yPencil;
00477   CProxy_PmeZPencil zPencil;
00478   CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
00479   CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
00480   CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;  
00481 };
00482 
00483 NodePmeMgr::NodePmeMgr() {
00484   mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
00485 }
00486 
00487 NodePmeMgr::~NodePmeMgr() {
00488   delete [] mgrObjects;
00489 }
00490 
00491 void NodePmeMgr::initialize() {
00492   CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
00493   mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
00494   if ( CkMyRank() == 0 ) {
00495     mgrProxy = proxy;
00496     mgrObject = proxy.ckLocalBranch();
00497   }
00498 }
00499 
00500 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
00501   mgrObject->fwdSharedTrans(msg);
00502 }
00503 
00504 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
00505   mgrObject->fwdSharedUntrans(msg);
00506 }
00507 
00508 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
00509 {
00510   CmiLock(ComputePmeMgr::fftw_plan_lock);
00511   xPencilObj.put(idx)=obj;
00512   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00513 }
00514 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
00515 {
00516   CmiLock(ComputePmeMgr::fftw_plan_lock);
00517   yPencilObj.put(idx)=obj;
00518   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00519 }
00520 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
00521 {
00522   CmiLock(ComputePmeMgr::fftw_plan_lock);
00523   zPencilObj.put(idx)=obj;
00524   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00525 }
00526 
00527 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
00528                                  pmeProxyDir(thisgroup), pmeCompute(0) {
00529 
00530   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00531   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00532 
00533   pmeNodeProxy.ckLocalBranch()->initialize();
00534 
00535 #ifdef NAMD_FFTW
00536   if ( CmiMyRank() == 0 ) {
00537     fftw_plan_lock = CmiCreateLock();
00538   }
00539 #endif
00540 
00541   myKSpace = 0;
00542   kgrid = 0;
00543   work = 0;
00544   grid_count = 0;
00545   trans_count = 0;
00546   untrans_count = 0;
00547   ungrid_count = 0;
00548   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00549   useBarrier = 0;
00550   sendTransBarrier_received = 0;
00551   usePencils = 0;
00552 }
00553 
00554 
00555 void ComputePmeMgr::recvArrays(
00556         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00557   xPencil = x;  yPencil = y;  zPencil = z;
00558   
00559     if(CmiMyRank()==0)
00560     {
00561       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00562       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00563       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00564     }
00565 }
00566 
00567 #if USE_TOPO_SFC
00568  struct Coord
00569   {
00570     int x, y, z;
00571     Coord(): x(0), y(0), z(0) {}
00572     Coord(int a, int b, int c): x(a), y(b), z(c) {}
00573   };
00574   extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00575 
00576   void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00577   {
00578      SortableResizeArray<int> newprocs(procs.size());
00579      int num = 0;
00580      for (int i=0; i<result.size(); i++) {
00581        Coord &c = result[i];
00582        for (int j=0; j<procs.size(); j++) {
00583          int pe = procs[j];
00584          int x,y,z,t;
00585          tmgr.rankToCoordinates(pe, x, y, z, t);    
00586          if (x==c.x && y==c.y && z==c.z)
00587            newprocs[num++] = pe;
00588        }
00589      } 
00590      CmiAssert(newprocs.size() == procs.size());
00591      procs = newprocs;
00592   }
00593 
00594   int find_level_grid(int x) 
00595   {
00596      int a = sqrt(x);
00597      int b;
00598      for (; a>0; a--) {
00599        if (x%a == 0) break;
00600      }
00601      if (a==1) a = x;
00602      b = x/a;
00603      //return a>b?a:b;
00604      return b;
00605   }
00606   CmiNodeLock tmgr_lock;
00607 #endif
00608 
00609 void Pme_init()
00610 {
00611 #if USE_TOPO_SFC
00612   if (CkMyRank() == 0) 
00613     tmgr_lock = CmiCreateLock();
00614 #endif
00615 }
00616 
00617 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00618   delete msg;
00619 
00620   localInfo = new LocalPmeInfo[CkNumPes()];
00621   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00622   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00623   gridPeMap = new int[CkNumPes()];
00624   transPeMap = new int[CkNumPes()];
00625   recipPeDest = new int[CkNumPes()];
00626   gridPeOrder = new int[CkNumPes()];
00627   gridNodeOrder = new int[CkNumNodes()];
00628   transNodeOrder = new int[CkNumNodes()];
00629   isPmeFlag = new char[CkNumPes()];  
00630 
00631   SimParameters *simParams = Node::Object()->simParameters;
00632   PatchMap *patchMap = PatchMap::Object();
00633 
00634   alchFepOn = simParams->alchFepOn;
00635   alchThermIntOn = simParams->alchThermIntOn;
00636   alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
00637   alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? 
00638     simParams->alchElecLambdaStart : 0;
00639   if (alchFepOn || alchThermIntOn) {
00640     numGrids = 2;
00641     if (alchDecouple) numGrids += 2;
00642     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00643   }
00644   else numGrids = 1;
00645   lesOn = simParams->lesOn;
00646   useBarrier = simParams->PMEBarrier;
00647   if ( lesOn ) {
00648     lesFactor = simParams->lesFactor;
00649     numGrids = lesFactor;
00650   }
00651   selfOn = 0;
00652   pairOn = simParams->pairInteractionOn;
00653   if ( pairOn ) {
00654     selfOn = simParams->pairInteractionSelf;
00655     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00656     numGrids = selfOn ? 1 : 3;
00657   }
00658 
00659   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00660   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00661   else {
00662     int nrps = simParams->PMEProcessors;
00663     if ( nrps <= 0 ) nrps = CkNumPes();
00664     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00665     int dimx = simParams->PMEGridSizeX;
00666     int dimy = simParams->PMEGridSizeY;
00667     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00668     if ( maxslabs > nrps ) maxslabs = nrps;
00669     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00670                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00671     if ( maxpencils > nrps ) maxpencils = nrps;
00672     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00673     else usePencils = 0;
00674   }
00675 
00676   if ( usePencils ) {
00677     int nrps = simParams->PMEProcessors;
00678     if ( nrps <= 0 ) nrps = CkNumPes();
00679     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00680     if ( simParams->PMEPencils > 1 &&
00681          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00682       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00683     } else {
00684       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00685                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00686       if ( nb2 > nrps ) nb2 = nrps;
00687       int nb = (int) sqrt((float)nb2);
00688       xBlocks = zBlocks = nb;
00689       yBlocks = nb2 / nb;
00690     }
00691 
00692     int dimx = simParams->PMEGridSizeX;
00693     int bx = 1 + ( dimx - 1 ) / xBlocks;
00694     xBlocks = 1 + ( dimx - 1 ) / bx;
00695 
00696     int dimy = simParams->PMEGridSizeY;
00697     int by = 1 + ( dimy - 1 ) / yBlocks;
00698     yBlocks = 1 + ( dimy - 1 ) / by;
00699 
00700     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00701     int bz = 1 + ( dimz - 1 ) / zBlocks;
00702     zBlocks = 1 + ( dimz - 1 ) / bz;
00703 
00704     if ( ! CkMyPe() ) {
00705       iout << iINFO << "PME using " << xBlocks << " x " <<
00706         yBlocks << " x " << zBlocks <<
00707         " pencil grid for FFT and reciprocal sum.\n" << endi;
00708     }
00709   } else { // usePencils
00710 
00711   {  // decide how many pes to use for reciprocal sum
00712 
00713     // rules based on work available
00714     int minslices = simParams->PMEMinSlices;
00715     int dimx = simParams->PMEGridSizeX;
00716     int nrpx = ( dimx + minslices - 1 ) / minslices;
00717     int dimy = simParams->PMEGridSizeY;
00718     int nrpy = ( dimy + minslices - 1 ) / minslices;
00719 
00720     // rules based on processors available
00721     int nrpp = CkNumPes();
00722     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00723     if ( nrpp < nrpx ) nrpx = nrpp;
00724     if ( nrpp < nrpy ) nrpy = nrpp;
00725 
00726     // user override
00727     int nrps = simParams->PMEProcessors;
00728     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00729     if ( nrps > 0 ) nrpx = nrps;
00730     if ( nrps > 0 ) nrpy = nrps;
00731 
00732     // make sure there aren't any totally empty processors
00733     int bx = ( dimx + nrpx - 1 ) / nrpx;
00734     nrpx = ( dimx + bx - 1 ) / bx;
00735     int by = ( dimy + nrpy - 1 ) / nrpy;
00736     nrpy = ( dimy + by - 1 ) / by;
00737     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00738       NAMD_bug("Error in selecting number of PME processors.");
00739     if ( by != ( dimy + nrpy - 1 ) / nrpy )
00740       NAMD_bug("Error in selecting number of PME processors.");
00741 
00742     numGridPes = nrpx;
00743     numTransPes = nrpy;
00744   }
00745   if ( ! CkMyPe() ) {
00746     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
00747       " processors for FFT and reciprocal sum.\n" << endi;
00748   }
00749 
00750   int sum_npes = numTransPes + numGridPes;
00751   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
00752 
00753 #if 0 // USE_TOPOMAP
00754   /* This code is being disabled permanently for slab PME on Blue Gene machines */
00755   PatchMap * pmap = PatchMap::Object();
00756   
00757   int patch_pes = pmap->numNodesWithPatches();
00758   TopoManager tmgr;
00759   if(tmgr.hasMultipleProcsPerNode())
00760     patch_pes *= 2;
00761 
00762   bool done = false;
00763   if(CkNumPes() > 2*sum_npes + patch_pes) {    
00764     done = generateBGLORBPmePeList(transPeMap, numTransPes);
00765     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
00766   }
00767   else 
00768     if(CkNumPes() > 2 *max_npes + patch_pes) {
00769       done = generateBGLORBPmePeList(transPeMap, max_npes);
00770       gridPeMap = transPeMap;
00771     }
00772 
00773   if (!done)
00774 #endif
00775     {
00776       //generatePmePeList(transPeMap, max_npes);
00777       //gridPeMap = transPeMap;
00778       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
00779     }
00780   
00781   myGridPe = -1;
00782   myGridNode = -1;
00783   int i = 0;
00784   for ( i=0; i<CkNumPes(); ++i )
00785     isPmeFlag[i] = 0;
00786   int node = -1;
00787   int real_node = -1;
00788   for ( i=0; i<numGridPes; ++i ) {
00789     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
00790     isPmeFlag[gridPeMap[i]] |= 1;
00791     int real_node_i = CkNodeOf(gridPeMap[i]);
00792     if ( real_node_i == real_node ) {
00793       gridNodeInfo[node].npe += 1;
00794     } else {
00795       real_node = real_node_i;
00796       ++node;
00797       gridNodeInfo[node].real_node = real_node;
00798       gridNodeInfo[node].pe_start = i;
00799       gridNodeInfo[node].npe = 1;
00800     }
00801     if ( CkMyNode() == real_node_i ) myGridNode = node;
00802   }
00803   numGridNodes = node + 1;
00804   myTransPe = -1;
00805   myTransNode = -1;
00806   node = -1;
00807   real_node = -1;
00808   for ( i=0; i<numTransPes; ++i ) {
00809     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
00810     isPmeFlag[transPeMap[i]] |= 2;
00811     int real_node_i = CkNodeOf(transPeMap[i]);
00812     if ( real_node_i == real_node ) {
00813       transNodeInfo[node].npe += 1;
00814     } else {
00815       real_node = real_node_i;
00816       ++node;
00817       transNodeInfo[node].real_node = real_node;
00818       transNodeInfo[node].pe_start = i;
00819       transNodeInfo[node].npe = 1;
00820     }
00821     if ( CkMyNode() == real_node_i ) myTransNode = node;
00822   }
00823   numTransNodes = node + 1;
00824 
00825   if ( ! CkMyPe() ) {
00826     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
00827          << numTransNodes << " TRANS NODES\n" << endi;
00828     iout << iINFO << "PME GRID LOCATIONS:";
00829     int i;
00830     for ( i=0; i<numGridPes && i<10; ++i ) {
00831       iout << " " << gridPeMap[i];
00832     }
00833     if ( i < numGridPes ) iout << " ...";
00834     iout << "\n" << endi;
00835     iout << iINFO << "PME TRANS LOCATIONS:";
00836     for ( i=0; i<numTransPes && i<10; ++i ) {
00837       iout << " " << transPeMap[i];
00838     }
00839     if ( i < numTransPes ) iout << " ...";
00840     iout << "\n" << endi;
00841   }
00842 
00843   { // generate random orderings for grid and trans messages
00844     int i;
00845     for ( i = 0; i < numGridPes; ++i ) {
00846       gridPeOrder[i] = i;
00847     }
00848     Random rand(CkMyPe());
00849     if ( myGridPe < 0 ) {
00850       rand.reorder(gridPeOrder,numGridPes);
00851     } else {  // self last
00852       gridPeOrder[myGridPe] = numGridPes-1;
00853       gridPeOrder[numGridPes-1] = myGridPe;
00854       rand.reorder(gridPeOrder,numGridPes-1);
00855     } 
00856     for ( i = 0; i < numGridNodes; ++i ) {
00857       gridNodeOrder[i] = i;
00858     }
00859     if ( myGridNode < 0 ) {
00860       rand.reorder(gridNodeOrder,numGridNodes);
00861     } else {  // self last
00862       gridNodeOrder[myGridNode] = numGridNodes-1;
00863       gridNodeOrder[numGridNodes-1] = myGridNode;
00864       rand.reorder(gridNodeOrder,numGridNodes-1);
00865     }
00866     for ( i = 0; i < numTransNodes; ++i ) {
00867       transNodeOrder[i] = i;
00868     }
00869     if ( myTransNode < 0 ) {
00870       rand.reorder(transNodeOrder,numTransNodes);
00871     } else {  // self last
00872       transNodeOrder[myTransNode] = numTransNodes-1;
00873       transNodeOrder[numTransNodes-1] = myTransNode;
00874       rand.reorder(transNodeOrder,numTransNodes-1);
00875     }
00876   }
00877   
00878   } // ! usePencils
00879 
00880   myGrid.K1 = simParams->PMEGridSizeX;
00881   myGrid.K2 = simParams->PMEGridSizeY;
00882   myGrid.K3 = simParams->PMEGridSizeZ;
00883   myGrid.order = simParams->PMEInterpOrder;
00884   myGrid.dim2 = myGrid.K2;
00885   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
00886 
00887   if ( ! usePencils ) {
00888     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
00889     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
00890     myGrid.block3 = myGrid.dim3 / 2;  // complex
00891   }
00892 
00893   if ( usePencils ) {
00894     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
00895     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
00896     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
00897 
00898 
00899       int pe = 0;
00900       int x,y,z;
00901 
00902                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
00903                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
00904                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
00905                 {
00906                 int basepe = 0;  int npe = CkNumPes();
00907                         if ( npe > xBlocks*yBlocks &&
00908                                 npe > xBlocks*zBlocks &&
00909                                 npe > yBlocks*zBlocks ) {
00910                         // avoid node 0
00911                         ++basepe;
00912                         --npe;
00913                 }
00914 
00915       
00916                 // decide which pes to use by bit reversal and patch use
00917                 int i;
00918                 int ncpus = CkNumPes();
00919   
00920                 // find next highest power of two
00921                 int npow2 = 1;  int nbits = 0;
00922                 while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00923   
00924                 // build bit reversal sequence
00925                 SortableResizeArray<int> patches, nopatches, pmeprocs;
00926                 PatchMap *pmap = PatchMap::Object();
00927                 i = 0;
00928                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
00929                         int ri;
00930                         for ( ri = ncpus; ri >= ncpus; ++i ) {
00931                                 ri = 0;
00932                                 int pow2 = 1;
00933                                 int rpow2 = npow2 / 2;
00934                                 for ( int j=0; j<nbits; ++j ) {
00935                                 ri += rpow2 * ( ( i / pow2 ) % 2 );
00936                                 pow2 *= 2;  rpow2 /= 2;
00937                                 }
00938                         }
00939                         // seq[icpu] = ri;
00940                         if ( ri ) { // keep 0 for special case
00941                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
00942                                 else nopatches.add(ri);
00943                         }
00944                 }
00945 
00946 #if USE_RANDOM_TOPO
00947             Random rand(CkMyPe());
00948             int *tmp = new int[patches.size()];
00949             int nn = patches.size();
00950             for (i=0;i<nn;i++)  tmp[i] = patches[i];
00951             rand.reorder(tmp, nn);
00952             patches.resize(0);
00953             for (i=0;i<nn;i++)  patches.add(tmp[i]);
00954             delete [] tmp;
00955             tmp = new int[nopatches.size()];
00956             nn = nopatches.size();
00957             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
00958             rand.reorder(tmp, nn);
00959             nopatches.resize(0);
00960             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
00961             delete [] tmp;
00962 #endif
00963 
00964                 // only use zero if it eliminates overloading or has patches
00965                 int useZero = 0;
00966                 int npens = xBlocks*yBlocks;
00967                 if ( npens % ncpus == 0 ) useZero = 1;
00968                 if ( npens == nopatches.size() + 1 ) useZero = 1;
00969                 npens += xBlocks*zBlocks;
00970                 if ( npens % ncpus == 0 ) useZero = 1;
00971                 if ( npens == nopatches.size() + 1 ) useZero = 1;
00972                 npens += yBlocks*zBlocks;
00973                 if ( npens % ncpus == 0 ) useZero = 1;
00974                 if ( npens == nopatches.size() + 1 ) useZero = 1;
00975 
00976                 // add nopatches then patches in reversed order
00977                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
00978                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00979                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
00980                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00981   
00982                 int npes = pmeprocs.size();
00983                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
00984                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
00985 #if !USE_RANDOM_TOPO
00986                 zprocs.sort();
00987 #endif
00988                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
00989                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
00990 #if !USE_RANDOM_TOPO
00991                 yprocs.sort();
00992 #endif
00993       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
00994       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
00995 #if !USE_RANDOM_TOPO
00996       xprocs.sort();
00997 #endif
00998 
00999 #if USE_TOPO_SFC
01000   CmiLock(tmgr_lock);
01001   //{
01002   TopoManager tmgr;
01003   int xdim = tmgr.getDimNX();
01004   int ydim = tmgr.getDimNY();
01005   int zdim = tmgr.getDimNZ();
01006   int xdim1 = find_level_grid(xdim);
01007   int ydim1 = find_level_grid(ydim);
01008   int zdim1 = find_level_grid(zdim);
01009   if(CkMyPe() == 0)
01010       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01011 
01012   vector<Coord> result;
01013   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01014   sort_sfc(xprocs, tmgr, result);
01015   sort_sfc(yprocs, tmgr, result);
01016   sort_sfc(zprocs, tmgr, result);
01017   //}
01018   CmiUnlock(tmgr_lock);
01019 #endif
01020 
01021       pencilPMEProcessors = new char [CkNumPes()];
01022       memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
01023 
01024 
01025                 if(CkMyPe() == 0){  
01026               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01027           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01028 #if USE_TOPO_SFC
01029               int x,y,z,t;
01030               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01031               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01032 #else
01033               iout << " " << zprocs[i];
01034 #endif
01035           }
01036           if ( i < zprocs.size() ) iout << " ...";
01037               iout << "\n" << endi;
01038                 }
01039 
01040       for (pe=0, x = 0; x < xBlocks; ++x)
01041         for (y = 0; y < yBlocks; ++y, ++pe ) {
01042           pencilPMEProcessors[zprocs[pe]] = 1;
01043         }
01044      
01045                 if(CkMyPe() == 0){  
01046               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01047           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01048 #if USE_TOPO_SFC
01049               int x,y,z,t;
01050               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01051               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01052 #else
01053               iout << " " << yprocs[i];
01054 #endif
01055           }
01056           if ( i < yprocs.size() ) iout << " ...";
01057               iout << "\n" << endi;
01058                 }
01059 
01060       for (pe=0, z = 0; z < zBlocks; ++z )
01061         for (x = 0; x < xBlocks; ++x, ++pe ) {
01062           pencilPMEProcessors[yprocs[pe]] = 1;
01063         }
01064     
01065                 if(CkMyPe() == 0){  
01066                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01067                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01068 #if USE_TOPO_SFC
01069                 int x,y,z,t;
01070                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01071                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01072 #else
01073                 iout << " " << xprocs[i];
01074 #endif
01075             }
01076                 if ( i < xprocs.size() ) iout << " ...";
01077                 iout << "\n" << endi;
01078                 }
01079 
01080       for (pe=0, y = 0; y < yBlocks; ++y )      
01081         for (z = 0; z < zBlocks; ++z, ++pe ) {
01082           pencilPMEProcessors[xprocs[pe]] = 1;
01083         }
01084         
01085     }
01086 
01087 
01088         // creating the pencil arrays
01089         if ( CkMyPe() == 0 ){
01090 #if 1
01091         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01092         CProxy_PmePencilMap ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());
01093         CProxy_PmePencilMap xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());
01094         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01095         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01096         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01097         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01098 #if CHARM_VERSION > 60301
01099         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01100         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01101         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01102 #endif
01103         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01104         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01105         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01106 #else
01107         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01108         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01109         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01110 
01111                 for (pe=0, x = 0; x < xBlocks; ++x)
01112                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01113                                 zPencil(x,y,0).insert(zprocs[pe]);
01114                         }
01115         zPencil.doneInserting();
01116 
01117                 for (pe=0, z = 0; z < zBlocks; ++z )
01118                         for (x = 0; x < xBlocks; ++x, ++pe ) {
01119                                 yPencil(x,0,z).insert(yprocs[pe]);
01120                         }
01121         yPencil.doneInserting();
01122 
01123 
01124                 for (pe=0, y = 0; y < yBlocks; ++y )    
01125                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01126                                 xPencil(0,y,z).insert(xprocs[pe]);
01127                         }
01128                 xPencil.doneInserting();     
01129 #endif
01130 
01131                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01132                 PmePencilInitMsgData msgdata;
01133                 msgdata.grid = myGrid;
01134                 msgdata.xBlocks = xBlocks;
01135                 msgdata.yBlocks = yBlocks;
01136                 msgdata.zBlocks = zBlocks;
01137                 msgdata.xPencil = xPencil;
01138                 msgdata.yPencil = yPencil;
01139                 msgdata.zPencil = zPencil;
01140                 msgdata.pmeProxy = pmeProxyDir;
01141         msgdata.pmeNodeProxy = pmeNodeProxy;
01142         msgdata.xm = xm;
01143         msgdata.ym = ym;
01144         msgdata.zm = zm;
01145                 xPencil.init(new PmePencilInitMsg(msgdata));
01146                 yPencil.init(new PmePencilInitMsg(msgdata));
01147                 zPencil.init(new PmePencilInitMsg(msgdata));
01148         }
01149 
01150     return;  // continue in initialize_pencils() at next startup stage
01151   }
01152 
01153 
01154   int pe;
01155   int nx = 0;
01156   for ( pe = 0; pe < numGridPes; ++pe ) {
01157     localInfo[pe].x_start = nx;
01158     nx += myGrid.block1;
01159     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01160     localInfo[pe].nx = nx - localInfo[pe].x_start;
01161   }
01162   int ny = 0;
01163   for ( pe = 0; pe < numTransPes; ++pe ) {
01164     localInfo[pe].y_start_after_transpose = ny;
01165     ny += myGrid.block2;
01166     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01167     localInfo[pe].ny_after_transpose =
01168                         ny - localInfo[pe].y_start_after_transpose;
01169   }
01170 
01171   {  // decide how many pes this node exchanges charges with
01172 
01173   PatchMap *patchMap = PatchMap::Object();
01174   Lattice lattice = simParams->lattice;
01175   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01176   BigReal cutoff = simParams->cutoff;
01177   BigReal patchdim = simParams->patchDimension;
01178   int numPatches = patchMap->numPatches();
01179   int numNodes = CkNumPes();
01180   int *source_flags = new int[numNodes];
01181   int node;
01182   for ( node=0; node<numNodes; ++node ) {
01183     source_flags[node] = 0;
01184     recipPeDest[node] = 0;
01185   }
01186 
01187   // // make sure that we don't get ahead of ourselves on this node
01188   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01189   //   source_flags[CkMyPe()] = 1;
01190   //   recipPeDest[myRecipPe] = 1;
01191   // }
01192 
01193   for ( int pid=0; pid < numPatches; ++pid ) {
01194     int pnode = patchMap->node(pid);
01195     BigReal minx = patchMap->min_a(pid);
01196     BigReal maxx = patchMap->max_a(pid);
01197     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01198     // min1 (max1) is smallest (largest) grid line for this patch
01199     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
01200     int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
01201     for ( int i=min1; i<=max1; ++i ) {
01202       int ix = i;
01203       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01204       while ( ix < 0 ) ix += myGrid.K1;
01205       // set source_flags[pnode] if this patch sends to our node
01206       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01207            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01208         source_flags[pnode] = 1;
01209       }
01210       // set dest_flags[] for node that our patch sends to
01211       if ( pnode == CkMyPe() ) {
01212         recipPeDest[ix / myGrid.block1] = 1;
01213       }
01214     }
01215   }
01216 
01217   numSources = 0;
01218   numDestRecipPes = 0;
01219   for ( node=0; node<numNodes; ++node ) {
01220     if ( source_flags[node] ) ++numSources;
01221     if ( recipPeDest[node] ) ++numDestRecipPes;
01222   }
01223 
01224 #if 0
01225   if ( numSources ) {
01226     iout << iINFO << "PME " << CkMyPe() << " sources:";
01227     for ( node=0; node<numNodes; ++node ) {
01228       if ( source_flags[node] ) iout << " " << node;
01229     }
01230     iout << "\n" << endi;
01231   }
01232 #endif
01233 
01234   delete [] source_flags;
01235 
01236   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01237   //           CkMyPe(), numSources, numDestRecipPes);
01238 
01239   }  // decide how many pes this node exchanges charges with (end)
01240 
01241   ungrid_count = numDestRecipPes;
01242 
01243   sendTransBarrier_received = 0;
01244 
01245   if ( myGridPe < 0 && myTransPe < 0 ) return;
01246   // the following only for nodes doing reciprocal sum
01247 
01248   if ( myTransPe >= 0 ) {
01249       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01250       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01251       #ifdef OPENATOM_VERSION
01252       if ( simParams->openatomOn ) { 
01253         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01254         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01255       } else {
01256         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01257       }
01258       #else  // OPENATOM_VERSION
01259       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01260       #endif // OPENATOM_VERSION
01261   }
01262 
01263   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01264   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01265   if ( local_size < local_size_2 ) local_size = local_size_2;
01266   qgrid = new float[local_size*numGrids];
01267   if ( numGridPes > 1 || numTransPes > 1 ) {
01268     kgrid = new float[local_size*numGrids];
01269   } else {
01270     kgrid = qgrid;
01271   }
01272   qgrid_size = local_size;
01273 
01274   if ( myGridPe >= 0 ) {
01275   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01276   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01277   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01278   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01279   }
01280 
01281   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01282 #ifdef NAMD_FFTW
01283   CmiLock(fftw_plan_lock);
01284 #ifdef NAMD_FFTW_3
01285   work = new fftwf_complex[n[0]];
01286   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01287   if ( myGridPe >= 0 ) {
01288     forward_plan_yz=new fftwf_plan[numGrids];
01289     backward_plan_yz=new fftwf_plan[numGrids];
01290   }
01291   if ( myTransPe >= 0 ) {
01292     forward_plan_x=new fftwf_plan[numGrids];
01293     backward_plan_x=new fftwf_plan[numGrids];
01294   }
01295   /* need one plan per grid */
01296   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01297   if ( myGridPe >= 0 ) {
01298     for( int g=0; g<numGrids; g++)
01299       {
01300         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01301                                                      localInfo[myGridPe].nx,
01302                                                      qgrid + qgrid_size * g,
01303                                                      NULL,
01304                                                      1,
01305                                                      myGrid.dim2 * myGrid.dim3,
01306                                                      (fftwf_complex *) 
01307                                                      qgrid + qgrid_size * g,
01308                                                      NULL,
01309                                                      1,
01310                                                      myGrid.dim2 * (myGrid.dim3/2),
01311                                                      fftwFlags);
01312       }
01313   }
01314   int zdim = myGrid.dim3;
01315   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01316   if ( ! CkMyPe() ) iout << " 2..." << endi;
01317   if ( myTransPe >= 0 ) {
01318     for( int g=0; g<numGrids; g++)
01319       {
01320 
01321         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01322                                                 (fftwf_complex *)
01323                                                 (kgrid+qgrid_size*g),
01324                                                 NULL,
01325                                                 xStride,
01326                                                 1,
01327                                                 (fftwf_complex *)
01328                                                 (kgrid+qgrid_size*g),
01329                                                 NULL,
01330                                                 xStride,
01331                                                 1,
01332                                                 FFTW_FORWARD,fftwFlags);
01333         
01334       }
01335   }
01336   if ( ! CkMyPe() ) iout << " 3..." << endi;
01337   if ( myTransPe >= 0 ) {
01338     for( int g=0; g<numGrids; g++)
01339       {
01340         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01341                                                  (fftwf_complex *)
01342                                                  (kgrid+qgrid_size*g),
01343                                                  NULL,
01344                                                  xStride,
01345                                                  1,
01346                                                  (fftwf_complex *)
01347                                                  (kgrid+qgrid_size*g),
01348                                                  NULL,
01349                                                  xStride,
01350                                                  1,
01351                                                  FFTW_BACKWARD, fftwFlags);
01352 
01353       }
01354   }
01355   if ( ! CkMyPe() ) iout << " 4..." << endi;
01356   if ( myGridPe >= 0 ) {
01357     for( int g=0; g<numGrids; g++)
01358       {
01359         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01360                                                       localInfo[myGridPe].nx,
01361                                                       (fftwf_complex *)
01362                                                       qgrid + qgrid_size * g,
01363                                                       NULL,
01364                                                       1,
01365                                                       myGrid.dim2*(myGrid.dim3/2),
01366                                                       qgrid + qgrid_size * g,
01367                                                       NULL,
01368                                                       1,
01369                                                       myGrid.dim2 * myGrid.dim3,
01370                                                       fftwFlags);
01371       }
01372   }
01373   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01374 
01375 #else
01376   work = new fftw_complex[n[0]];
01377 
01378   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01379   if ( myGridPe >= 0 ) {
01380   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01381         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01382         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01383   }
01384   if ( ! CkMyPe() ) iout << " 2..." << endi;
01385   if ( myTransPe >= 0 ) {
01386       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01387         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01388         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01389         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01390   }
01391   if ( ! CkMyPe() ) iout << " 3..." << endi;
01392   if ( myTransPe >= 0 ) {
01393   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01394         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01395         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01396         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01397   }
01398   if ( ! CkMyPe() ) iout << " 4..." << endi;
01399   if ( myGridPe >= 0 ) {
01400   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01401         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01402         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01403   }
01404   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01405 #endif
01406   CmiUnlock(fftw_plan_lock);
01407 #else
01408   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01409 #endif
01410 
01411   if ( myGridPe >= 0 && numSources == 0 )
01412                 NAMD_bug("PME grid elements exist without sources.");
01413   grid_count = numSources;
01414   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01415   trans_count = numGridPes;
01416 }
01417 
01418 
01419 
01420 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
01421   delete msg;
01422   if ( ! usePencils ) return;
01423 
01424   SimParameters *simParams = Node::Object()->simParameters;
01425 
01426   PatchMap *patchMap = PatchMap::Object();
01427   Lattice lattice = simParams->lattice;
01428   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01429   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01430   BigReal cutoff = simParams->cutoff;
01431   BigReal patchdim = simParams->patchDimension;
01432   int numPatches = patchMap->numPatches();
01433 
01434   pencilActive = new char[xBlocks*yBlocks];
01435   for ( int i=0; i<xBlocks; ++i ) {
01436     for ( int j=0; j<yBlocks; ++j ) {
01437       pencilActive[i*yBlocks+j] = 0;
01438     }
01439   }
01440 
01441   for ( int pid=0; pid < numPatches; ++pid ) {
01442     int pnode = patchMap->node(pid);
01443     if ( pnode != CkMyPe() ) continue;
01444 
01445     BigReal minx = patchMap->min_a(pid);
01446     BigReal maxx = patchMap->max_a(pid);
01447     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01448     // min1 (max1) is smallest (largest) grid line for this patch
01449     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
01450     int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
01451 
01452     BigReal miny = patchMap->min_b(pid);
01453     BigReal maxy = patchMap->max_b(pid);
01454     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01455     // min2 (max2) is smallest (largest) grid line for this patch
01456     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) - myGrid.order + 1;
01457     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb)));
01458 
01459     for ( int i=min1; i<=max1; ++i ) {
01460       int ix = i;
01461       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01462       while ( ix < 0 ) ix += myGrid.K1;
01463       for ( int j=min2; j<=max2; ++j ) {
01464         int jy = j;
01465         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01466         while ( jy < 0 ) jy += myGrid.K2;
01467         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01468       }
01469     }
01470   }
01471 
01472   numPencilsActive = 0;
01473   for ( int i=0; i<xBlocks; ++i ) {
01474     for ( int j=0; j<yBlocks; ++j ) {
01475       if ( pencilActive[i*yBlocks+j] ) {
01476         ++numPencilsActive;
01477         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01478       }
01479     }
01480   }
01481   activePencils = new ijpair[numPencilsActive];
01482   numPencilsActive = 0;
01483   for ( int i=0; i<xBlocks; ++i ) {
01484     for ( int j=0; j<yBlocks; ++j ) {
01485       if ( pencilActive[i*yBlocks+j] ) {
01486         activePencils[numPencilsActive++] = ijpair(i,j);
01487       }
01488     }
01489   }
01490   Random rand(CkMyPe());
01491   rand.reorder(activePencils,numPencilsActive);
01492   //if ( numPencilsActive ) {
01493   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01494   //}
01495 
01496   ungrid_count = numPencilsActive;
01497 }
01498 
01499 
01500 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01501   if ( ! usePencils ) return;
01502   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01503 }
01504 
01505 
01506 ComputePmeMgr::~ComputePmeMgr() {
01507 
01508 #ifdef NAMD_FFTW
01509   if ( CmiMyRank() == 0 ) {
01510     CmiDestroyLock(fftw_plan_lock);
01511   }
01512 #endif
01513 
01514   delete myKSpace;
01515   delete [] localInfo;
01516   delete [] gridNodeInfo;
01517   delete [] transNodeInfo;
01518   delete [] gridPeMap;
01519   delete [] transPeMap;
01520   delete [] recipPeDest;
01521   delete [] gridPeOrder;
01522   delete [] gridNodeOrder;
01523   delete [] transNodeOrder;
01524   delete [] isPmeFlag;
01525   delete [] qgrid;
01526   if ( kgrid != qgrid ) delete [] kgrid;
01527   delete [] work;
01528   delete [] gridmsg_reuse;
01529 }
01530 
01531 void ComputePmeMgr::sendGrid(void) {
01532   pmeCompute->sendData(numGridPes,gridPeOrder,recipPeDest,gridPeMap);
01533 }
01534 
01535 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01536   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01537   if ( grid_count == 0 ) {
01538     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01539   }
01540   if ( grid_count == numSources ) {
01541     lattice = msg->lattice;
01542     sequence = msg->sequence;
01543   }
01544 
01545   int zdim = myGrid.dim3;
01546   int zlistlen = msg->zlistlen;
01547   int *zlist = msg->zlist;
01548   float *qmsg = msg->qgrid;
01549   for ( int g=0; g<numGrids; ++g ) {
01550     char *f = msg->fgrid + fgrid_len * g;
01551     float *q = qgrid + qgrid_size * g;
01552     for ( int i=0; i<fgrid_len; ++i ) {
01553       if ( f[i] ) {
01554         for ( int k=0; k<zlistlen; ++k ) {
01555           q[zlist[k]] += *(qmsg++);
01556         }
01557       }
01558       q += zdim;
01559     }
01560   }
01561 
01562   gridmsg_reuse[numSources-grid_count] = msg;
01563   --grid_count;
01564 
01565   if ( grid_count == 0 ) {
01566     pmeProxyDir[CkMyPe()].gridCalc1();
01567     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01568   }
01569 }
01570 #ifdef MANUAL_DEBUG_FFTW3
01571 
01572 /* utility functions for manual debugging */
01573 void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
01574 {
01575 
01576   char fmt[1000];
01577   char filename[1000];
01578   strncpy(fmt,infilename,999);
01579   strncat(fmt,"_%d.out",999);
01580   sprintf(filename,fmt, pe);
01581   FILE *loutfile = fopen(filename, "w");
01582 #ifdef PAIRCALC_TEST_DUMP
01583   fprintf(loutfile,"%d\n",ydim);
01584 #endif
01585   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01586   for(int i=0;i<xdim;i++)
01587     for(int j=0;j<ydim;j++)
01588       for(int k=0;k<zdim;k++)
01589         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01590   fclose(loutfile);
01591 
01592 }
01593 
01594 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01595 {
01596   char fmt[1000];
01597   char filename[1000];
01598   strncpy(fmt,infilename,999);
01599   strncat(fmt,"_%d_%d_%d.out",999);
01600   sprintf(filename,fmt, x,y,z);
01601   FILE *loutfile = fopen(filename, "w");
01602   CkAssert(loutfile!=NULL);
01603   CkPrintf("opened %s for dump\n",filename);
01604   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01605   for(int i=0;i<xdim;i++)
01606     for(int j=0;j<ydim;j++)
01607       for(int k=0;k<zdim;k++)
01608         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01609   fclose(loutfile);
01610 }
01611 
01612 #endif
01613 
01614 void ComputePmeMgr::gridCalc1(void) {
01615   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01616 
01617 #ifdef NAMD_FFTW
01618   for ( int g=0; g<numGrids; ++g ) {
01619 #ifdef NAMD_FFTW_3
01620     fftwf_execute(forward_plan_yz[g]);
01621 #else
01622     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01623         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01624 #endif
01625 
01626   }
01627 #endif
01628 
01629   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01630 }
01631 
01632 void ComputePmeMgr::sendTransBarrier(void) {
01633   sendTransBarrier_received += 1;
01634   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01635   if ( sendTransBarrier_received < numGridPes ) return;
01636   sendTransBarrier_received = 0;
01637   for ( int i=0; i<numGridPes; ++i ) {
01638     pmeProxyDir[gridPeMap[i]].sendTrans();
01639   }
01640 }
01641 
01642 void ComputePmeMgr::sendTrans(void) {
01643   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01644 
01645   // send data for transpose
01646   int zdim = myGrid.dim3;
01647   int nx = localInfo[myGridPe].nx;
01648   int x_start = localInfo[myGridPe].x_start;
01649   int slicelen = myGrid.K2 * zdim;
01650 
01651   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01652 
01653 #if CMK_BLUEGENEL
01654   CmiNetworkProgressAfter (0);
01655 #endif
01656 
01657   for (int j=0; j<numTransNodes; j++) {
01658     int node = transNodeOrder[j];  // different order on each node
01659     int pe = transNodeInfo[node].pe_start;
01660     int npe = transNodeInfo[node].npe;
01661     int totlen = 0;
01662     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01663       LocalPmeInfo &li = localInfo[pe];
01664       int cpylen = li.ny_after_transpose * zdim;
01665       totlen += cpylen;
01666     }
01667     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01668                                 PRIORITY_SIZE) PmeTransMsg;
01669     newmsg->sourceNode = myGridPe;
01670     newmsg->lattice = lattice;
01671     newmsg->x_start = x_start;
01672     newmsg->nx = nx;
01673     for ( int g=0; g<numGrids; ++g ) {
01674       float *qmsg = newmsg->qgrid + nx * totlen * g;
01675       pe = transNodeInfo[node].pe_start;
01676       for (int i=0; i<npe; ++i, ++pe) {
01677         LocalPmeInfo &li = localInfo[pe];
01678         int cpylen = li.ny_after_transpose * zdim;
01679         if ( node == myTransNode ) {
01680           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01681           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01682         }
01683         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01684         for ( int x = 0; x < nx; ++x ) {
01685           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01686           q += slicelen;
01687           qmsg += cpylen;
01688         }
01689       }
01690     }
01691     newmsg->sequence = sequence;
01692     SET_PRIORITY(newmsg,sequence,PME_TRANS_PRIORITY)
01693     if ( node == myTransNode ) newmsg->nx = 0;
01694     if ( npe > 1 ) {
01695       if ( node == myTransNode ) fwdSharedTrans(newmsg);
01696       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
01697     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
01698   }
01699  
01700   untrans_count = numTransPes;
01701 
01702 }
01703 
01704 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
01705   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
01706   int pe = transNodeInfo[myTransNode].pe_start;
01707   int npe = transNodeInfo[myTransNode].npe;
01708   CmiNodeLock lock = CmiCreateLock();
01709   int *count = new int; *count = npe;
01710   for (int i=0; i<npe; ++i, ++pe) {
01711     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
01712     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
01713     shmsg->msg = msg;
01714     shmsg->count = count;
01715     shmsg->lock = lock;
01716     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
01717   }
01718 }
01719 
01720 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
01721   procTrans(msg->msg);
01722   CmiLock(msg->lock);
01723   int count = --(*msg->count);
01724   CmiUnlock(msg->lock);
01725   if ( count == 0 ) {
01726     CmiDestroyLock(msg->lock);
01727     delete msg->count;
01728     delete msg->msg;
01729   }
01730   delete msg;
01731 }
01732 
01733 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
01734   procTrans(msg);
01735   delete msg;
01736 }
01737 
01738 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
01739   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
01740   if ( trans_count == numGridPes ) {
01741     lattice = msg->lattice;
01742     sequence = msg->sequence;
01743   }
01744 
01745  if ( msg->nx ) {
01746   int zdim = myGrid.dim3;
01747   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
01748   int first_pe = nodeInfo.pe_start;
01749   int last_pe = first_pe+nodeInfo.npe-1;
01750   int y_skip = localInfo[myTransPe].y_start_after_transpose
01751              - localInfo[first_pe].y_start_after_transpose;
01752   int ny_msg = localInfo[last_pe].y_start_after_transpose
01753              + localInfo[last_pe].ny_after_transpose
01754              - localInfo[first_pe].y_start_after_transpose;
01755   int ny = localInfo[myTransPe].ny_after_transpose;
01756   int x_start = msg->x_start;
01757   int nx = msg->nx;
01758   for ( int g=0; g<numGrids; ++g ) {
01759     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
01760         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
01761         nx*ny*zdim*sizeof(float));
01762   }
01763  }
01764 
01765   --trans_count;
01766 
01767   if ( trans_count == 0 ) {
01768     pmeProxyDir[CkMyPe()].gridCalc2();
01769   }
01770 }
01771 
01772 void ComputePmeMgr::gridCalc2(void) {
01773   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
01774 
01775 #if CMK_BLUEGENEL
01776   CmiNetworkProgressAfter (0);
01777 #endif
01778 
01779   int zdim = myGrid.dim3;
01780   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01781   int ny = localInfo[myTransPe].ny_after_transpose;
01782 
01783   for ( int g=0; g<numGrids; ++g ) {
01784     // finish forward FFT (x dimension)
01785 #ifdef NAMD_FFTW
01786 #ifdef NAMD_FFTW_3
01787     fftwf_execute(forward_plan_x[g]);
01788 #else
01789     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01790         ny * zdim / 2, 1, work, 1, 0);
01791 #endif
01792 #endif
01793   }
01794 
01795 #ifdef OPENATOM_VERSION
01796     if ( ! simParams -> openatomOn ) { 
01797 #endif // OPENATOM_VERSION
01798       gridCalc2R();
01799 #ifdef OPENATOM_VERSION
01800     } else {
01801       gridCalc2Moa();
01802     }
01803 #endif // OPENATOM_VERSION
01804 }
01805 
01806 #ifdef OPENATOM_VERSION
01807 void ComputePmeMgr::gridCalc2Moa(void) {
01808 
01809   int zdim = myGrid.dim3;
01810   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01811   int ny = localInfo[myTransPe].ny_after_transpose;
01812 
01813   SimParameters *simParams = Node::Object()->simParameters;
01814 
01815   CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01816 
01817   for ( int g=0; g<numGrids; ++g ) {
01818     #ifdef OPENATOM_VERSION_DEBUG 
01819     CkPrintf("Sending recQ on processor %d \n", CkMyPe());
01820     for ( int i=0; i<=(ny * zdim / 2); ++i) 
01821     {
01822       CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
01823     }
01824     #endif // OPENATOM_VERSION_DEBUG
01825 //     mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
01826     CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
01827     moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
01828   }
01829 }
01830 #endif // OPENATOM_VERSION
01831 
01832 void ComputePmeMgr::gridCalc2R(void) {
01833 
01834   int zdim = myGrid.dim3;
01835   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01836   int ny = localInfo[myTransPe].ny_after_transpose;
01837 
01838   for ( int g=0; g<numGrids; ++g ) {
01839     // reciprocal space portion of PME
01840     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01841     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
01842                         lattice, ewaldcof, &(recip_evir2[g][1]));
01843     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
01844 
01845     // start backward FFT (x dimension)
01846 
01847 #ifdef NAMD_FFTW
01848 #ifdef NAMD_FFTW_3
01849     fftwf_execute(backward_plan_x[g]);
01850 #else
01851     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01852         ny * zdim / 2, 1, work, 1, 0);
01853 #endif
01854 #endif
01855   }
01856   
01857   pmeProxyDir[CkMyPe()].sendUntrans();
01858 }
01859 
01860 void ComputePmeMgr::sendUntrans(void) {
01861 
01862   int zdim = myGrid.dim3;
01863   int y_start = localInfo[myTransPe].y_start_after_transpose;
01864   int ny = localInfo[myTransPe].ny_after_transpose;
01865   int slicelen = myGrid.K2 * zdim;
01866 
01867   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01868 
01869 #if CMK_BLUEGENEL
01870   CmiNetworkProgressAfter (0);
01871 #endif
01872 
01873   // send data for reverse transpose
01874   for (int j=0; j<numGridNodes; j++) {
01875     int node = gridNodeOrder[j];  // different order on each node
01876     int pe = gridNodeInfo[node].pe_start;
01877     int npe = gridNodeInfo[node].npe;
01878     int totlen = 0;
01879     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
01880       LocalPmeInfo &li = localInfo[pe];
01881       int cpylen = li.nx * zdim;
01882       totlen += cpylen;
01883     }
01884     PmeUntransMsg *newmsg = new (ny * totlen * numGrids,numGrids,
01885                                 PRIORITY_SIZE) PmeUntransMsg;
01886     newmsg->sourceNode = myTransPe;
01887     newmsg->y_start = y_start;
01888     newmsg->ny = ny;
01889     for ( int g=0; g<numGrids; ++g ) {
01890       if ( j == 0 ) {  // only need these once
01891         newmsg->evir[g] = recip_evir2[g];
01892       } else {
01893         newmsg->evir[g] = 0.;
01894       }
01895       float *qmsg = newmsg->qgrid + ny * totlen * g;
01896       pe = gridNodeInfo[node].pe_start;
01897       for (int i=0; i<npe; ++i, ++pe) {
01898         LocalPmeInfo &li = localInfo[pe];
01899         if ( node == myGridNode ) {
01900           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
01901           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
01902           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
01903           int cpylen = ny * zdim;
01904           for ( int x = 0; x < li.nx; ++x ) {
01905             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01906             q += cpylen;
01907             qmsg += slicelen;
01908           }
01909         } else {
01910           CmiMemcpy((void*)qmsg,
01911                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
01912                 li.nx*ny*zdim*sizeof(float));
01913           qmsg += li.nx*ny*zdim;
01914         }
01915       }
01916     }
01917     SET_PRIORITY(newmsg,sequence,PME_UNTRANS_PRIORITY)
01918     if ( node == myGridNode ) newmsg->ny = 0;
01919     if ( npe > 1 ) {
01920       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
01921       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
01922     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
01923   }
01924 
01925   trans_count = numGridPes;
01926 }
01927 
01928 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
01929   int pe = gridNodeInfo[myGridNode].pe_start;
01930   int npe = gridNodeInfo[myGridNode].npe;
01931   CmiNodeLock lock = CmiCreateLock();
01932   int *count = new int; *count = npe;
01933   for (int i=0; i<npe; ++i, ++pe) {
01934     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
01935     shmsg->msg = msg;
01936     shmsg->count = count;
01937     shmsg->lock = lock;
01938     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
01939   }
01940 }
01941 
01942 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
01943   procUntrans(msg->msg);
01944   CmiLock(msg->lock);
01945   int count = --(*msg->count);
01946   CmiUnlock(msg->lock);
01947   if ( count == 0 ) {
01948     CmiDestroyLock(msg->lock);
01949     delete msg->count;
01950     delete msg->msg;
01951   }
01952   delete msg;
01953 }
01954 
01955 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
01956   procUntrans(msg);
01957   delete msg;
01958 }
01959 
01960 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
01961   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
01962   if ( untrans_count == numTransPes ) {
01963     for ( int g=0; g<numGrids; ++g ) {
01964       recip_evir[g] = 0.;
01965     }
01966   }
01967 
01968 #if CMK_BLUEGENEL
01969   CmiNetworkProgressAfter (0);
01970 #endif
01971 
01972   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
01973   int first_pe = nodeInfo.pe_start;
01974   int g;
01975   if ( myGridPe == first_pe ) for ( g=0; g<numGrids; ++g ) {
01976     recip_evir[g] += msg->evir[g];
01977   }
01978 
01979  if ( msg->ny ) {
01980   int zdim = myGrid.dim3;
01981   int last_pe = first_pe+nodeInfo.npe-1;
01982   int x_skip = localInfo[myGridPe].x_start
01983              - localInfo[first_pe].x_start;
01984   int nx_msg = localInfo[last_pe].x_start
01985              + localInfo[last_pe].nx
01986              - localInfo[first_pe].x_start;
01987   int nx = localInfo[myGridPe].nx;
01988   int y_start = msg->y_start;
01989   int ny = msg->ny;
01990   int slicelen = myGrid.K2 * zdim;
01991   int cpylen = ny * zdim;
01992   for ( g=0; g<numGrids; ++g ) {
01993     float *q = qgrid + qgrid_size * g + y_start * zdim;
01994     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
01995     for ( int x = 0; x < nx; ++x ) {
01996       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
01997       q += slicelen;
01998       qmsg += cpylen;
01999     }
02000   }
02001  }
02002 
02003   --untrans_count;
02004 
02005   if ( untrans_count == 0 ) {
02006     pmeProxyDir[CkMyPe()].gridCalc3();
02007   }
02008 }
02009 
02010 void ComputePmeMgr::gridCalc3(void) {
02011   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02012 
02013   // finish backward FFT
02014 #ifdef NAMD_FFTW
02015 
02016   for ( int g=0; g<numGrids; ++g ) {
02017 #ifdef NAMD_FFTW_3
02018     fftwf_execute(backward_plan_yz[g]);
02019 #else
02020     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02021         (fftw_complex *) (qgrid + qgrid_size * g),
02022         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02023 #endif
02024   }
02025 
02026 #endif
02027 
02028   pmeProxyDir[CkMyPe()].sendUngrid();
02029 }
02030 
02031 void ComputePmeMgr::sendUngrid(void) {
02032 
02033   for ( int j=0; j<numSources; ++j ) {
02034     // int msglen = qgrid_len;
02035     PmeGridMsg *newmsg = gridmsg_reuse[j];
02036     int pe = newmsg->sourceNode;
02037     if ( j == 0 ) {  // only need these once
02038       for ( int g=0; g<numGrids; ++g ) {
02039         newmsg->evir[g] = recip_evir[g];
02040       }
02041     } else {
02042       for ( int g=0; g<numGrids; ++g ) {
02043         newmsg->evir[g] = 0.;
02044       }
02045     }
02046     int zdim = myGrid.dim3;
02047     int flen = newmsg->len;
02048     int fstart = newmsg->start;
02049     int zlistlen = newmsg->zlistlen;
02050     int *zlist = newmsg->zlist;
02051     float *qmsg = newmsg->qgrid;
02052     for ( int g=0; g<numGrids; ++g ) {
02053       char *f = newmsg->fgrid + fgrid_len * g;
02054       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02055       for ( int i=0; i<flen; ++i ) {
02056         if ( f[i] ) {
02057           for ( int k=0; k<zlistlen; ++k ) {
02058             *(qmsg++) = q[zlist[k]];
02059           }
02060         }
02061         q += zdim;
02062       }
02063     }
02064     newmsg->sourceNode = myGridPe;
02065 
02066     SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
02067     CmiEnableUrgentSend(1);
02068     pmeProxyDir[pe].recvUngrid(newmsg);
02069     CmiEnableUrgentSend(0);
02070   }
02071   grid_count = numSources;
02072   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02073 }
02074 
02075 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
02076   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02077   if ( ungrid_count == 0 ) {
02078     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02079   }
02080 
02081   if ( usePencils ) pmeCompute->copyPencils(msg);
02082   else pmeCompute->copyResults(msg);
02083   delete msg;
02084   recvAck(0);
02085 }
02086 
02087 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
02088   if ( msg ) delete msg;
02089   --ungrid_count;
02090 
02091   if ( ungrid_count == 0 ) {
02092     pmeProxyDir[CkMyPe()].ungridCalc();
02093   }
02094 }
02095 
02096 void ComputePmeMgr::ungridCalc(void) {
02097   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02098 
02099   pmeCompute->ungridForces();
02100 
02101   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02102 }
02103 
02104 
02105 ComputePme::ComputePme(ComputeID c) :
02106   ComputeHomePatches(c)
02107 {
02108   DebugM(4,"ComputePme created.\n");
02109   basePriority = PME_PRIORITY;
02110 
02111   CProxy_ComputePmeMgr::ckLocalBranch(
02112         CkpvAccess(BOCclass_group).computePmeMgr)->setCompute(this);
02113 
02114   useAvgPositions = 1;
02115 
02116   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02117 
02118   SimParameters *simParams = Node::Object()->simParameters;
02119 
02120   if (simParams->accelMDOn) {
02121      amd_reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_AMD);
02122   } else {
02123      amd_reduction = NULL;
02124   }
02125 
02126   alchFepOn = simParams->alchFepOn;
02127   alchThermIntOn = simParams->alchThermIntOn;
02128   alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
02129   alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? 
02130     simParams->alchElecLambdaStart : 0;
02131             
02132   if (alchFepOn || alchThermIntOn) {
02133     numGrids = 2;
02134     if (alchDecouple) numGrids += 2;
02135     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
02136   }
02137   else numGrids = 1;
02138   lesOn = simParams->lesOn;
02139   if ( lesOn ) {
02140     lesFactor = simParams->lesFactor;
02141     numGrids = lesFactor;
02142   }
02143   selfOn = 0;
02144   pairOn = simParams->pairInteractionOn;
02145   if ( pairOn ) {
02146     selfOn = simParams->pairInteractionSelf;
02147     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
02148     numGrids = selfOn ? 1 : 3;
02149   }
02150 
02151   myGrid.K1 = simParams->PMEGridSizeX;
02152   myGrid.K2 = simParams->PMEGridSizeY;
02153   myGrid.K3 = simParams->PMEGridSizeZ;
02154   myGrid.order = simParams->PMEInterpOrder;
02155   myGrid.dim2 = myGrid.K2;
02156   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
02157   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02158   fsize = myGrid.K1 * myGrid.dim2;
02159   q_arr = new double*[fsize*numGrids];
02160   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(double*) );
02161   q_list = new double*[fsize*numGrids];
02162   memset( (void*) q_list, 0, fsize*numGrids * sizeof(double*) );
02163   q_count = 0;
02164   f_arr = new char[fsize*numGrids];
02165   memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02166   fz_arr = new char[myGrid.K3];
02167 
02168   for ( int g=0; g<numGrids; ++g ) {
02169     char *f = f_arr + g*fsize;
02170     if ( myMgr->usePencils ) {
02171       int xBlocks = myMgr->xBlocks;
02172       int yBlocks = myMgr->yBlocks;
02173       int K1 = myGrid.K1;
02174       int K2 = myGrid.K2;
02175       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02176       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02177       int dim2 = myGrid.dim2;
02178       const ijpair *activePencils = myMgr->activePencils;
02179       const int numPencilsActive = myMgr->numPencilsActive;
02180       for (int ap=0; ap<numPencilsActive; ++ap) {
02181         int ib = activePencils[ap].i;
02182         int jb = activePencils[ap].j;
02183         int ibegin = ib*block1;
02184         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02185         int jbegin = jb*block2;
02186         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02187         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02188         for ( int i=ibegin; i<iend; ++i ) {
02189           for ( int j=jbegin; j<jend; ++j ) {
02190             f[i*dim2+j] = 0;
02191           }
02192         }
02193       }
02194     } else {
02195       int numRecipPes = myMgr->numGridPes;
02196       int block1 = ( myGrid.K1 + numRecipPes - 1 ) / numRecipPes;
02197       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02198       for (int pe=0; pe<numRecipPes; pe++) {
02199         if ( ! myMgr->recipPeDest[pe] ) continue;
02200         int start = pe * bsize;
02201         int len = bsize;
02202         if ( start >= qsize ) { start = 0; len = 0; }
02203         if ( start + len > qsize ) { len = qsize - start; }
02204         int zdim = myGrid.dim3;
02205         int fstart = start / zdim;
02206         int flen = len / zdim;
02207         memset(f + fstart, 0, flen*sizeof(char));
02208       }
02209     }
02210   }
02211 
02212 #if CMK_PERSISTENT_COMM
02213   recvGrid_handle = NULL;
02214 #endif
02215 }
02216 
02217 ComputePme::~ComputePme()
02218 {
02219   for (int i=0; i<fsize*numGrids; ++i) {
02220     if ( q_arr[i] ) {
02221       delete [] q_arr[i];
02222     }
02223   }
02224   delete [] q_arr;
02225   delete [] q_list;
02226   delete [] f_arr;
02227   delete [] fz_arr;
02228 }
02229 
02230 #if CMK_PERSISTENT_COMM 
02231 void ComputePme::setup_recvgrid_persistent() 
02232 {
02233     int xBlocks = myMgr->xBlocks;
02234     int yBlocks = myMgr->yBlocks;
02235     int zBlocks = myMgr->zBlocks;
02236     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
02237     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
02238     int K1 = myGrid.K1;
02239     int K2 = myGrid.K2;
02240     int dim2 = myGrid.dim2;
02241     int dim3 = myGrid.dim3;
02242     int block1 = myGrid.block1;
02243     int block2 = myGrid.block2;
02244 
02245     const char *pencilActive = myMgr->pencilActive;
02246     const ijpair *activePencils = myMgr->activePencils;
02247     const int numPencilsActive = myMgr->numPencilsActive;
02248     int hd = 1 ;  // has data?
02249 
02250     CkArray *zPencil_local = myMgr->zPencil.ckLocalBranch();
02251     recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
02252     for (int ap=0; ap<numPencilsActive; ++ap) {
02253         int ib = activePencils[ap].i;
02254         int jb = activePencils[ap].j;
02255         int ibegin = ib*block1;
02256         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02257         int jbegin = jb*block2;
02258         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02259         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02260         // f is changing
02261         int fcount = 0;
02262         for ( int g=0; g<numGrids; ++g ) {
02263             char *f = f_arr + g*fsize;
02264             for ( int i=ibegin; i<iend; ++i ) {
02265                 for ( int j=jbegin; j<jend; ++j ) {
02266                     //fcount += f[i*dim2+j];
02267                     fcount += (f[i*dim2+j]>3?f[i*dim2+j]:3);
02268                 }
02269             }
02270         }
02271         int zlistlen = 0;
02272         for ( int i=0; i<myGrid.K3; ++i ) {
02273             if ( 1 ) ++zlistlen;
02274             //if ( fz_arr[i] ) ++zlistlen;
02275         }
02276         int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
02277         recvGrid_handle[ap] =  CmiCreatePersistent(peer, sizeof(PmeGridMsg)  
02278             + sizeof(float)*hd*fcount*zlistlen + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids
02279             + sizeof(envelope)+PRIORITY_SIZE/8 );
02280     }
02281 }
02282 #endif
02283 
02284 
02285 void ComputePme::doWork()
02286 {
02287   DebugM(4,"Entering ComputePme::doWork().\n");
02288 
02289 #ifdef TRACE_COMPUTE_OBJECTS
02290     double traceObjStartTime = CmiWallTimer();
02291 #endif
02292 
02293   ResizeArrayIter<PatchElem> ap(patchList);
02294 
02295   // Skip computations if nothing to do.
02296   if ( ! patchList[0].p->flags.doFullElectrostatics )
02297   {
02298     for (ap = ap.begin(); ap != ap.end(); ap++) {
02299       CompAtom *x = (*ap).positionBox->open();
02300       Results *r = (*ap).forceBox->open();
02301       (*ap).positionBox->close(&x);
02302       (*ap).forceBox->close(&r);
02303     }
02304     reduction->submit();
02305     if (amd_reduction) amd_reduction->submit();
02306     return;
02307   }
02308 
02309   // allocate storage
02310   numLocalAtoms = 0;
02311   for (ap = ap.begin(); ap != ap.end(); ap++) {
02312     numLocalAtoms += (*ap).p->getNumAtoms();
02313   }
02314 
02315   Lattice lattice = patchList[0].p->flags.lattice;
02316 
02317   localData = new PmeParticle[numLocalAtoms*(numGrids+
02318                                         ((numGrids>1 || selfOn)?1:0))];
02319   localPartition = new unsigned char[numLocalAtoms];
02320 
02321   int g;
02322   for ( g=0; g<numGrids; ++g ) {
02323     localGridData[g] = localData + numLocalAtoms*(g+1);
02324   }
02325 
02326   // get positions and charges
02327   PmeParticle * data_ptr = localData;
02328   unsigned char * part_ptr = localPartition;
02329   const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
02330                                 * ComputeNonbondedUtil::dielectric_1 );
02331 
02332   for (ap = ap.begin(); ap != ap.end(); ap++) {
02333 #ifdef NETWORK_PROGRESS
02334     CmiNetworkProgress();
02335 #endif
02336 
02337     CompAtom *x = (*ap).positionBox->open();
02338     // CompAtomExt *xExt = (*ap).p->getCompAtomExtInfo();
02339     if ( patchList[0].p->flags.doMolly ) {
02340       (*ap).positionBox->close(&x);
02341       x = (*ap).avgPositionBox->open();
02342     }
02343     int numAtoms = (*ap).p->getNumAtoms();
02344 
02345     for(int i=0; i<numAtoms; ++i)
02346     {
02347       data_ptr->x = x[i].position.x;
02348       data_ptr->y = x[i].position.y;
02349       data_ptr->z = x[i].position.z;
02350       data_ptr->cg = coulomb_sqrt * x[i].charge;
02351       ++data_ptr;
02352       *part_ptr = x[i].partition;
02353       ++part_ptr;
02354     }
02355 
02356     if ( patchList[0].p->flags.doMolly ) { (*ap).avgPositionBox->close(&x); }
02357     else { (*ap).positionBox->close(&x); }
02358   }
02359 
02360   // copy to other grids if needed
02361   if ( ((alchFepOn || alchThermIntOn) && (!alchDecouple)) || lesOn ) {
02362     for ( g=0; g<numGrids; ++g ) {
02363 #ifdef NETWORK_PROGRESS
02364       CmiNetworkProgress();
02365 #endif
02366 
02367       PmeParticle *lgd = localGridData[g];
02368       int nga = 0;
02369       for(int i=0; i<numLocalAtoms; ++i) {
02370         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02371           // for FEP/TI: grid 0 gets non-alch + partition 1;
02372           // grid 1 gets non-alch + partition 2;
02373           // grid 2 (only if called for with numGrids=3) gets only non-alch
02374           lgd[nga++] = localData[i];
02375         }
02376       }
02377       numGridAtoms[g] = nga;
02378     }
02379   } else if ( (alchFepOn || alchThermIntOn) && alchDecouple) {
02380     // alchemical decoupling: four grids
02381     // g=0: partition 0 and partition 1
02382     // g=1: partition 0 and partition 2
02383     // g=2: only partition 1 atoms
02384     // g=3: only partition 2 atoms
02385     // plus one grid g=4, only partition 0, if numGrids=5
02386     for ( g=0; g<2; ++g ) {  // same as before for first 2
02387 #ifdef NETWORK_PROGRESS
02388       CmiNetworkProgress();
02389 #endif
02390 
02391       PmeParticle *lgd = localGridData[g];
02392       int nga = 0;
02393       for(int i=0; i<numLocalAtoms; ++i) {
02394         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02395           lgd[nga++] = localData[i];
02396         }
02397       }
02398       numGridAtoms[g] = nga;
02399     }
02400     for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
02401 #ifdef NETWORK_PROGRESS
02402       CmiNetworkProgress();
02403 #endif
02404 
02405       PmeParticle *lgd = localGridData[g];
02406       int nga = 0;
02407       for(int i=0; i<numLocalAtoms; ++i) {
02408         if ( localPartition[i] == (g-1) ) {
02409           lgd[nga++] = localData[i];
02410         }
02411       }
02412       numGridAtoms[g] = nga;
02413     }
02414     for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
02415       // numGrids=5 only if alchElecLambdaStart > 0
02416 #ifdef NETWORK_PROGRESS
02417       CmiNetworkProgress();
02418 #endif
02419 
02420       PmeParticle *lgd = localGridData[g];
02421       int nga = 0;
02422       for(int i=0; i<numLocalAtoms; ++i) {
02423         if ( localPartition[i] == 0 ) {
02424           lgd[nga++] = localData[i];
02425         }
02426       }
02427       numGridAtoms[g] = nga;
02428     }
02429   } else if ( selfOn ) {
02430     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
02431     g = 0;
02432     PmeParticle *lgd = localGridData[g];
02433     int nga = 0;
02434     for(int i=0; i<numLocalAtoms; ++i) {
02435       if ( localPartition[i] == 1 ) {
02436         lgd[nga++] = localData[i];
02437       }
02438     }
02439     numGridAtoms[g] = nga;
02440   } else if ( pairOn ) {
02441     if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
02442     g = 0;
02443     PmeParticle *lgd = localGridData[g];
02444     int nga = 0;
02445     for(int i=0; i<numLocalAtoms; ++i) {
02446       if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02447         lgd[nga++] = localData[i];
02448       }
02449     }
02450     numGridAtoms[g] = nga;
02451     for ( g=1; g<3; ++g ) {
02452       PmeParticle *lgd = localGridData[g];
02453       int nga = 0;
02454       for(int i=0; i<numLocalAtoms; ++i) {
02455         if ( localPartition[i] == g ) {
02456           lgd[nga++] = localData[i];
02457         }
02458       }
02459       numGridAtoms[g] = nga;
02460     }
02461   } else {
02462     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
02463     localGridData[0] = localData;
02464     numGridAtoms[0] = numLocalAtoms;
02465   }
02466 
02467   memset( (void*) fz_arr, 0, myGrid.K3 * sizeof(char) );
02468 
02469   for (int i=0; i<q_count; ++i) {
02470     memset( (void*) (q_list[i]), 0, myGrid.dim3 * sizeof(double) );
02471   }
02472 
02473   strayChargeErrors = 0;
02474 
02475   // calculate self energy
02476   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02477   for ( g=0; g<numGrids; ++g ) {
02478 #ifdef NETWORK_PROGRESS
02479     CmiNetworkProgress();
02480 #endif
02481 
02482     evir[g] = 0;
02483     BigReal selfEnergy = 0;
02484     data_ptr = localGridData[g];
02485     int i;
02486     for(i=0; i<numGridAtoms[g]; ++i)
02487     {
02488       selfEnergy += data_ptr->cg * data_ptr->cg;
02489       ++data_ptr;
02490     }
02491     selfEnergy *= -1. * ewaldcof / SQRT_PI;
02492     evir[g][0] += selfEnergy;
02493 
02494     double **q = q_arr + g*fsize;
02495     char *f = f_arr + g*fsize;
02496 
02497     myRealSpace[g] = new PmeRealSpace(myGrid,numGridAtoms[g]);
02498     scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
02499     myRealSpace[g]->fill_charges(q, q_list, q_count, strayChargeErrors, f, fz_arr, localGridData[g]);
02500   }
02501 
02502 #ifdef TRACE_COMPUTE_OBJECTS
02503     traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
02504 #endif
02505 
02506   if ( myMgr->usePencils ) {
02507     sendPencils();
02508   } else {
02509 #if 0
02510   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
02511   pmeProxy[CkMyPe()].sendGrid();
02512 #else
02513   sendData(myMgr->numGridPes,myMgr->gridPeOrder,
02514                 myMgr->recipPeDest,myMgr->gridPeMap);
02515 #endif
02516   }
02517 
02518 }
02519 
02520 
02521 void ComputePme::sendPencils() {
02522 
02523   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
02524 
02525   int xBlocks = myMgr->xBlocks;
02526   int yBlocks = myMgr->yBlocks;
02527   int zBlocks = myMgr->zBlocks;
02528   myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
02529   myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
02530   int K1 = myGrid.K1;
02531   int K2 = myGrid.K2;
02532   int dim2 = myGrid.dim2;
02533   int dim3 = myGrid.dim3;
02534   int block1 = myGrid.block1;
02535   int block2 = myGrid.block2;
02536 
02537   Lattice lattice = patchList[0].p->flags.lattice;
02538 
02539   resultsRemaining = myMgr->numPencilsActive;
02540   const char *pencilActive = myMgr->pencilActive;
02541   const ijpair *activePencils = myMgr->activePencils;
02542   const int numPencilsActive = myMgr->numPencilsActive;
02543 
02544   // int savedMessages = 0;
02545   NodePmeMgr *npMgr = myMgr->pmeNodeProxy[CkMyNode()].ckLocalBranch();
02546 
02547   for (int ap=0; ap<numPencilsActive; ++ap) {
02548     int ib = activePencils[ap].i;
02549     int jb = activePencils[ap].j;
02550     int ibegin = ib*block1;
02551     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02552     int jbegin = jb*block2;
02553     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02554     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02555 
02556     int fcount = 0;
02557     for ( int g=0; g<numGrids; ++g ) {
02558       char *f = f_arr + g*fsize;
02559       for ( int i=ibegin; i<iend; ++i ) {
02560        for ( int j=jbegin; j<jend; ++j ) {
02561         fcount += f[i*dim2+j];
02562        }
02563       }
02564     }
02565 
02566 #ifdef NETWORK_PROGRESS
02567     CmiNetworkProgress();
02568 #endif
02569 
02570     if ( ! pencilActive[ib*yBlocks+jb] )
02571       NAMD_bug("PME activePencils list inconsistent");
02572 
02573     int zlistlen = 0;
02574     for ( int i=0; i<myGrid.K3; ++i ) {
02575       if ( fz_arr[i] ) ++zlistlen;
02576     }
02577 
02578     int hd = ( fcount? 1 : 0 );  // has data?
02579     // if ( ! hd ) ++savedMessages;
02580 
02581     PmeGridMsg *msg = new (hd*fcount*zlistlen, hd*zlistlen, hd*flen,
02582         hd*numGrids, PRIORITY_SIZE) PmeGridMsg;
02583     msg->sourceNode = CkMyPe();
02584     msg->hasData = hd;
02585     msg->lattice = lattice;
02586    if ( hd ) {
02587 #if 0
02588     msg->start = fstart;
02589     msg->len = flen;
02590 #else
02591     msg->start = -1;   // obsolete?
02592     msg->len = -1;   // obsolete?
02593 #endif
02594     msg->zlistlen = zlistlen;
02595     int *zlist = msg->zlist;
02596     zlistlen = 0;
02597     for ( int i=0; i<myGrid.K3; ++i ) {
02598       if ( fz_arr[i] ) zlist[zlistlen++] = i;
02599     }
02600     char *fmsg = msg->fgrid;
02601     float *qmsg = msg->qgrid;
02602     for ( int g=0; g<numGrids; ++g ) {
02603       char *f = f_arr + g*fsize;
02604       double **q = q_arr + g*fsize;
02605       for ( int i=ibegin; i<iend; ++i ) {
02606        for ( int j=jbegin; j<jend; ++j ) {
02607         *(fmsg++) = f[i*dim2+j];
02608         if( f[i*dim2+j] ) {
02609           for ( int k=0; k<zlistlen; ++k ) {
02610             *(qmsg++) = q[i*dim2+j][zlist[k]];
02611           }
02612         }
02613        }
02614       }
02615     }
02616    }
02617 
02618     msg->sequence = sequence();
02619     SET_PRIORITY(msg,sequence(),PME_GRID_PRIORITY)
02620     CmiEnableUrgentSend(1);
02621 #if USE_NODE_PAR_RECEIVE
02622     msg->destElem=CkArrayIndex3D(ib,jb,0);
02623     CProxy_PmePencilMap lzm = npMgr->zm;
02624     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
02625     int destnode = CmiNodeOf(destproc);
02626     myMgr->pmeNodeProxy[destnode].recvZGrid(msg);
02627 #else
02628     myMgr->zPencil(ib,jb,0).recvGrid(msg);
02629 #endif
02630     CmiEnableUrgentSend(0);
02631   }
02632 
02633 
02634   if ( strayChargeErrors ) {
02635    iout << iERROR << "Stray PME grid charges detected: "
02636         << CkMyPe() << " sending to (x,y)";
02637    for (int ib=0; ib<xBlocks; ++ib) {
02638     for (int jb=0; jb<yBlocks; ++jb) {
02639      int ibegin = ib*block1;
02640      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02641      int jbegin = jb*block2;
02642      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02643      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02644 
02645      for ( int g=0; g<numGrids; ++g ) {
02646        char *f = f_arr + g*fsize;
02647        if ( ! pencilActive[ib*yBlocks+jb] ) {
02648            for ( int i=ibegin; i<iend; ++i ) {
02649             for ( int j=jbegin; j<jend; ++j ) {
02650              if ( f[i*dim2+j] == 3 ) {
02651                f[i*dim2+j] = 2;
02652                iout << " (" << i << "," << j << ")";
02653              }
02654             }
02655            }
02656        }
02657      }
02658     }
02659    }
02660    iout << "\n" << endi;
02661   }
02662 
02663   // if ( savedMessages ) {
02664   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
02665   // }
02666 
02667 }
02668 
02669 
02670 void ComputePme::copyPencils(PmeGridMsg *msg) {
02671 
02672   int xBlocks = myMgr->xBlocks;
02673   int yBlocks = myMgr->yBlocks;
02674   int zBlocks = myMgr->zBlocks;
02675   myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
02676   myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
02677   int K1 = myGrid.K1;
02678   int K2 = myGrid.K2;
02679   int dim2 = myGrid.dim2;
02680   int dim3 = myGrid.dim3;
02681   int block1 = myGrid.block1;
02682   int block2 = myGrid.block2;
02683 
02684   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
02685   int ib = msg->sourceNode / yBlocks;
02686   int jb = msg->sourceNode % yBlocks;
02687 
02688   int ibegin = ib*block1;
02689   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02690   int jbegin = jb*block2;
02691   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02692 
02693   int zlistlen = msg->zlistlen;
02694   int *zlist = msg->zlist;
02695   float *qmsg = msg->qgrid;
02696   int g;
02697   for ( g=0; g<numGrids; ++g ) {
02698     evir[g] += msg->evir[g];
02699     char *f = f_arr + g*fsize;
02700     double **q = q_arr + g*fsize;
02701     for ( int i=ibegin; i<iend; ++i ) {
02702      for ( int j=jbegin; j<jend; ++j ) {
02703       if( f[i*dim2+j] ) {
02704         f[i*dim2+j] = 0;
02705         for ( int k=0; k<zlistlen; ++k ) {
02706           q[i*dim2+j][zlist[k]] = *(qmsg++);
02707         }
02708       }
02709      }
02710     }
02711   }
02712 }
02713 
02714 
02715 void ComputePme::sendData(int numRecipPes, int *recipPeOrder,
02716                                 int *recipPeDest, int *gridPeMap) {
02717 
02718   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
02719 
02720   myGrid.block1 = ( myGrid.K1 + numRecipPes - 1 ) / numRecipPes;
02721   myGrid.block2 = ( myGrid.K2 + numRecipPes - 1 ) / numRecipPes;
02722   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
02723 
02724   Lattice lattice = patchList[0].p->flags.lattice;
02725 
02726   resultsRemaining = numRecipPes;
02727 
02728   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
02729   for (int j=0; j<numRecipPes; j++) {
02730     int pe = recipPeOrder[j];  // different order
02731     if ( ! recipPeDest[pe] && ! strayChargeErrors ) continue;
02732     int start = pe * bsize;
02733     int len = bsize;
02734     if ( start >= qsize ) { start = 0; len = 0; }
02735     if ( start + len > qsize ) { len = qsize - start; }
02736     int zdim = myGrid.dim3;
02737     int fstart = start / zdim;
02738     int flen = len / zdim;
02739     int fcount = 0;
02740     int i;
02741 
02742     int g;
02743     for ( g=0; g<numGrids; ++g ) {
02744       char *f = f_arr + fstart + g*fsize;
02745       for ( i=0; i<flen; ++i ) {
02746         fcount += f[i];
02747       }
02748       if ( ! recipPeDest[pe] ) {
02749         int errfound = 0;
02750         for ( i=0; i<flen; ++i ) {
02751           if ( f[i] == 3 ) {
02752             errfound = 1;
02753             break;
02754           }
02755         }
02756         if ( errfound ) {
02757           iout << iERROR << "Stray PME grid charges detected: "
02758                 << CkMyPe() << " sending to " << gridPeMap[pe] << " for planes";
02759           int iz = -1;
02760           for ( i=0; i<flen; ++i ) {
02761             if ( f[i] == 3 ) {
02762               f[i] = 2;
02763               int jz = (i+fstart)/myGrid.K2;
02764               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
02765             }
02766           }
02767           iout << "\n" << endi;
02768         }
02769       }
02770     }
02771 
02772 #ifdef NETWORK_PROGRESS
02773     CmiNetworkProgress();
02774 #endif
02775 
02776     if ( ! recipPeDest[pe] ) continue;
02777 
02778     int zlistlen = 0;
02779     for ( i=0; i<myGrid.K3; ++i ) {
02780       if ( fz_arr[i] ) ++zlistlen;
02781     }
02782 
02783     PmeGridMsg *msg = new (fcount*zlistlen, zlistlen, flen*numGrids,
02784                                 numGrids, PRIORITY_SIZE) PmeGridMsg;
02785     msg->sourceNode = CkMyPe();
02786     msg->lattice = lattice;
02787     msg->start = fstart;
02788     msg->len = flen;
02789     msg->zlistlen = zlistlen;
02790     int *zlist = msg->zlist;
02791     zlistlen = 0;
02792     for ( i=0; i<myGrid.K3; ++i ) {
02793       if ( fz_arr[i] ) zlist[zlistlen++] = i;
02794     }
02795     float *qmsg = msg->qgrid;
02796     for ( g=0; g<numGrids; ++g ) {
02797       char *f = f_arr + fstart + g*fsize;
02798       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
02799       double **q = q_arr + fstart + g*fsize;
02800       for ( i=0; i<flen; ++i ) {
02801         if ( f[i] ) {
02802           for ( int k=0; k<zlistlen; ++k ) {
02803             *(qmsg++) = q[i][zlist[k]];
02804           }
02805         }
02806       }
02807     }
02808 
02809     msg->sequence = sequence();
02810     SET_PRIORITY(msg,sequence(),PME_GRID_PRIORITY)
02811     pmeProxy[gridPeMap[pe]].recvGrid(msg);
02812   }
02813 
02814 }
02815 
02816 void ComputePme::copyResults(PmeGridMsg *msg) {
02817 
02818   int zdim = myGrid.dim3;
02819   int flen = msg->len;
02820   int fstart = msg->start;
02821   int zlistlen = msg->zlistlen;
02822   int *zlist = msg->zlist;
02823   float *qmsg = msg->qgrid;
02824   int g;
02825   for ( g=0; g<numGrids; ++g ) {
02826     evir[g] += msg->evir[g];
02827     char *f = msg->fgrid + g*flen;
02828     double **q = q_arr + fstart + g*fsize;
02829     for ( int i=0; i<flen; ++i ) {
02830       if ( f[i] ) {
02831         f[i] = 0;
02832         for ( int k=0; k<zlistlen; ++k ) {
02833           q[i][zlist[k]] = *(qmsg++);
02834         }
02835       }
02836     }
02837   }
02838 }
02839 
02840 void ComputePme::ungridForces() {
02841 
02842     SimParameters *simParams = Node::Object()->simParameters;
02843 
02844     Vector *localResults = new Vector[numLocalAtoms*
02845                                         ((numGrids>1 || selfOn)?2:1)];
02846     Vector *gridResults;
02847     if ( alchFepOn || alchThermIntOn || lesOn || selfOn || pairOn ) {
02848       for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
02849       gridResults = localResults + numLocalAtoms;
02850     } else {
02851       gridResults = localResults;
02852     }
02853 
02854     Vector pairForce = 0.;
02855     Lattice lattice = patchList[0].p->flags.lattice;
02856     int g = 0;
02857     if(!simParams->commOnly) {
02858     for ( g=0; g<numGrids; ++g ) {
02859 #ifdef NETWORK_PROGRESS
02860       CmiNetworkProgress();
02861 #endif
02862 
02863       myRealSpace[g]->compute_forces(q_arr+g*fsize, localGridData[g], gridResults);
02864       delete myRealSpace[g];
02865       scale_forces(gridResults, numGridAtoms[g], lattice);
02866 
02867       if ( alchFepOn || alchThermIntOn ) {
02868         double scale = 1.;
02869         elecLambdaUp =  (simParams->alchLambda <= alchElecLambdaStart)? 0. : \
02870           (simParams->alchLambda - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02871         elecLambdaDown =  ((1-simParams->alchLambda) <= alchElecLambdaStart)? 0. : ((1-simParams->alchLambda) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02872         if ( g == 0 ) scale = elecLambdaUp;
02873         else if ( g == 1 ) scale = elecLambdaDown;
02874         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02875         if (alchDecouple) {
02876           if ( g == 2 ) scale = 1 - elecLambdaUp;
02877           else if ( g == 3 ) scale = 1 - elecLambdaDown;
02878           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02879         }
02880         int nga = 0;
02881         if (!alchDecouple) {
02882           for(int i=0; i<numLocalAtoms; ++i) {
02883             if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02884               // (g=2: only partition 0)
02885               localResults[i] += gridResults[nga++] * scale;
02886             }
02887           }
02888         }
02889         else {  // alchDecouple
02890           if ( g < 2 ) {
02891             for(int i=0; i<numLocalAtoms; ++i) {
02892               if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02893                 // g = 0: partition 0 or partition 1
02894                 // g = 1: partition 0 or partition 2
02895                 localResults[i] += gridResults[nga++] * scale;
02896               }
02897             }
02898           }
02899           else {
02900             for(int i=0; i<numLocalAtoms; ++i) {
02901               if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
02902                 // g = 2: partition 1 only
02903                 // g = 3: partition 2 only
02904                 // g = 4: partition 0 only
02905                 localResults[i] += gridResults[nga++] * scale;
02906               }
02907             }
02908           }
02909         }
02910       } else if ( lesOn ) {
02911         double scale = 1.;
02912         if ( alchFepOn ) {
02913           if ( g == 0 ) scale = simParams->alchLambda;
02914           else if ( g == 1 ) scale = 1. - simParams->alchLambda;
02915         } else if ( lesOn ) {
02916           scale = 1.0 / (double)lesFactor;
02917         }
02918         int nga = 0;
02919         for(int i=0; i<numLocalAtoms; ++i) {
02920           if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02921             localResults[i] += gridResults[nga++] * scale;
02922           }
02923         }
02924       } else if ( selfOn ) {
02925         PmeParticle *lgd = localGridData[g];
02926         int nga = 0;
02927         for(int i=0; i<numLocalAtoms; ++i) {
02928           if ( localPartition[i] == 1 ) {
02929             pairForce += gridResults[nga];  // should add up to almost zero
02930             localResults[i] += gridResults[nga++];
02931           }
02932         }
02933       } else if ( pairOn ) {
02934         if ( g == 0 ) {
02935           int nga = 0;
02936           for(int i=0; i<numLocalAtoms; ++i) {
02937             if ( localPartition[i] == 1 ) {
02938               pairForce += gridResults[nga];
02939             }
02940             if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02941               localResults[i] += gridResults[nga++];
02942             }
02943           }
02944         } else if ( g == 1 ) {
02945           int nga = 0;
02946           for(int i=0; i<numLocalAtoms; ++i) {
02947             if ( localPartition[i] == g ) {
02948               pairForce -= gridResults[nga];  // should add up to almost zero
02949               localResults[i] -= gridResults[nga++];
02950             }
02951           }
02952         } else {
02953           int nga = 0;
02954           for(int i=0; i<numLocalAtoms; ++i) {
02955             if ( localPartition[i] == g ) {
02956               localResults[i] -= gridResults[nga++];
02957             }
02958           }
02959         }
02960       }
02961     }
02962     }
02963     else
02964     {// cleanup for commOnly to avoid leaking
02965       for ( g=0; g<numGrids; ++g ) {delete myRealSpace[g];}
02966     }
02967     delete [] localData;
02968     delete [] localPartition;
02969 
02970     Vector *results_ptr = localResults;
02971     ResizeArrayIter<PatchElem> ap(patchList);
02972     
02973     // add in forces
02974     for (ap = ap.begin(); ap != ap.end(); ap++) {
02975       Results *r = (*ap).forceBox->open();
02976       Force *f = r->f[Results::slow];
02977       int numAtoms = (*ap).p->getNumAtoms();
02978 
02979       if ( ! strayChargeErrors && ! simParams->commOnly ) {
02980         for(int i=0; i<numAtoms; ++i) {
02981           f[i].x += results_ptr->x;
02982           f[i].y += results_ptr->y;
02983           f[i].z += results_ptr->z;
02984           ++results_ptr;
02985         }
02986       }
02987       (*ap).forceBox->close(&r);
02988     }
02989 
02990     delete [] localResults;
02991    
02992     if ( pairOn || selfOn ) {
02993         ADD_VECTOR_OBJECT(reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
02994     }
02995 
02996     for ( g=0; g<numGrids; ++g ) {
02997       double scale = 1.;
02998       if ( alchFepOn || alchThermIntOn ) {
02999         if ( g == 0 ) scale = elecLambdaUp;
03000         else if ( g == 1 ) scale = elecLambdaDown;
03001         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03002         if (alchDecouple) {
03003           if ( g == 2 ) scale = 1-elecLambdaUp;
03004           else if ( g == 3 ) scale = 1-elecLambdaDown;
03005           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03006         }
03007       } else if ( lesOn ) {
03008         scale = 1.0 / (double)lesFactor;
03009       } else if ( pairOn ) {
03010         scale = ( g == 0 ? 1. : -1. );
03011       }
03012       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03013       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03014       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03015       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03016       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03017       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03018       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03019       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03020       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03021       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03022 
03023       if (amd_reduction) {
03024       amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03025       amd_reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03026       amd_reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03027       amd_reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03028       amd_reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03029       amd_reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03030       amd_reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03031       amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03032       amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03033       amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03034       }
03035 
03036       double scale2 = 0.;
03037 
03038       //alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? simParams->alchElecLambdaStart : 0;
03039 
03040       if (alchFepOn) {
03041         BigReal elecLambda2Up =  (simParams->alchLambda2 <= alchElecLambdaStart)? 0. : \
03042               (simParams->alchLambda2 - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03043         BigReal elecLambda2Down =  ((1-simParams->alchLambda2) <= alchElecLambdaStart)? 0. : \
03044               ((1-simParams->alchLambda2) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03045         
03046         
03047         if ( g == 0 ) scale2 = elecLambda2Up;
03048         else if ( g == 1 ) scale2 = elecLambda2Down;
03049         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03050         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
03051         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
03052         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03053       }
03054       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03055       if (amd_reduction) amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03056       
03057       if (alchThermIntOn) {
03058         
03059         // no decoupling:
03060         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
03061         // (interactions between all atoms [partition 0 OR partition 1], 
03062         // minus all [within partition 0])
03063         // U = elecLambdaUp * (U[0] - U[2])
03064         // dU/dl = U[0] - U[2];
03065         
03066         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
03067         // (interactions between all atoms [partition 0 OR partition 2], 
03068         // minus all [within partition 0])
03069         // U = elecLambdaDown * (U[1] - U[2])
03070         // dU/dl = U[1] - U[2];
03071 
03072         // alchDecouple:
03073         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
03074         // (interactions between all atoms [partition 0 OR partition 1]
03075         // minus all [within partition 1] minus all [within partition 0]
03076         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
03077         // dU/dl = U[0] - U[2] - U[4];
03078 
03079         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
03080         // (interactions between all atoms [partition 0 OR partition 2]
03081         // minus all [within partition 2] minus all [within partition 0]
03082         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
03083         // dU/dl = U[1] - U[3] - U[4];
03084         
03085         
03086         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
03087         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
03088         if (!alchDecouple) {
03089           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03090           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03091         }
03092         else {  // alchDecouple
03093           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03094           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03095           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03096           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03097         }
03098       }
03099     }
03100     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
03101     reduction->submit();
03102     if (amd_reduction) amd_reduction->submit();
03103 }
03104 
03105 #if USE_TOPOMAP 
03106 
03107 #define NPRIMES 8
03108 const static unsigned int NAMDPrimes[] = {
03109   3,
03110   5,
03111   7,
03112   11,
03113   13,
03114   17,
03115   19,
03116   23,  
03117   29,
03118   31,
03119   37,
03120   59,
03121   73,
03122   93,
03123   113,
03124   157,
03125   307,
03126   617,
03127   1217                  //This should b enough for 64K nodes of BGL. 
03128 };
03129 
03130 #include "RecBisection.h"
03131 
03132 /***-----------------------------------------------------**********
03133     The Orthogonal Recursive Bisection strategy, which allocates PME
03134     objects close to the patches they communicate, and at the same
03135     time spreads them around the grid 
03136 ****----------------------------------------------------------****/
03137 
03138 bool generateBGLORBPmePeList(int *pemap, int numPes, 
03139                              int *block_pes, int nbpes) {
03140 
03141   PatchMap *pmap = PatchMap::Object();
03142   int *pmemap = new int [CkNumPes()];
03143 
03144   if (pemap == NULL)
03145     return false;
03146 
03147   TopoManager tmgr;
03148 
03149   memset(pmemap, 0, sizeof(int) * CkNumPes());
03150 
03151   for(int count = 0; count < CkNumPes(); count++) {
03152     if(count < nbpes)
03153       pmemap[block_pes[count]] = 1;
03154     
03155     if(pmap->numPatchesOnNode(count)) {
03156       pmemap[count] = 1;
03157       
03158       //Assumes an XYZT mapping !!
03159       if(tmgr.hasMultipleProcsPerNode()) {
03160         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
03161       }
03162     }
03163   }
03164 
03165   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
03166     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
03167     return false;
03168 
03169   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03170   Node *node = nd.ckLocalBranch();
03171   SimParameters *simParams = node->simParameters;
03172 
03173   //first split PME processors into patch groups
03174 
03175   int xsize = 0, ysize = 0, zsize = 0;
03176 
03177   xsize = tmgr.getDimNX();
03178   ysize = tmgr.getDimNY();
03179   zsize = tmgr.getDimNZ();
03180   
03181   int nx = xsize, ny = ysize, nz = zsize;
03182   DimensionMap dm;
03183   
03184   dm.x = 0;
03185   dm.y = 1;
03186   dm.z = 2;
03187   
03188   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
03189 
03190   //group size processors have to be allocated to each YZ plane
03191   int group_size = numPes/nx;
03192   if(numPes % nx)
03193     group_size ++;
03194 
03195   int my_prime = NAMDPrimes[0];
03196   int density = (ny * nz)/group_size + 1;
03197   int count = 0;
03198   
03199   //Choose a suitable prime Number
03200   for(count = 0; count < NPRIMES; count ++) {
03201     //Find a prime just greater than the density
03202     if(density < NAMDPrimes[count]) {
03203       my_prime = NAMDPrimes[count];
03204       break;
03205     }      
03206   }
03207   
03208   if(count == NPRIMES)
03209     my_prime = NAMDPrimes[NPRIMES-1];
03210 
03211   //int gcount = numPes/2;
03212   int gcount = 0;
03213   int npme_pes = 0;
03214   
03215   int coord[3];
03216 
03217   for(int x = 0; x < nx; x++) {
03218     coord[0] = (x + nx/2)%nx;
03219     
03220     for(count=0; count < group_size && npme_pes < numPes; count++) {
03221       int dest = (count + 1) * my_prime;      
03222       dest = dest % (ny * nz);      
03223       
03224       coord[2] = dest / ny;
03225       coord[1] = dest - coord[2] * ny;
03226       
03227       //Locate where in the actual grid the processor is
03228       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
03229         coord[dm.z] * xsize* ysize;
03230       
03231       if(pmemap[destPe] == 0) {
03232         pemap[gcount++] = destPe;
03233         pmemap[destPe] = 1;
03234         
03235         if(tmgr.hasMultipleProcsPerNode())
03236           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
03237 
03238         npme_pes ++;
03239       }
03240       else {
03241         for(int pos = 1; pos < ny * nz; pos++) {
03242           
03243           coord[2] += pos / ny;
03244           coord[1] += pos % ny;
03245           
03246           coord[2] = coord[2] % nz;
03247           coord[1] = coord[1] % ny;       
03248           
03249           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
03250             coord[dm.z] * xsize * ysize;
03251           
03252           if(pmemap[newdest] == 0) {
03253             pemap[gcount++] = newdest;
03254             pmemap[newdest] = 1;
03255             
03256             if(tmgr.hasMultipleProcsPerNode())
03257               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
03258             
03259             npme_pes ++;
03260             break;
03261           }
03262         }
03263       }      
03264     }   
03265     
03266     if(gcount == numPes)
03267       gcount = 0;    
03268     
03269     if(npme_pes >= numPes)
03270       break;
03271   }
03272   
03273   delete [] pmemap;
03274   
03275   if(npme_pes != numPes)
03276     //NAMD_bug("ORB PME allocator failed\n");
03277     return false;
03278 
03279   return true;
03280 }
03281 
03282 #endif
03283 
03284 template <class T> class PmePencil : public T {
03285 public:
03286   PmePencil() {
03287     data = 0;
03288     work = 0;
03289     send_order = 0;
03290     needs_reply = 0;
03291 #if USE_PERSISTENT
03292     trans_handle = untrans_handle = ungrid_handle = NULL;
03293 #endif
03294   }
03295   ~PmePencil() {
03296     fftwf_free(data);
03297     delete [] work;
03298     delete [] send_order;
03299     delete [] needs_reply;
03300   }
03301   void base_init(PmePencilInitMsg *msg) {
03302     imsg=0;
03303     imsgb=0;
03304     hasData=0;
03305     initdata = msg->data;
03306   }
03307   void order_init(int nBlocks) {
03308     send_order = new int[nBlocks];
03309     for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
03310     Random rand(CkMyPe());
03311     rand.reorder(send_order,nBlocks);
03312     needs_reply = new int[nBlocks];
03313   }
03314   PmePencilInitMsgData initdata;
03315   Lattice lattice;
03316   PmeReduction evir;
03317   int sequence;  // used for priorities
03318   int imsg;  // used in sdag code
03319   int imsgb;  // Node par uses distinct counter for back path
03320   int hasData;  // used in message elimination
03321   float *data;
03322   float *work;
03323   int *send_order;
03324   int *needs_reply;
03325 #if USE_PERSISTENT
03326   PersistentHandle *trans_handle;
03327   PersistentHandle *untrans_handle;
03328   PersistentHandle *ungrid_handle;
03329 #endif
03330 };
03331 
03332 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
03333 public:
03334     PmeZPencil_SDAG_CODE
03335     PmeZPencil() { __sdag_init(); setMigratable(false); }
03336     PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); imsg=imsgb=0;}
03337         ~PmeZPencil() {
03338         #ifdef NAMD_FFTW
03339         #ifdef NAMD_FFTW_3
03340                 delete [] forward_plans;
03341                 delete [] backward_plans;
03342         #endif
03343         #endif
03344         }
03345     void fft_init();
03346     void recv_grid(const PmeGridMsg *);
03347     void forward_fft();
03348     void send_trans();
03349         void send_subset_trans(int fromIdx, int toIdx);
03350     void recv_untrans(const PmeUntransMsg *);
03351     void node_process_untrans(PmeUntransMsg *);
03352     void node_process_grid(PmeGridMsg *);
03353     void backward_fft();
03354         void send_ungrid(PmeGridMsg *);
03355         void send_all_ungrid();
03356         void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
03357 private:
03358     ResizeArray<PmeGridMsg *> grid_msgs;
03359 #ifdef NAMD_FFTW
03360 #ifdef NAMD_FFTW_3
03361     fftwf_plan forward_plan, backward_plan;
03362 
03363         //for nodehelper usage
03364         int numPlans;
03365         fftwf_plan *forward_plans, *backward_plans;
03366 #else
03367     rfftwnd_plan forward_plan, backward_plan;
03368 #endif
03369 #endif
03370 
03371     int nx, ny;
03372 #if USE_PERSISTENT
03373     void setup_persistent() {
03374       int hd = 1;// ( hasData ? 1 : 0 );
03375       int zBlocks = initdata.zBlocks;
03376       int block3 = initdata.grid.block3;
03377       int dim3 = initdata.grid.dim3;
03378       CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03379       CmiAssert(yPencil_local);
03380       trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
03381       for ( int isend=0; isend<zBlocks; ++isend ) {
03382           int kb = send_order[isend];
03383           int nz = (initdata.grid.block3 > dim3/2 - kb*block3) ? (initdata.grid.block3) : (dim3/2 - kb*block3);
03384           int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
03385           int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz*2 +sizeof( envelope)+PRIORITY_SIZE/8;
03386           trans_handle[isend] = CmiCreatePersistent(peer, size);
03387       }
03388     }
03389     
03390     void setup_ungrid_persistent() 
03391     {
03392        ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
03393        for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
03394            int peer = grid_msgs[imsg]->sourceNode;
03395            ungrid_handle[imsg] = CmiCreatePersistent(peer, 0); 
03396        }
03397     }
03398 #endif
03399 };
03400 
03401 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
03402 public:
03403     PmeYPencil_SDAG_CODE
03404     PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
03405     PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
03406     void fft_init();
03407     void recv_trans(const PmeTransMsg *);
03408     void forward_fft();
03409         void forward_subset_fft(int fromIdx, int toIdx);
03410     void send_trans();
03411         void send_subset_trans(int fromIdx, int toIdx);
03412     void recv_untrans(const PmeUntransMsg *);    
03413     void node_process_trans(PmeTransMsg *);
03414     void node_process_untrans(PmeUntransMsg *);
03415     void backward_fft();
03416         void backward_subset_fft(int fromIdx, int toIdx);
03417     void send_untrans();
03418     void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03419 private:
03420 #ifdef NAMD_FFTW
03421 #ifdef NAMD_FFTW_3
03422     fftwf_plan forward_plan, backward_plan;
03423 #else
03424     fftw_plan forward_plan, backward_plan;
03425 #endif
03426 #endif
03427 
03428     int nx, nz;
03429 #if USE_PERSISTENT
03430     void setup_persistent() {
03431       int yBlocks = initdata.yBlocks;
03432       int block2 = initdata.grid.block2;
03433       int K2 = initdata.grid.K2;
03434       int hd = 1;
03435       CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
03436       trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03437       for ( int isend=0; isend<yBlocks; ++isend ) {
03438           int jb = send_order[isend];
03439           int ny = block2 > (K2 - jb*block2) ? block2 : (K2 - jb*block2);
03440           int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
03441           int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8;
03442           trans_handle[isend] = CmiCreatePersistent(peer, size);
03443       }
03444 
03445       CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
03446       untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03447       int send_evir = 1;
03448       for ( int isend=0; isend<yBlocks; ++isend ) {
03449           int jb = send_order[isend];
03450           int ny = block2 > (K2 - jb*block2) ? block2 : (K2 - jb*block2);
03451           //if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
03452           int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
03453           //iout << "YPencils un_trans[ " << CkMyPe() << "  ==== " << peer <<  "\n" << endi;
03454           untrans_handle[isend] = CmiCreatePersistent(peer, sizeof(PmeUntransMsg) + sizeof(float)*nx*ny*nz*2 +sizeof(PmeReduction)*send_evir +sizeof( envelope) + PRIORITY_SIZE/8);
03455       }
03456     }
03457 #endif
03458 };
03459 
03460 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
03461 public:
03462     PmeXPencil_SDAG_CODE
03463     PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); imsg=imsgb=0; }
03464     PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
03465         ~PmeXPencil() {
03466         #ifdef NAMD_FFTW
03467         #ifdef NAMD_FFTW_3
03468                 delete [] forward_plans;
03469                 delete [] backward_plans;
03470         #endif
03471         #endif
03472         }
03473     void fft_init();
03474     void recv_trans(const PmeTransMsg *);
03475     void forward_fft();
03476     void pme_kspace();
03477     void backward_fft();
03478     void send_untrans();
03479         void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03480     void node_process_trans(PmeTransMsg *);
03481 #ifdef NAMD_FFTW
03482 #ifdef NAMD_FFTW_3
03483     fftwf_plan forward_plan, backward_plan;
03484 
03485         int numPlans;
03486         fftwf_plan *forward_plans, *backward_plans;
03487 #else
03488     fftw_plan forward_plan, backward_plan;
03489 #endif
03490 #endif
03491     int ny, nz;
03492     PmeKSpace *myKSpace;
03493 #if USE_PERSISTENT
03494     void  setup_persistent() {
03495       int xBlocks = initdata.xBlocks;
03496       int block1 = initdata.grid.block1;
03497       int K1 = initdata.grid.K1;
03498       CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03499       untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
03500       int send_evir = 1;
03501       for ( int isend=0; isend<xBlocks; ++isend ) {
03502           int ib = send_order[isend];
03503           int nx = block1 > (K1 - ib*block1) ? block1 : (K1 - ib*block1);
03504           int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
03505           untrans_handle[isend] = CmiCreatePersistent(peer, sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir + sizeof(float)*nx*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8);
03506       }
03507     }
03508 #endif
03509 
03510 };
03511 
03512 void PmeZPencil::fft_init() {
03513   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03514   Node *node = nd.ckLocalBranch();
03515   SimParameters *simParams = node->simParameters;
03516 
03517 #if USE_NODE_PAR_RECEIVE
03518   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
03519 #endif
03520 
03521   int K1 = initdata.grid.K1;
03522   int K2 = initdata.grid.K2;
03523   int K3 = initdata.grid.K3;
03524   int dim3 = initdata.grid.dim3;
03525   int block1 = initdata.grid.block1;
03526   int block2 = initdata.grid.block2;
03527 
03528   nx = block1;
03529   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03530   ny = block2;
03531   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03532 
03533   data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
03534   work = new float[dim3];
03535 
03536   order_init(initdata.zBlocks);
03537 
03538 #ifdef NAMD_FFTW
03539   CmiLock(ComputePmeMgr::fftw_plan_lock);
03540 #ifdef NAMD_FFTW_3
03541   /* need array of sizes for the how many */
03542 
03543   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
03544   int sizeLines=nx*ny;
03545   int planLineSizes[1];
03546   planLineSizes[0]=K3;
03547   int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
03548   int ndimHalf=ndim/2;
03549   forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
03550                                          (float *) data, NULL, 1, 
03551                                          ndim,
03552                                          (fftwf_complex *) data, NULL, 1,
03553                                          ndimHalf,
03554                                          fftwFlags);
03555 
03556   backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
03557                                           (fftwf_complex *) data, NULL, 1, 
03558                                           ndimHalf,
03559                                           (float *) data, NULL, 1, 
03560                                           ndim,
03561                                           fftwFlags);
03562 #if     USE_NODEHELPER
03563   if(simParams->useNodeHelper) {
03564           //How many FFT plans to be created? The grain-size issue!!.
03565           //Currently, I am choosing the min(nx, ny) to be coarse-grain
03566           numPlans = (nx<=ny?nx:ny);
03567           int howmany = sizeLines/numPlans;
03568           forward_plans = new fftwf_plan[numPlans];
03569           backward_plans = new fftwf_plan[numPlans];
03570           for(int i=0; i<numPlans; i++) {
03571                   int dimStride = i*ndim*howmany;
03572                   int dimHalfStride = i*ndimHalf*howmany;
03573                   forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
03574                                                                                                          ((float *)data)+dimStride, NULL, 1,
03575                                                                                                          ndim,
03576                                                                                                          ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03577                                                                                                          ndimHalf,
03578                                                                                                          fftwFlags);
03579 
03580                   backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
03581                                                                                                          ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03582                                                                                                          ndimHalf,
03583                                                                                                          ((float *)data)+dimStride, NULL, 1,
03584                                                                                                          ndim,
03585                                                                                                          fftwFlags);
03586           }
03587   }else 
03588 #endif 
03589   {
03590           forward_plans = NULL;
03591           backward_plans = NULL;
03592   }
03593 #else
03594   forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
03595         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03596         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03597   backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
03598         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03599         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03600 #endif
03601   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03602 #else
03603   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03604 #endif
03605 
03606 #if USE_NODE_PAR_RECEIVE
03607     evir = 0.;
03608     memset(data, 0, sizeof(float) * nx*ny*dim3);
03609 #endif
03610 }
03611 
03612 void PmeYPencil::fft_init() {
03613   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03614   Node *node = nd.ckLocalBranch();
03615   SimParameters *simParams = node->simParameters;
03616 
03617 #if USE_NODE_PAR_RECEIVE
03618   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
03619 #endif
03620 
03621   int K1 = initdata.grid.K1;
03622   int K2 = initdata.grid.K2;
03623   int dim2 = initdata.grid.dim2;
03624   int dim3 = initdata.grid.dim3;
03625   int block1 = initdata.grid.block1;
03626   int block3 = initdata.grid.block3;
03627 
03628   nx = block1;
03629   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03630   nz = block3;
03631   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03632 
03633   data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
03634   work = new float[2*K2];
03635 
03636   order_init(initdata.yBlocks);
03637 
03638 #ifdef NAMD_FFTW
03639   CmiLock(ComputePmeMgr::fftw_plan_lock);
03640 #ifdef NAMD_FFTW_3
03641   /* need array of sizes for the dimensions */
03642   /* ideally this should be implementable as a single multidimensional
03643    *  plan, but that has proven tricky to implement, so we maintain the
03644    *  loop of 1d plan executions. */
03645   int sizeLines=nz;
03646   int planLineSizes[1];
03647   planLineSizes[0]=K2;
03648   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
03649   forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
03650                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03651                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03652                                      FFTW_FORWARD, 
03653                                      fftwFlags);
03654   backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
03655                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03656                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03657                                      FFTW_BACKWARD, 
03658                                       fftwFlags);
03659   CkAssert(forward_plan != NULL);
03660   CkAssert(backward_plan != NULL);
03661 #else
03662   forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
03663         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03664         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03665         nz, (fftw_complex *) work, 1);
03666   backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
03667         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03668         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03669         nz, (fftw_complex *) work, 1);
03670 #endif
03671   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03672 #else
03673   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03674 #endif
03675 
03676 #if USE_NODE_PAR_RECEIVE
03677   evir = 0;
03678   CmiMemoryWriteFence();
03679 #endif
03680 }
03681 
03682 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
03683 {
03684   if ( msg->hasData ) hasData = 1;
03685   needs_reply[msg->sourceNode] = msg->hasData;
03686   recv_trans(msg);
03687   int limsg;
03688   CmiMemoryAtomicFetchAndInc(imsg,limsg);
03689   if(limsg+1 == initdata.yBlocks)
03690     {
03691       if ( hasData ) {
03692         forward_fft();
03693       }
03694       send_trans();
03695       if( ! hasData)
03696         {
03697           send_untrans(); //todo, what is up with the recvAck in SDAG version?
03698         }
03699       imsg=0;
03700       CmiMemoryWriteFence();
03701     }
03702 }
03703 
03704 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
03705 {
03706   recv_untrans(msg);
03707   int limsg;
03708   CmiMemoryAtomicFetchAndInc(imsgb,limsg);
03709   if(limsg+1 == initdata.yBlocks)
03710     {
03711       backward_fft();
03712       send_untrans();
03713       imsgb=0;
03714       CmiMemoryWriteFence();
03715     }
03716 }
03717 
03718 #define DEBUG_NODE_PAR_RECV 0
03719 
03720 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
03721   //  CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03722   PmeXPencil *target=xPencilObj.get(msg->destElem);
03723 #if DEBUG_NODE_PAR_RECV
03724   if(target == NULL)
03725     CkAbort("xpencil in recvXTrans not found, debug registeration");
03726 #endif  
03727     target->node_process_trans(msg);
03728   delete msg;
03729 }
03730 
03731 
03732 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
03733   //  CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03734   PmeYPencil *target=yPencilObj.get(msg->destElem);
03735 #if DEBUG_NODE_PAR_RECV
03736   if(target == NULL)
03737     CkAbort("ypencil in recvYTrans not found, debug registeration");
03738 #endif  
03739     target->node_process_trans(msg);
03740   delete msg;
03741  }
03742 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
03743   //  CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03744   PmeYPencil *target=yPencilObj.get(msg->destElem);
03745 #if DEBUG_NODE_PAR_RECV  
03746   if(target == NULL)
03747     CkAbort("ypencil in recvYUntrans not found, debug registeration");
03748 #endif  
03749     target->node_process_untrans(msg);
03750   delete msg;
03751  }
03752 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
03753   //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03754   PmeZPencil *target=zPencilObj.get(msg->destElem);
03755 #if DEBUG_NODE_PAR_RECV
03756   if(target == NULL)
03757     CkAbort("zpencil in recvZUntrans not found, debug registeration");
03758 #endif
03759   target->node_process_untrans(msg);
03760   delete msg;
03761 }
03762 
03763 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
03764   //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
03765   PmeZPencil *target=zPencilObj.get(msg->destElem);
03766 #if DEBUG_NODE_PAR_RECV
03767   if(target == NULL){
03768     CkAbort("zpencil in recvZGrid not found, debug registeration");
03769   }
03770 #endif
03771   target->node_process_grid(msg); //msg is stored inside node_proces_grid
03772 }
03773 
03774 void PmeXPencil::fft_init() {
03775   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03776   Node *node = nd.ckLocalBranch();
03777   SimParameters *simParams = node->simParameters;
03778 #if USE_NODE_PAR_RECEIVE
03779   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
03780 #endif
03781 
03782   int K1 = initdata.grid.K1;
03783   int K2 = initdata.grid.K2;
03784   int dim3 = initdata.grid.dim3;
03785   int block2 = initdata.grid.block2;
03786   int block3 = initdata.grid.block3;
03787 
03788   ny = block2;
03789   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03790   nz = block3;
03791   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03792 
03793   data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
03794   work = new float[2*K1];
03795 
03796   order_init(initdata.xBlocks);
03797 
03798 #ifdef NAMD_FFTW
03799   CmiLock(ComputePmeMgr::fftw_plan_lock);
03800 #ifdef NAMD_FFTW_3
03801   /* need array of sizes for the how many */
03802   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
03803   int sizeLines=ny*nz;
03804   int planLineSizes[1];
03805   planLineSizes[0]=K1;
03806   forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03807                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03808                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03809                                    FFTW_FORWARD,
03810                                      fftwFlags);
03811   backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03812                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03813                                      (fftwf_complex *) data, NULL, sizeLines, 1,
03814                                           FFTW_BACKWARD,
03815                                       fftwFlags);
03816 
03817 #if     USE_NODEHELPER
03818   if(simParams->useNodeHelper) {
03819           //How many FFT plans to be created? The grain-size issue!!.
03820           //Currently, I am choosing the min(nx, ny) to be coarse-grain
03821           numPlans = (ny<=nz?ny:nz);
03822           int howmany = sizeLines/numPlans;
03823           forward_plans = new fftwf_plan[numPlans];
03824           backward_plans = new fftwf_plan[numPlans];
03825           for(int i=0; i<numPlans; i++) {
03826                   int curStride = i*howmany;              
03827                   forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03828                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03829                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03830                                                                                                         FFTW_FORWARD,
03831                                                                                                          fftwFlags);
03832 
03833                   backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03834                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03835                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03836                                                                                                           FFTW_BACKWARD,
03837                                                                                                          fftwFlags);
03838           }
03839   }else
03840 #endif
03841   {
03842           forward_plans = NULL;
03843           backward_plans = NULL;
03844   }
03845 #else
03846   forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
03847         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03848         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03849         ny*nz, (fftw_complex *) work, 1);
03850   backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
03851         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03852         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03853         ny*nz, (fftw_complex *) work, 1);
03854 #endif
03855   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03856 #else
03857   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03858 #endif
03859 
03860   myKSpace = new PmeKSpace(initdata.grid,
03861                 thisIndex.y*block2, thisIndex.y*block2 + ny,
03862                 thisIndex.z*block3, thisIndex.z*block3 + nz);
03863 
03864 }
03865 
03866 // #define FFTCHECK   // run a grid of integers through the fft
03867 // #define ZEROCHECK  // check for suspicious zeros in fft
03868 
03869 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
03870 
03871   int dim3 = initdata.grid.dim3;
03872   if ( imsg == 0 ) {
03873     lattice = msg->lattice;
03874     sequence = msg->sequence;
03875 #if ! USE_NODE_PAR_RECEIVE
03876     memset(data, 0, sizeof(float)*nx*ny*dim3);
03877 #endif
03878   }
03879 
03880   if ( ! msg->hasData ) return;
03881 
03882   int zlistlen = msg->zlistlen;
03883   int *zlist = msg->zlist;
03884   char *fmsg = msg->fgrid;
03885   float *qmsg = msg->qgrid;
03886   float *d = data;
03887   int numGrids = 1;  // pencil FFT doesn't support multiple grids
03888   for ( int g=0; g<numGrids; ++g ) {
03889     for ( int i=0; i<nx; ++i ) {
03890      for ( int j=0; j<ny; ++j, d += dim3 ) {
03891       if( *(fmsg++) ) {
03892         for ( int k=0; k<zlistlen; ++k ) {
03893           d[zlist[k]] += *(qmsg++);
03894         }
03895       }
03896      }
03897     }
03898   }
03899 }
03900 
03901 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
03902 #ifdef NAMD_FFTW
03903 #ifdef NAMD_FFTW_3    
03904     fftwf_plan *plans = (fftwf_plan *)param;
03905     for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
03906 #endif
03907 #endif        
03908 }
03909 
03910 void PmeZPencil::forward_fft() {
03911   evir = 0.;
03912 #ifdef FFTCHECK
03913   int dim3 = initdata.grid.dim3;
03914   int K3 = initdata.grid.K3;
03915   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
03916   float *d = data;
03917   for ( int i=0; i<nx; ++i ) {
03918    for ( int j=0; j<ny; ++j, d += dim3 ) {
03919     for ( int k=0; k<dim3; ++k ) {
03920       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
03921     }
03922    }
03923   }
03924 #endif
03925 #ifdef NAMD_FFTW
03926 #ifdef MANUAL_DEBUG_FFTW3
03927   dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
03928 #endif
03929 #ifdef NAMD_FFTW_3
03930 #if     USE_NODEHELPER
03931   int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
03932   if(useNodeHelper>=NDH_CTRL_PME_FORWARDFFT) {
03933           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
03934           //transform the above loop
03935           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
03936           NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
03937           return;
03938   }
03939 #endif
03940   fftwf_execute(forward_plan);
03941 #else
03942   rfftwnd_real_to_complex(forward_plan, nx*ny,
03943         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
03944 #endif
03945 #ifdef MANUAL_DEBUG_FFTW3
03946   dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
03947 #endif
03948 
03949 #endif
03950 #ifdef ZEROCHECK
03951   int dim3 = initdata.grid.dim3;
03952   int K3 = initdata.grid.K3;
03953   float *d = data;
03954   for ( int i=0; i<nx; ++i ) {
03955    for ( int j=0; j<ny; ++j, d += dim3 ) {
03956     for ( int k=0; k<dim3; ++k ) {
03957       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
03958         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
03959     }
03960    }
03961   }
03962 #endif
03963 }
03964 
03965 /* A single task for partitioned PmeZPencil::send_trans work */
03966 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
03967         PmeZPencil *zpencil = (PmeZPencil *)param;
03968         zpencil->send_subset_trans(first, last);        
03969 }
03970 
03971 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
03972         int zBlocks = initdata.zBlocks;
03973         int block3 = initdata.grid.block3;
03974         int dim3 = initdata.grid.dim3;
03975         for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
03976           int kb = send_order[isend];
03977           int nz = block3;
03978           if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
03979           int hd = ( hasData ? 1 : 0 );
03980           PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
03981           msg->lattice = lattice;
03982           msg->sourceNode = thisIndex.y;
03983           msg->hasData = hasData;
03984           msg->nx = ny;
03985          if ( hasData ) {
03986           float *md = msg->qgrid;
03987           const float *d = data;
03988           for ( int i=0; i<nx; ++i ) {
03989            for ( int j=0; j<ny; ++j, d += dim3 ) {
03990                 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
03991                   *(md++) = d[2*k];
03992                   *(md++) = d[2*k+1];
03993                 }
03994            }
03995           }
03996          }
03997           msg->sequence = sequence;
03998           SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
03999 
04000     CmiEnableUrgentSend(1);
04001 #if USE_NODE_PAR_RECEIVE
04002       msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04003 #if USE_PERSISTENT 
04004       CmiUsePersistentHandle(&trans_handle[isend], 1);
04005 #endif
04006       initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04007 #if USE_PERSISTENT
04008       CmiUsePersistentHandle(NULL, 0);
04009 #endif    
04010 #else
04011 #if USE_PERSISTENT 
04012       CmiUsePersistentHandle(&trans_handle[isend], 1);
04013 #endif
04014       initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04015 #if USE_PERSISTENT
04016       CmiUsePersistentHandle(NULL, 0);
04017 #endif    
04018 #endif
04019     CmiEnableUrgentSend(0);
04020     }
04021 }
04022 
04023 void PmeZPencil::send_trans() {
04024 #if USE_PERSISTENT
04025     if (trans_handle == NULL) setup_persistent();
04026 #endif
04027 #if     USE_NODEHELPER
04028         Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04029         if(useNodeHelper>=NDH_CTRL_PME_SENDTRANS) {
04036                 //send_subset_trans(0, initdata.zBlocks-1);
04037                 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;               
04038                 NodeHelper_Parallelize(nodeHelper, PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
04039                 return;
04040         }
04041 #endif
04042   int zBlocks = initdata.zBlocks;
04043   int block3 = initdata.grid.block3;
04044   int dim3 = initdata.grid.dim3;
04045   for ( int isend=0; isend<zBlocks; ++isend ) {
04046     int kb = send_order[isend];
04047     int nz = block3;
04048     if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
04049     int hd = ( hasData ? 1 : 0 );
04050     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04051     msg->lattice = lattice;
04052     msg->sourceNode = thisIndex.y;
04053     msg->hasData = hasData;
04054     msg->nx = ny;
04055    if ( hasData ) {
04056     float *md = msg->qgrid;
04057     const float *d = data;
04058     for ( int i=0; i<nx; ++i ) {
04059      for ( int j=0; j<ny; ++j, d += dim3 ) {
04060       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04061         *(md++) = d[2*k];
04062         *(md++) = d[2*k+1];
04063       }
04064      }
04065     }
04066    }
04067     msg->sequence = sequence;
04068     SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
04069 
04070     CmiEnableUrgentSend(1);
04071 #if USE_NODE_PAR_RECEIVE
04072     msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04073 #if USE_PERSISTENT
04074     CmiUsePersistentHandle(&trans_handle[isend], 1);
04075 #endif
04076     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04077 #if USE_PERSISTENT
04078     CmiUsePersistentHandle(NULL, 0);
04079 #endif    
04080 #else
04081 #if USE_PERSISTENT
04082     CmiUsePersistentHandle(&trans_handle[isend], 1);
04083 #endif
04084     initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04085 #if USE_PERSISTENT
04086     CmiUsePersistentHandle(NULL, 0);
04087 #endif    
04088 #endif
04089     CmiEnableUrgentSend(0);
04090   }
04091 }
04092 
04093 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
04094   if ( imsg == 0 ) {
04095     lattice = msg->lattice;
04096     sequence = msg->sequence;
04097   }
04098   int block2 = initdata.grid.block2;
04099   int K2 = initdata.grid.K2;
04100   int jb = msg->sourceNode;
04101   int ny = msg->nx;
04102  if ( msg->hasData ) {
04103   const float *md = msg->qgrid;
04104   float *d = data;
04105   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04106    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04107     for ( int k=0; k<nz; ++k ) {
04108 #ifdef ZEROCHECK
04109       if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
04110         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04111 #endif
04112       d[2*(j*nz+k)] = *(md++);
04113       d[2*(j*nz+k)+1] = *(md++);
04114     }
04115    }
04116   }
04117  } else {
04118   float *d = data;
04119   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04120    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04121     for ( int k=0; k<nz; ++k ) {
04122       d[2*(j*nz+k)] = 0;
04123       d[2*(j*nz+k)+1] = 0;
04124     }
04125    }
04126   }
04127  }
04128 }
04129 
04130 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
04131         PmeYPencil *ypencil = (PmeYPencil *)param;
04132         ypencil->forward_subset_fft(first, last);
04133 }
04134 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
04135 #ifdef NAMD_FFTW
04136 #ifdef NAMD_FFTW_3
04137         for(int i=fromIdx; i<=toIdx; i++){
04138                 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
04139                       * nz * initdata.grid.K2,  
04140                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04141         }
04142 #endif
04143 #endif
04144 }
04145 
04146 void PmeYPencil::forward_fft() {
04147     evir = 0.;
04148 #ifdef NAMD_FFTW
04149 #ifdef MANUAL_DEBUG_FFTW3
04150   dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04151 #endif
04152   
04153 #ifdef NAMD_FFTW_3
04154 #if     USE_NODEHELPER
04155   int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04156   if(useNodeHelper>=NDH_CTRL_PME_FORWARDFFT) {
04157           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04158           NodeHelper_Parallelize(nodeHelper, PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
04159           return;
04160   }
04161 #endif
04162   //the above is a transformation of the following loop using NodeHelper
04163   for ( int i=0; i<nx; ++i ) {
04164     fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
04165                       * nz * initdata.grid.K2,  
04166                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04167   }
04168 #else
04169   for ( int i=0; i<nx; ++i ) {
04170     fftw(forward_plan, nz,
04171         ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04172         nz, 1, (fftw_complex *) work, 1, 0);
04173   }
04174 #endif
04175 #ifdef MANUAL_DEBUG_FFTW3
04176   dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04177 #endif
04178 
04179 #endif
04180 }
04181 
04182 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
04183         PmeYPencil *ypencil = (PmeYPencil *)param;
04184         ypencil->send_subset_trans(first, last);
04185 }
04186 
04187 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
04188         int yBlocks = initdata.yBlocks;
04189         int block2 = initdata.grid.block2;
04190         int K2 = initdata.grid.K2;
04191         for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
04192           int jb = send_order[isend];
04193           int ny = block2;
04194           if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04195           int hd = ( hasData ? 1 : 0 );
04196           PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04197           msg->lattice = lattice;
04198           msg->sourceNode = thisIndex.x;
04199           msg->hasData = hasData;
04200           msg->nx = nx;
04201          if ( hasData ) {
04202           float *md = msg->qgrid;
04203           const float *d = data;
04204           for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04205            for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04206                 for ( int k=0; k<nz; ++k ) {
04207                   *(md++) = d[2*(j*nz+k)];
04208                   *(md++) = d[2*(j*nz+k)+1];
04209   #ifdef ZEROCHECK
04210                   if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04211           thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04212   #endif
04213                 }
04214            }
04215           }
04216           if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04217           thisIndex.x, jb, thisIndex.z);
04218          }
04219           msg->sequence = sequence;
04220           SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04221       CmiEnableUrgentSend(1);
04222 #if USE_NODE_PAR_RECEIVE
04223       msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04224 #if USE_PERSISTENT 
04225       CmiUsePersistentHandle(&trans_handle[isend], 1);
04226 #endif
04227       initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
04228 #if USE_PERSISTENT
04229       CmiUsePersistentHandle(NULL, 0);
04230 #endif
04231 #else      
04232 #if USE_PERSISTENT 
04233       CmiUsePersistentHandle(&trans_handle[isend], 1);
04234 #endif
04235       initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04236 #if USE_PERSISTENT
04237       CmiUsePersistentHandle(NULL, 0);
04238 #endif
04239 #endif
04240       CmiEnableUrgentSend(0);
04241         }
04242 }
04243 
04244 void PmeYPencil::send_trans() {
04245 #if USE_PERSISTENT
04246     if (trans_handle == NULL) setup_persistent();
04247 #endif
04248 #if     USE_NODEHELPER
04249         Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04250         if(useNodeHelper>=NDH_CTRL_PME_SENDTRANS) {
04257                 //send_subset_trans(0, initdata.yBlocks-1);
04258                 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04259                 NodeHelper_Parallelize(nodeHelper, PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
04260                 return;
04261         }
04262 #endif
04263   int yBlocks = initdata.yBlocks;
04264   int block2 = initdata.grid.block2;
04265   int K2 = initdata.grid.K2;
04266   for ( int isend=0; isend<yBlocks; ++isend ) {
04267     int jb = send_order[isend];
04268     int ny = block2;
04269     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04270     int hd = ( hasData ? 1 : 0 );
04271     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04272     msg->lattice = lattice;
04273     msg->sourceNode = thisIndex.x;
04274     msg->hasData = hasData;
04275     msg->nx = nx;
04276    if ( hasData ) {
04277     float *md = msg->qgrid;
04278     const float *d = data;
04279     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04280      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04281       for ( int k=0; k<nz; ++k ) {
04282         *(md++) = d[2*(j*nz+k)];
04283         *(md++) = d[2*(j*nz+k)+1];
04284 #ifdef ZEROCHECK
04285         if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04286         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04287 #endif
04288       }
04289      }
04290     }
04291     if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04292         thisIndex.x, jb, thisIndex.z);
04293    }
04294     msg->sequence = sequence;
04295     SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04296     CmiEnableUrgentSend(1);
04297 #if USE_NODE_PAR_RECEIVE
04298     msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04299 #if USE_PERSISTENT
04300     CmiUsePersistentHandle(&trans_handle[isend], 1);
04301 #endif
04302     initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
04303 #if USE_PERSISTENT
04304     CmiUsePersistentHandle(NULL, 0);
04305 #endif
04306 #else
04307 #if USE_PERSISTENT
04308     CmiUsePersistentHandle(&trans_handle[isend], 1);
04309 #endif
04310     initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04311 #if USE_PERSISTENT
04312     CmiUsePersistentHandle(NULL, 0);
04313 #endif
04314     
04315 #endif
04316     CmiEnableUrgentSend(0);
04317   }
04318 }
04319 
04320 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
04321 {
04322   if(msg->hasData) hasData=1;
04323   needs_reply[msg->sourceNode] = msg->hasData;
04324   recv_trans(msg);
04325   int limsg;
04326   CmiMemoryAtomicFetchAndInc(imsg,limsg);
04327   if(limsg+1 == initdata.xBlocks)
04328     {
04329       if(hasData){
04330         forward_fft();
04331         pme_kspace();
04332         backward_fft();
04333       }
04334       send_untrans();
04335       imsg=0;
04336       CmiMemoryWriteFence();
04337     }
04338 }
04339 
04340 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
04341   if ( imsg == 0 ) {
04342     lattice = msg->lattice;
04343     sequence = msg->sequence;
04344   }
04345   int block1 = initdata.grid.block1;
04346   int K1 = initdata.grid.K1;
04347   int ib = msg->sourceNode;
04348   int nx = msg->nx;
04349  if ( msg->hasData ) {
04350   const float *md = msg->qgrid;
04351   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04352    float *d = data + i*ny*nz*2;
04353    for ( int j=0; j<ny; ++j, d += nz*2 ) {
04354     for ( int k=0; k<nz; ++k ) {
04355 #ifdef ZEROCHECK
04356       if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
04357         ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
04358 #endif
04359       d[2*k] = *(md++);
04360       d[2*k+1] = *(md++);
04361     }
04362    }
04363   }
04364  } else {
04365   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04366    float *d = data + i*ny*nz*2;
04367    for ( int j=0; j<ny; ++j, d += nz*2 ) {
04368     for ( int k=0; k<nz; ++k ) {
04369       d[2*k] = 0;
04370       d[2*k+1] = 0;
04371     }
04372    }
04373   }
04374  }
04375 }
04376 
04377 void PmeXPencil::forward_fft() {
04378 #ifdef NAMD_FFTW
04379 
04380 #ifdef MANUAL_DEBUG_FFTW3
04381   dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04382 #endif
04383 
04384 #ifdef NAMD_FFTW_3
04385 #if     USE_NODEHELPER
04386   int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04387   if(useNodeHelper>=NDH_CTRL_PME_FORWARDFFT) {
04388           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
04389           //transform the above loop
04390           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04391           NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
04392           return;
04393   }
04394 #endif
04395   fftwf_execute(forward_plan);
04396 #else
04397   fftw(forward_plan, ny*nz,
04398         ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04399 #endif
04400 #ifdef MANUAL_DEBUG_FFTW3
04401   dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04402 #endif
04403 
04404 #endif
04405 }
04406 
04407 void PmeXPencil::pme_kspace() {
04408 
04409   evir = 0.;
04410 
04411 #ifdef FFTCHECK
04412   return;
04413 #endif
04414 
04415   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
04416 
04417   int numGrids = 1;
04418   for ( int g=0; g<numGrids; ++g ) {
04419     evir[0] = myKSpace->compute_energy(data+0*g,
04420                 lattice, ewaldcof, &(evir[1]));
04421   }
04422   
04423 #if USE_NODE_PAR_RECEIVE
04424     CmiMemoryWriteFence();
04425 #endif
04426 }
04427 
04428 void PmeXPencil::backward_fft() {
04429 #ifdef NAMD_FFTW
04430 #ifdef MANUAL_DEBUG_FFTW3
04431   dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04432 #endif
04433 
04434 #ifdef NAMD_FFTW_3
04435 #if     USE_NODEHELPER
04436   int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04437   if(useNodeHelper>=NDH_CTRL_PME_BACKWARDFFT) {
04438           //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
04439           //transform the above loop
04440           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04441           NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
04442           return;
04443   }
04444 #endif
04445   fftwf_execute(backward_plan);
04446 #else
04447   fftw(backward_plan, ny*nz,
04448         ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04449 #endif
04450 #ifdef MANUAL_DEBUG_FFTW3
04451   dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04452 #endif
04453 #endif
04454 }
04455 
04456 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04457         int evirIdx = paraNum;
04458         PmeXPencil *xpencil = (PmeXPencil *)param;
04459         xpencil->send_subset_untrans(first, last, evirIdx);
04460 }
04461 
04462 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04463         int xBlocks = initdata.xBlocks;
04464         int block1 = initdata.grid.block1;      
04465         int K1 = initdata.grid.K1;
04466 
04467         int ackL=0, ackH=-1;
04468         int unL=0, unH=-1;
04469         int send_evir=0;
04470         if(fromIdx >= evirIdx+1) {
04471                 //send PmeUntransMsg with has_evir=0
04472                 unL = fromIdx;
04473                 unH = toIdx;            
04474         } else if(toIdx <= evirIdx-1) {
04475                 //send PmeAckMsg
04476                 ackL=fromIdx;
04477                 ackH=toIdx;             
04478         } else {
04479                 //partially send PmeAckMsg and partially send PmeUntransMsg
04480                 ackL=fromIdx;
04481                 ackH=evirIdx-1;
04482                 send_evir=1;
04483                 unL=evirIdx+1;
04484                 unH=toIdx;
04485         }
04486 
04487         for(int isend=ackL; isend<=ackH; isend++) {
04488                 //send PmeAckMsg
04489         CmiEnableUrgentSend(1);
04490                 int ib = send_order[isend];
04491                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04492                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04493                 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04494         CmiEnableUrgentSend(0);
04495     }
04496 
04497     CmiEnableUrgentSend(1);
04498         //send PmeUntransMsg with has_evir=1
04499         if(send_evir) {
04500                 int ib = send_order[evirIdx];
04501                 int nx = block1;
04502                 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04503                 PmeUntransMsg *msg = new (nx*ny*nz*2,1,PRIORITY_SIZE) PmeUntransMsg;            
04504                 msg->evir[0] = evir;
04505                 msg->has_evir = 1;                              
04506                 msg->sourceNode = thisIndex.y;
04507                 msg->ny = ny;
04508                 float *md = msg->qgrid;
04509                 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04510                         float *d = data + i*ny*nz*2;
04511                         for ( int j=0; j<ny; ++j, d += nz*2 ) {
04512                                 for ( int k=0; k<nz; ++k ) {
04513                                         *(md++) = d[2*k];
04514                                         *(md++) = d[2*k+1];
04515                                 }
04516                         }
04517                 }
04518                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04519 #if USE_NODE_PAR_RECEIVE
04520         msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04521         initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04522 #else
04523         initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04524 #endif
04525          }
04526     CmiEnableUrgentSend(0);
04527         
04528         //send PmeUntransMsg with has_evir=0
04529         for(int isend=unL; isend<=unH; isend++) {
04530                 int ib = send_order[isend];
04531                 int nx = block1;
04532                 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04533                 PmeUntransMsg *msg = new (nx*ny*nz*2,0,PRIORITY_SIZE) PmeUntransMsg;
04534                 msg->has_evir = 0;              
04535                 msg->sourceNode = thisIndex.y;
04536                 msg->ny = ny;
04537                 float *md = msg->qgrid;
04538                 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04539                         float *d = data + i*ny*nz*2;
04540                         for ( int j=0; j<ny; ++j, d += nz*2 ) {
04541                                 for ( int k=0; k<nz; ++k ) {
04542                                         *(md++) = d[2*k];
04543                                         *(md++) = d[2*k+1];
04544                                 }
04545                         }
04546                 }
04547                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04548         CmiEnableUrgentSend(1);
04549 #if USE_NODE_PAR_RECEIVE
04550         msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04551 #if USE_PERSISTENT 
04552         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04553 #endif
04554         initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04555 #if USE_PERSISTENT
04556         CmiUsePersistentHandle(NULL, 0);
04557 #endif
04558 #else
04559 #if USE_PERSISTENT 
04560         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04561 #endif
04562         initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04563 #if USE_PERSISTENT
04564         CmiUsePersistentHandle(NULL, 0);
04565 #endif
04566 #endif
04567         CmiEnableUrgentSend(0);
04568         }
04569 }
04570 
04571 void PmeXPencil::send_untrans() {
04572 #if USE_PERSISTENT
04573   if (untrans_handle == NULL) setup_persistent();
04574 #endif
04575 #if     USE_NODEHELPER
04576   Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04577   if(useNodeHelper>=NDH_CTRL_PME_SENDUNTRANS) {
04578                 int xBlocks = initdata.xBlocks;
04579                 int evirIdx = 0;
04580                 for ( int isend=0; isend<xBlocks; ++isend ) {
04581                         int ib = send_order[isend];
04582                         if (needs_reply[ib]) {
04583                                 evirIdx = isend;
04584                                 break;
04585                         }
04586                 }
04587 
04588                 //basically: 
04589                 //[0,evirIdx-1]->send PmeAckMsg
04590                 //evirIdx->send PmeUntransMsg with has_evir=1
04591                 //[evirIdx+1, xBlocks-1]->send PmeUntransMsg with has_evir=0
04592                 //send_subset_untrans(0, xBlocks-1, evirIdx);
04593                 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04594 #if USE_NODE_PAR_RECEIVE
04595                 //NodeHelper_Parallelize(nodeHelper, PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
04596                 NodeHelper_Parallelize(nodeHelper, PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
04597 #else
04598         //NodeHelper_Parallelize(nodeHelper, PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
04599                 NodeHelper_Parallelize(nodeHelper, PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
04600 #endif        
04601                 return;
04602   }
04603 #endif
04604   int xBlocks = initdata.xBlocks;
04605   int block1 = initdata.grid.block1;
04606   int K1 = initdata.grid.K1;
04607   int send_evir = 1;
04608   for ( int isend=0; isend<xBlocks; ++isend ) {
04609     int ib = send_order[isend];
04610     if ( ! needs_reply[ib] ) {
04611       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04612       CmiEnableUrgentSend(1);
04613       SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04614       initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04615       CmiEnableUrgentSend(0);
04616       continue;
04617     }
04618     int nx = block1;
04619     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04620     PmeUntransMsg *msg = new (nx*ny*nz*2,send_evir,PRIORITY_SIZE) PmeUntransMsg;
04621     if ( send_evir ) {
04622       msg->evir[0] = evir;
04623       msg->has_evir = 1;
04624       send_evir = 0;
04625     } else {
04626       msg->has_evir = 0;
04627     }
04628     msg->sourceNode = thisIndex.y;
04629     msg->ny = ny;
04630     float *md = msg->qgrid;
04631     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04632      float *d = data + i*ny*nz*2;
04633      for ( int j=0; j<ny; ++j, d += nz*2 ) {
04634       for ( int k=0; k<nz; ++k ) {
04635         *(md++) = d[2*k];
04636         *(md++) = d[2*k+1];
04637       }
04638      }
04639     }
04640     SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04641 
04642     CmiEnableUrgentSend(1);
04643 #if USE_NODE_PAR_RECEIVE
04644     msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04645 #if USE_PERSISTENT
04646     CmiUsePersistentHandle(&untrans_handle[isend], 1);
04647 #endif
04648     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04649 #if USE_PERSISTENT
04650     CmiUsePersistentHandle(NULL, 0);
04651 #endif
04652 #else
04653 #if USE_PERSISTENT
04654     CmiUsePersistentHandle(&untrans_handle[isend], 1);
04655 #endif
04656     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04657 #if USE_PERSISTENT
04658     CmiUsePersistentHandle(NULL, 0);
04659 #endif
04660 #endif
04661     CmiEnableUrgentSend(0);
04662   }
04663 }
04664 
04665 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
04666   if ( msg->has_evir ) evir += msg->evir[0];
04667   int block2 = initdata.grid.block2;
04668   int K2 = initdata.grid.K2;
04669   int jb = msg->sourceNode;
04670   int ny = msg->ny;
04671   const float *md = msg->qgrid;
04672   float *d = data;
04673   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04674 #if CMK_BLUEGENEL
04675     CmiNetworkProgress();
04676 #endif   
04677     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04678       for ( int k=0; k<nz; ++k ) {
04679 #ifdef ZEROCHECK
04680         if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
04681                                     thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04682 #endif
04683         d[2*(j*nz+k)] = *(md++);
04684         d[2*(j*nz+k)+1] = *(md++);
04685       }
04686     }
04687   }
04688 }
04689 
04690 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
04691         PmeYPencil *ypencil = (PmeYPencil *)param;
04692         ypencil->backward_subset_fft(first, last);
04693 }
04694 
04695 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
04696 #ifdef NAMD_FFTW
04697 #ifdef NAMD_FFTW_3
04698         for(int i=fromIdx; i<=toIdx; i++){
04699                 fftwf_execute_dft(backward_plan,        
04700                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2,         
04701                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04702         }
04703 #endif
04704 #endif
04705 }
04706 
04707 void PmeYPencil::backward_fft() {
04708 #ifdef NAMD_FFTW
04709 #ifdef MANUAL_DEBUG_FFTW3
04710   dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04711 #endif
04712 
04713 #ifdef NAMD_FFTW_3
04714 #if     USE_NODEHELPER
04715   int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04716   if(useNodeHelper>=NDH_CTRL_PME_BACKWARDFFT) {
04717           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04718           NodeHelper_Parallelize(nodeHelper, PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
04719           return;
04720   }
04721 #endif
04722   //the above is a transformation of the following loop using NodeHelper
04723   for ( int i=0; i<nx; ++i ) {
04724 #if CMK_BLUEGENEL
04725         CmiNetworkProgress();
04726 #endif
04727     fftwf_execute_dft(backward_plan,    
04728                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
04729                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04730   }
04731 #else
04732         for ( int i=0; i<nx; ++i ) {
04733 #if CMK_BLUEGENEL
04734           CmiNetworkProgress();
04735 #endif
04736                 fftw(backward_plan, nz,
04737                 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04738                 nz, 1, (fftw_complex *) work, 1, 0);
04739         }
04740 #endif
04741 
04742 #ifdef MANUAL_DEBUG_FFTW3
04743   dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04744 #endif
04745 
04746 #endif
04747 }
04748 
04749 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04750         int evirIdx = paraNum;
04751         PmeYPencil *ypencil = (PmeYPencil *)param;
04752         ypencil->send_subset_untrans(first, last, evirIdx);
04753 }
04754 
04755 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04756         int yBlocks = initdata.yBlocks;
04757         int block2 = initdata.grid.block2;      
04758         int K2 = initdata.grid.K2;
04759 
04760         int ackL=0, ackH=-1;
04761         int unL=0, unH=-1;
04762         int send_evir=0;
04763         if(fromIdx >= evirIdx+1) {
04764                 //send PmeUntransMsg with has_evir=0
04765                 unL = fromIdx;
04766                 unH = toIdx;            
04767         } else if(toIdx <= evirIdx-1) {
04768                 //send PmeAckMsg
04769                 ackL=fromIdx;
04770                 ackH=toIdx;             
04771         } else {
04772                 //partially send PmeAckMsg and partially send PmeUntransMsg
04773                 ackL=fromIdx;
04774                 ackH=evirIdx-1;
04775                 send_evir=1;
04776                 unL=evirIdx+1;
04777                 unH=toIdx;
04778         }
04779 
04780         for(int isend=ackL; isend<=ackH; isend++) {
04781                 //send PmeAckMsg
04782         CmiEnableUrgentSend(1);
04783                 int jb = send_order[isend];
04784                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04785                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04786                 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
04787         CmiEnableUrgentSend(0);
04788         }
04789 
04790     CmiEnableUrgentSend(1);
04791         //send PmeUntransMsg with has_evir=1
04792         if(send_evir) {
04793                 int jb = send_order[evirIdx];
04794                 int ny = block2;
04795                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04796                 PmeUntransMsg *msg = new (nx*ny*nz*2,1,PRIORITY_SIZE) PmeUntransMsg;            
04797                 msg->evir[0] = evir;
04798                 msg->has_evir = 1;                              
04799                 msg->sourceNode = thisIndex.z;
04800                 msg->ny = nz;
04801                 float *md = msg->qgrid;
04802                 const float *d = data;
04803                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04804                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04805                                 for ( int k=0; k<nz; ++k ) {
04806                                         *(md++) = d[2*(j*nz+k)];
04807                                         *(md++) = d[2*(j*nz+k)+1];
04808                                 }
04809                         }
04810                 }
04811                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04812 #if USE_NODE_PAR_RECEIVE
04813         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04814     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
04815         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04816 #else
04817         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04818 #endif
04819         }
04820 
04821     CmiEnableUrgentSend(0);
04822         //send PmeUntransMsg with has_evir=0
04823         for(int isend=unL; isend<=unH; isend++) {
04824                 int jb = send_order[isend];
04825                 int ny = block2;
04826                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04827                 PmeUntransMsg *msg = new (nx*ny*nz*2,0,PRIORITY_SIZE) PmeUntransMsg;
04828                 msg->has_evir = 0;
04829                 msg->sourceNode = thisIndex.z;
04830                 msg->ny = nz;
04831                 float *md = msg->qgrid;
04832                 const float *d = data;
04833                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04834                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04835                                 for ( int k=0; k<nz; ++k ) {
04836                                         *(md++) = d[2*(j*nz+k)];
04837                                         *(md++) = d[2*(j*nz+k)+1];
04838                                 }
04839                         }
04840                 }
04841                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04842             CmiEnableUrgentSend(1);
04843 #if USE_NODE_PAR_RECEIVE
04844         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04845         //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
04846 #if USE_PERSISTENT 
04847         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04848 #endif
04849         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04850 #if USE_PERSISTENT
04851         CmiUsePersistentHandle(NULL, 0);
04852 #endif
04853 #else
04854 #if USE_PERSISTENT 
04855         CmiUsePersistentHandle(&untrans_handle[isend], 1);
04856 #endif
04857         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04858 #if USE_PERSISTENT
04859         CmiUsePersistentHandle(NULL, 0);
04860 #endif
04861 #endif
04862     CmiEnableUrgentSend(0);
04863         }
04864 }
04865 
04866 void PmeYPencil::send_untrans() {
04867 #if USE_PERSISTENT
04868   if (untrans_handle == NULL) setup_persistent();
04869 #endif
04870 #if     USE_NODEHELPER
04871   Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04872   if(useNodeHelper>=NDH_CTRL_PME_SENDUNTRANS) {
04873           int yBlocks = initdata.yBlocks;
04874           int evirIdx = 0;
04875           for ( int isend=0; isend<yBlocks; ++isend ) {
04876                   int jb = send_order[isend];
04877                   if (needs_reply[jb]) {
04878                           evirIdx = isend;
04879                           break;
04880                   }
04881           }
04882 
04883           //basically: 
04884           //[0,evirIdx-1]->send PmeAckMsg
04885           //evirIdx->send PmeUntransMsg with has_evir=1
04886           //[evirIdx+1, yBlocks-1]->send PmeUntransMsg with has_evir=0
04887           //send_subset_untrans(0, yBlocks-1, evirIdx);
04888           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04889 #if USE_NODE_PAR_RECEIVE      
04890           //NodeHelper_Parallelize(nodeHelper, PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
04891           NodeHelper_Parallelize(nodeHelper, PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
04892       evir = 0.;
04893       CmiMemoryWriteFence();
04894 #else
04895       //NodeHelper_Parallelize(nodeHelper, PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
04896           NodeHelper_Parallelize(nodeHelper, PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
04897 #endif
04898           return;
04899   }
04900 #endif
04901   int yBlocks = initdata.yBlocks;
04902   int block2 = initdata.grid.block2;
04903   int K2 = initdata.grid.K2;
04904   int send_evir = 1;
04905   for ( int isend=0; isend<yBlocks; ++isend ) {
04906     int jb = send_order[isend];
04907     if ( ! needs_reply[jb] ) {
04908       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04909       CmiEnableUrgentSend(1);
04910       SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04911       initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
04912       CmiEnableUrgentSend(0);
04913       continue;
04914     }
04915     int ny = block2;
04916     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04917     PmeUntransMsg *msg = new (nx*ny*nz*2,send_evir,PRIORITY_SIZE) PmeUntransMsg;
04918     if ( send_evir ) {
04919       msg->evir[0] = evir;
04920       msg->has_evir = 1;
04921       send_evir = 0;
04922     } else {
04923       msg->has_evir = 0;
04924     }
04925     msg->sourceNode = thisIndex.z;
04926     msg->ny = nz;
04927     float *md = msg->qgrid;
04928     const float *d = data;
04929     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04930      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04931       for ( int k=0; k<nz; ++k ) {
04932         *(md++) = d[2*(j*nz+k)];
04933         *(md++) = d[2*(j*nz+k)+1];
04934       }
04935      }
04936     }
04937     SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04938 
04939     CmiEnableUrgentSend(1);
04940 #if USE_NODE_PAR_RECEIVE
04941     msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04942     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
04943 #if USE_PERSISTENT
04944     CmiUsePersistentHandle(&untrans_handle[isend], 1);
04945 #endif
04946     initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04947 #if USE_PERSISTENT
04948     CmiUsePersistentHandle(NULL, 0);
04949 #endif
04950 #else
04951 #if USE_PERSISTENT
04952     CmiUsePersistentHandle(&untrans_handle[isend], 1);
04953 #endif
04954     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04955 #if USE_PERSISTENT
04956     CmiUsePersistentHandle(NULL, 0);
04957 #endif
04958 #endif    
04959     CmiEnableUrgentSend(0);
04960   }
04961   
04962 #if USE_NODE_PAR_RECEIVE
04963   evir = 0.;
04964   CmiMemoryWriteFence();
04965 #endif
04966 }
04967 
04968 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
04969 #if ! USE_NODE_PAR_RECEIVE
04970     if(imsg==0) evir=0.;
04971 #endif
04972 
04973   if ( msg->has_evir ) evir += msg->evir[0];
04974   int block3 = initdata.grid.block3;
04975   int dim3 = initdata.grid.dim3;
04976   int kb = msg->sourceNode;
04977   int nz = msg->ny;
04978   const float *md = msg->qgrid;
04979   float *d = data;
04980   for ( int i=0; i<nx; ++i ) {
04981 #if CMK_BLUEGENEL
04982     CmiNetworkProgress();
04983 #endif   
04984     for ( int j=0; j<ny; ++j, d += dim3 ) {
04985       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04986 #ifdef ZEROCHECK
04987         if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
04988                                     thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
04989 #endif
04990         d[2*k] = *(md++);
04991         d[2*k+1] = *(md++);
04992       }
04993     }
04994   }
04995 }
04996 
04997 void PmeZPencil::backward_fft() {
04998 #ifdef NAMD_FFTW
04999 #ifdef MANUAL_DEBUG_FFTW3
05000   dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05001 #endif
05002 #ifdef NAMD_FFTW_3
05003 #if     USE_NODEHELPER
05004   int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
05005   if(useNodeHelper>=NDH_CTRL_PME_BACKWARDFFT) {
05006           //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
05007           //transform the above loop
05008           CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
05009           NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
05010           return;
05011   }
05012 #endif
05013   fftwf_execute(backward_plan);
05014 #else
05015   rfftwnd_complex_to_real(backward_plan, nx*ny,
05016             (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
05017 #endif
05018 #ifdef MANUAL_DEBUG_FFTW3
05019   dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05020 #endif
05021 
05022 #endif
05023   
05024 #if CMK_BLUEGENEL
05025   CmiNetworkProgress();
05026 #endif
05027 
05028 #ifdef FFTCHECK
05029   int dim3 = initdata.grid.dim3;
05030   int K1 = initdata.grid.K1;
05031   int K2 = initdata.grid.K2;
05032   int K3 = initdata.grid.K3;
05033   float scale = 1. / (1. * K1 * K2 * K3);
05034   float maxerr = 0.;
05035   float maxstd = 0.;
05036   int mi, mj, mk;  mi = mj = mk = -1;
05037   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05038   const float *d = data;
05039   for ( int i=0; i<nx; ++i ) {
05040    for ( int j=0; j<ny; ++j, d += dim3 ) {
05041     for ( int k=0; k<K3; ++k ) {
05042       float std = 10. * (10. * (10. * std_base + i) + j) + k;
05043       float err = scale * d[k] - std;
05044       if ( fabsf(err) > fabsf(maxerr) ) {
05045         maxerr = err;
05046         maxstd = std;
05047         mi = i;  mj = j;  mk = k;
05048       }
05049     }
05050    }
05051   }
05052   CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
05053                 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
05054 #endif
05055 
05056 }
05057 
05058 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
05059         //to take advantage of the interface which allows 3 user params at most.
05060         //under such situtation, no new parameter list needs to be created!! -Chao Mei
05061         int specialIdx = paraNum;
05062         PmeZPencil *zpencil = (PmeZPencil *)param;
05063         zpencil->send_subset_ungrid(first, last, specialIdx);
05064 }
05065 
05066 void PmeZPencil::send_all_ungrid() {
05067 /* 
05068 //Original code: the transformation is to first extract the msg 
05069 //idx that will has evir value set. -Chao Mei  
05070         int send_evir = 1;
05071         for (int imsg=0; imsg < grid_msgs.size(); ++imsg ) {
05072                 PmeGridMsg *msg = grid_msgs[imsg];
05073                 if ( msg->hasData ) {
05074                         if ( send_evir ) {
05075                                 msg->evir[0] = evir;
05076                                 send_evir = 0;
05077                         } else {
05078                                 msg->evir[0] = 0.;
05079                         }
05080                 }
05081                 send_ungrid(msg);
05082         }
05083 */
05084         int evirIdx = 0;
05085         for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
05086                 if(grid_msgs[imsg]->hasData) {
05087                         evirIdx = imsg;
05088                         break;
05089                 }
05090         }
05091 
05092 #if     USE_NODEHELPER
05093         Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
05094         if(useNodeHelper>=NDH_CTRL_PME_SENDUNTRANS) {
05095                 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
05096                 //????What's the best value for numChunks?????
05097 #if USE_NODE_PAR_RECEIVE        
05098                 //NodeHelper_Parallelize(nodeHelper, PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1); //has to sync
05099                 NodeHelper_Parallelize(nodeHelper, PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
05100 #else
05101         //NodeHelper_Parallelize(nodeHelper, PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 0); //not sync
05102                 NodeHelper_Parallelize(nodeHelper, PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0); //not sync
05103 #endif        
05104                 return;
05105         }
05106 #endif
05107         send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
05108 }
05109 
05110 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
05111         for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
05112                 PmeGridMsg *msg = grid_msgs[imsg];
05113                 if ( msg->hasData) {
05114                         if (imsg == specialIdx) {
05115                                 msg->evir[0] = evir;                            
05116                         } else {
05117                                 msg->evir[0] = 0.;
05118                         }
05119                 }
05120                 send_ungrid(msg);
05121         }
05122 }
05123 
05124 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
05125   int pe = msg->sourceNode;
05126   if ( ! msg->hasData ) {
05127     delete msg;
05128     PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
05129     SET_PRIORITY(ackmsg,sequence,PME_UNGRID_PRIORITY)
05130     CmiEnableUrgentSend(1);
05131     initdata.pmeProxy[pe].recvAck(ackmsg);
05132     CmiEnableUrgentSend(0);
05133     return;
05134   }
05135   msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
05136   int dim3 = initdata.grid.dim3;
05137   int zlistlen = msg->zlistlen;
05138   int *zlist = msg->zlist;
05139   char *fmsg = msg->fgrid;
05140   float *qmsg = msg->qgrid;
05141   float *d = data;
05142   int numGrids = 1;  // pencil FFT doesn't support multiple grids
05143   for ( int g=0; g<numGrids; ++g ) {
05144 #if CMK_BLUEGENEL
05145     CmiNetworkProgress();
05146 #endif    
05147     for ( int i=0; i<nx; ++i ) {
05148       for ( int j=0; j<ny; ++j, d += dim3 ) {
05149         if( *(fmsg++) ) {
05150           for ( int k=0; k<zlistlen; ++k ) {
05151             *(qmsg++) = d[zlist[k]];
05152           }
05153         }
05154       }
05155     }
05156   }
05157   SET_PRIORITY(msg,sequence,PME_UNGRID_PRIORITY)
05158     CmiEnableUrgentSend(1);
05159   initdata.pmeProxy[pe].recvUngrid(msg);
05160     CmiEnableUrgentSend(0);
05161 }
05162 
05163 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
05164 {
05165 #if USE_NODE_PAR_RECEIVE
05166   CmiLock(ComputePmeMgr::fftw_plan_lock);
05167   CmiMemoryReadFence();
05168 #endif
05169   recv_grid(msg);
05170   if(msg->hasData) hasData=msg->hasData;
05171   int limsg;
05172   CmiMemoryAtomicFetchAndInc(imsg,limsg);
05173   grid_msgs[limsg] = msg;
05174   //  CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);      
05175   if(limsg+1 == grid_msgs.size())
05176     {
05177 
05178       if (hasData)
05179         {
05180           forward_fft();
05181         }
05182       send_trans();
05183       imsg=0;
05184       CmiMemoryWriteFence();
05185       //      CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
05186     }
05187 #if USE_NODE_PAR_RECEIVE
05188   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05189   CmiMemoryWriteFence();
05190 #endif
05191 }
05192 
05193 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
05194 {
05195 #if USE_NODE_PAR_RECEIVE
05196   CmiLock(ComputePmeMgr::fftw_plan_lock);
05197   CmiMemoryReadFence();
05198 #endif    
05199   recv_untrans(msg);
05200   int limsg;
05201   CmiMemoryAtomicFetchAndInc(imsgb,limsg);
05202   if(limsg+1 == initdata.zBlocks)
05203     {
05204       if(hasData) // maybe this should be an assert
05205         {
05206           backward_fft();
05207         }
05208         
05209         send_all_ungrid();
05210     /*  int send_evir = 1;
05211       // TODO: this part should use Chao's output parallelization
05212       for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
05213         PmeGridMsg *omsg = grid_msgs[limsg];
05214         if ( omsg->hasData ) {
05215           if ( send_evir ) {
05216             omsg->evir[0] = evir;
05217             send_evir = 0;
05218           } else {
05219             omsg->evir[0] = 0.;
05220           }
05221         }
05222         send_ungrid(omsg);
05223       } */
05224       imsgb=0;
05225       evir = 0;
05226       memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3); 
05227       CmiMemoryWriteFence();
05228       //      CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
05229     }
05230 #if USE_NODE_PAR_RECEIVE
05231   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05232   CmiMemoryWriteFence();
05233 #endif
05234 }
05235 
05236 
05237 #include "ComputePmeMgr.def.h"
05238 

Generated on Fri May 25 04:07:14 2012 for NAMD by  doxygen 1.3.9.1