ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 //#define MANUAL_DEBUG_FFTW3 1
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012 // fftw2 doesn't have these defined
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024 
00025 #include <vector>
00026 #include <algorithm>
00027 #include <deque>
00028 using namespace std;
00029 
00030 #include "InfoStream.h"
00031 #include "Node.h"
00032 #include "PatchMap.h"
00033 #include "PatchMap.inl"
00034 #include "AtomMap.h"
00035 #include "ComputePme.h"
00036 #include "ComputePmeMgr.decl.h"
00037 #include "PmeBase.inl"
00038 #include "PmeRealSpace.h"
00039 #include "PmeKSpace.h"
00040 #include "ComputeNonbondedUtil.h"
00041 #include "PatchMgr.h"
00042 #include "Molecule.h"
00043 #include "ReductionMgr.h"
00044 #include "ComputeMgr.h"
00045 #include "ComputeMgr.decl.h"
00046 // #define DEBUGM
00047 #define MIN_DEBUG_LEVEL 3
00048 #include "Debug.h"
00049 #include "SimParameters.h"
00050 #include "WorkDistrib.h"
00051 #include "varsizemsg.h"
00052 #include "Random.h"
00053 #include "ckhashtable.h"
00054 #include "Priorities.h"
00055 
00056 #include "ComputeMoa.h"
00057 #include "ComputeMoaMgr.decl.h" 
00058 
00059 //#define     USE_RANDOM_TOPO         1
00060 
00061 //#define USE_TOPO_SFC                    1
00062 //#define     USE_CKLOOP                1
00063 //#include "TopoManager.h"
00064 
00065 #include "DeviceCUDA.h"
00066 #ifdef NAMD_CUDA
00067 #include <cuda_runtime.h>
00068 #include <cuda.h>
00069 void cuda_errcheck(const char *msg);
00070 #ifdef WIN32
00071 #define __thread __declspec(thread)
00072 #endif
00073 extern __thread DeviceCUDA *deviceCUDA;
00074 #endif
00075 
00076 #include "ComputePmeCUDAKernel.h"
00077 
00078 #ifndef SQRT_PI
00079 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00080 #endif
00081 
00082 #if CMK_PERSISTENT_COMM 
00083 #define USE_PERSISTENT      1
00084 #endif
00085 
00086 #if USE_PERSISTENT
00087 #define Z_PERSIST 1
00088 #define Y_PERSIST 1
00089 #define X_PERSIST 1
00090 #endif
00091 
00092 #if defined(NAMD_CUDA) && defined(MEM_OPT_VERSION)
00093 #define USE_NODE_PAR_RECEIVE    1
00094 #endif
00095 
00096 int ComputePmeUtil::numGrids;
00097 Bool ComputePmeUtil::alchOn;
00098 Bool ComputePmeUtil::alchFepOn;
00099 Bool ComputePmeUtil::alchThermIntOn;
00100 Bool ComputePmeUtil::alchDecouple;
00101 BigReal ComputePmeUtil::alchElecLambdaStart;
00102 Bool ComputePmeUtil::lesOn;
00103 int ComputePmeUtil::lesFactor;
00104 Bool ComputePmeUtil::pairOn;
00105 Bool ComputePmeUtil::selfOn;
00106 
00107 char *pencilPMEProcessors;
00108 
// Empty acknowledgement message (payload-free; its arrival is the information).
00109 class PmeAckMsg : public CMessage_PmeAckMsg {
00110 };
00111 
// Charm++ varsize message carrying one sender's slice of the charge grid
// toward the reciprocal-space objects (and, by reuse, results back).
// NOTE(review): field semantics below inferred from names/usage elsewhere
// in this file's pipeline -- confirm against the pack/copy code.
00112 class PmeGridMsg : public CMessage_PmeGridMsg {
00113 public:
00114 
00115   int sourceNode;  // sending PE
00116   int sequence;    // timestep sequence (used for message priorities)
00117   int hasData;     // presumably nonzero when qgrid payload is meaningful
00118   Lattice lattice;
00119   int start;       // first grid plane/column covered
00120   int len;         // number of planes/columns covered
00121   int zlistlen;    // length of zlist
00122   int *zlist;      // varsize array
00123   char *fgrid;     // varsize array
00124   float *qgrid;    // varsize array of grid values
00125   CkArrayIndex3D destElem;  // destination pencil element (pencil mode)
00126 };
00127 
// Varsize message for the forward transpose (grid -> trans phase).
00128 class PmeTransMsg : public CMessage_PmeTransMsg {
00129 public:
00130 
00131   int sourceNode;  // sending PE
00132   int sequence;    // timestep sequence (used for message priorities)
00133   int hasData;     // presumably nonzero when qgrid payload is meaningful
00134   Lattice lattice;
00135   int x_start;     // first x slab carried
00136   int nx;          // number of x slabs carried
00137   float *qgrid;    // varsize array of grid values
00138   CkArrayIndex3D destElem;  // destination pencil element (pencil mode)
00139 };
00140 
// Wrapper that lets several PEs in one SMP node share a single PmeTransMsg:
// count/lock coordinate which recipient frees the shared message last.
00141 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
00142 public:
00143   PmeTransMsg *msg;   // shared underlying message
00144   int *count;         // remaining consumers
00145   CmiNodeLock lock;   // guards *count
00146 };
00147 
// Varsize message for the backward transpose (untrans phase).
00148 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00149 public:
00150 
00151   int sourceNode;  // sending PE
00152   int y_start;     // first y slab carried
00153   int ny;          // number of y slabs carried
00154   float *qgrid;    // varsize array of grid values
00155   CkArrayIndex3D destElem;  // destination pencil element (pencil mode)
00156 };
00157 
// SMP-shared wrapper for PmeUntransMsg (same pattern as PmeSharedTransMsg).
00158 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
00159 public:
00160   PmeUntransMsg *msg;  // shared underlying message
00161   int *count;          // remaining consumers
00162   CmiNodeLock lock;    // guards *count
00163 };
00164 
// Varsize message delivering reciprocal-space energy/virial reductions.
00165 class PmeEvirMsg : public CMessage_PmeEvirMsg {
00166 public:
00167   PmeReduction *evir;  // varsize array, one entry per grid
00168 };
00169 
00170 class PmePencilMap : public CBase_PmePencilMap {
00171 public:
00172   PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00173     : ia(i_a), ib(i_b), nb(n_b),
00174       size(n), data(newcopyint(n,d)) {
00175   }
00176   virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00177     //Return an ``arrayHdl'', given some information about the array
00178     return 0;
00179   }
00180   virtual int procNum(int, const CkArrayIndex &i) {
00181     //Return the home processor number for this element of this array
00182     return data[ i.data()[ia] * nb + i.data()[ib] ];
00183   }
00184   virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00185     int mype = CkMyPe();
00186     for ( int i=0; i < size; ++i ) {
00187       if ( data[i] == mype ) {
00188         CkArrayIndex3D ai(0,0,0);
00189         ai.data()[ia] = i / nb;
00190         ai.data()[ib] = i % nb;
00191         if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00192         if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00193         mgr->insertInitial(ai,msg);
00194         msg = 0;
00195       }
00196     }
00197     mgr->doneInserting();
00198     if ( msg ) CkFreeMsg(msg);
00199   }
00200 private:
00201   const int ia, ib, nb, size;
00202   const int* const data;
00203   static int* newcopyint(int n, int *d) {
00204     int *newd = new int[n];
00205     memcpy(newd, d, n*sizeof(int));
00206     return newd;
00207   }
00208 };
00209 
00210 // use this idiom since messages don't have copy constructors
00210 // use this idiom since messages don't have copy constructors
// Plain struct bundling everything a pencil needs at startup; copied into
// a PmePencilInitMsg below.
00211 struct PmePencilInitMsgData {
00212   PmeGrid grid;
00213   int xBlocks, yBlocks, zBlocks;   // pencil-grid decomposition counts
00214   CProxy_PmeXPencil xPencil;
00215   CProxy_PmeYPencil yPencil;
00216   CProxy_PmeZPencil zPencil;
00217   CProxy_ComputePmeMgr pmeProxy;
00218   CProxy_NodePmeMgr pmeNodeProxy;
00219   CProxy_PmePencilMap xm;
00220   CProxy_PmePencilMap ym;
00221   CProxy_PmePencilMap zm;
00222 };
00223 
// Message wrapper around PmePencilInitMsgData (copy-assigned in, since
// Charm++ messages lack copy constructors).
00224 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00225 public:
00226    PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00227    PmePencilInitMsgData data;
00228 };
00229 
00230 
// Per-PE slab bookkeeping: x extent before the transpose, y extent after.
00231 struct LocalPmeInfo {
00232   int nx, x_start;
00233   int ny_after_transpose, y_start_after_transpose;
00234 };
00235 
// Per-node bookkeeping: how many PME PEs it hosts, the first one, and the
// underlying node id.
00236 struct NodePmeInfo {
00237   int npe, pe_start, real_node;
00238 };
00239 
00240 
00241 static int findRecipEvirPe() {
00242     PatchMap *patchMap = PatchMap::Object();
00243     {
00244       int mype = CkMyPe();
00245       if ( patchMap->numPatchesOnNode(mype) ) {
00246         return mype; 
00247       }
00248     }
00249     {
00250       int node = CmiMyNode();
00251       int firstpe = CmiNodeFirst(node);
00252       int nodeSize = CmiNodeSize(node);
00253       int myrank = CkMyRank();
00254       for ( int i=0; i<nodeSize; ++i ) {
00255         int pe = firstpe + (myrank+i)%nodeSize;
00256         if ( patchMap->numPatchesOnNode(pe) ) {
00257           return pe;
00258         }
00259       }
00260     }
00261     {
00262       int *pelist;
00263       int nodeSize;
00264       CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
00265       int myrank;
00266       for ( int i=0; i<nodeSize; ++i ) {
00267         if ( pelist[i] == CkMyPe() ) myrank = i;
00268       }
00269       for ( int i=0; i<nodeSize; ++i ) {
00270         int pe = pelist[(myrank+i)%nodeSize];
00271         if ( patchMap->numPatchesOnNode(pe) ) {
00272           return pe;
00273         }
00274       }
00275     }
00276     {
00277       int mype = CkMyPe();
00278       int npes = CkNumPes();
00279       for ( int i=0; i<npes; ++i ) {
00280         int pe = (mype+i)%npes;
00281         if ( patchMap->numPatchesOnNode(pe) ) {
00282           return pe;
00283         }
00284       }
00285     }
00286     NAMD_bug("findRecipEvirPe() failed!");
00287     return -999;  // should never happen
00288 }
00289 
00290 
00291 //Assigns gridPeMap and transPeMap to different set of processors.
00292 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00293   int ncpus = CkNumPes();
00294   
00295   for ( int i=0; i<numGridPes; ++i ) {
00296     gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
00297   }
00298   std::sort(gridPeMap,gridPeMap+numGridPes);
00299   int firstTransPe = ncpus - numGridPes - numTransPes;
00300   if ( firstTransPe < 0 ) {
00301     firstTransPe = 0;
00302     // 0 should be first in list, skip if possible
00303     if ( ncpus > numTransPes ) firstTransPe = 1;
00304   }
00305   for ( int i=0; i<numTransPes; ++i ) {
00306     transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
00307   }
00308   std::sort(transPeMap,transPeMap+numTransPes);
00309 }
00310 
00311 #if USE_TOPOMAP 
00312 //Topology aware PME allocation
00313 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00314                              int nbpes=0);
00315 #endif
00316 
00317 
// Three-way comparison of a and b as if their bit patterns were reversed:
// the lowest-order differing bit decides.  Returns negative/zero/positive
// like strcmp; zero only when a == b.
int compare_bit_reversed(int a, int b) {
  const int diff = a ^ b;
  int mask = 1;
  if ( diff ) {
    // advance mask to the lowest-order bit where a and b differ
    while ( (diff & mask) == 0 ) mask <<= 1;
  }
  return (a & mask) - (b & mask);
}
00326 
// True when a orders strictly before b in bit-reversed order, i.e. when the
// lowest-order differing bit is set in b.  Equal values compare false.
inline bool less_than_bit_reversed(int a, int b) {
  const int diff = a ^ b;
  int mask = 1;
  if ( diff ) {
    // find the lowest-order bit where a and b differ
    while ( (diff & mask) == 0 ) mask <<= 1;
  }
  return diff && (b & mask);
}
00335 
// Comparator adapting less_than_bit_reversed for use with std::sort.
00336 struct sortop_bit_reversed {
00337   inline bool operator() (int a, int b) const {
00338     return less_than_bit_reversed(a,b);
00339   }
00340 };
00341 
// Simple (i,j) index pair identifying a pencil within the 2-D pencil grid.
struct ijpair {
  int i,j;
  ijpair() {}                               // members intentionally left uninitialized
  ijpair(int ii, int jj) : i(ii), j(jj) {}
};
00347 
00348 struct ijpair_sortop_bit_reversed {
00349   inline bool operator() (const ijpair &a, const ijpair &b) const {
00350     return ( less_than_bit_reversed(a.i,b.i)
00351              || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
00352   }
00353 };
00354 
// Per-PE group managing the PME reciprocal-space calculation: collects
// charges from local ComputePme objects, drives the slab or pencil FFT
// pipeline (grid -> trans -> untrans -> ungrid), and returns forces and
// energy/virial reductions.  Declaration only; methods defined below /
// elsewhere in this file.
00355 class ComputePmeMgr : public CBase_ComputePmeMgr, public ComputePmeUtil {
00356 public:
00357   friend class ComputePme;
00358   friend class NodePmeMgr;
00359   ComputePmeMgr();
00360   ~ComputePmeMgr();
00361 
// Startup entry points (quiescence-detection triggered).
00362   void initialize(CkQdMsg*);
00363   void initialize_pencils(CkQdMsg*);
00364   void activate_pencils(CkQdMsg*);
00365   void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00366   void initialize_computes();
00367 
// Per-step data path: send charge grid, transpose, back-transpose, return.
00368   void sendData(Lattice &, int sequence);
00369   void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
00370   Lattice *sendDataHelper_lattice;
00371   int sendDataHelper_sequence;
00372   int sendDataHelper_sourcepe;
00373   int sendDataHelper_errors;
00374   void sendPencils(Lattice &, int sequence);
00375   void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
00376   void recvGrid(PmeGridMsg *);
00377   void gridCalc1(void);
00378   void sendTransBarrier(void);
00379   void sendTransSubset(int first, int last);
00380   void sendTrans(void);
00381   void fwdSharedTrans(PmeTransMsg *);
00382   void recvSharedTrans(PmeSharedTransMsg *);
00383   void sendDataHelper(int);
00384   void sendPencilsHelper(int);
00385   void recvTrans(PmeTransMsg *);
00386   void procTrans(PmeTransMsg *);
00387   void gridCalc2(void);
00388   #ifdef OPENATOM_VERSION
00389   void gridCalc2Moa(void);
00390   #endif // OPENATOM_VERSION
00391   void gridCalc2R(void);
00392   void fwdSharedUntrans(PmeUntransMsg *);
00393   void recvSharedUntrans(PmeSharedUntransMsg *);
00394   void sendUntrans(void);
00395   void sendUntransSubset(int first, int last);
00396   void recvUntrans(PmeUntransMsg *);
00397   void procUntrans(PmeUntransMsg *);
00398   void gridCalc3(void);
00399   void sendUngrid(void);
00400   void sendUngridSubset(int first, int last);
00401   void recvUngrid(PmeGridMsg *);
00402   void recvAck(PmeAckMsg *);
00403   void copyResults(PmeGridMsg *);
00404   void copyPencils(PmeGridMsg *);
00405   void ungridCalc(void);
00406   void recvRecipEvir(PmeEvirMsg *);
00407   void addRecipEvirClient(void);
00408   void submitReductions();
00409 
00410 #if 0 && USE_PERSISTENT
00411   void setup_recvgrid_persistent();
00412 #endif
00413 
// Node-wide lock serializing FFTW planner calls (the planner is not
// thread-safe); also reused below to guard the pencil hashtables.
00414   static CmiNodeLock fftw_plan_lock;
00415   CmiNodeLock pmemgr_lock;  // for accessing this object from other threads
00416 
00417 #ifdef NAMD_CUDA
// Host/device buffers and bookkeeping for the CUDA-offloaded charge
// spreading and force interpolation.
00418   float *a_data_host;
00419   float *a_data_dev;
00420   float *f_data_host;
00421   float *f_data_dev;
00422   int cuda_atoms_count;
00423   int cuda_atoms_alloc;
00424   static CmiNodeLock cuda_lock;
00425   void chargeGridSubmitted(Lattice &lattice, int sequence);
00426   cudaEvent_t end_charges;
00427   cudaEvent_t *end_forces;
00428   int forces_count;
00429   int forces_done_count;
00430   double charges_time;
00431   double forces_time;
00432   int check_charges_count;
00433   int check_forces_count;
00434   int master_pe;
00435   int this_pe;
00436 
00437   void cuda_submit_charges(Lattice &lattice, int sequence);
00438   struct cuda_submit_charges_args {
00439     ComputePmeMgr *mgr; Lattice *lattice; int sequence;
00440   };
// Node-wide queue of pending submissions while the device is busy.
00441   static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
00442   static bool cuda_busy;
00443 
00444   int chargeGridSubmittedCount;
00445   void sendChargeGridReady();
00446 #endif
00447   Lattice *saved_lattice;  // saved by chargeGridSubmitted
00448   int saved_sequence;      // saved by chargeGridSubmitted
00449   void pollChargeGridReady();
00450   void pollForcesReady();
00451   void recvChargeGridReady();
00452   void chargeGridReady(Lattice &lattice, int sequence);
00453 
// ComputePme objects registered on this PE (see addCompute/getComputes).
00454   ResizeArray<ComputePme*> pmeComputes;
00455 
00456 private:
00457 
00458 #if 0 && USE_PERSISTENT
00459   PersistentHandle   *recvGrid_handle;
00460 #endif
00461 
00462   CProxy_ComputePmeMgr pmeProxy;
00463   CProxy_ComputePmeMgr pmeProxyDir;
00464   CProxy_NodePmeMgr pmeNodeProxy;
00465   NodePmeMgr *nodePmeMgr;
00466   ComputePmeMgr *masterPmeMgr;
00467   
// Register a ComputePme; lazily triggers initialize_computes() on first use.
00468   void addCompute(ComputePme *c) {
00469     if ( ! pmeComputes.size() ) initialize_computes();
00470     pmeComputes.add(c);
00471     c->setMgr(this);
00472   }
00473 
00474   ResizeArray<ComputePme*> heldComputes;
00475   PmeGrid myGrid;
00476   Lattice lattice;
00477   PmeKSpace *myKSpace;
00478   float *qgrid;   // local charge-grid storage
00479   float *kgrid;   // local k-space storage
00480 
// FFT plan/work storage; type depends on which FFTW generation is built in.
00481 #ifdef NAMD_FFTW
00482 #ifdef NAMD_FFTW_3
00483   fftwf_plan *forward_plan_x, *backward_plan_x;
00484   fftwf_plan *forward_plan_yz, *backward_plan_yz;
00485   fftwf_complex *work;
00486 #else
00487   fftw_plan forward_plan_x, backward_plan_x;
00488   rfftwnd_plan forward_plan_yz, backward_plan_yz;
00489   fftw_complex *work;
00490 #endif
00491 #else
00492   float *work;
00493 #endif
00494 
00495   int qsize, fsize, bsize;
00496   int offload;  // nonzero when PME is offloaded to the GPU (simParams->PMEOffload)
00497   BigReal alchLambda;  // set on each step in ComputePme::ungridForces()
00498 
00499   float **q_arr;
00500   // q_list and q_count not used for offload
00501   float **q_list;
00502   int q_count;
00503   char *f_arr;
00504   char *fz_arr;
00505   PmeReduction evir[PME_MAX_EVALS];
00506   SubmitReduction *reduction;
00507 
00508   int noWorkCount;
00509   int doWorkCount;
00510   int ungridForcesCount;
00511 
00512 #ifdef NAMD_CUDA
00513 #define NUM_STREAMS 1
00514   cudaStream_t streams[NUM_STREAMS];
00515   int stream;
00516 
00517   float **q_arr_dev;
00518   float **v_arr_dev;
00519   float *q_data_host;
00520   float *q_data_dev;
00521   float *v_data_dev;
00522   int *ffz_host;
00523   int *ffz_dev;
00524   int q_data_size;
00525   int ffz_size;
00526 
00527   int f_data_mgr_alloc;
00528   float *f_data_mgr_host;
00529   float *f_data_mgr_dev;
00530   float **afn_host;
00531   float **afn_dev;
00532 
00533   float *bspline_coeffs_dev;
00534   float *bspline_dcoeffs_dev;
00535 #endif
00536   int recipEvirCount;   // used in compute only
00537   int recipEvirClients; // used in compute only
00538   int recipEvirPe;      // used in trans only
00539   
// Slab-decomposition layout tables (see ComputePmeMgr::initialize).
00540   LocalPmeInfo *localInfo;
00541   NodePmeInfo *gridNodeInfo;
00542   NodePmeInfo *transNodeInfo;
00543   int qgrid_size;
00544   int qgrid_start;
00545   int qgrid_len;
00546   int fgrid_start;
00547   int fgrid_len;
00548 
00549   int numSources;
00550   int numGridPes;
00551   int numTransPes;
00552   int numGridNodes;
00553   int numTransNodes;
00554   int numDestRecipPes;
00555   int myGridPe, myGridNode;
00556   int myTransPe, myTransNode;
00557   int *gridPeMap;
00558   int *transPeMap;
00559   int *recipPeDest;
00560   int *gridPeOrder;
00561   int *gridNodeOrder;
00562   int *transNodeOrder;
// Countdown counters for each phase of the per-step pipeline.
00563   int grid_count;
00564   int trans_count;
00565   int untrans_count;
00566   int ungrid_count;
00567   PmeGridMsg **gridmsg_reuse;  // received grid messages kept for reply reuse
00568   PmeReduction recip_evir2[PME_MAX_EVALS];
00569 
00570   int compute_sequence;  // set from patch computes, used for priorities
00571   int grid_sequence;  // set from grid messages, used for priorities
00572   int useBarrier;
00573   int sendTransBarrier_received;
00574 
// Pencil-decomposition state (used when usePencils is set in initialize()).
00575   int usePencils;
00576   int xBlocks, yBlocks, zBlocks;
00577   CProxy_PmeXPencil xPencil;
00578   CProxy_PmeYPencil yPencil;
00579   CProxy_PmeZPencil zPencil;
00580   char *pencilActive;
00581   ijpair *activePencils;
00582   int numPencilsActive;
00583   int strayChargeErrors;
00584 };
00585 
// Free-function accessor exposing a manager's registered ComputePme list
// (pmeComputes is public, but this keeps external code decoupled).
00586 ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
00587     return mgr->pmeComputes ;
00588 }
00589 
// Out-of-class definitions of ComputePmeMgr's node-wide static members.
00590   CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00591 #ifdef NAMD_CUDA
00592   CmiNodeLock ComputePmeMgr::cuda_lock;
00593   std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
00594   bool ComputePmeMgr::cuda_busy;
00595 #endif
00596 
00597 int isPmeProcessor(int p){ 
00598   SimParameters *simParams = Node::Object()->simParameters;
00599   if (simParams->usePMECUDA) {
00600     return 0;
00601   } else {
00602     return pencilPMEProcessors[p];
00603   }
00604 }
00605 
// Node-level group coordinating the per-PE ComputePmeMgr instances on one
// SMP node: routes shared trans/untrans messages, tracks local pencil
// objects, and (CUDA) forwards work to the node's master manager.
00606 class NodePmeMgr : public CBase_NodePmeMgr {
00607 public:
00608   friend class ComputePmeMgr;
00609   friend class ComputePme;
00610   NodePmeMgr();
00611   ~NodePmeMgr();
00612   void initialize();
00613   void sendDataHelper(int);
00614   void sendPencilsHelper(int);
00615   void recvTrans(PmeTransMsg *);
00616   void recvUntrans(PmeUntransMsg *);
// Pencil objects register themselves here so node-local delivery can
// bypass the Charm++ array manager.
00617   void registerXPencil(CkArrayIndex3D, PmeXPencil *);
00618   void registerYPencil(CkArrayIndex3D, PmeYPencil *);
00619   void registerZPencil(CkArrayIndex3D, PmeZPencil *);
00620   void recvXTrans(PmeTransMsg *);
00621   void recvYTrans(PmeTransMsg *);
00622   void recvYUntrans(PmeUntransMsg *);
00623   void recvZGrid(PmeGridMsg *);
00624   void recvZUntrans(PmeUntransMsg *);
00625 
00626   void recvUngrid(PmeGridMsg *);
00627 
00628   void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
00629       xm=_xm; ym=_ym; zm=_zm;
00630   }
00631   CProxy_PmePencilMap xm;
00632   CProxy_PmePencilMap ym;
00633   CProxy_PmePencilMap zm;
00634 
00635 private:
00636   CProxy_ComputePmeMgr mgrProxy;
00637   ComputePmeMgr *mgrObject;    // rank-0 PE's manager (set in initialize())
00638   ComputePmeMgr **mgrObjects;  // one manager pointer per rank on this node
00639 #ifdef NAMD_CUDA
00640   ComputePmeMgr *masterPmeMgr;
00641   int master_pe;
00642 #endif
00643   CProxy_PmeXPencil xPencil;
00644   CProxy_PmeYPencil yPencil;
00645   CProxy_PmeZPencil zPencil;
// Node-local pencil lookup tables, guarded by ComputePmeMgr::fftw_plan_lock.
00646   CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
00647   CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
00648   CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;  
00649 
00650 #ifdef NAMD_CUDA
00651   cudaEvent_t end_charge_memset;
00652   cudaEvent_t end_all_pme_kernels;
00653   cudaEvent_t end_potential_memcpy;
00654 #endif
00655 };
00656 
// Allocate one manager-pointer slot per PE (rank) on this node; slots are
// filled later by each rank's initialize().
00657 NodePmeMgr::NodePmeMgr() {
00658   mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
00659 }
00660 
// Release the per-rank manager-pointer table (pointers are not owned).
00661 NodePmeMgr::~NodePmeMgr() {
00662   delete [] mgrObjects;
00663 }
00664 
// Called once per rank: record this rank's ComputePmeMgr branch; rank 0
// additionally caches the proxy and its own branch as the node default.
00665 void NodePmeMgr::initialize() {
00666   CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
00667   mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
00668   if ( CkMyRank() == 0 ) {
00669     mgrProxy = proxy;
00670     mgrObject = proxy.ckLocalBranch();
00671   }
00672 }
00673 
// Node-level entry: forward a trans message for fan-out to the ranks
// sharing it on this node.
00674 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
00675   mgrObject->fwdSharedTrans(msg);
00676 }
00677 
// Node-level entry: forward an untrans message for fan-out to the ranks
// sharing it on this node.
00678 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
00679   mgrObject->fwdSharedUntrans(msg);
00680 }
00681 
// Node-level entry for returned (ungrid) data; only meaningful in CUDA
// builds, where the node's master manager consumes it.
00682 void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
00683 #ifdef NAMD_CUDA
00684   masterPmeMgr->recvUngrid(msg);
00685 #else
00686   NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
00687 #endif
00688 }
00689 
// Record a local X pencil in the node table; fftw_plan_lock doubles as the
// hashtable guard since registration happens during planning.
00690 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
00691 {
00692   CmiLock(ComputePmeMgr::fftw_plan_lock);
00693   xPencilObj.put(idx)=obj;
00694   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00695 }
// Record a local Y pencil in the node table (same locking as X).
00696 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
00697 {
00698   CmiLock(ComputePmeMgr::fftw_plan_lock);
00699   yPencilObj.put(idx)=obj;
00700   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00701 }
// Record a local Z pencil in the node table (same locking as X).
00702 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
00703 {
00704   CmiLock(ComputePmeMgr::fftw_plan_lock);
00705   zPencilObj.put(idx)=obj;
00706   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00707 }
00708 
// Per-PE constructor: registers this branch with the node manager, creates
// locks (node-wide ones on rank 0 only), zeroes per-step state, and in CUDA
// builds creates streams/events and registers trace events.
00709 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
00710                                  pmeProxyDir(thisgroup) {
00711 
00712   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00713   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00714   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00715 
00716   pmeNodeProxy.ckLocalBranch()->initialize();
00717 
// Node-wide lock is created exactly once per node, by rank 0.
00718   if ( CmiMyRank() == 0 ) {
00719     fftw_plan_lock = CmiCreateLock();
00720   }
00721   pmemgr_lock = CmiCreateLock();
00722 
00723   myKSpace = 0;
00724   kgrid = 0;
00725   work = 0;
00726   grid_count = 0;
00727   trans_count = 0;
00728   untrans_count = 0;
00729   ungrid_count = 0;
00730   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00731   useBarrier = 0;
00732   sendTransBarrier_received = 0;
00733   usePencils = 0;
00734 
00735 #ifdef NAMD_CUDA
00736  // offload has not been set so this happens on every run
00737   if ( CmiMyRank() == 0 ) {
00738     cuda_lock = CmiCreateLock();
00739   }
00740 
// Prefer high-priority streams when the runtime supports them (CUDA >= 5.5)
// so PME kernels are not starved by other device work.
00741 #if CUDA_VERSION >= 5050
00742   int leastPriority, greatestPriority;
00743   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00744   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00745   //if ( CkMyNode() == 0 ) {
00746   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00747   //}
00748 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00749 #else
00750 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00751 #endif
00752 
00753   stream = 0;
00754   for ( int i=0; i<NUM_STREAMS; ++i ) {
00755 #if 1
00756     CUDA_STREAM_CREATE(&streams[i]);
00757     cuda_errcheck("cudaStreamCreate");
00758 #else
00759   streams[i] = 0;  // XXXX Testing!!!
00760 #endif
00761   }
00762 
00763   this_pe = CkMyPe();
00764  
// Timing disabled on the event: it is used only for ordering/polling.
00765   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00766   end_forces = 0;
00767   check_charges_count = 0;
00768   check_forces_count = 0;
00769   chargeGridSubmittedCount = 0;
00770 
00771   cuda_atoms_count = 0;
00772   cuda_atoms_alloc = 0;
00773 
00774   f_data_mgr_alloc = 0;
00775   f_data_mgr_host = 0;
00776   f_data_mgr_dev = 0;
00777   afn_host = 0;
00778   afn_dev = 0;
00779 
// Projections trace-event ids for the CUDA PME phases.
00780 #define CUDA_EVENT_ID_PME_CHARGES 80
00781 #define CUDA_EVENT_ID_PME_FORCES 81
00782 #define CUDA_EVENT_ID_PME_TICK 82
00783 #define CUDA_EVENT_ID_PME_COPY 83
00784 #define CUDA_EVENT_ID_PME_KERNEL 84
00785   if ( 0 == CkMyPe() ) {
00786     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00787     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00788     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00789     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00790     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00791   }
00792 #endif
00793   recipEvirCount = 0;
00794   recipEvirClients = 0;
00795   recipEvirPe = -999;  // sentinel: set later by the trans-side code
00796 }
00797 
00798 
// Receive the pencil array proxies; rank 0 also publishes them to the
// node-level manager so node-local delivery can find them.
00799 void ComputePmeMgr::recvArrays(
00800         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00801   xPencil = x;  yPencil = y;  zPencil = z;
00802   
00803     if(CmiMyRank()==0)
00804     {
00805       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00806       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00807       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00808     }
00809 }
00810 
00811 #if USE_TOPO_SFC
// Integer 3-D coordinate used when ordering PEs along a space-filling curve.
struct Coord
{
  int x, y, z;
  Coord() : x(0), y(0), z(0) {}
  Coord(int cx, int cy, int cz) : x(cx), y(cy), z(cz) {}
};
00818   extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00819 
// Reorder procs to follow the space-filling-curve ordering in result:
// for each curve coordinate, append the PEs whose torus coordinates match.
// O(|result| * |procs|) scan; every PE is assumed to match exactly one
// coordinate so newprocs ends up a permutation of procs.
00820   void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00821   {
00822      SortableResizeArray<int> newprocs(procs.size());
00823      int num = 0;
00824      for (int i=0; i<result.size(); i++) {
00825        Coord &c = result[i];
00826        for (int j=0; j<procs.size(); j++) {
00827          int pe = procs[j];
00828          int x,y,z,t;
00829          tmgr.rankToCoordinates(pe, x, y, z, t);    
00830          if (x==c.x && y==c.y && z==c.z)
00831            newprocs[num++] = pe;
00832        }
00833      } 
00834      CmiAssert(newprocs.size() == procs.size());
00835      procs = newprocs;
00836   }
00837 
// Factor x as a*b with a the largest divisor of x not exceeding sqrt(x)
// (so b >= a), and return b.  For prime x (or x == 1) this degenerates to
// a = x, b = 1.  Used to split a grid dimension into two levels.
int find_level_grid(int x)
{
  int a = sqrt(x);
  int b;
  while ( a > 0 ) {
    if ( x % a == 0 ) break;   // found the largest divisor <= sqrt(x)
    --a;
  }
  if ( a == 1 ) a = x;         // prime (or 1): fall back to the trivial split
  b = x / a;
  //return a>b?a:b;
  return b;
}
00850   CmiNodeLock tmgr_lock;
00851 #endif
00852 
// Module init hook: create the node-wide TopoManager lock once per node
// (only needed when SFC-based topology ordering is compiled in).
00853 void Pme_init()
00854 {
00855 #if USE_TOPO_SFC
00856   if (CkMyRank() == 0) 
00857     tmgr_lock = CmiCreateLock();
00858 #endif
00859 }
00860 
00861 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00862   delete msg;
00863 
00864   localInfo = new LocalPmeInfo[CkNumPes()];
00865   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00866   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00867   gridPeMap = new int[CkNumPes()];
00868   transPeMap = new int[CkNumPes()];
00869   recipPeDest = new int[CkNumPes()];
00870   gridPeOrder = new int[CkNumPes()];
00871   gridNodeOrder = new int[CkNumNodes()];
00872   transNodeOrder = new int[CkNumNodes()];
00873 
00874   if (CkMyRank() == 0) {
00875     pencilPMEProcessors = new char [CkNumPes()];
00876     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00877   }
00878 
00879   SimParameters *simParams = Node::Object()->simParameters;
00880   PatchMap *patchMap = PatchMap::Object();
00881 
00882   offload = simParams->PMEOffload;
00883 #ifdef NAMD_CUDA
00884   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00885     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00886   }
00887   if ( offload ) {
00888     int dev;
00889     cudaGetDevice(&dev);
00890     cuda_errcheck("in cudaGetDevice");
00891     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00892     cudaDeviceProp deviceProp;
00893     cudaGetDeviceProperties(&deviceProp, dev);
00894     cuda_errcheck("in cudaGetDeviceProperties");
00895     if ( deviceProp.major < 2 )
00896       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00897   }
00898 #endif
00899 
00900   alchLambda = -1.;  // illegal value to catch if not updated
00901   useBarrier = simParams->PMEBarrier;
00902 
00903   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00904   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00905   else {
00906     int nrps = simParams->PMEProcessors;
00907     if ( nrps <= 0 ) nrps = CkNumPes();
00908     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00909     int dimx = simParams->PMEGridSizeX;
00910     int dimy = simParams->PMEGridSizeY;
00911     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00912     if ( maxslabs > nrps ) maxslabs = nrps;
00913     int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00914                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00915     if ( maxpencils > nrps ) maxpencils = nrps;
00916     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00917     else usePencils = 0;
00918   }
00919 
00920   if ( usePencils ) {
00921     int nrps = simParams->PMEProcessors;
00922     if ( nrps <= 0 ) nrps = CkNumPes();
00923     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00924     if ( simParams->PMEPencils > 1 &&
00925          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00926       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00927     } else {
00928       int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00929                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00930       if ( nb2 > nrps ) nb2 = nrps;
00931       if ( nb2 < 1 ) nb2 = 1;
00932       int nb = (int) sqrt((float)nb2);
00933       if ( nb < 1 ) nb = 1;
00934       xBlocks = zBlocks = nb;
00935       yBlocks = nb2 / nb;
00936     }
00937 
00938     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00939     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00940     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00941 
00942     int dimx = simParams->PMEGridSizeX;
00943     int bx = 1 + ( dimx - 1 ) / xBlocks;
00944     xBlocks = 1 + ( dimx - 1 ) / bx;
00945 
00946     int dimy = simParams->PMEGridSizeY;
00947     int by = 1 + ( dimy - 1 ) / yBlocks;
00948     yBlocks = 1 + ( dimy - 1 ) / by;
00949 
00950     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00951     int bz = 1 + ( dimz - 1 ) / zBlocks;
00952     zBlocks = 1 + ( dimz - 1 ) / bz;
00953 
00954     if ( xBlocks * yBlocks > CkNumPes() ) {
00955       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00956     }
00957     if ( xBlocks * zBlocks > CkNumPes() ) {
00958       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00959     }
00960     if ( yBlocks * zBlocks > CkNumPes() ) {
00961       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00962     }
00963 
00964     if ( ! CkMyPe() ) {
00965       iout << iINFO << "PME using " << xBlocks << " x " <<
00966         yBlocks << " x " << zBlocks <<
00967         " pencil grid for FFT and reciprocal sum.\n" << endi;
00968     }
00969   } else { // usePencils
00970 
00971   {  // decide how many pes to use for reciprocal sum
00972 
00973     // rules based on work available
00974     int minslices = simParams->PMEMinSlices;
00975     int dimx = simParams->PMEGridSizeX;
00976     int nrpx = ( dimx + minslices - 1 ) / minslices;
00977     int dimy = simParams->PMEGridSizeY;
00978     int nrpy = ( dimy + minslices - 1 ) / minslices;
00979 
00980     // rules based on processors available
00981     int nrpp = CkNumPes();
00982     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00983     if ( nrpp < nrpx ) nrpx = nrpp;
00984     if ( nrpp < nrpy ) nrpy = nrpp;
00985 
00986     // user override
00987     int nrps = simParams->PMEProcessors;
00988     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00989     if ( nrps > 0 ) nrpx = nrps;
00990     if ( nrps > 0 ) nrpy = nrps;
00991 
00992     // make sure there aren't any totally empty processors
00993     int bx = ( dimx + nrpx - 1 ) / nrpx;
00994     nrpx = ( dimx + bx - 1 ) / bx;
00995     int by = ( dimy + nrpy - 1 ) / nrpy;
00996     nrpy = ( dimy + by - 1 ) / by;
00997     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00998       NAMD_bug("Error in selecting number of PME processors.");
00999     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01000       NAMD_bug("Error in selecting number of PME processors.");
01001 
01002     numGridPes = nrpx;
01003     numTransPes = nrpy;
01004   }
01005   if ( ! CkMyPe() ) {
01006     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01007       " processors for FFT and reciprocal sum.\n" << endi;
01008   }
01009 
01010   int sum_npes = numTransPes + numGridPes;
01011   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01012 
01013 #if 0 // USE_TOPOMAP
01014   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01015   PatchMap * pmap = PatchMap::Object();
01016   
01017   int patch_pes = pmap->numNodesWithPatches();
01018   TopoManager tmgr;
01019   if(tmgr.hasMultipleProcsPerNode())
01020     patch_pes *= 2;
01021 
01022   bool done = false;
01023   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01024     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01025     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01026   }
01027   else 
01028     if(CkNumPes() > 2 *max_npes + patch_pes) {
01029       done = generateBGLORBPmePeList(transPeMap, max_npes);
01030       gridPeMap = transPeMap;
01031     }
01032 
01033   if (!done)
01034 #endif
01035     {
01036       //generatePmePeList(transPeMap, max_npes);
01037       //gridPeMap = transPeMap;
01038       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01039     }
01040   
01041   if ( ! CkMyPe() ) {
01042     iout << iINFO << "PME GRID LOCATIONS:";
01043     int i;
01044     for ( i=0; i<numGridPes && i<10; ++i ) {
01045       iout << " " << gridPeMap[i];
01046     }
01047     if ( i < numGridPes ) iout << " ...";
01048     iout << "\n" << endi;
01049     iout << iINFO << "PME TRANS LOCATIONS:";
01050     for ( i=0; i<numTransPes && i<10; ++i ) {
01051       iout << " " << transPeMap[i];
01052     }
01053     if ( i < numTransPes ) iout << " ...";
01054     iout << "\n" << endi;
01055   }
01056 
01057   // sort based on nodes and physical nodes
01058   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01059 
01060   myGridPe = -1;
01061   myGridNode = -1;
01062   int i = 0;
01063   int node = -1;
01064   int real_node = -1;
01065   for ( i=0; i<numGridPes; ++i ) {
01066     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01067     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01068     int real_node_i = CkNodeOf(gridPeMap[i]);
01069     if ( real_node_i == real_node ) {
01070       gridNodeInfo[node].npe += 1;
01071     } else {
01072       real_node = real_node_i;
01073       ++node;
01074       gridNodeInfo[node].real_node = real_node;
01075       gridNodeInfo[node].pe_start = i;
01076       gridNodeInfo[node].npe = 1;
01077     }
01078     if ( CkMyNode() == real_node_i ) myGridNode = node;
01079   }
01080   numGridNodes = node + 1;
01081   myTransPe = -1;
01082   myTransNode = -1;
01083   node = -1;
01084   real_node = -1;
01085   for ( i=0; i<numTransPes; ++i ) {
01086     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01087     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01088     int real_node_i = CkNodeOf(transPeMap[i]);
01089     if ( real_node_i == real_node ) {
01090       transNodeInfo[node].npe += 1;
01091     } else {
01092       real_node = real_node_i;
01093       ++node;
01094       transNodeInfo[node].real_node = real_node;
01095       transNodeInfo[node].pe_start = i;
01096       transNodeInfo[node].npe = 1;
01097     }
01098     if ( CkMyNode() == real_node_i ) myTransNode = node;
01099   }
01100   numTransNodes = node + 1;
01101 
01102   if ( ! CkMyPe() ) {
01103     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01104          << numTransNodes << " TRANS NODES\n" << endi;
01105   }
01106 
01107   { // generate random orderings for grid and trans messages
01108     int i;
01109     for ( i = 0; i < numGridPes; ++i ) {
01110       gridPeOrder[i] = i;
01111     }
01112     Random rand(CkMyPe());
01113     if ( myGridPe < 0 ) {
01114       rand.reorder(gridPeOrder,numGridPes);
01115     } else {  // self last
01116       gridPeOrder[myGridPe] = numGridPes-1;
01117       gridPeOrder[numGridPes-1] = myGridPe;
01118       rand.reorder(gridPeOrder,numGridPes-1);
01119     } 
01120     for ( i = 0; i < numGridNodes; ++i ) {
01121       gridNodeOrder[i] = i;
01122     }
01123     if ( myGridNode < 0 ) {
01124       rand.reorder(gridNodeOrder,numGridNodes);
01125     } else {  // self last
01126       gridNodeOrder[myGridNode] = numGridNodes-1;
01127       gridNodeOrder[numGridNodes-1] = myGridNode;
01128       rand.reorder(gridNodeOrder,numGridNodes-1);
01129     }
01130     for ( i = 0; i < numTransNodes; ++i ) {
01131       transNodeOrder[i] = i;
01132     }
01133     if ( myTransNode < 0 ) {
01134       rand.reorder(transNodeOrder,numTransNodes);
01135     } else {  // self last
01136       transNodeOrder[myTransNode] = numTransNodes-1;
01137       transNodeOrder[numTransNodes-1] = myTransNode;
01138       rand.reorder(transNodeOrder,numTransNodes-1);
01139     }
01140   }
01141   
01142   } // ! usePencils
01143 
01144   myGrid.K1 = simParams->PMEGridSizeX;
01145   myGrid.K2 = simParams->PMEGridSizeY;
01146   myGrid.K3 = simParams->PMEGridSizeZ;
01147   myGrid.order = simParams->PMEInterpOrder;
01148   myGrid.dim2 = myGrid.K2;
01149   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01150 
01151   if ( ! usePencils ) {
01152     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01153     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01154     myGrid.block3 = myGrid.dim3 / 2;  // complex
01155   }
01156 
01157   if ( usePencils ) {
01158     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01159     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01160     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01161 
01162 
01163       int pe = 0;
01164       int x,y,z;
01165 
01166                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01167                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01168                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01169       
01170                 // decide which pes to use by bit reversal and patch use
01171                 int i;
01172                 int ncpus = CkNumPes();
01173                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01174                 PatchMap *pmap = PatchMap::Object();
01175                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01176                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01177                         if ( ri ) { // keep 0 for special case
01178                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
01179                                 else nopatches.add(ri);
01180                         }
01181                 }
01182 
01183 #if USE_RANDOM_TOPO
01184             Random rand(CkMyPe());
01185             int *tmp = new int[patches.size()];
01186             int nn = patches.size();
01187             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01188             rand.reorder(tmp, nn);
01189             patches.resize(0);
01190             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01191             delete [] tmp;
01192             tmp = new int[nopatches.size()];
01193             nn = nopatches.size();
01194             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01195             rand.reorder(tmp, nn);
01196             nopatches.resize(0);
01197             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01198             delete [] tmp;
01199 #endif
01200 
01201                 // only use zero if it eliminates overloading or has patches
01202                 int useZero = 0;
01203                 int npens = xBlocks*yBlocks;
01204                 if ( npens % ncpus == 0 ) useZero = 1;
01205                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01206                 npens += xBlocks*zBlocks;
01207                 if ( npens % ncpus == 0 ) useZero = 1;
01208                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01209                 npens += yBlocks*zBlocks;
01210                 if ( npens % ncpus == 0 ) useZero = 1;
01211                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01212 
01213                 // add nopatches then patches in reversed order
01214                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01215                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01216                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01217                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01218   
01219                 int npes = pmeprocs.size();
01220                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01221                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01222 #if !USE_RANDOM_TOPO
01223                 zprocs.sort();
01224 #endif
01225                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01226                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01227 #if !USE_RANDOM_TOPO
01228                 yprocs.sort();
01229 #endif
01230       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01231       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01232 #if !USE_RANDOM_TOPO
01233       xprocs.sort();
01234 #endif
01235 
01236 #if USE_TOPO_SFC
01237   CmiLock(tmgr_lock);
01238   //{
01239   TopoManager tmgr;
01240   int xdim = tmgr.getDimNX();
01241   int ydim = tmgr.getDimNY();
01242   int zdim = tmgr.getDimNZ();
01243   int xdim1 = find_level_grid(xdim);
01244   int ydim1 = find_level_grid(ydim);
01245   int zdim1 = find_level_grid(zdim);
01246   if(CkMyPe() == 0)
01247       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01248 
01249   vector<Coord> result;
01250   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01251   sort_sfc(xprocs, tmgr, result);
01252   sort_sfc(yprocs, tmgr, result);
01253   sort_sfc(zprocs, tmgr, result);
01254   //}
01255   CmiUnlock(tmgr_lock);
01256 #endif
01257 
01258 
01259                 if(CkMyPe() == 0){  
01260               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01261           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01262 #if USE_TOPO_SFC
01263               int x,y,z,t;
01264               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01265               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01266 #else
01267               iout << " " << zprocs[i];
01268 #endif
01269           }
01270           if ( i < zprocs.size() ) iout << " ...";
01271               iout << "\n" << endi;
01272                 }
01273 
01274     if (CkMyRank() == 0) {
01275       for (pe=0, x = 0; x < xBlocks; ++x)
01276         for (y = 0; y < yBlocks; ++y, ++pe ) {
01277           pencilPMEProcessors[zprocs[pe]] = 1;
01278         }
01279     }
01280      
01281                 if(CkMyPe() == 0){  
01282               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01283           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01284 #if USE_TOPO_SFC
01285               int x,y,z,t;
01286               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01287               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01288 #else
01289               iout << " " << yprocs[i];
01290 #endif
01291           }
01292           if ( i < yprocs.size() ) iout << " ...";
01293               iout << "\n" << endi;
01294                 }
01295 
01296     if (CkMyRank() == 0) {
01297       for (pe=0, z = 0; z < zBlocks; ++z )
01298         for (x = 0; x < xBlocks; ++x, ++pe ) {
01299           pencilPMEProcessors[yprocs[pe]] = 1;
01300         }
01301     }
01302     
01303                 if(CkMyPe() == 0){  
01304                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01305                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01306 #if USE_TOPO_SFC
01307                 int x,y,z,t;
01308                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01309                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01310 #else
01311                 iout << " " << xprocs[i];
01312 #endif
01313             }
01314                 if ( i < xprocs.size() ) iout << " ...";
01315                 iout << "\n" << endi;
01316                 }
01317 
01318     if (CkMyRank() == 0) {
01319       for (pe=0, y = 0; y < yBlocks; ++y )      
01320         for (z = 0; z < zBlocks; ++z, ++pe ) {
01321           pencilPMEProcessors[xprocs[pe]] = 1;
01322         }
01323     }
01324         
01325 
01326         // creating the pencil arrays
01327         if ( CkMyPe() == 0 ){
01328 #if !USE_RANDOM_TOPO
01329         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01330         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01331         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01332         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01333 #endif
01334 #if 1
01335         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01336         CProxy_PmePencilMap ym;
01337         if ( simParams->PMEPencilsYLayout )
01338           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01339         else
01340           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01341         CProxy_PmePencilMap xm;
01342         if ( simParams->PMEPencilsXLayout )
01343           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01344         else
01345           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01346         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01347         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01348         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01349         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01350         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01351         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01352         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01353         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01354         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01355         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01356 #else
01357         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01358         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01359         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01360 
01361                 for (pe=0, x = 0; x < xBlocks; ++x)
01362                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01363                                 zPencil(x,y,0).insert(zprocs[pe]);
01364                         }
01365         zPencil.doneInserting();
01366 
01367                 for (pe=0, x = 0; x < xBlocks; ++x)
01368                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01369                                 yPencil(x,0,z).insert(yprocs[pe]);
01370                         }
01371         yPencil.doneInserting();
01372 
01373 
01374                 for (pe=0, y = 0; y < yBlocks; ++y )    
01375                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01376                                 xPencil(0,y,z).insert(xprocs[pe]);
01377                         }
01378                 xPencil.doneInserting();     
01379 #endif
01380 
01381                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01382                 PmePencilInitMsgData msgdata;
01383                 msgdata.grid = myGrid;
01384                 msgdata.xBlocks = xBlocks;
01385                 msgdata.yBlocks = yBlocks;
01386                 msgdata.zBlocks = zBlocks;
01387                 msgdata.xPencil = xPencil;
01388                 msgdata.yPencil = yPencil;
01389                 msgdata.zPencil = zPencil;
01390                 msgdata.pmeProxy = pmeProxyDir;
01391         msgdata.pmeNodeProxy = pmeNodeProxy;
01392         msgdata.xm = xm;
01393         msgdata.ym = ym;
01394         msgdata.zm = zm;
01395                 xPencil.init(new PmePencilInitMsg(msgdata));
01396                 yPencil.init(new PmePencilInitMsg(msgdata));
01397                 zPencil.init(new PmePencilInitMsg(msgdata));
01398         }
01399 
01400     return;  // continue in initialize_pencils() at next startup stage
01401   }
01402 
01403 
01404   int pe;
01405   int nx = 0;
01406   for ( pe = 0; pe < numGridPes; ++pe ) {
01407     localInfo[pe].x_start = nx;
01408     nx += myGrid.block1;
01409     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01410     localInfo[pe].nx = nx - localInfo[pe].x_start;
01411   }
01412   int ny = 0;
01413   for ( pe = 0; pe < numTransPes; ++pe ) {
01414     localInfo[pe].y_start_after_transpose = ny;
01415     ny += myGrid.block2;
01416     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01417     localInfo[pe].ny_after_transpose =
01418                         ny - localInfo[pe].y_start_after_transpose;
01419   }
01420 
01421   {  // decide how many pes this node exchanges charges with
01422 
01423   PatchMap *patchMap = PatchMap::Object();
01424   Lattice lattice = simParams->lattice;
01425   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01426   BigReal cutoff = simParams->cutoff;
01427   BigReal patchdim = simParams->patchDimension;
01428   int numPatches = patchMap->numPatches();
01429   int numNodes = CkNumPes();
01430   int *source_flags = new int[numNodes];
01431   int node;
01432   for ( node=0; node<numNodes; ++node ) {
01433     source_flags[node] = 0;
01434     recipPeDest[node] = 0;
01435   }
01436 
01437   // // make sure that we don't get ahead of ourselves on this node
01438   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01439   //   source_flags[CkMyPe()] = 1;
01440   //   recipPeDest[myRecipPe] = 1;
01441   // }
01442 
01443   for ( int pid=0; pid < numPatches; ++pid ) {
01444     int pnode = patchMap->node(pid);
01445 #ifdef NAMD_CUDA
01446     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01447 #endif
01448     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01449     BigReal minx = patchMap->min_a(pid);
01450     BigReal maxx = patchMap->max_a(pid);
01451     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01452     // min1 (max1) is smallest (largest) grid line for this patch
01453     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01454     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01455     for ( int i=min1; i<=max1; ++i ) {
01456       int ix = i;
01457       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01458       while ( ix < 0 ) ix += myGrid.K1;
01459       // set source_flags[pnode] if this patch sends to our node
01460       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01461            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01462         source_flags[pnode] = 1;
01463       }
01464       // set dest_flags[] for node that our patch sends to
01465 #ifdef NAMD_CUDA
01466       if ( offload ) {
01467         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01468           recipPeDest[ix / myGrid.block1] = 1;
01469         }
01470       } else
01471 #endif
01472       if ( pnode == CkMyPe() ) {
01473         recipPeDest[ix / myGrid.block1] = 1;
01474       }
01475     }
01476   }
01477 
01478   int numSourcesSamePhysicalNode = 0;
01479   numSources = 0;
01480   numDestRecipPes = 0;
01481   for ( node=0; node<numNodes; ++node ) {
01482     if ( source_flags[node] ) ++numSources;
01483     if ( recipPeDest[node] ) ++numDestRecipPes;
01484     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01485   }
01486 
01487 #if 0
01488   if ( numSources ) {
01489     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01490             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01491     iout << iINFO << "PME " << CkMyPe() << " sources:";
01492     for ( node=0; node<numNodes; ++node ) {
01493       if ( source_flags[node] ) iout << " " << node;
01494     }
01495     iout << "\n" << endi;
01496   }
01497 #endif
01498 
01499   delete [] source_flags;
01500 
01501   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01502   //           CkMyPe(), numSources, numDestRecipPes);
01503 
01504   }  // decide how many pes this node exchanges charges with (end)
01505 
01506   ungrid_count = numDestRecipPes;
01507 
01508   sendTransBarrier_received = 0;
01509 
01510   if ( myGridPe < 0 && myTransPe < 0 ) return;
01511   // the following only for nodes doing reciprocal sum
01512 
01513   if ( myTransPe >= 0 ) {
01514     recipEvirPe = findRecipEvirPe();
01515     pmeProxy[recipEvirPe].addRecipEvirClient();
01516   }
01517 
01518   if ( myTransPe >= 0 ) {
01519       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01520       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01521       #ifdef OPENATOM_VERSION
01522       if ( simParams->openatomOn ) { 
01523         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01524         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01525       } else {
01526         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01527       }
01528       #else  // OPENATOM_VERSION
01529       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01530       #endif // OPENATOM_VERSION
01531   }
01532 
01533   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01534   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01535   if ( local_size < local_size_2 ) local_size = local_size_2;
01536   qgrid = new float[local_size*numGrids];
01537   if ( numGridPes > 1 || numTransPes > 1 ) {
01538     kgrid = new float[local_size*numGrids];
01539   } else {
01540     kgrid = qgrid;
01541   }
01542   qgrid_size = local_size;
01543 
01544   if ( myGridPe >= 0 ) {
01545   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01546   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01547   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01548   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01549   }
01550 
01551   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01552 #ifdef NAMD_FFTW
01553   CmiLock(fftw_plan_lock);
01554 #ifdef NAMD_FFTW_3
01555   work = new fftwf_complex[n[0]];
01556   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01557   if ( myGridPe >= 0 ) {
01558     forward_plan_yz=new fftwf_plan[numGrids];
01559     backward_plan_yz=new fftwf_plan[numGrids];
01560   }
01561   if ( myTransPe >= 0 ) {
01562     forward_plan_x=new fftwf_plan[numGrids];
01563     backward_plan_x=new fftwf_plan[numGrids];
01564   }
01565   /* need one plan per grid */
01566   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01567   if ( myGridPe >= 0 ) {
01568     for( int g=0; g<numGrids; g++)
01569       {
01570         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01571                                                      localInfo[myGridPe].nx,
01572                                                      qgrid + qgrid_size * g,
01573                                                      NULL,
01574                                                      1,
01575                                                      myGrid.dim2 * myGrid.dim3,
01576                                                      (fftwf_complex *) 
01577                                                      (qgrid + qgrid_size * g),
01578                                                      NULL,
01579                                                      1,
01580                                                      myGrid.dim2 * (myGrid.dim3/2),
01581                                                      fftwFlags);
01582       }
01583   }
01584   int zdim = myGrid.dim3;
01585   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01586   if ( ! CkMyPe() ) iout << " 2..." << endi;
01587   if ( myTransPe >= 0 ) {
01588     for( int g=0; g<numGrids; g++)
01589       {
01590 
01591         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01592                                                 (fftwf_complex *)
01593                                                 (kgrid+qgrid_size*g),
01594                                                 NULL,
01595                                                 xStride,
01596                                                 1,
01597                                                 (fftwf_complex *)
01598                                                 (kgrid+qgrid_size*g),
01599                                                 NULL,
01600                                                 xStride,
01601                                                 1,
01602                                                 FFTW_FORWARD,fftwFlags);
01603         
01604       }
01605   }
01606   if ( ! CkMyPe() ) iout << " 3..." << endi;
01607   if ( myTransPe >= 0 ) {
01608     for( int g=0; g<numGrids; g++)
01609       {
01610         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01611                                                  (fftwf_complex *)
01612                                                  (kgrid+qgrid_size*g),
01613                                                  NULL,
01614                                                  xStride,
01615                                                  1,
01616                                                  (fftwf_complex *)
01617                                                  (kgrid+qgrid_size*g),
01618                                                  NULL,
01619                                                  xStride,
01620                                                  1,
01621                                                  FFTW_BACKWARD, fftwFlags);
01622 
01623       }
01624   }
01625   if ( ! CkMyPe() ) iout << " 4..." << endi;
01626   if ( myGridPe >= 0 ) {
01627     for( int g=0; g<numGrids; g++)
01628       {
01629         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01630                                                       localInfo[myGridPe].nx,
01631                                                       (fftwf_complex *)
01632                                                       (qgrid + qgrid_size * g),
01633                                                       NULL,
01634                                                       1,
01635                                                       myGrid.dim2*(myGrid.dim3/2),
01636                                                       qgrid + qgrid_size * g,
01637                                                       NULL,
01638                                                       1,
01639                                                       myGrid.dim2 * myGrid.dim3,
01640                                                       fftwFlags);
01641       }
01642   }
01643   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01644 
01645 #else
01646   work = new fftw_complex[n[0]];
01647 
01648   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01649   if ( myGridPe >= 0 ) {
01650   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01651         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01652         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01653   }
01654   if ( ! CkMyPe() ) iout << " 2..." << endi;
01655   if ( myTransPe >= 0 ) {
01656       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01657         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01658         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01659         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01660   }
01661   if ( ! CkMyPe() ) iout << " 3..." << endi;
01662   if ( myTransPe >= 0 ) {
01663   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01664         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01665         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01666         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01667   }
01668   if ( ! CkMyPe() ) iout << " 4..." << endi;
01669   if ( myGridPe >= 0 ) {
01670   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01671         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01672         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01673   }
01674   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01675 #endif
01676   CmiUnlock(fftw_plan_lock);
01677 #else
01678   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01679 #endif
01680 
01681   if ( myGridPe >= 0 && numSources == 0 )
01682                 NAMD_bug("PME grid elements exist without sources.");
01683   grid_count = numSources;
01684   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01685   trans_count = numGridPes;
01686 }
01687 
01688 
01689 
// Quiescence-stage setup for the pencil decomposition: determine which
// z-pencils this PE (or, with CUDA offload, this node) contributes charge
// grid data to, register with each of them via dummyRecvGrid(), and build
// the send list (activePencils).  No-op for the slab decomposition.
void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  delete msg;  // quiescence-detection message carries no payload
  if ( ! usePencils ) return;

  SimParameters *simParams = Node::Object()->simParameters;

  PatchMap *patchMap = PatchMap::Object();
  Lattice lattice = simParams->lattice;
  // system extents along the a and b cell vectors (used to convert the
  // patch margin from length units to reduced coordinates)
  BigReal sysdima = lattice.a_r().unit() * lattice.a();
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  int numPatches = patchMap->numPatches();

  // pencilActive[i*yBlocks+j] is set iff some patch of interest can
  // spread charge onto the (i,j) z-pencil's block of the grid
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }

  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#ifdef NAMD_CUDA
    if ( offload ) {
      // offload path: consider every patch on this node, not just this PE
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;  // non-offload: only our own patches

    // shifts recenter fractional coordinates onto the K1 x K2 grid
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;

    BigReal minx = patchMap->min_a(pid);
    BigReal maxx = patchMap->max_a(pid);
    // margin allows for atoms drifting up to (patchdim-cutoff)/2 beyond
    // the patch boundary between migrations (NOTE(review): presumed NAMD
    // margin convention -- confirm against patch migration criteria)
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // min1 (max1) is smallest (largest) grid line for this patch
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;

    BigReal miny = patchMap->min_b(pid);
    BigReal maxy = patchMap->max_b(pid);
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // min2 (max2) is smallest (largest) grid line for this patch
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    // mark every pencil whose (x,y) block overlaps the patch's grid
    // footprint, wrapping indices periodically into [0,K1) x [0,K2)
    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }

  // first pass: count active pencils and announce ourselves to each one
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#ifdef NAMD_CUDA
        // with offload only the device master PE registers for the node
        if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  // second pass: record the (i,j) coordinates of each active pencil
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  // choose send order: deterministic bit-reversed order, or a random
  // per-PE shuffle to spread message arrival
  if ( simParams->PMESendOrder ) {
    std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
  } else {
    Random rand(CkMyPe());
    rand.reorder(activePencils,numPencilsActive);
  }
  //if ( numPencilsActive ) {
  //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
  //}

  // expect one ungrid message back per pencil we send to
  ungrid_count = numPencilsActive;
}
01783 
01784 
01785 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01786   if ( ! usePencils ) return;
01787   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01788 }
01789 
01790 
// Releases all per-PE PME resources.  Note the asymmetries: fftw_plan_lock
// is node-wide and destroyed only by rank 0, kgrid may alias qgrid (slab
// decomposition without a separate transpose buffer), and q_list/fz_arr are
// host-owned only when not offloading to the GPU.
01791 ComputePmeMgr::~ComputePmeMgr() {
01792 
01793   if ( CmiMyRank() == 0 ) {
        // node-shared lock: exactly one rank must destroy it
01794     CmiDestroyLock(fftw_plan_lock);
01795   }
01796   CmiDestroyLock(pmemgr_lock);
01797 
01798   delete myKSpace;
01799   delete [] localInfo;
01800   delete [] gridNodeInfo;
01801   delete [] transNodeInfo;
01802   delete [] gridPeMap;
01803   delete [] transPeMap;
01804   delete [] recipPeDest;
01805   delete [] gridPeOrder;
01806   delete [] gridNodeOrder;
01807   delete [] transNodeOrder;
01808   delete [] qgrid;
      // kgrid aliases qgrid in some configurations; avoid double free
01809   if ( kgrid != qgrid ) delete [] kgrid;
01810   delete [] work;
01811   delete [] gridmsg_reuse;
01812 
      // when offloading, these buffers are CUDA-managed elsewhere
01813  if ( ! offload ) {
01814   for (int i=0; i<q_count; ++i) {
01815     delete [] q_list[i];
01816   }
01817   delete [] q_list;
01818   delete [] fz_arr;
01819  }
01820   delete [] f_arr;
01821   delete [] q_arr;
01822 }
01823 
// Slab-decomposition path: accumulate one source's sparse charge-grid
// contribution into this grid PE's qgrid.  Messages are compressed: fgrid
// flags which (x,y) columns are present and zlist gives the z indices, so
// qmsg is consumed strictly in that order.  The message itself is kept in
// gridmsg_reuse so sendUngridSubset can return forces in the same layout.
01824 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01825   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01826   if ( grid_count == 0 ) {
01827     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01828   }
      // first message of the step carries the authoritative lattice/sequence
01829   if ( grid_count == numSources ) {
01830     lattice = msg->lattice;
01831     grid_sequence = msg->sequence;
01832   }
01833 
01834   int zdim = myGrid.dim3;
01835   int zlistlen = msg->zlistlen;
01836   int *zlist = msg->zlist;
01837   float *qmsg = msg->qgrid;
01838   for ( int g=0; g<numGrids; ++g ) {
01839     char *f = msg->fgrid + fgrid_len * g;
01840     float *q = qgrid + qgrid_size * g;
01841     for ( int i=0; i<fgrid_len; ++i ) {
          // only columns flagged in fgrid carry data in the message
01842       if ( f[i] ) {
01843         for ( int k=0; k<zlistlen; ++k ) {
01844           q[zlist[k]] += *(qmsg++);
01845         }
01846       }
01847       q += zdim;
01848     }
01849   }
01850 
      // retain message for reuse when sending results back (same sparsity)
01851   gridmsg_reuse[numSources-grid_count] = msg;
01852   --grid_count;
01853 
01854   if ( grid_count == 0 ) {
        // all sources received: start forward FFT in y,z
01855     pmeProxyDir[CkMyPe()].gridCalc1();
01856     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01857   }
01858 }
01859 #ifdef MANUAL_DEBUG_FFTW3
01860 
01861 /* utility functions for manual debugging */
/* Manual-debugging utility: dumps an xdim*ydim*zdim float matrix (z fastest)
 * to "<infilename>_<pe>.out", one "i j k value" line per element, preceded
 * by a "xdim ydim zdim" header. */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
{
  char filename[1000];
  // snprintf bounds the write and always NUL-terminates; the previous
  // strncpy/strncat/sprintf combination could leave fmt unterminated and
  // overflow filename when infilename approached the buffer size.
  snprintf(filename, sizeof(filename), "%s_%d.out", infilename, pe);
  FILE *loutfile = fopen(filename, "w");
  if ( ! loutfile ) return;  // debugging aid only: fail quietly, don't crash
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
  fclose(loutfile);

}
01882 
01883 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01884 {
01885   char fmt[1000];
01886   char filename[1000];
01887   strncpy(fmt,infilename,999);
01888   strncat(fmt,"_%d_%d_%d.out",999);
01889   sprintf(filename,fmt, x,y,z);
01890   FILE *loutfile = fopen(filename, "w");
01891   CkAssert(loutfile!=NULL);
01892   CkPrintf("opened %s for dump\n",filename);
01893   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01894   for(int i=0;i<xdim;i++)
01895     for(int j=0;j<ydim;j++)
01896       for(int k=0;k<zdim;k++)
01897         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01898   fclose(loutfile);
01899 }
01900 
01901 #endif
01902 
// First stage of the forward FFT (slab path): in-place real-to-complex
// transform over the y,z planes of this PE's slab, one grid at a time.
// With the barrier enabled, sendTrans is triggered globally from
// sendTransBarrier instead of locally here.
01903 void ComputePmeMgr::gridCalc1(void) {
01904   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01905 
01906 #ifdef NAMD_FFTW
01907   for ( int g=0; g<numGrids; ++g ) {
01908 #ifdef NAMD_FFTW_3
        // FFTW3: plan already bound to qgrid + qgrid_size*g
01909     fftwf_execute(forward_plan_yz[g]);
01910 #else
        // FFTW2: nx independent dim2 x dim3 transforms, in place
01911     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01912         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01913 #endif
01914 
01915   }
01916 #endif
01917 
01918   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01919 }
01920 
01921 void ComputePmeMgr::sendTransBarrier(void) {
01922   sendTransBarrier_received += 1;
01923   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01924   if ( sendTransBarrier_received < numGridPes ) return;
01925   sendTransBarrier_received = 0;
01926   for ( int i=0; i<numGridPes; ++i ) {
01927     pmeProxyDir[gridPeMap[i]].sendTrans();
01928   }
01929 }
01930 
01931 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
01932   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
01933   mgr->sendTransSubset(first, last);
01934 }
01935 
// Kicks off the grid->transpose data exchange.  Resets untrans_count for the
// reply phase, then distributes the per-destination-node sends either across
// the SMP node via CkLoop or serially on this PE.
01936 void ComputePmeMgr::sendTrans(void) {
01937 
      // expect one untrans reply per transpose PE in the next phase
01938   untrans_count = numTransPes;
01939 
01940 #if     CMK_SMP && USE_CKLOOP
01941   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01942   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
        // parallelize sends over destination nodes; no sync needed
01943     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01944   } else
01945 #endif
01946   {
01947     sendTransSubset(0, numTransNodes-1);
01948   }
01949 
01950 }
01951 
// Packs and sends this grid PE's slab data to the transpose nodes with
// indices [first,last] in transNodeOrder.  For the local node, data is
// copied directly into each co-resident manager's kgrid buffer (the message
// is still allocated and sent with nx=0 to carry metadata); for remote
// nodes, data is packed into the message body.
01952 void ComputePmeMgr::sendTransSubset(int first, int last) {
01953   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01954 
01955   // send data for transpose
01956   int zdim = myGrid.dim3;
01957   int nx = localInfo[myGridPe].nx;
01958   int x_start = localInfo[myGridPe].x_start;
01959   int slicelen = myGrid.K2 * zdim;
01960 
01961   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01962 
01963 #if CMK_BLUEGENEL
01964   CmiNetworkProgressAfter (0);
01965 #endif
01966 
01967   for (int j=first; j<=last; j++) {
01968     int node = transNodeOrder[j];  // different order on each node
01969     int pe = transNodeInfo[node].pe_start;
01970     int npe = transNodeInfo[node].npe;
        // message payload is only needed for remote nodes; local copies go
        // straight into the destination managers' kgrid
01971     int totlen = 0;
01972     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01973       LocalPmeInfo &li = localInfo[pe];
01974       int cpylen = li.ny_after_transpose * zdim;
01975       totlen += cpylen;
01976     }
01977     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01978                                 PRIORITY_SIZE) PmeTransMsg;
01979     newmsg->sourceNode = myGridPe;
01980     newmsg->lattice = lattice;
01981     newmsg->x_start = x_start;
01982     newmsg->nx = nx;
01983     for ( int g=0; g<numGrids; ++g ) {
01984       float *qmsg = newmsg->qgrid + nx * totlen * g;
01985       pe = transNodeInfo[node].pe_start;
01986       for (int i=0; i<npe; ++i, ++pe) {
01987         LocalPmeInfo &li = localInfo[pe];
01988         int cpylen = li.ny_after_transpose * zdim;
01989         if ( node == myTransNode ) {
            // local destination: write directly into the peer's kgrid
01990           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01991           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01992         }
01993         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
          // gather nx x-rows, each contributing this PE's y-range
01994         for ( int x = 0; x < nx; ++x ) {
01995           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01996           q += slicelen;
01997           qmsg += cpylen;
01998         }
01999       }
02000     }
02001     newmsg->sequence = grid_sequence;
02002     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
      // nx=0 marks "data already copied locally"; see procTrans
02003     if ( node == myTransNode ) newmsg->nx = 0;
02004     if ( npe > 1 ) {
02005       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02006       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02007     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02008   }
02009 }
02010 
// Fans a single PmeTransMsg out to every transpose PE on this node without
// copying the payload: each PE gets a small PmeSharedTransMsg pointing at
// the shared message, plus a heap refcount and lock so the last consumer
// can free it (see recvSharedTrans).
02011 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
02012   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02013   int pe = transNodeInfo[myTransNode].pe_start;
02014   int npe = transNodeInfo[myTransNode].npe;
      // shared ownership state, released by the last recvSharedTrans
02015   CmiNodeLock lock = CmiCreateLock();
02016   int *count = new int; *count = npe;
02017   for (int i=0; i<npe; ++i, ++pe) {
02018     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02019     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02020     shmsg->msg = msg;
02021     shmsg->count = count;
02022     shmsg->lock = lock;
02023     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02024   }
02025 }
02026 
02027 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
02028   procTrans(msg->msg);
02029   CmiLock(msg->lock);
02030   int count = --(*msg->count);
02031   CmiUnlock(msg->lock);
02032   if ( count == 0 ) {
02033     CmiDestroyLock(msg->lock);
02034     delete msg->count;
02035     delete msg->msg;
02036   }
02037   delete msg;
02038 }
02039 
02040 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
02041   procTrans(msg);
02042   delete msg;
02043 }
02044 
// Consumes one transposed slab contribution on a transpose PE.  A message
// with nx==0 is metadata-only (its data was copied locally by
// sendTransSubset).  When all numGridPes contributions have arrived, the
// x-dimension FFT phase (gridCalc2) is triggered.
02045 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
02046   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
      // first message of the step carries the authoritative lattice/sequence
02047   if ( trans_count == numGridPes ) {
02048     lattice = msg->lattice;
02049     grid_sequence = msg->sequence;
02050   }
02051 
02052  if ( msg->nx ) {
02053   int zdim = myGrid.dim3;
02054   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02055   int first_pe = nodeInfo.pe_start;
02056   int last_pe = first_pe+nodeInfo.npe-1;
      // shared messages hold the whole node's y-range; skip to our portion
02057   int y_skip = localInfo[myTransPe].y_start_after_transpose
02058              - localInfo[first_pe].y_start_after_transpose;
02059   int ny_msg = localInfo[last_pe].y_start_after_transpose
02060              + localInfo[last_pe].ny_after_transpose
02061              - localInfo[first_pe].y_start_after_transpose;
02062   int ny = localInfo[myTransPe].ny_after_transpose;
02063   int x_start = msg->x_start;
02064   int nx = msg->nx;
02065   for ( int g=0; g<numGrids; ++g ) {
        // copy this PE's y-slice of the sender's x-range into kgrid
02066     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02067         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02068         nx*ny*zdim*sizeof(float));
02069   }
02070  }
02071 
02072   --trans_count;
02073 
02074   if ( trans_count == 0 ) {
02075     pmeProxyDir[CkMyPe()].gridCalc2();
02076   }
02077 }
02078 
// Second stage of the forward FFT: complete the transform along x on the
// transposed data, then hand off either to the normal reciprocal-space
// computation (gridCalc2R) or, in OpenAtom builds, to the MOA exchange.
02079 void ComputePmeMgr::gridCalc2(void) {
02080   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02081 
02082 #if CMK_BLUEGENEL
02083   CmiNetworkProgressAfter (0);
02084 #endif
02085 
02086   int zdim = myGrid.dim3;
02087   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02088   int ny = localInfo[myTransPe].ny_after_transpose;
02089 
02090   for ( int g=0; g<numGrids; ++g ) {
02091     // finish forward FFT (x dimension)
02092 #ifdef NAMD_FFTW
02093 #ifdef NAMD_FFTW_3
02094     fftwf_execute(forward_plan_x[g]);
02095 #else
        // FFTW2: ny*zdim/2 interleaved complex 1-D transforms along x
02096     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02097         ny * zdim / 2, 1, work, 1, 0);
02098 #endif
02099 #endif
02100   }
02101 
02102 #ifdef OPENATOM_VERSION
02103     if ( ! simParams -> openatomOn ) { 
02104 #endif // OPENATOM_VERSION
02105       gridCalc2R();
02106 #ifdef OPENATOM_VERSION
02107     } else {
02108       gridCalc2Moa();
02109     }
02110 #endif // OPENATOM_VERSION
02111 }
02112 
02113 #ifdef OPENATOM_VERSION
// OpenAtom coupling path: instead of computing reciprocal-space energy
// locally, ship each grid's k-space data to the ComputeMoaMgr and resume at
// gridCalc2R via the supplied callback once it replies.
02114 void ComputePmeMgr::gridCalc2Moa(void) {
02115 
02116   int zdim = myGrid.dim3;
02117   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02118   int ny = localInfo[myTransPe].ny_after_transpose;
02119 
02120   SimParameters *simParams = Node::Object()->simParameters;
02121 
02122   CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
02123 
02124   for ( int g=0; g<numGrids; ++g ) {
02125     #ifdef OPENATOM_VERSION_DEBUG 
02126     CkPrintf("Sending recQ on processor %d \n", CkMyPe());
02127     for ( int i=0; i<=(ny * zdim / 2); ++i) 
02128     {
02129       CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
02130     }
02131     #endif // OPENATOM_VERSION_DEBUG
02132 //     mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
      // resumePme re-enters gridCalc2R when MOA finishes with this grid
02133     CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
02134     moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
02135   }
02136 }
02137 #endif // OPENATOM_VERSION
02138 
// Reciprocal-space core: for each grid, compute the Ewald reciprocal energy
// and virial (scaling kgrid in place with the smoothing/influence function),
// then start the backward FFT along x.  Finishes by scheduling sendUntrans.
02139 void ComputePmeMgr::gridCalc2R(void) {
02140 
      // K-space energy loop may itself be CkLoop-parallelized across the node
02141   int useCkLoop = 0;
02142 #if CMK_SMP && USE_CKLOOP
02143   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02144        && CkNumPes() >= 2 * numTransPes ) {
02145     useCkLoop = 1;
02146   }
02147 #endif
02148 
02149   int zdim = myGrid.dim3;
02150   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02151   int ny = localInfo[myTransPe].ny_after_transpose;
02152 
02153   for ( int g=0; g<numGrids; ++g ) {
02154     // reciprocal space portion of PME
02155     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
      // energy in [0], virial components follow; cached until sendUntrans
02156     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02157                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02158     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02159 
02160     // start backward FFT (x dimension)
02161 
02162 #ifdef NAMD_FFTW
02163 #ifdef NAMD_FFTW_3
02164     fftwf_execute(backward_plan_x[g]);
02165 #else
02166     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02167         ny * zdim / 2, 1, work, 1, 0);
02168 #endif
02169 #endif
02170   }
02171   
02172   pmeProxyDir[CkMyPe()].sendUntrans();
02173 }
02174 
02175 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
02176   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02177   mgr->sendUntransSubset(first, last);
02178 }
02179 
// Starts the reverse (untranspose) exchange.  First forwards the cached
// reciprocal energy/virial to the reduction PE as an urgent message, then
// distributes the per-grid-node sends via CkLoop or serially.
02180 void ComputePmeMgr::sendUntrans(void) {
02181 
      // reset for the next step's transpose phase
02182   trans_count = numGridPes;
02183 
02184   { // send energy and virial
02185     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02186     for ( int g=0; g<numGrids; ++g ) {
02187       newmsg->evir[g] = recip_evir2[g];
02188     }
02189     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02190     CmiEnableUrgentSend(1);
02191     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02192     CmiEnableUrgentSend(0);
02193   }
02194 
02195 #if     CMK_SMP && USE_CKLOOP
02196   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02197   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02198     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02199   } else
02200 #endif
02201   {
02202     sendUntransSubset(0, numGridNodes-1);
02203   }
02204 
02205 }
02206 
// Mirror of sendTransSubset for the reverse direction: packs this transpose
// PE's y-slice back to the grid nodes with indices [first,last] in
// gridNodeOrder.  Local-node destinations are scattered directly into the
// peer managers' qgrid (message sent with ny=0 as metadata); remote data is
// packed contiguously into the message.
02207 void ComputePmeMgr::sendUntransSubset(int first, int last) {
02208 
02209   int zdim = myGrid.dim3;
02210   int y_start = localInfo[myTransPe].y_start_after_transpose;
02211   int ny = localInfo[myTransPe].ny_after_transpose;
02212   int slicelen = myGrid.K2 * zdim;
02213 
02214   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02215 
02216 #if CMK_BLUEGENEL
02217   CmiNetworkProgressAfter (0);
02218 #endif
02219 
02220   // send data for reverse transpose
02221   for (int j=first; j<=last; j++) {
02222     int node = gridNodeOrder[j];  // different order on each node
02223     int pe = gridNodeInfo[node].pe_start;
02224     int npe = gridNodeInfo[node].npe;
        // payload size only needed for remote nodes (local goes direct)
02225     int totlen = 0;
02226     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02227       LocalPmeInfo &li = localInfo[pe];
02228       int cpylen = li.nx * zdim;
02229       totlen += cpylen;
02230     }
02231     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02232     newmsg->sourceNode = myTransPe;
02233     newmsg->y_start = y_start;
02234     newmsg->ny = ny;
02235     for ( int g=0; g<numGrids; ++g ) {
02236       float *qmsg = newmsg->qgrid + ny * totlen * g;
02237       pe = gridNodeInfo[node].pe_start;
02238       for (int i=0; i<npe; ++i, ++pe) {
02239         LocalPmeInfo &li = localInfo[pe];
02240         if ( node == myGridNode ) {
            // local destination: scatter rows straight into the peer's qgrid
02241           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02242           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02243           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02244           int cpylen = ny * zdim;
02245           for ( int x = 0; x < li.nx; ++x ) {
02246             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02247             q += cpylen;
02248             qmsg += slicelen;
02249           }
02250         } else {
            // remote destination: pack this PE's x-range contiguously
02251           CmiMemcpy((void*)qmsg,
02252                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02253                 li.nx*ny*zdim*sizeof(float));
02254           qmsg += li.nx*ny*zdim;
02255         }
02256       }
02257     }
02258     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
      // ny=0 marks "data already copied locally"; see procUntrans
02259     if ( node == myGridNode ) newmsg->ny = 0;
02260     if ( npe > 1 ) {
02261       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02262       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02263     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02264   }
02265 }
02266 
// Fans a single PmeUntransMsg out to every grid PE on this node without
// copying the payload; shared refcount/lock are freed by the last consumer
// in recvSharedUntrans.  (Unlike fwdSharedTrans, the wrapper message here is
// a plain new with no priority field.)
02267 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
02268   int pe = gridNodeInfo[myGridNode].pe_start;
02269   int npe = gridNodeInfo[myGridNode].npe;
02270   CmiNodeLock lock = CmiCreateLock();
02271   int *count = new int; *count = npe;
02272   for (int i=0; i<npe; ++i, ++pe) {
02273     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02274     shmsg->msg = msg;
02275     shmsg->count = count;
02276     shmsg->lock = lock;
02277     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02278   }
02279 }
02280 
02281 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
02282   procUntrans(msg->msg);
02283   CmiLock(msg->lock);
02284   int count = --(*msg->count);
02285   CmiUnlock(msg->lock);
02286   if ( count == 0 ) {
02287     CmiDestroyLock(msg->lock);
02288     delete msg->count;
02289     delete msg->msg;
02290   }
02291   delete msg;
02292 }
02293 
02294 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
02295   procUntrans(msg);
02296   delete msg;
02297 }
02298 
// Consumes one untransposed contribution back on a grid PE.  A message with
// ny==0 is metadata-only (its data was scattered locally by
// sendUntransSubset).  When all numTransPes contributions have arrived, the
// final backward FFT phase (gridCalc3) is triggered.
02299 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
02300   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02301 
02302 #if CMK_BLUEGENEL
02303   CmiNetworkProgressAfter (0);
02304 #endif
02305 
02306   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02307   int first_pe = nodeInfo.pe_start;
02308   int g;
02309 
02310  if ( msg->ny ) {
02311   int zdim = myGrid.dim3;
02312   int last_pe = first_pe+nodeInfo.npe-1;
      // shared messages hold the whole node's x-range; skip to our portion
02313   int x_skip = localInfo[myGridPe].x_start
02314              - localInfo[first_pe].x_start;
02315   int nx_msg = localInfo[last_pe].x_start
02316              + localInfo[last_pe].nx
02317              - localInfo[first_pe].x_start;
02318   int nx = localInfo[myGridPe].nx;
02319   int y_start = msg->y_start;
02320   int ny = msg->ny;
02321   int slicelen = myGrid.K2 * zdim;
02322   int cpylen = ny * zdim;
02323   for ( g=0; g<numGrids; ++g ) {
        // scatter the sender's y-range row by row into our qgrid slab
02324     float *q = qgrid + qgrid_size * g + y_start * zdim;
02325     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02326     for ( int x = 0; x < nx; ++x ) {
02327       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02328       q += slicelen;
02329       qmsg += cpylen;
02330     }
02331   }
02332  }
02333 
02334   --untrans_count;
02335 
02336   if ( untrans_count == 0 ) {
02337     pmeProxyDir[CkMyPe()].gridCalc3();
02338   }
02339 }
02340 
// Final stage of the backward FFT: in-place complex-to-real transform over
// the y,z planes of this grid PE's slab, yielding the real-space potential,
// then schedule sending it back to the sources (sendUngrid).
02341 void ComputePmeMgr::gridCalc3(void) {
02342   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02343 
02344   // finish backward FFT
02345 #ifdef NAMD_FFTW
02346   for ( int g=0; g<numGrids; ++g ) {
02347 #ifdef NAMD_FFTW_3
02348     fftwf_execute(backward_plan_yz[g]);
02349 #else
        // FFTW2: nx independent c2r transforms; stride counts complex pairs
02350     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02351         (fftw_complex *) (qgrid + qgrid_size * g),
02352         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02353 #endif
02354   }
02355 
02356 #endif
02357 
02358   pmeProxyDir[CkMyPe()].sendUngrid();
02359 }
02360 
02361 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
02362   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02363   mgr->sendUngridSubset(first, last);
02364 }
02365 
// Sends the computed potential back to every source, reusing the incoming
// grid messages.  CkLoop path uses a sync because qgrid is zeroed for the
// next step immediately afterward; note the threshold deliberately reuses
// the SENDUNTRANS control constant.
02366 void ComputePmeMgr::sendUngrid(void) {
02367 
02368 #if     CMK_SMP && USE_CKLOOP
02369   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02370   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
        // sync required: qgrid is reset below, after all sends complete
02371     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02372   } else
02373 #endif
02374   {
02375     sendUngridSubset(0, numSources-1);
02376   }
02377 
      // reset accumulation state for the next step
02378   grid_count = numSources;
02379   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02380 }
02381 
// For each retained source message in [first,last], overwrite its charge
// payload with the corresponding potential values from qgrid (same fgrid/
// zlist sparsity as the inbound message) and send it back to the source PE
// (or, when offloading, to the source's node-level manager).
02382 void ComputePmeMgr::sendUngridSubset(int first, int last) {
02383 
02384 #ifdef NAMD_CUDA
02385   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02386 #else
02387   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02388 #endif
02389 
02390   for ( int j=first; j<=last; ++j ) {
02391     // int msglen = qgrid_len;
      // reuse the message received in recvGrid; sparsity pattern unchanged
02392     PmeGridMsg *newmsg = gridmsg_reuse[j];
02393     int pe = newmsg->sourceNode;
02394     int zdim = myGrid.dim3;
02395     int flen = newmsg->len;
02396     int fstart = newmsg->start;
02397     int zlistlen = newmsg->zlistlen;
02398     int *zlist = newmsg->zlist;
02399     float *qmsg = newmsg->qgrid;
02400     for ( int g=0; g<numGrids; ++g ) {
02401       char *f = newmsg->fgrid + fgrid_len * g;
02402       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02403       for ( int i=0; i<flen; ++i ) {
02404         if ( f[i] ) {
02405           for ( int k=0; k<zlistlen; ++k ) {
            // gather: message now carries potential instead of charge
02406             *(qmsg++) = q[zlist[k]];
02407           }
02408         }
02409         q += zdim;
02410       }
02411     }
02412     newmsg->sourceNode = myGridPe;
02413 
02414     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02415     CmiEnableUrgentSend(1);
02416 #ifdef NAMD_CUDA
02417     if ( offload ) {
02418       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02419     } else
02420 #endif
02421     pmeProxyDir[pe].recvUngrid(newmsg);
02422     CmiEnableUrgentSend(0);
02423   }
02424 }
02425 
// Receives the potential for this PE's (or node's) atoms, copies it into
// local storage, and acknowledges.  The ungrid_count sanity check is skipped
// when offloading because the counter is then shared across ranks and only
// safely inspected under cuda_lock in recvAck.
02426 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
02427   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02428 #ifdef NAMD_CUDA
02429   if ( ! offload )  // would need lock
02430 #endif
02431   if ( ungrid_count == 0 ) {
02432     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02433   }
02434 
02435   if ( usePencils ) copyPencils(msg);
02436   else copyResults(msg);
02437   delete msg;
02438   recvAck(0);
02439 }
02440 
// Counts down outstanding ungrid deliveries; when the last one arrives,
// triggers force interpolation (ungridCalc).  Callable with a real ack
// message or with NULL from recvUngrid.  In offload mode ungrid_count is
// shared node-wide, so it is decremented under cuda_lock and the compute is
// started on the device master PE.
02441 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
02442   if ( msg ) delete msg;
02443 #ifdef NAMD_CUDA
02444   if ( offload ) {
02445     CmiLock(cuda_lock);
02446     if ( ungrid_count == 0 ) {
02447       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02448     }
02449     int uc = --ungrid_count;
02450     CmiUnlock(cuda_lock);
02451 
02452     if ( uc == 0 ) {
02453       pmeProxyDir[master_pe].ungridCalc();
02454     }
02455     return;
02456   }
02457 #endif
02458   --ungrid_count;
02459 
02460   if ( ungrid_count == 0 ) {
02461     pmeProxyDir[CkMyPe()].ungridCalc();
02462   }
02463 }
02464 
02465 #ifdef NAMD_CUDA
02466 #define count_limit 1000000
02467 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
02468 #define EVENT_STRIDE 10
02469 
02470 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
02471 
// CcdCallFnAfter polling callback: checks the CUDA events that mark the
// completion of each EVENT_STRIDE-sized batch of PME force copies.  For
// each completed batch it enqueues the corresponding ComputePme objects for
// force delivery; when not done it re-arms itself after a short delay, and
// it aborts if polling exceeds count_limit without progress.
02472 void cuda_check_pme_forces(void *arg, double walltime) {
02473   ComputePmeMgr *argp = (ComputePmeMgr *) arg;
02474 
02475  while ( 1 ) { // process multiple events per call
02476   cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
02477   if ( err == cudaSuccess ) {
      // progress made: reset the stall counter
02478     argp->check_forces_count = 0;
02479     for ( int i=0; i<EVENT_STRIDE; ++i ) {
02480       WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
02481       if ( ++(argp->forces_done_count) == argp->forces_count ) break;
02482     }
02483     if ( argp->forces_done_count == argp->forces_count ) { // last event
02484       traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
        // forces_time now becomes an elapsed duration rather than a stamp
02485       argp->forces_time = walltime - argp->forces_time;
02486       //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
02487       return;
02488     } else { // more events
02489       continue; // check next event
02490     }
02491   } else if ( err != cudaErrorNotReady ) {
02492     cuda_errcheck("in cuda_check_pme_forces");
02493     NAMD_bug("cuda_errcheck missed error in cuda_check_pme_forces");
02494   } else if ( ++(argp->check_forces_count) >= count_limit ) {
      // stalled too long: report and die with diagnostic context
02495     char errmsg[256];
02496     sprintf(errmsg,"cuda_check_pme_forces polled %d times over %f s on seq %d",
02497             argp->check_forces_count, walltime - argp->forces_time,
02498             argp->saved_sequence);
02499     cuda_errcheck(errmsg);
02500     NAMD_die(errmsg);
02501   } else {
02502     break; // call again
02503   }
02504  } // while ( 1 )
02505  CcdCallBacksReset(0,walltime);  // fix Charm++
02506  CUDA_POLL(cuda_check_pme_forces, arg);
02507 }
02508 #endif // NAMD_CUDA
02509 
// Interpolates per-atom forces from the real-space potential.  CPU path:
// simply enqueues each ComputePme's work message.  CUDA offload path: the
// master manager uploads the potential once for the node and fans out to
// sibling ranks; each manager then (re)allocates per-atom force buffers,
// launches the force-interpolation kernels in EVENT_STRIDE-sized batches
// with an event after each batch, and starts the polling loop that drains
// them (cuda_check_pme_forces).
02510 void ComputePmeMgr::ungridCalc(void) {
02511   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02512 
02513   ungridForcesCount = pmeComputes.size();
02514 
02515 #ifdef NAMD_CUDA
02516  if ( offload ) {
02517   //CmiLock(cuda_lock);
02518   cudaSetDevice(deviceCUDA->getDeviceID());
02519 
02520   if ( this == masterPmeMgr ) {
        // upload the node's potential grid once, on the default stream, and
        // record an event the per-rank streams will wait on below
02521     double before = CmiWallTimer();
02522     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02523     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02524     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02525 
        // fan out to the other ranks' managers on this node
02526     const int myrank = CkMyRank();
02527     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02528       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02529         nodePmeMgr->mgrObjects[i]->ungridCalc();
02530       }
02531     }
02532     if ( ! pmeComputes.size() ) return;
02533   }
02534 
      // lazily create one completion event per EVENT_STRIDE computes
02535   if ( ! end_forces ) {
02536     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02537     end_forces = new cudaEvent_t[n];
02538     for ( int i=0; i<n; ++i ) {
02539       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02540     }
02541   }
02542 
02543   const int pcsz = pmeComputes.size();
02544   if ( ! afn_host ) {
        // triplets of (atom data, force x/y, force z) pointers per compute
02545     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02546     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02547     cuda_errcheck("malloc params for pme");
02548   }
02549   int totn = 0;
02550   for ( int i=0; i<pcsz; ++i ) {
02551     int n = pmeComputes[i]->numGridAtoms[0];
02552     totn += n;
02553   }
      // grow the pooled force buffers (with 20% headroom) if atoms increased
02554   if ( totn > f_data_mgr_alloc ) {
02555     if ( f_data_mgr_alloc ) {
02556       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02557       cudaFree(f_data_mgr_dev);
02558       cudaFreeHost(f_data_mgr_host);
02559     }
02560     f_data_mgr_alloc = 1.2 * (totn + 100);
02561     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02562     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02563     cuda_errcheck("malloc forces for pme");
02564   }
02565   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
      // carve the pooled buffers into per-compute slices and fill the
      // pointer-triplet table consumed by the kernel
02566   float *f_dev = f_data_mgr_dev;
02567   float *f_host = f_data_mgr_host;
02568   for ( int i=0; i<pcsz; ++i ) {
02569     int n = pmeComputes[i]->numGridAtoms[0];
02570     pmeComputes[i]->f_data_dev = f_dev;
02571     pmeComputes[i]->f_data_host = f_host;
02572     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02573     afn_host[3*i+1] = f_dev;
02574     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02575     f_dev += 3*n;
02576     f_host += 3*n;
02577   }
02578   //CmiLock(cuda_lock);
02579   double before = CmiWallTimer();
02580   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02581   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
      // do not start kernels until the master's potential upload lands
02582   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02583   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02584 
02585   for ( int i=0; i<pcsz; ++i ) {
02586     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
        // launch one batch of up to EVENT_STRIDE computes, then copy its
        // forces back and record the batch-completion event
02587     if ( i%EVENT_STRIDE == 0 ) {
02588       int dimy = pcsz - i;
02589       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02590       int maxn = 0;
02591       int subtotn = 0;
02592       for ( int j=0; j<dimy; ++j ) {
02593         int n = pmeComputes[i+j]->numGridAtoms[0];
02594         subtotn += n;
02595         if ( n > maxn ) maxn = n;
02596       }
02597       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02598       before = CmiWallTimer();
02599       cuda_pme_forces(
02600         bspline_coeffs_dev,
02601         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02602         pmeComputes[i]->a_data_dev,
02603         pmeComputes[i]->f_data_dev,
02604         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02605         streams[stream]);
02606       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02607       before = CmiWallTimer();
02608       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02609         cudaMemcpyDeviceToHost, streams[stream]);
02610       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02611       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02612       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02613     }
02614     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02615   }
02616   //CmiUnlock(cuda_lock);
02617  } else
02618 #endif // NAMD_CUDA
02619  {
      // CPU path: hand each compute to WorkDistrib for force interpolation
02620   for ( int i=0; i<pmeComputes.size(); ++i ) {
02621     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02622     // pmeComputes[i]->ungridForces();
02623   }
02624  }
02625   // submitReductions();  // must follow all ungridForces()
02626 
02627 #ifdef NAMD_CUDA
02628  if ( offload ) {
      // arm the event-polling loop that will drain the batches above
02629   forces_time = CmiWallTimer();
02630   forces_count = ungridForcesCount;
02631   forces_done_count = 0;
02632   pmeProxy[this_pe].pollForcesReady();
02633  }
02634 #endif
02635 
      // reset expected ungrid deliveries for the next step
02636   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02637 }
02638 
// Arms the Charm++ CCD poll that repeatedly checks (via
// cuda_check_pme_forces) whether the asynchronous GPU force
// computation has completed.  Only meaningful in CUDA builds.
void ComputePmeMgr::pollForcesReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
  CUDA_POLL(cuda_check_pme_forces,this);
#else
  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
#endif
}
02647 
// Flags that this patch's atom set changed; the flag is cleared at the end
// of the next doWork() invocation.
void ComputePme::atomUpdate() { atomsChanged = 1; }
02649 
// Per-patch PME compute object.  Registers itself with the PE-local
// ComputePmeMgr and caches the PME grid parameters from SimParameters.
ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
{
  DebugM(4,"ComputePme created.\n");
  basePriority = PME_PRIORITY;
  setNumPatches(1);

  // Register with the manager branch on this PE.
  CProxy_ComputePmeMgr::ckLocalBranch(
        CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);

  SimParameters *simParams = Node::Object()->simParameters;

  qmForcesOn =  simParams->qmForcesOn;
  offload = simParams->PMEOffload;

  numGridsMax = numGrids;

  // PME grid dimensions and b-spline interpolation order.
  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  // z extent padded to hold K3/2+1 complex values (2 floats each) in place.
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

#ifdef NAMD_CUDA
  cuda_atoms_offset = 0;
  f_data_host = 0;
  f_data_dev = 0;
 if ( ! offload )  // with offload the GPU path replaces host real-space objects
#endif
 {
  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
 }

  atomsChanged = 0;

  // QM/MM scratch arrays; allocated per step in doQMWork().
  qmLoclIndx = 0;
  qmLocalCharges = 0;
}
02688 
// Late initialization: binds this compute to its home patch and registers
// for position pickup (plain and averaged) and force deposit boxes.
void ComputePme::initialize() {
  if (!(patch = PatchMap::Object()->patch(patchID))) {
    NAMD_bug("ComputePme used with unknown patch.");
  }
  positionBox = patch->registerPositionPickup(this);
  avgPositionBox = patch->registerAvgPositionPickup(this);
  forceBox = patch->registerForceDeposit(this);
#ifdef NAMD_CUDA
 if ( offload ) {
  // Running atom total; used later to size the shared CUDA atom buffers.
  myMgr->cuda_atoms_count += patch->getNumAtoms();
 }
#endif
}
02702 
02703 void ComputePmeMgr::initialize_computes() {
02704 
02705   noWorkCount = 0;
02706   doWorkCount = 0;
02707   ungridForcesCount = 0;
02708 
02709   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02710 
02711   SimParameters *simParams = Node::Object()->simParameters;
02712 
02713   strayChargeErrors = 0;
02714 
02715 #ifdef NAMD_CUDA
02716  PatchMap *patchMap = PatchMap::Object();
02717  int pe = master_pe = CkNodeFirst(CkMyNode());
02718  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02719     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02720     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02721     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02722     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02723     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02724         && pe != deviceCUDA->getMasterPe() ) {
02725       master_pe = pe;
02726     }
02727  }
02728  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02729    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02730  }
02731 
02732  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02733  bool cudaFirst = 1;
02734  if ( offload ) {
02735   CmiLock(cuda_lock);
02736   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02737  }
02738 
02739  if ( cudaFirst ) {
02740   nodePmeMgr->master_pe = master_pe;
02741   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02742  }
02743 #endif
02744 
02745   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02746   fsize = myGrid.K1 * myGrid.dim2;
02747   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02748 #ifdef NAMD_CUDA
02749  if ( ! offload )
02750 #endif
02751  {
02752   q_arr = new float*[fsize*numGrids];
02753   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02754   q_list = new float*[fsize*numGrids];
02755   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02756   q_count = 0;
02757  }
02758 
02759 #ifdef NAMD_CUDA
02760  if ( cudaFirst || ! offload ) {
02761 #endif
02762   f_arr = new char[fsize*numGrids];
02763   // memset to non-zero value has race condition on BlueGene/Q
02764   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02765   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02766 
02767   for ( int g=0; g<numGrids; ++g ) {
02768     char *f = f_arr + g*fsize;
02769     if ( usePencils ) {
02770       int K1 = myGrid.K1;
02771       int K2 = myGrid.K2;
02772       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02773       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02774       int dim2 = myGrid.dim2;
02775       for (int ap=0; ap<numPencilsActive; ++ap) {
02776         int ib = activePencils[ap].i;
02777         int jb = activePencils[ap].j;
02778         int ibegin = ib*block1;
02779         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02780         int jbegin = jb*block2;
02781         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02782         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02783         for ( int i=ibegin; i<iend; ++i ) {
02784           for ( int j=jbegin; j<jend; ++j ) {
02785             f[i*dim2+j] = 0;
02786           }
02787         }
02788       }
02789     } else {
02790       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02791       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02792       for (int pe=0; pe<numGridPes; pe++) {
02793         if ( ! recipPeDest[pe] ) continue;
02794         int start = pe * bsize;
02795         int len = bsize;
02796         if ( start >= qsize ) { start = 0; len = 0; }
02797         if ( start + len > qsize ) { len = qsize - start; }
02798         int zdim = myGrid.dim3;
02799         int fstart = start / zdim;
02800         int flen = len / zdim;
02801         memset(f + fstart, 0, flen*sizeof(char));
02802         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02803       }
02804     }
02805   }
02806 #ifdef NAMD_CUDA
02807  }
02808  if ( offload ) {
02809  cudaSetDevice(deviceCUDA->getDeviceID());
02810  if ( cudaFirst ) {
02811 
02812   int f_alloc_count = 0;
02813   for ( int n=fsize, i=0; i<n; ++i ) {
02814     if ( f_arr[i] == 0 ) {
02815       ++f_alloc_count;
02816     }
02817   }
02818   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02819 
02820   q_arr = new float*[fsize*numGrids];
02821   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02822 
02823   float **q_arr_dev_host = new float*[fsize];
02824   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02825 
02826   float **v_arr_dev_host = new float*[fsize];
02827   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02828 
02829   int q_stride = myGrid.K3+myGrid.order-1;
02830   q_data_size = f_alloc_count * q_stride * sizeof(float);
02831   ffz_size = (fsize + q_stride) * sizeof(int);
02832 
02833   // tack ffz onto end of q_data to allow merged transfer
02834   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02835   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02836   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02837   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02838   cudaMalloc((void**) &v_data_dev, q_data_size);
02839   cuda_errcheck("malloc grid data for pme");
02840   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02841   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02842   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02843   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02844   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02845 
02846   f_alloc_count = 0;
02847   for ( int n=fsize, i=0; i<n; ++i ) {
02848     if ( f_arr[i] == 0 ) {
02849       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02850       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02851       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02852       ++f_alloc_count;
02853     } else {
02854       q_arr[i] = 0;
02855       q_arr_dev_host[i] = 0;
02856       v_arr_dev_host[i] = 0;
02857     }
02858   }
02859 
02860   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02861   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02862   delete [] q_arr_dev_host;
02863   delete [] v_arr_dev_host;
02864   delete [] f_arr;
02865   f_arr = new char[fsize + q_stride];
02866   fz_arr = f_arr + fsize;
02867   memset(f_arr, 0, fsize + q_stride);
02868   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02869 
02870   cuda_errcheck("initialize grid data for pme");
02871 
02872   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02873   cuda_errcheck("initialize bspline coefficients for pme");
02874 
02875 #define XCOPY(X) masterPmeMgr->X = X;
02876   XCOPY(bspline_coeffs_dev)
02877   XCOPY(bspline_dcoeffs_dev)
02878   XCOPY(q_arr)
02879   XCOPY(q_arr_dev)
02880   XCOPY(v_arr_dev)
02881   XCOPY(q_data_size)
02882   XCOPY(q_data_host)
02883   XCOPY(q_data_dev)
02884   XCOPY(v_data_dev)
02885   XCOPY(ffz_size)
02886   XCOPY(ffz_host)
02887   XCOPY(ffz_dev)
02888   XCOPY(f_arr)
02889   XCOPY(fz_arr)
02890 #undef XCOPY
02891   //CkPrintf("pe %d init first\n", CkMyPe());
02892  } else { // cudaFirst
02893   //CkPrintf("pe %d init later\n", CkMyPe());
02894 #define XCOPY(X) X = masterPmeMgr->X;
02895   XCOPY(bspline_coeffs_dev)
02896   XCOPY(bspline_dcoeffs_dev)
02897   XCOPY(q_arr)
02898   XCOPY(q_arr_dev)
02899   XCOPY(v_arr_dev)
02900   XCOPY(q_data_size)
02901   XCOPY(q_data_host)
02902   XCOPY(q_data_dev)
02903   XCOPY(v_data_dev)
02904   XCOPY(ffz_size)
02905   XCOPY(ffz_host)
02906   XCOPY(ffz_dev)
02907   XCOPY(f_arr)
02908   XCOPY(fz_arr)
02909 #undef XCOPY
02910  } // cudaFirst
02911   CmiUnlock(cuda_lock);
02912  } else // offload
02913 #endif // NAMD_CUDA
02914  {
02915   fz_arr = new char[myGrid.K3+myGrid.order-1];
02916  }
02917 
02918 #if 0 && USE_PERSISTENT
02919   recvGrid_handle = NULL;
02920 #endif
02921 }
02922 
02923 ComputePme::~ComputePme()
02924 {
02925 #ifdef NAMD_CUDA
02926   if ( ! offload )
02927 #endif
02928   {
02929     for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
02930   }
02931 }
02932 
#if 0 && USE_PERSISTENT 
// (Disabled experimental code.)  Creates one Charm++ compressed persistent
// communication channel per active z-pencil, sized for the largest
// PmeGridMsg this PE can send to that pencil.
void ComputePmeMgr::setup_recvgrid_persistent() 
{
    int K1 = myGrid.K1;
    int K2 = myGrid.K2;
    int dim2 = myGrid.dim2;
    int dim3 = myGrid.dim3;
    int block1 = myGrid.block1;
    int block2 = myGrid.block2;

    CkArray *zPencil_local = zPencil.ckLocalBranch();
    recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
    for (int ap=0; ap<numPencilsActive; ++ap) {
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);
        // f is changing
        // Count enabled (nonzero-flag) columns across all grids for sizing.
        int fcount = 0;
        for ( int g=0; g<numGrids; ++g ) {
            char *f = f_arr + g*fsize;
            for ( int i=ibegin; i<iend; ++i ) {
                for ( int j=jbegin; j<jend; ++j ) {
                    fcount += f[i*dim2+j];
                }
            }
        }
        // Number of z planes with any charge data.
        int zlistlen = 0;
        for ( int i=0; i<myGrid.K3; ++i ) {
            if ( fz_arr[i] ) ++zlistlen;
        }
        int hd = ( fcount? 1 : 0 );  // has data?
        int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
        // Only the float payload after compress_start is compressed.
        int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
        int compress_size = sizeof(float)*hd*fcount*zlistlen;
        int size = compress_start +  compress_size  + PRIORITY_SIZE/8+6;
        recvGrid_handle[ap] =  CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }
}
#endif
02976 
// Returns 0 to have this compute enqueued for doWork(), or 1 to skip
// enqueueing (no PME this step, QM path, or prior-step results still
// outstanding, in which case the compute is held for later release).
int ComputePme::noWork() {

  if ( patch->flags.doFullElectrostatics ) {
    // In QM/MM simulations, atom charges from QM regions need special treatment.
    if ( qmForcesOn ) {
        return 1;
    }
    if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;  // work to do, enqueue as usual
    myMgr->heldComputes.add(this);
    return 1;  // don't enqueue yet
  }

  // PME skipped this step: release the boxes so the patch can proceed.
  positionBox->skip();
  forceBox->skip();

  // Last local compute to report no work submits the (empty) reduction.
  if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
    myMgr->noWorkCount = 0;
    myMgr->reduction->submit();
  }

  atomsChanged = 0;

  return 1;  // no work for this step
}
03001 
03002 void ComputePmeMgr::addRecipEvirClient() {
03003   ++recipEvirClients;
03004 }
03005 
03006 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
03007   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03008   for ( int g=0; g<numGrids; ++g ) {
03009     evir[g] += msg->evir[g];
03010   }
03011   delete msg;
03012   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03013   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03014 }
03015 
// Builds the per-step list of this patch's QM atoms and their charges
// (updated externally in ComputeQM), then runs the normal PME doWork(),
// which overwrites those atoms' charges before charge spreading.
void ComputePme::doQMWork() {

//     iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;


    int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
    const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
    const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
    const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;

    const CompAtomExt *xExt = patch->getCompAtomExtInfo();

    // Determine number of qm atoms in this patch for the current step.
    numLocalQMAtoms = 0;
    for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
        if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
            numLocalQMAtoms++;
        }
    }

    // We prepare a charge vector with QM charges for use in the PME calculation.

    // Clears data from last step, if there is any.
    if (qmLoclIndx != 0)
        delete [] qmLoclIndx;
    if (qmLocalCharges != 0)
        delete [] qmLocalCharges;

    qmLoclIndx = new int[numLocalQMAtoms] ;
    qmLocalCharges = new Real[numLocalQMAtoms] ;

    // I am assuming there will be (in general) more QM atoms among all QM groups
    // than MM atoms in a patch.
    // NOTE(review): this match is O(numAtoms * numQMAtms); acceptable for
    // small QM regions, but an id->charge map would scale better.
    int procAtms = 0;

    for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {

        for (int i=0; i<numQMAtms; i++) {

            if (qmAtmIndx[i] == xExt[paIter].id) {

                qmLoclIndx[procAtms] = paIter ;
                qmLocalCharges[procAtms] = qmAtmChrg[i];

                procAtms++;
                break;
            }

        }

        // Stop early once every local QM atom has been matched.
        if (procAtms == numLocalQMAtoms)
            break;
    }

    doWork();
    return ;
}
03073 
// Main per-step entry point, invoked twice per step.  The first invocation
// (home priority) gathers this patch's positions and charges, partitions
// them among the PME grids (alchemy/LES/pair/self variants), accumulates
// the self-energy correction, and spreads charges (host path) or stages
// atom data for the GPU (offload path); the last compute on the PE then
// submits the charge grid.  The second invocation (after reciprocal-space
// results return, at raised priority) interpolates forces via
// ungridForces().
void ComputePme::doWork()
{
  DebugM(4,"Entering ComputePme::doWork().\n");

  // Second pass for this step: potentials are back, apply forces.
  if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
#ifdef NAMD_CUDA
    basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
#else
    basePriority = PME_PRIORITY;
#endif
    ungridForces();
    // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
    if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
    return;
  }
  // First pass: raise priority so the force-application pass preempts.
  basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);

#ifdef TRACE_COMPUTE_OBJECTS
    double traceObjStartTime = CmiWallTimer();
#endif

#ifdef NAMD_CUDA
  if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
#endif

  // allocate storage
  numLocalAtoms = patch->getNumAtoms();

  Lattice &lattice = patch->flags.lattice;

  // Room for the raw atom list plus one per-grid copy when atoms must be
  // routed to multiple grids (multi-grid alchemy/LES or self interactions).
  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
  localData = localData_alloc.begin();
  localPartition_alloc.resize(numLocalAtoms);
  localPartition = localPartition_alloc.begin();

  int g;
  for ( g=0; g<numGrids; ++g ) {
    localGridData[g] = localData + numLocalAtoms*(g+1);
  }

  // get positions and charges
  PmeParticle * data_ptr = localData;
  unsigned char * part_ptr = localPartition;
  // Fold Coulomb constant, scaling, and dielectric into the charges so
  // that products of stored charges yield energies directly.
  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
                                * ComputeNonbondedUtil::dielectric_1 );

  {
    CompAtom *x = positionBox->open();
    // CompAtomExt *xExt = patch->getCompAtomExtInfo();
    if ( patch->flags.doMolly ) {
      // MOLLY: use averaged positions instead of instantaneous ones.
      positionBox->close(&x);
      x = avgPositionBox->open();
    }
    int numAtoms = patch->getNumAtoms();

    for(int i=0; i<numAtoms; ++i)
    {
      data_ptr->x = x[i].position.x;
      data_ptr->y = x[i].position.y;
      data_ptr->z = x[i].position.z;
      data_ptr->cg = coulomb_sqrt * x[i].charge;
      ++data_ptr;
      *part_ptr = x[i].partition;
      ++part_ptr;
    }

    // QM loop to overwrite charges of QM atoms.
    // They are zero for NAMD, but are updated in ComputeQM.
    if ( qmForcesOn ) {

        for(int i=0; i<numLocalQMAtoms; ++i)
        {
          localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
        }

    }

    if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
    else { positionBox->close(&x); }
  }

  // copy to other grids if needed
  if ( (alchOn && (!alchDecouple)) || lesOn ) {
    for ( g=0; g<numGrids; ++g ) {
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          // for FEP/TI: grid 0 gets non-alch + partition 1;
          // grid 1 gets non-alch + partition 2;
          // grid 2 (only if called for with numGrids=3) gets only non-alch
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else if ( alchOn && alchDecouple) {
    // alchemical decoupling: four grids
    // g=0: partition 0 and partition 1
    // g=1: partition 0 and partition 2
    // g=2: only partition 1 atoms
    // g=3: only partition 2 atoms
    // plus one grid g=4, only partition 0, if numGrids=5
    for ( g=0; g<2; ++g ) {  // same as before for first 2
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
    for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == (g-1) ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
    for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
      // numGrids=5 only if alchElecLambdaStart > 0
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else if ( selfOn ) {
    // Self-interaction mode: single grid with partition-1 atoms only.
    if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
    g = 0;
    PmeParticle *lgd = localGridData[g];
    int nga = 0;
    for(int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  } else if ( pairOn ) {
    // Pair-interaction mode: grid 0 both partitions, grids 1-2 one each.
    if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
    g = 0;
    PmeParticle *lgd = localGridData[g];
    int nga = 0;
    for(int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
    for ( g=1; g<3; ++g ) {
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == g ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else {
    // Common case: a single grid uses the raw atom list directly.
    if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
    localGridData[0] = localData;
    numGridAtoms[0] = numLocalAtoms;
  }

 // First compute on this PE to arrive this step resets shared per-step state.
 if ( ! myMgr->doWorkCount ) {
  myMgr->doWorkCount = myMgr->pmeComputes.size();

#ifdef NAMD_CUDA
 if ( !  offload )
#endif // NAMD_CUDA
 {
  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );

  for (int i=0; i<myMgr->q_count; ++i) {
    memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
  }
 }

  for ( g=0; g<numGrids; ++g ) {
    myMgr->evir[g] = 0;
  }

  myMgr->strayChargeErrors = 0;

  myMgr->compute_sequence = sequence();
 }

  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");

  int strayChargeErrors = 0;

  // calculate self energy
  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
  for ( g=0; g<numGrids; ++g ) {
    // Ewald self-energy correction: -(ewaldcof/sqrt(pi)) * sum(q_i^2).
    BigReal selfEnergy = 0;
    data_ptr = localGridData[g];
    int i;
    for(i=0; i<numGridAtoms[g]; ++i)
    {
      selfEnergy += data_ptr->cg * data_ptr->cg;
      ++data_ptr;
    }
    selfEnergy *= -1. * ewaldcof / SQRT_PI;
    myMgr->evir[g][0] += selfEnergy;

    float **q = myMgr->q_arr + g*myMgr->fsize;
    char *f = myMgr->f_arr + g*myMgr->fsize;

    scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
#ifdef NAMD_CUDA
   if ( offload ) {
    if ( myMgr->cuda_atoms_alloc == 0 ) {  // first call
      // Allocate pinned host + device atom buffers with 20% headroom.
      int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
      cuda_errcheck("before malloc atom data for pme");
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      cuda_errcheck("malloc atom data for pme");
      myMgr->cuda_atoms_count = 0;
    }
    cuda_atoms_offset = myMgr->cuda_atoms_count;
    int n = numGridAtoms[g];
    myMgr->cuda_atoms_count += n;
    if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
      // Grow the buffers, preserving atoms already staged by other computes.
      CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
                        CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
      cuda_errcheck("before malloc expanded atom data for pme");
      int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
      const float *a_data_host_old = myMgr->a_data_host;
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cuda_errcheck("malloc expanded host atom data for pme");
      memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
      cudaFreeHost((void*) a_data_host_old);
      cuda_errcheck("free expanded host atom data for pme");
      cudaFree(myMgr->a_data_dev);
      cuda_errcheck("free expanded dev atom data for pme");
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      cuda_errcheck("malloc expanded dev atom data for pme");
    }
    // Stage 7 floats per atom: fractional offsets, charge, and the
    // (order-1)-shifted, wrapped integer grid coordinates.
    float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
    data_ptr = localGridData[g];
    double order_1 = myGrid.order - 1;
    double K1 = myGrid.K1;
    double K2 = myGrid.K2;
    double K3 = myGrid.K3;
    int found_negative = 0;
    for ( int i=0; i<n; ++i ) {
      if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
        found_negative = 1;
        // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
      }
      double x_int = (int) data_ptr[i].x;
      double y_int = (int) data_ptr[i].y;
      double z_int = (int) data_ptr[i].z;
      a_data_host[7*i  ] = data_ptr[i].x - x_int;  // subtract in double precision
      a_data_host[7*i+1] = data_ptr[i].y - y_int;
      a_data_host[7*i+2] = data_ptr[i].z - z_int;
      a_data_host[7*i+3] = data_ptr[i].cg;
      x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
      y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
      z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
      a_data_host[7*i+4] = x_int;
      a_data_host[7*i+5] = y_int;
      a_data_host[7*i+6] = z_int;
    }
    if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
   } else
#endif // NAMD_CUDA
   {
    // Host path: spread charges onto the local grid columns.
    myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
    myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
   }
  }
  myMgr->strayChargeErrors += strayChargeErrors;

#ifdef TRACE_COMPUTE_OBJECTS
    traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
#endif

 // Last compute on this PE to finish submits the charge grid.
 if ( --(myMgr->doWorkCount) == 0 ) {
// cudaDeviceSynchronize();  // XXXX
#ifdef NAMD_CUDA
  if ( offload ) {
    ComputePmeMgr::cuda_submit_charges_args args;
    args.mgr = myMgr;
    args.lattice = &lattice;
    args.sequence = sequence();
    // Serialize GPU submissions node-wide: if another manager is already
    // submitting, queue our request for it to drain; otherwise submit here
    // (and drain the queue) unless we are the nonbonded master PE.
    CmiLock(ComputePmeMgr::cuda_lock);
    if ( ComputePmeMgr::cuda_busy ) {
      ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
    } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
      // avoid adding work to nonbonded data preparation pe
      args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
    } else {
      ComputePmeMgr::cuda_busy = true;
      while ( 1 ) {
        // Submit outside the lock; requeue check back under the lock.
        CmiUnlock(ComputePmeMgr::cuda_lock);
        args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
        CmiLock(ComputePmeMgr::cuda_lock);
        if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
          args = ComputePmeMgr::cuda_submit_charges_deque.front();
          ComputePmeMgr::cuda_submit_charges_deque.pop_front();
        } else {
          ComputePmeMgr::cuda_busy = false;
          break;
        }
      }
    }
    CmiUnlock(ComputePmeMgr::cuda_lock);
  } else
#endif // NAMD_CUDA
  {
    myMgr->chargeGridReady(lattice,sequence());
  }
 }
 atomsChanged = 0;
}
03399 
03400 #ifdef NAMD_CUDA
03401 
// Copies this manager's staged atom data to the GPU and launches the
// charge-spreading kernel.  Callers serialize invocations node-wide via
// cuda_lock / the cuda_busy drain loop in ComputePme::doWork().
void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {

    int n = cuda_atoms_count;
    //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
    cuda_atoms_count = 0;

    const double before = CmiWallTimer();
    // Host->device copy of the 7 floats per atom staged in doWork().
    cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
                          cudaMemcpyHostToDevice, streams[stream]);
    const double after = CmiWallTimer();

    // Do not spread charges until the previous step's grid clear completes.
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);

    cuda_pme_charges(
      bspline_coeffs_dev,
      q_arr_dev, ffz_dev, ffz_dev + fsize,
      a_data_dev, n,
      myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
      streams[stream]);
    const double after2 = CmiWallTimer();

    chargeGridSubmitted(lattice,sequence);  // must be inside lock

    masterPmeMgr->charges_time = before;
    traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
    traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
}
03429 
03430 void cuda_check_pme_charges(void *arg, double walltime) {
03431   ComputePmeMgr *argp = (ComputePmeMgr *) arg;
03432 
03433   cudaError_t err = cudaEventQuery(argp->end_charges);
03434   if ( err == cudaSuccess ) {
03435     traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
03436     argp->charges_time = walltime - argp->charges_time;
03437     argp->sendChargeGridReady();
03438     argp->check_charges_count = 0;
03439   } else if ( err != cudaErrorNotReady ) {
03440     cuda_errcheck("in cuda_check_pme_charges");
03441     NAMD_bug("cuda_errcheck missed error in cuda_check_pme_charges");
03442   } else if ( ++(argp->check_charges_count) >= count_limit ) {
03443     char errmsg[256];
03444     sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
03445             argp->check_charges_count, walltime - argp->charges_time,
03446             argp->saved_sequence);
03447     cuda_errcheck(errmsg);
03448     NAMD_die(errmsg);
03449   } else {
03450     CcdCallBacksReset(0,walltime);  // fix Charm++
03451     CUDA_POLL(cuda_check_pme_charges, arg);
03452   }
03453 }
03454 
// Records the lattice and sequence for this step; once every manager on the
// node has submitted its charges (chargeGridSubmittedCount reaches zero),
// queues the device-to-host grid copy and the grid clear for the next step.
// Called from cuda_submit_charges, i.e. with the node-wide lock logic active.
void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
  saved_lattice = &lattice;
  saved_sequence = sequence;

  // cudaDeviceSynchronize();  //  XXXX TESTING
  //int q_stride = myGrid.K3+myGrid.order-1;
  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
  //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
  //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
  //  }
  //}
  //CmiLock(cuda_lock);

 if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
  // Last manager to submit: wait for all streams' kernels, then copy the
  // charge grid plus appended ffz flags back to the host in one transfer.
  double before = CmiWallTimer();
  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
                        cudaMemcpyDeviceToHost, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
  //CmiUnlock(cuda_lock);
  // cudaDeviceSynchronize();  //  XXXX TESTING
  // cuda_errcheck("after memcpy grid to host");

  SimParameters *simParams = Node::Object()->simParameters;
  if ( ! simParams->useCUDA2 ) {
    // Let the nonbonded CUDA master PE reclaim the device.
    CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
    cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
  }

  pmeProxy[master_pe].pollChargeGridReady();
 }
}
03491 
03492 void ComputePmeMgr::sendChargeGridReady() {
03493   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03494     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03495     int cs = mgr->pmeComputes.size();
03496     if ( cs ) {
03497       mgr->ungridForcesCount = cs;
03498       mgr->recipEvirCount = mgr->recipEvirClients;
03499       masterPmeMgr->chargeGridSubmittedCount++;
03500     }
03501   }
03502   pmeProxy[master_pe].recvChargeGridReady();
03503 }
03504 #endif // NAMD_CUDA
03505 
// Arms the Charm++ conditional-callback poll that waits for the GPU
// charge grid to arrive on the host (see cuda_check_pme_charges above).
// Only meaningful in CUDA builds; reaching it otherwise is a logic error.
void ComputePmeMgr::pollChargeGridReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
  CUDA_POLL(cuda_check_pme_charges,this);
#else
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#endif
}
03514 
// Entry method invoked on the master PE once all node charge grids are
// ready; replays the lattice/sequence saved by chargeGridSubmitted().
void ComputePmeMgr::recvChargeGridReady() {
  chargeGridReady(*saved_lattice,saved_sequence);
}
03518 
03519 void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
03520 
03521 #ifdef NAMD_CUDA
03522  if ( offload ) {
03523   int errcount = 0;
03524   int q_stride = myGrid.K3+myGrid.order-1;
03525   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03526     f_arr[j] = ffz_host[j];
03527     if ( ffz_host[j] & ~1 ) ++errcount;
03528   }
03529   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03530  }
03531 #endif
03532   recipEvirCount = recipEvirClients;
03533   ungridForcesCount = pmeComputes.size();
03534 
03535   for (int j=0; j<myGrid.order-1; ++j) {
03536     fz_arr[j] |= fz_arr[myGrid.K3+j];
03537   }
03538 
03539   if ( usePencils ) {
03540     sendPencils(lattice,sequence);
03541   } else {
03542     sendData(lattice,sequence);
03543   }
03544 }
03545 
03546 
// Packs and sends the local charge grid to the z-pencils indexed
// [first..last] in this PE's activePencils list.  For each pencil it
// counts the flagged (i,j) columns, builds the list of populated z planes
// (zlist), and sends a PmeGridMsg sized to carry only that data (or an
// empty "no data" message when fcount is zero, since the pencil still
// expects a message).  sourcepe identifies the PE whose grid is being
// sent, which may differ from CkMyPe() when called via sendPencilsHelper.
void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

#if 0 && USE_PERSISTENT
    if (recvGrid_handle== NULL) setup_recvgrid_persistent();
#endif
  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;

  // int savedMessages = 0;
  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  for (int ap=first; ap<=last; ++ap) {
    // Grid index range (clipped to K1 x K2) covered by pencil (ib,jb).
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);

    // Count flagged columns; in offload mode also copy the device-written
    // flags into f_arr and verify they are clean 0/1 values.
    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        int k = i*dim2+j;
        f[k] = ffz_host[k];
        fcount += f[k];
        if ( ffz_host[k] & ~1 ) ++errcount;
       }
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
     } else
#endif
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        fcount += f[i*dim2+j];
       }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! pencilActive[ib*yBlocks+jb] )
      NAMD_bug("PME activePencils list inconsistent");

    // Number of z planes that actually hold charge.
    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    int hd = ( fcount? 1 : 0 );  // has data?
    // if ( ! hd ) ++savedMessages;

    
    // Varsize message: payload arrays are allocated only when hd is 1.
    PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
        hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
    msg->sourceNode = sourcepe;
    msg->hasData = hd;
    msg->lattice = lattice;
   if ( hd ) {
#if 0
    msg->start = fstart;
    msg->len = flen;
#else
    msg->start = -1;   // obsolete?
    msg->len = -1;   // obsolete?
#endif
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // Pack flags, then for each flagged column fold the K3+h wrap-around
    // planes into the base planes and emit the zlist-selected values.
    char *fmsg = msg->fgrid;
    float *qmsg = msg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
      float **q = q_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        *(fmsg++) = f[i*dim2+j];
        if( f[i*dim2+j] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i*dim2+j][zlist[k]];
          }
        }
       }
      }
    }
   }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // Node-parallel receive: route directly to the node owning the pencil.
    msg->destElem=CkArrayIndex3D(ib,jb,0);
    CProxy_PmePencilMap lzm = npMgr->zm;
    int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
    int destnode = CmiNodeOf(destproc);
    
#if  0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    pmeNodeProxy[destnode].recvZGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if 0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    zPencil(ib,jb,0).recvGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }


  // if ( savedMessages ) {
  //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
  // }

}
03687 
03688 
// Entry method trampoline: forwards a single-pencil send request to the
// node-level manager (used by the offload path in sendPencils).
void ComputePmeMgr::sendPencilsHelper(int iter) {
  nodePmeMgr->sendPencilsHelper(iter);
}
03692 
// Sends pencil number `iter` on behalf of the node's master manager,
// using the lattice/sequence/sourcepe stashed by sendPencils().
// Only reachable in CUDA (offload) builds.
void NodePmeMgr::sendPencilsHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
#else
  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif
}
03701 
// Distributes this PE's charge grid to all active z-pencils.  In offload
// mode each pencil send is routed through a helper so it runs on (or near)
// the destination, spreading the packing cost; otherwise everything is
// packed locally in one pass.  Afterwards, reports any columns flagged as
// stray charges (value 3: charge present but no pencil claims the column).
void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {

  // Stash the arguments where the helper entry methods can find them.
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int ap=0; ap < numPencilsActive; ++ap ) {
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      int ib = activePencils[ap].i;
      int jb = activePencils[ap].j;
      int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
      pmeProxy[destproc].sendPencilsHelper(ap);
#else
      pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
#endif
    }
  } else
#endif
  {
    sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
  }

  if ( strayChargeErrors ) {
   strayChargeErrors = 0;
   iout << iERROR << "Stray PME grid charges detected: "
        << CkMyPe() << " sending to (x,y)";
   int K1 = myGrid.K1;
   int K2 = myGrid.K2;
   int dim2 = myGrid.dim2;
   int block1 = myGrid.block1;
   int block2 = myGrid.block2;
   // Scan the blocks belonging to no active pencil and list each column
   // still flagged 3 (downgrading it to 2 so it is reported only once).
   for (int ib=0; ib<xBlocks; ++ib) {
    for (int jb=0; jb<yBlocks; ++jb) {
     int ibegin = ib*block1;
     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
     int jbegin = jb*block2;
     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
     int flen = numGrids * (iend - ibegin) * (jend - jbegin);

     for ( int g=0; g<numGrids; ++g ) {
       char *f = f_arr + g*fsize;
       if ( ! pencilActive[ib*yBlocks+jb] ) {
           for ( int i=ibegin; i<iend; ++i ) {
            for ( int j=jbegin; j<jend; ++j ) {
             if ( f[i*dim2+j] == 3 ) {
               f[i*dim2+j] = 2;
               iout << " (" << i << "," << j << ")";
             }
            }
           }
       }
     }
    }
   }
   iout << "\n" << endi;
  }
 
}
03763 
03764 
03765 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
03766 
03767   int K1 = myGrid.K1;
03768   int K2 = myGrid.K2;
03769   int dim2 = myGrid.dim2;
03770   int dim3 = myGrid.dim3;
03771   int block1 = myGrid.block1;
03772   int block2 = myGrid.block2;
03773 
03774   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03775   int ib = msg->sourceNode / yBlocks;
03776   int jb = msg->sourceNode % yBlocks;
03777 
03778   int ibegin = ib*block1;
03779   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03780   int jbegin = jb*block2;
03781   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03782 
03783   int zlistlen = msg->zlistlen;
03784   int *zlist = msg->zlist;
03785   float *qmsg = msg->qgrid;
03786   int g;
03787   for ( g=0; g<numGrids; ++g ) {
03788     char *f = f_arr + g*fsize;
03789     float **q = q_arr + g*fsize;
03790     for ( int i=ibegin; i<iend; ++i ) {
03791      for ( int j=jbegin; j<jend; ++j ) {
03792       if( f[i*dim2+j] ) {
03793         f[i*dim2+j] = 0;
03794         for ( int k=0; k<zlistlen; ++k ) {
03795           q[i*dim2+j][zlist[k]] = *(qmsg++);
03796         }
03797         for (int h=0; h<myGrid.order-1; ++h) {
03798           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03799         }
03800       }
03801      }
03802     }
03803   }
03804 }
03805 
03806 
// Slab-decomposition counterpart of sendPencilsPart: packs and sends this
// PE's charge grid to the grid PEs indexed [first..last] in gridPeOrder.
// Each destination receives the slab [start, start+len) of the flattened
// grid, compressed to flagged columns and the populated z planes in zlist.
// When `errors` is set, slabs with no destination are still scanned so
// stray charges (flag value 3) can be reported.
void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;  // slab size in floats

  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
  for (int j=first; j<=last; j++) {
    int pe = gridPeOrder[j];  // different order
    if ( ! recipPeDest[pe] && ! errors ) continue;
    // Clip this destination's slab to the actual grid size.
    int start = pe * bsize;
    int len = bsize;
    if ( start >= qsize ) { start = 0; len = 0; }
    if ( start + len > qsize ) { len = qsize - start; }
    int zdim = myGrid.dim3;
    int fstart = start / zdim;   // first column index of the slab
    int flen = len / zdim;       // number of columns in the slab
    int fcount = 0;
    int i;

    // Count flagged columns; in offload mode also copy the device-written
    // flags into f_arr and verify they are clean 0/1 values.
    int g;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( i=0; i<flen; ++i ) {
        f[i] = ffz_host[fstart+i];
        fcount += f[i];
        if ( ffz_host[fstart+i] & ~1 ) ++errcount;
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
     } else
#endif
      for ( i=0; i<flen; ++i ) {
        fcount += f[i];
      }
      if ( ! recipPeDest[pe] ) {
        // No receiver for this slab: any flag of 3 is a stray charge.
        int errfound = 0;
        for ( i=0; i<flen; ++i ) {
          if ( f[i] == 3 ) {
            errfound = 1;
            break;
          }
        }
        if ( errfound ) {
          iout << iERROR << "Stray PME grid charges detected: "
                << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
          int iz = -1;
          for ( i=0; i<flen; ++i ) {
            if ( f[i] == 3 ) {
              f[i] = 2;   // downgrade so it is reported only once
              int jz = (i+fstart)/myGrid.K2;
              if ( iz != jz ) { iout << " " << jz;  iz = jz; }
            }
          }
          iout << "\n" << endi;
        }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! recipPeDest[pe] ) continue;

    // Number of z planes that actually hold charge.
    int zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    PmeGridMsg *msg = new (zlistlen, flen*numGrids,
                                fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;

    msg->sourceNode = sourcepe;
    msg->lattice = lattice;
    msg->start = fstart;
    msg->len = flen;
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // Pack flags, then for each flagged column fold the K3+h wrap-around
    // planes into the base planes and emit the zlist-selected values.
    float *qmsg = msg->qgrid;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
      CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
      float **q = q_arr + fstart + g*fsize;
      for ( i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i][h] += q[i][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i][zlist[k]];
          }
        }
      }
    }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    pmeProxy[gridPeMap[pe]].recvGrid(msg);
  }

}
03915 
// Entry method trampoline: forwards a single-slab send request to the
// node-level manager (used by the offload path in sendData).
void ComputePmeMgr::sendDataHelper(int iter) {
  nodePmeMgr->sendDataHelper(iter);
}
03919 
// Sends slab number `iter` on behalf of the node's master manager, using
// the lattice/sequence/sourcepe/errors stashed by sendData().
// Only reachable in CUDA (offload) builds.
void NodePmeMgr::sendDataHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
#else
  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif
}
03928 
// Distributes this PE's charge grid to the slab-decomposition grid PEs.
// In offload mode each slab send is routed through a helper so packing
// runs on (or near) the destination; otherwise everything is packed
// locally in one pass.  strayChargeErrors is latched into
// sendDataHelper_errors so sendDataPart can report strays, then cleared.
void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {

  // Stash the arguments where the helper entry methods can find them.
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();
  sendDataHelper_errors = strayChargeErrors;
  strayChargeErrors = 0;

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int i=0; i < numGridPes; ++i ) {
      int pe = gridPeOrder[i];  // different order
      if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      pmeProxy[gridPeMap[pe]].sendDataHelper(i);
#else
      pmeNodeProxy[CkMyNode()].sendDataHelper(i);
#endif
    }
  } else
#endif
  {
    sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
  }
 
}
03956 
03957 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
03958 
03959   int zdim = myGrid.dim3;
03960   int flen = msg->len;
03961   int fstart = msg->start;
03962   int zlistlen = msg->zlistlen;
03963   int *zlist = msg->zlist;
03964   float *qmsg = msg->qgrid;
03965   int g;
03966   for ( g=0; g<numGrids; ++g ) {
03967     char *f = msg->fgrid + g*flen;
03968     float **q = q_arr + fstart + g*fsize;
03969     for ( int i=0; i<flen; ++i ) {
03970       if ( f[i] ) {
03971         f[i] = 0;
03972         for ( int k=0; k<zlistlen; ++k ) {
03973           q[i][zlist[k]] = *(qmsg++);
03974         }
03975         for (int h=0; h<myGrid.order-1; ++h) {
03976           q[i][myGrid.K3+h] = q[i][h];
03977         }
03978       }
03979     }
03980   }
03981 }
03982 
// Interpolates per-atom reciprocal-space forces from the transformed charge
// grid (or retrieves them from the GPU in offload mode), applies the
// appropriate per-grid scaling for alchemical/LES/pair/self interactions,
// and adds the results into the patch's slow-force array.
void ComputePme::ungridForces() {

    if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
 
    SimParameters *simParams = Node::Object()->simParameters;

    // When multiple grids (or self interactions) are in play, forces are
    // first gathered per-grid into gridResults and then accumulated with
    // per-grid scale factors into localResults; otherwise the two alias.
    localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
    Vector *localResults = localResults_alloc.begin();
    Vector *gridResults;

    if ( alchOn || lesOn || selfOn || pairOn ) {
      for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
      gridResults = localResults + numLocalAtoms;
    } else {
      gridResults = localResults;
    }

    Vector pairForce = 0.;
    Lattice &lattice = patch->flags.lattice;
    int g = 0;
    if(!simParams->commOnly) {
    for ( g=0; g<numGrids; ++g ) {
#ifdef NETWORK_PROGRESS
      CmiNetworkProgress();
#endif

#ifdef NAMD_CUDA
      if ( offload ) {
        // Forces were computed on the GPU; copy them out of the pinned
        // host buffer, screening for the NaN pattern the kernel writes
        // when it detects stray charges.
        int errfound = 0;
        for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
          // Neither isnan() nor x != x worked when testing on Cray; this does.
          if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }  // CUDA NaN
          gridResults[i].x = f_data_host[3*i];
          gridResults[i].y = f_data_host[3*i+1];
          gridResults[i].z = f_data_host[3*i+2];
        }
        if ( errfound ) {
          // Second pass: zero the offending forces and count them.
          int errcount = 0;
          for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
            float f = f_data_host[3*i];  // NOTE(review): f is unused here
            if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {  // CUDA NaN
              ++errcount;
              gridResults[i] = 0.;
            }
          }
          iout << iERROR << "Stray PME grid charges detected: "
                << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
        }
      } else
#endif // NAMD_CUDA
        {
          myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
        }
      scale_forces(gridResults, numGridAtoms[g], lattice);
      
      if (alchOn) {
        // Per-grid electrostatic lambda scaling; grids 0/1 are the up/down
        // alchemical partitions, grid 2 (and 3,4 with alchDecouple) carry
        // the correction terms.
        float scale = 1.;
        BigReal elecLambdaUp, elecLambdaDown;
        if ( simParams->alchFepWhamOn ) {
          if ( simParams->alchFepElecOn ) {
            elecLambdaUp = simParams->alchElecLambda;
            elecLambdaDown = 1.0 - simParams->alchElecLambda;
          }
          else {
            elecLambdaUp = 0.0;
            elecLambdaDown = 1.0;
          }
        }
        else {
          BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
          myMgr->alchLambda = alchLambda;  // consumed by submitReductions()
          elecLambdaUp = simParams->getElecLambda(alchLambda);
          elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
        }
        
        if ( g == 0 ) scale = elecLambdaUp;
        else if ( g == 1 ) scale = elecLambdaDown;
        else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);

        if (alchDecouple) {
          if ( g == 2 ) scale = 1 - elecLambdaUp;
          else if ( g == 3 ) scale = 1 - elecLambdaDown;
          else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
        }
        // Scatter the compacted per-grid forces back to the atoms that
        // belong to this grid's partition set.
        int nga = 0;
        if (!alchDecouple) {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
              // (g=2: only partition 0)
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
        else {  // alchDecouple
          if ( g < 2 ) {
            for(int i=0; i<numLocalAtoms; ++i) {
              if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
                // g = 0: partition 0 or partition 1
                // g = 1: partition 0 or partition 2
                localResults[i] += gridResults[nga++] * scale;
              }
            }
          }
          else {
            for(int i=0; i<numLocalAtoms; ++i) {
              if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
                // g = 2: partition 1 only
                // g = 3: partition 2 only
                // g = 4: partition 0 only
                localResults[i] += gridResults[nga++] * scale;
              }
            }
          }
        }
      } else if ( lesOn ) {
        float scale = 1.;
        if ( alchFepOn ) {
          if(simParams->alchFepWhamOn) {
            if(simParams->alchFepElecOn) {
              if ( g == 0 ) scale = simParams->alchElecLambda;
              else if ( g == 1 ) scale = 1. - simParams->alchElecLambda;
            }
            else {
              if ( g == 0 ) scale = 0.0;
              else if ( g == 1 ) scale = 1.0;
            }
          }
          else {
            BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
            myMgr->alchLambda = alchLambda;  // consumed by submitReductions()
            if ( g == 0 ) scale = alchLambda;
            else if ( g == 1 ) scale = 1. - alchLambda;
          }
        } else if ( lesOn ) {
          scale = 1.0 / (float)lesFactor;
        }
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
            localResults[i] += gridResults[nga++] * scale;
          }
        }
      } else if ( selfOn ) {
        PmeParticle *lgd = localGridData[g];  // NOTE(review): lgd is unused
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 1 ) {
            pairForce += gridResults[nga];  // should add up to almost zero
            localResults[i] += gridResults[nga++];
          }
        }
      } else if ( pairOn ) {
        // pair interaction: grid 0 holds both partitions, grids 1 and 2
        // hold the single-partition terms that are subtracted off.
        if ( g == 0 ) {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 1 ) {
              pairForce += gridResults[nga];
            }
            if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
              localResults[i] += gridResults[nga++];
            }
          }
        } else if ( g == 1 ) {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == g ) {
              pairForce -= gridResults[nga];  // should add up to almost zero
              localResults[i] -= gridResults[nga++];
            }
          }
        } else {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == g ) {
              localResults[i] -= gridResults[nga++];
            }
         }
        }
      }
    }
    }

    Vector *results_ptr = localResults;
    
    // add in forces
    {
      Results *r = forceBox->open();
      Force *f = r->f[Results::slow];
      int numAtoms = patch->getNumAtoms();

      // Skip accumulation entirely if stray charges were detected or this
      // is a communication-only step.
      if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
        for(int i=0; i<numAtoms; ++i) {
          f[i].x += results_ptr->x;
          f[i].y += results_ptr->y;
          f[i].z += results_ptr->z;
          ++results_ptr;
        }
      }
      forceBox->close(&r);
    }

    if ( pairOn || selfOn ) {
        ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
    }

}
04189 
04190 void ComputePmeMgr::submitReductions() {
04191 
04192     SimParameters *simParams = Node::Object()->simParameters;
04193 
04194     for ( int g=0; g<numGrids; ++g ) {
04195       float scale = 1.;
04196       if (alchOn) {
04197         BigReal elecLambdaUp, elecLambdaDown;
04198         if( simParams->alchFepWhamOn ) {
04199           if( simParams->alchFepElecOn ) {
04200             elecLambdaUp = simParams->alchElecLambda;
04201             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04202           }
04203           else {
04204             elecLambdaUp = 0.0;
04205             elecLambdaDown = 1.0;
04206           }
04207         }
04208         else {
04209           // alchLambda set on each step in ComputePme::ungridForces()
04210           if ( alchLambda < 0 || alchLambda > 1 ) {
04211             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04212           }
04213           elecLambdaUp = simParams->getElecLambda(alchLambda);
04214           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04215         }
04216         if ( g == 0 ) scale = elecLambdaUp;
04217         else if ( g == 1 ) scale = elecLambdaDown;
04218         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04219         if (alchDecouple) {
04220           if ( g == 2 ) scale = 1-elecLambdaUp;
04221           else if ( g == 3 ) scale = 1-elecLambdaDown;
04222           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04223         }
04224       } else if ( lesOn ) {
04225         scale = 1.0 / lesFactor;
04226       } else if ( pairOn ) {
04227         scale = ( g == 0 ? 1. : -1. );
04228       }
04229       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04230       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04231       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04232       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04233       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04234       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04235       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04236       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04237       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04238       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04239 
04240       float scale2 = 0.;
04241 
04242       // why is this declared/defined again here?
04243       SimParameters *simParams = Node::Object()->simParameters;
04244 
04245       if (alchFepOn) {
04246         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04247         if(simParams->alchFepWhamOn) {
04248           if(simParams->alchFepElecOn) {
04249             elecLambda2Up = simParams->alchElecLambda;
04250             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04251           }
04252           else {
04253             elecLambda2Up = 0.0;
04254             elecLambda2Down =  1.0;
04255           }
04256         }
04257         else {
04258           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04259           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04260         }
04261         
04262         if ( g == 0 ) scale2 = elecLambda2Up;
04263         else if ( g == 1 ) scale2 = elecLambda2Down;
04264         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04265         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04266         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04267         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04268       }
04269       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04270         if( g==0 )      scale2 = scale + 1.0;
04271         else if( g==1 ) scale2 = scale - 1.0;
04272         else if( g==2 ) scale2 = scale - 1.0;
04273         else if( g==3 ) scale2 = scale + 1.0;
04274       }
04275       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04276       
04277       if (alchThermIntOn) {
04278         
04279         // no decoupling:
04280         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04281         // (interactions between all atoms [partition 0 OR partition 1], 
04282         // minus all [within partition 0])
04283         // U = elecLambdaUp * (U[0] - U[2])
04284         // dU/dl = U[0] - U[2];
04285         
04286         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04287         // (interactions between all atoms [partition 0 OR partition 2], 
04288         // minus all [within partition 0])
04289         // U = elecLambdaDown * (U[1] - U[2])
04290         // dU/dl = U[1] - U[2];
04291 
04292         // alchDecouple:
04293         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04294         // (interactions between all atoms [partition 0 OR partition 1]
04295         // minus all [within partition 1] minus all [within partition 0]
04296         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04297         // dU/dl = U[0] - U[2] - U[4];
04298 
04299         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04300         // (interactions between all atoms [partition 0 OR partition 2]
04301         // minus all [within partition 2] minus all [within partition 0]
04302         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04303         // dU/dl = U[1] - U[3] - U[4];
04304         
04305         
04306         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04307         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04308         if (!alchDecouple) {
04309           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04310           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04311         }
04312         else {  // alchDecouple
04313           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04314           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04315           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04316           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04317         }
04318       }
04319     }
04320 
04321     alchLambda = -1.;  // illegal value to catch if not updated
04322 
04323     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04324     reduction->submit();
04325 
04326   for ( int i=0; i<heldComputes.size(); ++i ) {
04327     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04328   }
04329   heldComputes.resize(0);
04330 }
04331 
04332 #if USE_TOPOMAP 
04333 
// Stride table for the ORB PME allocator below: my_prime is chosen from
// this list so that striding by it visits the cells of an ny*nz plane in
// a well-spread order.  Every entry must actually be prime for the
// full-cycle property to hold.
const static unsigned int NAMDPrimes[] = {
  3,
  5,
  7,
  11,
  13,
  17,
  19,
  23,  
  29,
  31,
  37,
  59,
  73,
  97,                   // fix: was 93, which is composite (3*31)
  113,
  157,
  307,
  617,
  1217                  //This should be enough for 64K nodes of BGL. 
};

// Number of entries in NAMDPrimes[].  Fix: this was 8, which silently
// restricted the prime search in generateBGLORBPmePeList() to the first
// eight entries (max stride 23) even though the table extends to 1217.
#define NPRIMES 19
04356 
04357 #include "RecBisection.h"
04358 
04359 /***-----------------------------------------------------**********
04360     The Orthogonal Recursive Bisection strategy, which allocates PME
04361     objects close to the patches they communicate, and at the same
04362     time spreads them around the grid 
04363 ****----------------------------------------------------------****/
04364 
04365 bool generateBGLORBPmePeList(int *pemap, int numPes, 
04366                              int *block_pes, int nbpes) {
04367 
04368   PatchMap *pmap = PatchMap::Object();
04369   int *pmemap = new int [CkNumPes()];
04370 
04371   if (pemap == NULL)
04372     return false;
04373 
04374   TopoManager tmgr;
04375 
04376   memset(pmemap, 0, sizeof(int) * CkNumPes());
04377 
04378   for(int count = 0; count < CkNumPes(); count++) {
04379     if(count < nbpes)
04380       pmemap[block_pes[count]] = 1;
04381     
04382     if(pmap->numPatchesOnNode(count)) {
04383       pmemap[count] = 1;
04384       
04385       //Assumes an XYZT mapping !!
04386       if(tmgr.hasMultipleProcsPerNode()) {
04387         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
04388       }
04389     }
04390   }
04391 
04392   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
04393     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
04394     return false;
04395 
04396   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
04397   Node *node = nd.ckLocalBranch();
04398   SimParameters *simParams = node->simParameters;
04399 
04400   //first split PME processors into patch groups
04401 
04402   int xsize = 0, ysize = 0, zsize = 0;
04403 
04404   xsize = tmgr.getDimNX();
04405   ysize = tmgr.getDimNY();
04406   zsize = tmgr.getDimNZ();
04407   
04408   int nx = xsize, ny = ysize, nz = zsize;
04409   DimensionMap dm;
04410   
04411   dm.x = 0;
04412   dm.y = 1;
04413   dm.z = 2;
04414   
04415   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
04416 
04417   //group size processors have to be allocated to each YZ plane
04418   int group_size = numPes/nx;
04419   if(numPes % nx)
04420     group_size ++;
04421 
04422   int my_prime = NAMDPrimes[0];
04423   int density = (ny * nz)/group_size + 1;
04424   int count = 0;
04425   
04426   //Choose a suitable prime Number
04427   for(count = 0; count < NPRIMES; count ++) {
04428     //Find a prime just greater than the density
04429     if(density < NAMDPrimes[count]) {
04430       my_prime = NAMDPrimes[count];
04431       break;
04432     }      
04433   }
04434   
04435   if(count == NPRIMES)
04436     my_prime = NAMDPrimes[NPRIMES-1];
04437 
04438   //int gcount = numPes/2;
04439   int gcount = 0;
04440   int npme_pes = 0;
04441   
04442   int coord[3];
04443 
04444   for(int x = 0; x < nx; x++) {
04445     coord[0] = (x + nx/2)%nx;
04446     
04447     for(count=0; count < group_size && npme_pes < numPes; count++) {
04448       int dest = (count + 1) * my_prime;      
04449       dest = dest % (ny * nz);      
04450       
04451       coord[2] = dest / ny;
04452       coord[1] = dest - coord[2] * ny;
04453       
04454       //Locate where in the actual grid the processor is
04455       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
04456         coord[dm.z] * xsize* ysize;
04457       
04458       if(pmemap[destPe] == 0) {
04459         pemap[gcount++] = destPe;
04460         pmemap[destPe] = 1;
04461         
04462         if(tmgr.hasMultipleProcsPerNode())
04463           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
04464 
04465         npme_pes ++;
04466       }
04467       else {
04468         for(int pos = 1; pos < ny * nz; pos++) {
04469           
04470           coord[2] += pos / ny;
04471           coord[1] += pos % ny;
04472           
04473           coord[2] = coord[2] % nz;
04474           coord[1] = coord[1] % ny;       
04475           
04476           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
04477             coord[dm.z] * xsize * ysize;
04478           
04479           if(pmemap[newdest] == 0) {
04480             pemap[gcount++] = newdest;
04481             pmemap[newdest] = 1;
04482             
04483             if(tmgr.hasMultipleProcsPerNode())
04484               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
04485             
04486             npme_pes ++;
04487             break;
04488           }
04489         }
04490       }      
04491     }   
04492     
04493     if(gcount == numPes)
04494       gcount = 0;    
04495     
04496     if(npme_pes >= numPes)
04497       break;
04498   }
04499   
04500   delete [] pmemap;
04501   
04502   if(npme_pes != numPes)
04503     //NAMD_bug("ORB PME allocator failed\n");
04504     return false;
04505 
04506   return true;
04507 }
04508 
04509 #endif
04510 
// Common base for the X/Y/Z pencil chare classes (T is the generated
// CBase_* class).  Owns the FFT data/work buffers, the block send-order
// permutation, and the message-counting state shared by all three
// pencil orientations.
template <class T> class PmePencil : public T {
public:
  PmePencil() {
    data = 0;
    work = 0;
    send_order = 0;
    needs_reply = 0;
#if USE_PERSISTENT
    trans_handle = untrans_handle = ungrid_handle = NULL;
#endif
  }
  ~PmePencil() {
#ifdef NAMD_FFTW
    // data is obtained from fftwf_malloc() in the subclasses' fft_init,
    // so it must be released with fftwf_free (fftwf_free(NULL) is a no-op).
    fftwf_free(data);
#endif
    delete [] work;
    delete [] send_order;
    delete [] needs_reply;
  }
  // Store the payload of the pencil-creation message and reset counters.
  void base_init(PmePencilInitMsg *msg) {
    imsg=0;
    imsgb=0;
    hasData=0;
    initdata = msg->data;
  }
  // Build the order in which the nBlocks peer blocks will be sent to:
  // bit-reversed when simParameters->PMESendOrder is set, otherwise a
  // random shuffle seeded by the PE number to spread network traffic.
  void order_init(int nBlocks) {
    send_order = new int[nBlocks];
    for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
    if ( Node::Object()->simParameters->PMESendOrder ) {
      std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
    } else {
      Random rand(CkMyPe());
      rand.reorder(send_order,nBlocks);
    }
    needs_reply = new int[nBlocks];
    offload = Node::Object()->simParameters->PMEOffload;
  }
  PmePencilInitMsgData initdata;  // grid geometry, proxies, block counts
  Lattice lattice;  // cell lattice of the current step
  PmeReduction evir;
  int sequence;  // used for priorities
  int imsg;  // used in sdag code
  int imsgb;  // Node par uses distinct counter for back path
  int hasData;  // used in message elimination
  int offload;  // cached simParameters->PMEOffload flag
  float *data;  // grid storage (fftwf_malloc'd by subclass fft_init)
  float *work;  // per-line FFT scratch buffer
  int *send_order;  // permutation of block indices used when sending
  int *needs_reply;  // per-source-block hasData flags (set in node_process_trans)
#if USE_PERSISTENT
  PersistentHandle *trans_handle;
  PersistentHandle *untrans_handle;
  PersistentHandle *ungrid_handle;
#endif
};
04566 
// Z-direction pencil: receives patch grid data, performs the real-to-complex
// FFT along z, forwards transposed data to the y pencils, and on the
// backward path un-transforms and returns grid data to the patches.
class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
public:
    PmeZPencil_SDAG_CODE
    PmeZPencil() { __sdag_init(); setMigratable(false); }
    // NOTE(review): only the migration constructor resets imsg/imsgb here;
    // the default constructor relies on base_init() for that — confirm.
    PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); imsg=imsgb=0;}
        ~PmeZPencil() {
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();
    void recv_grid(const PmeGridMsg *);
    void forward_fft();
    void send_trans();
        void send_subset_trans(int fromIdx, int toIdx);
    void recv_untrans(const PmeUntransMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void node_process_grid(PmeGridMsg *);
    void backward_fft();
        void send_ungrid(PmeGridMsg *);
        void send_all_ungrid();
        void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
private:
    ResizeArray<PmeGridMsg *> grid_msgs;  // held grid messages, reused for ungrid replies
    ResizeArray<int> work_zlist;
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        //for ckloop usage
        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    rfftwnd_plan forward_plan, backward_plan;
#endif
#endif

    int nx, ny;  // extent of this pencil in x and y (last pencil may be short)
#if USE_PERSISTENT
    // Create compressed persistent-communication channels to the y pencils,
    // one per z block, sized for a full PmeTransMsg payload.
    void setup_persistent() {
      int hd = 1;// ( hasData ? 1 : 0 );
      int zBlocks = initdata.zBlocks;
      int block3 = initdata.grid.block3;
      int dim3 = initdata.grid.dim3;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      CmiAssert(yPencil_local);
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
      for ( int isend=0; isend<zBlocks; ++isend ) {
          int kb = send_order[isend];
          int nz1 = block3;
          if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
          int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
          int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
    
    // Placeholder for persistent ungrid channels; handle creation is
    // commented out, so only the array is allocated.
    void setup_ungrid_persistent() 
    {
       ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
       for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
           int peer = grid_msgs[imsg]->sourceNode;
           //ungrid_handle[imsg] = CmiCreatePersistent(peer, 0); 
       }
    }
#endif
};
04639 
// Y-direction pencil: middle stage of the 3-stage pencil decomposition.
// Receives transposed data from the z pencils, FFTs along y, forwards to
// the x pencils, and relays the backward path in reverse.
class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
public:
    PmeYPencil_SDAG_CODE
    PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
    PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
    void fft_init();
    void recv_trans(const PmeTransMsg *);
    void forward_fft();
        void forward_subset_fft(int fromIdx, int toIdx);
    void send_trans();
        void send_subset_trans(int fromIdx, int toIdx);
    void recv_untrans(const PmeUntransMsg *);    
    void node_process_trans(PmeTransMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void backward_fft();
        void backward_subset_fft(int fromIdx, int toIdx);
    void send_untrans();
    void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
private:
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif

    int nx, nz;  // extent of this pencil in x and z (last pencil may be short)
#if USE_PERSISTENT
    // Create compressed persistent channels: forward (to x pencils) and
    // backward (to z pencils), one per y block in each direction.
    void setup_persistent() {
      int yBlocks = initdata.yBlocks;
      int block2 = initdata.grid.block2;
      int K2 = initdata.grid.K2;
      int hd = 1;
      CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
          int compress_size = sizeof(float)*hd*nx*ny1*nz*2; 
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }

      CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
          int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx*ny1*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,  compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif
};
04702 
// X-direction pencil: final forward stage.  After the x FFT it performs
// the reciprocal-space (k-space) computation via myKSpace, then starts
// the backward path toward the y pencils.
class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
public:
    PmeXPencil_SDAG_CODE
    PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
    PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
        ~PmeXPencil() {
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();
    void recv_trans(const PmeTransMsg *);
    void forward_fft();
    void pme_kspace();
    void backward_fft();
    void send_untrans();
        void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
    void node_process_trans(PmeTransMsg *);
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        int numPlans;  // number of per-chunk plans when CkLoop is enabled
        fftwf_plan *forward_plans, *backward_plans;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif
    int ny, nz;  // extent of this pencil in y and z (last pencil may be short)
    int recipEvirPe;  // PE receiving this pencil's energy/virial contribution
    void evir_init();
    PmeKSpace *myKSpace;  // reciprocal-space solver for this pencil's slab
#if USE_PERSISTENT
    // Create compressed persistent channels to the y pencils for the
    // backward (untrans) path, one per x block.
    void  setup_persistent() {
      int xBlocks = initdata.xBlocks;
      int block1 = initdata.grid.block1;
      int K1 = initdata.grid.K1;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
      for ( int isend=0; isend<xBlocks; ++isend ) {
          int ib = send_order[isend];
          int nx1 = block1;
          if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
          int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
          int size = sizeof(PmeUntransMsg) +
              sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24; 
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx1*ny*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif

};
04760 
04761 void PmeXPencil::evir_init() {
04762   recipEvirPe = findRecipEvirPe();
04763   initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
04764 }
04765 
// Allocate this z pencil's grid storage and create its FFTW plans
// (real-to-complex along z).  Plan creation is serialized with
// fftw_plan_lock because the FFTW planner is not thread-safe.
void PmeZPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // Register with the node-group manager so node-level receives can
  // find this chare element directly.
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block2 = initdata.grid.block2;

  // Extent of this pencil; the last pencil in each direction may be short.
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
  work = new float[dim3];

  order_init(initdata.zBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the how many */

  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  int sizeLines=nx*ny;
  int planLineSizes[1];
  planLineSizes[0]=K3;
  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
  int ndimHalf=ndim/2;
  // One in-place r2c plan covering all nx*ny lines of length K3.
  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
                                         (float *) data, NULL, 1, 
                                         ndim,
                                         (fftwf_complex *) data, NULL, 1,
                                         ndimHalf,
                                         fftwFlags);

  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
                                          (fftwf_complex *) data, NULL, 1, 
                                          ndimHalf,
                                          (float *) data, NULL, 1, 
                                          ndim,
                                          fftwFlags);
#if     CMK_SMP && USE_CKLOOP
  if(simParams->useCkLoop) {
          //How many FFT plans to be created? The grain-size issue!!.
          //Currently, I am choosing the min(nx, ny) to be coarse-grain
          numPlans = (nx<=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
          int howmany = sizeLines/numPlans;
          // Split the line set into numPlans chunks so CkLoop workers can
          // execute independent plans in parallel.
          forward_plans = new fftwf_plan[numPlans];
          backward_plans = new fftwf_plan[numPlans];
          for(int i=0; i<numPlans; i++) {
                  int dimStride = i*ndim*howmany;
                  int dimHalfStride = i*ndimHalf*howmany;
                  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
                                                                                                         ((float *)data)+dimStride, NULL, 1,
                                                                                                         ndim,
                                                                                                         ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                                                                         ndimHalf,
                                                                                                         fftwFlags);

                  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                                                                         ndimHalf,
                                                                                                         ((float *)data)+dimStride, NULL, 1,
                                                                                                         ndim,
                                                                                                         fftwFlags);
          }
  }else 
#endif 
  {
          forward_plans = NULL;
          backward_plans = NULL;
  }
#else
  // FFTW2 fallback: rfftwnd in-place plans with wisdom.
  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
    // Node-parallel receive accumulates directly into data, so it must
    // start zeroed.
    evir = 0.;
    memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
}
04868 
// Allocate this y pencil's storage and create complex-to-complex FFTW
// plans along y.  Serialized with fftw_plan_lock (planner not thread-safe).
void PmeYPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim2 = initdata.grid.dim2;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block3 = initdata.grid.block3;

  // Extent of this pencil; z blocks partition the dim3/2 complex values.
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
  work = new float[2*K2];

  order_init(initdata.yBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the dimensions */
  /* ideally this should be implementable as a single multidimensional
   *  plan, but that has proven tricky to implement, so we maintain the
   *  loop of 1d plan executions. */
  int sizeLines=nz;
  int planLineSizes[1];
  planLineSizes[0]=K2;
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  // In-place c2c transforms of nz interleaved lines of length K2
  // (stride sizeLines between successive elements of a line).
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_FORWARD, 
                                     fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_BACKWARD, 
                                      fftwFlags);
  CkAssert(forward_plan != NULL);
  CkAssert(backward_plan != NULL);
#else
  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  evir = 0;
  CmiMemoryWriteFence();
#endif
}
04939 
// Node-parallel receive of a forward transpose message.  Messages may be
// delivered concurrently on different PEs of the node, so the arrival
// counter is incremented atomically; whichever delivery completes the set
// of yBlocks messages runs the FFT and forwards the data.
void PmeYPencil::node_process_trans(PmeTransMsg *msg)
{
  if ( msg->hasData ) hasData = 1;
  needs_reply[msg->sourceNode] = msg->hasData;
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      // This delivery was the last one: do the per-pencil work here.
      if ( hasData ) {
        forward_fft();
      }
      send_trans();
      if( ! hasData)
        {
          send_untrans(); //todo, what is up with the recvAck in SDAG version?
        }
      imsg=0;
      // Publish the counter reset before other PEs observe further messages.
      CmiMemoryWriteFence();
    }
}
04961 
// Node-parallel receive on the backward path; mirrors node_process_trans
// but uses the separate imsgb counter.  The delivery that completes the
// set runs the backward FFT and sends data on toward the z pencils.
void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
{
  recv_untrans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      backward_fft();
      send_untrans();
      imsgb=0;
      // Publish the counter reset before further messages are observed.
      CmiMemoryWriteFence();
    }
}
04975 
04976 #define DEBUG_NODE_PAR_RECV 0
04977 
04978 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
04979   //  CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
04980   PmeXPencil *target=xPencilObj.get(msg->destElem);
04981 #if DEBUG_NODE_PAR_RECV
04982   if(target == NULL)
04983     CkAbort("xpencil in recvXTrans not found, debug registeration");
04984 #endif  
04985     target->node_process_trans(msg);
04986   delete msg;
04987 }
04988 
04989 
04990 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
04991   //  CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
04992   PmeYPencil *target=yPencilObj.get(msg->destElem);
04993 #if DEBUG_NODE_PAR_RECV
04994   if(target == NULL)
04995     CkAbort("ypencil in recvYTrans not found, debug registeration");
04996 #endif  
04997     target->node_process_trans(msg);
04998   delete msg;
04999  }
05000 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
05001   //  CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05002   PmeYPencil *target=yPencilObj.get(msg->destElem);
05003 #if DEBUG_NODE_PAR_RECV  
05004   if(target == NULL)
05005     CkAbort("ypencil in recvYUntrans not found, debug registeration");
05006 #endif  
05007     target->node_process_untrans(msg);
05008   delete msg;
05009  }
05010 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
05011   //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05012   PmeZPencil *target=zPencilObj.get(msg->destElem);
05013 #if DEBUG_NODE_PAR_RECV
05014   if(target == NULL)
05015     CkAbort("zpencil in recvZUntrans not found, debug registeration");
05016 #endif
05017   target->node_process_untrans(msg);
05018   delete msg;
05019 }
05020 
05021 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
05022   //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05023   PmeZPencil *target=zPencilObj.get(msg->destElem);
05024 #if DEBUG_NODE_PAR_RECV
05025   if(target == NULL){
05026     CkAbort("zpencil in recvZGrid not found, debug registeration");
05027   }
05028 #endif
05029   target->node_process_grid(msg); //msg is stored inside node_proces_grid
05030 }
05031 
// Allocate this x pencil's storage, create complex-to-complex FFTW plans
// along x, and build the k-space solver for this pencil's slab.
// Serialized with fftw_plan_lock (planner not thread-safe).
void PmeXPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;
#if USE_NODE_PAR_RECEIVE
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim3 = initdata.grid.dim3;
  int block2 = initdata.grid.block2;
  int block3 = initdata.grid.block3;

  // Extent of this pencil; the last pencil in each direction may be short.
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
  work = new float[2*K1];

  order_init(initdata.xBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the how many */
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  int sizeLines=ny*nz;
  int planLineSizes[1];
  planLineSizes[0]=K1;
  // In-place c2c transforms of ny*nz interleaved lines of length K1.
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                   FFTW_FORWARD,
                                     fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                          FFTW_BACKWARD,
                                      fftwFlags);

#if     CMK_SMP && USE_CKLOOP
  if(simParams->useCkLoop) {
          //How many FFT plans to be created? The grain-size issue!!.
          //Currently, I am choosing the min(nx, ny) to be coarse-grain
          numPlans = (ny<=nz?ny:nz);
          // limit attempted parallelism due to false sharing
          //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
          //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
          if ( sizeLines/numPlans < 4 ) numPlans = 1;
          int howmany = sizeLines/numPlans;
          // Split the line set into numPlans chunks for parallel execution
          // by CkLoop workers.
          forward_plans = new fftwf_plan[numPlans];
          backward_plans = new fftwf_plan[numPlans];
          for(int i=0; i<numPlans; i++) {
                  int curStride = i*howmany;              
                  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                        FFTW_FORWARD,
                                                                                                         fftwFlags);

                  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                          FFTW_BACKWARD,
                                                                                                         fftwFlags);
          }
  }else
#endif
  {
          forward_plans = NULL;
          backward_plans = NULL;
  }
#else
  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        ny*nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        ny*nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  // K-space solver for this pencil's (y,z) slab of reciprocal space.
  myKSpace = new PmeKSpace(initdata.grid,
                thisIndex.y*block2, thisIndex.y*block2 + ny,
                thisIndex.z*block3, thisIndex.z*block3 + nz);

}
05128 
05129 // #define FFTCHECK   // run a grid of integers through the fft
05130 // #define ZEROCHECK  // check for suspicious zeros in fft
05131 
// Accumulate one patch message's charge contribution into this Z pencil.
// A PmeGridMsg carries a sparse representation: a flag byte per (x,y)
// column (fgrid) and, for each flagged column, charge values (qgrid) at
// the z indices listed in zlist.
void PmeZPencil::recv_grid(const PmeGridMsg *msg) {

  int dim3 = initdata.grid.dim3;
  if ( imsg == 0 ) {  // first message of this step: capture step state
    lattice = msg->lattice;
    sequence = msg->sequence;
#if ! USE_NODE_PAR_RECEIVE
    // zero the accumulation grid; with node-par receive this is done elsewhere
    memset(data, 0, sizeof(float)*nx*ny*dim3);
#endif
  }

  if ( ! msg->hasData ) return;  // empty message: counted, nothing to add

  int zlistlen = msg->zlistlen;
#ifdef NAMD_KNL
  // copy the z-index list into 64-byte-aligned scratch so the inner
  // accumulation loop can vectorize on KNL
  int * __restrict msg_zlist = msg->zlist;
  int * __restrict zlist = work_zlist.begin();
  __assume_aligned(zlist,64);
  for ( int k=0; k<zlistlen; ++k ) {
    zlist[k] = msg_zlist[k];
  }
#else
  int * __restrict zlist = msg->zlist;
#endif
  char * __restrict fmsg = msg->fgrid;
  float * __restrict qmsg = msg->qgrid;
  float * __restrict d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      if( *(fmsg++) ) {  // this (i,j) column is present in the message
        #pragma ivdep
        for ( int k=0; k<zlistlen; ++k ) {
          d[zlist[k]] += *(qmsg++);
        }
      }
     }
    }
  }
}
05173 
// CkLoop task body shared by the X and Z pencils: run the FFTW3 plans
// with indices [first,last] from the plan array handed in via param.
// Compiles to a no-op unless built against FFTW3.
static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan *planArray = static_cast<fftwf_plan *>(param);
    for ( int idx = first; idx <= last; ++idx ) {
        fftwf_execute(planArray[idx]);
    }
#endif
#endif
}
05182 
05183 void PmeZPencil::forward_fft() {
05184   evir = 0.;
05185 #ifdef FFTCHECK
05186   int dim3 = initdata.grid.dim3;
05187   int K3 = initdata.grid.K3;
05188   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05189   float *d = data;
05190   for ( int i=0; i<nx; ++i ) {
05191    for ( int j=0; j<ny; ++j, d += dim3 ) {
05192     for ( int k=0; k<dim3; ++k ) {
05193       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
05194     }
05195    }
05196   }
05197 #endif
05198 #ifdef NAMD_FFTW
05199 #ifdef MANUAL_DEBUG_FFTW3
05200   dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05201 #endif
05202 #ifdef NAMD_FFTW_3
05203 #if     CMK_SMP && USE_CKLOOP
05204   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05205   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
05206      && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
05207           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
05208           //transform the above loop
05209           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
05210           return;
05211   }
05212 #endif
05213   fftwf_execute(forward_plan);
05214 #else
05215   rfftwnd_real_to_complex(forward_plan, nx*ny,
05216         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
05217 #endif
05218 #ifdef MANUAL_DEBUG_FFTW3
05219   dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05220 #endif
05221 
05222 #endif
05223 #ifdef ZEROCHECK
05224   int dim3 = initdata.grid.dim3;
05225   int K3 = initdata.grid.K3;
05226   float *d = data;
05227   for ( int i=0; i<nx; ++i ) {
05228    for ( int j=0; j<ny; ++j, d += dim3 ) {
05229     for ( int k=0; k<dim3; ++k ) {
05230       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
05231         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
05232     }
05233    }
05234   }
05235 #endif
05236 }
05237 
05238 /* A single task for partitioned PmeZPencil::send_trans work */
05239 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05240         PmeZPencil *zpencil = (PmeZPencil *)param;
05241         zpencil->send_subset_trans(first, last);        
05242 }
05243 
// Pack and send PmeTransMsg messages for send_order slots [fromIdx,toIdx]
// of the z->y transpose.  Each message carries this pencil's slab of
// complex data (2 floats per point) for one destination y pencil, or an
// empty payload when hasData is false.
void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
	int zBlocks = initdata.zBlocks;
	int block3 = initdata.grid.block3;
	int dim3 = initdata.grid.dim3;
	for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
	  int kb = send_order[isend];
	  int nz = block3;
	  // dim3/2 is the complex line length; the final block may be short
	  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
	  int hd = ( hasData ? 1 : 0 );  // allocate payload only when we have data
	  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
	  msg->lattice = lattice;
	  msg->sourceNode = thisIndex.y;
	  msg->hasData = hasData;
	  msg->nx = ny;
	 if ( hasData ) {
	  // copy the complex range [kb*block3, kb*block3+nz) of every line
	  float *md = msg->qgrid;
	  const float *d = data;
	  for ( int i=0; i<nx; ++i ) {
	   for ( int j=0; j<ny; ++j, d += dim3 ) {
		for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
		  *(md++) = d[2*k];
		  *(md++) = d[2*k+1];
		}
	   }
	  }
	 }
	  msg->sequence = sequence;
	  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      // route through the node-level group on the destination node
      msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
#if Y_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if Y_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif    
#else
#if Y_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if Y_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif    
#endif
    CmiEnableUrgentSend(0);
    }
}
05295 
// Send this z pencil's transformed data to all zBlocks destination
// y pencils (the z->y transpose).  With CkLoop enabled and enough PEs
// the sends are partitioned across the node via PmeZPencilSendTrans;
// otherwise the loop below performs the same packing serially.
void PmeZPencil::send_trans() {
#if USE_PERSISTENT
    if (trans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
	int useCkLoop = Node::Object()->simParameters->useCkLoop;
	if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
	   && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
		//send_subset_trans(0, initdata.zBlocks-1);
		CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
		return;
	}
#endif
  int zBlocks = initdata.zBlocks;
  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  for ( int isend=0; isend<zBlocks; ++isend ) {
    int kb = send_order[isend];
    int nz = block3;
    // dim3/2 is the complex line length; the final block may be short
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
    int hd = ( hasData ? 1 : 0 );  // allocate payload only when we have data
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.y;
    msg->hasData = hasData;
    msg->nx = ny;
   if ( hasData ) {
    // copy the complex range [kb*block3, kb*block3+nz) of every line
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
        *(md++) = d[2*k];
        *(md++) = d[2*k+1];
      }
     }
    }
   }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // route through the node-level group on the destination node
    msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
#if Y_PERSIST 
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif    
#else
#if Y_PERSIST 
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif    
#endif
    CmiEnableUrgentSend(0);
  }
}
05365 
// Receive one z->y transpose message and scatter its slab into this
// y pencil's data array (stride K2*nz*2 floats per x plane).  Messages
// without data zero the corresponding region instead.
void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {  // first message of this step: capture step state
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;  // sender's y-block index selects the j range
  int ny = msg->nx;          // number of j lines carried by this message
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      // interleaved real/imaginary pairs
      d[2*(j*nz+k)] = *(md++);
      d[2*(j*nz+k)+1] = *(md++);
    }
   }
  }
 } else {
  // no payload: clear the region this sender would have filled
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*(j*nz+k)] = 0;
      d[2*(j*nz+k)+1] = 0;
    }
   }
  }
 }
}
05402 
05403 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
05404         PmeYPencil *ypencil = (PmeYPencil *)param;
05405         ypencil->forward_subset_fft(first, last);
05406 }
05407 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
05408 #ifdef NAMD_FFTW
05409 #ifdef NAMD_FFTW_3
05410         for(int i=fromIdx; i<=toIdx; i++){
05411                 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
05412                       * nz * initdata.grid.K2,  
05413                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05414         }
05415 #endif
05416 #endif
05417 }
05418 
// Forward FFT along y for this pencil: in-place transform of every
// x plane.  With CkLoop enabled and enough PEs the planes are processed
// in parallel via PmeYPencilForwardFFT; otherwise serially below.
void PmeYPencil::forward_fft() {
    evir = 0.;
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
  
#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
	  CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
	  return;
  }
#endif
  //the above is a transformation of the following loop using CkLoop
  for ( int i=0; i<nx; ++i ) {
    // in-place transform of x plane i (stride nz*K2 complex elements)
    fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
                      * nz * initdata.grid.K2,  
                      ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
  // FFTW2 path
  for ( int i=0; i<nx; ++i ) {
    fftw(forward_plan, nz,
	((fftw_complex *) data) + i * nz * initdata.grid.K2,
	nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05454 
05455 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05456         PmeYPencil *ypencil = (PmeYPencil *)param;
05457         ypencil->send_subset_trans(first, last);
05458 }
05459 
// Pack and send PmeTransMsg messages for send_order slots [fromIdx,toIdx]
// of the y->x transpose.  Each message carries this pencil's slab of
// complex data for one destination x pencil, or no payload when hasData
// is false.
void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
	int yBlocks = initdata.yBlocks;
	int block2 = initdata.grid.block2;
	int K2 = initdata.grid.K2;
    for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
	  int jb = send_order[isend];
	  int ny = block2;
	  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // last block may be short
	  int hd = ( hasData ? 1 : 0 );  // allocate payload only when we have data
	  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
	  msg->lattice = lattice;
	  msg->sourceNode = thisIndex.x;
	  msg->hasData = hasData;
	  msg->nx = nx;
	 if ( hasData ) {
	  // copy j range [jb*block2, jb*block2+ny) of every x plane
	  float *md = msg->qgrid;
	  const float *d = data;
	  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
	   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
		for ( int k=0; k<nz; ++k ) {
		  *(md++) = d[2*(j*nz+k)];
		  *(md++) = d[2*(j*nz+k)+1];
  #ifdef ZEROCHECK
		  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
	  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
  #endif
		}
	   }
	  }
	  // sanity check: the write cursor must land exactly at the payload end
	  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
	  thisIndex.x, jb, thisIndex.z);
	 }
	  msg->sequence = sequence;
	  SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
      CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      // route through the node-level group on the destination node
      msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
#if X_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
#if X_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif
#else      
#if X_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if X_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif
#endif
      CmiEnableUrgentSend(0);
	}
}
05516 
// Send this y pencil's transformed data to all yBlocks destination
// x pencils (the y->x transpose).  With CkLoop enabled and enough PEs
// the sends are partitioned across the node via PmeYPencilSendTrans;
// otherwise the loop below performs the same packing serially.
void PmeYPencil::send_trans() {
#if USE_PERSISTENT
    if (trans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
	int useCkLoop = Node::Object()->simParameters->useCkLoop;
	if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
	   && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
		//send_subset_trans(0, initdata.yBlocks-1);
		CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
		return;
	}
#endif
  int yBlocks = initdata.yBlocks;
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // last block may be short
    int hd = ( hasData ? 1 : 0 );  // allocate payload only when we have data
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.x;
    msg->hasData = hasData;
    msg->nx = nx;
   if ( hasData ) {
    // copy j range [jb*block2, jb*block2+ny) of every x plane
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*(j*nz+k)];
        *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
        if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      }
     }
    }
    // sanity check: the write cursor must land exactly at the payload end
    if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
        thisIndex.x, jb, thisIndex.z);
   }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // route through the node-level group on the destination node
    msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
#if X_PERSIST 
	CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
#if X_PERSIST 
	CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if X_PERSIST 
	CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if X_PERSIST 
	CmiUsePersistentHandle(NULL, 0);
#endif
    
#endif
    CmiEnableUrgentSend(0);
  }
}
05592 
// Node-level handler for an incoming y->x transpose message.  Scatters
// the message into data, counts arrivals with an atomic increment
// (messages may be delivered by multiple threads of the node), and once
// all initdata.xBlocks messages have arrived runs the x-phase pipeline
// (FFT, k-space energy, inverse FFT) and sends results back.
void PmeXPencil::node_process_trans(PmeTransMsg *msg)
{
  if(msg->hasData) hasData=1;
  needs_reply[msg->sourceNode] = msg->hasData;  // record who expects data back
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.xBlocks)  // this was the last expected message
    {
      if(hasData){
        forward_fft();
        pme_kspace();
        backward_fft();
      }
      send_untrans();
      imsg=0;
      CmiMemoryWriteFence();  // publish the imsg reset to other threads
    }
}
05612 
// Receive one y->x transpose message and scatter its slab into this
// x pencil's data array (ny*nz*2 floats per i line).  Messages without
// data zero the corresponding region instead.
void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {  // first message of this step: capture step state
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block1 = initdata.grid.block1;
  int K1 = initdata.grid.K1;
  int ib = msg->sourceNode;  // sender's x-block index selects the i range
  int nx = msg->nx;          // number of i lines carried by this message
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
        ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      // interleaved real/imaginary pairs
      d[2*k] = *(md++);
      d[2*k+1] = *(md++);
    }
   }
  }
 } else {
  // no payload: clear the region this sender would have filled
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*k] = 0;
      d[2*k+1] = 0;
    }
   }
  }
 }
}
05649 
// Forward FFT along x for this pencil.  With CkLoop enabled and enough
// PEs the per-chunk plans (forward_plans) are executed in parallel via
// PmeXZPencilFFT; otherwise the single combined plan is executed.
void PmeXPencil::forward_fft() {
#ifdef NAMD_FFTW

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
	  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
	  //transform the above loop
	  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
	  return;
  }
#endif
  fftwf_execute(forward_plan);
#else
  // FFTW2 path
  fftw(forward_plan, ny*nz,
	((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05679 
// Reciprocal-space phase: apply the PME convolution to this pencil's
// fully transformed data and accumulate energy (evir[0]) and virial
// components (evir[1..]) via PmeKSpace::compute_energy.
void PmeXPencil::pme_kspace() {

  evir = 0.;

#ifdef FFTCHECK
  // FFT-debug builds skip the physics entirely
  return;
#endif

  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;

  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    useCkLoop = 1;
  }
#endif

  int numGrids = 1;  // pencil PME handles a single grid
  for ( int g=0; g<numGrids; ++g ) {
    // data+0*g is a no-op offset, presumably kept for symmetry with
    // multi-grid variants of this code
    evir[0] = myKSpace->compute_energy(data+0*g,
		lattice, ewaldcof, &(evir[1]), useCkLoop);
  }
  
#if USE_NODE_PAR_RECEIVE
    CmiMemoryWriteFence();  // make evir visible to other node threads
#endif
}
05708 
// Inverse FFT along x for this pencil after the k-space phase.  With
// CkLoop enabled and enough PEs the per-chunk plans (backward_plans)
// are executed in parallel; otherwise the single combined plan runs.
void PmeXPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
	  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
	  //transform the above loop
	  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
	  return;
  }
#endif
  fftwf_execute(backward_plan);
#else
  // FFTW2 path
  fftw(backward_plan, ny*nz,
	((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#endif
}
05736 
05737 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
05738         int evirIdx = paraNum;
05739         PmeXPencil *xpencil = (PmeXPencil *)param;
05740         xpencil->send_subset_untrans(first, last, evirIdx);
05741 }
05742 
// Send the x->y untranspose replies for send_order slots [fromIdx,toIdx].
// Slots before evirIdx get a bare PmeAckMsg (their senders had no data);
// slot evirIdx gets the first PmeUntransMsg of the step; later slots get
// ordinary PmeUntransMsg replies.  The range is first classified into an
// ack sub-range and an untrans sub-range around evirIdx.
void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
	int xBlocks = initdata.xBlocks;
	int block1 = initdata.grid.block1;	
	int K1 = initdata.grid.K1;

	int ackL=0, ackH=-1;   // slots that receive PmeAckMsg (empty by default)
	int unL=0, unH=-1;     // slots that receive plain PmeUntransMsg
	int send_evir=0;       // whether evirIdx falls inside [fromIdx,toIdx]
	if(fromIdx >= evirIdx+1) {
		//send PmeUntransMsg with has_evir=0
		unL = fromIdx;
		unH = toIdx;		
	} else if(toIdx <= evirIdx-1) {
		//send PmeAckMsg
		ackL=fromIdx;
		ackH=toIdx;		
	} else {
		//partially send PmeAckMsg and partially send PmeUntransMsg
		ackL=fromIdx;
		ackH=evirIdx-1;
		send_evir=1;
		unL=evirIdx+1;
		unH=toIdx;
	}

	for(int isend=ackL; isend<=ackH; isend++) {
		//send PmeAckMsg
        CmiEnableUrgentSend(1);
		int ib = send_order[isend];
		PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
		SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
		initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
        CmiEnableUrgentSend(0);
    }

    CmiEnableUrgentSend(1);
	//send PmeUntransMsg with has_evir=1
	if(send_evir) {
		int ib = send_order[evirIdx];
		int nx = block1;
		if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;  // last block may be short
		PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;		
		msg->sourceNode = thisIndex.y;
		msg->ny = ny;
		// pack the [ib*block1, ib*block1+nx) i lines as interleaved pairs
		float *md = msg->qgrid;
		for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
			float *d = data + i*ny*nz*2;
			for ( int j=0; j<ny; ++j, d += nz*2 ) {
				for ( int k=0; k<nz; ++k ) {
					*(md++) = d[2*k];
					*(md++) = d[2*k+1];
				}
			}
		}
		SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
#if USE_NODE_PAR_RECEIVE
        msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
        initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#else
        initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#endif
	 }
    CmiEnableUrgentSend(0);
	
	//send PmeUntransMsg with has_evir=0
	for(int isend=unL; isend<=unH; isend++) {
		int ib = send_order[isend];
		int nx = block1;
		if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;  // last block may be short
		PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
		msg->sourceNode = thisIndex.y;
		msg->ny = ny;
		// pack the [ib*block1, ib*block1+nx) i lines as interleaved pairs
		float *md = msg->qgrid;
		for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
			float *d = data + i*ny*nz*2;
			for ( int j=0; j<ny; ++j, d += nz*2 ) {
				for ( int k=0; k<nz; ++k ) {
					*(md++) = d[2*k];
					*(md++) = d[2*k+1];
				}
			}
		}
		SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
        CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
        msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
#if Y_PERSIST 
        CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#if Y_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if Y_PERSIST 
  //      CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#if Y_PERSIST 
   //     CmiUsePersistentHandle(NULL, 0);
#endif
#endif
        CmiEnableUrgentSend(0);
	}
}
05848 
05849 void PmeXPencil::send_untrans() {
05850 
05851   { // send energy and virial
05852     int numGrids = 1;
05853     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
05854     newmsg->evir[0] = evir;
05855     SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
05856     CmiEnableUrgentSend(1);
05857     initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
05858     CmiEnableUrgentSend(0);
05859   }
05860 
05861 #if USE_PERSISTENT
05862   if (untrans_handle == NULL) setup_persistent();
05863 #endif
05864 #if     CMK_SMP && USE_CKLOOP
05865   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05866   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
05867      && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
05868                 int xBlocks = initdata.xBlocks;
05869                 int evirIdx = 0;
05870                 for ( int isend=0; isend<xBlocks; ++isend ) {
05871                         int ib = send_order[isend];
05872                         if (needs_reply[ib]) {
05873                                 evirIdx = isend;
05874                                 break;
05875                         }
05876                 }
05877 
05878                 //basically: 
05879                 //[0,evirIdx-1]->send PmeAckMsg
05880                 //evirIdx->send PmeUntransMsg with has_evir=1
05881                 //[evirIdx+1, xBlocks-1]->send PmeUntransMsg with has_evir=0
05882                 //send_subset_untrans(0, xBlocks-1, evirIdx);
05883 #if USE_NODE_PAR_RECEIVE
05884                 //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
05885                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
05886 #else
05887         //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
05888                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
05889 #endif        
05890                 return;
05891   }
05892 #endif
05893   int xBlocks = initdata.xBlocks;
05894   int block1 = initdata.grid.block1;
05895   int K1 = initdata.grid.K1;
05896   int send_evir = 1;
05897   for ( int isend=0; isend<xBlocks; ++isend ) {
05898     int ib = send_order[isend];
05899     if ( ! needs_reply[ib] ) {
05900       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
05901       CmiEnableUrgentSend(1);
05902       SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05903       initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
05904       CmiEnableUrgentSend(0);
05905       continue;
05906     }
05907     int nx = block1;
05908     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
05909     PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
05910     if ( send_evir ) {
05911       send_evir = 0;
05912     }
05913     msg->sourceNode = thisIndex.y;
05914     msg->ny = ny;
05915     float *md = msg->qgrid;
05916     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
05917      float *d = data + i*ny*nz*2;
05918      for ( int j=0; j<ny; ++j, d += nz*2 ) {
05919       for ( int k=0; k<nz; ++k ) {
05920         *(md++) = d[2*k];
05921         *(md++) = d[2*k+1];
05922       }
05923      }
05924     }
05925     SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05926 
05927     CmiEnableUrgentSend(1);
05928 #if USE_NODE_PAR_RECEIVE
05929     msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
05930 #if Y_PERSIST 
05931     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05932 #endif
05933     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
05934 #if Y_PERSIST 
05935     CmiUsePersistentHandle(NULL, 0);
05936 #endif
05937 #else
05938 #if Y_PERSIST 
05939     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05940 #endif
05941     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
05942 #if Y_PERSIST 
05943     CmiUsePersistentHandle(NULL, 0);
05944 #endif
05945 #endif
05946     CmiEnableUrgentSend(0);
05947   }
05948 }
05949 
// Receive one block of untransposed data from an X pencil and scatter it
// into this Y pencil's local grid.  msg->sourceNode is the sender's block
// index along y (it was set to thisIndex.y by the X pencil's send), and
// msg->ny is the number of y planes carried by this message.  Local data
// layout: for each of the nx x-planes there are K2*nz interleaved complex
// (re,im) values, hence the factor-of-2 indexing.
void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;  // sender's y-block index
  int ny = msg->ny;          // y extent of this message's block
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();  // keep communication advancing on Blue Gene/L
#endif   
    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
        // Debug aid: flag unexpected exact zeros in the incoming stream.
        if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
                                    thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
        d[2*(j*nz+k)] = *(md++);    // real part
        d[2*(j*nz+k)+1] = *(md++);  // imaginary part
      }
    }
  }
}
05973 
05974 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
05975         PmeYPencil *ypencil = (PmeYPencil *)param;
05976         ypencil->backward_subset_fft(first, last);
05977 }
05978 
05979 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
05980 #ifdef NAMD_FFTW
05981 #ifdef NAMD_FFTW_3
05982         for(int i=fromIdx; i<=toIdx; i++){
05983                 fftwf_execute_dft(backward_plan,        
05984                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2,         
05985                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05986         }
05987 #endif
05988 #endif
05989 }
05990 
// Backward FFT over this Y pencil: one transform per x-plane, with planes
// spaced nz*K2 complex elements apart.  With FFTW3 the per-plane loop may
// be farmed out across the node's PEs via CkLoop when enabled and there
// are enough PEs; otherwise the nx planes are transformed serially.  The
// FFTW2 path passes 'work' as the output/scratch buffer (its role depends
// on how backward_plan was created — presumably in-place; see plan setup).
void PmeYPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
          // Split the nx planes across the node's PEs; this call blocks
          // until all chunks complete (sync variant).
          CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
          return;
  }
#endif
  //the above is a transformation of the following loop using CkLoop
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
        CmiNetworkProgress();
#endif
    // In-place transform of plane i.
    fftwf_execute_dft(backward_plan,    
                                          ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
                                          ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
        for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
          CmiNetworkProgress();
#endif
                // FFTW2 path: batched transform with 'work' as out/scratch.
                fftw(backward_plan, nz,
                ((fftw_complex *) data) + i * nz * initdata.grid.K2,
                nz, 1, (fftw_complex *) work, 1, 0);
        }
#endif

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
06032 
06033 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
06034         int evirIdx = paraNum;
06035         PmeYPencil *ypencil = (PmeYPencil *)param;
06036         ypencil->send_subset_untrans(first, last, evirIdx);
06037 }
06038 
06039 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
06040         int yBlocks = initdata.yBlocks;
06041         int block2 = initdata.grid.block2;      
06042         int K2 = initdata.grid.K2;
06043 
06044         int ackL=0, ackH=-1;
06045         int unL=0, unH=-1;
06046         int send_evir=0;
06047         if(fromIdx >= evirIdx+1) {
06048                 //send PmeUntransMsg with has_evir=0
06049                 unL = fromIdx;
06050                 unH = toIdx;            
06051         } else if(toIdx <= evirIdx-1) {
06052                 //send PmeAckMsg
06053                 ackL=fromIdx;
06054                 ackH=toIdx;             
06055         } else {
06056                 //partially send PmeAckMsg and partially send PmeUntransMsg
06057                 ackL=fromIdx;
06058                 ackH=evirIdx-1;
06059                 send_evir=1;
06060                 unL=evirIdx+1;
06061                 unH=toIdx;
06062         }
06063 
06064         for(int isend=ackL; isend<=ackH; isend++) {
06065                 //send PmeAckMsg
06066         CmiEnableUrgentSend(1);
06067                 int jb = send_order[isend];
06068                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
06069                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06070                 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
06071         CmiEnableUrgentSend(0);
06072         }
06073 
06074     CmiEnableUrgentSend(1);
06075         //send PmeUntransMsg with has_evir=1
06076         if(send_evir) {
06077                 int jb = send_order[evirIdx];
06078                 int ny = block2;
06079                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06080                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;              
06081                 msg->sourceNode = thisIndex.z;
06082                 msg->ny = nz;
06083                 float *md = msg->qgrid;
06084                 const float *d = data;
06085                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06086                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06087                                 for ( int k=0; k<nz; ++k ) {
06088                                         *(md++) = d[2*(j*nz+k)];
06089                                         *(md++) = d[2*(j*nz+k)+1];
06090                                 }
06091                         }
06092                 }
06093                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06094 #if USE_NODE_PAR_RECEIVE
06095         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06096     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06097         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06098 #else
06099         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06100 #endif
06101         }
06102 
06103     CmiEnableUrgentSend(0);
06104         //send PmeUntransMsg with has_evir=0
06105         for(int isend=unL; isend<=unH; isend++) {
06106                 int jb = send_order[isend];
06107                 int ny = block2;
06108                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06109                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
06110                 msg->sourceNode = thisIndex.z;
06111                 msg->ny = nz;
06112                 float *md = msg->qgrid;
06113                 const float *d = data;
06114                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06115                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06116                                 for ( int k=0; k<nz; ++k ) {
06117                                         *(md++) = d[2*(j*nz+k)];
06118                                         *(md++) = d[2*(j*nz+k)+1];
06119                                 }
06120                         }
06121                 }
06122                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06123             CmiEnableUrgentSend(1);
06124 #if USE_NODE_PAR_RECEIVE
06125         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06126         //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06127 #if Z_PERSIST 
06128         CmiUsePersistentHandle(&untrans_handle[isend], 1);
06129 #endif
06130         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06131 #if Z_PERSIST 
06132         CmiUsePersistentHandle(NULL, 0);
06133 #endif
06134 #else
06135 #if Z_PERSIST 
06136         CmiUsePersistentHandle(&untrans_handle[isend], 1);
06137 #endif
06138         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06139 #if Z_PERSIST 
06140         CmiUsePersistentHandle(NULL, 0);
06141 #endif
06142 #endif
06143     CmiEnableUrgentSend(0);
06144         }
06145 }
06146 
06147 void PmeYPencil::send_untrans() {
06148 #if USE_PERSISTENT
06149   if (untrans_handle == NULL) setup_persistent();
06150 #endif
06151 #if     CMK_SMP && USE_CKLOOP
06152   int useCkLoop = Node::Object()->simParameters->useCkLoop;
06153   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
06154      && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
06155           int yBlocks = initdata.yBlocks;
06156           int evirIdx = 0;
06157           for ( int isend=0; isend<yBlocks; ++isend ) {
06158                   int jb = send_order[isend];
06159                   if (needs_reply[jb]) {
06160                           evirIdx = isend;
06161                           break;
06162                   }
06163           }
06164 
06165           //basically: 
06166           //[0,evirIdx-1]->send PmeAckMsg
06167           //evirIdx->send PmeUntransMsg with has_evir=1
06168           //[evirIdx+1, yBlocks-1]->send PmeUntransMsg with has_evir=0
06169           //send_subset_untrans(0, yBlocks-1, evirIdx);
06170 #if USE_NODE_PAR_RECEIVE      
06171           //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
06172           CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
06173       evir = 0.;
06174       CmiMemoryWriteFence();
06175 #else
06176       //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
06177           CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
06178 #endif
06179           return;
06180   }
06181 #endif
06182   int yBlocks = initdata.yBlocks;
06183   int block2 = initdata.grid.block2;
06184   int K2 = initdata.grid.K2;
06185   int send_evir = 1;
06186   for ( int isend=0; isend<yBlocks; ++isend ) {
06187     int jb = send_order[isend];
06188     if ( ! needs_reply[jb] ) {
06189       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
06190       CmiEnableUrgentSend(1);
06191       SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06192       initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
06193       CmiEnableUrgentSend(0);
06194       continue;
06195     }
06196     int ny = block2;
06197     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06198     PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
06199     if ( send_evir ) {
06200       send_evir = 0;
06201     }
06202     msg->sourceNode = thisIndex.z;
06203     msg->ny = nz;
06204     float *md = msg->qgrid;
06205     const float *d = data;
06206     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06207      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06208       for ( int k=0; k<nz; ++k ) {
06209         *(md++) = d[2*(j*nz+k)];
06210         *(md++) = d[2*(j*nz+k)+1];
06211       }
06212      }
06213     }
06214     SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06215 
06216     CmiEnableUrgentSend(1);
06217 #if USE_NODE_PAR_RECEIVE
06218     msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06219     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06220 #if Z_PERSIST 
06221     CmiUsePersistentHandle(&untrans_handle[isend], 1);
06222 #endif
06223     initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06224 #if Z_PERSIST
06225     CmiUsePersistentHandle(NULL, 0);
06226 #endif
06227 #else
06228 #if Z_PERSIST 
06229     CmiUsePersistentHandle(&untrans_handle[isend], 1);
06230 #endif
06231     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06232 #if Z_PERSIST 
06233     CmiUsePersistentHandle(NULL, 0);
06234 #endif
06235 #endif    
06236     CmiEnableUrgentSend(0);
06237   }
06238   
06239 #if USE_NODE_PAR_RECEIVE
06240   evir = 0.;
06241   CmiMemoryWriteFence();
06242 #endif
06243 }
06244 
// Receive one block of untransposed data from a Y pencil and scatter it
// into this Z pencil's local grid.  msg->sourceNode is the sender's block
// index along z (the Y pencil stored its thisIndex.z there) and msg->ny is
// the z extent of the block.  Columns are dim3 floats apart; each z entry
// is an interleaved (re,im) pair.
void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
#if ! USE_NODE_PAR_RECEIVE
    // First message of the round: reset the energy/virial accumulator.
    // (With node-par receive this reset happens in node_process_untrans.)
    if(imsg==0) evir=0.;
#endif

  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  int kb = msg->sourceNode;  // sender's z-block index
  int nz = msg->ny;          // z extent of this message's block
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();  // keep communication advancing on Blue Gene/L
#endif   
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
#ifdef ZEROCHECK
        // Debug aid: flag unexpected exact zeros in the incoming stream.
        if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
                                    thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
#endif
        d[2*k] = *(md++);    // real part
        d[2*k+1] = *(md++);  // imaginary part
      }
    }
  }
}
06272 
// Backward FFT for this Z pencil: batched complex-to-real transform of the
// nx*ny columns (each dim3 floats apart).  With FFTW3 the pre-built
// plan(s) are executed, optionally split across PEs via CkLoop; the FFTW2
// path uses rfftwnd with 'work' as scratch.  Under FFTCHECK the result is
// compared against what appears to be a synthetic index-encoded pattern
// injected by the forward path (TODO confirm against the grid fill code).
void PmeZPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
          //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
          //transform the above loop
          // Execute the numPlans pre-split plans across the node's PEs
          // (sync: blocks until all chunks complete).
          CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
          return;
  }
#endif
  // Serial path: a single plan covers the whole pencil.
  fftwf_execute(backward_plan);
#else
  // FFTW2 path: batched complex-to-real transform of nx*ny columns.
  rfftwnd_complex_to_real(backward_plan, nx*ny,
            (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
  
#if CMK_BLUEGENEL
  CmiNetworkProgress();
#endif

#ifdef FFTCHECK
  // Compare the (rescaled) result against the expected index-encoded
  // pattern and report the worst deviation for this pencil.
  int dim3 = initdata.grid.dim3;
  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  float scale = 1. / (1. * K1 * K2 * K3);  // undo FFT normalization
  float maxerr = 0.;
  float maxstd = 0.;
  int mi, mj, mk;  mi = mj = mk = -1;
  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
  const float *d = data;
  for ( int i=0; i<nx; ++i ) {
   for ( int j=0; j<ny; ++j, d += dim3 ) {
    for ( int k=0; k<K3; ++k ) {
      float std = 10. * (10. * (10. * std_base + i) + j) + k;
      float err = scale * d[k] - std;
      if ( fabsf(err) > fabsf(maxerr) ) {
        maxerr = err;
        maxstd = std;
        mi = i;  mj = j;  mk = k;
      }
    }
   }
  }
  CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
                thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
#endif

}
06333 
06334 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
06335         //to take advantage of the interface which allows 3 user params at most.
06336         //under such situtation, no new parameter list needs to be created!! -Chao Mei
06337         int specialIdx = paraNum;
06338         PmeZPencil *zpencil = (PmeZPencil *)param;
06339         zpencil->send_subset_ungrid(first, last, specialIdx);
06340 }
06341 
// Return all buffered grid messages to their contributors.  First finds
// evirIdx, the index of the first message that carried data (historically
// the one elected to carry the energy/virial — see the retained original
// code below); then either fans the sends out across the node's PEs with
// CkLoop or falls back to a single serial call to send_subset_ungrid().
void PmeZPencil::send_all_ungrid() {
/* 
//Original code: the transformation is to first extract the msg 
//idx that will has evir value set. -Chao Mei  
        int send_evir = 1;
        for (int imsg=0; imsg < grid_msgs.size(); ++imsg ) {
                PmeGridMsg *msg = grid_msgs[imsg];
                if ( msg->hasData ) {
                        if ( send_evir ) {
                                msg->evir[0] = evir;
                                send_evir = 0;
                        } else {
                                msg->evir[0] = 0.;
                        }
                }
                send_ungrid(msg);
        }
*/
        // Index of the first message that actually carried data.
        int evirIdx = 0;
        for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
                if(grid_msgs[imsg]->hasData) {
                        evirIdx = imsg;
                        break;
                }
        }

#if     CMK_SMP && USE_CKLOOP
        int useCkLoop = Node::Object()->simParameters->useCkLoop;
        if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
           && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
                //????What's the best value for numChunks?????
#if USE_NODE_PAR_RECEIVE        
                //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1); //has to sync
                CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
#else
        //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 0); //not sync
                CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0); //not sync
#endif        
                return;
        }
#endif
        // Serial fallback: send the whole range from this PE.
        send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
}
06385 
06386 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
06387         for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
06388                 PmeGridMsg *msg = grid_msgs[imsg];
06389                 send_ungrid(msg);
06390         }
06391 }
06392 
// Return processed grid data to the compute that deposited it.
// Messages that carried no charge data are freed and answered with a
// lightweight PmeAckMsg.  Otherwise the pencil's values selected by the
// message's flag grid (fgrid marks which columns matter) and z-index list
// (zlist) are gathered into the message's qgrid, and the message is sent
// back to the source PE — via the node-level proxy, and at a distinct
// priority, when CUDA offload is active.
void PmeZPencil::send_ungrid(PmeGridMsg *msg) {

#ifdef NAMD_CUDA
  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
#else
  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
#endif

  int pe = msg->sourceNode;
  if ( ! msg->hasData ) {
    // Nothing to return: free the grid message and just acknowledge so
    // the source can make progress.
    delete msg;
    PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
    SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    initdata.pmeProxy[pe].recvAck(ackmsg);
    CmiEnableUrgentSend(0);
    return;
  }
  // Re-label the message with this pencil's flattened (x,y) index so the
  // receiver can tell which pencil it came from.
  msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
  int dim3 = initdata.grid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  char *fmsg = msg->fgrid;
  float *qmsg = msg->qgrid;
  float *d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif    
    for ( int i=0; i<nx; ++i ) {
      for ( int j=0; j<ny; ++j, d += dim3 ) {
        if( *(fmsg++) ) {  // only columns the contributor flagged
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = d[zlist[k]];
          }
        }
      }
    }
  }
  SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
#ifdef NAMD_CUDA
    if ( offload ) {
      initdata.pmeNodeProxy[CkNodeOf(pe)].recvUngrid(msg);
    } else
#endif
  initdata.pmeProxy[pe].recvUngrid(msg);
    CmiEnableUrgentSend(0);
}
06443 
// Node-level entry point for an incoming PmeGridMsg.  Under the shared
// fftw_plan_lock (which here also serializes access to this pencil's
// per-round state): accumulates the message via recv_grid(), records
// whether any message carried data, and counts arrivals with an atomic
// fetch-and-increment.  When the last expected message arrives, runs the
// forward FFT (only if some message had data) and sends the transposed
// data onward, then resets the arrival counter for the next round.
void PmeZPencil::node_process_grid(PmeGridMsg *msg)
{
#if USE_NODE_PAR_RECEIVE
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  CmiMemoryReadFence();  // see other PEs' prior writes to pencil state
#endif
  recv_grid(msg);
  if(msg->hasData) hasData=msg->hasData;  // sticky: any data this round?
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);  // limsg = arrival ordinal
  grid_msgs[limsg] = msg;  // keep the message for the later ungrid reply
  //  CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);      
  if(limsg+1 == grid_msgs.size())
    {

      if (hasData)
        {
          forward_fft();
        }
      send_trans();
      imsg=0;  // reset arrival count for the next round
      CmiMemoryWriteFence();
      //      CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
    }
#if USE_NODE_PAR_RECEIVE
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
  CmiMemoryWriteFence();  // publish state changes to other PEs
#endif
}
06473 
// Node-level entry point for an incoming PmeUntransMsg.  Scatters the
// message into the local grid (recv_untrans), then counts arrivals with
// an atomic fetch-and-increment under the shared plan lock.  When all
// zBlocks contributions are in: runs the backward FFT (if this pencil saw
// data this round), returns the results to contributors via
// send_all_ungrid(), and resets per-round state (imsgb, evir, grid data)
// for the next step.
void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
{
  recv_untrans(msg);
#if USE_NODE_PAR_RECEIVE
  CmiMemoryWriteFence();  // publish the scattered data before taking the lock
  CmiLock(ComputePmeMgr::fftw_plan_lock);
#endif    
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);  // limsg = arrival ordinal
  if(limsg+1 == initdata.zBlocks)
    {
#if USE_NODE_PAR_RECEIVE
      CmiMemoryReadFence();  // see all contributions before transforming
#endif    
      if(hasData) // maybe this should be an assert
        {
          backward_fft();
        }
        
        send_all_ungrid();
    /*  int send_evir = 1;
      // TODO: this part should use Chao's output parallelization
      for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
        PmeGridMsg *omsg = grid_msgs[limsg];
        if ( omsg->hasData ) {
          if ( send_evir ) {
            omsg->evir[0] = evir;
            send_evir = 0;
          } else {
            omsg->evir[0] = 0.;
          }
        }
        send_ungrid(omsg);
      } */
      // Reset per-round state so the pencil is clean for the next step.
      imsgb=0;
      evir = 0;
      memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3); 
      CmiMemoryWriteFence();
      //      CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
    }
#if USE_NODE_PAR_RECEIVE
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#endif
}
06518 
06519 void ComputePmeUtil::select(void)
06520 {
06521   if ( CkMyRank() ) return;
06522   
06523   SimParameters *simParams = Node::Object()->simParameters;
06524 
06525   alchOn = simParams->alchOn;
06526   alchFepOn = simParams->alchFepOn;
06527   alchThermIntOn = simParams->alchThermIntOn;
06528   alchDecouple = alchOn && simParams->alchDecouple;
06529   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0; 
06530   lesOn = simParams->lesOn;
06531   lesFactor = simParams->lesFactor;
06532   pairOn = simParams->pairInteractionOn;
06533   selfOn = simParams->pairInteractionSelf;
06534 
06535   if ( alchOn ) {
06536     numGrids = 2;
06537     if (alchDecouple) numGrids += 2;
06538     if (alchElecLambdaStart || alchThermIntOn) numGrids++;
06539   } else if ( lesOn ) {
06540     numGrids = lesFactor;
06541   } else if ( pairOn ) {
06542     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
06543     numGrids = (selfOn ? 1 : 3);
06544   } else {
06545     numGrids = 1;
06546   }
06547 
06548 }
06549 
06550 #include "ComputePmeMgr.def.h"
06551 

Generated on Sun Apr 22 01:17:13 2018 for NAMD by  doxygen 1.4.7