ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 //#define MANUAL_DEBUG_FFTW3 1
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012 // fftw2 doesn't have these defined
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024 
00025 #include <vector>
00026 #include <algorithm>
00027 #include <deque>
00028 using namespace std;
00029 
00030 #include "InfoStream.h"
00031 #include "Node.h"
00032 #include "PatchMap.h"
00033 #include "PatchMap.inl"
00034 #include "AtomMap.h"
00035 #include "ComputePme.h"
00036 #include "ComputePmeMgr.decl.h"
00037 #include "PmeBase.inl"
00038 #include "PmeRealSpace.h"
00039 #include "PmeKSpace.h"
00040 #include "ComputeNonbondedUtil.h"
00041 #include "PatchMgr.h"
00042 #include "Molecule.h"
00043 #include "ReductionMgr.h"
00044 #include "ComputeMgr.h"
00045 #include "ComputeMgr.decl.h"
00046 // #define DEBUGM
00047 #define MIN_DEBUG_LEVEL 3
00048 #include "Debug.h"
00049 #include "SimParameters.h"
00050 #include "WorkDistrib.h"
00051 #include "varsizemsg.h"
00052 #include "Random.h"
00053 #include "ckhashtable.h"
00054 #include "Priorities.h"
00055 
00056 #include "ComputeMoa.h"
00057 #include "ComputeMoaMgr.decl.h" 
00058 
00059 //#define     USE_RANDOM_TOPO         1
00060 
00061 //#define USE_TOPO_SFC                    1
00062 //#define     USE_CKLOOP                1
00063 //#include "TopoManager.h"
00064 
00065 #include "DeviceCUDA.h"
00066 #ifdef NAMD_CUDA
00067 #include <cuda_runtime.h>
00068 #include <cuda.h>
00069 void cuda_errcheck(const char *msg);
00070 #ifdef WIN32
00071 #define __thread __declspec(thread)
00072 #endif
00073 extern __thread DeviceCUDA *deviceCUDA;
00074 #endif
00075 
00076 #include "ComputePmeCUDAKernel.h"
00077 
00078 #ifndef SQRT_PI
00079 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00080 #endif
00081 
00082 #if CMK_PERSISTENT_COMM 
00083 #define USE_PERSISTENT      1
00084 #endif
00085 
00086 #if USE_PERSISTENT
00087 #define Z_PERSIST 1
00088 #define Y_PERSIST 1
00089 #define X_PERSIST 1
00090 #endif
00091 
00092 #if defined(NAMD_CUDA) && defined(MEM_OPT_VERSION)
00093 #define USE_NODE_PAR_RECEIVE    1
00094 #endif
00095 
00096 char *pencilPMEProcessors;
00097 
// Empty acknowledgement message used to signal completion of a PME phase.
class PmeAckMsg : public CMessage_PmeAckMsg {
};
00100 
// Carries a PE's portion of the charge grid to the grid PEs (or Z
// pencils), and later carries transformed grid data back for force
// interpolation.
class PmeGridMsg : public CMessage_PmeGridMsg {
public:

  int sourceNode;   // sending PE
  int sequence;     // timestep sequence number, used for message priorities
  int hasData;      // nonzero when qgrid actually carries grid values
  Lattice lattice;  // current cell basis
  int start;        // offset of the slab carried by this message
  int len;          // extent of the slab carried by this message
  int zlistlen;     // number of entries in zlist
  int *zlist;       // z indices with data (packing detail; see senders/receivers)
  char *fgrid;      // per-column occupancy flags
  float *qgrid;     // packed charge (or potential) grid values
  CkArrayIndex3D destElem;  // destination pencil index (pencil mode only)
};
00116 
// Transposed-grid message exchanged between grid and trans PEs (or
// between pencil layers) during the forward FFT.
class PmeTransMsg : public CMessage_PmeTransMsg {
public:

  int sourceNode;   // sending PE
  int sequence;     // timestep sequence number, used for message priorities
  int hasData;      // nonzero when qgrid actually carries grid values
  Lattice lattice;  // current cell basis
  int x_start;      // first x plane carried
  int nx;           // number of x planes carried
  float *qgrid;     // packed grid values
  CkArrayIndex3D destElem;  // destination pencil index (pencil mode only)
};
00129 
// Node-level wrapper around a PmeTransMsg shared by several ranks of
// an SMP node; count and lock manage the shared message's lifetime.
class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
public:
  PmeTransMsg *msg;  // the shared underlying message
  int *count;        // remaining consumers; last one frees msg
  CmiNodeLock lock;  // guards count
};
00136 
// Reverse-transpose message exchanged during the backward FFT.
class PmeUntransMsg : public CMessage_PmeUntransMsg {
public:

  int sourceNode;  // sending PE
  int y_start;     // first y plane carried
  int ny;          // number of y planes carried
  float *qgrid;    // packed grid values
  CkArrayIndex3D destElem;  // destination pencil index (pencil mode only)
};
00146 
// Node-level wrapper around a PmeUntransMsg shared by several ranks of
// an SMP node; count and lock manage the shared message's lifetime.
class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
public:
  PmeUntransMsg *msg;  // the shared underlying message
  int *count;          // remaining consumers; last one frees msg
  CmiNodeLock lock;    // guards count
};
00153 
// Carries the reciprocal-space energy/virial contributions (one
// PmeReduction per grid) back to the PE that submits reductions.
class PmeEvirMsg : public CMessage_PmeEvirMsg {
public:
  PmeReduction *evir;
};
00158 
00159 class PmePencilMap : public CBase_PmePencilMap {
00160 public:
00161   PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00162     : ia(i_a), ib(i_b), nb(n_b),
00163       size(n), data(newcopyint(n,d)) {
00164   }
00165   virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00166     //Return an ``arrayHdl'', given some information about the array
00167     return 0;
00168   }
00169   virtual int procNum(int, const CkArrayIndex &i) {
00170     //Return the home processor number for this element of this array
00171     return data[ i.data()[ia] * nb + i.data()[ib] ];
00172   }
00173   virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00174     int mype = CkMyPe();
00175     for ( int i=0; i < size; ++i ) {
00176       if ( data[i] == mype ) {
00177         CkArrayIndex3D ai(0,0,0);
00178         ai.data()[ia] = i / nb;
00179         ai.data()[ib] = i % nb;
00180         if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00181         if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00182         mgr->insertInitial(ai,msg);
00183         msg = 0;
00184       }
00185     }
00186     mgr->doneInserting();
00187     if ( msg ) CkFreeMsg(msg);
00188   }
00189 private:
00190   const int ia, ib, nb, size;
00191   const int* const data;
00192   static int* newcopyint(int n, int *d) {
00193     int *newd = new int[n];
00194     memcpy(newd, d, n*sizeof(int));
00195     return newd;
00196   }
00197 };
00198 
// use this idiom since messages don't have copy constructors
// Bundle of everything a freshly created pencil element needs: the PME
// grid description, the pencil decomposition, and proxies back to the
// managers and to the other pencil arrays / their placement maps.
struct PmePencilInitMsgData {
  PmeGrid grid;
  int xBlocks, yBlocks, zBlocks;  // pencil counts in each dimension
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_NodePmeMgr pmeNodeProxy;
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
};
00212 
// Initialization message that simply wraps a PmePencilInitMsgData
// (copied in via assignment, per the idiom noted above the struct).
class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
public:
   PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
   PmePencilInitMsgData data;
};
00218 
00219 
// Per-PE slab extents: nx/x_start describe this PE's x slab of the
// charge grid, ny_after_transpose/y_start_after_transpose its y slab
// after the transpose.
struct LocalPmeInfo {
  int nx, x_start;
  int ny_after_transpose, y_start_after_transpose;
};
00224 
// Describes one node's contiguous run of PEs within the grid or trans
// PE ordering: PE count, starting index, and the real node id.
struct NodePmeInfo {
  int npe, pe_start, real_node;
};
00228 
00229 
00230 static int findRecipEvirPe() {
00231     PatchMap *patchMap = PatchMap::Object();
00232     {
00233       int mype = CkMyPe();
00234       if ( patchMap->numPatchesOnNode(mype) ) {
00235         return mype; 
00236       }
00237     }
00238     {
00239       int node = CmiMyNode();
00240       int firstpe = CmiNodeFirst(node);
00241       int nodeSize = CmiNodeSize(node);
00242       int myrank = CkMyRank();
00243       for ( int i=0; i<nodeSize; ++i ) {
00244         int pe = firstpe + (myrank+i)%nodeSize;
00245         if ( patchMap->numPatchesOnNode(pe) ) {
00246           return pe;
00247         }
00248       }
00249     }
00250     {
00251       int *pelist;
00252       int nodeSize;
00253       CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
00254       int myrank;
00255       for ( int i=0; i<nodeSize; ++i ) {
00256         if ( pelist[i] == CkMyPe() ) myrank = i;
00257       }
00258       for ( int i=0; i<nodeSize; ++i ) {
00259         int pe = pelist[(myrank+i)%nodeSize];
00260         if ( patchMap->numPatchesOnNode(pe) ) {
00261           return pe;
00262         }
00263       }
00264     }
00265     {
00266       int mype = CkMyPe();
00267       int npes = CkNumPes();
00268       for ( int i=0; i<npes; ++i ) {
00269         int pe = (mype+i)%npes;
00270         if ( patchMap->numPatchesOnNode(pe) ) {
00271           return pe;
00272         }
00273       }
00274     }
00275     NAMD_bug("findRecipEvirPe() failed!");
00276     return -999;  // should never happen
00277 }
00278 
00279 
00280 //Assigns gridPeMap and transPeMap to different set of processors.
00281 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00282   int ncpus = CkNumPes();
00283   
00284   for ( int i=0; i<numGridPes; ++i ) {
00285     gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
00286   }
00287   std::sort(gridPeMap,gridPeMap+numGridPes);
00288   int firstTransPe = ncpus - numGridPes - numTransPes;
00289   if ( firstTransPe < 0 ) {
00290     firstTransPe = 0;
00291     // 0 should be first in list, skip if possible
00292     if ( ncpus > numTransPes ) firstTransPe = 1;
00293   }
00294   for ( int i=0; i<numTransPes; ++i ) {
00295     transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
00296   }
00297   std::sort(transPeMap,transPeMap+numTransPes);
00298 }
00299 
00300 #if USE_TOPOMAP 
00301 //Topology aware PME allocation
00302 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00303                              int nbpes=0);
00304 #endif
00305 
00306 
// strcmp-style comparison of a and b in bit-reversed order: the sign
// of the result is decided by which number has the lowest-order
// differing bit set; equal inputs compare as zero.
int compare_bit_reversed(int a, int b) {
  const int diff = a ^ b;
  if ( ! diff ) return 0;
  int mask = 1;
  while ( ! (diff & mask) ) mask <<= 1;
  return (a & mask) - (b & mask);
}
00315 
// True when a strictly precedes b in bit-reversed order, i.e. when the
// inputs differ and b has the lowest-order differing bit set.
inline bool less_than_bit_reversed(int a, int b) {
  const int diff = a ^ b;
  if ( ! diff ) return false;
  int mask = 1;
  while ( ! (diff & mask) ) mask <<= 1;
  return (b & mask) != 0;
}
00324 
// Comparison functor wrapping less_than_bit_reversed() for std::sort.
struct sortop_bit_reversed {
  inline bool operator() (int a, int b) const {
    return less_than_bit_reversed(a,b);
  }
};
00330 
// Plain (i,j) index pair, used to identify an active pencil.
struct ijpair {
  int i, j;
  // Default construction leaves the members uninitialized, POD-style.
  ijpair() {}
  ijpair(int ii, int jj) : i(ii), j(jj) {}
};
00336 
// Lexicographic bit-reversed ordering on (i,j) pairs: i is compared in
// bit-reversed order first, then j on ties.
struct ijpair_sortop_bit_reversed {
  inline bool operator() (const ijpair &a, const ijpair &b) const {
    return ( less_than_bit_reversed(a.i,b.i)
             || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
  }
};
00343 
// Per-PE Charm++ group orchestrating the PME reciprocal-space
// calculation: charge-grid accumulation from local ComputePme objects,
// the forward/backward FFT pipeline (slab or pencil decomposition),
// force interpolation, energy/virial reduction, and (in CUDA builds)
// offload of charge spreading and force interpolation to the GPU.
class ComputePmeMgr : public CBase_ComputePmeMgr {
public:
  friend class ComputePme;
  friend class NodePmeMgr;
  ComputePmeMgr();
  ~ComputePmeMgr();

  // Startup phases, driven by quiescence-detection messages.
  void initialize(CkQdMsg*);
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();

  // Slab-mode send of the local charge grid to the grid PEs.
  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  // Scratch state used to pass sendData/sendPencils arguments through
  // argument-less helper entry methods.
  Lattice *sendDataHelper_lattice;
  int sendDataHelper_sequence;
  int sendDataHelper_sourcepe;
  int sendDataHelper_errors;
  // Pencil-mode send of the local charge grid to the Z pencils.
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  // Forward pipeline: receive grid, FFT, transpose to trans PEs.
  void recvGrid(PmeGridMsg *);
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  void fwdSharedTrans(PmeTransMsg *);
  void recvSharedTrans(PmeSharedTransMsg *);
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void procTrans(PmeTransMsg *);
  void gridCalc2(void);
  #ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
  #endif // OPENATOM_VERSION
  void gridCalc2R(void);
  // Backward pipeline: untranspose, inverse FFT, return grid to sources.
  void fwdSharedUntrans(PmeUntransMsg *);
  void recvSharedUntrans(PmeSharedUntransMsg *);
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  void recvUntrans(PmeUntransMsg *);
  void procUntrans(PmeUntransMsg *);
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  void recvUngrid(PmeGridMsg *);
  void recvAck(PmeAckMsg *);
  void copyResults(PmeGridMsg *);
  void copyPencils(PmeGridMsg *);
  void ungridCalc(void);
  // Energy/virial reduction handling.
  void recvRecipEvir(PmeEvirMsg *);
  void addRecipEvirClient(void);
  void submitReductions();

#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#endif

  // Node lock serializing FFTW plan creation (FFTW planning is not
  // thread-safe); also reused elsewhere for node-level serialization.
  static CmiNodeLock fftw_plan_lock;
  CmiNodeLock pmemgr_lock;  // for accessing this object from other threads

#ifdef NAMD_CUDA
  // Host/device buffers: a_* hold atom data, f_* hold forces.
  float *a_data_host;
  float *a_data_dev;
  float *f_data_host;
  float *f_data_dev;
  int cuda_atoms_count;
  int cuda_atoms_alloc;
  static CmiNodeLock cuda_lock;
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  cudaEvent_t end_charges;
  cudaEvent_t *end_forces;
  int forces_count;
  int forces_done_count;
  double charges_time;
  double forces_time;
  int check_charges_count;
  int check_forces_count;
  int master_pe;
  int this_pe;

  void cuda_submit_charges(Lattice &lattice, int sequence);
  // Queued arguments for a charge submission deferred while the device
  // is busy (see cuda_busy / cuda_submit_charges_deque).
  struct cuda_submit_charges_args {
    ComputePmeMgr *mgr; Lattice *lattice; int sequence;
  };
  static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
  static bool cuda_busy;

  int chargeGridSubmittedCount;
  void sendChargeGridReady();
#endif
  Lattice *saved_lattice;  // saved by chargeGridSubmitted
  int saved_sequence;      // saved by chargeGridSubmitted
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);

  // ComputePme objects on this PE feeding this manager.
  ResizeArray<ComputePme*> pmeComputes;

private:

#if 0 && USE_PERSISTENT
  PersistentHandle   *recvGrid_handle;
#endif

  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
  NodePmeMgr *nodePmeMgr;
  ComputePmeMgr *masterPmeMgr;
  
  // Registers a compute; the first registration triggers full setup.
  void addCompute(ComputePme *c) {
    if ( ! pmeComputes.size() ) initialize_computes();
    pmeComputes.add(c);
    c->setMgr(this);
  }

  ResizeArray<ComputePme*> heldComputes;
  PmeGrid myGrid;
  Lattice lattice;
  PmeKSpace *myKSpace;
  float *qgrid;  // real-space charge grid storage
  float *kgrid;  // transformed (k-space) grid storage

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  fftwf_complex *work;
#else
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
  fftw_complex *work;
#endif
#else
  float *work;
#endif

  int qsize, fsize, bsize;
  // Multi-grid bookkeeping for alchemical / LES / pair-interaction modes.
  int alchOn, alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
  int alchDecouple;
  int offload;  // nonzero when PMEOffload (GPU path) is enabled
  BigReal alchElecLambdaStart;
  BigReal alchLambda;  // set on each step in ComputePme::ungridForces()

  float **q_arr;
  // q_list and q_count not used for offload
  float **q_list;
  int q_count;
  char *f_arr;
  char *fz_arr;
  PmeReduction evir[PME_MAX_EVALS];
  SubmitReduction *reduction;

  int noWorkCount;
  int doWorkCount;
  int ungridForcesCount;

#ifdef NAMD_CUDA
#define NUM_STREAMS 1
  cudaStream_t streams[NUM_STREAMS];
  int stream;

  // Device-side grid and flag buffers for the CUDA charge/force kernels.
  float **q_arr_dev;
  float **v_arr_dev;
  float *q_data_host;
  float *q_data_dev;
  float *v_data_dev;
  int *ffz_host;
  int *ffz_dev;
  int q_data_size;
  int ffz_size;

  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  float **afn_host;
  float **afn_dev;

  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
#endif
  int recipEvirCount;   // used in compute only
  int recipEvirClients; // used in compute only
  int recipEvirPe;      // used in trans only
  
  // Slab-decomposition layout tables (per PE / per node).
  LocalPmeInfo *localInfo;
  NodePmeInfo *gridNodeInfo;
  NodePmeInfo *transNodeInfo;
  int qgrid_size;
  int qgrid_start;
  int qgrid_len;
  int fgrid_start;
  int fgrid_len;

  int numSources;
  int numGridPes;
  int numTransPes;
  int numGridNodes;
  int numTransNodes;
  int numDestRecipPes;
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  int *gridPeMap;
  int *transPeMap;
  int *recipPeDest;
  int *gridPeOrder;
  int *gridNodeOrder;
  int *transNodeOrder;
  // Countdown counters for each phase of the FFT pipeline.
  int grid_count;
  int trans_count;
  int untrans_count;
  int ungrid_count;
  PmeGridMsg **gridmsg_reuse;  // received grid messages recycled for replies
  PmeReduction recip_evir2[PME_MAX_EVALS];

  int compute_sequence;  // set from patch computes, used for priorities
  int grid_sequence;  // set from grid messages, used for priorities
  int useBarrier;
  int sendTransBarrier_received;

  // Pencil-decomposition state.
  int usePencils;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  char *pencilActive;       // flags for pencils this PE contributes to
  ijpair *activePencils;    // indices of active pencils
  int numPencilsActive;
  int strayChargeErrors;
};
00577 
// Free-function accessor giving code outside this file access to the
// list of ComputePme objects registered with a manager.
ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
    return mgr->pmeComputes ;
}
00581 
// Out-of-line definitions for ComputePmeMgr's node-shared statics.
  CmiNodeLock ComputePmeMgr::fftw_plan_lock;
#ifdef NAMD_CUDA
  CmiNodeLock ComputePmeMgr::cuda_lock;
  std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
  bool ComputePmeMgr::cuda_busy;
#endif
00588 
00589 int isPmeProcessor(int p){ 
00590   SimParameters *simParams = Node::Object()->simParameters;
00591   if (simParams->usePMECUDA) {
00592     return 0;
00593   } else {
00594     return pencilPMEProcessors[p];
00595   }
00596 }
00597 
// Node-level (one per SMP node) companion group to ComputePmeMgr.
// Forwards node-addressed messages to the appropriate per-rank
// manager or pencil object and keeps node-wide lookup tables for the
// pencil elements living on this node.
class NodePmeMgr : public CBase_NodePmeMgr {
public:
  friend class ComputePmeMgr;
  friend class ComputePme;
  NodePmeMgr();
  ~NodePmeMgr();
  void initialize();
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void recvUntrans(PmeUntransMsg *);
  // Pencil elements register themselves here when constructed locally.
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
  // Node-addressed receive paths for the pencil FFT pipeline.
  void recvXTrans(PmeTransMsg *);
  void recvYTrans(PmeTransMsg *);
  void recvYUntrans(PmeUntransMsg *);
  void recvZGrid(PmeGridMsg *);
  void recvZUntrans(PmeUntransMsg *);

  void recvUngrid(PmeGridMsg *);

  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
      xm=_xm; ym=_ym; zm=_zm;
  }
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;

private:
  CProxy_ComputePmeMgr mgrProxy;
  ComputePmeMgr *mgrObject;    // manager of rank 0 on this node
  ComputePmeMgr **mgrObjects;  // per-rank managers, indexed by CkMyRank()
#ifdef NAMD_CUDA
  ComputePmeMgr *masterPmeMgr;
  int master_pe;
#endif
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  // Node-local maps from pencil array index to object pointer.
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;  

#ifdef NAMD_CUDA
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
#endif
};
00648 
// Allocates one manager-pointer slot per rank on this node; the slots
// are filled in by initialize() as each rank's manager constructs.
NodePmeMgr::NodePmeMgr() {
  mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
}
00652 
// Releases the per-rank manager table (the managers themselves are
// Charm++ group branches and are not owned here).
NodePmeMgr::~NodePmeMgr() {
  delete [] mgrObjects;
}
00656 
// Called from each rank's ComputePmeMgr constructor: records that
// rank's local manager branch, and on rank 0 also caches the group
// proxy and branch pointer used for node-level forwarding.
void NodePmeMgr::initialize() {
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    mgrProxy = proxy;
    mgrObject = proxy.ckLocalBranch();
  }
}
00665 
// Node-addressed trans message: hand to the rank-0 manager, which
// forwards it to the ranks on this node that share it.
void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
  mgrObject->fwdSharedTrans(msg);
}
00669 
// Node-addressed untrans message: hand to the rank-0 manager, which
// forwards it to the ranks on this node that share it.
void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  mgrObject->fwdSharedUntrans(msg);
}
00673 
// Node-addressed ungrid message; only meaningful in CUDA builds,
// where the node's master manager processes all returned grids.
void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
#ifdef NAMD_CUDA
  masterPmeMgr->recvUngrid(msg);
#else
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
#endif
}
00681 
// Records an X-pencil object in the node-local lookup table.
// fftw_plan_lock is reused here to serialize hashtable updates from
// multiple ranks constructing pencils concurrently.
void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  xPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
// Records a Y-pencil object in the node-local lookup table, under the
// same node lock used for the other pencil tables.
void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  yPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
// Records a Z-pencil object in the node-local lookup table, under the
// same node lock used for the other pencil tables.
void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  zPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
00700 
// Constructor: wires up group proxies, creates node/PE locks, zeroes
// pipeline counters, and in CUDA builds creates streams/events and
// registers trace user-events.
ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
                                 pmeProxyDir(thisgroup) {

  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  pmeNodeProxy.ckLocalBranch()->initialize();

  // Node-wide locks are created once, by rank 0 only.
  if ( CmiMyRank() == 0 ) {
    fftw_plan_lock = CmiCreateLock();
  }
  pmemgr_lock = CmiCreateLock();

  myKSpace = 0;
  kgrid = 0;
  work = 0;
  grid_count = 0;
  trans_count = 0;
  untrans_count = 0;
  ungrid_count = 0;
  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
  useBarrier = 0;
  sendTransBarrier_received = 0;
  usePencils = 0;

#ifdef NAMD_CUDA
 // offload has not been set so this happens on every run
  if ( CmiMyRank() == 0 ) {
    cuda_lock = CmiCreateLock();
  }

#if CUDA_VERSION >= 5050
  // Create PME streams at the highest available priority so PME work
  // can overlap lower-priority device work.
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
  //if ( CkMyNode() == 0 ) {
  //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
  //}
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#else
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
#endif

  stream = 0;
  for ( int i=0; i<NUM_STREAMS; ++i ) {
#if 1
    CUDA_STREAM_CREATE(&streams[i]);
    cuda_errcheck("cudaStreamCreate");
#else
  streams[i] = 0;  // XXXX Testing!!!
#endif
  }

  this_pe = CkMyPe();
 
  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  end_forces = 0;
  check_charges_count = 0;
  check_forces_count = 0;
  chargeGridSubmittedCount = 0;

  cuda_atoms_count = 0;
  cuda_atoms_alloc = 0;

  f_data_mgr_alloc = 0;
  f_data_mgr_host = 0;
  f_data_mgr_dev = 0;
  afn_host = 0;
  afn_dev = 0;

// User-event IDs for projections/tracing of the CUDA PME phases.
#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
    traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
    traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
    traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
    traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
    traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
  }
#endif
  recipEvirCount = 0;
  recipEvirClients = 0;
  recipEvirPe = -999;  // illegal value until assigned in initialize_computes
}
00789 
00790 
// Stores the pencil array proxies on this PE; the rank-0 copy is also
// mirrored into the node-level manager so any rank on the node can
// reach the pencils.
void ComputePmeMgr::recvArrays(
        CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
  xPencil = x;  yPencil = y;  zPencil = z;
  
    if(CmiMyRank()==0)
    {
      pmeNodeProxy.ckLocalBranch()->xPencil=x;
      pmeNodeProxy.ckLocalBranch()->yPencil=y;
      pmeNodeProxy.ckLocalBranch()->zPencil=z;
    }
}
00802 
00803 #if USE_TOPO_SFC
00804  struct Coord
00805   {
00806     int x, y, z;
00807     Coord(): x(0), y(0), z(0) {}
00808     Coord(int a, int b, int c): x(a), y(b), z(c) {}
00809   };
00810   extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00811 
00812   void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00813   {
00814      SortableResizeArray<int> newprocs(procs.size());
00815      int num = 0;
00816      for (int i=0; i<result.size(); i++) {
00817        Coord &c = result[i];
00818        for (int j=0; j<procs.size(); j++) {
00819          int pe = procs[j];
00820          int x,y,z,t;
00821          tmgr.rankToCoordinates(pe, x, y, z, t);    
00822          if (x==c.x && y==c.y && z==c.z)
00823            newprocs[num++] = pe;
00824        }
00825      } 
00826      CmiAssert(newprocs.size() == procs.size());
00827      procs = newprocs;
00828   }
00829 
00830   int find_level_grid(int x) 
00831   {
00832      int a = sqrt(x);
00833      int b;
00834      for (; a>0; a--) {
00835        if (x%a == 0) break;
00836      }
00837      if (a==1) a = x;
00838      b = x/a;
00839      //return a>b?a:b;
00840      return b;
00841   }
00842   CmiNodeLock tmgr_lock;
00843 #endif
00844 
// Module initializer called at startup; when SFC topology sorting is
// enabled it creates (once per node) the lock serializing TopoManager
// use.
void Pme_init()
{
#if USE_TOPO_SFC
  if (CkMyRank() == 0) 
    tmgr_lock = CmiCreateLock();
#endif
}
00852 
00853 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00854   delete msg;
00855 
00856   localInfo = new LocalPmeInfo[CkNumPes()];
00857   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00858   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00859   gridPeMap = new int[CkNumPes()];
00860   transPeMap = new int[CkNumPes()];
00861   recipPeDest = new int[CkNumPes()];
00862   gridPeOrder = new int[CkNumPes()];
00863   gridNodeOrder = new int[CkNumNodes()];
00864   transNodeOrder = new int[CkNumNodes()];
00865 
00866   if (CkMyRank() == 0) {
00867     pencilPMEProcessors = new char [CkNumPes()];
00868     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00869   }
00870 
00871   SimParameters *simParams = Node::Object()->simParameters;
00872   PatchMap *patchMap = PatchMap::Object();
00873 
00874   offload = simParams->PMEOffload;
00875 #ifdef NAMD_CUDA
00876   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00877     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00878   }
00879   if ( offload ) {
00880     int dev;
00881     cudaGetDevice(&dev);
00882     cuda_errcheck("in cudaGetDevice");
00883     cudaDeviceProp deviceProp;
00884     cudaGetDeviceProperties(&deviceProp, dev);
00885     cuda_errcheck("in cudaGetDeviceProperties");
00886     if ( deviceProp.major < 2 )
00887       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00888   }
00889 #endif
00890 
00891   alchLambda = -1.;  // illegal value to catch if not updated
00892 
00893   alchOn = simParams->alchOn;
00894   alchFepOn = simParams->alchFepOn;
00895   alchThermIntOn = simParams->alchThermIntOn;
00896   alchDecouple = alchOn && simParams->alchDecouple;
00897   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
00898   if (alchOn) {
00899     numGrids = 2;
00900     if (alchDecouple) numGrids += 2;
00901     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00902   }
00903   else numGrids = 1;
00904   lesOn = simParams->lesOn;
00905   useBarrier = simParams->PMEBarrier;
00906   if ( lesOn ) {
00907     lesFactor = simParams->lesFactor;
00908     numGrids = lesFactor;
00909   }
00910   selfOn = 0;
00911   pairOn = simParams->pairInteractionOn;
00912   if ( pairOn ) {
00913     selfOn = simParams->pairInteractionSelf;
00914     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00915     numGrids = selfOn ? 1 : 3;
00916   }
00917 
00918   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00919   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00920   else {
00921     int nrps = simParams->PMEProcessors;
00922     if ( nrps <= 0 ) nrps = CkNumPes();
00923     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00924     int dimx = simParams->PMEGridSizeX;
00925     int dimy = simParams->PMEGridSizeY;
00926     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00927     if ( maxslabs > nrps ) maxslabs = nrps;
00928     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00929                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00930     if ( maxpencils > nrps ) maxpencils = nrps;
00931     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00932     else usePencils = 0;
00933   }
00934 
00935   if ( usePencils ) {
00936     int nrps = simParams->PMEProcessors;
00937     if ( nrps <= 0 ) nrps = CkNumPes();
00938     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00939     if ( simParams->PMEPencils > 1 &&
00940          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00941       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00942     } else {
00943       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00944                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00945       if ( nb2 > nrps ) nb2 = nrps;
00946       if ( nb2 < 1 ) nb2 = 1;
00947       int nb = (int) sqrt((float)nb2);
00948       if ( nb < 1 ) nb = 1;
00949       xBlocks = zBlocks = nb;
00950       yBlocks = nb2 / nb;
00951     }
00952 
00953     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00954     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00955     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00956 
00957     int dimx = simParams->PMEGridSizeX;
00958     int bx = 1 + ( dimx - 1 ) / xBlocks;
00959     xBlocks = 1 + ( dimx - 1 ) / bx;
00960 
00961     int dimy = simParams->PMEGridSizeY;
00962     int by = 1 + ( dimy - 1 ) / yBlocks;
00963     yBlocks = 1 + ( dimy - 1 ) / by;
00964 
00965     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00966     int bz = 1 + ( dimz - 1 ) / zBlocks;
00967     zBlocks = 1 + ( dimz - 1 ) / bz;
00968 
00969     if ( xBlocks * yBlocks > CkNumPes() ) {
00970       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00971     }
00972     if ( xBlocks * zBlocks > CkNumPes() ) {
00973       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00974     }
00975     if ( yBlocks * zBlocks > CkNumPes() ) {
00976       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00977     }
00978 
00979     if ( ! CkMyPe() ) {
00980       iout << iINFO << "PME using " << xBlocks << " x " <<
00981         yBlocks << " x " << zBlocks <<
00982         " pencil grid for FFT and reciprocal sum.\n" << endi;
00983     }
00984   } else { // usePencils
00985 
00986   {  // decide how many pes to use for reciprocal sum
00987 
00988     // rules based on work available
00989     int minslices = simParams->PMEMinSlices;
00990     int dimx = simParams->PMEGridSizeX;
00991     int nrpx = ( dimx + minslices - 1 ) / minslices;
00992     int dimy = simParams->PMEGridSizeY;
00993     int nrpy = ( dimy + minslices - 1 ) / minslices;
00994 
00995     // rules based on processors available
00996     int nrpp = CkNumPes();
00997     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00998     if ( nrpp < nrpx ) nrpx = nrpp;
00999     if ( nrpp < nrpy ) nrpy = nrpp;
01000 
01001     // user override
01002     int nrps = simParams->PMEProcessors;
01003     if ( nrps > CkNumPes() ) nrps = CkNumPes();
01004     if ( nrps > 0 ) nrpx = nrps;
01005     if ( nrps > 0 ) nrpy = nrps;
01006 
01007     // make sure there aren't any totally empty processors
01008     int bx = ( dimx + nrpx - 1 ) / nrpx;
01009     nrpx = ( dimx + bx - 1 ) / bx;
01010     int by = ( dimy + nrpy - 1 ) / nrpy;
01011     nrpy = ( dimy + by - 1 ) / by;
01012     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01013       NAMD_bug("Error in selecting number of PME processors.");
01014     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01015       NAMD_bug("Error in selecting number of PME processors.");
01016 
01017     numGridPes = nrpx;
01018     numTransPes = nrpy;
01019   }
01020   if ( ! CkMyPe() ) {
01021     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01022       " processors for FFT and reciprocal sum.\n" << endi;
01023   }
01024 
01025   int sum_npes = numTransPes + numGridPes;
01026   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01027 
01028 #if 0 // USE_TOPOMAP
01029   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01030   PatchMap * pmap = PatchMap::Object();
01031   
01032   int patch_pes = pmap->numNodesWithPatches();
01033   TopoManager tmgr;
01034   if(tmgr.hasMultipleProcsPerNode())
01035     patch_pes *= 2;
01036 
01037   bool done = false;
01038   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01039     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01040     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01041   }
01042   else 
01043     if(CkNumPes() > 2 *max_npes + patch_pes) {
01044       done = generateBGLORBPmePeList(transPeMap, max_npes);
01045       gridPeMap = transPeMap;
01046     }
01047 
01048   if (!done)
01049 #endif
01050     {
01051       //generatePmePeList(transPeMap, max_npes);
01052       //gridPeMap = transPeMap;
01053       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01054     }
01055   
01056   if ( ! CkMyPe() ) {
01057     iout << iINFO << "PME GRID LOCATIONS:";
01058     int i;
01059     for ( i=0; i<numGridPes && i<10; ++i ) {
01060       iout << " " << gridPeMap[i];
01061     }
01062     if ( i < numGridPes ) iout << " ...";
01063     iout << "\n" << endi;
01064     iout << iINFO << "PME TRANS LOCATIONS:";
01065     for ( i=0; i<numTransPes && i<10; ++i ) {
01066       iout << " " << transPeMap[i];
01067     }
01068     if ( i < numTransPes ) iout << " ...";
01069     iout << "\n" << endi;
01070   }
01071 
01072   // sort based on nodes and physical nodes
01073   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01074 
01075   myGridPe = -1;
01076   myGridNode = -1;
01077   int i = 0;
01078   int node = -1;
01079   int real_node = -1;
01080   for ( i=0; i<numGridPes; ++i ) {
01081     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01082     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01083     int real_node_i = CkNodeOf(gridPeMap[i]);
01084     if ( real_node_i == real_node ) {
01085       gridNodeInfo[node].npe += 1;
01086     } else {
01087       real_node = real_node_i;
01088       ++node;
01089       gridNodeInfo[node].real_node = real_node;
01090       gridNodeInfo[node].pe_start = i;
01091       gridNodeInfo[node].npe = 1;
01092     }
01093     if ( CkMyNode() == real_node_i ) myGridNode = node;
01094   }
01095   numGridNodes = node + 1;
01096   myTransPe = -1;
01097   myTransNode = -1;
01098   node = -1;
01099   real_node = -1;
01100   for ( i=0; i<numTransPes; ++i ) {
01101     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01102     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01103     int real_node_i = CkNodeOf(transPeMap[i]);
01104     if ( real_node_i == real_node ) {
01105       transNodeInfo[node].npe += 1;
01106     } else {
01107       real_node = real_node_i;
01108       ++node;
01109       transNodeInfo[node].real_node = real_node;
01110       transNodeInfo[node].pe_start = i;
01111       transNodeInfo[node].npe = 1;
01112     }
01113     if ( CkMyNode() == real_node_i ) myTransNode = node;
01114   }
01115   numTransNodes = node + 1;
01116 
01117   if ( ! CkMyPe() ) {
01118     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01119          << numTransNodes << " TRANS NODES\n" << endi;
01120   }
01121 
01122   { // generate random orderings for grid and trans messages
01123     int i;
01124     for ( i = 0; i < numGridPes; ++i ) {
01125       gridPeOrder[i] = i;
01126     }
01127     Random rand(CkMyPe());
01128     if ( myGridPe < 0 ) {
01129       rand.reorder(gridPeOrder,numGridPes);
01130     } else {  // self last
01131       gridPeOrder[myGridPe] = numGridPes-1;
01132       gridPeOrder[numGridPes-1] = myGridPe;
01133       rand.reorder(gridPeOrder,numGridPes-1);
01134     } 
01135     for ( i = 0; i < numGridNodes; ++i ) {
01136       gridNodeOrder[i] = i;
01137     }
01138     if ( myGridNode < 0 ) {
01139       rand.reorder(gridNodeOrder,numGridNodes);
01140     } else {  // self last
01141       gridNodeOrder[myGridNode] = numGridNodes-1;
01142       gridNodeOrder[numGridNodes-1] = myGridNode;
01143       rand.reorder(gridNodeOrder,numGridNodes-1);
01144     }
01145     for ( i = 0; i < numTransNodes; ++i ) {
01146       transNodeOrder[i] = i;
01147     }
01148     if ( myTransNode < 0 ) {
01149       rand.reorder(transNodeOrder,numTransNodes);
01150     } else {  // self last
01151       transNodeOrder[myTransNode] = numTransNodes-1;
01152       transNodeOrder[numTransNodes-1] = myTransNode;
01153       rand.reorder(transNodeOrder,numTransNodes-1);
01154     }
01155   }
01156   
01157   } // ! usePencils
01158 
01159   myGrid.K1 = simParams->PMEGridSizeX;
01160   myGrid.K2 = simParams->PMEGridSizeY;
01161   myGrid.K3 = simParams->PMEGridSizeZ;
01162   myGrid.order = simParams->PMEInterpOrder;
01163   myGrid.dim2 = myGrid.K2;
01164   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01165 
01166   if ( ! usePencils ) {
01167     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01168     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01169     myGrid.block3 = myGrid.dim3 / 2;  // complex
01170   }
01171 
01172   if ( usePencils ) {
01173     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01174     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01175     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01176 
01177 
01178       int pe = 0;
01179       int x,y,z;
01180 
01181                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01182                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01183                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01184       
01185                 // decide which pes to use by bit reversal and patch use
01186                 int i;
01187                 int ncpus = CkNumPes();
01188                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01189                 PatchMap *pmap = PatchMap::Object();
01190                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01191                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01192                         if ( ri ) { // keep 0 for special case
01193                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
01194                                 else nopatches.add(ri);
01195                         }
01196                 }
01197 
01198 #if USE_RANDOM_TOPO
01199             Random rand(CkMyPe());
01200             int *tmp = new int[patches.size()];
01201             int nn = patches.size();
01202             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01203             rand.reorder(tmp, nn);
01204             patches.resize(0);
01205             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01206             delete [] tmp;
01207             tmp = new int[nopatches.size()];
01208             nn = nopatches.size();
01209             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01210             rand.reorder(tmp, nn);
01211             nopatches.resize(0);
01212             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01213             delete [] tmp;
01214 #endif
01215 
01216                 // only use zero if it eliminates overloading or has patches
01217                 int useZero = 0;
01218                 int npens = xBlocks*yBlocks;
01219                 if ( npens % ncpus == 0 ) useZero = 1;
01220                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01221                 npens += xBlocks*zBlocks;
01222                 if ( npens % ncpus == 0 ) useZero = 1;
01223                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01224                 npens += yBlocks*zBlocks;
01225                 if ( npens % ncpus == 0 ) useZero = 1;
01226                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01227 
01228                 // add nopatches then patches in reversed order
01229                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01230                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01231                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01232                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01233   
01234                 int npes = pmeprocs.size();
01235                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01236                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01237 #if !USE_RANDOM_TOPO
01238                 zprocs.sort();
01239 #endif
01240                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01241                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01242 #if !USE_RANDOM_TOPO
01243                 yprocs.sort();
01244 #endif
01245       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01246       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01247 #if !USE_RANDOM_TOPO
01248       xprocs.sort();
01249 #endif
01250 
01251 #if USE_TOPO_SFC
01252   CmiLock(tmgr_lock);
01253   //{
01254   TopoManager tmgr;
01255   int xdim = tmgr.getDimNX();
01256   int ydim = tmgr.getDimNY();
01257   int zdim = tmgr.getDimNZ();
01258   int xdim1 = find_level_grid(xdim);
01259   int ydim1 = find_level_grid(ydim);
01260   int zdim1 = find_level_grid(zdim);
01261   if(CkMyPe() == 0)
01262       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01263 
01264   vector<Coord> result;
01265   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01266   sort_sfc(xprocs, tmgr, result);
01267   sort_sfc(yprocs, tmgr, result);
01268   sort_sfc(zprocs, tmgr, result);
01269   //}
01270   CmiUnlock(tmgr_lock);
01271 #endif
01272 
01273 
01274                 if(CkMyPe() == 0){  
01275               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01276           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01277 #if USE_TOPO_SFC
01278               int x,y,z,t;
01279               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01280               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01281 #else
01282               iout << " " << zprocs[i];
01283 #endif
01284           }
01285           if ( i < zprocs.size() ) iout << " ...";
01286               iout << "\n" << endi;
01287                 }
01288 
01289     if (CkMyRank() == 0) {
01290       for (pe=0, x = 0; x < xBlocks; ++x)
01291         for (y = 0; y < yBlocks; ++y, ++pe ) {
01292           pencilPMEProcessors[zprocs[pe]] = 1;
01293         }
01294     }
01295      
01296                 if(CkMyPe() == 0){  
01297               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01298           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01299 #if USE_TOPO_SFC
01300               int x,y,z,t;
01301               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01302               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01303 #else
01304               iout << " " << yprocs[i];
01305 #endif
01306           }
01307           if ( i < yprocs.size() ) iout << " ...";
01308               iout << "\n" << endi;
01309                 }
01310 
01311     if (CkMyRank() == 0) {
01312       for (pe=0, z = 0; z < zBlocks; ++z )
01313         for (x = 0; x < xBlocks; ++x, ++pe ) {
01314           pencilPMEProcessors[yprocs[pe]] = 1;
01315         }
01316     }
01317     
01318                 if(CkMyPe() == 0){  
01319                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01320                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01321 #if USE_TOPO_SFC
01322                 int x,y,z,t;
01323                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01324                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01325 #else
01326                 iout << " " << xprocs[i];
01327 #endif
01328             }
01329                 if ( i < xprocs.size() ) iout << " ...";
01330                 iout << "\n" << endi;
01331                 }
01332 
01333     if (CkMyRank() == 0) {
01334       for (pe=0, y = 0; y < yBlocks; ++y )      
01335         for (z = 0; z < zBlocks; ++z, ++pe ) {
01336           pencilPMEProcessors[xprocs[pe]] = 1;
01337         }
01338     }
01339         
01340 
01341         // creating the pencil arrays
01342         if ( CkMyPe() == 0 ){
01343 #if !USE_RANDOM_TOPO
01344         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01345         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01346         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01347         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01348 #endif
01349 #if 1
01350         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01351         CProxy_PmePencilMap ym;
01352         if ( simParams->PMEPencilsYLayout )
01353           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01354         else
01355           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01356         CProxy_PmePencilMap xm;
01357         if ( simParams->PMEPencilsXLayout )
01358           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01359         else
01360           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01361         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01362         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01363         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01364         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01365         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01366         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01367         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01368         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01369         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01370         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01371 #else
01372         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01373         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01374         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01375 
01376                 for (pe=0, x = 0; x < xBlocks; ++x)
01377                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01378                                 zPencil(x,y,0).insert(zprocs[pe]);
01379                         }
01380         zPencil.doneInserting();
01381 
01382                 for (pe=0, x = 0; x < xBlocks; ++x)
01383                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01384                                 yPencil(x,0,z).insert(yprocs[pe]);
01385                         }
01386         yPencil.doneInserting();
01387 
01388 
01389                 for (pe=0, y = 0; y < yBlocks; ++y )    
01390                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01391                                 xPencil(0,y,z).insert(xprocs[pe]);
01392                         }
01393                 xPencil.doneInserting();     
01394 #endif
01395 
01396                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01397                 PmePencilInitMsgData msgdata;
01398                 msgdata.grid = myGrid;
01399                 msgdata.xBlocks = xBlocks;
01400                 msgdata.yBlocks = yBlocks;
01401                 msgdata.zBlocks = zBlocks;
01402                 msgdata.xPencil = xPencil;
01403                 msgdata.yPencil = yPencil;
01404                 msgdata.zPencil = zPencil;
01405                 msgdata.pmeProxy = pmeProxyDir;
01406         msgdata.pmeNodeProxy = pmeNodeProxy;
01407         msgdata.xm = xm;
01408         msgdata.ym = ym;
01409         msgdata.zm = zm;
01410                 xPencil.init(new PmePencilInitMsg(msgdata));
01411                 yPencil.init(new PmePencilInitMsg(msgdata));
01412                 zPencil.init(new PmePencilInitMsg(msgdata));
01413         }
01414 
01415     return;  // continue in initialize_pencils() at next startup stage
01416   }
01417 
01418 
01419   int pe;
01420   int nx = 0;
01421   for ( pe = 0; pe < numGridPes; ++pe ) {
01422     localInfo[pe].x_start = nx;
01423     nx += myGrid.block1;
01424     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01425     localInfo[pe].nx = nx - localInfo[pe].x_start;
01426   }
01427   int ny = 0;
01428   for ( pe = 0; pe < numTransPes; ++pe ) {
01429     localInfo[pe].y_start_after_transpose = ny;
01430     ny += myGrid.block2;
01431     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01432     localInfo[pe].ny_after_transpose =
01433                         ny - localInfo[pe].y_start_after_transpose;
01434   }
01435 
01436   {  // decide how many pes this node exchanges charges with
01437 
01438   PatchMap *patchMap = PatchMap::Object();
01439   Lattice lattice = simParams->lattice;
01440   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01441   BigReal cutoff = simParams->cutoff;
01442   BigReal patchdim = simParams->patchDimension;
01443   int numPatches = patchMap->numPatches();
01444   int numNodes = CkNumPes();
01445   int *source_flags = new int[numNodes];
01446   int node;
01447   for ( node=0; node<numNodes; ++node ) {
01448     source_flags[node] = 0;
01449     recipPeDest[node] = 0;
01450   }
01451 
01452   // // make sure that we don't get ahead of ourselves on this node
01453   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01454   //   source_flags[CkMyPe()] = 1;
01455   //   recipPeDest[myRecipPe] = 1;
01456   // }
01457 
01458   for ( int pid=0; pid < numPatches; ++pid ) {
01459     int pnode = patchMap->node(pid);
01460 #ifdef NAMD_CUDA
01461     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01462 #endif
01463     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01464     BigReal minx = patchMap->min_a(pid);
01465     BigReal maxx = patchMap->max_a(pid);
01466     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01467     // min1 (max1) is smallest (largest) grid line for this patch
01468     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01469     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01470     for ( int i=min1; i<=max1; ++i ) {
01471       int ix = i;
01472       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01473       while ( ix < 0 ) ix += myGrid.K1;
01474       // set source_flags[pnode] if this patch sends to our node
01475       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01476            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01477         source_flags[pnode] = 1;
01478       }
01479       // set dest_flags[] for node that our patch sends to
01480 #ifdef NAMD_CUDA
01481       if ( offload ) {
01482         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01483           recipPeDest[ix / myGrid.block1] = 1;
01484         }
01485       } else
01486 #endif
01487       if ( pnode == CkMyPe() ) {
01488         recipPeDest[ix / myGrid.block1] = 1;
01489       }
01490     }
01491   }
01492 
01493   int numSourcesSamePhysicalNode = 0;
01494   numSources = 0;
01495   numDestRecipPes = 0;
01496   for ( node=0; node<numNodes; ++node ) {
01497     if ( source_flags[node] ) ++numSources;
01498     if ( recipPeDest[node] ) ++numDestRecipPes;
01499     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01500   }
01501 
01502 #if 0
01503   if ( numSources ) {
01504     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01505             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01506     iout << iINFO << "PME " << CkMyPe() << " sources:";
01507     for ( node=0; node<numNodes; ++node ) {
01508       if ( source_flags[node] ) iout << " " << node;
01509     }
01510     iout << "\n" << endi;
01511   }
01512 #endif
01513 
01514   delete [] source_flags;
01515 
01516   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01517   //           CkMyPe(), numSources, numDestRecipPes);
01518 
01519   }  // decide how many pes this node exchanges charges with (end)
01520 
01521   ungrid_count = numDestRecipPes;
01522 
01523   sendTransBarrier_received = 0;
01524 
01525   if ( myGridPe < 0 && myTransPe < 0 ) return;
01526   // the following only for nodes doing reciprocal sum
01527 
01528   if ( myTransPe >= 0 ) {
01529     recipEvirPe = findRecipEvirPe();
01530     pmeProxy[recipEvirPe].addRecipEvirClient();
01531   }
01532 
01533   if ( myTransPe >= 0 ) {
01534       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01535       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01536       #ifdef OPENATOM_VERSION
01537       if ( simParams->openatomOn ) { 
01538         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01539         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01540       } else {
01541         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01542       }
01543       #else  // OPENATOM_VERSION
01544       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01545       #endif // OPENATOM_VERSION
01546   }
01547 
01548   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01549   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01550   if ( local_size < local_size_2 ) local_size = local_size_2;
01551   qgrid = new float[local_size*numGrids];
01552   if ( numGridPes > 1 || numTransPes > 1 ) {
01553     kgrid = new float[local_size*numGrids];
01554   } else {
01555     kgrid = qgrid;
01556   }
01557   qgrid_size = local_size;
01558 
01559   if ( myGridPe >= 0 ) {
01560   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01561   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01562   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01563   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01564   }
01565 
01566   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01567 #ifdef NAMD_FFTW
01568   CmiLock(fftw_plan_lock);
01569 #ifdef NAMD_FFTW_3
01570   work = new fftwf_complex[n[0]];
01571   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01572   if ( myGridPe >= 0 ) {
01573     forward_plan_yz=new fftwf_plan[numGrids];
01574     backward_plan_yz=new fftwf_plan[numGrids];
01575   }
01576   if ( myTransPe >= 0 ) {
01577     forward_plan_x=new fftwf_plan[numGrids];
01578     backward_plan_x=new fftwf_plan[numGrids];
01579   }
01580   /* need one plan per grid */
01581   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01582   if ( myGridPe >= 0 ) {
01583     for( int g=0; g<numGrids; g++)
01584       {
01585         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01586                                                      localInfo[myGridPe].nx,
01587                                                      qgrid + qgrid_size * g,
01588                                                      NULL,
01589                                                      1,
01590                                                      myGrid.dim2 * myGrid.dim3,
01591                                                      (fftwf_complex *) 
01592                                                      (qgrid + qgrid_size * g),
01593                                                      NULL,
01594                                                      1,
01595                                                      myGrid.dim2 * (myGrid.dim3/2),
01596                                                      fftwFlags);
01597       }
01598   }
01599   int zdim = myGrid.dim3;
01600   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01601   if ( ! CkMyPe() ) iout << " 2..." << endi;
01602   if ( myTransPe >= 0 ) {
01603     for( int g=0; g<numGrids; g++)
01604       {
01605 
01606         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01607                                                 (fftwf_complex *)
01608                                                 (kgrid+qgrid_size*g),
01609                                                 NULL,
01610                                                 xStride,
01611                                                 1,
01612                                                 (fftwf_complex *)
01613                                                 (kgrid+qgrid_size*g),
01614                                                 NULL,
01615                                                 xStride,
01616                                                 1,
01617                                                 FFTW_FORWARD,fftwFlags);
01618         
01619       }
01620   }
01621   if ( ! CkMyPe() ) iout << " 3..." << endi;
01622   if ( myTransPe >= 0 ) {
01623     for( int g=0; g<numGrids; g++)
01624       {
01625         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01626                                                  (fftwf_complex *)
01627                                                  (kgrid+qgrid_size*g),
01628                                                  NULL,
01629                                                  xStride,
01630                                                  1,
01631                                                  (fftwf_complex *)
01632                                                  (kgrid+qgrid_size*g),
01633                                                  NULL,
01634                                                  xStride,
01635                                                  1,
01636                                                  FFTW_BACKWARD, fftwFlags);
01637 
01638       }
01639   }
01640   if ( ! CkMyPe() ) iout << " 4..." << endi;
01641   if ( myGridPe >= 0 ) {
01642     for( int g=0; g<numGrids; g++)
01643       {
01644         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01645                                                       localInfo[myGridPe].nx,
01646                                                       (fftwf_complex *)
01647                                                       (qgrid + qgrid_size * g),
01648                                                       NULL,
01649                                                       1,
01650                                                       myGrid.dim2*(myGrid.dim3/2),
01651                                                       qgrid + qgrid_size * g,
01652                                                       NULL,
01653                                                       1,
01654                                                       myGrid.dim2 * myGrid.dim3,
01655                                                       fftwFlags);
01656       }
01657   }
01658   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01659 
01660 #else
01661   work = new fftw_complex[n[0]];
01662 
01663   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01664   if ( myGridPe >= 0 ) {
01665   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01666         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01667         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01668   }
01669   if ( ! CkMyPe() ) iout << " 2..." << endi;
01670   if ( myTransPe >= 0 ) {
01671       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01672         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01673         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01674         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01675   }
01676   if ( ! CkMyPe() ) iout << " 3..." << endi;
01677   if ( myTransPe >= 0 ) {
01678   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01679         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01680         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01681         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01682   }
01683   if ( ! CkMyPe() ) iout << " 4..." << endi;
01684   if ( myGridPe >= 0 ) {
01685   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01686         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01687         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01688   }
01689   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01690 #endif
01691   CmiUnlock(fftw_plan_lock);
01692 #else
01693   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01694 #endif
01695 
01696   if ( myGridPe >= 0 && numSources == 0 )
01697                 NAMD_bug("PME grid elements exist without sources.");
01698   grid_count = numSources;
01699   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01700   trans_count = numGridPes;
01701 }
01702 
01703 
01704 
void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  // Quiescence-detection callback: determine which (x,y) z-pencils this
  // PE (or, under CUDA offload, this node) must send charge-grid data to,
  // based on the PME-grid footprint of the patches it owns.
  delete msg;
  if ( ! usePencils ) return;

  SimParameters *simParams = Node::Object()->simParameters;

  PatchMap *patchMap = PatchMap::Object();
  Lattice lattice = simParams->lattice;
  // Physical extents of the cell along a and b, used to convert the patch
  // margin from a distance into fractional (grid) coordinates.
  BigReal sysdima = lattice.a_r().unit() * lattice.a();
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  int numPatches = patchMap->numPatches();

  // One flag per (x,y) pencil: set if any local patch can touch it.
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }

  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#ifdef NAMD_CUDA
    if ( offload ) {
      // With offload, pencil traffic is aggregated per node rather than per PE.
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;

    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;

    BigReal minx = patchMap->min_a(pid);
    BigReal maxx = patchMap->max_a(pid);
    // Atoms may drift up to half of (patchdim - cutoff) outside the patch's
    // nominal bounds; widen the covered range accordingly.
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // min1 (max1) is smallest (largest) grid line for this patch
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;

    BigReal miny = patchMap->min_b(pid);
    BigReal maxy = patchMap->max_b(pid);
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // min2 (max2) is smallest (largest) grid line for this patch
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    // Mark every pencil whose block contains a (periodically wrapped)
    // grid line touched by this patch.
    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }

  // First pass: count active pencils and announce ourselves to each one
  // (dummyRecvGrid flag 0).  Under offload only the device's master PE registers.
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#ifdef NAMD_CUDA
        if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  // Second pass: record the (i,j) indices of the active pencils.
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  // Order sends either deterministically (bit-reversed) or randomly to
  // spread network load across destinations.
  if ( simParams->PMESendOrder ) {
    std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
  } else {
    Random rand(CkMyPe());
    rand.reorder(activePencils,numPencilsActive);
  }
  //if ( numPencilsActive ) {
  //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
  //}

  // Expect one returned ungrid message per active pencil.
  ungrid_count = numPencilsActive;
}
01798 
01799 
01800 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01801   if ( ! usePencils ) return;
01802   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01803 }
01804 
01805 
ComputePmeMgr::~ComputePmeMgr() {

  // fftw_plan_lock is shared process-wide; only rank 0 destroys it,
  // mirroring its creation on rank 0.
  if ( CmiMyRank() == 0 ) {
    CmiDestroyLock(fftw_plan_lock);
  }
  CmiDestroyLock(pmemgr_lock);

  delete myKSpace;
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  delete [] qgrid;
  // kgrid may alias qgrid; guard against a double free.
  if ( kgrid != qgrid ) delete [] kgrid;
  delete [] work;
  delete [] gridmsg_reuse;

 // Under CUDA offload these buffers are presumably owned by the offload
 // path — not visible in this chunk; only the host path frees them here.
 if ( ! offload ) {
  for (int i=0; i<q_count; ++i) {
    delete [] q_list[i];
  }
  delete [] q_list;
  delete [] fz_arr;
 }
  delete [] f_arr;
  delete [] q_arr;
}
01838 
void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
  // Accumulate one source's charge contribution into this grid PE's slab.
  // grid_count counts down from numSources; reaching zero triggers the
  // forward y-z FFT (gridCalc1).
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  }
  if ( grid_count == numSources ) {
    // First message of the step: latch lattice and sequence number.
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

  int zdim = myGrid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    // fgrid flags which x-y columns the sender touched; qgrid data is
    // packed only for those columns, at the z offsets listed in zlist.
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
      if ( f[i] ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
        }
      }
      q += zdim;
    }
  }

  // Keep the message for reuse when potentials are sent back (sendUngridSubset).
  gridmsg_reuse[numSources-grid_count] = msg;
  --grid_count;

  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
  }
}
01874 #ifdef MANUAL_DEBUG_FFTW3
01875 
01876 /* utility functions for manual debugging */
/* utility functions for manual debugging */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
{
  // Dump a row-major xdim*ydim*zdim float array to "<infilename>_<pe>.out":
  // a header line of the three dimensions followed by one
  // "i j k value" line per element.
  char filename[1000];
  // snprintf bounds the whole formatted name; the previous
  // strncpy/strncat/sprintf chain could overflow the buffer or leave the
  // intermediate format string unterminated for long infilename values.
  snprintf(filename, sizeof(filename), "%s_%d.out", infilename, pe);
  FILE *loutfile = fopen(filename, "w");
  if ( loutfile == NULL ) return;  // debug-only helper: fail quietly rather than crash
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
  fclose(loutfile);

}
01897 
01898 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01899 {
01900   char fmt[1000];
01901   char filename[1000];
01902   strncpy(fmt,infilename,999);
01903   strncat(fmt,"_%d_%d_%d.out",999);
01904   sprintf(filename,fmt, x,y,z);
01905   FILE *loutfile = fopen(filename, "w");
01906   CkAssert(loutfile!=NULL);
01907   CkPrintf("opened %s for dump\n",filename);
01908   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01909   for(int i=0;i<xdim;i++)
01910     for(int j=0;j<ydim;j++)
01911       for(int k=0;k<zdim;k++)
01912         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01913   fclose(loutfile);
01914 }
01915 
01916 #endif
01917 
void ComputePmeMgr::gridCalc1(void) {
  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
  // Forward real-to-complex FFT over the y and z dimensions of this PE's
  // x-slab, one pass per grid (multiple grids e.g. for alchemy).

#ifdef NAMD_FFTW
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_yz[g]);
#else
    rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
        qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
#endif

  }
#endif

  // Without the global barrier, start the transpose immediately; with it,
  // sendTransBarrier() on PE 0 will trigger sendTrans() on all grid PEs.
  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
}
01935 
01936 void ComputePmeMgr::sendTransBarrier(void) {
01937   sendTransBarrier_received += 1;
01938   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01939   if ( sendTransBarrier_received < numGridPes ) return;
01940   sendTransBarrier_received = 0;
01941   for ( int i=0; i<numGridPes; ++i ) {
01942     pmeProxyDir[gridPeMap[i]].sendTrans();
01943   }
01944 }
01945 
01946 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
01947   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
01948   mgr->sendTransSubset(first, last);
01949 }
01950 
void ComputePmeMgr::sendTrans(void) {
  // Start the grid transpose: expect one untrans reply per trans PE, then
  // ship this PE's slab to every transpose node, optionally splitting the
  // node range across the SMP node with CkLoop.

  untrans_count = numTransPes;

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
    CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
  } else
#endif
  {
    sendTransSubset(0, numTransNodes-1);
  }

}
01966 
void ComputePmeMgr::sendTransSubset(int first, int last) {
  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
  // Package this grid PE's x-slab for transpose nodes [first,last].  For
  // the local node the data is copied directly into the destination
  // managers' kgrid buffers and the message is sent with nx == 0.

  // send data for transpose
  int zdim = myGrid.dim3;
  int nx = localInfo[myGridPe].nx;
  int x_start = localInfo[myGridPe].x_start;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  for (int j=first; j<=last; j++) {
    int node = transNodeOrder[j];  // different order on each node
    int pe = transNodeInfo[node].pe_start;
    int npe = transNodeInfo[node].npe;
    int totlen = 0;
    // Payload space is only needed for remote nodes; local destinations
    // are written in place below, so totlen (and the message) stays empty.
    if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.ny_after_transpose * zdim;
      totlen += cpylen;
    }
    PmeTransMsg *newmsg = new (nx * totlen * numGrids,
                                PRIORITY_SIZE) PmeTransMsg;
    newmsg->sourceNode = myGridPe;
    newmsg->lattice = lattice;
    newmsg->x_start = x_start;
    newmsg->nx = nx;
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + nx * totlen * g;
      pe = transNodeInfo[node].pe_start;
      for (int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        int cpylen = li.ny_after_transpose * zdim;
        if ( node == myTransNode ) {
          // Local destination: retarget qmsg at the receiving manager's
          // kgrid so the copy below lands directly in place.
          ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
          qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
        }
        float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
        for ( int x = 0; x < nx; ++x ) {
          CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
          q += slicelen;
          qmsg += cpylen;
        }
      }
    }
    newmsg->sequence = grid_sequence;
    SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
    // nx == 0 tells the receiver its data was already copied locally.
    if ( node == myTransNode ) newmsg->nx = 0;
    if ( npe > 1 ) {
      if ( node == myTransNode ) fwdSharedTrans(newmsg);
      else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
    } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
  }
}
02025 
void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
  // Share one PmeTransMsg with every trans PE on this node: each receives
  // a small wrapper carrying the shared message plus a lock-protected
  // refcount; the last consumer frees everything (see recvSharedTrans).
  int pe = transNodeInfo[myTransNode].pe_start;
  int npe = transNodeInfo[myTransNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for (int i=0; i<npe; ++i, ++pe) {
    PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
    SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
    shmsg->msg = msg;
    shmsg->count = count;
    shmsg->lock = lock;
    pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
  }
}
02041 
void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
  // Consume the shared transpose data, then drop our reference; the last
  // PE to finish destroys the lock, the counter, and the shared message.
  procTrans(msg->msg);
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
02054 
void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
  // Unshared delivery path (single trans PE on the destination node):
  // process and free the message directly.
  procTrans(msg);
  delete msg;
}
02059 
void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
  // Receive one grid PE's contribution to this trans PE's y-slab of the
  // transposed grid.  trans_count counts down from numGridPes; at zero
  // the forward x FFT runs (gridCalc2).
  if ( trans_count == numGridPes ) {
    // First message of the step: latch lattice and sequence number.
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

 // nx == 0 marks a local message whose data was already copied in place
 // by sendTransSubset; only the countdown below applies then.
 if ( msg->nx ) {
  int zdim = myGrid.dim3;
  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
  int first_pe = nodeInfo.pe_start;
  int last_pe = first_pe+nodeInfo.npe-1;
  // The message carries data for every trans PE on this node; skip down
  // to our own y range within it.
  int y_skip = localInfo[myTransPe].y_start_after_transpose
             - localInfo[first_pe].y_start_after_transpose;
  int ny_msg = localInfo[last_pe].y_start_after_transpose
             + localInfo[last_pe].ny_after_transpose
             - localInfo[first_pe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;
  int x_start = msg->x_start;
  int nx = msg->nx;
  for ( int g=0; g<numGrids; ++g ) {
    CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
        (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
        nx*ny*zdim*sizeof(float));
  }
 }

  --trans_count;

  if ( trans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc2();
  }
}
02093 
void ComputePmeMgr::gridCalc2(void) {
  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
  // Complete the forward FFT along x on the transposed data, then hand
  // off to the reciprocal-space stage (or to OpenAtom/MOA if configured).

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  for ( int g=0; g<numGrids; ++g ) {
    // finish forward FFT (x dimension)
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_x[g]);
#else
    fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#endif
#endif
  }

#ifdef OPENATOM_VERSION
    if ( ! simParams -> openatomOn ) { 
#endif // OPENATOM_VERSION
      gridCalc2R();
#ifdef OPENATOM_VERSION
    } else {
      gridCalc2Moa();
    }
#endif // OPENATOM_VERSION
}
02127 
02128 #ifdef OPENATOM_VERSION
void ComputePmeMgr::gridCalc2Moa(void) {
  // OpenAtom (MOA) variant of the gridCalc2 hand-off: ship each grid's
  // k-space data to the ComputeMoaMgr and resume at gridCalc2R via the
  // callback once it replies.

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  SimParameters *simParams = Node::Object()->simParameters;

  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);

  for ( int g=0; g<numGrids; ++g ) {
    #ifdef OPENATOM_VERSION_DEBUG 
    CkPrintf("Sending recQ on processor %d \n", CkMyPe());
    for ( int i=0; i<=(ny * zdim / 2); ++i) 
    {
      CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
    }
    #endif // OPENATOM_VERSION_DEBUG
//     mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
    CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
    moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
  }
}
02152 #endif // OPENATOM_VERSION
02153 
void ComputePmeMgr::gridCalc2R(void) {
  // Reciprocal-space stage: evaluate energy/virial on the k-space grid
  // (compute_energy presumably also applies the reciprocal-space
  // convolution in place — see PmeKSpace), then start the backward x FFT.

  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * numTransPes ) {
    useCkLoop = 1;
  }
#endif

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  for ( int g=0; g<numGrids; ++g ) {
    // reciprocal space portion of PME
    BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
    // recip_evir2[g][0] receives the energy; [1..] receive the virial.
    recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
                        lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
    // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);

    // start backward FFT (x dimension)

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_x[g]);
#else
    fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#endif
#endif
  }
  
  pmeProxyDir[CkMyPe()].sendUntrans();
}
02189 
02190 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
02191   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02192   mgr->sendUntransSubset(first, last);
02193 }
02194 
void ComputePmeMgr::sendUntrans(void) {
  // After the backward x FFT: re-arm the trans counter for the next step,
  // forward energy/virial to the collector PE, and send the k-space data
  // back to the grid nodes (reverse transpose), optionally via CkLoop.

  trans_count = numGridPes;

  { // send energy and virial
    PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
    for ( int g=0; g<numGrids; ++g ) {
      newmsg->evir[g] = recip_evir2[g];
    }
    SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
    CmiEnableUrgentSend(0);
  }

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
    CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
  } else
#endif
  {
    sendUntransSubset(0, numGridNodes-1);
  }

}
02221 
void ComputePmeMgr::sendUntransSubset(int first, int last) {
  // Reverse transpose: send this trans PE's y-slab of k-space data back
  // to grid nodes [first,last].  Local destinations are scatter-copied
  // straight into the target managers' qgrid and the message carries
  // ny == 0.

  int zdim = myGrid.dim3;
  int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  // send data for reverse transpose
  for (int j=first; j<=last; j++) {
    int node = gridNodeOrder[j];  // different order on each node
    int pe = gridNodeInfo[node].pe_start;
    int npe = gridNodeInfo[node].npe;
    int totlen = 0;
    // Payload space only needed for remote nodes; local copies bypass the message.
    if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.nx * zdim;
      totlen += cpylen;
    }
    PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
    newmsg->sourceNode = myTransPe;
    newmsg->y_start = y_start;
    newmsg->ny = ny;
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + ny * totlen * g;
      pe = gridNodeInfo[node].pe_start;
      for (int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        if ( node == myGridNode ) {
          // Local destination: scatter rows directly into the receiving
          // manager's qgrid (strided by slicelen per x plane).
          ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
          qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
          float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
          int cpylen = ny * zdim;
          for ( int x = 0; x < li.nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
            q += cpylen;
            qmsg += slicelen;
          }
        } else {
          CmiMemcpy((void*)qmsg,
                (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
                li.nx*ny*zdim*sizeof(float));
          qmsg += li.nx*ny*zdim;
        }
      }
    }
    SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
    // ny == 0 tells the receiver its data was already copied locally.
    if ( node == myGridNode ) newmsg->ny = 0;
    if ( npe > 1 ) {
      if ( node == myGridNode ) fwdSharedUntrans(newmsg);
      else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
    } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
  }
}
02281 
void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
  // Share one PmeUntransMsg with every grid PE on this node via small
  // refcounted wrappers; the last consumer frees the shared message (see
  // recvSharedUntrans).  Note: unlike fwdSharedTrans, these wrappers are
  // plain-new messages with no priority set.
  int pe = gridNodeInfo[myGridNode].pe_start;
  int npe = gridNodeInfo[myGridNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for (int i=0; i<npe; ++i, ++pe) {
    PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
    shmsg->msg = msg;
    shmsg->count = count;
    shmsg->lock = lock;
    pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
  }
}
02295 
void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
  // Consume the shared untranspose data, then drop our reference; the
  // last PE to finish destroys the lock, the counter, and the message.
  procUntrans(msg->msg);
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
02308 
void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  // Unshared delivery path (single grid PE on the destination node):
  // process and free the message directly.
  procUntrans(msg);
  delete msg;
}
02313 
void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
  // Receive one trans PE's y-range of the back-transformed grid into this
  // grid PE's x-slab.  untrans_count counts down from numTransPes; at
  // zero the backward y-z FFT runs (gridCalc3).

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
  int first_pe = nodeInfo.pe_start;
  int g;

 // ny == 0 marks a local message whose data was already copied in place
 // by sendUntransSubset; only the countdown below applies then.
 if ( msg->ny ) {
  int zdim = myGrid.dim3;
  int last_pe = first_pe+nodeInfo.npe-1;
  // The message carries data for every grid PE on this node; skip down
  // to our own x range within it.
  int x_skip = localInfo[myGridPe].x_start
             - localInfo[first_pe].x_start;
  int nx_msg = localInfo[last_pe].x_start
             + localInfo[last_pe].nx
             - localInfo[first_pe].x_start;
  int nx = localInfo[myGridPe].nx;
  int y_start = msg->y_start;
  int ny = msg->ny;
  int slicelen = myGrid.K2 * zdim;
  int cpylen = ny * zdim;
  for ( g=0; g<numGrids; ++g ) {
    float *q = qgrid + qgrid_size * g + y_start * zdim;
    float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
    for ( int x = 0; x < nx; ++x ) {
      CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
      q += slicelen;
      qmsg += cpylen;
    }
  }
 }

  --untrans_count;

  if ( untrans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc3();
  }
}
02355 
void ComputePmeMgr::gridCalc3(void) {
  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
  // Backward complex-to-real FFT over y and z, completing the 3-D inverse
  // transform; the resulting potential grid is then returned to sources.

  // finish backward FFT
#ifdef NAMD_FFTW

  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_yz[g]);
#else
    rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
        (fftw_complex *) (qgrid + qgrid_size * g),
        1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
#endif
  }

#endif

  pmeProxyDir[CkMyPe()].sendUngrid();
}
02376 
02377 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
02378   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02379   mgr->sendUngridSubset(first, last);
02380 }
02381 
void ComputePmeMgr::sendUngrid(void) {
  // Return potentials to all sources (CkLoop-parallelizable, with sync
  // because the reused messages are repacked in place), then clear the
  // charge grid and re-arm the counter for the next step.
  // NOTE(review): the CkLoop threshold reuses CKLOOP_CTRL_PME_SENDUNTRANS;
  // no separate SENDUNGRID control value is visible here — confirm intent.

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
    CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
  } else
#endif
  {
    sendUngridSubset(0, numSources-1);
  }

  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
}
02397 
void ComputePmeMgr::sendUngridSubset(int first, int last) {
  // Repack the computed potentials into the grid messages saved by
  // recvGrid() — using the same fgrid/zlist sparse layout the sender
  // used — and return each one to its source PE (or node, under offload).

#ifdef NAMD_CUDA
  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
#else
  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
#endif

  for ( int j=first; j<=last; ++j ) {
    // int msglen = qgrid_len;
    PmeGridMsg *newmsg = gridmsg_reuse[j];
    int pe = newmsg->sourceNode;
    int zdim = myGrid.dim3;
    int flen = newmsg->len;
    int fstart = newmsg->start;
    int zlistlen = newmsg->zlistlen;
    int *zlist = newmsg->zlist;
    float *qmsg = newmsg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      // Gather only the columns (fgrid) and z offsets (zlist) the source
      // originally contributed to.
      char *f = newmsg->fgrid + fgrid_len * g;
      float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
      for ( int i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[zlist[k]];
          }
        }
        q += zdim;
      }
    }
    newmsg->sourceNode = myGridPe;

    SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
#ifdef NAMD_CUDA
    if ( offload ) {
      pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
    } else
#endif
    pmeProxyDir[pe].recvUngrid(newmsg);
    CmiEnableUrgentSend(0);
  }
}
02441 
void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
  // Potentials returned from a grid PE or pencil: copy them into the
  // local per-patch arrays, then acknowledge via recvAck (which
  // decrements ungrid_count and may start ungridCalc).
#ifdef NAMD_CUDA
  if ( ! offload )  // would need lock
#endif
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  }

  if ( usePencils ) copyPencils(msg);
  else copyResults(msg);
  delete msg;
  recvAck(0);
}
02456 
void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
  // Count one returned potential contribution; when all have arrived,
  // start force interpolation (ungridCalc).  Under CUDA offload the
  // counter is shared across the node's ranks, so it is guarded by
  // cuda_lock and the calculation is launched on the device's master PE.
  if ( msg ) delete msg;
#ifdef NAMD_CUDA
  if ( offload ) {
    CmiLock(cuda_lock);
    if ( ungrid_count == 0 ) {
      NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
    }
    int uc = --ungrid_count;
    CmiUnlock(cuda_lock);

    if ( uc == 0 ) {
      pmeProxyDir[master_pe].ungridCalc();
    }
    return;
  }
#endif
  --ungrid_count;

  if ( ungrid_count == 0 ) {
    pmeProxyDir[CkMyPe()].ungridCalc();
  }
}
02480 
02481 #ifdef NAMD_CUDA
02482 #define count_limit 1000000
02483 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
02484 #define EVENT_STRIDE 10
02485 
02486 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
02487 
void cuda_check_pme_forces(void *arg, double walltime) {
  // Polled callback: as each CUDA completion event (one per EVENT_STRIDE
  // computes) finishes, enqueue the corresponding ungridForces work.
  // Re-arms itself until every force batch is done or the poll limit hits.
  ComputePmeMgr *argp = (ComputePmeMgr *) arg;

 while ( 1 ) { // process multiple events per call
  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
  if ( err == cudaSuccess ) {
    argp->check_forces_count = 0;
    for ( int i=0; i<EVENT_STRIDE; ++i ) {
      WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
      if ( ++(argp->forces_done_count) == argp->forces_count ) break;
    }
    if ( argp->forces_done_count == argp->forces_count ) { // last event
      traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
      // forces_time held the start timestamp; repurpose it as elapsed time.
      argp->forces_time = walltime - argp->forces_time;
      //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
      return;
    } else { // more events
      continue; // check next event
    }
  } else if ( err != cudaErrorNotReady ) {
    // Any error other than "not ready yet" is fatal.
    cuda_errcheck("in cuda_check_pme_forces");
    NAMD_bug("cuda_errcheck missed error in cuda_check_pme_forces");
  } else if ( ++(argp->check_forces_count) >= count_limit ) {
    // Give up after too many unproductive polls (likely a hung device).
    char errmsg[256];
    sprintf(errmsg,"cuda_check_pme_forces polled %d times over %f s on seq %d",
            argp->check_forces_count, walltime - argp->forces_time,
            argp->saved_sequence);
    cuda_errcheck(errmsg);
    NAMD_die(errmsg);
  } else {
    break; // call again
  }
 } // while ( 1 )
 CcdCallBacksReset(0,walltime);  // fix Charm++
 CUDA_POLL(cuda_check_pme_forces, arg);
}
02524 #endif // NAMD_CUDA
02525 
void ComputePmeMgr::ungridCalc(void) {
  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
  // Interpolate forces from the potential grid for every ComputePme on
  // this PE.  The CUDA offload path batches all of the node's computes
  // through cuda_pme_forces(); the CPU path simply enqueues per-compute
  // ungridForces work items.

  ungridForcesCount = pmeComputes.size();

#ifdef NAMD_CUDA
 if ( offload ) {
  //CmiLock(cuda_lock);

  if ( this == masterPmeMgr ) {
    // Master PE: copy the potential grid to the device once, then fan
    // out ungridCalc to the other managers on this node.
    double before = CmiWallTimer();
    cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
    cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
    traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());

    const int myrank = CkMyRank();
    for ( int i=0; i<CkMyNodeSize(); ++i ) {
      if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
        nodePmeMgr->mgrObjects[i]->ungridCalc();
      }
    }
    if ( ! pmeComputes.size() ) return;
  }

  // Lazily create one completion event per EVENT_STRIDE computes.
  if ( ! end_forces ) {
    int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
    end_forces = new cudaEvent_t[n];
    for ( int i=0; i<n; ++i ) {
      cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
    }
  }

  const int pcsz = pmeComputes.size();
  if ( ! afn_host ) {
    // Per-compute pointer table: {atom data, forces, forces-end} triples.
    cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
    cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
    cuda_errcheck("malloc params for pme");
  }
  int totn = 0;
  for ( int i=0; i<pcsz; ++i ) {
    int n = pmeComputes[i]->numGridAtoms[0];
    totn += n;
  }
  // Grow the node-wide force buffers if the atom total increased,
  // with headroom to limit future reallocations.
  if ( totn > f_data_mgr_alloc ) {
    if ( f_data_mgr_alloc ) {
      CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
      cudaFree(f_data_mgr_dev);
      cudaFreeHost(f_data_mgr_host);
    }
    f_data_mgr_alloc = 1.2 * (totn + 100);
    cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
    cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
    cuda_errcheck("malloc forces for pme");
  }
  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
  // Carve the shared force buffers into per-compute slices and fill the
  // pointer table the kernel reads.
  float *f_dev = f_data_mgr_dev;
  float *f_host = f_data_mgr_host;
  for ( int i=0; i<pcsz; ++i ) {
    int n = pmeComputes[i]->numGridAtoms[0];
    pmeComputes[i]->f_data_dev = f_dev;
    pmeComputes[i]->f_data_host = f_host;
    afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
    afn_host[3*i+1] = f_dev;
    afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
    f_dev += 3*n;
    f_host += 3*n;
  }
  //CmiLock(cuda_lock);
  double before = CmiWallTimer();
  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  // Do not start kernels until the master's potential copy has landed.
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
  traceUserEvent(CUDA_EVENT_ID_PME_TICK);

  for ( int i=0; i<pcsz; ++i ) {
    // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
    // Launch one batched kernel + copy-back + event per EVENT_STRIDE computes.
    if ( i%EVENT_STRIDE == 0 ) {
      int dimy = pcsz - i;
      if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
      int maxn = 0;
      int subtotn = 0;
      for ( int j=0; j<dimy; ++j ) {
        int n = pmeComputes[i+j]->numGridAtoms[0];
        subtotn += n;
        if ( n > maxn ) maxn = n;
      }
      // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
      before = CmiWallTimer();
      cuda_pme_forces(
        bspline_coeffs_dev,
        v_arr_dev, afn_dev+3*i, dimy, maxn, /*
        pmeComputes[i]->a_data_dev,
        pmeComputes[i]->f_data_dev,
        n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
        streams[stream]);
      traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
      before = CmiWallTimer();
      cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
        cudaMemcpyDeviceToHost, streams[stream]);
      traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
      cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
      traceUserEvent(CUDA_EVENT_ID_PME_TICK);
    }
    // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
  }
  //CmiUnlock(cuda_lock);
 } else
#endif // NAMD_CUDA
 {
  for ( int i=0; i<pmeComputes.size(); ++i ) {
    WorkDistrib::messageEnqueueWork(pmeComputes[i]);
    // pmeComputes[i]->ungridForces();
  }
 }
  // submitReductions();  // must follow all ungridForces()

#ifdef NAMD_CUDA
 if ( offload ) {
  // Record the start time and arm the polling loop that enqueues
  // per-compute work as the device finishes each batch.
  forces_time = CmiWallTimer();
  forces_count = ungridForcesCount;
  forces_done_count = 0;
  pmeProxy[this_pe].pollForcesReady();
 }
#endif

  // Expect this many returned potential messages on the next step.
  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
}
02653 
// Begin polling the CUDA runtime for completion of the asynchronous PME
// force kernels / device-to-host force copies (see cuda_check_pme_forces,
// scheduled via CUDA_POLL).  Registered as a Converse conditional callback
// so completion is detected without blocking the scheduler.
void ComputePmeMgr::pollForcesReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++ (reset stale callback timestamps)
  CUDA_POLL(cuda_check_pme_forces,this);
#else
  // This entry point is only ever scheduled from the offload path.
  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
#endif
}
02662 
02663 void ComputePme::atomUpdate() { atomsChanged = 1; }
02664 
// Constructor: registers this compute with the per-PE ComputePmeMgr and
// caches the simulation options that determine how many separate PME
// charge grids are needed (alchemical FEP/TI, locally enhanced sampling,
// and pair-interaction runs each require extra grids).
ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
{
  DebugM(4,"ComputePme created.\n");
  basePriority = PME_PRIORITY;
  setNumPatches(1);

  // Register with the PE-local manager so it can drive doWork()/reductions.
  CProxy_ComputePmeMgr::ckLocalBranch(
	CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);

  SimParameters *simParams = Node::Object()->simParameters;

  qmForcesOn =  simParams->qmForcesOn;
  offload = simParams->PMEOffload;

  alchOn = simParams->alchOn;
  alchFepOn = simParams->alchFepOn;
  alchThermIntOn = simParams->alchThermIntOn;
  alchDecouple = alchOn && simParams->alchDecouple;
  alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
            
  // Grid count (see the partition-to-grid mapping in doWork()):
  //   alchemy: one grid per end state (2); decoupling adds two grids that
  //   hold only the alchemical partitions; a fifth non-alch-only grid is
  //   needed when alchElecLambdaStart != 0 or TI is active.
  if (alchOn) {
    numGrids = 2;
    if (alchDecouple) numGrids += 2;
    if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
  }
  else numGrids = 1;
  lesOn = simParams->lesOn;
  if ( lesOn ) {
    // Locally enhanced sampling: one grid per LES copy.
    lesFactor = simParams->lesFactor;
    numGrids = lesFactor;
  }
  selfOn = 0;
  pairOn = simParams->pairInteractionOn;
  if ( pairOn ) {
    selfOn = simParams->pairInteractionSelf;
    if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
    // pair: both partitions together plus each partition alone (3 grids);
    // self: a single grid holding only partition 1.
    numGrids = selfOn ? 1 : 3;
  }

  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  // dim3 padded to 2*(K3/2+1) -- presumably the in-place r2c FFT layout;
  // confirm against PmeKSpace before changing.
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

#ifdef NAMD_CUDA
  cuda_atoms_offset = 0;
  f_data_host = 0;
  f_data_dev = 0;
 if ( ! offload )
#endif
 {
  // Host path only: per-grid real-space charge spreader/interpolator.
  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
 }

  atomsChanged = 0;
  
  qmLoclIndx = 0;
  qmLocalCharges = 0;
}
02726 
02727 void ComputePme::initialize() {
02728   if (!(patch = PatchMap::Object()->patch(patchID))) {
02729     NAMD_bug("ComputePme used with unknown patch.");
02730   }
02731   positionBox = patch->registerPositionPickup(this);
02732   avgPositionBox = patch->registerAvgPositionPickup(this);
02733   forceBox = patch->registerForceDeposit(this);
02734 #ifdef NAMD_CUDA
02735  if ( offload ) {
02736   myMgr->cuda_atoms_count += patch->getNumAtoms();
02737  }
02738 #endif
02739 }
02740 
// Per-manager setup run after processor assignment: allocates the
// grid bookkeeping arrays (q_arr/q_list/f_arr/fz_arr) and, for CUDA
// offload, the node-shared host/device charge-grid buffers that every
// manager on the node reuses via the master manager.
void ComputePmeMgr::initialize_computes() {

  noWorkCount = 0;
  doWorkCount = 0;
  ungridForcesCount = 0;

  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);

  SimParameters *simParams = Node::Object()->simParameters;

  strayChargeErrors = 0;

#ifdef NAMD_CUDA
 // Select a node-local "master" PE to own the shared CUDA PME state.
 // Preference: a PE that has patches, is not the nonbonded CUDA master PE,
 // and sorts favorably for work diffusion.  NOTE(review): the interplay of
 // these four rules is subtle -- verify against WorkDistrib before editing.
 PatchMap *patchMap = PatchMap::Object();
 int pe = master_pe = CkNodeFirst(CkMyNode());
 for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
    if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
    if ( ! patchMap->numPatchesOnNode(pe) ) continue;
    if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
    if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
    if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
        && pe != deviceCUDA->getMasterPe() ) {
      master_pe = pe;
    }
 }
 if ( ! patchMap->numPatchesOnNode(master_pe) ) {
   NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
 }

 masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
 bool cudaFirst = 1;
 if ( offload ) {
  CmiLock(cuda_lock);
  // Only the first manager to get here (per node) allocates the shared
  // device buffers; later arrivals adopt its pointers below.
  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
 }

 if ( cudaFirst ) {
  nodePmeMgr->master_pe = master_pe;
  nodePmeMgr->masterPmeMgr = masterPmeMgr;
 }
#endif

  // qsize: full padded charge grid; fsize: number of (x,y) grid columns.
  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
  fsize = myGrid.K1 * myGrid.dim2;
  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
#ifdef NAMD_CUDA
 if ( ! offload )
#endif
 {
  // Host path: per-column charge pointers (one entry per grid per column),
  // filled lazily by fill_charges(); q_count tracks entries in q_list.
  q_arr = new float*[fsize*numGrids];
  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
  q_list = new float*[fsize*numGrids];
  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
  q_count = 0;
 }

#ifdef NAMD_CUDA
 if ( cudaFirst || ! offload ) {
#endif
  // f_arr flags the (x,y) columns this PE sends: 0 = active destination,
  // 2 = inactive; active columns are zeroed per decomposition below.
  f_arr = new char[fsize*numGrids];
  // memset to non-zero value has race condition on BlueGene/Q
  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;

  for ( int g=0; g<numGrids; ++g ) {
    char *f = f_arr + g*fsize;
    if ( usePencils ) {
      // Pencil decomposition: enable the columns covered by each active
      // z-pencil this PE communicates with.
      int K1 = myGrid.K1;
      int K2 = myGrid.K2;
      int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
      int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
      int dim2 = myGrid.dim2;
      for (int ap=0; ap<numPencilsActive; ++ap) {
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);  // (unused here)
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
            f[i*dim2+j] = 0;
          }
        }
      }
    } else {
      // Slab decomposition: enable the contiguous column range belonging
      // to each reciprocal-space destination PE.
      int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
      bsize = block1 * myGrid.dim2 * myGrid.dim3;
      for (int pe=0; pe<numGridPes; pe++) {
        if ( ! recipPeDest[pe] ) continue;
        int start = pe * bsize;
        int len = bsize;
        if ( start >= qsize ) { start = 0; len = 0; }
        if ( start + len > qsize ) { len = qsize - start; }
        int zdim = myGrid.dim3;
        int fstart = start / zdim;
        int flen = len / zdim;
        memset(f + fstart, 0, flen*sizeof(char));
        // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
      }
    }
  }
#ifdef NAMD_CUDA
 }
 if ( offload ) {
 if ( cudaFirst ) {

  // Count the columns active on this node; only those receive device
  // charge storage.
  int f_alloc_count = 0;
  for ( int n=fsize, i=0; i<n; ++i ) {
    if ( f_arr[i] == 0 ) {
      ++f_alloc_count;
    }
  }
  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);

  q_arr = new float*[fsize*numGrids];
  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );

  // Per-column pointer tables, built on the host then copied to device.
  float **q_arr_dev_host = new float*[fsize];
  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));

  float **v_arr_dev_host = new float*[fsize];
  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));

  // Each column stores K3 plus order-1 extra z entries -- presumably for
  // B-spline wraparound at the periodic boundary; confirm in the kernel.
  int q_stride = myGrid.K3+myGrid.order-1;
  q_data_size = f_alloc_count * q_stride * sizeof(float);
  ffz_size = (fsize + q_stride) * sizeof(int);

  // tack ffz onto end of q_data to allow merged transfer
  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
  cudaMalloc((void**) &v_data_dev, q_data_size);
  cuda_errcheck("malloc grid data for pme");
  cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);

  // Assign each active column its slice of the packed charge/potential
  // buffers, on both host and device; inactive columns get null pointers.
  f_alloc_count = 0;
  for ( int n=fsize, i=0; i<n; ++i ) {
    if ( f_arr[i] == 0 ) {
      q_arr[i] = q_data_host + f_alloc_count * q_stride;
      q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
      v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
      ++f_alloc_count;
    } else {
      q_arr[i] = 0;
      q_arr_dev_host[i] = 0;
      v_arr_dev_host[i] = 0;
    }
  }

  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
  delete [] q_arr_dev_host;
  delete [] v_arr_dev_host;
  delete [] f_arr;
  // Replace the per-grid flag array with a single-grid column flag array
  // plus the z-entry flags (fz_arr aliases its tail).
  f_arr = new char[fsize + q_stride];
  fz_arr = f_arr + fsize;
  memset(f_arr, 0, fsize + q_stride);
  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));

  cuda_errcheck("initialize grid data for pme");

  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
  cuda_errcheck("initialize bspline coefficients for pme");

  // Publish the shared buffers/sizes to the master manager so later
  // managers on this node can adopt them (see the else branch).
#define XCOPY(X) masterPmeMgr->X = X;
  XCOPY(bspline_coeffs_dev)
  XCOPY(bspline_dcoeffs_dev)
  XCOPY(q_arr)
  XCOPY(q_arr_dev)
  XCOPY(v_arr_dev)
  XCOPY(q_data_size)
  XCOPY(q_data_host)
  XCOPY(q_data_dev)
  XCOPY(v_data_dev)
  XCOPY(ffz_size)
  XCOPY(ffz_host)
  XCOPY(ffz_dev)
  XCOPY(f_arr)
  XCOPY(fz_arr)
#undef XCOPY
  //CkPrintf("pe %d init first\n", CkMyPe());
 } else { // cudaFirst
  //CkPrintf("pe %d init later\n", CkMyPe());
  // Not first on node: adopt the master manager's shared buffers.
#define XCOPY(X) X = masterPmeMgr->X;
  XCOPY(bspline_coeffs_dev)
  XCOPY(bspline_dcoeffs_dev)
  XCOPY(q_arr)
  XCOPY(q_arr_dev)
  XCOPY(v_arr_dev)
  XCOPY(q_data_size)
  XCOPY(q_data_host)
  XCOPY(q_data_dev)
  XCOPY(v_data_dev)
  XCOPY(ffz_size)
  XCOPY(ffz_host)
  XCOPY(ffz_dev)
  XCOPY(f_arr)
  XCOPY(fz_arr)
#undef XCOPY
 } // cudaFirst
  CmiUnlock(cuda_lock);
 } else // offload
#endif // NAMD_CUDA
 {
  // Host-only path: flags for which z entries have been touched.
  fz_arr = new char[myGrid.K3+myGrid.order-1];
 }

#if 0 && USE_PERSISTENT
  recvGrid_handle = NULL;
#endif
}
02959 
02960 ComputePme::~ComputePme()
02961 {
02962 #ifdef NAMD_CUDA
02963   if ( ! offload )
02964 #endif
02965   {
02966     for ( int g=0; g<numGrids; ++g ) delete myRealSpace[g];
02967   }
02968 }
02969 
#if 0 && USE_PERSISTENT 
// DISABLED experimental path: pre-creates compressed persistent Charm++
// channels for the grid messages sent to each active z-pencil, sizing each
// channel from the current f_arr/fz_arr occupancy so the float payload can
// be compressed in flight.
void ComputePmeMgr::setup_recvgrid_persistent() 
{
    int K1 = myGrid.K1;
    int K2 = myGrid.K2;
    int dim2 = myGrid.dim2;
    int dim3 = myGrid.dim3;
    int block1 = myGrid.block1;
    int block2 = myGrid.block2;

    CkArray *zPencil_local = zPencil.ckLocalBranch();
    recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
    for (int ap=0; ap<numPencilsActive; ++ap) {
        // Column range covered by this pencil.
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);
        // f is changing
        // Number of occupied columns across all grids in this pencil.
        int fcount = 0;
        for ( int g=0; g<numGrids; ++g ) {
            char *f = f_arr + g*fsize;
            for ( int i=ibegin; i<iend; ++i ) {
                for ( int j=jbegin; j<jend; ++j ) {
                    fcount += f[i*dim2+j];
                }
            }
        }
        // Number of occupied z entries.
        int zlistlen = 0;
        for ( int i=0; i<myGrid.K3; ++i ) {
            if ( fz_arr[i] ) ++zlistlen;
        }
        int hd = ( fcount? 1 : 0 );  // has data?
        int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
        // Only the float charge payload (after the headers/flags) is
        // compressed; sizes must mirror the PmeGridMsg layout.
        int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
        int compress_size = sizeof(float)*hd*fcount*zlistlen;
        int size = compress_start +  compress_size  + PRIORITY_SIZE/8+6;
        recvGrid_handle[ap] =  CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }
}
#endif
03013 
03014 int ComputePme::noWork() {
03015 
03016   if ( patch->flags.doFullElectrostatics ) {
03017     // In QM/MM simulations, atom charges form QM regions need special treatment.
03018     if ( qmForcesOn ) {
03019         return 1;
03020     }
03021     if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;  // work to do, enqueue as usual
03022     myMgr->heldComputes.add(this);
03023     return 1;  // don't enqueue yet
03024   }
03025 
03026   positionBox->skip();
03027   forceBox->skip();
03028 
03029   if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
03030     myMgr->noWorkCount = 0;
03031     myMgr->reduction->submit();
03032   }
03033 
03034   atomsChanged = 0;
03035 
03036   return 1;  // no work for this step
03037 }
03038 
03039 void ComputePmeMgr::addRecipEvirClient() {
03040   ++recipEvirClients;
03041 }
03042 
03043 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
03044   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03045   for ( int g=0; g<numGrids; ++g ) {
03046     evir[g] += msg->evir[g];
03047   }
03048   delete msg;
03049   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03050   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03051 }
03052 
03053 void ComputePme::doQMWork() {
03054     
03055 //     iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
03056     
03057     
03058     int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
03059     const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
03060     const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
03061     const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
03062     
03063     const CompAtomExt *xExt = patch->getCompAtomExtInfo();
03064     
03065     // Determine number of qm atoms in this patch for the current step.
03066     numLocalQMAtoms = 0;
03067     for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
03068         if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
03069             numLocalQMAtoms++;
03070         }
03071     }
03072     
03073     // We prepare a charge vector with QM charges for use in the PME calculation.
03074     
03075     // Clears data from last step, if there is any.
03076     if (qmLoclIndx != 0)
03077         delete [] qmLoclIndx;
03078     if (qmLocalCharges != 0)
03079         delete [] qmLocalCharges;
03080     
03081     qmLoclIndx = new int[numLocalQMAtoms] ;
03082     qmLocalCharges = new Real[numLocalQMAtoms] ;
03083     
03084     // I am assuming there will be (in general) more QM atoms among all QM groups
03085     // than MM atoms in a patch.
03086     int procAtms = 0;
03087     
03088     for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
03089         
03090         for (int i=0; i<numQMAtms; i++) {
03091             
03092             if (qmAtmIndx[i] == xExt[paIter].id) {
03093                 
03094                 qmLoclIndx[procAtms] = paIter ;
03095                 qmLocalCharges[procAtms] = qmAtmChrg[i];
03096                 
03097                 procAtms++;
03098                 break;
03099             }
03100             
03101         }
03102         
03103         if (procAtms == numLocalQMAtoms)
03104             break;
03105     }
03106     
03107     doWork();
03108     return ;
03109 }
03110 
// Main per-step entry point.  Runs in two phases distinguished by
// basePriority: the first pass (home priority) gathers and scales charges,
// partitions atoms onto the PME grids, accumulates the self-energy, and
// spreads charges (host) or stages atom data (CUDA offload); the second
// pass (after reciprocal-space results return) interpolates forces via
// ungridForces().
void ComputePme::doWork()
{
  DebugM(4,"Entering ComputePme::doWork().\n");

  if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
    // Second pass: forces are ready; reset priority for the next step.
#ifdef NAMD_CUDA
    basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
#else
    basePriority = PME_PRIORITY;
#endif
    ungridForces();
    // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
    if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
    return;
  }
  // First pass: when re-enqueued we will take the branch above.
  basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);

#ifdef TRACE_COMPUTE_OBJECTS
    double traceObjStartTime = CmiWallTimer();
#endif

  // allocate storage
  numLocalAtoms = patch->getNumAtoms();

  Lattice &lattice = patch->flags.lattice;

  // One extra slab of atom storage per grid when atoms must be copied into
  // per-grid subsets (multi-grid or self-interaction runs).
  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
  localData = localData_alloc.begin();
  localPartition_alloc.resize(numLocalAtoms);
  localPartition = localPartition_alloc.begin();

  int g;
  for ( g=0; g<numGrids; ++g ) {
    localGridData[g] = localData + numLocalAtoms*(g+1);
  }

  // get positions and charges
  PmeParticle * data_ptr = localData;
  unsigned char * part_ptr = localPartition;
  // Charges are pre-scaled by sqrt(Coulomb * scaling / dielectric) so that
  // charge products carry the full electrostatic prefactor.
  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
                                * ComputeNonbondedUtil::dielectric_1 );

  {
    CompAtom *x = positionBox->open();
    // CompAtomExt *xExt = patch->getCompAtomExtInfo();
    if ( patch->flags.doMolly ) {
      // MOLLY: use averaged positions instead of instantaneous ones.
      positionBox->close(&x);
      x = avgPositionBox->open();
    }
    int numAtoms = patch->getNumAtoms();

    for(int i=0; i<numAtoms; ++i)
    {
      data_ptr->x = x[i].position.x;
      data_ptr->y = x[i].position.y;
      data_ptr->z = x[i].position.z;
      data_ptr->cg = coulomb_sqrt * x[i].charge;
      ++data_ptr;
      *part_ptr = x[i].partition;
      ++part_ptr;
    }

    // QM loop to overwrite charges of QM atoms.
    // They are zero for NAMD, but are updated in ComputeQM.
    if ( qmForcesOn ) {
        
        for(int i=0; i<numLocalQMAtoms; ++i)
        {
          localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
        }
        
    }
    
    if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
    else { positionBox->close(&x); }
  }

  // copy to other grids if needed
  if ( (alchOn && (!alchDecouple)) || lesOn ) {
    for ( g=0; g<numGrids; ++g ) {
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          // for FEP/TI: grid 0 gets non-alch + partition 1;
          // grid 1 gets non-alch + partition 2;
          // grid 2 (only if called for with numGrids=3) gets only non-alch
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else if ( alchOn && alchDecouple) {
    // alchemical decoupling: four grids
    // g=0: partition 0 and partition 1
    // g=1: partition 0 and partition 2
    // g=2: only partition 1 atoms
    // g=3: only partition 2 atoms
    // plus one grid g=4, only partition 0, if numGrids=5
    for ( g=0; g<2; ++g ) {  // same as before for first 2
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
    for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == (g-1) ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
    for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
      // numGrids=5 only if alchElecLambdaStart > 0
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else if ( selfOn ) {
    // Pair-interaction self mode: single grid, partition 1 only.
    if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
    g = 0;
    PmeParticle *lgd = localGridData[g];
    int nga = 0;
    for(int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  } else if ( pairOn ) {
    // Pair-interaction mode: grid 0 holds both partitions, grids 1 and 2
    // hold partitions 1 and 2 individually.
    if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
    g = 0;
    PmeParticle *lgd = localGridData[g];
    int nga = 0;
    for(int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
    for ( g=1; g<3; ++g ) {
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == g ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else {
    // Single-grid case: use the gathered data in place, no copy.
    if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
    localGridData[0] = localData;
    numGridAtoms[0] = numLocalAtoms;
  }

 if ( ! myMgr->doWorkCount ) {
  // First compute on this PE this step: reset the manager's shared state.
  myMgr->doWorkCount = myMgr->pmeComputes.size();

#ifdef NAMD_CUDA
 if ( !  offload )
#endif // NAMD_CUDA
 {
  // Clear the z-occupancy flags and any previously used charge columns.
  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );

  for (int i=0; i<myMgr->q_count; ++i) {
    memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
  }
 }

  for ( g=0; g<numGrids; ++g ) {
    myMgr->evir[g] = 0;
  }

  myMgr->strayChargeErrors = 0;

  myMgr->compute_sequence = sequence();
 }

  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");

  int strayChargeErrors = 0;

  // calculate self energy
  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
  for ( g=0; g<numGrids; ++g ) {
    // Standard Ewald self-energy correction: -(kappa/sqrt(pi)) * sum q_i^2
    // (charges already carry the Coulomb prefactor via coulomb_sqrt).
    BigReal selfEnergy = 0;
    data_ptr = localGridData[g];
    int i;
    for(i=0; i<numGridAtoms[g]; ++i)
    {
      selfEnergy += data_ptr->cg * data_ptr->cg;
      ++data_ptr;
    }
    selfEnergy *= -1. * ewaldcof / SQRT_PI;
    myMgr->evir[g][0] += selfEnergy;

    float **q = myMgr->q_arr + g*myMgr->fsize;
    char *f = myMgr->f_arr + g*myMgr->fsize;

    // Convert coordinates to (possibly fractional) grid units.
    scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
#ifdef NAMD_CUDA
   if ( offload ) {
    // Stage atom data for the device: 7 floats per atom (fractional x/y/z
    // within the grid cell, charge, and the integer cell coordinates).
    if ( myMgr->cuda_atoms_alloc == 0 ) {  // first call
      int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
      cuda_errcheck("before malloc atom data for pme");
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      cuda_errcheck("malloc atom data for pme");
      myMgr->cuda_atoms_count = 0;
    }
    cuda_atoms_offset = myMgr->cuda_atoms_count;
    int n = numGridAtoms[g];
    myMgr->cuda_atoms_count += n;
    if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
      // Grow the pinned-host and device buffers, preserving data already
      // staged by earlier computes this step.
      CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
                        CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
      cuda_errcheck("before malloc expanded atom data for pme");
      int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
      const float *a_data_host_old = myMgr->a_data_host;
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cuda_errcheck("malloc expanded host atom data for pme");
      memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
      cudaFreeHost((void*) a_data_host_old);
      cuda_errcheck("free expanded host atom data for pme");
      cudaFree(myMgr->a_data_dev);
      cuda_errcheck("free expanded dev atom data for pme");
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      cuda_errcheck("malloc expanded dev atom data for pme");
    }
    float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
    data_ptr = localGridData[g];
    double order_1 = myGrid.order - 1;
    double K1 = myGrid.K1;
    double K2 = myGrid.K2;
    double K3 = myGrid.K3;
    int found_negative = 0;
    for ( int i=0; i<n; ++i ) {
      if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
        found_negative = 1;
        // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
      }
      double x_int = (int) data_ptr[i].x;
      double y_int = (int) data_ptr[i].y;
      double z_int = (int) data_ptr[i].z;
      a_data_host[7*i  ] = data_ptr[i].x - x_int;  // subtract in double precision
      a_data_host[7*i+1] = data_ptr[i].y - y_int;
      a_data_host[7*i+2] = data_ptr[i].z - z_int;
      a_data_host[7*i+3] = data_ptr[i].cg;
      // Shift cell indices back by order-1 with periodic wrap.
      x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
      y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
      z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
      a_data_host[7*i+4] = x_int;
      a_data_host[7*i+5] = y_int;
      a_data_host[7*i+6] = z_int;
    }
    if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
   } else
#endif // NAMD_CUDA
   {
    // Host path: spread charges onto the grid columns directly.
    myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
    myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
   }
  }
  myMgr->strayChargeErrors += strayChargeErrors;

#ifdef TRACE_COMPUTE_OBJECTS
    traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
#endif

 if ( --(myMgr->doWorkCount) == 0 ) {
// cudaDeviceSynchronize();  // XXXX
#ifdef NAMD_CUDA
  if ( offload ) {
    // Last compute on this PE: submit the staged charges.  A node-wide
    // lock plus a work deque serializes device submission across PEs;
    // the first non-busy PE becomes the drainer of queued submissions.
    ComputePmeMgr::cuda_submit_charges_args args;
    args.mgr = myMgr;
    args.lattice = &lattice;
    args.sequence = sequence();
    CmiLock(ComputePmeMgr::cuda_lock);
    if ( ComputePmeMgr::cuda_busy ) {
      ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
    } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
      // avoid adding work to nonbonded data preparation pe
      args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
    } else {
      ComputePmeMgr::cuda_busy = true;
      while ( 1 ) {
        CmiUnlock(ComputePmeMgr::cuda_lock);
        args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
        CmiLock(ComputePmeMgr::cuda_lock);
        if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
          args = ComputePmeMgr::cuda_submit_charges_deque.front();
          ComputePmeMgr::cuda_submit_charges_deque.pop_front();
        } else {
          ComputePmeMgr::cuda_busy = false;
          break;
        }
      }
    }
    CmiUnlock(ComputePmeMgr::cuda_lock);
  } else
#endif // NAMD_CUDA
  {
    myMgr->chargeGridReady(lattice,sequence());
  }
 }
 atomsChanged = 0;
}
03432 
03433 #ifdef NAMD_CUDA
03434 
// Copy this PE's staged atom data to the device and launch the
// charge-spreading kernel, then hand off to chargeGridSubmitted().
// Must be called while holding ComputePmeMgr::cuda_lock (see doWork()).
void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {

    int n = cuda_atoms_count;
    //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
    cuda_atoms_count = 0;

    const double before = CmiWallTimer();
    // 7 floats per atom: fractional x/y/z, charge, and integer grid cell
    // coordinates, packed by doWork().
    cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
                          cudaMemcpyHostToDevice, streams[stream]);
    const double after = CmiWallTimer();

    // Do not spread charges until the grid-clearing memset has completed.
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);

    cuda_pme_charges(
      bspline_coeffs_dev,
      q_arr_dev, ffz_dev, ffz_dev + fsize,
      a_data_dev, n,
      myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
      streams[stream]);
    const double after2 = CmiWallTimer();

    chargeGridSubmitted(lattice,sequence);  // must be inside lock

    masterPmeMgr->charges_time = before;
    traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
    traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
}
03462 
03463 void cuda_check_pme_charges(void *arg, double walltime) {
03464   ComputePmeMgr *argp = (ComputePmeMgr *) arg;
03465 
03466   cudaError_t err = cudaEventQuery(argp->end_charges);
03467   if ( err == cudaSuccess ) {
03468     traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
03469     argp->charges_time = walltime - argp->charges_time;
03470     argp->sendChargeGridReady();
03471     argp->check_charges_count = 0;
03472   } else if ( err != cudaErrorNotReady ) {
03473     cuda_errcheck("in cuda_check_pme_charges");
03474     NAMD_bug("cuda_errcheck missed error in cuda_check_pme_charges");
03475   } else if ( ++(argp->check_charges_count) >= count_limit ) {
03476     char errmsg[256];
03477     sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
03478             argp->check_charges_count, walltime - argp->charges_time,
03479             argp->saved_sequence);
03480     cuda_errcheck(errmsg);
03481     NAMD_die(errmsg);
03482   } else {
03483     CcdCallBacksReset(0,walltime);  // fix Charm++
03484     CUDA_POLL(cuda_check_pme_charges, arg);
03485   }
03486 }
03487 
// Called (under cuda_lock) after this manager's charge kernel has been
// launched.  Saves lattice/sequence for the later recvChargeGridReady(), and
// once every manager on the node has submitted (chargeGridSubmittedCount
// reaches zero) schedules the device->host grid download, the grid re-zeroing
// for the next step, and starts polling on the master PE.
void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
  saved_lattice = &lattice;
  saved_sequence = sequence;

  // cudaDeviceSynchronize();  //  XXXX TESTING
  //int q_stride = myGrid.K3+myGrid.order-1;
  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
  //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
  //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
  //  }
  //}
  //CmiLock(cuda_lock);

 if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
  // last submitter on this node: fence all streams, then queue the download
  double before = CmiWallTimer();
  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
                        cudaMemcpyDeviceToHost, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  // end_charges fires when the download has finished (polled elsewhere)
  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
  //CmiUnlock(cuda_lock);
  // cudaDeviceSynchronize();  //  XXXX TESTING
  // cuda_errcheck("after memcpy grid to host");

  SimParameters *simParams = Node::Object()->simParameters;
  if ( ! simParams->useCUDA2 ) {
    // legacy CUDA path: hand the device back to the nonbonded computes
    CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
    cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
  }

  // master PE starts polling the end_charges event
  pmeProxy[master_pe].pollChargeGridReady();
 }
}
03524 
03525 void ComputePmeMgr::sendChargeGridReady() {
03526   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03527     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03528     int cs = mgr->pmeComputes.size();
03529     if ( cs ) {
03530       mgr->ungridForcesCount = cs;
03531       mgr->recipEvirCount = mgr->recipEvirClients;
03532       masterPmeMgr->chargeGridSubmittedCount++;
03533     }
03534   }
03535   pmeProxy[master_pe].recvChargeGridReady();
03536 }
03537 #endif // NAMD_CUDA
03538 
// Arms the cuda_check_pme_charges polling callback on this PE.  Only
// meaningful in CUDA builds; calling it otherwise is a program error.
void ComputePmeMgr::pollChargeGridReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
  CUDA_POLL(cuda_check_pme_charges,this);
#else
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#endif
}
03547 
// Entry method on the master PE: resume chargeGridReady() with the
// lattice/sequence stashed earlier by chargeGridSubmitted().
void ComputePmeMgr::recvChargeGridReady() {
  chargeGridReady(*saved_lattice,saved_sequence);
}
03551 
// Charge spreading for this step is complete: validate/import the flag
// arrays (CUDA offload path), reset per-step counters, fold the periodic
// z-flag wrap, and forward the grid to the FFT side (pencils or slabs).
void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {

#ifdef NAMD_CUDA
 if ( offload ) {
  // import the per-z-plane flags the GPU wrote after the first fsize
  // entries of ffz_host, checking that every flag is 0 or 1
  int errcount = 0;
  int q_stride = myGrid.K3+myGrid.order-1;
  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
    f_arr[j] = ffz_host[j];
    if ( ffz_host[j] & ~1 ) ++errcount;
  }
  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
 }
#endif
  recipEvirCount = recipEvirClients;
  ungridForcesCount = pmeComputes.size();

  // fold the periodic wrap planes (K3..K3+order-2) into the first order-1
  // z flags so the send routines see a single consistent flag per plane
  for (int j=0; j<myGrid.order-1; ++j) {
    fz_arr[j] |= fz_arr[myGrid.K3+j];
  }

  if ( usePencils ) {
    sendPencils(lattice,sequence);
  } else {
    sendData(lattice,sequence);
  }
}
03578 
03579 
// Pack and send the local charge-grid patches for active pencils
// [first..last] to their z-pencil FFT objects.  For each pencil a
// PmeGridMsg is built containing the list of occupied z planes (zlist),
// the per-cell flags, and the charge values for flagged cells only.
// @param first,last  inclusive range into activePencils[]
// @param lattice     current lattice, copied into each message
// @param sequence    timestep sequence (unused here; compute_sequence is sent)
// @param sourcepe    PE recorded as the message source
void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

#if 0 && USE_PERSISTENT
    if (recvGrid_handle== NULL) setup_recvgrid_persistent();
#endif
  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;

  // int savedMessages = 0;
  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  for (int ap=first; ap<=last; ++ap) {
    // (x,y) block coordinates and grid index ranges of this pencil
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);

    // count flagged (occupied) cells; on the offload path also import the
    // GPU-written flags and verify each is 0 or 1
    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        int k = i*dim2+j;
        f[k] = ffz_host[k];
        fcount += f[k];
        if ( ffz_host[k] & ~1 ) ++errcount;
       }
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
     } else
#endif
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        fcount += f[i*dim2+j];
       }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! pencilActive[ib*yBlocks+jb] )
      NAMD_bug("PME activePencils list inconsistent");

    // number of occupied z planes (shared by all cells in the message)
    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    int hd = ( fcount? 1 : 0 );  // has data?
    // if ( ! hd ) ++savedMessages;

    
    // varsize message: arrays are zero-length when there is no data
    PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
        hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
    msg->sourceNode = sourcepe;
    msg->hasData = hd;
    msg->lattice = lattice;
   if ( hd ) {
#if 0
    msg->start = fstart;
    msg->len = flen;
#else
    msg->start = -1;   // obsolete?
    msg->len = -1;   // obsolete?
#endif
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // pack flags for every cell, charges only for flagged cells; the
    // periodic wrap planes are folded into the first order-1 entries first
    char *fmsg = msg->fgrid;
    float *qmsg = msg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
      float **q = q_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        *(fmsg++) = f[i*dim2+j];
        if( f[i*dim2+j] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i*dim2+j][zlist[k]];
          }
        }
       }
      }
    }
   }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // route to the node hosting the target pencil; the node manager
    // forwards to the array element
    msg->destElem=CkArrayIndex3D(ib,jb,0);
    CProxy_PmePencilMap lzm = npMgr->zm;
    int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
    int destnode = CmiNodeOf(destproc);
    
#if  0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    pmeNodeProxy[destnode].recvZGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if 0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    zPencil(ib,jb,0).recvGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }


  // if ( savedMessages ) {
  //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
  // }

}
03720 
03721 
// Entry-method trampoline: forward a single-pencil send request to the
// node-level manager (used by the CUDA offload path in sendPencils()).
void ComputePmeMgr::sendPencilsHelper(int iter) {
  nodePmeMgr->sendPencilsHelper(iter);
}
03725 
// Sends exactly one active pencil (index iter) on behalf of the node's
// master manager, using the lattice/sequence/sourcepe it stashed in its
// sendDataHelper_* members.  CUDA builds only.
void NodePmeMgr::sendPencilsHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
#else
  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif
}
03734 
// Send all active pencils for this step.  On the CUDA offload path each
// pencil is dispatched through a helper entry method (so the work runs on
// the right PE/node); otherwise sendPencilsPart() does them all inline.
// Afterwards, report any cells whose flag value 3 marks charges that were
// spread into an inactive pencil ("stray charges").
void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {

  // stash arguments for the helper entry methods
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int ap=0; ap < numPencilsActive; ++ap ) {
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      int ib = activePencils[ap].i;
      int jb = activePencils[ap].j;
      int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
      pmeProxy[destproc].sendPencilsHelper(ap);
#else
      pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
#endif
    }
  } else
#endif
  {
    sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
  }

  if ( strayChargeErrors ) {
   strayChargeErrors = 0;
   // scan inactive pencils for cells flagged 3 (charge spread where no
   // pencil listens), downgrade them to 2, and log their (x,y) positions
   iout << iERROR << "Stray PME grid charges detected: "
        << CkMyPe() << " sending to (x,y)";
   int K1 = myGrid.K1;
   int K2 = myGrid.K2;
   int dim2 = myGrid.dim2;
   int block1 = myGrid.block1;
   int block2 = myGrid.block2;
   for (int ib=0; ib<xBlocks; ++ib) {
    for (int jb=0; jb<yBlocks; ++jb) {
     int ibegin = ib*block1;
     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
     int jbegin = jb*block2;
     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
     int flen = numGrids * (iend - ibegin) * (jend - jbegin);

     for ( int g=0; g<numGrids; ++g ) {
       char *f = f_arr + g*fsize;
       if ( ! pencilActive[ib*yBlocks+jb] ) {
           for ( int i=ibegin; i<iend; ++i ) {
            for ( int j=jbegin; j<jend; ++j ) {
             if ( f[i*dim2+j] == 3 ) {
               f[i*dim2+j] = 2;
               iout << " (" << i << "," << j << ")";
             }
            }
           }
       }
     }
    }
   }
   iout << "\n" << endi;
  }
 
}
03796 
03797 
03798 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
03799 
03800   int K1 = myGrid.K1;
03801   int K2 = myGrid.K2;
03802   int dim2 = myGrid.dim2;
03803   int dim3 = myGrid.dim3;
03804   int block1 = myGrid.block1;
03805   int block2 = myGrid.block2;
03806 
03807   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03808   int ib = msg->sourceNode / yBlocks;
03809   int jb = msg->sourceNode % yBlocks;
03810 
03811   int ibegin = ib*block1;
03812   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03813   int jbegin = jb*block2;
03814   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03815 
03816   int zlistlen = msg->zlistlen;
03817   int *zlist = msg->zlist;
03818   float *qmsg = msg->qgrid;
03819   int g;
03820   for ( g=0; g<numGrids; ++g ) {
03821     char *f = f_arr + g*fsize;
03822     float **q = q_arr + g*fsize;
03823     for ( int i=ibegin; i<iend; ++i ) {
03824      for ( int j=jbegin; j<jend; ++j ) {
03825       if( f[i*dim2+j] ) {
03826         f[i*dim2+j] = 0;
03827         for ( int k=0; k<zlistlen; ++k ) {
03828           q[i*dim2+j][zlist[k]] = *(qmsg++);
03829         }
03830         for (int h=0; h<myGrid.order-1; ++h) {
03831           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03832         }
03833       }
03834      }
03835     }
03836   }
03837 }
03838 
03839 
// Slab-decomposition counterpart of sendPencilsPart(): pack and send this
// PE's charge-grid contribution for grid PEs [first..last] (in gridPeOrder).
// Each PmeGridMsg carries the occupied z-plane list, per-cell flags, and
// charges for flagged cells only.
// @param first,last  inclusive range into gridPeOrder[]
// @param lattice     current lattice, copied into each message
// @param sequence    timestep sequence (compute_sequence is what is sent)
// @param sourcepe    PE recorded as the message source
// @param errors      nonzero forces scanning even destinations with no data,
//                    so stray-charge cells (flag == 3) can be reported
void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;

  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
  for (int j=first; j<=last; j++) {
    int pe = gridPeOrder[j];  // different order
    if ( ! recipPeDest[pe] && ! errors ) continue;
    // clip this destination's slab [start, start+len) to the local grid
    int start = pe * bsize;
    int len = bsize;
    if ( start >= qsize ) { start = 0; len = 0; }
    if ( start + len > qsize ) { len = qsize - start; }
    int zdim = myGrid.dim3;
    int fstart = start / zdim;
    int flen = len / zdim;
    int fcount = 0;
    int i;

    // count flagged cells (importing GPU flags on the offload path)
    int g;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( i=0; i<flen; ++i ) {
        f[i] = ffz_host[fstart+i];
        fcount += f[i];
        if ( ffz_host[fstart+i] & ~1 ) ++errcount;
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
     } else
#endif
      for ( i=0; i<flen; ++i ) {
        fcount += f[i];
      }
      if ( ! recipPeDest[pe] ) {
        // destination receives nothing: report stray charges (flag == 3)
        // by z plane, downgrading them to 2
        int errfound = 0;
        for ( i=0; i<flen; ++i ) {
          if ( f[i] == 3 ) {
            errfound = 1;
            break;
          }
        }
        if ( errfound ) {
          iout << iERROR << "Stray PME grid charges detected: "
                << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
          int iz = -1;
          for ( i=0; i<flen; ++i ) {
            if ( f[i] == 3 ) {
              f[i] = 2;
              int jz = (i+fstart)/myGrid.K2;
              if ( iz != jz ) { iout << " " << jz;  iz = jz; }
            }
          }
          iout << "\n" << endi;
        }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! recipPeDest[pe] ) continue;

    // number of occupied z planes shared by all cells in the message
    int zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    PmeGridMsg *msg = new (zlistlen, flen*numGrids,
                                fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;

    msg->sourceNode = sourcepe;
    msg->lattice = lattice;
    msg->start = fstart;
    msg->len = flen;
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // pack flags (bulk copy) plus charges for flagged cells; periodic wrap
    // planes are folded into the first order-1 entries before packing
    float *qmsg = msg->qgrid;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
      CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
      float **q = q_arr + fstart + g*fsize;
      for ( i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i][h] += q[i][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i][zlist[k]];
          }
        }
      }
    }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    pmeProxy[gridPeMap[pe]].recvGrid(msg);
  }

}
03948 
// Entry-method trampoline: forward a single-destination send request to the
// node-level manager (used by the CUDA offload path in sendData()).
void ComputePmeMgr::sendDataHelper(int iter) {
  nodePmeMgr->sendDataHelper(iter);
}
03952 
// Sends exactly one grid-PE slab (index iter) on behalf of the node's
// master manager, using the arguments it stashed in its sendDataHelper_*
// members.  CUDA builds only.
void NodePmeMgr::sendDataHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
#else
  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif
}
03961 
// Send this PE's charge grid to all grid PEs (slab decomposition).  On the
// CUDA offload path each destination is dispatched through a helper entry
// method; otherwise sendDataPart() handles all of them inline.
void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {

  // stash arguments for the helper entry methods; latch and clear the
  // stray-charge flag so errors are reported exactly once
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();
  sendDataHelper_errors = strayChargeErrors;
  strayChargeErrors = 0;

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int i=0; i < numGridPes; ++i ) {
      int pe = gridPeOrder[i];  // different order
      if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      pmeProxy[gridPeMap[pe]].sendDataHelper(i);
#else
      pmeNodeProxy[CkMyNode()].sendDataHelper(i);
#endif
    }
  } else
#endif
  {
    sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
  }
 
}
03989 
03990 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
03991 
03992   int zdim = myGrid.dim3;
03993   int flen = msg->len;
03994   int fstart = msg->start;
03995   int zlistlen = msg->zlistlen;
03996   int *zlist = msg->zlist;
03997   float *qmsg = msg->qgrid;
03998   int g;
03999   for ( g=0; g<numGrids; ++g ) {
04000     char *f = msg->fgrid + g*flen;
04001     float **q = q_arr + fstart + g*fsize;
04002     for ( int i=0; i<flen; ++i ) {
04003       if ( f[i] ) {
04004         f[i] = 0;
04005         for ( int k=0; k<zlistlen; ++k ) {
04006           q[i][zlist[k]] = *(qmsg++);
04007         }
04008         for (int h=0; h<myGrid.order-1; ++h) {
04009           q[i][myGrid.K3+h] = q[i][h];
04010         }
04011       }
04012     }
04013   }
04014 }
04015 
// Extract per-atom reciprocal-space forces from the charge grid (or from the
// GPU result buffer on the offload path), apply per-grid scaling for
// alchemical / LES / self / pair modes, and add the result into the patch's
// slow-force array.  Atoms were distributed to grids by partition, so each
// branch below must consume gridResults in exactly the same atom order used
// when the grids were filled.
void ComputePme::ungridForces() {

    if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
 
    SimParameters *simParams = Node::Object()->simParameters;

    // one Vector per atom; a second bank is needed when per-grid results
    // must be accumulated separately before merging into localResults
    localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
    Vector *localResults = localResults_alloc.begin();
    Vector *gridResults;
    if ( alchOn || lesOn || selfOn || pairOn ) {
      for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
      gridResults = localResults + numLocalAtoms;
    } else {
      gridResults = localResults;
    }

    Vector pairForce = 0.;
    Lattice &lattice = patch->flags.lattice;
    int g = 0;
    if(!simParams->commOnly) {
    for ( g=0; g<numGrids; ++g ) {
#ifdef NETWORK_PROGRESS
      CmiNetworkProgress();
#endif

#ifdef NAMD_CUDA
      if ( offload ) {
        // forces were computed on the GPU: copy from the host buffer,
        // screening for the NaN bit pattern the kernel writes on error
        int errfound = 0;
        for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
          // Neither isnan() nor x != x worked when testing on Cray; this does.
          if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }  // CUDA NaN
          gridResults[i].x = f_data_host[3*i];
          gridResults[i].y = f_data_host[3*i+1];
          gridResults[i].z = f_data_host[3*i+2];
        }
        if ( errfound ) {
          // zero the bad entries and report how many atoms were affected
          int errcount = 0;
          for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
            float f = f_data_host[3*i];
            if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {  // CUDA NaN
              ++errcount;
              gridResults[i] = 0.;
            }
          }
          iout << iERROR << "Stray PME grid charges detected: "
                << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
        }
      } else
#endif // NAMD_CUDA
        {
          // CPU path: interpolate forces from the charge grid
          myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
        }
      scale_forces(gridResults, numGridAtoms[g], lattice);
      
      if (alchOn) {
        // per-grid lambda scaling; grids 0/1 are the up/down partitions,
        // grid 2 (and 3,4 with alchDecouple) carry correction terms
        float scale = 1.;
        BigReal elecLambdaUp, elecLambdaDown;
        if ( simParams->alchFepWhamOn ) {
          if ( simParams->alchFepElecOn ) {
            elecLambdaUp = simParams->alchElecLambda;
            elecLambdaDown = 1.0 - simParams->alchElecLambda;
          }
          else {
            elecLambdaUp = 0.0;
            elecLambdaDown = 1.0;
          }
        }
        else {
          BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
          myMgr->alchLambda = alchLambda;  // saved for submitReductions()
          elecLambdaUp = simParams->getElecLambda(alchLambda);
          elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
        }
        
        if ( g == 0 ) scale = elecLambdaUp;
        else if ( g == 1 ) scale = elecLambdaDown;
        else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);

        if (alchDecouple) {
          if ( g == 2 ) scale = 1 - elecLambdaUp;
          else if ( g == 3 ) scale = 1 - elecLambdaDown;
          else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
        }
        // merge this grid's forces into localResults, walking atoms in the
        // same partition-filtered order used to fill the grid
        int nga = 0;
        if (!alchDecouple) {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
              // (g=2: only partition 0)
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
        else {  // alchDecouple
          if ( g < 2 ) {
            for(int i=0; i<numLocalAtoms; ++i) {
              if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
                // g = 0: partition 0 or partition 1
                // g = 1: partition 0 or partition 2
                localResults[i] += gridResults[nga++] * scale;
              }
            }
          }
          else {
            for(int i=0; i<numLocalAtoms; ++i) {
              if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
                // g = 2: partition 1 only
                // g = 3: partition 2 only
                // g = 4: partition 0 only
                localResults[i] += gridResults[nga++] * scale;
              }
            }
          }
        }
      } else if ( lesOn ) {
        float scale = 1.;
        if ( alchFepOn ) {
          if(simParams->alchFepWhamOn) {
            if(simParams->alchFepElecOn) {
              if ( g == 0 ) scale = simParams->alchElecLambda;
              else if ( g == 1 ) scale = 1. - simParams->alchElecLambda;
            }
            else {
              if ( g == 0 ) scale = 0.0;
              else if ( g == 1 ) scale = 1.0;
            }
          }
          else {
            BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
            myMgr->alchLambda = alchLambda;  // saved for submitReductions()
            if ( g == 0 ) scale = alchLambda;
            else if ( g == 1 ) scale = 1. - alchLambda;
          }
        } else if ( lesOn ) {
          scale = 1.0 / (float)lesFactor;
        }
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
            localResults[i] += gridResults[nga++] * scale;
          }
        }
      } else if ( selfOn ) {
        // NOTE(review): lgd appears unused in this branch
        PmeParticle *lgd = localGridData[g];
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 1 ) {
            pairForce += gridResults[nga];  // should add up to almost zero
            localResults[i] += gridResults[nga++];
          }
        }
      } else if ( pairOn ) {
        // pair interaction: grid 0 holds both partitions, grids 1 and 2
        // hold the single-partition terms subtracted off
        if ( g == 0 ) {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 1 ) {
              pairForce += gridResults[nga];
            }
            if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
              localResults[i] += gridResults[nga++];
            }
          }
        } else if ( g == 1 ) {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == g ) {
              pairForce -= gridResults[nga];  // should add up to almost zero
              localResults[i] -= gridResults[nga++];
            }
          }
        } else {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == g ) {
              localResults[i] -= gridResults[nga++];
            }
         }
        }
      }
    }
    }

    Vector *results_ptr = localResults;
    
    // add in forces
    {
      Results *r = forceBox->open();
      Force *f = r->f[Results::slow];
      int numAtoms = patch->getNumAtoms();

      // skip the accumulation entirely on stray-charge errors or commOnly
      if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
        for(int i=0; i<numAtoms; ++i) {
          f[i].x += results_ptr->x;
          f[i].y += results_ptr->y;
          f[i].z += results_ptr->z;
          ++results_ptr;
        }
      }
      forceBox->close(&r);
    }

    if ( pairOn || selfOn ) {
        ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
    }

}
04221 
04222 void ComputePmeMgr::submitReductions() {
04223 
04224     SimParameters *simParams = Node::Object()->simParameters;
04225 
04226     for ( int g=0; g<numGrids; ++g ) {
04227       float scale = 1.;
04228       if (alchOn) {
04229         BigReal elecLambdaUp, elecLambdaDown;
04230         if( simParams->alchFepWhamOn ) {
04231           if( simParams->alchFepElecOn ) {
04232             elecLambdaUp = simParams->alchElecLambda;
04233             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04234           }
04235           else {
04236             elecLambdaUp = 0.0;
04237             elecLambdaDown = 1.0;
04238           }
04239         }
04240         else {
04241           // alchLambda set on each step in ComputePme::ungridForces()
04242           if ( alchLambda < 0 || alchLambda > 1 ) {
04243             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04244           }
04245           elecLambdaUp = simParams->getElecLambda(alchLambda);
04246           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04247         }
04248         if ( g == 0 ) scale = elecLambdaUp;
04249         else if ( g == 1 ) scale = elecLambdaDown;
04250         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04251         if (alchDecouple) {
04252           if ( g == 2 ) scale = 1-elecLambdaUp;
04253           else if ( g == 3 ) scale = 1-elecLambdaDown;
04254           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04255         }
04256       } else if ( lesOn ) {
04257         scale = 1.0 / lesFactor;
04258       } else if ( pairOn ) {
04259         scale = ( g == 0 ? 1. : -1. );
04260       }
04261       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04262       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04263       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04264       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04265       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04266       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04267       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04268       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04269       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04270       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04271 
04272       float scale2 = 0.;
04273 
04274       // why is this declared/defined again here?
04275       SimParameters *simParams = Node::Object()->simParameters;
04276 
04277       if (alchFepOn) {
04278         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04279         if(simParams->alchFepWhamOn) {
04280           if(simParams->alchFepElecOn) {
04281             elecLambda2Up = simParams->alchElecLambda;
04282             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04283           }
04284           else {
04285             elecLambda2Up = 0.0;
04286             elecLambda2Down =  1.0;
04287           }
04288         }
04289         else {
04290           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04291           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04292         }
04293         
04294         if ( g == 0 ) scale2 = elecLambda2Up;
04295         else if ( g == 1 ) scale2 = elecLambda2Down;
04296         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04297         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04298         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04299         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04300       }
04301       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04302         if( g==0 )      scale2 = scale + 1.0;
04303         else if( g==1 ) scale2 = scale - 1.0;
04304         else if( g==2 ) scale2 = scale - 1.0;
04305         else if( g==3 ) scale2 = scale + 1.0;
04306       }
04307       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04308       
04309       if (alchThermIntOn) {
04310         
04311         // no decoupling:
04312         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04313         // (interactions between all atoms [partition 0 OR partition 1], 
04314         // minus all [within partition 0])
04315         // U = elecLambdaUp * (U[0] - U[2])
04316         // dU/dl = U[0] - U[2];
04317         
04318         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04319         // (interactions between all atoms [partition 0 OR partition 2], 
04320         // minus all [within partition 0])
04321         // U = elecLambdaDown * (U[1] - U[2])
04322         // dU/dl = U[1] - U[2];
04323 
04324         // alchDecouple:
04325         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04326         // (interactions between all atoms [partition 0 OR partition 1]
04327         // minus all [within partition 1] minus all [within partition 0]
04328         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04329         // dU/dl = U[0] - U[2] - U[4];
04330 
04331         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04332         // (interactions between all atoms [partition 0 OR partition 2]
04333         // minus all [within partition 2] minus all [within partition 0]
04334         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04335         // dU/dl = U[1] - U[3] - U[4];
04336         
04337         
04338         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04339         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04340         if (!alchDecouple) {
04341           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04342           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04343         }
04344         else {  // alchDecouple
04345           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04346           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04347           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04348           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04349         }
04350       }
04351     }
04352 
04353     alchLambda = -1.;  // illegal value to catch if not updated
04354 
04355     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04356     reduction->submit();
04357 
04358   for ( int i=0; i<heldComputes.size(); ++i ) {
04359     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04360   }
04361   heldComputes.resize(0);
04362 }
04363 
04364 #if USE_TOPOMAP 
04365 
#define NPRIMES 8
// Prime strides used by generateBGLORBPmePeList below to scatter PME
// processors around each YZ plane of the torus.
// NOTE(review): only the first NPRIMES (8) entries, i.e. primes up to 23,
// are ever consulted by the selection loop; the remaining initializers
// (29..1217) are dead unless NPRIMES is raised -- confirm whether NPRIMES
// was meant to cover the whole table.
const static unsigned int NAMDPrimes[] = {
  3,
  5,
  7,
  11,
  13,
  17,
  19,
  23,  
  29,
  31,
  37,
  59,
  73,
  93,
  113,
  157,
  307,
  617,
  1217                  //This should be enough for 64K nodes of BGL. 
};
04388 
04389 #include "RecBisection.h"
04390 
/***-----------------------------------------------------**********
    The Orthogonal Recursive Bisection strategy, which allocates PME
    objects close to the patches they communicate with, and at the
    same time spreads them around the grid.
****----------------------------------------------------------****/
04396 
04397 bool generateBGLORBPmePeList(int *pemap, int numPes, 
04398                              int *block_pes, int nbpes) {
04399 
04400   PatchMap *pmap = PatchMap::Object();
04401   int *pmemap = new int [CkNumPes()];
04402 
04403   if (pemap == NULL)
04404     return false;
04405 
04406   TopoManager tmgr;
04407 
04408   memset(pmemap, 0, sizeof(int) * CkNumPes());
04409 
04410   for(int count = 0; count < CkNumPes(); count++) {
04411     if(count < nbpes)
04412       pmemap[block_pes[count]] = 1;
04413     
04414     if(pmap->numPatchesOnNode(count)) {
04415       pmemap[count] = 1;
04416       
04417       //Assumes an XYZT mapping !!
04418       if(tmgr.hasMultipleProcsPerNode()) {
04419         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
04420       }
04421     }
04422   }
04423 
04424   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
04425     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
04426     return false;
04427 
04428   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
04429   Node *node = nd.ckLocalBranch();
04430   SimParameters *simParams = node->simParameters;
04431 
04432   //first split PME processors into patch groups
04433 
04434   int xsize = 0, ysize = 0, zsize = 0;
04435 
04436   xsize = tmgr.getDimNX();
04437   ysize = tmgr.getDimNY();
04438   zsize = tmgr.getDimNZ();
04439   
04440   int nx = xsize, ny = ysize, nz = zsize;
04441   DimensionMap dm;
04442   
04443   dm.x = 0;
04444   dm.y = 1;
04445   dm.z = 2;
04446   
04447   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
04448 
04449   //group size processors have to be allocated to each YZ plane
04450   int group_size = numPes/nx;
04451   if(numPes % nx)
04452     group_size ++;
04453 
04454   int my_prime = NAMDPrimes[0];
04455   int density = (ny * nz)/group_size + 1;
04456   int count = 0;
04457   
04458   //Choose a suitable prime Number
04459   for(count = 0; count < NPRIMES; count ++) {
04460     //Find a prime just greater than the density
04461     if(density < NAMDPrimes[count]) {
04462       my_prime = NAMDPrimes[count];
04463       break;
04464     }      
04465   }
04466   
04467   if(count == NPRIMES)
04468     my_prime = NAMDPrimes[NPRIMES-1];
04469 
04470   //int gcount = numPes/2;
04471   int gcount = 0;
04472   int npme_pes = 0;
04473   
04474   int coord[3];
04475 
04476   for(int x = 0; x < nx; x++) {
04477     coord[0] = (x + nx/2)%nx;
04478     
04479     for(count=0; count < group_size && npme_pes < numPes; count++) {
04480       int dest = (count + 1) * my_prime;      
04481       dest = dest % (ny * nz);      
04482       
04483       coord[2] = dest / ny;
04484       coord[1] = dest - coord[2] * ny;
04485       
04486       //Locate where in the actual grid the processor is
04487       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
04488         coord[dm.z] * xsize* ysize;
04489       
04490       if(pmemap[destPe] == 0) {
04491         pemap[gcount++] = destPe;
04492         pmemap[destPe] = 1;
04493         
04494         if(tmgr.hasMultipleProcsPerNode())
04495           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
04496 
04497         npme_pes ++;
04498       }
04499       else {
04500         for(int pos = 1; pos < ny * nz; pos++) {
04501           
04502           coord[2] += pos / ny;
04503           coord[1] += pos % ny;
04504           
04505           coord[2] = coord[2] % nz;
04506           coord[1] = coord[1] % ny;       
04507           
04508           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
04509             coord[dm.z] * xsize * ysize;
04510           
04511           if(pmemap[newdest] == 0) {
04512             pemap[gcount++] = newdest;
04513             pmemap[newdest] = 1;
04514             
04515             if(tmgr.hasMultipleProcsPerNode())
04516               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
04517             
04518             npme_pes ++;
04519             break;
04520           }
04521         }
04522       }      
04523     }   
04524     
04525     if(gcount == numPes)
04526       gcount = 0;    
04527     
04528     if(npme_pes >= numPes)
04529       break;
04530   }
04531   
04532   delete [] pmemap;
04533   
04534   if(npme_pes != numPes)
04535     //NAMD_bug("ORB PME allocator failed\n");
04536     return false;
04537 
04538   return true;
04539 }
04540 
04541 #endif
04542 
// Mix-in base for the X/Y/Z pencil chares: owns the FFT data/work buffers,
// the randomized (or bit-reversed) block send schedule, and the message
// counters used by the SDAG control flow.
// NOTE(review): the class holds raw owning pointers with no copy control;
// presumably pencil chares are never copied -- confirm.
template <class T> class PmePencil : public T {
public:
  PmePencil() {
    data = 0;
    work = 0;
    send_order = 0;
    needs_reply = 0;
#if USE_PERSISTENT
    trans_handle = untrans_handle = ungrid_handle = NULL;
#endif
  }
  ~PmePencil() {
#ifdef NAMD_FFTW
    fftwf_free(data);  // data is allocated with fftwf_malloc in fft_init
#endif
    delete [] work;
    delete [] send_order;
    delete [] needs_reply;
  }
  // Record the shared initialization payload and reset message counters.
  void base_init(PmePencilInitMsg *msg) {
    imsg=0;
    imsgb=0;
    hasData=0;
    initdata = msg->data;
  }
  // Build send_order[0..nBlocks): a bit-reversed permutation when
  // PMESendOrder is set, otherwise a per-PE random shuffle (spreads the
  // transpose traffic over the network).
  void order_init(int nBlocks) {
    send_order = new int[nBlocks];
    for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
    if ( Node::Object()->simParameters->PMESendOrder ) {
      std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
    } else {
      Random rand(CkMyPe());
      rand.reorder(send_order,nBlocks);
    }
    needs_reply = new int[nBlocks];
    offload = Node::Object()->simParameters->PMEOffload;
  }
  PmePencilInitMsgData initdata;
  Lattice lattice;
  PmeReduction evir;
  int sequence;  // used for priorities
  int imsg;  // used in sdag code
  int imsgb;  // Node par uses distinct counter for back path
  int hasData;  // used in message elimination
  int offload;  // cached simParameters->PMEOffload flag
  float *data;  // FFT grid storage (fftwf_malloc'd by fft_init)
  float *work;  // scratch line for FFT execution
  int *send_order;  // permutation of destination block indices
  int *needs_reply;  // per-source flag: whether that sender had data
#if USE_PERSISTENT
  PersistentHandle *trans_handle;
  PersistentHandle *untrans_handle;
  PersistentHandle *ungrid_handle;
#endif
};
04598 
// Z-direction pencil chare: first stage of the pencil FFT pipeline.  It
// collects charge-grid messages, runs the real<->complex FFT along z, and
// exchanges transposed data with the y pencils.
class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
public:
    PmeZPencil_SDAG_CODE
    PmeZPencil() { __sdag_init(); setMigratable(false); }
    PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); imsg=imsgb=0;}
        ~PmeZPencil() {
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                // Only the CkLoop plan *arrays* are released here.
                // NOTE(review): the fftwf_plan handles themselves are not
                // destroyed in this destructor -- confirm they are cleaned
                // up (or intentionally leaked) at shutdown.
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();                        // allocate buffers, build FFTW plans
    void recv_grid(const PmeGridMsg *);     // accept one charge-grid message
    void forward_fft();                     // r2c FFT along z
    void send_trans();                      // send transposed blocks to y pencils
        void send_subset_trans(int fromIdx, int toIdx);  // CkLoop slice of send_trans
    void recv_untrans(const PmeUntransMsg *);  // accept backward-path data
    void node_process_untrans(PmeUntransMsg *);
    void node_process_grid(PmeGridMsg *);
    void backward_fft();                    // c2r FFT along z
        void send_ungrid(PmeGridMsg *);     // return grid data to one source
        void send_all_ungrid();
        void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
private:
    // grid messages held for the return (ungrid) path
    ResizeArray<PmeGridMsg *> grid_msgs;
    ResizeArray<int> work_zlist;
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        //for ckloop usage
        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    rfftwnd_plan forward_plan, backward_plan;
#endif
#endif

    int nx, ny;  // local extents of this pencil in x and y
#if USE_PERSISTENT
    // Create persistent-channel handles toward the y pencils, one per
    // destination z block, sized to the largest message each will carry.
    void setup_persistent() {
      int hd = 1;// ( hasData ? 1 : 0 );
      int zBlocks = initdata.zBlocks;
      int block3 = initdata.grid.block3;
      int dim3 = initdata.grid.dim3;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      CmiAssert(yPencil_local);
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
      for ( int isend=0; isend<zBlocks; ++isend ) {
          int kb = send_order[isend];
          int nz1 = block3;
          // trailing block may be narrower than block3
          if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
          int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
          // header + payload + envelope + priority bits, plus 24 bytes of
          // slack (NOTE(review): confirm the +24 fudge factor)
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
          int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
    
    // Allocate handle storage for the ungrid path; channel creation itself
    // is commented out (handles are left uninitialized).
    void setup_ungrid_persistent() 
    {
       ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
       for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
           int peer = grid_msgs[imsg]->sourceNode;
           //ungrid_handle[imsg] = CmiCreatePersistent(peer, 0); 
       }
    }
#endif
};
04671 
// Y-direction pencil chare: middle stage of the pipeline.  Transforms along
// y between the z and x pencil stages, in both the forward (trans) and
// backward (untrans) directions.
class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
public:
    PmeYPencil_SDAG_CODE
    PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
    PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
    void fft_init();                        // allocate buffers, build FFTW plans
    void recv_trans(const PmeTransMsg *);   // accept forward-path data from z
    void forward_fft();                     // c2c FFT along y (forward)
        void forward_subset_fft(int fromIdx, int toIdx);  // CkLoop slice
    void send_trans();                      // send transposed blocks to x pencils
        void send_subset_trans(int fromIdx, int toIdx);   // CkLoop slice
    void recv_untrans(const PmeUntransMsg *);    
    void node_process_trans(PmeTransMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void backward_fft();                    // c2c FFT along y (backward)
        void backward_subset_fft(int fromIdx, int toIdx); // CkLoop slice
    void send_untrans();                    // send backward-path data to z pencils
    void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
private:
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif

    int nx, nz;  // local extents of this pencil in x and z
#if USE_PERSISTENT
    // Create persistent channels toward the x pencils (forward/trans path)
    // and back toward the z pencils (backward/untrans path).
    void setup_persistent() {
      int yBlocks = initdata.yBlocks;
      int block2 = initdata.grid.block2;
      int K2 = initdata.grid.K2;
      int hd = 1;
      CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          // trailing block may be narrower than block2
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
          // header + payload + envelope + priority bits + 24 bytes slack
          // (NOTE(review): confirm the +24 fudge factor)
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
          int compress_size = sizeof(float)*hd*nx*ny1*nz*2; 
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }

      CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
          int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx*ny1*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,  compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif
};
04734 
// X-direction pencil chare: final forward stage.  After the x FFT the data
// is fully in reciprocal space, so this pencil also runs the k-space
// energy/virial computation (pme_kspace) before starting the backward path.
class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
public:
    PmeXPencil_SDAG_CODE
    PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
    PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
        ~PmeXPencil() {
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                // Only the CkLoop plan arrays are released here.
                // NOTE(review): fftwf_plan handles and myKSpace are not
                // destroyed in this destructor -- confirm intentional.
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();                        // allocate buffers, build FFTW plans
    void recv_trans(const PmeTransMsg *);   // accept forward-path data from y
    void forward_fft();                     // c2c FFT along x (forward)
    void pme_kspace();                      // reciprocal-space energy/virial
    void backward_fft();                    // c2c FFT along x (backward)
    void send_untrans();                    // send backward-path data to y pencils
        void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);  // CkLoop slice
    void node_process_trans(PmeTransMsg *);
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif
    int ny, nz;  // local extents of this pencil in y and z
    int recipEvirPe;  // PE hosting the recip energy/virial client (-999 until evir_init)
    void evir_init();
    PmeKSpace *myKSpace;
#if USE_PERSISTENT
    // Create persistent channels toward the y pencils for the untrans path.
    void  setup_persistent() {
      int xBlocks = initdata.xBlocks;
      int block1 = initdata.grid.block1;
      int K1 = initdata.grid.K1;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
      for ( int isend=0; isend<xBlocks; ++isend ) {
          int ib = send_order[isend];
          int nx1 = block1;
          // trailing block may be narrower than block1
          if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
          // NOTE(review): uses procNum() here while the Y/Z pencils use
          // homePe() for the same lookup -- confirm this is intentional.
          int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
          int size = sizeof(PmeUntransMsg) +
              sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24; 
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx1*ny*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif

};
04792 
04793 void PmeXPencil::evir_init() {
04794   recipEvirPe = findRecipEvirPe();
04795   initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
04796 }
04797 
// Allocate this z pencil's grid storage and create its FFTW plans.  Runs
// once at startup; planner calls are serialized through the global
// ComputePmeMgr::fftw_plan_lock (the FFTW planner is not thread-safe).
void PmeZPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // let the node-level manager route incoming messages to this object
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block2 = initdata.grid.block2;

  // local extents: the trailing pencil in each direction may be narrower
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
  work = new float[dim3];

  order_init(initdata.zBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the how many */

  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  int sizeLines=nx*ny;
  int planLineSizes[1];
  planLineSizes[0]=K3;
  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
  int ndimHalf=ndim/2;
  // in-place r2c along z: nx*ny independent lines of K3 reals, spaced ndim
  // reals apart; the complex output lines are spaced ndimHalf complex apart
  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
                                         (float *) data, NULL, 1, 
                                         ndim,
                                         (fftwf_complex *) data, NULL, 1,
                                         ndimHalf,
                                         fftwFlags);

  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
                                          (fftwf_complex *) data, NULL, 1, 
                                          ndimHalf,
                                          (float *) data, NULL, 1, 
                                          ndim,
                                          fftwFlags);
#if     CMK_SMP && USE_CKLOOP
  if(simParams->useCkLoop) {
          //How many FFT plans to be created? The grain-size issue!!.
          //Currently, I am choosing the min(nx, ny) to be coarse-grain
          numPlans = (nx<=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
          int howmany = sizeLines/numPlans;
          // one plan per CkLoop chunk, each covering `howmany` consecutive lines
          forward_plans = new fftwf_plan[numPlans];
          backward_plans = new fftwf_plan[numPlans];
          for(int i=0; i<numPlans; i++) {
                  int dimStride = i*ndim*howmany;
                  int dimHalfStride = i*ndimHalf*howmany;
                  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
                                                                                                         ((float *)data)+dimStride, NULL, 1,
                                                                                                         ndim,
                                                                                                         ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                                                                         ndimHalf,
                                                                                                         fftwFlags);

                  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                                                                         ndimHalf,
                                                                                                         ((float *)data)+dimStride, NULL, 1,
                                                                                                         ndim,
                                                                                                         fftwFlags);
          }
  }else 
#endif 
  {
          forward_plans = NULL;
          backward_plans = NULL;
  }
#else
  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
    evir = 0.;
    memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
}
04900 
// Allocate this y pencil's storage and create 1D complex FFTW plans along y.
// Planner calls are serialized through ComputePmeMgr::fftw_plan_lock (the
// FFTW planner is not thread-safe).
void PmeYPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // let the node-level manager route incoming messages to this object
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim2 = initdata.grid.dim2;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block3 = initdata.grid.block3;

  // local extents: the trailing pencil in each direction may be narrower
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
  work = new float[2*K2];

  order_init(initdata.yBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the dimensions */
  /* ideally this should be implementable as a single multidimensional
   *  plan, but that has proven tricky to implement, so we maintain the
   *  loop of 1d plan executions. */
  int sizeLines=nz;
  int planLineSizes[1];
  planLineSizes[0]=K2;
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  // in-place c2c along y: nz interleaved lines of K2 complex values,
  // consecutive y elements of a line are sizeLines complex apart
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_FORWARD, 
                                     fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_BACKWARD, 
                                      fftwFlags);
  CkAssert(forward_plan != NULL);
  CkAssert(backward_plan != NULL);
#else
  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  evir = 0;
  CmiMemoryWriteFence();  // publish the zeroed evir to other PEs in the node
#endif
}
04971 
// Node-level entry: fold one forward-path PmeTransMsg into this y pencil.
// imsg is incremented atomically because several PEs in the node may
// deliver messages concurrently; whichever PE delivers the last of the
// yBlocks messages runs the forward FFT and the next transpose.
void PmeYPencil::node_process_trans(PmeTransMsg *msg)
{
  if ( msg->hasData ) hasData = 1;
  needs_reply[msg->sourceNode] = msg->hasData;
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      if ( hasData ) {
        forward_fft();
      }
      send_trans();
      // with no data there is nothing to wait for on the backward path,
      // so reply immediately
      if( ! hasData)
        {
          send_untrans(); //todo, what is up with the recvAck in SDAG version?
        }
      imsg=0;
      CmiMemoryWriteFence();  // publish the counter reset before proceeding
    }
}
04993 
// Node-level entry: fold one backward-path PmeUntransMsg into this y pencil.
// The back path uses its own atomic counter (imsgb); the PE delivering the
// last of the yBlocks messages runs the backward FFT and forwards the data
// to the z pencils.
void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
{
  recv_untrans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      backward_fft();
      send_untrans();
      imsgb=0;
      CmiMemoryWriteFence();  // publish the counter reset before proceeding
    }
}
05007 
05008 #define DEBUG_NODE_PAR_RECV 0
05009 
05010 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
05011   //  CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05012   PmeXPencil *target=xPencilObj.get(msg->destElem);
05013 #if DEBUG_NODE_PAR_RECV
05014   if(target == NULL)
05015     CkAbort("xpencil in recvXTrans not found, debug registeration");
05016 #endif  
05017     target->node_process_trans(msg);
05018   delete msg;
05019 }
05020 
05021 
05022 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
05023   //  CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05024   PmeYPencil *target=yPencilObj.get(msg->destElem);
05025 #if DEBUG_NODE_PAR_RECV
05026   if(target == NULL)
05027     CkAbort("ypencil in recvYTrans not found, debug registeration");
05028 #endif  
05029     target->node_process_trans(msg);
05030   delete msg;
05031  }
05032 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
05033   //  CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05034   PmeYPencil *target=yPencilObj.get(msg->destElem);
05035 #if DEBUG_NODE_PAR_RECV  
05036   if(target == NULL)
05037     CkAbort("ypencil in recvYUntrans not found, debug registeration");
05038 #endif  
05039     target->node_process_untrans(msg);
05040   delete msg;
05041  }
05042 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
05043   //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05044   PmeZPencil *target=zPencilObj.get(msg->destElem);
05045 #if DEBUG_NODE_PAR_RECV
05046   if(target == NULL)
05047     CkAbort("zpencil in recvZUntrans not found, debug registeration");
05048 #endif
05049   target->node_process_untrans(msg);
05050   delete msg;
05051 }
05052 
05053 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
05054   //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05055   PmeZPencil *target=zPencilObj.get(msg->destElem);
05056 #if DEBUG_NODE_PAR_RECV
05057   if(target == NULL){
05058     CkAbort("zpencil in recvZGrid not found, debug registeration");
05059   }
05060 #endif
05061   target->node_process_grid(msg); //msg is stored inside node_proces_grid
05062 }
05063 
05064 void PmeXPencil::fft_init() {
05065   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
05066   Node *node = nd.ckLocalBranch();
05067   SimParameters *simParams = node->simParameters;
05068 #if USE_NODE_PAR_RECEIVE
05069   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
05070 #endif
05071 
05072   int K1 = initdata.grid.K1;
05073   int K2 = initdata.grid.K2;
05074   int dim3 = initdata.grid.dim3;
05075   int block2 = initdata.grid.block2;
05076   int block3 = initdata.grid.block3;
05077 
05078   ny = block2;
05079   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
05080   nz = block3;
05081   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
05082 
05083 #ifdef NAMD_FFTW
05084   CmiLock(ComputePmeMgr::fftw_plan_lock);
05085 
05086   data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
05087   work = new float[2*K1];
05088 
05089   order_init(initdata.xBlocks);
05090 
05091 #ifdef NAMD_FFTW_3
05092   /* need array of sizes for the how many */
05093   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
05094   int sizeLines=ny*nz;
05095   int planLineSizes[1];
05096   planLineSizes[0]=K1;
05097   forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
05098                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05099                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05100                                    FFTW_FORWARD,
05101                                      fftwFlags);
05102   backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
05103                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05104                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05105                                           FFTW_BACKWARD,
05106                                       fftwFlags);
05107 
05108 #if     CMK_SMP && USE_CKLOOP
05109   if(simParams->useCkLoop) {
05110           //How many FFT plans to be created? The grain-size issue!!.
05111           //Currently, I am choosing the min(nx, ny) to be coarse-grain
05112           numPlans = (ny<=nz?ny:nz);
05113           // limit attempted parallelism due to false sharing
05114           //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
05115           //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
05116           if ( sizeLines/numPlans < 4 ) numPlans = 1;
05117           int howmany = sizeLines/numPlans;
05118           forward_plans = new fftwf_plan[numPlans];
05119           backward_plans = new fftwf_plan[numPlans];
05120           for(int i=0; i<numPlans; i++) {
05121                   int curStride = i*howmany;              
05122                   forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
05123                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05124                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05125                                                                                                         FFTW_FORWARD,
05126                                                                                                          fftwFlags);
05127 
05128                   backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
05129                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05130                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05131                                                                                                           FFTW_BACKWARD,
05132                                                                                                          fftwFlags);
05133           }
05134   }else
05135 #endif
05136   {
05137           forward_plans = NULL;
05138           backward_plans = NULL;
05139   }
05140 #else
05141   forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
05142         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
05143         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
05144         ny*nz, (fftw_complex *) work, 1);
05145   backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
05146         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
05147         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
05148         ny*nz, (fftw_complex *) work, 1);
05149 #endif
05150   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05151 #else
05152   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
05153 #endif
05154 
05155   myKSpace = new PmeKSpace(initdata.grid,
05156                 thisIndex.y*block2, thisIndex.y*block2 + ny,
05157                 thisIndex.z*block3, thisIndex.z*block3 + nz);
05158 
05159 }
05160 
05161 // #define FFTCHECK   // run a grid of integers through the fft
05162 // #define ZEROCHECK  // check for suspicious zeros in fft
05163 
// Accumulate one patch's charge-grid contribution into this Z pencil.
// The first message of a sequence (imsg == 0) latches the lattice and
// sequence number and (when not using node-parallel receive) zeroes the
// local grid before accumulation begins.
void PmeZPencil::recv_grid(const PmeGridMsg *msg) {

  int dim3 = initdata.grid.dim3;
  if ( imsg == 0 ) {
    lattice = msg->lattice;
    sequence = msg->sequence;
#if ! USE_NODE_PAR_RECEIVE
    memset(data, 0, sizeof(float)*nx*ny*dim3);
#endif
  }

  // Messages without charge data carry only sequence/lattice bookkeeping.
  if ( ! msg->hasData ) return;

  int zlistlen = msg->zlistlen;
#ifdef NAMD_KNL
  // Copy the z-index list into aligned scratch so the scatter-add loop
  // below can be vectorized.
  int * __restrict msg_zlist = msg->zlist;
  int * __restrict zlist = work_zlist.begin();
  __assume_aligned(zlist,64);
  for ( int k=0; k<zlistlen; ++k ) {
    zlist[k] = msg_zlist[k];
  }
#else
  int * __restrict zlist = msg->zlist;
#endif
  char * __restrict fmsg = msg->fgrid;   // per-(i,j) flag: does this line carry data?
  float * __restrict qmsg = msg->qgrid;  // packed charges, only for flagged lines
  float * __restrict d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      if( *(fmsg++) ) {
        #pragma ivdep
        for ( int k=0; k<zlistlen; ++k ) {
          d[zlist[k]] += *(qmsg++);
        }
      }
     }
    }
  }
}
05205 
/* CkLoop work function: execute the [first,last] slice of an array of
 * single-precision FFTW plans passed through the opaque param pointer.
 * (Shared by the X and Z pencils, whose plans are plain plan arrays.) */
static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan *planArray = (fftwf_plan *) param;
    int idx = first;
    while ( idx <= last ) {
      fftwf_execute(planArray[idx]);
      ++idx;
    }
#endif
#endif
}
05214 
05215 void PmeZPencil::forward_fft() {
05216   evir = 0.;
05217 #ifdef FFTCHECK
05218   int dim3 = initdata.grid.dim3;
05219   int K3 = initdata.grid.K3;
05220   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05221   float *d = data;
05222   for ( int i=0; i<nx; ++i ) {
05223    for ( int j=0; j<ny; ++j, d += dim3 ) {
05224     for ( int k=0; k<dim3; ++k ) {
05225       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
05226     }
05227    }
05228   }
05229 #endif
05230 #ifdef NAMD_FFTW
05231 #ifdef MANUAL_DEBUG_FFTW3
05232   dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05233 #endif
05234 #ifdef NAMD_FFTW_3
05235 #if     CMK_SMP && USE_CKLOOP
05236   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05237   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
05238      && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
05239           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
05240           //transform the above loop
05241           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
05242           return;
05243   }
05244 #endif
05245   fftwf_execute(forward_plan);
05246 #else
05247   rfftwnd_real_to_complex(forward_plan, nx*ny,
05248         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
05249 #endif
05250 #ifdef MANUAL_DEBUG_FFTW3
05251   dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05252 #endif
05253 
05254 #endif
05255 #ifdef ZEROCHECK
05256   int dim3 = initdata.grid.dim3;
05257   int K3 = initdata.grid.K3;
05258   float *d = data;
05259   for ( int i=0; i<nx; ++i ) {
05260    for ( int j=0; j<ny; ++j, d += dim3 ) {
05261     for ( int k=0; k<dim3; ++k ) {
05262       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
05263         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
05264     }
05265    }
05266   }
05267 #endif
05268 }
05269 
05270 /* A single task for partitioned PmeZPencil::send_trans work */
05271 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05272         PmeZPencil *zpencil = (PmeZPencil *)param;
05273         zpencil->send_subset_trans(first, last);        
05274 }
05275 
05276 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
05277         int zBlocks = initdata.zBlocks;
05278         int block3 = initdata.grid.block3;
05279         int dim3 = initdata.grid.dim3;
05280         for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
05281           int kb = send_order[isend];
05282           int nz = block3;
05283           if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
05284           int hd = ( hasData ? 1 : 0 );
05285           PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
05286           msg->lattice = lattice;
05287           msg->sourceNode = thisIndex.y;
05288           msg->hasData = hasData;
05289           msg->nx = ny;
05290          if ( hasData ) {
05291           float *md = msg->qgrid;
05292           const float *d = data;
05293           for ( int i=0; i<nx; ++i ) {
05294            for ( int j=0; j<ny; ++j, d += dim3 ) {
05295                 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
05296                   *(md++) = d[2*k];
05297                   *(md++) = d[2*k+1];
05298                 }
05299            }
05300           }
05301          }
05302           msg->sequence = sequence;
05303           SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
05304 
05305     CmiEnableUrgentSend(1);
05306 #if USE_NODE_PAR_RECEIVE
05307       msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
05308 #if Y_PERSIST 
05309       CmiUsePersistentHandle(&trans_handle[isend], 1);
05310 #endif
05311       initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
05312 #if Y_PERSIST 
05313       CmiUsePersistentHandle(NULL, 0);
05314 #endif    
05315 #else
05316 #if Y_PERSIST 
05317       CmiUsePersistentHandle(&trans_handle[isend], 1);
05318 #endif
05319       initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
05320 #if Y_PERSIST 
05321       CmiUsePersistentHandle(NULL, 0);
05322 #endif    
05323 #endif
05324     CmiEnableUrgentSend(0);
05325     }
05326 }
05327 
// Transpose this Z pencil's FFT output to the Y pencils: one PmeTransMsg
// per z block, visited in send_order.  When CkLoop partitioning is
// enabled the per-destination sends are delegated to send_subset_trans,
// which performs the same work for a sub-range of destinations.
void PmeZPencil::send_trans() {
#if USE_PERSISTENT
    if (trans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
	int useCkLoop = Node::Object()->simParameters->useCkLoop;
	if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
	   && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
		//send_subset_trans(0, initdata.zBlocks-1);
		CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
		return;
	}
#endif
  int zBlocks = initdata.zBlocks;
  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  for ( int isend=0; isend<zBlocks; ++isend ) {
    int kb = send_order[isend];
    int nz = block3;
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;  // last block may be short
    int hd = ( hasData ? 1 : 0 );  // payload only allocated when there is data
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.y;
    msg->hasData = hasData;
    msg->nx = ny;
   if ( hasData ) {
    // pack complex (re,im) pairs of the kb'th z block for every (i,j) line
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
        *(md++) = d[2*k];
        *(md++) = d[2*k+1];
      }
     }
    }
   }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
#if Y_PERSIST 
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif    
#else
#if Y_PERSIST 
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif    
#endif
    CmiEnableUrgentSend(0);
  }
}
05397 
// Receive one Z->Y transpose message.  Copies (or zeroes, when the sender
// had no data) the sender's y slab [jb*block2, jb*block2+ny) into the
// local complex array, which is strided K2*nz*2 floats per x plane.
void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {  // first message of the sequence latches metadata
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;  // sender's y-block index
  int ny = msg->nx;          // number of y lines carried by this message
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      d[2*(j*nz+k)] = *(md++);
      d[2*(j*nz+k)+1] = *(md++);
    }
   }
  }
 } else {
  // sender had no charges; clear the corresponding region instead
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*(j*nz+k)] = 0;
      d[2*(j*nz+k)+1] = 0;
    }
   }
  }
 }
}
05434 
05435 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
05436         PmeYPencil *ypencil = (PmeYPencil *)param;
05437         ypencil->forward_subset_fft(first, last);
05438 }
05439 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
05440 #ifdef NAMD_FFTW
05441 #ifdef NAMD_FFTW_3
05442         for(int i=fromIdx; i<=toIdx; i++){
05443                 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
05444                       * nz * initdata.grid.K2,  
05445                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05446         }
05447 #endif
05448 #endif
05449 }
05450 
// Forward 1-D FFT along y, executed per x plane of the local complex
// array.  May be partitioned across the node with CkLoop (which calls
// forward_subset_fft on sub-ranges of x planes).
void PmeYPencil::forward_fft() {
    evir = 0.;
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
  
#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
	  CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
	  return;
  }
#endif
  //the above is a transformation of the following loop using CkLoop
  for ( int i=0; i<nx; ++i ) {
    fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
                      * nz * initdata.grid.K2,  
                      ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
  for ( int i=0; i<nx; ++i ) {
    fftw(forward_plan, nz,
        ((fftw_complex *) data) + i * nz * initdata.grid.K2,
        nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05486 
05487 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05488         PmeYPencil *ypencil = (PmeYPencil *)param;
05489         ypencil->send_subset_trans(first, last);
05490 }
05491 
05492 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
05493         int yBlocks = initdata.yBlocks;
05494         int block2 = initdata.grid.block2;
05495         int K2 = initdata.grid.K2;
05496     for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
05497           int jb = send_order[isend];
05498           int ny = block2;
05499           if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
05500           int hd = ( hasData ? 1 : 0 );
05501           PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
05502           msg->lattice = lattice;
05503           msg->sourceNode = thisIndex.x;
05504           msg->hasData = hasData;
05505           msg->nx = nx;
05506          if ( hasData ) {
05507           float *md = msg->qgrid;
05508           const float *d = data;
05509           for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
05510            for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
05511                 for ( int k=0; k<nz; ++k ) {
05512                   *(md++) = d[2*(j*nz+k)];
05513                   *(md++) = d[2*(j*nz+k)+1];
05514   #ifdef ZEROCHECK
05515                   if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
05516           thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
05517   #endif
05518                 }
05519            }
05520           }
05521           if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
05522           thisIndex.x, jb, thisIndex.z);
05523          }
05524           msg->sequence = sequence;
05525           SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
05526       CmiEnableUrgentSend(1);
05527 #if USE_NODE_PAR_RECEIVE
05528       msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
05529 #if X_PERSIST 
05530       CmiUsePersistentHandle(&trans_handle[isend], 1);
05531 #endif
05532       initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
05533 #if X_PERSIST 
05534       CmiUsePersistentHandle(NULL, 0);
05535 #endif
05536 #else      
05537 #if X_PERSIST 
05538       CmiUsePersistentHandle(&trans_handle[isend], 1);
05539 #endif
05540       initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
05541 #if X_PERSIST 
05542       CmiUsePersistentHandle(NULL, 0);
05543 #endif
05544 #endif
05545       CmiEnableUrgentSend(0);
05546         }
05547 }
05548 
// Transpose this Y pencil's FFT output to the X pencils: one PmeTransMsg
// per y block, visited in send_order.  When CkLoop partitioning is
// enabled the per-destination sends are delegated to send_subset_trans,
// which performs the same work for a sub-range of destinations.
void PmeYPencil::send_trans() {
#if USE_PERSISTENT
    if (trans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
	int useCkLoop = Node::Object()->simParameters->useCkLoop;
	if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
	   && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
		//send_subset_trans(0, initdata.yBlocks-1);
		CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
		return;
	}
#endif
  int yBlocks = initdata.yBlocks;
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // last block may be short
    int hd = ( hasData ? 1 : 0 );  // payload only allocated when there is data
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.x;
    msg->hasData = hasData;
    msg->nx = nx;
   if ( hasData ) {
    // pack complex (re,im) pairs of the jb'th y block for every (i,k)
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*(j*nz+k)];
        *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
        if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      }
     }
    }
    // sanity check: packed size must match the allocated payload
    if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
        thisIndex.x, jb, thisIndex.z);
   }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
#if X_PERSIST 
	CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
#if X_PERSIST 
	CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if X_PERSIST 
	CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if X_PERSIST 
	CmiUsePersistentHandle(NULL, 0);
#endif
    
#endif
    CmiEnableUrgentSend(0);
  }
}
05624 
// Node-level entry point for PmeTransMsg delivery (node-parallel receive
// path).  Atomically counts arriving messages; the caller that delivers
// the last of the xBlocks messages runs the whole x-pencil phase
// (forward FFT, k-space, backward FFT) and starts the untranspose.
void PmeXPencil::node_process_trans(PmeTransMsg *msg)
{
  if(msg->hasData) hasData=1;
  needs_reply[msg->sourceNode] = msg->hasData;
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.xBlocks)
    {
      if(hasData){  // skip the math entirely if no sender had charges
        forward_fft();
        pme_kspace();
        backward_fft();
      }
      send_untrans();
      imsg=0;  // reset the counter for the next sequence
      CmiMemoryWriteFence();
    }
}
05644 
// Receive one Y->X transpose message.  Copies (or zeroes, when the sender
// had no data) the sender's x slab [ib*block1, ib*block1+nx) into the
// local complex array, which is strided ny*nz*2 floats per x plane.
void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {  // first message of the sequence latches metadata
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block1 = initdata.grid.block1;
  int K1 = initdata.grid.K1;
  int ib = msg->sourceNode;  // sender's x-block index
  int nx = msg->nx;          // number of x planes carried by this message
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
        ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      d[2*k] = *(md++);
      d[2*k+1] = *(md++);
    }
   }
  }
 } else {
  // sender had no charges; clear the corresponding region instead
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*k] = 0;
      d[2*k+1] = 0;
    }
   }
  }
 }
}
05681 
// Forward 1-D FFT along x for all ny*nz lines of this pencil, in place.
// With FFTW3 + CkLoop the numPlans slice plans may run in parallel.
void PmeXPencil::forward_fft() {
#ifdef NAMD_FFTW

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
	  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
	  //transform the above loop
	  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
	  return;
  }
#endif
  fftwf_execute(forward_plan);
#else
  fftw(forward_plan, ny*nz,
	((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05711 
// Reciprocal-space stage: compute the PME energy (stored in evir[0]) and
// virial components (written through &evir[1]) from this x pencil's fully
// transformed charge grid via PmeKSpace::compute_energy.
void PmeXPencil::pme_kspace() {

  evir = 0.;

#ifdef FFTCHECK
  return;  // FFT debugging bypasses the energy computation
#endif

  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;

  // Decide whether compute_energy may parallelize internally with CkLoop.
  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    useCkLoop = 1;
  }
#endif

  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
    evir[0] = myKSpace->compute_energy(data+0*g,
                lattice, ewaldcof, &(evir[1]), useCkLoop);
  }
  
#if USE_NODE_PAR_RECEIVE
    CmiMemoryWriteFence();
#endif
}
05740 
// Backward 1-D FFT along x, inverse of forward_fft, in place.  With
// FFTW3 + CkLoop the numPlans slice plans may run in parallel.
void PmeXPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
	  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
	  //transform the above loop
	  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
	  return;
  }
#endif
  fftwf_execute(backward_plan);
#else
  fftw(backward_plan, ny*nz,
	((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#endif
}
05768 
05769 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
05770         int evirIdx = paraNum;
05771         PmeXPencil *xpencil = (PmeXPencil *)param;
05772         xpencil->send_subset_untrans(first, last, evirIdx);
05773 }
05774 
05775 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
05776         int xBlocks = initdata.xBlocks;
05777         int block1 = initdata.grid.block1;      
05778         int K1 = initdata.grid.K1;
05779 
05780         int ackL=0, ackH=-1;
05781         int unL=0, unH=-1;
05782         int send_evir=0;
05783         if(fromIdx >= evirIdx+1) {
05784                 //send PmeUntransMsg with has_evir=0
05785                 unL = fromIdx;
05786                 unH = toIdx;            
05787         } else if(toIdx <= evirIdx-1) {
05788                 //send PmeAckMsg
05789                 ackL=fromIdx;
05790                 ackH=toIdx;             
05791         } else {
05792                 //partially send PmeAckMsg and partially send PmeUntransMsg
05793                 ackL=fromIdx;
05794                 ackH=evirIdx-1;
05795                 send_evir=1;
05796                 unL=evirIdx+1;
05797                 unH=toIdx;
05798         }
05799 
05800         for(int isend=ackL; isend<=ackH; isend++) {
05801                 //send PmeAckMsg
05802         CmiEnableUrgentSend(1);
05803                 int ib = send_order[isend];
05804                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
05805                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05806                 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
05807         CmiEnableUrgentSend(0);
05808     }
05809 
05810     CmiEnableUrgentSend(1);
05811         //send PmeUntransMsg with has_evir=1
05812         if(send_evir) {
05813                 int ib = send_order[evirIdx];
05814                 int nx = block1;
05815                 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
05816                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;              
05817                 msg->sourceNode = thisIndex.y;
05818                 msg->ny = ny;
05819                 float *md = msg->qgrid;
05820                 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
05821                         float *d = data + i*ny*nz*2;
05822                         for ( int j=0; j<ny; ++j, d += nz*2 ) {
05823                                 for ( int k=0; k<nz; ++k ) {
05824                                         *(md++) = d[2*k];
05825                                         *(md++) = d[2*k+1];
05826                                 }
05827                         }
05828                 }
05829                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05830 #if USE_NODE_PAR_RECEIVE
05831         msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
05832         initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
05833 #else
05834         initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
05835 #endif
05836          }
05837     CmiEnableUrgentSend(0);
05838         
05839         //send PmeUntransMsg with has_evir=0
05840         for(int isend=unL; isend<=unH; isend++) {
05841                 int ib = send_order[isend];
05842                 int nx = block1;
05843                 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
05844                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
05845                 msg->sourceNode = thisIndex.y;
05846                 msg->ny = ny;
05847                 float *md = msg->qgrid;
05848                 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
05849                         float *d = data + i*ny*nz*2;
05850                         for ( int j=0; j<ny; ++j, d += nz*2 ) {
05851                                 for ( int k=0; k<nz; ++k ) {
05852                                         *(md++) = d[2*k];
05853                                         *(md++) = d[2*k+1];
05854                                 }
05855                         }
05856                 }
05857                 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05858         CmiEnableUrgentSend(1);
05859 #if USE_NODE_PAR_RECEIVE
05860         msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
05861 #if Y_PERSIST 
05862         CmiUsePersistentHandle(&untrans_handle[isend], 1);
05863 #endif
05864         initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
05865 #if Y_PERSIST 
05866         CmiUsePersistentHandle(NULL, 0);
05867 #endif
05868 #else
05869 #if Y_PERSIST 
05870   //      CmiUsePersistentHandle(&untrans_handle[isend], 1);
05871 #endif
05872         initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
05873 #if Y_PERSIST 
05874    //     CmiUsePersistentHandle(NULL, 0);
05875 #endif
05876 #endif
05877         CmiEnableUrgentSend(0);
05878         }
05879 }
05880 
05881 void PmeXPencil::send_untrans() {
05882 
05883   { // send energy and virial
05884     int numGrids = 1;
05885     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
05886     newmsg->evir[0] = evir;
05887     SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
05888     CmiEnableUrgentSend(1);
05889     initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
05890     CmiEnableUrgentSend(0);
05891   }
05892 
05893 #if USE_PERSISTENT
05894   if (untrans_handle == NULL) setup_persistent();
05895 #endif
05896 #if     CMK_SMP && USE_CKLOOP
05897   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05898   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
05899      && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
05900                 int xBlocks = initdata.xBlocks;
05901                 int evirIdx = 0;
05902                 for ( int isend=0; isend<xBlocks; ++isend ) {
05903                         int ib = send_order[isend];
05904                         if (needs_reply[ib]) {
05905                                 evirIdx = isend;
05906                                 break;
05907                         }
05908                 }
05909 
05910                 //basically: 
05911                 //[0,evirIdx-1]->send PmeAckMsg
05912                 //evirIdx->send PmeUntransMsg with has_evir=1
05913                 //[evirIdx+1, xBlocks-1]->send PmeUntransMsg with has_evir=0
05914                 //send_subset_untrans(0, xBlocks-1, evirIdx);
05915 #if USE_NODE_PAR_RECEIVE
05916                 //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
05917                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
05918 #else
05919         //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
05920                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
05921 #endif        
05922                 return;
05923   }
05924 #endif
05925   int xBlocks = initdata.xBlocks;
05926   int block1 = initdata.grid.block1;
05927   int K1 = initdata.grid.K1;
05928   int send_evir = 1;
05929   for ( int isend=0; isend<xBlocks; ++isend ) {
05930     int ib = send_order[isend];
05931     if ( ! needs_reply[ib] ) {
05932       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
05933       CmiEnableUrgentSend(1);
05934       SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05935       initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
05936       CmiEnableUrgentSend(0);
05937       continue;
05938     }
05939     int nx = block1;
05940     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
05941     PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
05942     if ( send_evir ) {
05943       send_evir = 0;
05944     }
05945     msg->sourceNode = thisIndex.y;
05946     msg->ny = ny;
05947     float *md = msg->qgrid;
05948     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
05949      float *d = data + i*ny*nz*2;
05950      for ( int j=0; j<ny; ++j, d += nz*2 ) {
05951       for ( int k=0; k<nz; ++k ) {
05952         *(md++) = d[2*k];
05953         *(md++) = d[2*k+1];
05954       }
05955      }
05956     }
05957     SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05958 
05959     CmiEnableUrgentSend(1);
05960 #if USE_NODE_PAR_RECEIVE
05961     msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
05962 #if Y_PERSIST 
05963     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05964 #endif
05965     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
05966 #if Y_PERSIST 
05967     CmiUsePersistentHandle(NULL, 0);
05968 #endif
05969 #else
05970 #if Y_PERSIST 
05971     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05972 #endif
05973     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
05974 #if Y_PERSIST 
05975     CmiUsePersistentHandle(NULL, 0);
05976 #endif
05977 #endif
05978     CmiEnableUrgentSend(0);
05979   }
05980 }
05981 
05982 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
05983   int block2 = initdata.grid.block2;
05984   int K2 = initdata.grid.K2;
05985   int jb = msg->sourceNode;
05986   int ny = msg->ny;
05987   const float *md = msg->qgrid;
05988   float *d = data;
05989   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
05990 #if CMK_BLUEGENEL
05991     CmiNetworkProgress();
05992 #endif   
05993     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
05994       for ( int k=0; k<nz; ++k ) {
05995 #ifdef ZEROCHECK
05996         if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
05997                                     thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
05998 #endif
05999         d[2*(j*nz+k)] = *(md++);
06000         d[2*(j*nz+k)+1] = *(md++);
06001       }
06002     }
06003   }
06004 }
06005 
06006 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
06007         PmeYPencil *ypencil = (PmeYPencil *)param;
06008         ypencil->backward_subset_fft(first, last);
06009 }
06010 
06011 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
06012 #ifdef NAMD_FFTW
06013 #ifdef NAMD_FFTW_3
06014         for(int i=fromIdx; i<=toIdx; i++){
06015                 fftwf_execute_dft(backward_plan,        
06016                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2,         
06017                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
06018         }
06019 #endif
06020 #endif
06021 }
06022 
/// Backward FFT along y for this pencil's full x range.
/// Three compile-time paths: FFTW3 with optional CkLoop parallelization over
/// x-slabs, plain FFTW3 loop, or the legacy FFTW2 interface.
void PmeYPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  // Only parallelize with CkLoop when enough PEs exist relative to the pencil
  // grid; otherwise the overhead is presumably not worth it.
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
          CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
          return;
  }
#endif
  //the above is a transformation of the following loop using CkLoop
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
        CmiNetworkProgress();   // keep the network serviced on Blue Gene/L
#endif
    // In-place backward transform of one x-slab (nz * K2 complex values).
    fftwf_execute_dft(backward_plan,    
                                          ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
                                          ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
        // Legacy FFTW2 path: strided transform using the separate work buffer.
        for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
          CmiNetworkProgress();
#endif
                fftw(backward_plan, nz,
                ((fftw_complex *) data) + i * nz * initdata.grid.K2,
                nz, 1, (fftw_complex *) work, 1, 0);
        }
#endif

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
06064 
06065 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
06066         int evirIdx = paraNum;
06067         PmeYPencil *ypencil = (PmeYPencil *)param;
06068         ypencil->send_subset_untrans(first, last, evirIdx);
06069 }
06070 
/// Send the [fromIdx,toIdx] slice of this pencil's untranspose messages to
/// the z pencils.  The send_order index space is partitioned around evirIdx:
///   [0, evirIdx-1]        -> PmeAckMsg (no data needed by that target)
///   evirIdx               -> PmeUntransMsg carrying the energy/virial slot
///   [evirIdx+1, toIdx]    -> PmeUntransMsg without energy/virial
/// This routine may be invoked concurrently on disjoint ranges via CkLoop.
void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
        int yBlocks = initdata.yBlocks;  // NOTE(review): unused in this routine
        int block2 = initdata.grid.block2;      
        int K2 = initdata.grid.K2;

        // Empty ranges are encoded as low > high so the loops below no-op.
        int ackL=0, ackH=-1;
        int unL=0, unH=-1;
        int send_evir=0;
        if(fromIdx >= evirIdx+1) {
                //send PmeUntransMsg with has_evir=0
                unL = fromIdx;
                unH = toIdx;            
        } else if(toIdx <= evirIdx-1) {
                //send PmeAckMsg
                ackL=fromIdx;
                ackH=toIdx;             
        } else {
                //partially send PmeAckMsg and partially send PmeUntransMsg
                ackL=fromIdx;
                ackH=evirIdx-1;
                send_evir=1;
                unL=evirIdx+1;
                unH=toIdx;
        }

        // Acks: targets that requested no data only need to be released.
        for(int isend=ackL; isend<=ackH; isend++) {
                //send PmeAckMsg
        CmiEnableUrgentSend(1);
                int jb = send_order[isend];
                PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
                SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
                initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
        CmiEnableUrgentSend(0);
        }

    CmiEnableUrgentSend(1);
        //send PmeUntransMsg with has_evir=1
        if(send_evir) {
                int jb = send_order[evirIdx];
                int ny = block2;
                if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // clamp last y block
                PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;              
                msg->sourceNode = thisIndex.z;
                // The receiver (PmeZPencil::recv_untrans) reads msg->ny as the
                // z-extent of the block, which for this pencil is nz.
                msg->ny = nz;
                float *md = msg->qgrid;
                const float *d = data;
                // Pack the jb-th y block of every x-slab, interleaved (re,im).
                for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
                        for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
                                for ( int k=0; k<nz; ++k ) {
                                        *(md++) = d[2*(j*nz+k)];
                                        *(md++) = d[2*(j*nz+k)+1];
                                }
                        }
                }
                SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
#if USE_NODE_PAR_RECEIVE
        msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
    //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
        initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
#else
        initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
#endif
        }

    CmiEnableUrgentSend(0);
        //send PmeUntransMsg with has_evir=0
        for(int isend=unL; isend<=unH; isend++) {
                int jb = send_order[isend];
                int ny = block2;
                if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // clamp last y block
                PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
                msg->sourceNode = thisIndex.z;
                msg->ny = nz;  // z-extent for the receiver (see note above)
                float *md = msg->qgrid;
                const float *d = data;
                for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
                        for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
                                for ( int k=0; k<nz; ++k ) {
                                        *(md++) = d[2*(j*nz+k)];
                                        *(md++) = d[2*(j*nz+k)+1];
                                }
                        }
                }
                SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
            CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
        msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
        //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
#if Z_PERSIST 
        CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
#if Z_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if Z_PERSIST 
        CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
#if Z_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
        }
}
06178 
/// Send all untranspose messages from this y pencil down to the z pencils.
/// With CkLoop enabled (and enough PEs) the work is delegated to
/// send_subset_untrans over the whole send_order range; otherwise the
/// sequential loop below sends an ack to each target that needs no data and
/// a packed PmeUntransMsg to each that does.
void PmeYPencil::send_untrans() {
#if USE_PERSISTENT
  if (untrans_handle == NULL) setup_persistent();  // lazily create handles
#endif
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
          int yBlocks = initdata.yBlocks;
          // Find the first target (in send order) that needs data; that slot
          // carries the energy/virial in the partitioned send.
          int evirIdx = 0;
          for ( int isend=0; isend<yBlocks; ++isend ) {
                  int jb = send_order[isend];
                  if (needs_reply[jb]) {
                          evirIdx = isend;
                          break;
                  }
          }

          //basically: 
          //[0,evirIdx-1]->send PmeAckMsg
          //evirIdx->send PmeUntransMsg with has_evir=1
          //[evirIdx+1, yBlocks-1]->send PmeUntransMsg with has_evir=0
          //send_subset_untrans(0, yBlocks-1, evirIdx);
#if USE_NODE_PAR_RECEIVE      
          //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
          CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
      evir = 0.;             // reset accumulator for the next cycle
      CmiMemoryWriteFence(); // publish the reset before other threads read it
#else
      //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
          CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
#endif
          return;
  }
#endif
  // Sequential fallback path.
  int yBlocks = initdata.yBlocks;
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int send_evir = 1;
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];
    if ( ! needs_reply[jb] ) {
      // Target needs no data: release it with an ack only.
      PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
      CmiEnableUrgentSend(1);
      SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
      initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
      CmiEnableUrgentSend(0);
      continue;
    }
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // clamp last y block
    PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
    // First data-carrying message is marked (flag cleared afterwards).
    if ( send_evir ) {
      send_evir = 0;
    }
    msg->sourceNode = thisIndex.z;
    // Receiver (PmeZPencil::recv_untrans) reads msg->ny as the block's
    // z-extent, which for this pencil is nz.
    msg->ny = nz;
    float *md = msg->qgrid;
    const float *d = data;
    // Pack the jb-th y block of each x-slab as interleaved (re,im) pairs.
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*(j*nz+k)];
        *(md++) = d[2*(j*nz+k)+1];
      }
     }
    }
    SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
    //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
#if Z_PERSIST 
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
#if Z_PERSIST
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if Z_PERSIST 
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
#if Z_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif    
    CmiEnableUrgentSend(0);
  }
  
#if USE_NODE_PAR_RECEIVE
  evir = 0.;              // reset accumulator for the next cycle
  CmiMemoryWriteFence();  // publish the reset before other threads read it
#endif
}
06276 
/// Scatter one z-block of untransposed data from a y pencil into this
/// z pencil's local grid.  msg->sourceNode is the sender's z-block index;
/// msg->ny is (by the sender's convention) the z-extent of the block.
void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
#if ! USE_NODE_PAR_RECEIVE
    // First message of the cycle: reset the energy/virial accumulator.
    if(imsg==0) evir=0.;
#endif

  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  int kb = msg->sourceNode;  // sender's z-block index
  // NOTE: this local nz shadows the pencil's member nz; here it is the
  // z-extent of the incoming block.
  int nz = msg->ny;
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif   
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
#ifdef ZEROCHECK
        if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
                                    thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
#endif
        d[2*k] = *(md++);      // real part
        d[2*k+1] = *(md++);    // imaginary part
      }
    }
  }
}
06304 
/// Backward (complex-to-real) FFT along z for this pencil.
/// Paths: FFTW3 with optional CkLoop over pre-built per-slab plans,
/// plain FFTW3 single plan, or the legacy FFTW2 rfftwnd interface.
/// With FFTCHECK enabled, verifies the round-tripped grid against the
/// synthetic pattern seeded elsewhere in the FFT test path.
void PmeZPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
          //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
          //transform the above loop
          CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
          return;
  }
#endif
  fftwf_execute(backward_plan);
#else
  // Legacy FFTW2: batched complex-to-real transform over all nx*ny columns.
  rfftwnd_complex_to_real(backward_plan, nx*ny,
            (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
  
#if CMK_BLUEGENEL
  CmiNetworkProgress();
#endif

#ifdef FFTCHECK
  // Self-test: the forward+backward FFT scales by K1*K2*K3, so rescale and
  // compare against the expected synthetic value; report the worst error.
  int dim3 = initdata.grid.dim3;
  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  float scale = 1. / (1. * K1 * K2 * K3);
  float maxerr = 0.;
  float maxstd = 0.;
  int mi, mj, mk;  mi = mj = mk = -1;
  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
  const float *d = data;
  for ( int i=0; i<nx; ++i ) {
   for ( int j=0; j<ny; ++j, d += dim3 ) {
    for ( int k=0; k<K3; ++k ) {
      float std = 10. * (10. * (10. * std_base + i) + j) + k;
      float err = scale * d[k] - std;
      if ( fabsf(err) > fabsf(maxerr) ) {
        maxerr = err;
        maxstd = std;
        mi = i;  mj = j;  mk = k;
      }
    }
   }
  }
  CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
                thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
#endif

}
06365 
06366 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
06367         //to take advantage of the interface which allows 3 user params at most.
06368         //under such situtation, no new parameter list needs to be created!! -Chao Mei
06369         int specialIdx = paraNum;
06370         PmeZPencil *zpencil = (PmeZPencil *)param;
06371         zpencil->send_subset_ungrid(first, last, specialIdx);
06372 }
06373 
/// Return all buffered grid messages to their sources, optionally spreading
/// the sends across node threads with CkLoop.  evirIdx is the index of the
/// first data-carrying message (kept from the pre-CkLoop transformation shown
/// in the comment below; send_subset_ungrid currently ignores it).
void PmeZPencil::send_all_ungrid() {
/* 
//Original code: the transformation is to first extract the msg 
//idx that will has evir value set. -Chao Mei  
        int send_evir = 1;
        for (int imsg=0; imsg < grid_msgs.size(); ++imsg ) {
                PmeGridMsg *msg = grid_msgs[imsg];
                if ( msg->hasData ) {
                        if ( send_evir ) {
                                msg->evir[0] = evir;
                                send_evir = 0;
                        } else {
                                msg->evir[0] = 0.;
                        }
                }
                send_ungrid(msg);
        }
*/
        // Locate the first message that actually carries data.
        int evirIdx = 0;
        for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
                if(grid_msgs[imsg]->hasData) {
                        evirIdx = imsg;
                        break;
                }
        }

#if     CMK_SMP && USE_CKLOOP
        int useCkLoop = Node::Object()->simParameters->useCkLoop;
        if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
           && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
                //????What's the best value for numChunks?????
#if USE_NODE_PAR_RECEIVE        
                //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1); //has to sync
                CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
#else
        //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 0); //not sync
                CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0); //not sync
#endif        
                return;
        }
#endif
        // Sequential fallback: send the whole range from this thread.
        send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
}
06417 
06418 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
06419         for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
06420                 PmeGridMsg *msg = grid_msgs[imsg];
06421                 send_ungrid(msg);
06422         }
06423 }
06424 
/// Return one grid message to its source PE.  If the source sent no data,
/// the message is replaced by a plain ack.  Otherwise the message's qgrid is
/// filled with the local real-space values selected by its zlist, for every
/// (i,j) column flagged in its fgrid, then routed either to the node-level
/// CUDA receiver (offload) or directly to the source PE's manager.
void PmeZPencil::send_ungrid(PmeGridMsg *msg) {

#ifdef NAMD_CUDA
  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
#else
  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
#endif

  int pe = msg->sourceNode;
  if ( ! msg->hasData ) {
    // No data to return: free the incoming message and just acknowledge.
    delete msg;
    PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
    SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    initdata.pmeProxy[pe].recvAck(ackmsg);
    CmiEnableUrgentSend(0);
    return;
  }
  // Re-stamp sourceNode with this pencil's flattened (x,y) index so the
  // receiver knows which pencil the data came from.
  msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
  int dim3 = initdata.grid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  char *fmsg = msg->fgrid;
  float *qmsg = msg->qgrid;
  float *d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif    
    for ( int i=0; i<nx; ++i ) {
      for ( int j=0; j<ny; ++j, d += dim3 ) {
        if( *(fmsg++) ) {
          // Column flagged by the requester: copy only the z entries it asked for.
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = d[zlist[k]];
          }
        }
      }
    }
  }
  SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
#ifdef NAMD_CUDA
    if ( offload ) {
      initdata.pmeNodeProxy[CkNodeOf(pe)].recvUngrid(msg);
    } else
#endif
  initdata.pmeProxy[pe].recvUngrid(msg);
    CmiEnableUrgentSend(0);
}
06475 
/// Node-level entry point for an incoming grid message.  Deposits the data,
/// counts arrivals, and when the last expected message lands runs the forward
/// FFT (if any message carried data) and starts the transpose.  The
/// fftw_plan_lock plus the memory fences serialize this against other threads
/// on the node when USE_NODE_PAR_RECEIVE is enabled.
void PmeZPencil::node_process_grid(PmeGridMsg *msg)
{
#if USE_NODE_PAR_RECEIVE
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  CmiMemoryReadFence();
#endif
  recv_grid(msg);                       // deposit this message's charges
  if(msg->hasData) hasData=msg->hasData; // latch: any data-carrying msg sets it
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg); // limsg = arrival count before this one
  grid_msgs[limsg] = msg;               // keep msg for the later ungrid reply
  //  CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);      
  if(limsg+1 == grid_msgs.size())       // last expected grid message arrived
    {

      if (hasData)
        {
          forward_fft();
        }
      send_trans();
      imsg=0;                           // reset arrival counter for next cycle
      CmiMemoryWriteFence();
      //      CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
    }
#if USE_NODE_PAR_RECEIVE
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
  CmiMemoryWriteFence();
#endif
}
06505 
/// Node-level entry point for an incoming untranspose message.  Deposits the
/// data, counts arrivals against the expected zBlocks, and on the last one
/// runs the backward FFT (if this pencil ever saw data) and returns all
/// buffered grid messages, then resets per-cycle state and clears the grid.
void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
{
  recv_untrans(msg);                    // scatter this block into local data
#if USE_NODE_PAR_RECEIVE
  CmiMemoryWriteFence();
  CmiLock(ComputePmeMgr::fftw_plan_lock);
#endif    
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg); // limsg = arrivals before this one
  if(limsg+1 == initdata.zBlocks)       // last expected untrans block arrived
    {
#if USE_NODE_PAR_RECEIVE
      CmiMemoryReadFence();
#endif    
      if(hasData) // maybe this should be an assert
        {
          backward_fft();
        }
        
        send_all_ungrid();
    /*  int send_evir = 1;
      // TODO: this part should use Chao's output parallelization
      for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
        PmeGridMsg *omsg = grid_msgs[limsg];
        if ( omsg->hasData ) {
          if ( send_evir ) {
            omsg->evir[0] = evir;
            send_evir = 0;
          } else {
            omsg->evir[0] = 0.;
          }
        }
        send_ungrid(omsg);
      } */
      imsgb=0;                          // reset arrival counter for next cycle
      evir = 0;                         // reset energy/virial accumulator
      // Zero the local grid so the next cycle starts from a clean slate.
      memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3); 
      CmiMemoryWriteFence();
      //      CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
    }
#if USE_NODE_PAR_RECEIVE
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#endif
}
06550 
06551 
06552 #include "ComputePmeMgr.def.h"
06553 

Generated on Sat Sep 23 01:17:13 2017 for NAMD by  doxygen 1.4.7