ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 //#define MANUAL_DEBUG_FFTW3 1
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012 // fftw2 doesn't have these defined
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024 
00025 #include <vector>
00026 #include <algorithm>
00027 #include <deque>
00028 using namespace std;
00029 
00030 #include "InfoStream.h"
00031 #include "Node.h"
00032 #include "PatchMap.h"
00033 #include "PatchMap.inl"
00034 #include "AtomMap.h"
00035 #include "ComputePme.h"
00036 #include "ComputePmeMgr.decl.h"
00037 #include "PmeBase.inl"
00038 #include "PmeRealSpace.h"
00039 #include "PmeKSpace.h"
00040 #include "ComputeNonbondedUtil.h"
00041 #include "PatchMgr.h"
00042 #include "Molecule.h"
00043 #include "ReductionMgr.h"
00044 #include "ComputeMgr.h"
00045 #include "ComputeMgr.decl.h"
00046 // #define DEBUGM
00047 #define MIN_DEBUG_LEVEL 3
00048 #include "Debug.h"
00049 #include "SimParameters.h"
00050 #include "WorkDistrib.h"
00051 #include "varsizemsg.h"
00052 #include "Random.h"
00053 #include "ckhashtable.h"
00054 #include "Priorities.h"
00055 
00056 #include "ComputeMoa.h"
00057 #include "ComputeMoaMgr.decl.h" 
00058 
00059 //#define     USE_RANDOM_TOPO         1
00060 
00061 //#define USE_TOPO_SFC                    1
00062 //#define     USE_CKLOOP                1
00063 //#include "TopoManager.h"
00064 
00065 #include "DeviceCUDA.h"
00066 #ifdef NAMD_CUDA
00067 #include <cuda_runtime.h>
00068 #include <cuda.h>
00069 void cuda_errcheck(const char *msg);
00070 #ifdef WIN32
00071 #define __thread __declspec(thread)
00072 #endif
00073 extern __thread DeviceCUDA *deviceCUDA;
00074 #endif
00075 
00076 #include "ComputePmeCUDAKernel.h"
00077 
00078 #ifndef SQRT_PI
00079 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00080 #endif
00081 
00082 #if CMK_PERSISTENT_COMM 
00083 #define USE_PERSISTENT      1
00084 #endif
00085 
00086 #if USE_PERSISTENT
00087 #define Z_PERSIST 1
00088 #define Y_PERSIST 1
00089 #define X_PERSIST 1
00090 #endif
00091 
00092 #if defined(NAMD_CUDA) && defined(MEM_OPT_VERSION)
00093 #define USE_NODE_PAR_RECEIVE    1
00094 #endif
00095 
// Per-PE flag array (one char per PE): nonzero if that PE hosts a PME pencil.
// Allocated and filled during ComputePmeMgr::initialize(); read by isPmeProcessor().
00096 char *pencilPMEProcessors;
00097 
// Empty acknowledgement message used to signal completion between PME objects.
00098 class PmeAckMsg : public CMessage_PmeAckMsg {
00099 };
00100 
// Message carrying one sender's slab of real-space charge-grid data
// (and later, the result grid on the way back down).
00101 class PmeGridMsg : public CMessage_PmeGridMsg {
00102 public:
00103 
00104   int sourceNode;   // PE that produced this grid contribution
00105   int sequence;     // timestep sequence number, used for message priorities
00106   int hasData;      // nonzero if qgrid payload is meaningful
00107   Lattice lattice;  // cell lattice for this step
00108   int start;        // first grid plane covered by this message
00109   int len;          // number of grid planes covered
00110   int zlistlen;     // length of zlist
00111   int *zlist;       // indices of z-columns present — TODO confirm exact layout against pack/unpack code
00112   char *fgrid;      // per-point occupancy flags — presumably parallels f_arr; verify against copyResults()
00113   float *qgrid;     // packed charge-grid values
00114   CkArrayIndex3D destElem;  // destination pencil element (pencil decomposition only)
00115 };
00116 
// Message carrying grid data for the first (forward) transpose,
// from grid PEs to transpose PEs (or between pencils).
00117 class PmeTransMsg : public CMessage_PmeTransMsg {
00118 public:
00119 
00120   int sourceNode;   // sending PE
00121   int sequence;     // timestep sequence number, used for priorities
00122   int hasData;      // nonzero if qgrid payload is meaningful
00123   Lattice lattice;  // cell lattice for this step
00124   int x_start;      // first x index of the transposed slab
00125   int nx;           // number of x planes in this message
00126   float *qgrid;     // packed (complex) grid data
00127   CkArrayIndex3D destElem;  // destination pencil element (pencil decomposition only)
00128 };
00129 
// Node-level wrapper that lets several PEs on an SMP node share a single
// PmeTransMsg; count/lock manage the shared reference so the last user frees it.
00130 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
00131 public:
00132   PmeTransMsg *msg;   // the shared underlying message
00133   int *count;         // shared remaining-reference count
00134   CmiNodeLock lock;   // protects *count across ranks of the node
00135 };
00136 
// Message carrying grid data for the reverse (un-)transpose,
// from transpose PEs back toward grid PEs.
00137 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00138 public:
00139 
00140   int sourceNode;   // sending PE
00141   int y_start;      // first y index of the slab being returned
00142   int ny;           // number of y planes in this message
00143   float *qgrid;     // packed (complex) grid data
00144   CkArrayIndex3D destElem;  // destination pencil element (pencil decomposition only)
00145 };
00146 
// Node-level wrapper sharing a single PmeUntransMsg among PEs of an SMP node;
// count/lock manage the shared reference so the last user frees it.
00147 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
00148 public:
00149   PmeUntransMsg *msg;  // the shared underlying message
00150   int *count;          // shared remaining-reference count
00151   CmiNodeLock lock;    // protects *count across ranks of the node
00152 };
00153 
// Message delivering reciprocal-space energy/virial contributions
// to the PE chosen by findRecipEvirPe().
00154 class PmeEvirMsg : public CMessage_PmeEvirMsg {
00155 public:
00156   PmeReduction *evir;  // varsize array of per-grid energy/virial accumulators
00157 };
00158 
00159 class PmePencilMap : public CBase_PmePencilMap {
00160 public:
00161   PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00162     : ia(i_a), ib(i_b), nb(n_b),
00163       size(n), data(newcopyint(n,d)) {
00164   }
00165   virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00166     //Return an ``arrayHdl'', given some information about the array
00167     return 0;
00168   }
00169   virtual int procNum(int, const CkArrayIndex &i) {
00170     //Return the home processor number for this element of this array
00171     return data[ i.data()[ia] * nb + i.data()[ib] ];
00172   }
00173   virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00174     int mype = CkMyPe();
00175     for ( int i=0; i < size; ++i ) {
00176       if ( data[i] == mype ) {
00177         CkArrayIndex3D ai(0,0,0);
00178         ai.data()[ia] = i / nb;
00179         ai.data()[ib] = i % nb;
00180         if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00181         if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00182         mgr->insertInitial(ai,msg);
00183         msg = 0;
00184       }
00185     }
00186     mgr->doneInserting();
00187     if ( msg ) CkFreeMsg(msg);
00188   }
00189 private:
00190   const int ia, ib, nb, size;
00191   const int* const data;
00192   static int* newcopyint(int n, int *d) {
00193     int *newd = new int[n];
00194     memcpy(newd, d, n*sizeof(int));
00195     return newd;
00196   }
00197 };
00198 
00199 // use this idiom since messages don't have copy constructors
// Plain-old-data payload for PmePencilInitMsg: grid description, pencil
// counts, and all proxies a newly created pencil needs.
00200 struct PmePencilInitMsgData {
00201   PmeGrid grid;                      // PME grid dimensions/parameters
00202   int xBlocks, yBlocks, zBlocks;     // pencil counts along each axis
00203   CProxy_PmeXPencil xPencil;
00204   CProxy_PmeYPencil yPencil;
00205   CProxy_PmeZPencil zPencil;
00206   CProxy_ComputePmeMgr pmeProxy;
00207   CProxy_NodePmeMgr pmeNodeProxy;
00208   CProxy_PmePencilMap xm;
00209   CProxy_PmePencilMap ym;
00210   CProxy_PmePencilMap zm;
00211 };
00212 
// Pencil-initialization message; wraps PmePencilInitMsgData because Charm++
// messages do not have copy constructors (see comment at the struct above).
00213 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00214 public:
00215    PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00216    PmePencilInitMsgData data;
00217 };
00218 
00219 
// Per-PE slab extents for the slab decomposition: the x-range owned before
// the transpose and the y-range owned after it.
00220 struct LocalPmeInfo {
00221   int nx, x_start;
00222   int ny_after_transpose, y_start_after_transpose;
00223 };
00224 
// Per-node grouping of PME PEs: PE count, first PE, and the physical node id.
00225 struct NodePmeInfo {
00226   int npe, pe_start, real_node;
00227 };
00228 
00229 
00230 static int findRecipEvirPe() {
00231     PatchMap *patchMap = PatchMap::Object();
00232     {
00233       int mype = CkMyPe();
00234       if ( patchMap->numPatchesOnNode(mype) ) {
00235         return mype; 
00236       }
00237     }
00238     {
00239       int node = CmiMyNode();
00240       int firstpe = CmiNodeFirst(node);
00241       int nodeSize = CmiNodeSize(node);
00242       int myrank = CkMyRank();
00243       for ( int i=0; i<nodeSize; ++i ) {
00244         int pe = firstpe + (myrank+i)%nodeSize;
00245         if ( patchMap->numPatchesOnNode(pe) ) {
00246           return pe;
00247         }
00248       }
00249     }
00250     {
00251       int *pelist;
00252       int nodeSize;
00253       CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
00254       int myrank;
00255       for ( int i=0; i<nodeSize; ++i ) {
00256         if ( pelist[i] == CkMyPe() ) myrank = i;
00257       }
00258       for ( int i=0; i<nodeSize; ++i ) {
00259         int pe = pelist[(myrank+i)%nodeSize];
00260         if ( patchMap->numPatchesOnNode(pe) ) {
00261           return pe;
00262         }
00263       }
00264     }
00265     {
00266       int mype = CkMyPe();
00267       int npes = CkNumPes();
00268       for ( int i=0; i<npes; ++i ) {
00269         int pe = (mype+i)%npes;
00270         if ( patchMap->numPatchesOnNode(pe) ) {
00271           return pe;
00272         }
00273       }
00274     }
00275     NAMD_bug("findRecipEvirPe() failed!");
00276     return -999;  // should never happen
00277 }
00278 
00279 
00280 //Assigns gridPeMap and transPeMap to different set of processors.
00281 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00282   int ncpus = CkNumPes();
00283   
      // grid PEs: take the last numGridPes entries of the diffuse ordering
00284   for ( int i=0; i<numGridPes; ++i ) {
00285     gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
00286   }
00287   std::sort(gridPeMap,gridPeMap+numGridPes);
      // trans PEs: take the numTransPes entries just before the grid PEs
00288   int firstTransPe = ncpus - numGridPes - numTransPes;
00289   if ( firstTransPe < 0 ) {
00290     firstTransPe = 0;
00291     // 0 should be first in list, skip if possible
00292     if ( ncpus > numTransPes ) firstTransPe = 1;
00293   }
00294   for ( int i=0; i<numTransPes; ++i ) {
00295     transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
00296   }
00297   std::sort(transPeMap,transPeMap+numTransPes);
00298 }
00299 
00300 #if USE_TOPOMAP 
00301 //Topology aware PME allocation
00302 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00303                              int nbpes=0);
00304 #endif
00305 
00306 
// Three-way comparison of a and b under bit-reversed ordering:
// negative if a precedes b, zero if equal, positive if a follows b.
// The order is decided by the lowest-order bit in which a and b differ.
int compare_bit_reversed(int a, int b) {
  int diff = a ^ b;
  int mask = 1;
  if ( diff ) {
    // advance mask to the lowest set bit of diff
    while ( (diff & mask) == 0 ) mask <<= 1;
  }
  return (a & mask) - (b & mask);
}
00315 
// Strict weak ordering of ints by their bit-reversed value:
// a < b iff the lowest-order differing bit is set in b.
inline bool less_than_bit_reversed(int a, int b) {
  int diff = a ^ b;
  if ( diff == 0 ) return false;  // equal values never compare less
  int mask = 1;
  while ( (diff & mask) == 0 ) mask <<= 1;  // lowest differing bit
  return (b & mask) != 0;
}
00324 
// Comparator functor for std::sort and friends: orders ints bit-reversed.
00325 struct sortop_bit_reversed {
00326   inline bool operator() (int a, int b) const {
00327     return less_than_bit_reversed(a,b);
00328   }
00329 };
00330 
// Simple (i,j) index pair, used to identify a pencil within the 2-D pencil grid.
struct ijpair {
  int i,j;
  ijpair() {}                              // members intentionally left uninitialized
  ijpair(int ii, int jj) : i(ii), j(jj) {}
};
00336 
// Comparator functor ordering ijpairs lexicographically, with each
// component compared in bit-reversed order (i first, then j).
00337 struct ijpair_sortop_bit_reversed {
00338   inline bool operator() (const ijpair &a, const ijpair &b) const {
00339     return ( less_than_bit_reversed(a.i,b.i)
00340              || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
00341   }
00342 };
00343 
// Per-PE manager for PME reciprocal-space work.  Owns the local charge
// grid, drives the grid -> trans -> untrans -> ungrid message flow for the
// slab decomposition (and hands off to pencils when usePencils is set),
// and accumulates energy/virial reductions.  One branch exists on every PE.
00344 class ComputePmeMgr : public CBase_ComputePmeMgr {
00345 public:
00346   friend class ComputePme;
00347   friend class NodePmeMgr;
00348   ComputePmeMgr();
00349   ~ComputePmeMgr();
00350 
      // ---- setup entry points (run at startup, mostly via quiescence) ----
00351   void initialize(CkQdMsg*);
00352   void initialize_pencils(CkQdMsg*);
00353   void activate_pencils(CkQdMsg*);
00354   void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00355   void initialize_computes();
00356 
      // ---- per-step data movement: charge grid out, forces back ----
00357   void sendData(Lattice &, int sequence);
00358   void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
      // helpers used when sendData is split across calls; saved arguments
00359   Lattice *sendDataHelper_lattice;
00360   int sendDataHelper_sequence;
00361   int sendDataHelper_sourcepe;
00362   int sendDataHelper_errors;
00363   void sendPencils(Lattice &, int sequence);
00364   void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
00365   void recvGrid(PmeGridMsg *);
00366   void gridCalc1(void);
00367   void sendTransBarrier(void);
00368   void sendTransSubset(int first, int last);
00369   void sendTrans(void);
00370   void fwdSharedTrans(PmeTransMsg *);
00371   void recvSharedTrans(PmeSharedTransMsg *);
00372   void sendDataHelper(int);
00373   void sendPencilsHelper(int);
00374   void recvTrans(PmeTransMsg *);
00375   void procTrans(PmeTransMsg *);
00376   void gridCalc2(void);
00377   #ifdef OPENATOM_VERSION
00378   void gridCalc2Moa(void);
00379   #endif // OPENATOM_VERSION
00380   void gridCalc2R(void);
00381   void fwdSharedUntrans(PmeUntransMsg *);
00382   void recvSharedUntrans(PmeSharedUntransMsg *);
00383   void sendUntrans(void);
00384   void sendUntransSubset(int first, int last);
00385   void recvUntrans(PmeUntransMsg *);
00386   void procUntrans(PmeUntransMsg *);
00387   void gridCalc3(void);
00388   void sendUngrid(void);
00389   void sendUngridSubset(int first, int last);
00390   void recvUngrid(PmeGridMsg *);
00391   void recvAck(PmeAckMsg *);
00392   void copyResults(PmeGridMsg *);
00393   void copyPencils(PmeGridMsg *);
00394   void ungridCalc(void);
00395   void recvRecipEvir(PmeEvirMsg *);
00396   void addRecipEvirClient(void);
00397   void submitReductions();
00398 
00399 #if 0 && USE_PERSISTENT
00400   void setup_recvgrid_persistent();
00401 #endif
00402 
      // node-wide lock serializing FFTW planner calls (planner is not thread safe)
00403   static CmiNodeLock fftw_plan_lock;
00404   CmiNodeLock pmemgr_lock;  // for accessing this object from other threads
00405 
00406 #ifdef NAMD_CUDA
      // host/device staging buffers for atom data and forces (offload path)
00407   float *a_data_host;
00408   float *a_data_dev;
00409   float *f_data_host;
00410   float *f_data_dev;
00411   int cuda_atoms_count;
00412   int cuda_atoms_alloc;
00413   static CmiNodeLock cuda_lock;
00414   void chargeGridSubmitted(Lattice &lattice, int sequence);
00415   cudaEvent_t end_charges;
00416   cudaEvent_t *end_forces;
00417   int forces_count;
00418   int forces_done_count;
00419   double charges_time;
00420   double forces_time;
00421   int check_charges_count;
00422   int check_forces_count;
00423   int master_pe;
00424   int this_pe;
00425 
00426   void cuda_submit_charges(Lattice &lattice, int sequence);
00427   struct cuda_submit_charges_args {
00428     ComputePmeMgr *mgr; Lattice *lattice; int sequence;
00429   };
      // node-wide queue of pending charge submissions while the device is busy
00430   static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
00431   static bool cuda_busy;
00432 
00433   int chargeGridSubmittedCount;
00434   void sendChargeGridReady();
00435 #endif
00436   Lattice *saved_lattice;  // saved by chargeGridSubmitted
00437   int saved_sequence;      // saved by chargeGridSubmitted
00438   void pollChargeGridReady();
00439   void pollForcesReady();
00440   void recvChargeGridReady();
00441   void chargeGridReady(Lattice &lattice, int sequence);
00442 
      // the ComputePme objects (one per patch) registered on this PE
00443   ResizeArray<ComputePme*> pmeComputes;
00444 
00445 private:
00446 
00447 #if 0 && USE_PERSISTENT
00448   PersistentHandle   *recvGrid_handle;
00449 #endif
00450 
00451   CProxy_ComputePmeMgr pmeProxy;
00452   CProxy_ComputePmeMgr pmeProxyDir;
00453   CProxy_NodePmeMgr pmeNodeProxy;
00454   NodePmeMgr *nodePmeMgr;
00455   ComputePmeMgr *masterPmeMgr;
00456   
      // lazily triggers full initialization on the first registered compute
00457   void addCompute(ComputePme *c) {
00458     if ( ! pmeComputes.size() ) initialize_computes();
00459     pmeComputes.add(c);
00460     c->setMgr(this);
00461   }
00462 
00463   ResizeArray<ComputePme*> heldComputes;
00464   PmeGrid myGrid;
00465   Lattice lattice;
00466   PmeKSpace *myKSpace;
00467   float *qgrid;   // real-space charge grid (local portion)
00468   float *kgrid;   // reciprocal-space grid (local portion)
00469 
00470 #ifdef NAMD_FFTW
00471 #ifdef NAMD_FFTW_3
00472   fftwf_plan *forward_plan_x, *backward_plan_x;
00473   fftwf_plan *forward_plan_yz, *backward_plan_yz;
00474   fftwf_complex *work;
00475 #else
00476   fftw_plan forward_plan_x, backward_plan_x;
00477   rfftwnd_plan forward_plan_yz, backward_plan_yz;
00478   fftw_complex *work;
00479 #endif
00480 #else
00481   float *work;
00482 #endif
00483 
00484   int qsize, fsize, bsize;
      // flags and counts for alchemical FEP/TI, locally-enhanced sampling,
      // and pair-interaction modes; numGrids depends on which are active
00485   int alchOn, alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
00486   int alchDecouple;
00487   int offload;
00488   BigReal alchElecLambdaStart;
00489   BigReal alchLambda;  // set on each step in ComputePme::ungridForces()
00490 
00491   float **q_arr;
00492   // q_list and q_count not used for offload
00493   float **q_list;
00494   int q_count;
00495   char *f_arr;
00496   char *fz_arr;
00497   PmeReduction evir[PME_MAX_EVALS];
00498   SubmitReduction *reduction;
00499 
00500   int noWorkCount;
00501   int doWorkCount;
00502   int ungridForcesCount;
00503 
00504 #ifdef NAMD_CUDA
00505 #define NUM_STREAMS 1
00506   cudaStream_t streams[NUM_STREAMS];
00507   int stream;
00508 
      // device-side grid/scratch buffers for the CUDA offload path
00509   float **q_arr_dev;
00510   float **v_arr_dev;
00511   float *q_data_host;
00512   float *q_data_dev;
00513   float *v_data_dev;
00514   int *ffz_host;
00515   int *ffz_dev;
00516   int q_data_size;
00517   int ffz_size;
00518 
00519   int f_data_mgr_alloc;
00520   float *f_data_mgr_host;
00521   float *f_data_mgr_dev;
00522   float **afn_host;
00523   float **afn_dev;
00524 
00525   float *bspline_coeffs_dev;
00526   float *bspline_dcoeffs_dev;
00527 #endif
00528   int recipEvirCount;   // used in compute only
00529   int recipEvirClients; // used in compute only
00530   int recipEvirPe;      // used in trans only
00531   
      // slab-decomposition layout tables (indexed by PE or node)
00532   LocalPmeInfo *localInfo;
00533   NodePmeInfo *gridNodeInfo;
00534   NodePmeInfo *transNodeInfo;
00535   int qgrid_size;
00536   int qgrid_start;
00537   int qgrid_len;
00538   int fgrid_start;
00539   int fgrid_len;
00540 
00541   int numSources;
00542   int numGridPes;
00543   int numTransPes;
00544   int numGridNodes;
00545   int numTransNodes;
00546   int numDestRecipPes;
00547   int myGridPe, myGridNode;
00548   int myTransPe, myTransNode;
00549   int *gridPeMap;
00550   int *transPeMap;
00551   int *recipPeDest;
00552   int *gridPeOrder;
00553   int *gridNodeOrder;
00554   int *transNodeOrder;
      // countdowns gating each phase of the per-step pipeline
00555   int grid_count;
00556   int trans_count;
00557   int untrans_count;
00558   int ungrid_count;
00559   PmeGridMsg **gridmsg_reuse;
00560   PmeReduction recip_evir2[PME_MAX_EVALS];
00561 
00562   int compute_sequence;  // set from patch computes, used for priorities
00563   int grid_sequence;  // set from grid messages, used for priorities
00564   int useBarrier;
00565   int sendTransBarrier_received;
00566 
      // pencil-decomposition state (used instead of slabs when usePencils)
00567   int usePencils;
00568   int xBlocks, yBlocks, zBlocks;
00569   CProxy_PmeXPencil xPencil;
00570   CProxy_PmeYPencil yPencil;
00571   CProxy_PmeZPencil zPencil;
00572   char *pencilActive;
00573   ijpair *activePencils;
00574   int numPencilsActive;
00575   int strayChargeErrors;
00576 };
00577 
// Accessor giving external code a reference to the manager's compute list.
00578 ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
00579     return mgr->pmeComputes ;
00580 }
00581 
// Definitions of ComputePmeMgr's static (node-shared) members.
00582   CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00583 #ifdef NAMD_CUDA
00584   CmiNodeLock ComputePmeMgr::cuda_lock;
00585   std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
00586   bool ComputePmeMgr::cuda_busy;
00587 #endif
00588 
00589 int isPmeProcessor(int p){ 
00590   SimParameters *simParams = Node::Object()->simParameters;
00591   if (simParams->usePMECUDA) {
00592     return 0;
00593   } else {
00594     return pencilPMEProcessors[p];
00595   }
00596 }
00597 
// Node-level (one per SMP node) manager that routes PME messages to the
// proper per-PE ComputePmeMgr branch and tracks the pencil objects living
// on this node.
00598 class NodePmeMgr : public CBase_NodePmeMgr {
00599 public:
00600   friend class ComputePmeMgr;
00601   friend class ComputePme;
00602   NodePmeMgr();
00603   ~NodePmeMgr();
00604   void initialize();
00605   void sendDataHelper(int);
00606   void sendPencilsHelper(int);
00607   void recvTrans(PmeTransMsg *);
00608   void recvUntrans(PmeUntransMsg *);
      // pencil objects announce themselves so the node can dispatch to them
00609   void registerXPencil(CkArrayIndex3D, PmeXPencil *);
00610   void registerYPencil(CkArrayIndex3D, PmeYPencil *);
00611   void registerZPencil(CkArrayIndex3D, PmeZPencil *);
00612   void recvXTrans(PmeTransMsg *);
00613   void recvYTrans(PmeTransMsg *);
00614   void recvYUntrans(PmeUntransMsg *);
00615   void recvZGrid(PmeGridMsg *);
00616   void recvZUntrans(PmeUntransMsg *);
00617 
00618   void recvUngrid(PmeGridMsg *);
00619 
00620   void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
00621       xm=_xm; ym=_ym; zm=_zm;
00622   }
00623   CProxy_PmePencilMap xm;
00624   CProxy_PmePencilMap ym;
00625   CProxy_PmePencilMap zm;
00626 
00627 private:
00628   CProxy_ComputePmeMgr mgrProxy;
00629   ComputePmeMgr *mgrObject;     // rank-0 branch, set in initialize()
00630   ComputePmeMgr **mgrObjects;   // one entry per rank on this node
00631 #ifdef NAMD_CUDA
00632   ComputePmeMgr *masterPmeMgr;
00633   int master_pe;
00634 #endif
00635   CProxy_PmeXPencil xPencil;
00636   CProxy_PmeYPencil yPencil;
00637   CProxy_PmeZPencil zPencil;
      // index -> local pencil object; guarded by fftw_plan_lock in register*Pencil
00638   CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
00639   CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
00640   CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;  
00641 
00642 #ifdef NAMD_CUDA
00643   cudaEvent_t end_charge_memset;
00644   cudaEvent_t end_all_pme_kernels;
00645   cudaEvent_t end_potential_memcpy;
00646 #endif
00647 };
00648 
// Allocate one manager slot per rank on this node; filled in initialize().
00649 NodePmeMgr::NodePmeMgr() {
00650   mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
00651 }
00652 
// Release the per-rank manager table (the managers themselves are chares).
00653 NodePmeMgr::~NodePmeMgr() {
00654   delete [] mgrObjects;
00655 }
00656 
// Called on every rank: record this rank's ComputePmeMgr branch, and on
// rank 0 also cache the proxy and branch used for node-level forwarding.
00657 void NodePmeMgr::initialize() {
00658   CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
00659   mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
00660   if ( CkMyRank() == 0 ) {
00661     mgrProxy = proxy;
00662     mgrObject = proxy.ckLocalBranch();
00663   }
00664 }
00665 
// Forward a node-addressed transpose message to the rank-0 manager,
// which shares it among the ranks that need it.
00666 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
00667   mgrObject->fwdSharedTrans(msg);
00668 }
00669 
// Forward a node-addressed untranspose message to the rank-0 manager.
00670 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
00671   mgrObject->fwdSharedUntrans(msg);
00672 }
00673 
// Deliver returned grid data to the node's master manager; this node-level
// path only exists in the CUDA build.
00674 void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
00675 #ifdef NAMD_CUDA
00676   masterPmeMgr->recvUngrid(msg);
00677 #else
00678   NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
00679 #endif
00680 }
00681 
// Record a local X pencil in the node-wide table; the hashtable is not
// thread safe, so the insert is serialized with fftw_plan_lock.
00682 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
00683 {
00684   CmiLock(ComputePmeMgr::fftw_plan_lock);
00685   xPencilObj.put(idx)=obj;
00686   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00687 }
// Record a local Y pencil in the node-wide table (serialized, as for X).
00688 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
00689 {
00690   CmiLock(ComputePmeMgr::fftw_plan_lock);
00691   yPencilObj.put(idx)=obj;
00692   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00693 }
// Record a local Z pencil in the node-wide table (serialized, as for X).
00694 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
00695 {
00696   CmiLock(ComputePmeMgr::fftw_plan_lock);
00697   zPencilObj.put(idx)=obj;
00698   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00699 }
00700 
// Per-PE manager constructor: registers this branch with the node-level
// manager, creates the node locks (rank 0 only for the shared ones), zeroes
// the per-step state, and on CUDA builds creates streams/events and
// registers trace events.  Order matters: locks must exist before any
// other rank can touch shared state.
00701 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
00702                                  pmeProxyDir(thisgroup) {
00703 
00704   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00705   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00706   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00707 
00708   pmeNodeProxy.ckLocalBranch()->initialize();
00709 
      // node-shared locks are created once, by rank 0
00710   if ( CmiMyRank() == 0 ) {
00711     fftw_plan_lock = CmiCreateLock();
00712   }
00713   pmemgr_lock = CmiCreateLock();
00714 
00715   myKSpace = 0;
00716   kgrid = 0;
00717   work = 0;
00718   grid_count = 0;
00719   trans_count = 0;
00720   untrans_count = 0;
00721   ungrid_count = 0;
00722   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00723   useBarrier = 0;
00724   sendTransBarrier_received = 0;
00725   usePencils = 0;
00726 
00727 #ifdef NAMD_CUDA
00728  // offload has not been set so this happens on every run
00729   if ( CmiMyRank() == 0 ) {
00730     cuda_lock = CmiCreateLock();
00731   }
00732 
00733 #if CUDA_VERSION >= 5050
00734   int leastPriority, greatestPriority;
00735   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00736   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00737   //if ( CkMyNode() == 0 ) {
00738   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00739   //}
      // PME streams get the highest priority so its kernels preempt others
00740 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00741 #else
00742 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00743 #endif
00744 
00745   stream = 0;
00746   for ( int i=0; i<NUM_STREAMS; ++i ) {
00747 #if 1
00748     CUDA_STREAM_CREATE(&streams[i]);
00749     cuda_errcheck("cudaStreamCreate");
00750 #else
00751   streams[i] = 0;  // XXXX Testing!!!
00752 #endif
00753   }
00754 
00755   this_pe = CkMyPe();
00756  
      // timing disabled: the event is only used for ordering/completion polling
00757   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00758   end_forces = 0;
00759   check_charges_count = 0;
00760   check_forces_count = 0;
00761   chargeGridSubmittedCount = 0;
00762 
00763   cuda_atoms_count = 0;
00764   cuda_atoms_alloc = 0;
00765 
00766   f_data_mgr_alloc = 0;
00767   f_data_mgr_host = 0;
00768   f_data_mgr_dev = 0;
00769   afn_host = 0;
00770   afn_dev = 0;
00771 
00772 #define CUDA_EVENT_ID_PME_CHARGES 80
00773 #define CUDA_EVENT_ID_PME_FORCES 81
00774 #define CUDA_EVENT_ID_PME_TICK 82
00775 #define CUDA_EVENT_ID_PME_COPY 83
00776 #define CUDA_EVENT_ID_PME_KERNEL 84
00777   if ( 0 == CkMyPe() ) {
00778     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00779     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00780     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00781     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00782     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00783   }
00784 #endif
00785   recipEvirCount = 0;
00786   recipEvirClients = 0;
00787   recipEvirPe = -999;   // illegal value until assigned in initialize_computes()
00788 }
00789 
00790 
// Store the pencil array proxies on this branch; rank 0 also publishes
// them to the node-level manager for node-addressed delivery.
00791 void ComputePmeMgr::recvArrays(
00792         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00793   xPencil = x;  yPencil = y;  zPencil = z;
00794   
00795     if(CmiMyRank()==0)
00796     {
00797       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00798       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00799       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00800     }
00801 }
00802 
00803 #if USE_TOPO_SFC
 // Integer 3-D torus coordinate used by the space-filling-curve PE ordering.
00804  struct Coord
00805   {
00806     int x, y, z;
00807     Coord(): x(0), y(0), z(0) {}
00808     Coord(int a, int b, int c): x(a), y(b), z(c) {}
00809   };
  // Defined elsewhere: fills `result` with torus coordinates visited in
  // space-filling-curve order over an xdim x ydim x zdim grid.
00810   extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00811 
  // Reorder `procs` so PEs appear in the order their torus coordinates are
  // visited by the space-filling curve in `result`.  O(|result|*|procs|);
  // assumes each PE's coordinate appears in `result` exactly once.
00812   void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00813   {
00814      SortableResizeArray<int> newprocs(procs.size());
00815      int num = 0;
00816      for (int i=0; i<result.size(); i++) {
00817        Coord &c = result[i];
00818        for (int j=0; j<procs.size(); j++) {
00819          int pe = procs[j];
00820          int x,y,z,t;
00821          tmgr.rankToCoordinates(pe, x, y, z, t);    
00822          if (x==c.x && y==c.y && z==c.z)
00823            newprocs[num++] = pe;
00824        }
00825      } 
00826      CmiAssert(newprocs.size() == procs.size());
00827      procs = newprocs;
00828   }
00829 
00830   int find_level_grid(int x) 
00831   {
00832      int a = sqrt(x);
00833      int b;
00834      for (; a>0; a--) {
00835        if (x%a == 0) break;
00836      }
00837      if (a==1) a = x;
00838      b = x/a;
00839      //return a>b?a:b;
00840      return b;
00841   }
00842   CmiNodeLock tmgr_lock;
00843 #endif
00844 
// One-time module init: creates the topology-manager lock when the
// space-filling-curve PE ordering is compiled in; otherwise a no-op.
00845 void Pme_init()
00846 {
00847 #if USE_TOPO_SFC
00848   if (CkMyRank() == 0) 
00849     tmgr_lock = CmiCreateLock();
00850 #endif
00851 }
00852 
00853 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00854   delete msg;
00855 
00856   localInfo = new LocalPmeInfo[CkNumPes()];
00857   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00858   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00859   gridPeMap = new int[CkNumPes()];
00860   transPeMap = new int[CkNumPes()];
00861   recipPeDest = new int[CkNumPes()];
00862   gridPeOrder = new int[CkNumPes()];
00863   gridNodeOrder = new int[CkNumNodes()];
00864   transNodeOrder = new int[CkNumNodes()];
00865 
00866   if (CkMyRank() == 0) {
00867     pencilPMEProcessors = new char [CkNumPes()];
00868     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00869   }
00870 
00871   SimParameters *simParams = Node::Object()->simParameters;
00872   PatchMap *patchMap = PatchMap::Object();
00873 
00874   offload = simParams->PMEOffload;
00875 #ifdef NAMD_CUDA
00876   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00877     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00878   }
00879   if ( offload ) {
00880     int dev;
00881     cudaGetDevice(&dev);
00882     cuda_errcheck("in cudaGetDevice");
00883     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00884     cudaDeviceProp deviceProp;
00885     cudaGetDeviceProperties(&deviceProp, dev);
00886     cuda_errcheck("in cudaGetDeviceProperties");
00887     if ( deviceProp.major < 2 )
00888       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00889   }
00890 #endif
00891 
00892   alchLambda = -1.;  // illegal value to catch if not updated
00893 
00894   alchOn = simParams->alchOn;
00895   alchFepOn = simParams->alchFepOn;
00896   alchThermIntOn = simParams->alchThermIntOn;
00897   alchDecouple = alchOn && simParams->alchDecouple;
00898   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
00899   if (alchOn) {
00900     numGrids = 2;
00901     if (alchDecouple) numGrids += 2;
00902     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00903   }
00904   else numGrids = 1;
00905   lesOn = simParams->lesOn;
00906   useBarrier = simParams->PMEBarrier;
00907   if ( lesOn ) {
00908     lesFactor = simParams->lesFactor;
00909     numGrids = lesFactor;
00910   }
00911   selfOn = 0;
00912   pairOn = simParams->pairInteractionOn;
00913   if ( pairOn ) {
00914     selfOn = simParams->pairInteractionSelf;
00915     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00916     numGrids = selfOn ? 1 : 3;
00917   }
00918 
00919   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00920   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00921   else {
00922     int nrps = simParams->PMEProcessors;
00923     if ( nrps <= 0 ) nrps = CkNumPes();
00924     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00925     int dimx = simParams->PMEGridSizeX;
00926     int dimy = simParams->PMEGridSizeY;
00927     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00928     if ( maxslabs > nrps ) maxslabs = nrps;
00929     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00930                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00931     if ( maxpencils > nrps ) maxpencils = nrps;
00932     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00933     else usePencils = 0;
00934   }
00935 
00936   if ( usePencils ) {
00937     int nrps = simParams->PMEProcessors;
00938     if ( nrps <= 0 ) nrps = CkNumPes();
00939     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00940     if ( simParams->PMEPencils > 1 &&
00941          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00942       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00943     } else {
00944       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00945                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00946       if ( nb2 > nrps ) nb2 = nrps;
00947       if ( nb2 < 1 ) nb2 = 1;
00948       int nb = (int) sqrt((float)nb2);
00949       if ( nb < 1 ) nb = 1;
00950       xBlocks = zBlocks = nb;
00951       yBlocks = nb2 / nb;
00952     }
00953 
00954     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00955     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00956     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00957 
00958     int dimx = simParams->PMEGridSizeX;
00959     int bx = 1 + ( dimx - 1 ) / xBlocks;
00960     xBlocks = 1 + ( dimx - 1 ) / bx;
00961 
00962     int dimy = simParams->PMEGridSizeY;
00963     int by = 1 + ( dimy - 1 ) / yBlocks;
00964     yBlocks = 1 + ( dimy - 1 ) / by;
00965 
00966     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00967     int bz = 1 + ( dimz - 1 ) / zBlocks;
00968     zBlocks = 1 + ( dimz - 1 ) / bz;
00969 
00970     if ( xBlocks * yBlocks > CkNumPes() ) {
00971       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00972     }
00973     if ( xBlocks * zBlocks > CkNumPes() ) {
00974       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00975     }
00976     if ( yBlocks * zBlocks > CkNumPes() ) {
00977       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00978     }
00979 
00980     if ( ! CkMyPe() ) {
00981       iout << iINFO << "PME using " << xBlocks << " x " <<
00982         yBlocks << " x " << zBlocks <<
00983         " pencil grid for FFT and reciprocal sum.\n" << endi;
00984     }
00985   } else { // usePencils
00986 
00987   {  // decide how many pes to use for reciprocal sum
00988 
00989     // rules based on work available
00990     int minslices = simParams->PMEMinSlices;
00991     int dimx = simParams->PMEGridSizeX;
00992     int nrpx = ( dimx + minslices - 1 ) / minslices;
00993     int dimy = simParams->PMEGridSizeY;
00994     int nrpy = ( dimy + minslices - 1 ) / minslices;
00995 
00996     // rules based on processors available
00997     int nrpp = CkNumPes();
00998     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00999     if ( nrpp < nrpx ) nrpx = nrpp;
01000     if ( nrpp < nrpy ) nrpy = nrpp;
01001 
01002     // user override
01003     int nrps = simParams->PMEProcessors;
01004     if ( nrps > CkNumPes() ) nrps = CkNumPes();
01005     if ( nrps > 0 ) nrpx = nrps;
01006     if ( nrps > 0 ) nrpy = nrps;
01007 
01008     // make sure there aren't any totally empty processors
01009     int bx = ( dimx + nrpx - 1 ) / nrpx;
01010     nrpx = ( dimx + bx - 1 ) / bx;
01011     int by = ( dimy + nrpy - 1 ) / nrpy;
01012     nrpy = ( dimy + by - 1 ) / by;
01013     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01014       NAMD_bug("Error in selecting number of PME processors.");
01015     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01016       NAMD_bug("Error in selecting number of PME processors.");
01017 
01018     numGridPes = nrpx;
01019     numTransPes = nrpy;
01020   }
01021   if ( ! CkMyPe() ) {
01022     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01023       " processors for FFT and reciprocal sum.\n" << endi;
01024   }
01025 
01026   int sum_npes = numTransPes + numGridPes;
01027   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01028 
01029 #if 0 // USE_TOPOMAP
01030   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01031   PatchMap * pmap = PatchMap::Object();
01032   
01033   int patch_pes = pmap->numNodesWithPatches();
01034   TopoManager tmgr;
01035   if(tmgr.hasMultipleProcsPerNode())
01036     patch_pes *= 2;
01037 
01038   bool done = false;
01039   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01040     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01041     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01042   }
01043   else 
01044     if(CkNumPes() > 2 *max_npes + patch_pes) {
01045       done = generateBGLORBPmePeList(transPeMap, max_npes);
01046       gridPeMap = transPeMap;
01047     }
01048 
01049   if (!done)
01050 #endif
01051     {
01052       //generatePmePeList(transPeMap, max_npes);
01053       //gridPeMap = transPeMap;
01054       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01055     }
01056   
01057   if ( ! CkMyPe() ) {
01058     iout << iINFO << "PME GRID LOCATIONS:";
01059     int i;
01060     for ( i=0; i<numGridPes && i<10; ++i ) {
01061       iout << " " << gridPeMap[i];
01062     }
01063     if ( i < numGridPes ) iout << " ...";
01064     iout << "\n" << endi;
01065     iout << iINFO << "PME TRANS LOCATIONS:";
01066     for ( i=0; i<numTransPes && i<10; ++i ) {
01067       iout << " " << transPeMap[i];
01068     }
01069     if ( i < numTransPes ) iout << " ...";
01070     iout << "\n" << endi;
01071   }
01072 
01073   // sort based on nodes and physical nodes
01074   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01075 
01076   myGridPe = -1;
01077   myGridNode = -1;
01078   int i = 0;
01079   int node = -1;
01080   int real_node = -1;
01081   for ( i=0; i<numGridPes; ++i ) {
01082     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01083     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01084     int real_node_i = CkNodeOf(gridPeMap[i]);
01085     if ( real_node_i == real_node ) {
01086       gridNodeInfo[node].npe += 1;
01087     } else {
01088       real_node = real_node_i;
01089       ++node;
01090       gridNodeInfo[node].real_node = real_node;
01091       gridNodeInfo[node].pe_start = i;
01092       gridNodeInfo[node].npe = 1;
01093     }
01094     if ( CkMyNode() == real_node_i ) myGridNode = node;
01095   }
01096   numGridNodes = node + 1;
01097   myTransPe = -1;
01098   myTransNode = -1;
01099   node = -1;
01100   real_node = -1;
01101   for ( i=0; i<numTransPes; ++i ) {
01102     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01103     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01104     int real_node_i = CkNodeOf(transPeMap[i]);
01105     if ( real_node_i == real_node ) {
01106       transNodeInfo[node].npe += 1;
01107     } else {
01108       real_node = real_node_i;
01109       ++node;
01110       transNodeInfo[node].real_node = real_node;
01111       transNodeInfo[node].pe_start = i;
01112       transNodeInfo[node].npe = 1;
01113     }
01114     if ( CkMyNode() == real_node_i ) myTransNode = node;
01115   }
01116   numTransNodes = node + 1;
01117 
01118   if ( ! CkMyPe() ) {
01119     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01120          << numTransNodes << " TRANS NODES\n" << endi;
01121   }
01122 
01123   { // generate random orderings for grid and trans messages
01124     int i;
01125     for ( i = 0; i < numGridPes; ++i ) {
01126       gridPeOrder[i] = i;
01127     }
01128     Random rand(CkMyPe());
01129     if ( myGridPe < 0 ) {
01130       rand.reorder(gridPeOrder,numGridPes);
01131     } else {  // self last
01132       gridPeOrder[myGridPe] = numGridPes-1;
01133       gridPeOrder[numGridPes-1] = myGridPe;
01134       rand.reorder(gridPeOrder,numGridPes-1);
01135     } 
01136     for ( i = 0; i < numGridNodes; ++i ) {
01137       gridNodeOrder[i] = i;
01138     }
01139     if ( myGridNode < 0 ) {
01140       rand.reorder(gridNodeOrder,numGridNodes);
01141     } else {  // self last
01142       gridNodeOrder[myGridNode] = numGridNodes-1;
01143       gridNodeOrder[numGridNodes-1] = myGridNode;
01144       rand.reorder(gridNodeOrder,numGridNodes-1);
01145     }
01146     for ( i = 0; i < numTransNodes; ++i ) {
01147       transNodeOrder[i] = i;
01148     }
01149     if ( myTransNode < 0 ) {
01150       rand.reorder(transNodeOrder,numTransNodes);
01151     } else {  // self last
01152       transNodeOrder[myTransNode] = numTransNodes-1;
01153       transNodeOrder[numTransNodes-1] = myTransNode;
01154       rand.reorder(transNodeOrder,numTransNodes-1);
01155     }
01156   }
01157   
01158   } // ! usePencils
01159 
01160   myGrid.K1 = simParams->PMEGridSizeX;
01161   myGrid.K2 = simParams->PMEGridSizeY;
01162   myGrid.K3 = simParams->PMEGridSizeZ;
01163   myGrid.order = simParams->PMEInterpOrder;
01164   myGrid.dim2 = myGrid.K2;
01165   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01166 
01167   if ( ! usePencils ) {
01168     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01169     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01170     myGrid.block3 = myGrid.dim3 / 2;  // complex
01171   }
01172 
01173   if ( usePencils ) {
01174     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01175     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01176     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01177 
01178 
01179       int pe = 0;
01180       int x,y,z;
01181 
01182                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01183                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01184                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01185       
01186                 // decide which pes to use by bit reversal and patch use
01187                 int i;
01188                 int ncpus = CkNumPes();
01189                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01190                 PatchMap *pmap = PatchMap::Object();
01191                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01192                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01193                         if ( ri ) { // keep 0 for special case
01194                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
01195                                 else nopatches.add(ri);
01196                         }
01197                 }
01198 
01199 #if USE_RANDOM_TOPO
01200             Random rand(CkMyPe());
01201             int *tmp = new int[patches.size()];
01202             int nn = patches.size();
01203             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01204             rand.reorder(tmp, nn);
01205             patches.resize(0);
01206             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01207             delete [] tmp;
01208             tmp = new int[nopatches.size()];
01209             nn = nopatches.size();
01210             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01211             rand.reorder(tmp, nn);
01212             nopatches.resize(0);
01213             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01214             delete [] tmp;
01215 #endif
01216 
01217                 // only use zero if it eliminates overloading or has patches
01218                 int useZero = 0;
01219                 int npens = xBlocks*yBlocks;
01220                 if ( npens % ncpus == 0 ) useZero = 1;
01221                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01222                 npens += xBlocks*zBlocks;
01223                 if ( npens % ncpus == 0 ) useZero = 1;
01224                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01225                 npens += yBlocks*zBlocks;
01226                 if ( npens % ncpus == 0 ) useZero = 1;
01227                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01228 
01229                 // add nopatches then patches in reversed order
01230                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01231                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01232                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01233                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01234   
01235                 int npes = pmeprocs.size();
01236                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01237                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01238 #if !USE_RANDOM_TOPO
01239                 zprocs.sort();
01240 #endif
01241                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01242                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01243 #if !USE_RANDOM_TOPO
01244                 yprocs.sort();
01245 #endif
01246       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01247       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01248 #if !USE_RANDOM_TOPO
01249       xprocs.sort();
01250 #endif
01251 
01252 #if USE_TOPO_SFC
01253   CmiLock(tmgr_lock);
01254   //{
01255   TopoManager tmgr;
01256   int xdim = tmgr.getDimNX();
01257   int ydim = tmgr.getDimNY();
01258   int zdim = tmgr.getDimNZ();
01259   int xdim1 = find_level_grid(xdim);
01260   int ydim1 = find_level_grid(ydim);
01261   int zdim1 = find_level_grid(zdim);
01262   if(CkMyPe() == 0)
01263       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01264 
01265   vector<Coord> result;
01266   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01267   sort_sfc(xprocs, tmgr, result);
01268   sort_sfc(yprocs, tmgr, result);
01269   sort_sfc(zprocs, tmgr, result);
01270   //}
01271   CmiUnlock(tmgr_lock);
01272 #endif
01273 
01274 
01275                 if(CkMyPe() == 0){  
01276               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01277           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01278 #if USE_TOPO_SFC
01279               int x,y,z,t;
01280               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01281               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01282 #else
01283               iout << " " << zprocs[i];
01284 #endif
01285           }
01286           if ( i < zprocs.size() ) iout << " ...";
01287               iout << "\n" << endi;
01288                 }
01289 
01290     if (CkMyRank() == 0) {
01291       for (pe=0, x = 0; x < xBlocks; ++x)
01292         for (y = 0; y < yBlocks; ++y, ++pe ) {
01293           pencilPMEProcessors[zprocs[pe]] = 1;
01294         }
01295     }
01296      
01297                 if(CkMyPe() == 0){  
01298               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01299           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01300 #if USE_TOPO_SFC
01301               int x,y,z,t;
01302               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01303               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01304 #else
01305               iout << " " << yprocs[i];
01306 #endif
01307           }
01308           if ( i < yprocs.size() ) iout << " ...";
01309               iout << "\n" << endi;
01310                 }
01311 
01312     if (CkMyRank() == 0) {
01313       for (pe=0, z = 0; z < zBlocks; ++z )
01314         for (x = 0; x < xBlocks; ++x, ++pe ) {
01315           pencilPMEProcessors[yprocs[pe]] = 1;
01316         }
01317     }
01318     
01319                 if(CkMyPe() == 0){  
01320                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01321                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01322 #if USE_TOPO_SFC
01323                 int x,y,z,t;
01324                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01325                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01326 #else
01327                 iout << " " << xprocs[i];
01328 #endif
01329             }
01330                 if ( i < xprocs.size() ) iout << " ...";
01331                 iout << "\n" << endi;
01332                 }
01333 
01334     if (CkMyRank() == 0) {
01335       for (pe=0, y = 0; y < yBlocks; ++y )      
01336         for (z = 0; z < zBlocks; ++z, ++pe ) {
01337           pencilPMEProcessors[xprocs[pe]] = 1;
01338         }
01339     }
01340         
01341 
01342         // creating the pencil arrays
01343         if ( CkMyPe() == 0 ){
01344 #if !USE_RANDOM_TOPO
01345         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01346         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01347         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01348         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01349 #endif
01350 #if 1
01351         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01352         CProxy_PmePencilMap ym;
01353         if ( simParams->PMEPencilsYLayout )
01354           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01355         else
01356           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01357         CProxy_PmePencilMap xm;
01358         if ( simParams->PMEPencilsXLayout )
01359           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01360         else
01361           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01362         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01363         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01364         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01365         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01366         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01367         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01368         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01369         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01370         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01371         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01372 #else
01373         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01374         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01375         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01376 
01377                 for (pe=0, x = 0; x < xBlocks; ++x)
01378                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01379                                 zPencil(x,y,0).insert(zprocs[pe]);
01380                         }
01381         zPencil.doneInserting();
01382 
01383                 for (pe=0, x = 0; x < xBlocks; ++x)
01384                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01385                                 yPencil(x,0,z).insert(yprocs[pe]);
01386                         }
01387         yPencil.doneInserting();
01388 
01389 
01390                 for (pe=0, y = 0; y < yBlocks; ++y )    
01391                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01392                                 xPencil(0,y,z).insert(xprocs[pe]);
01393                         }
01394                 xPencil.doneInserting();     
01395 #endif
01396 
01397                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01398                 PmePencilInitMsgData msgdata;
01399                 msgdata.grid = myGrid;
01400                 msgdata.xBlocks = xBlocks;
01401                 msgdata.yBlocks = yBlocks;
01402                 msgdata.zBlocks = zBlocks;
01403                 msgdata.xPencil = xPencil;
01404                 msgdata.yPencil = yPencil;
01405                 msgdata.zPencil = zPencil;
01406                 msgdata.pmeProxy = pmeProxyDir;
01407         msgdata.pmeNodeProxy = pmeNodeProxy;
01408         msgdata.xm = xm;
01409         msgdata.ym = ym;
01410         msgdata.zm = zm;
01411                 xPencil.init(new PmePencilInitMsg(msgdata));
01412                 yPencil.init(new PmePencilInitMsg(msgdata));
01413                 zPencil.init(new PmePencilInitMsg(msgdata));
01414         }
01415 
01416     return;  // continue in initialize_pencils() at next startup stage
01417   }
01418 
01419 
01420   int pe;
01421   int nx = 0;
01422   for ( pe = 0; pe < numGridPes; ++pe ) {
01423     localInfo[pe].x_start = nx;
01424     nx += myGrid.block1;
01425     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01426     localInfo[pe].nx = nx - localInfo[pe].x_start;
01427   }
01428   int ny = 0;
01429   for ( pe = 0; pe < numTransPes; ++pe ) {
01430     localInfo[pe].y_start_after_transpose = ny;
01431     ny += myGrid.block2;
01432     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01433     localInfo[pe].ny_after_transpose =
01434                         ny - localInfo[pe].y_start_after_transpose;
01435   }
01436 
01437   {  // decide how many pes this node exchanges charges with
01438 
01439   PatchMap *patchMap = PatchMap::Object();
01440   Lattice lattice = simParams->lattice;
01441   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01442   BigReal cutoff = simParams->cutoff;
01443   BigReal patchdim = simParams->patchDimension;
01444   int numPatches = patchMap->numPatches();
01445   int numNodes = CkNumPes();
01446   int *source_flags = new int[numNodes];
01447   int node;
01448   for ( node=0; node<numNodes; ++node ) {
01449     source_flags[node] = 0;
01450     recipPeDest[node] = 0;
01451   }
01452 
01453   // // make sure that we don't get ahead of ourselves on this node
01454   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01455   //   source_flags[CkMyPe()] = 1;
01456   //   recipPeDest[myRecipPe] = 1;
01457   // }
01458 
01459   for ( int pid=0; pid < numPatches; ++pid ) {
01460     int pnode = patchMap->node(pid);
01461 #ifdef NAMD_CUDA
01462     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01463 #endif
01464     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01465     BigReal minx = patchMap->min_a(pid);
01466     BigReal maxx = patchMap->max_a(pid);
01467     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01468     // min1 (max1) is smallest (largest) grid line for this patch
01469     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01470     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01471     for ( int i=min1; i<=max1; ++i ) {
01472       int ix = i;
01473       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01474       while ( ix < 0 ) ix += myGrid.K1;
01475       // set source_flags[pnode] if this patch sends to our node
01476       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01477            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01478         source_flags[pnode] = 1;
01479       }
01480       // set dest_flags[] for node that our patch sends to
01481 #ifdef NAMD_CUDA
01482       if ( offload ) {
01483         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01484           recipPeDest[ix / myGrid.block1] = 1;
01485         }
01486       } else
01487 #endif
01488       if ( pnode == CkMyPe() ) {
01489         recipPeDest[ix / myGrid.block1] = 1;
01490       }
01491     }
01492   }
01493 
01494   int numSourcesSamePhysicalNode = 0;
01495   numSources = 0;
01496   numDestRecipPes = 0;
01497   for ( node=0; node<numNodes; ++node ) {
01498     if ( source_flags[node] ) ++numSources;
01499     if ( recipPeDest[node] ) ++numDestRecipPes;
01500     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01501   }
01502 
01503 #if 0
01504   if ( numSources ) {
01505     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01506             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01507     iout << iINFO << "PME " << CkMyPe() << " sources:";
01508     for ( node=0; node<numNodes; ++node ) {
01509       if ( source_flags[node] ) iout << " " << node;
01510     }
01511     iout << "\n" << endi;
01512   }
01513 #endif
01514 
01515   delete [] source_flags;
01516 
01517   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01518   //           CkMyPe(), numSources, numDestRecipPes);
01519 
01520   }  // decide how many pes this node exchanges charges with (end)
01521 
01522   ungrid_count = numDestRecipPes;
01523 
01524   sendTransBarrier_received = 0;
01525 
01526   if ( myGridPe < 0 && myTransPe < 0 ) return;
01527   // the following only for nodes doing reciprocal sum
01528 
01529   if ( myTransPe >= 0 ) {
01530     recipEvirPe = findRecipEvirPe();
01531     pmeProxy[recipEvirPe].addRecipEvirClient();
01532   }
01533 
01534   if ( myTransPe >= 0 ) {
01535       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01536       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01537       #ifdef OPENATOM_VERSION
01538       if ( simParams->openatomOn ) { 
01539         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01540         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01541       } else {
01542         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01543       }
01544       #else  // OPENATOM_VERSION
01545       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01546       #endif // OPENATOM_VERSION
01547   }
01548 
01549   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01550   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01551   if ( local_size < local_size_2 ) local_size = local_size_2;
01552   qgrid = new float[local_size*numGrids];
01553   if ( numGridPes > 1 || numTransPes > 1 ) {
01554     kgrid = new float[local_size*numGrids];
01555   } else {
01556     kgrid = qgrid;
01557   }
01558   qgrid_size = local_size;
01559 
01560   if ( myGridPe >= 0 ) {
01561   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01562   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01563   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01564   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01565   }
01566 
01567   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01568 #ifdef NAMD_FFTW
01569   CmiLock(fftw_plan_lock);
01570 #ifdef NAMD_FFTW_3
01571   work = new fftwf_complex[n[0]];
01572   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01573   if ( myGridPe >= 0 ) {
01574     forward_plan_yz=new fftwf_plan[numGrids];
01575     backward_plan_yz=new fftwf_plan[numGrids];
01576   }
01577   if ( myTransPe >= 0 ) {
01578     forward_plan_x=new fftwf_plan[numGrids];
01579     backward_plan_x=new fftwf_plan[numGrids];
01580   }
01581   /* need one plan per grid */
01582   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01583   if ( myGridPe >= 0 ) {
01584     for( int g=0; g<numGrids; g++)
01585       {
01586         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01587                                                      localInfo[myGridPe].nx,
01588                                                      qgrid + qgrid_size * g,
01589                                                      NULL,
01590                                                      1,
01591                                                      myGrid.dim2 * myGrid.dim3,
01592                                                      (fftwf_complex *) 
01593                                                      (qgrid + qgrid_size * g),
01594                                                      NULL,
01595                                                      1,
01596                                                      myGrid.dim2 * (myGrid.dim3/2),
01597                                                      fftwFlags);
01598       }
01599   }
01600   int zdim = myGrid.dim3;
01601   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01602   if ( ! CkMyPe() ) iout << " 2..." << endi;
01603   if ( myTransPe >= 0 ) {
01604     for( int g=0; g<numGrids; g++)
01605       {
01606 
01607         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01608                                                 (fftwf_complex *)
01609                                                 (kgrid+qgrid_size*g),
01610                                                 NULL,
01611                                                 xStride,
01612                                                 1,
01613                                                 (fftwf_complex *)
01614                                                 (kgrid+qgrid_size*g),
01615                                                 NULL,
01616                                                 xStride,
01617                                                 1,
01618                                                 FFTW_FORWARD,fftwFlags);
01619         
01620       }
01621   }
01622   if ( ! CkMyPe() ) iout << " 3..." << endi;
01623   if ( myTransPe >= 0 ) {
01624     for( int g=0; g<numGrids; g++)
01625       {
01626         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01627                                                  (fftwf_complex *)
01628                                                  (kgrid+qgrid_size*g),
01629                                                  NULL,
01630                                                  xStride,
01631                                                  1,
01632                                                  (fftwf_complex *)
01633                                                  (kgrid+qgrid_size*g),
01634                                                  NULL,
01635                                                  xStride,
01636                                                  1,
01637                                                  FFTW_BACKWARD, fftwFlags);
01638 
01639       }
01640   }
01641   if ( ! CkMyPe() ) iout << " 4..." << endi;
01642   if ( myGridPe >= 0 ) {
01643     for( int g=0; g<numGrids; g++)
01644       {
01645         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01646                                                       localInfo[myGridPe].nx,
01647                                                       (fftwf_complex *)
01648                                                       (qgrid + qgrid_size * g),
01649                                                       NULL,
01650                                                       1,
01651                                                       myGrid.dim2*(myGrid.dim3/2),
01652                                                       qgrid + qgrid_size * g,
01653                                                       NULL,
01654                                                       1,
01655                                                       myGrid.dim2 * myGrid.dim3,
01656                                                       fftwFlags);
01657       }
01658   }
01659   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01660 
01661 #else
01662   work = new fftw_complex[n[0]];
01663 
01664   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01665   if ( myGridPe >= 0 ) {
01666   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01667         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01668         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01669   }
01670   if ( ! CkMyPe() ) iout << " 2..." << endi;
01671   if ( myTransPe >= 0 ) {
01672       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01673         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01674         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01675         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01676   }
01677   if ( ! CkMyPe() ) iout << " 3..." << endi;
01678   if ( myTransPe >= 0 ) {
01679   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01680         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01681         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01682         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01683   }
01684   if ( ! CkMyPe() ) iout << " 4..." << endi;
01685   if ( myGridPe >= 0 ) {
01686   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01687         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01688         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01689   }
01690   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01691 #endif
01692   CmiUnlock(fftw_plan_lock);
01693 #else
01694   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01695 #endif
01696 
01697   if ( myGridPe >= 0 && numSources == 0 )
01698                 NAMD_bug("PME grid elements exist without sources.");
01699   grid_count = numSources;
01700   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01701   trans_count = numGridPes;
01702 }
01703 
01704 
01705 
// Determine which z-pencils this PE (or this node, when offloading to CUDA)
// will send charge-grid data to, based on the PME-grid footprint of the
// patches it owns.  Runs at quiescence (CkQdMsg) so patch placement is final.
void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  delete msg;
  if ( ! usePencils ) return;

  SimParameters *simParams = Node::Object()->simParameters;

  PatchMap *patchMap = PatchMap::Object();
  Lattice lattice = simParams->lattice;
  // cell extents along a and b, used to scale the patch margin to fractions
  BigReal sysdima = lattice.a_r().unit() * lattice.a();
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  int numPatches = patchMap->numPatches();

  // one flag per (x,y) pencil block; set if any local patch touches it
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }

  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#ifdef NAMD_CUDA
    if ( offload ) {
      // offload mode aggregates sends per node rather than per PE
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;

    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;

    BigReal minx = patchMap->min_a(pid);
    BigReal maxx = patchMap->max_a(pid);
    // margin allows atoms to drift beyond the patch before migration
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // min1 (max1) is smallest (largest) grid line for this patch
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;

    BigReal miny = patchMap->min_b(pid);
    BigReal maxy = patchMap->max_b(pid);
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // min2 (max2) is smallest (largest) grid line for this patch
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    // mark every pencil covered by the (periodically wrapped) grid range
    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }

  // count active pencils and register with each one (dummyRecvGrid);
  // when offloading only the node's master PE registers
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#ifdef NAMD_CUDA
        if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  // build the compact list of active pencil coordinates
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  // choose a send order: deterministic bit-reversed, or randomized per PE
  if ( simParams->PMESendOrder ) {
    std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
  } else {
    Random rand(CkMyPe());
    rand.reorder(activePencils,numPencilsActive);
  }
  //if ( numPencilsActive ) {
  //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
  //}

  // expect one ungrid reply per active pencil
  ungrid_count = numPencilsActive;
}
01799 
01800 
01801 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01802   if ( ! usePencils ) return;
01803   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01804 }
01805 
01806 
// Release FFT work areas, grid buffers, and mapping arrays.
// fftw_plan_lock is shared across the process, so only rank 0 destroys it;
// pmemgr_lock belongs to this object.
ComputePmeMgr::~ComputePmeMgr() {

  if ( CmiMyRank() == 0 ) {
    CmiDestroyLock(fftw_plan_lock);
  }
  CmiDestroyLock(pmemgr_lock);

  delete myKSpace;
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  delete [] qgrid;
  // kgrid may alias qgrid (in-place use); guard against double free
  if ( kgrid != qgrid ) delete [] kgrid;
  delete [] work;
  delete [] gridmsg_reuse;

 if ( ! offload ) {
  // q_list entries and fz_arr are only freed on the non-offload path;
  // presumably the CUDA path owns equivalents elsewhere — verify before changing
  for (int i=0; i<q_count; ++i) {
    delete [] q_list[i];
  }
  delete [] q_list;
  delete [] fz_arr;
 }
  delete [] f_arr;
  delete [] q_arr;
}
01839 
// Accumulate one source's charge contribution into qgrid.  The payload is
// sparse: fgrid flags which (x,y) columns are present for each grid, and
// zlist gives the z offsets stored for every flagged column.  The message
// is retained in gridmsg_reuse so sendUngridSubset can reuse it for the
// return trip.
void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  }
  // first message of the step latches the lattice and sequence number
  if ( grid_count == numSources ) {
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

  int zdim = myGrid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
      if ( f[i] ) {
        // add this column's stored z entries into the local grid
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
        }
      }
      q += zdim;
    }
  }

  gridmsg_reuse[numSources-grid_count] = msg;
  --grid_count;

  // all sources received: start the y-z FFT; with the barrier option,
  // transpose sends are coordinated through PE 0
  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
  }
}
01875 #ifdef MANUAL_DEBUG_FFTW3
01876 
01877 /* utility functions for manual debugging */
/* Debug-only helper (MANUAL_DEBUG_FFTW3): write an xdim*ydim*zdim float
 * array (row-major, z fastest) to "<infilename>_<pe>.out" as a dimensions
 * header followed by "i j k value" lines.
 *
 * Fixes: the original built the format with strncat(fmt,...,999), whose
 * bound is the number of characters to APPEND (not total size) and so could
 * overflow the 1000-byte buffer, then used unbounded sprintf; it also wrote
 * through a NULL FILE* if fopen failed.  snprintf bounds the filename in one
 * step, and a failed open now returns quietly (debug dump, best effort). */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
{

  char filename[1000];
  // build "<infilename>_<pe>.out" with an explicit bound
  snprintf(filename, sizeof(filename), "%s_%d.out", infilename, pe);
  FILE *loutfile = fopen(filename, "w");
  if ( loutfile == NULL ) return;  // don't crash a debug dump on open failure
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
  fclose(loutfile);

}
01898 
01899 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01900 {
01901   char fmt[1000];
01902   char filename[1000];
01903   strncpy(fmt,infilename,999);
01904   strncat(fmt,"_%d_%d_%d.out",999);
01905   sprintf(filename,fmt, x,y,z);
01906   FILE *loutfile = fopen(filename, "w");
01907   CkAssert(loutfile!=NULL);
01908   CkPrintf("opened %s for dump\n",filename);
01909   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01910   for(int i=0;i<xdim;i++)
01911     for(int j=0;j<ydim;j++)
01912       for(int k=0;k<zdim;k++)
01913         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01914   fclose(loutfile);
01915 }
01916 
01917 #endif
01918 
// Forward real-to-complex FFT over the y and z dimensions of this PE's
// grid slab for each grid; the x dimension is transformed after the
// transpose (gridCalc2).  Without the barrier option, the transpose send
// starts immediately (with it, PE 0 coordinates via sendTransBarrier).
void ComputePmeMgr::gridCalc1(void) {
  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());

#ifdef NAMD_FFTW
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_yz[g]);
#else
    rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
        qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
#endif

  }
#endif

  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
}
01936 
01937 void ComputePmeMgr::sendTransBarrier(void) {
01938   sendTransBarrier_received += 1;
01939   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01940   if ( sendTransBarrier_received < numGridPes ) return;
01941   sendTransBarrier_received = 0;
01942   for ( int i=0; i<numGridPes; ++i ) {
01943     pmeProxyDir[gridPeMap[i]].sendTrans();
01944   }
01945 }
01946 
01947 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
01948   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
01949   mgr->sendTransSubset(first, last);
01950 }
01951 
// Kick off the grid-to-transpose exchange for this step.  untrans_count is
// reset up front so later procUntrans decrements start from a full count.
// The per-node work may be split across PEs via CkLoop when enabled.
void ComputePmeMgr::sendTrans(void) {

  untrans_count = numTransPes;

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
    CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
  } else
#endif
  {
    sendTransSubset(0, numTransNodes-1);
  }

}
01967 
// Pack this grid PE's slab of y-z-transformed data and deliver it to trans
// nodes [first,last] (indices into transNodeOrder).  Data destined for PEs
// on this node is copied straight into each peer manager's kgrid instead of
// into the message body, and such messages are sent with nx forced to 0 so
// the receiver skips its copy (see procTrans).
void ComputePmeMgr::sendTransSubset(int first, int last) {
  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());

  // send data for transpose
  int zdim = myGrid.dim3;
  int nx = localInfo[myGridPe].nx;
  int x_start = localInfo[myGridPe].x_start;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  for (int j=first; j<=last; j++) {
    int node = transNodeOrder[j];  // different order on each node
    int pe = transNodeInfo[node].pe_start;
    int npe = transNodeInfo[node].npe;
    int totlen = 0;
    // payload space is only needed for remote nodes
    if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.ny_after_transpose * zdim;
      totlen += cpylen;
    }
    PmeTransMsg *newmsg = new (nx * totlen * numGrids,
                                PRIORITY_SIZE) PmeTransMsg;
    newmsg->sourceNode = myGridPe;
    newmsg->lattice = lattice;
    newmsg->x_start = x_start;
    newmsg->nx = nx;
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + nx * totlen * g;
      pe = transNodeInfo[node].pe_start;
      for (int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        int cpylen = li.ny_after_transpose * zdim;
        if ( node == myTransNode ) {
          // local destination: write directly into the peer's kgrid
          ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
          qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
        }
        float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
        // gather nx strided rows into the contiguous destination
        for ( int x = 0; x < nx; ++x ) {
          CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
          q += slicelen;
          qmsg += cpylen;
        }
      }
    }
    newmsg->sequence = grid_sequence;
    SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
    // nx == 0 marks "data already copied in place" for local receivers
    if ( node == myTransNode ) newmsg->nx = 0;
    if ( npe > 1 ) {
      if ( node == myTransNode ) fwdSharedTrans(newmsg);
      else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
    } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
  }
}
02026 
// Distribute one PmeTransMsg to every PE on this node.  Each peer receives
// a small PmeSharedTransMsg pointing at the same underlying message, plus a
// shared count and node lock; the last consumer frees everything
// (see recvSharedTrans).
void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
  int pe = transNodeInfo[myTransNode].pe_start;
  int npe = transNodeInfo[myTransNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for (int i=0; i<npe; ++i, ++pe) {
    PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
    SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
    shmsg->msg = msg;
    shmsg->count = count;
    shmsg->lock = lock;
    pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
  }
}
02042 
// Consume a node-shared trans message.  The shared count is decremented
// under the shared lock; the last PE to finish tears down the lock, the
// count, and the underlying PmeTransMsg.
void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
  procTrans(msg->msg);
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
02055 
// Point-to-point delivery of a transposed slab: process it, then free it
// (unlike the node-shared path, this PE is the sole owner).
void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
  procTrans(msg);
  delete msg;
}
02060 
// Fold one transposed slab into kgrid.  The first message of a step
// (trans_count still at numGridPes) latches the lattice and sequence.
// Messages with nx == 0 come from PEs on this node whose data was already
// copied directly into kgrid (see sendTransSubset).  When all grid PEs
// have reported, start the x-dimension FFT (gridCalc2).
void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
  if ( trans_count == numGridPes ) {
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

 if ( msg->nx ) {
  int zdim = myGrid.dim3;
  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
  int first_pe = nodeInfo.pe_start;
  int last_pe = first_pe+nodeInfo.npe-1;
  // offset of this PE's y-rows within the node-wide message payload
  int y_skip = localInfo[myTransPe].y_start_after_transpose
             - localInfo[first_pe].y_start_after_transpose;
  // total y-rows the message carries for all PEs of this node
  int ny_msg = localInfo[last_pe].y_start_after_transpose
             + localInfo[last_pe].ny_after_transpose
             - localInfo[first_pe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;
  int x_start = msg->x_start;
  int nx = msg->nx;
  for ( int g=0; g<numGrids; ++g ) {
    CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
        (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
        nx*ny*zdim*sizeof(float));
  }
 }

  --trans_count;

  if ( trans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc2();
  }
}
02094 
// Finish the forward FFT along x on the transposed data for each grid,
// then hand off to the reciprocal-space stage (gridCalc2R), or route
// through OpenAtom (gridCalc2Moa) in coupled builds.
void ComputePmeMgr::gridCalc2(void) {
  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  for ( int g=0; g<numGrids; ++g ) {
    // finish forward FFT (x dimension)
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_x[g]);
#else
    fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#endif
#endif
  }

#ifdef OPENATOM_VERSION
    // NOTE(review): simParams is not declared in this function's scope, so
    // this OPENATOM_VERSION branch looks like it would not compile — verify.
    if ( ! simParams -> openatomOn ) { 
#endif // OPENATOM_VERSION
      gridCalc2R();
#ifdef OPENATOM_VERSION
    } else {
      gridCalc2Moa();
    }
#endif // OPENATOM_VERSION
}
02128 
02129 #ifdef OPENATOM_VERSION
// OpenAtom coupling path: instead of computing reciprocal-space energy
// locally, send each grid's x-transformed data to ComputeMoaMgr and resume
// at gridCalc2R via the callback when OpenAtom replies.
void ComputePmeMgr::gridCalc2Moa(void) {

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  SimParameters *simParams = Node::Object()->simParameters;

  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);

  for ( int g=0; g<numGrids; ++g ) {
    #ifdef OPENATOM_VERSION_DEBUG 
    CkPrintf("Sending recQ on processor %d \n", CkMyPe());
    for ( int i=0; i<=(ny * zdim / 2); ++i) 
    {
      CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
    }
    #endif // OPENATOM_VERSION_DEBUG
//     mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
    // resumePme re-enters gridCalc2R on this manager when OpenAtom is done
    CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
    moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
  }
}
02153 #endif // OPENATOM_VERSION
02154 
// Reciprocal-space stage: for each grid, compute the PME energy and virial
// in k-space (optionally CkLoop-parallelized) and start the backward FFT
// along x, then proceed to the untranspose exchange.
void ComputePmeMgr::gridCalc2R(void) {

  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * numTransPes ) {
    useCkLoop = 1;
  }
#endif

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  for ( int g=0; g<numGrids; ++g ) {
    // reciprocal space portion of PME: energy into [0], virial into [1..]
    BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
    recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
                        lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
    // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);

    // start backward FFT (x dimension)

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_x[g]);
#else
    fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#endif
#endif
  }
  
  pmeProxyDir[CkMyPe()].sendUntrans();
}
02190 
02191 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
02192   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02193   mgr->sendUntransSubset(first, last);
02194 }
02195 
// Start the reverse (untranspose) exchange.  The per-grid energy/virial
// results are first sent urgently to recipEvirPe, then the slab data is
// dispatched (optionally CkLoop-split).  trans_count is reset here for the
// next step's transpose phase.
void ComputePmeMgr::sendUntrans(void) {

  trans_count = numGridPes;

  { // send energy and virial
    PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
    for ( int g=0; g<numGrids; ++g ) {
      newmsg->evir[g] = recip_evir2[g];
    }
    SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
    CmiEnableUrgentSend(0);
  }

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
    CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
  } else
#endif
  {
    sendUntransSubset(0, numGridNodes-1);
  }

}
02222 
// Pack this trans PE's rows of k-space (post-backward-x-FFT) data and send
// them back to grid nodes [first,last] (indices into gridNodeOrder).  For
// PEs on this node the data is scattered directly into each peer manager's
// qgrid (strided writes), and such messages carry ny forced to 0 so the
// receiver skips its copy (see procUntrans).
void ComputePmeMgr::sendUntransSubset(int first, int last) {

  int zdim = myGrid.dim3;
  int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  // send data for reverse transpose
  for (int j=first; j<=last; j++) {
    int node = gridNodeOrder[j];  // different order on each node
    int pe = gridNodeInfo[node].pe_start;
    int npe = gridNodeInfo[node].npe;
    int totlen = 0;
    // payload space is only needed for remote nodes
    if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.nx * zdim;
      totlen += cpylen;
    }
    PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
    newmsg->sourceNode = myTransPe;
    newmsg->y_start = y_start;
    newmsg->ny = ny;
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + ny * totlen * g;
      pe = gridNodeInfo[node].pe_start;
      for (int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        if ( node == myGridNode ) {
          // local destination: scatter rows straight into the peer's qgrid
          ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
          qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
          float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
          int cpylen = ny * zdim;
          for ( int x = 0; x < li.nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
            q += cpylen;
            qmsg += slicelen;
          }
        } else {
          // remote destination: contiguous copy into the message payload
          CmiMemcpy((void*)qmsg,
                (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
                li.nx*ny*zdim*sizeof(float));
          qmsg += li.nx*ny*zdim;
        }
      }
    }
    SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
    // ny == 0 marks "data already copied in place" for local receivers
    if ( node == myGridNode ) newmsg->ny = 0;
    if ( npe > 1 ) {
      if ( node == myGridNode ) fwdSharedUntrans(newmsg);
      else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
    } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
  }
}
02282 
// Node-local fan-out of one PmeUntransMsg, mirroring fwdSharedTrans: each
// PE on this node gets a wrapper with a shared count/lock, and the last
// consumer frees everything (see recvSharedUntrans).
// NOTE(review): unlike fwdSharedTrans, these wrappers are allocated without
// PRIORITY_SIZE / SET_PRIORITY — confirm whether that is intentional.
void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
  int pe = gridNodeInfo[myGridNode].pe_start;
  int npe = gridNodeInfo[myGridNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for (int i=0; i<npe; ++i, ++pe) {
    PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
    shmsg->msg = msg;
    shmsg->count = count;
    shmsg->lock = lock;
    pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
  }
}
02296 
// Consume a node-shared untrans message.  The shared count is decremented
// under the shared lock; the last PE to finish tears down the lock, the
// count, and the underlying PmeUntransMsg.
void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
  procUntrans(msg->msg);
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
02309 
// Point-to-point delivery of an untransposed slab: process it, then free
// it (this PE is the sole owner, unlike the node-shared path).
void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  procUntrans(msg);
  delete msg;
}
02314 
// Fold one untransposed slab back into qgrid.  Messages with ny == 0 come
// from PEs on this node whose data was already scattered in place (see
// sendUntransSubset).  When all trans PEs have reported, finish the
// backward FFT over y and z (gridCalc3).
void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
  int first_pe = nodeInfo.pe_start;
  int g;

 if ( msg->ny ) {
  int zdim = myGrid.dim3;
  int last_pe = first_pe+nodeInfo.npe-1;
  // offset of this PE's x-rows within the node-wide message payload
  int x_skip = localInfo[myGridPe].x_start
             - localInfo[first_pe].x_start;
  // total x-rows the message carries for all PEs of this node
  int nx_msg = localInfo[last_pe].x_start
             + localInfo[last_pe].nx
             - localInfo[first_pe].x_start;
  int nx = localInfo[myGridPe].nx;
  int y_start = msg->y_start;
  int ny = msg->ny;
  int slicelen = myGrid.K2 * zdim;
  int cpylen = ny * zdim;
  for ( g=0; g<numGrids; ++g ) {
    float *q = qgrid + qgrid_size * g + y_start * zdim;
    float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
    // scatter contiguous message rows into strided grid rows
    for ( int x = 0; x < nx; ++x ) {
      CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
      q += slicelen;
      qmsg += cpylen;
    }
  }
 }

  --untrans_count;

  if ( untrans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc3();
  }
}
02356 
// Complete the backward complex-to-real FFT over y and z for each grid,
// producing real-space potentials in qgrid, then send them back to the
// charge sources (sendUngrid).
void ComputePmeMgr::gridCalc3(void) {
  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());

  // finish backward FFT
#ifdef NAMD_FFTW

  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_yz[g]);
#else
    rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
        (fftw_complex *) (qgrid + qgrid_size * g),
        1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
#endif
  }

#endif

  pmeProxyDir[CkMyPe()].sendUngrid();
}
02377 
02378 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
02379   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02380   mgr->sendUngridSubset(first, last);
02381 }
02382 
// Send potentials back to every charge source (optionally CkLoop-split; the
// sync flag makes all subsets complete before qgrid is cleared below), then
// reset grid_count and zero qgrid for the next step's accumulation.
void ComputePmeMgr::sendUngrid(void) {

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
    CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
  } else
#endif
  {
    sendUngridSubset(0, numSources-1);
  }

  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
}
02398 
// For sources [first,last]: reuse the stored incoming PmeGridMsg from
// recvGrid, refill its sparse payload with the computed potentials from
// qgrid (same fgrid/zlist layout the source sent), and return it urgently —
// via the node-level proxy when offloading to CUDA.
void ComputePmeMgr::sendUngridSubset(int first, int last) {

#ifdef NAMD_CUDA
  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
#else
  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
#endif

  for ( int j=first; j<=last; ++j ) {
    // int msglen = qgrid_len;
    PmeGridMsg *newmsg = gridmsg_reuse[j];
    int pe = newmsg->sourceNode;
    int zdim = myGrid.dim3;
    int flen = newmsg->len;
    int fstart = newmsg->start;
    int zlistlen = newmsg->zlistlen;
    int *zlist = newmsg->zlist;
    float *qmsg = newmsg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = newmsg->fgrid + fgrid_len * g;
      float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
      for ( int i=0; i<flen; ++i ) {
        if ( f[i] ) {
          // sample the same z entries the source originally sent
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[zlist[k]];
          }
        }
        q += zdim;
      }
    }
    newmsg->sourceNode = myGridPe;

    SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
#ifdef NAMD_CUDA
    if ( offload ) {
      pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
    } else
#endif
    pmeProxyDir[pe].recvUngrid(newmsg);
    CmiEnableUrgentSend(0);
  }
}
02442 
// Receive the potential grid back on a charge-source PE: copy it into
// local storage (pencil or slab layout), free the message, and count it
// via recvAck.  The ungrid_count sanity check is skipped when offloading —
// as the original comment notes, it would need a lock there (recvAck takes
// cuda_lock for the real decrement).
void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
#ifdef NAMD_CUDA
  if ( ! offload )  // would need lock
#endif
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  }

  if ( usePencils ) copyPencils(msg);
  else copyResults(msg);
  delete msg;
  recvAck(0);
}
02457 
// Count down outstanding ungrid deliveries; when all have arrived, start
// force interpolation (ungridCalc).  msg is NULL when called locally from
// recvUngrid.  On the offload path ungrid_count is guarded by cuda_lock
// and the final decrement hands control to the master PE's manager.
void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
  if ( msg ) delete msg;
#ifdef NAMD_CUDA
  if ( offload ) {
    CmiLock(cuda_lock);
    if ( ungrid_count == 0 ) {
      NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
    }
    int uc = --ungrid_count;
    CmiUnlock(cuda_lock);

    if ( uc == 0 ) {
      pmeProxyDir[master_pe].ungridCalc();
    }
    return;
  }
#endif
  --ungrid_count;

  if ( ungrid_count == 0 ) {
    pmeProxyDir[CkMyPe()].ungridCalc();
  }
}
02481 
02482 #ifdef NAMD_CUDA
02483 #define count_limit 1000000
02484 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
02485 #define EVENT_STRIDE 10
02486 
02487 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
02488 
// Polling callback (re-armed via CUDA_POLL/CcdCallFnAfter) that checks the
// CUDA events recorded after each EVENT_STRIDE-sized group of force
// kernels.  For each completed event it enqueues the corresponding
// pmeComputes entries for force delivery; when the current event is not
// ready it re-arms itself, and after count_limit consecutive polls it
// aborts the run.
void cuda_check_pme_forces(void *arg, double walltime) {
  ComputePmeMgr *argp = (ComputePmeMgr *) arg;

 while ( 1 ) { // process multiple events per call
  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
  if ( err == cudaSuccess ) {
    argp->check_forces_count = 0;
    for ( int i=0; i<EVENT_STRIDE; ++i ) {
      WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
      if ( ++(argp->forces_done_count) == argp->forces_count ) break;
    }
    if ( argp->forces_done_count == argp->forces_count ) { // last event
      // forces_time still holds the start wall time at this point; it is
      // converted to an elapsed time only after the trace call
      traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
      argp->forces_time = walltime - argp->forces_time;
      //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
      return;
    } else { // more events
      continue; // check next event
    }
  } else if ( err != cudaErrorNotReady ) {
    cuda_errcheck("in cuda_check_pme_forces");
    NAMD_bug("cuda_errcheck missed error in cuda_check_pme_forces");
  } else if ( ++(argp->check_forces_count) >= count_limit ) {
    char errmsg[256];
    sprintf(errmsg,"cuda_check_pme_forces polled %d times over %f s on seq %d",
            argp->check_forces_count, walltime - argp->forces_time,
            argp->saved_sequence);
    cuda_errcheck(errmsg);
    NAMD_die(errmsg);
  } else {
    break; // call again
  }
 } // while ( 1 )
 CcdCallBacksReset(0,walltime);  // fix Charm++
 CUDA_POLL(cuda_check_pme_forces, arg);
}
02525 #endif // NAMD_CUDA
02526 
// Launch per-atom force interpolation ("ungridding") for every ComputePme
// registered with this manager.  Host path: simply enqueue each compute
// for WorkDistrib.  CUDA offload path: the master manager uploads the
// potential grid once and fans out to the node's other managers; each
// manager then batches atom/force pointers and launches cuda_pme_forces
// kernels in EVENT_STRIDE-sized groups, recording an event per group so
// cuda_check_pme_forces can stream completed forces back.
void ComputePmeMgr::ungridCalc(void) {
  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());

  ungridForcesCount = pmeComputes.size();

#ifdef NAMD_CUDA
 if ( offload ) {
  //CmiLock(cuda_lock);
  cudaSetDevice(deviceCUDA->getDeviceID());

  if ( this == masterPmeMgr ) {
    // upload the potential grid; all ranks' kernels wait on this event
    double before = CmiWallTimer();
    cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
    cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
    traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());

    // trigger ungridCalc on the node's other managers that have work
    const int myrank = CkMyRank();
    for ( int i=0; i<CkMyNodeSize(); ++i ) {
      if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
        nodePmeMgr->mgrObjects[i]->ungridCalc();
      }
    }
    if ( ! pmeComputes.size() ) return;
  }

  // lazily create one completion event per EVENT_STRIDE computes
  if ( ! end_forces ) {
    int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
    end_forces = new cudaEvent_t[n];
    for ( int i=0; i<n; ++i ) {
      cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
    }
  }

  const int pcsz = pmeComputes.size();
  if ( ! afn_host ) {
    // pinned host + device arrays of per-compute pointer triples
    cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
    cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
    cuda_errcheck("malloc params for pme");
  }
  int totn = 0;
  for ( int i=0; i<pcsz; ++i ) {
    int n = pmeComputes[i]->numGridAtoms[0];
    totn += n;
  }
  // grow the shared force staging buffers if the atom total increased
  if ( totn > f_data_mgr_alloc ) {
    if ( f_data_mgr_alloc ) {
      CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
      cudaFree(f_data_mgr_dev);
      cudaFreeHost(f_data_mgr_host);
    }
    f_data_mgr_alloc = 1.2 * (totn + 100);
    cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
    cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
    cuda_errcheck("malloc forces for pme");
  }
  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
  // carve per-compute force slices out of the staging buffers and record
  // the (atoms, forces, count) pointer triple for the kernel
  float *f_dev = f_data_mgr_dev;
  float *f_host = f_data_mgr_host;
  for ( int i=0; i<pcsz; ++i ) {
    int n = pmeComputes[i]->numGridAtoms[0];
    pmeComputes[i]->f_data_dev = f_dev;
    pmeComputes[i]->f_data_host = f_host;
    afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
    afn_host[3*i+1] = f_dev;
    afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
    f_dev += 3*n;
    f_host += 3*n;
  }
  //CmiLock(cuda_lock);
  double before = CmiWallTimer();
  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  // do not launch kernels until the master's potential upload has finished
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
  traceUserEvent(CUDA_EVENT_ID_PME_TICK);

  for ( int i=0; i<pcsz; ++i ) {
    // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
    // one kernel launch + copy-back + event per EVENT_STRIDE computes
    if ( i%EVENT_STRIDE == 0 ) {
      int dimy = pcsz - i;
      if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
      int maxn = 0;
      int subtotn = 0;
      for ( int j=0; j<dimy; ++j ) {
        int n = pmeComputes[i+j]->numGridAtoms[0];
        subtotn += n;
        if ( n > maxn ) maxn = n;
      }
      // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
      before = CmiWallTimer();
      cuda_pme_forces(
        bspline_coeffs_dev,
        v_arr_dev, afn_dev+3*i, dimy, maxn, /*
        pmeComputes[i]->a_data_dev,
        pmeComputes[i]->f_data_dev,
        n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
        streams[stream]);
      traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
      before = CmiWallTimer();
      cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
        cudaMemcpyDeviceToHost, streams[stream]);
      traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
      cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
      traceUserEvent(CUDA_EVENT_ID_PME_TICK);
    }
    // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
  }
  //CmiUnlock(cuda_lock);
 } else
#endif // NAMD_CUDA
 {
  for ( int i=0; i<pmeComputes.size(); ++i ) {
    WorkDistrib::messageEnqueueWork(pmeComputes[i]);
    // pmeComputes[i]->ungridForces();
  }
 }
  // submitReductions();  // must follow all ungridForces()

#ifdef NAMD_CUDA
 if ( offload ) {
  // start polling for kernel completion (see cuda_check_pme_forces)
  forces_time = CmiWallTimer();
  forces_count = ungridForcesCount;
  forces_done_count = 0;
  pmeProxy[this_pe].pollForcesReady();
 }
#endif

  // expected ungrid deliveries for the next step
  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
}
02655 
// Re-arm the Converse poll loop that watches for completion of the
// asynchronous PME force kernels/copies (see cuda_check_pme_forces).
// Only meaningful when built with CUDA offload support.
02656 void ComputePmeMgr::pollForcesReady() {
02657 #ifdef NAMD_CUDA
      // Reset Charm++'s conditional-callback bookkeeping before re-posting
      // the poll, otherwise the callback may never fire again.
02658   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02659   CUDA_POLL(cuda_check_pme_forces,this);
02660 #else
      // Non-CUDA builds must never reach this entry point.
02661   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02662 #endif
02663 }
02664 
02665 void ComputePme::atomUpdate() { atomsChanged = 1; }
02666 
02667 ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
02668 {
02669   DebugM(4,"ComputePme created.\n");
02670   basePriority = PME_PRIORITY;
02671   setNumPatches(1);
02672 
02673   CProxy_ComputePmeMgr::ckLocalBranch(
02674         CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
02675 
02676   SimParameters *simParams = Node::Object()->simParameters;
02677 
02678   qmForcesOn =  simParams->qmForcesOn;
02679   offload = simParams->PMEOffload;
02680 
02681   alchOn = simParams->alchOn;
02682   alchFepOn = simParams->alchFepOn;
02683   alchThermIntOn = simParams->alchThermIntOn;
02684   alchDecouple = alchOn && simParams->alchDecouple;
02685   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
02686             
02687   if (alchOn) {
02688     numGrids = 2;
02689     if (alchDecouple) numGrids += 2;
02690     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
02691   }
02692   else numGrids = 1;
02693   lesOn = simParams->lesOn;
02694   if ( lesOn ) {
02695     lesFactor = simParams->lesFactor;
02696     numGrids = lesFactor;
02697   }
02698   selfOn = 0;
02699   pairOn = simParams->pairInteractionOn;
02700   if ( pairOn ) {
02701     selfOn = simParams->pairInteractionSelf;
02702     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
02703     numGrids = selfOn ? 1 : 3;
02704   }
02705 
02706   myGrid.K1 = simParams->PMEGridSizeX;
02707   myGrid.K2 = simParams->PMEGridSizeY;
02708   myGrid.K3 = simParams->PMEGridSizeZ;
02709   myGrid.order = simParams->PMEInterpOrder;
02710   myGrid.dim2 = myGrid.K2;
02711   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
02712 
02713 #ifdef NAMD_CUDA
02714   cuda_atoms_offset = 0;
02715   f_data_host = 0;
02716   f_data_dev = 0;
02717  if ( ! offload )
02718 #endif
02719  {
02720   for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
02721  }
02722 
02723   atomsChanged = 0;
02724   
02725   qmLoclIndx = 0;
02726   qmLocalCharges = 0;
02727 }
02728 
02729 void ComputePme::initialize() {
02730   if (!(patch = PatchMap::Object()->patch(patchID))) {
02731     NAMD_bug("ComputePme used with unknown patch.");
02732   }
02733   positionBox = patch->registerPositionPickup(this);
02734   avgPositionBox = patch->registerAvgPositionPickup(this);
02735   forceBox = patch->registerForceDeposit(this);
02736 #ifdef NAMD_CUDA
02737  if ( offload ) {
02738   myMgr->cuda_atoms_count += patch->getNumAtoms();
02739  }
02740 #endif
02741 }
02742 
// Per-PE (and, for CUDA offload, per-node) setup of the PME grid buffers:
// chooses a node master PE, allocates the charge (q) and flag (f) arrays,
// and for offload builds the merged host/device grid storage that is then
// shared among all manager objects on the node via the XCOPY blocks below.
02743 void ComputePmeMgr::initialize_computes() {
02744 
02745   noWorkCount = 0;
02746   doWorkCount = 0;
02747   ungridForcesCount = 0;
02748 
02749   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02750 
02751   SimParameters *simParams = Node::Object()->simParameters;
02752 
02753   strayChargeErrors = 0;
02754 
02755 #ifdef NAMD_CUDA
      // Pick a master PE on this node: prefer a PE that has patches and is
      // not the CUDA master PE (to keep PME work off the nonbonded
      // data-preparation PE), using pe_sortop_diffuse to spread load.
02756  PatchMap *patchMap = PatchMap::Object();
02757  int pe = master_pe = CkNodeFirst(CkMyNode());
02758  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02759     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02760     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02761     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02762     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02763     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02764         && pe != deviceCUDA->getMasterPe() ) {
02765       master_pe = pe;
02766     }
02767  }
02768  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02769    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02770  }
02771 
02772  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
      // Under offload only the first manager on the node to get here
      // (detected via the master's counter, under the node lock) performs
      // the shared allocations; the lock is released at the end below.
02773  bool cudaFirst = 1;
02774  if ( offload ) {
02775   CmiLock(cuda_lock);
02776   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02777  }
02778 
02779  if ( cudaFirst ) {
02780   nodePmeMgr->master_pe = master_pe;
02781   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02782  }
02783 #endif
02784 
      // qsize counts grid floats (z padded for the FFT); fsize counts the
      // per-(x,y)-column flag entries.
02785   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02786   fsize = myGrid.K1 * myGrid.dim2;
02787   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02788 #ifdef NAMD_CUDA
02789  if ( ! offload )
02790 #endif
 {
      // Host path: per-column charge pointers, lazily filled by
      // PmeRealSpace::fill_charges (q_count tracks how many are live).
02792   q_arr = new float*[fsize*numGrids];
02793   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02794   q_list = new float*[fsize*numGrids];
02795   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02796   q_count = 0;
 }
02798 
02799 #ifdef NAMD_CUDA
02800  if ( cudaFirst || ! offload ) {
02801 #endif
      // f_arr flags which (x,y) columns this PE contributes to; start all
      // entries at 2 ("untouched") then clear the columns we own below.
02802   f_arr = new char[fsize*numGrids];
02803   // memset to non-zero value has race condition on BlueGene/Q
02804   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02805   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02806 
02807   for ( int g=0; g<numGrids; ++g ) {
02808     char *f = f_arr + g*fsize;
02809     if ( usePencils ) {
      // Pencil decomposition: enable the (i,j) rectangles of each active
      // z-pencil this PE sends to.
02810       int K1 = myGrid.K1;
02811       int K2 = myGrid.K2;
02812       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02813       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02814       int dim2 = myGrid.dim2;
02815       for (int ap=0; ap<numPencilsActive; ++ap) {
02816         int ib = activePencils[ap].i;
02817         int jb = activePencils[ap].j;
02818         int ibegin = ib*block1;
02819         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02820         int jbegin = jb*block2;
02821         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02822         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02823         for ( int i=ibegin; i<iend; ++i ) {
02824           for ( int j=jbegin; j<jend; ++j ) {
02825             f[i*dim2+j] = 0;
02826           }
02827         }
02828       }
02829     } else {
      // Slab decomposition: enable the contiguous slab ranges owned by
      // each destination grid PE we actually send to.
02830       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02831       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02832       for (int pe=0; pe<numGridPes; pe++) {
02833         if ( ! recipPeDest[pe] ) continue;
02834         int start = pe * bsize;
02835         int len = bsize;
02836         if ( start >= qsize ) { start = 0; len = 0; }
02837         if ( start + len > qsize ) { len = qsize - start; }
02838         int zdim = myGrid.dim3;
02839         int fstart = start / zdim;
02840         int flen = len / zdim;
02841         memset(f + fstart, 0, flen*sizeof(char));
02842         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02843       }
02844     }
02845   }
02846 #ifdef NAMD_CUDA
 }
02848  if ( offload ) {
02849  cudaSetDevice(deviceCUDA->getDeviceID());
02850  if ( cudaFirst ) {
02851 
      // Count enabled columns so device storage is allocated only for
      // columns this node actually touches.
02852   int f_alloc_count = 0;
02853   for ( int n=fsize, i=0; i<n; ++i ) {
02854     if ( f_arr[i] == 0 ) {
02855       ++f_alloc_count;
02856     }
02857   }
02858   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02859 
02860   q_arr = new float*[fsize*numGrids];
02861   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02862 
      // Per-column device pointer tables (built on the host, then copied
      // to the device in one transfer each).
02863   float **q_arr_dev_host = new float*[fsize];
02864   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02865 
02866   float **v_arr_dev_host = new float*[fsize];
02867   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02868 
      // Each column stores K3 plus order-1 wrap-around entries.
02869   int q_stride = myGrid.K3+myGrid.order-1;
02870   q_data_size = f_alloc_count * q_stride * sizeof(float);
02871   ffz_size = (fsize + q_stride) * sizeof(int);
02872 
02873   // tack ffz onto end of q_data to allow merged transfer
02874   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02875   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02876   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02877   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02878   cudaMalloc((void**) &v_data_dev, q_data_size);
02879   cuda_errcheck("malloc grid data for pme");
02880   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
      // Events used to order the charge memset, kernels, and potential
      // copy across the streams of all managers on this node.
02881   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02882   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02883   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02884   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02885 
      // Second pass: hand each enabled column its slice of the packed
      // host/device charge and potential storage.
02886   f_alloc_count = 0;
02887   for ( int n=fsize, i=0; i<n; ++i ) {
02888     if ( f_arr[i] == 0 ) {
02889       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02890       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02891       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02892       ++f_alloc_count;
02893     } else {
02894       q_arr[i] = 0;
02895       q_arr_dev_host[i] = 0;
02896       v_arr_dev_host[i] = 0;
02897     }
02898   }
02899 
02900   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02901   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02902   delete [] q_arr_dev_host;
02903   delete [] v_arr_dev_host;
      // Offload replaces the per-grid flag array with a single fsize+stride
      // array whose tail doubles as fz_arr.
02904   delete [] f_arr;
02905   f_arr = new char[fsize + q_stride];
02906   fz_arr = f_arr + fsize;
02907   memset(f_arr, 0, fsize + q_stride);
02908   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02909 
02910   cuda_errcheck("initialize grid data for pme");
02911 
02912   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02913   cuda_errcheck("initialize bspline coefficients for pme");
02914 
      // Publish the shared buffers/handles to the node master object...
02915 #define XCOPY(X) masterPmeMgr->X = X;
02916   XCOPY(bspline_coeffs_dev)
02917   XCOPY(bspline_dcoeffs_dev)
02918   XCOPY(q_arr)
02919   XCOPY(q_arr_dev)
02920   XCOPY(v_arr_dev)
02921   XCOPY(q_data_size)
02922   XCOPY(q_data_host)
02923   XCOPY(q_data_dev)
02924   XCOPY(v_data_dev)
02925   XCOPY(ffz_size)
02926   XCOPY(ffz_host)
02927   XCOPY(ffz_dev)
02928   XCOPY(f_arr)
02929   XCOPY(fz_arr)
02930 #undef XCOPY
02931   //CkPrintf("pe %d init first\n", CkMyPe());
02932  } else { // cudaFirst
02933   //CkPrintf("pe %d init later\n", CkMyPe());
      // ...and every later manager on the node copies them back from it.
02934 #define XCOPY(X) X = masterPmeMgr->X;
02935   XCOPY(bspline_coeffs_dev)
02936   XCOPY(bspline_dcoeffs_dev)
02937   XCOPY(q_arr)
02938   XCOPY(q_arr_dev)
02939   XCOPY(v_arr_dev)
02940   XCOPY(q_data_size)
02941   XCOPY(q_data_host)
02942   XCOPY(q_data_dev)
02943   XCOPY(v_data_dev)
02944   XCOPY(ffz_size)
02945   XCOPY(ffz_host)
02946   XCOPY(ffz_dev)
02947   XCOPY(f_arr)
02948   XCOPY(fz_arr)
02949 #undef XCOPY
02950  } // cudaFirst
      // Matches the CmiLock taken above when offload is enabled.
02951   CmiUnlock(cuda_lock);
02952  } else // offload
02953 #endif // NAMD_CUDA
 {
      // Host path: z-column flag array (K3 plus order-1 wrap entries).
02955   fz_arr = new char[myGrid.K3+myGrid.order-1];
 }
02957 
02958 #if 0 && USE_PERSISTENT
02959   recvGrid_handle = NULL;
02960 #endif
02961 }
02962 
02963 ComputePme::~ComputePme()
02964 {
02965 #ifdef NAMD_CUDA
02966   if ( ! offload )
02967 #endif
02968   {
02969     for ( int g=0; g<numGrids; ++g ) delete myRealSpace[g];
02970   }
02971 }
02972 
// Disabled (compiled out by "#if 0"): sets up compressed persistent
// communication handles for sending grid data to z-pencils.  Kept for
// reference; sizes below mirror the PmeGridMsg layout used by sendPencils.
02973 #if 0 && USE_PERSISTENT 
02974 void ComputePmeMgr::setup_recvgrid_persistent() 
02975 {
02976     int K1 = myGrid.K1;
02977     int K2 = myGrid.K2;
02978     int dim2 = myGrid.dim2;
02979     int dim3 = myGrid.dim3;
02980     int block1 = myGrid.block1;
02981     int block2 = myGrid.block2;
02982 
02983     CkArray *zPencil_local = zPencil.ckLocalBranch();
02984     recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
02985     for (int ap=0; ap<numPencilsActive; ++ap) {
02986         int ib = activePencils[ap].i;
02987         int jb = activePencils[ap].j;
02988         int ibegin = ib*block1;
02989         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02990         int jbegin = jb*block2;
02991         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02992         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02993         // f is changing
        // Count enabled (x,y) columns in this pencil's rectangle across
        // all grids; zero means no data will be sent.
02994         int fcount = 0;
02995         for ( int g=0; g<numGrids; ++g ) {
02996             char *f = f_arr + g*fsize;
02997             for ( int i=ibegin; i<iend; ++i ) {
02998                 for ( int j=jbegin; j<jend; ++j ) {
02999                     fcount += f[i*dim2+j];
03000                 }
03001             }
03002         }
        // Number of z planes with any charge, per fz_arr flags.
03003         int zlistlen = 0;
03004         for ( int i=0; i<myGrid.K3; ++i ) {
03005             if ( fz_arr[i] ) ++zlistlen;
03006         }
03007         int hd = ( fcount? 1 : 0 );  // has data?
03008         int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
        // Only the float payload (charges) is compressed; headers, flags,
        // and z-plane list stay uncompressed ahead of compress_start.
03009         int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
03010         int compress_size = sizeof(float)*hd*fcount*zlistlen;
03011         int size = compress_start +  compress_size  + PRIORITY_SIZE/8+6;
03012         recvGrid_handle[ap] =  CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03013     }
03014 }
03015 #endif
03016 
03017 int ComputePme::noWork() {
03018 
03019   if ( patch->flags.doFullElectrostatics ) {
03020     // In QM/MM simulations, atom charges form QM regions need special treatment.
03021     if ( qmForcesOn ) {
03022         return 1;
03023     }
03024     if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;  // work to do, enqueue as usual
03025     myMgr->heldComputes.add(this);
03026     return 1;  // don't enqueue yet
03027   }
03028 
03029   positionBox->skip();
03030   forceBox->skip();
03031 
03032   if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
03033     myMgr->noWorkCount = 0;
03034     myMgr->reduction->submit();
03035   }
03036 
03037   atomsChanged = 0;
03038 
03039   return 1;  // no work for this step
03040 }
03041 
03042 void ComputePmeMgr::addRecipEvirClient() {
03043   ++recipEvirClients;
03044 }
03045 
03046 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
03047   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03048   for ( int g=0; g<numGrids; ++g ) {
03049     evir[g] += msg->evir[g];
03050   }
03051   delete msg;
03052   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03053   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03054 }
03055 
03056 void ComputePme::doQMWork() {
03057     
03058 //     iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
03059     
03060     
03061     int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
03062     const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
03063     const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
03064     const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
03065     
03066     const CompAtomExt *xExt = patch->getCompAtomExtInfo();
03067     
03068     // Determine number of qm atoms in this patch for the current step.
03069     numLocalQMAtoms = 0;
03070     for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
03071         if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
03072             numLocalQMAtoms++;
03073         }
03074     }
03075     
03076     // We prepare a charge vector with QM charges for use in the PME calculation.
03077     
03078     // Clears data from last step, if there is any.
03079     if (qmLoclIndx != 0)
03080         delete [] qmLoclIndx;
03081     if (qmLocalCharges != 0)
03082         delete [] qmLocalCharges;
03083     
03084     qmLoclIndx = new int[numLocalQMAtoms] ;
03085     qmLocalCharges = new Real[numLocalQMAtoms] ;
03086     
03087     // I am assuming there will be (in general) more QM atoms among all QM groups
03088     // than MM atoms in a patch.
03089     int procAtms = 0;
03090     
03091     for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
03092         
03093         for (int i=0; i<numQMAtms; i++) {
03094             
03095             if (qmAtmIndx[i] == xExt[paIter].id) {
03096                 
03097                 qmLoclIndx[procAtms] = paIter ;
03098                 qmLocalCharges[procAtms] = qmAtmChrg[i];
03099                 
03100                 procAtms++;
03101                 break;
03102             }
03103             
03104         }
03105         
03106         if (procAtms == numLocalQMAtoms)
03107             break;
03108     }
03109     
03110     doWork();
03111     return ;
03112 }
03113 
// Main per-step entry point.  This compute is enqueued twice per step:
// the first invocation (home priority) gathers positions/charges, spreads
// charges onto the grid(s), and lowers its priority; the second invocation
// (PME priority, after reciprocal results return) interpolates forces via
// ungridForces().  The basePriority test below distinguishes the phases.
03114 void ComputePme::doWork()
03115 {
03116   DebugM(4,"Entering ComputePme::doWork().\n");
03117 
03118   if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
      // Second phase: potentials are back, extract forces and reset
      // priority for the next step.
03119 #ifdef NAMD_CUDA
03120     basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
03121 #else
03122     basePriority = PME_PRIORITY;
03123 #endif
03124     ungridForces();
03125     // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
      // Last compute on this PE to finish (with all evir messages in)
      // submits the reductions.
03126     if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
03127     return;
03128   }
      // First phase: raise priority so the force-extraction re-enqueue of
      // this object sorts after other home-priority work.
03129   basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
03130   // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
03131 
03132 #ifdef TRACE_COMPUTE_OBJECTS
03133     double traceObjStartTime = CmiWallTimer();
03134 #endif
03135 
03136 #ifdef NAMD_CUDA
03137   if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
03138 #endif
03139 
03140   // allocate storage
03141   numLocalAtoms = patch->getNumAtoms();
03142 
03143   Lattice &lattice = patch->flags.lattice;
03144 
      // localData holds the raw atoms plus, when multiple grids (or self
      // mode) are used, one extra slot per grid for the filtered copies
      // addressed via localGridData[] below.
03145   localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
03146   localData = localData_alloc.begin();
03147   localPartition_alloc.resize(numLocalAtoms);
03148   localPartition = localPartition_alloc.begin();
03149 
03150   int g;
03151   for ( g=0; g<numGrids; ++g ) {
03152     localGridData[g] = localData + numLocalAtoms*(g+1);
03153   }
03154 
03155   // get positions and charges
03156   PmeParticle * data_ptr = localData;
03157   unsigned char * part_ptr = localPartition;
      // Charges are pre-scaled by sqrt(Coulomb constant * scaling /
      // dielectric) so products of two scaled charges give energies.
03158   const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
03159                                 * ComputeNonbondedUtil::dielectric_1 );
03160 
03161   {
03162     CompAtom *x = positionBox->open();
03163     // CompAtomExt *xExt = patch->getCompAtomExtInfo();
      // MOLLY uses time-averaged positions instead of instantaneous ones.
03164     if ( patch->flags.doMolly ) {
03165       positionBox->close(&x);
03166       x = avgPositionBox->open();
03167     }
03168     int numAtoms = patch->getNumAtoms();
03169 
03170     for(int i=0; i<numAtoms; ++i)
03171     {
03172       data_ptr->x = x[i].position.x;
03173       data_ptr->y = x[i].position.y;
03174       data_ptr->z = x[i].position.z;
03175       data_ptr->cg = coulomb_sqrt * x[i].charge;
03176       ++data_ptr;
03177       *part_ptr = x[i].partition;
03178       ++part_ptr;
03179     }
03180 
03181     // QM loop to overwrite charges of QM atoms.
03182     // They are zero for NAMD, but are updated in ComputeQM.
03183     if ( qmForcesOn ) {
03184         
      // qmLoclIndx/qmLocalCharges were prepared by doQMWork() this step.
03185         for(int i=0; i<numLocalQMAtoms; ++i)
03186         {
03187           localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
03188         }
03189         
03190     }
03191     
03192     if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
03193     else { positionBox->close(&x); }
03194   }
03195 
03196   // copy to other grids if needed
      // Each mode below partitions atoms into per-grid lists according to
      // their partition tag; numGridAtoms[g] records each list's length.
03197   if ( (alchOn && (!alchDecouple)) || lesOn ) {
03198     for ( g=0; g<numGrids; ++g ) {
03199       PmeParticle *lgd = localGridData[g];
03200       int nga = 0;
03201       for(int i=0; i<numLocalAtoms; ++i) {
03202         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
03203           // for FEP/TI: grid 0 gets non-alch + partition 1;
03204           // grid 1 gets non-alch + partition 2;
03205           // grid 2 (only if called for with numGrids=3) gets only non-alch
03206           lgd[nga++] = localData[i];
03207         }
03208       }
03209       numGridAtoms[g] = nga;
03210     }
03211   } else if ( alchOn && alchDecouple) {
03212     // alchemical decoupling: four grids
03213     // g=0: partition 0 and partition 1
03214     // g=1: partition 0 and partition 2
03215     // g=2: only partition 1 atoms
03216     // g=3: only partition 2 atoms
03217     // plus one grid g=4, only partition 0, if numGrids=5
03218     for ( g=0; g<2; ++g ) {  // same as before for first 2
03219       PmeParticle *lgd = localGridData[g];
03220       int nga = 0;
03221       for(int i=0; i<numLocalAtoms; ++i) {
03222         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
03223           lgd[nga++] = localData[i];
03224         }
03225       }
03226       numGridAtoms[g] = nga;
03227     }
03228     for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
03229       PmeParticle *lgd = localGridData[g];
03230       int nga = 0;
03231       for(int i=0; i<numLocalAtoms; ++i) {
03232         if ( localPartition[i] == (g-1) ) {
03233           lgd[nga++] = localData[i];
03234         }
03235       }
03236       numGridAtoms[g] = nga;
03237     }
03238     for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
03239       // numGrids=5 only if alchElecLambdaStart > 0
03240       PmeParticle *lgd = localGridData[g];
03241       int nga = 0;
03242       for(int i=0; i<numLocalAtoms; ++i) {
03243         if ( localPartition[i] == 0 ) {
03244           lgd[nga++] = localData[i];
03245         }
03246       }
03247       numGridAtoms[g] = nga;
03248     }
03249   } else if ( selfOn ) {
03250     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
03251     g = 0;
03252     PmeParticle *lgd = localGridData[g];
03253     int nga = 0;
03254     for(int i=0; i<numLocalAtoms; ++i) {
03255       if ( localPartition[i] == 1 ) {
03256         lgd[nga++] = localData[i];
03257       }
03258     }
03259     numGridAtoms[g] = nga;
03260   } else if ( pairOn ) {
03261     if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
      // Pair mode: grid 0 = both partitions, grids 1 and 2 = one each.
03262     g = 0;
03263     PmeParticle *lgd = localGridData[g];
03264     int nga = 0;
03265     for(int i=0; i<numLocalAtoms; ++i) {
03266       if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
03267         lgd[nga++] = localData[i];
03268       }
03269     }
03270     numGridAtoms[g] = nga;
03271     for ( g=1; g<3; ++g ) {
03272       PmeParticle *lgd = localGridData[g];
03273       int nga = 0;
03274       for(int i=0; i<numLocalAtoms; ++i) {
03275         if ( localPartition[i] == g ) {
03276           lgd[nga++] = localData[i];
03277         }
03278       }
03279       numGridAtoms[g] = nga;
03280     }
03281   } else {
      // Single-grid case: use the raw atom list directly, no copy needed.
03282     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
03283     localGridData[0] = localData;
03284     numGridAtoms[0] = numLocalAtoms;
03285   }
03286 
      // First compute on this PE this step resets the shared per-step
      // manager state (grid buffers, energy accumulators, sequence tag).
03287  if ( ! myMgr->doWorkCount ) {
03288   myMgr->doWorkCount = myMgr->pmeComputes.size();
03289 
03290 #ifdef NAMD_CUDA
03291  if ( !  offload )
03292 #endif // NAMD_CUDA
03293  {
03294   memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
03295 
03296   for (int i=0; i<myMgr->q_count; ++i) {
03297     memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
03298   }
03299  }
03300 
03301   for ( g=0; g<numGrids; ++g ) {
03302     myMgr->evir[g] = 0;
03303   }
03304 
03305   myMgr->strayChargeErrors = 0;
03306 
03307   myMgr->compute_sequence = sequence();
03308  }
03309 
03310   if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
03311 
03312   int strayChargeErrors = 0;
03313 
03314   // calculate self energy
      // Standard Ewald self-energy correction, -ewaldcof/sqrt(pi) * sum q^2
      // (charges already carry the sqrt(Coulomb) scaling from above).
03315   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
03316   for ( g=0; g<numGrids; ++g ) {
03317     BigReal selfEnergy = 0;
03318     data_ptr = localGridData[g];
03319     int i;
03320     for(i=0; i<numGridAtoms[g]; ++i)
03321     {
03322       selfEnergy += data_ptr->cg * data_ptr->cg;
03323       ++data_ptr;
03324     }
03325     selfEnergy *= -1. * ewaldcof / SQRT_PI;
03326     myMgr->evir[g][0] += selfEnergy;
03327 
03328     float **q = myMgr->q_arr + g*myMgr->fsize;
03329     char *f = myMgr->f_arr + g*myMgr->fsize;
03330 
      // Map atom coordinates into fractional grid coordinates.
03331     scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
03332 #ifdef NAMD_CUDA
03333    if ( offload ) {
      // Append this compute's atoms to the node-shared pinned buffer,
      // growing (by 20% + slack) if the running count exceeds capacity.
03334     if ( myMgr->cuda_atoms_alloc == 0 ) {  // first call
03335       int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
03336       cuda_errcheck("before malloc atom data for pme");
03337       cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
03338       cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
03339       cuda_errcheck("malloc atom data for pme");
03340       myMgr->cuda_atoms_count = 0;
03341     }
03342     cuda_atoms_offset = myMgr->cuda_atoms_count;
03343     int n = numGridAtoms[g];
03344     myMgr->cuda_atoms_count += n;
03345     if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
03346       CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
03347                         CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
03348       cuda_errcheck("before malloc expanded atom data for pme");
03349       int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
03350       const float *a_data_host_old = myMgr->a_data_host;
03351       cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
03352       cuda_errcheck("malloc expanded host atom data for pme");
      // Preserve atoms already packed by earlier computes this step;
      // device data need not be copied since it is re-uploaded anyway.
03353       memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
03354       cudaFreeHost((void*) a_data_host_old);
03355       cuda_errcheck("free expanded host atom data for pme");
03356       cudaFree(myMgr->a_data_dev);
03357       cuda_errcheck("free expanded dev atom data for pme");
03358       cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
03359       cuda_errcheck("malloc expanded dev atom data for pme");
03360     }
      // Pack 7 floats per atom: fractional offsets within the grid cell,
      // scaled charge, and (shifted, wrapped) integer cell indices.
03361     float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
03362     data_ptr = localGridData[g];
03363     double order_1 = myGrid.order - 1;
03364     double K1 = myGrid.K1;
03365     double K2 = myGrid.K2;
03366     double K3 = myGrid.K3;
03367     int found_negative = 0;
03368     for ( int i=0; i<n; ++i ) {
03369       if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
03370         found_negative = 1;
03371         // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
03372       }
03373       double x_int = (int) data_ptr[i].x;
03374       double y_int = (int) data_ptr[i].y;
03375       double z_int = (int) data_ptr[i].z;
03376       a_data_host[7*i  ] = data_ptr[i].x - x_int;  // subtract in double precision
03377       a_data_host[7*i+1] = data_ptr[i].y - y_int;
03378       a_data_host[7*i+2] = data_ptr[i].z - z_int;
03379       a_data_host[7*i+3] = data_ptr[i].cg;
03380       x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
03381       y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
03382       z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
03383       a_data_host[7*i+4] = x_int;
03384       a_data_host[7*i+5] = y_int;
03385       a_data_host[7*i+6] = z_int;
03386     }
03387     if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
03388    } else
03389 #endif // NAMD_CUDA
03390    {
      // Host path: spread this grid's charges via B-spline interpolation.
03391     myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
03392     myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
03393    }
03394   }
03395   myMgr->strayChargeErrors += strayChargeErrors;
03396 
03397 #ifdef TRACE_COMPUTE_OBJECTS
03398     traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
03399 #endif
03400 
      // Last compute on this PE to finish spreading submits the grid.
03401  if ( --(myMgr->doWorkCount) == 0 ) {
03402 // cudaDeviceSynchronize();  // XXXX
03403 #ifdef NAMD_CUDA
03404   if ( offload ) {
      // Serialize cuda_submit_charges calls across the node: if another
      // PE is already submitting, queue our request; otherwise become the
      // submitter and drain the deque, dropping the lock around each
      // actual submission.  The CUDA master PE never drains the deque so
      // nonbonded data preparation is not delayed on it.
03405     ComputePmeMgr::cuda_submit_charges_args args;
03406     args.mgr = myMgr;
03407     args.lattice = &lattice;
03408     args.sequence = sequence();
03409     CmiLock(ComputePmeMgr::cuda_lock);
03410     if ( ComputePmeMgr::cuda_busy ) {
03411       ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
03412     } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
03413       // avoid adding work to nonbonded data preparation pe
03414       args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
03415     } else {
03416       ComputePmeMgr::cuda_busy = true;
03417       while ( 1 ) {
03418         CmiUnlock(ComputePmeMgr::cuda_lock);
03419         args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
03420         CmiLock(ComputePmeMgr::cuda_lock);
03421         if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
03422           args = ComputePmeMgr::cuda_submit_charges_deque.front();
03423           ComputePmeMgr::cuda_submit_charges_deque.pop_front();
03424         } else {
03425           ComputePmeMgr::cuda_busy = false;
03426           break;
03427         }
03428       }
03429     }
03430     CmiUnlock(ComputePmeMgr::cuda_lock);
03431   } else
03432 #endif // NAMD_CUDA
03433   {
03434     myMgr->chargeGridReady(lattice,sequence());
03435   }
03436  }
03437  atomsChanged = 0;
03438 }
03439 
03440 #ifdef NAMD_CUDA
03441 
// Upload the packed atom buffer and launch the charge-spreading kernel on
// this manager's stream.  Called with ComputePmeMgr::cuda_lock held (see
// doWork); chargeGridSubmitted must also run inside that lock.
03442 void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {
03443 
03444     int n = cuda_atoms_count;
03445     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
      // Reset so the next step's computes repack from offset zero.
03446     cuda_atoms_count = 0;
03447 
03448     const double before = CmiWallTimer();
      // 7 floats per atom: fractional offsets, charge, grid cell indices.
03449     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03450                           cudaMemcpyHostToDevice, streams[stream]);
03451     const double after = CmiWallTimer();
03452 
      // Do not start spreading until the node-wide grid memset completed.
03453     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03454 
03455     cuda_pme_charges(
03456       bspline_coeffs_dev,
03457       q_arr_dev, ffz_dev, ffz_dev + fsize,
03458       a_data_dev, n,
03459       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03460       streams[stream]);
03461     const double after2 = CmiWallTimer();
03462 
03463     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03464 
      // Record submission time on the master; used by the poll handler to
      // report elapsed time and detect hangs.
03465     masterPmeMgr->charges_time = before;
03466     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03467     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03468 }
03469 
03470 void cuda_check_pme_charges(void *arg, double walltime) {
03471   ComputePmeMgr *argp = (ComputePmeMgr *) arg;
03472 
03473   cudaError_t err = cudaEventQuery(argp->end_charges);
03474   if ( err == cudaSuccess ) {
03475     traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
03476     argp->charges_time = walltime - argp->charges_time;
03477     argp->sendChargeGridReady();
03478     argp->check_charges_count = 0;
03479   } else if ( err != cudaErrorNotReady ) {
03480     cuda_errcheck("in cuda_check_pme_charges");
03481     NAMD_bug("cuda_errcheck missed error in cuda_check_pme_charges");
03482   } else if ( ++(argp->check_charges_count) >= count_limit ) {
03483     char errmsg[256];
03484     sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
03485             argp->check_charges_count, walltime - argp->charges_time,
03486             argp->saved_sequence);
03487     cuda_errcheck(errmsg);
03488     NAMD_die(errmsg);
03489   } else {
03490     CcdCallBacksReset(0,walltime);  // fix Charm++
03491     CUDA_POLL(cuda_check_pme_charges, arg);
03492   }
03493 }
03494 
// Record the lattice/sequence for the eventual recvChargeGridReady(), and
// when the last submitting manager arrives (chargeGridSubmittedCount hits
// zero) queue the device-to-host copy of the charge grid and re-zero the
// device grid for the next step.  Called from cuda_submit_charges(), which
// notes it must run inside cuda_lock.
void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
  saved_lattice = &lattice;
  saved_sequence = sequence;

  // cudaDeviceSynchronize();  //  XXXX TESTING
  //int q_stride = myGrid.K3+myGrid.order-1;
  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
  //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
  //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
  //  }
  //}
  //CmiLock(cuda_lock);

 if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
  double before = CmiWallTimer();
  // Order the host copy after all PME kernels on every stream, then record
  // end_charges so cuda_check_pme_charges() can detect copy completion.
  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
                        cudaMemcpyDeviceToHost, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
  // Clear the device grid now; next step's kernels wait on end_charge_memset.
  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
  //CmiUnlock(cuda_lock);
  // cudaDeviceSynchronize();  //  XXXX TESTING
  // cuda_errcheck("after memcpy grid to host");

  SimParameters *simParams = Node::Object()->simParameters;
  if ( ! simParams->useCUDA2 ) {
    // Hand the device back to the nonbonded compute on the master PE.
    CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
    cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
  }

  pmeProxy[master_pe].pollChargeGridReady();
 }
}
03531 
03532 void ComputePmeMgr::sendChargeGridReady() {
03533   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03534     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03535     int cs = mgr->pmeComputes.size();
03536     if ( cs ) {
03537       mgr->ungridForcesCount = cs;
03538       mgr->recipEvirCount = mgr->recipEvirClients;
03539       masterPmeMgr->chargeGridSubmittedCount++;
03540     }
03541   }
03542   pmeProxy[master_pe].recvChargeGridReady();
03543 }
03544 #endif // NAMD_CUDA
03545 
// Entry method (invoked on master_pe from chargeGridSubmitted): start
// polling for completion of the charge-grid copy.  CUDA builds only.
void ComputePmeMgr::pollChargeGridReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
  CUDA_POLL(cuda_check_pme_charges,this);
#else
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#endif
}
03554 
// Entry method: resume the step using the lattice and sequence number
// captured earlier in chargeGridSubmitted().
void ComputePmeMgr::recvChargeGridReady() {
  chargeGridReady(*saved_lattice,saved_sequence);
}
03558 
03559 void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
03560 
03561 #ifdef NAMD_CUDA
03562  if ( offload ) {
03563   int errcount = 0;
03564   int q_stride = myGrid.K3+myGrid.order-1;
03565   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03566     f_arr[j] = ffz_host[j];
03567     if ( ffz_host[j] & ~1 ) ++errcount;
03568   }
03569   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03570  }
03571 #endif
03572   recipEvirCount = recipEvirClients;
03573   ungridForcesCount = pmeComputes.size();
03574 
03575   for (int j=0; j<myGrid.order-1; ++j) {
03576     fz_arr[j] |= fz_arr[myGrid.K3+j];
03577   }
03578 
03579   if ( usePencils ) {
03580     sendPencils(lattice,sequence);
03581   } else {
03582     sendData(lattice,sequence);
03583   }
03584 }
03585 
03586 
// Pack and send the local charge-grid contribution for active pencils
// [first,last] to their z-pencil owners.  For each pencil: count occupied
// columns (fcount) and occupied z planes (zlist), then pack flags and the
// occupied plane values of each flagged column into a PmeGridMsg.  Empty
// pencils still get a (data-less) message with hasData == 0.
void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

#if 0 && USE_PERSISTENT
    if (recvGrid_handle== NULL) setup_recvgrid_persistent();
#endif
  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;

  // int savedMessages = 0;
  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  for (int ap=first; ap<=last; ++ap) {
    // (i,j) block of this pencil and its clamped column range.
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);

    // Count occupied columns; in offload mode also copy the device flags
    // into f_arr and verify they are only 0 or 1.
    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        int k = i*dim2+j;
        f[k] = ffz_host[k];
        fcount += f[k];
        if ( ffz_host[k] & ~1 ) ++errcount;
       }
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
     } else
#endif
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        fcount += f[i*dim2+j];
       }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! pencilActive[ib*yBlocks+jb] )
      NAMD_bug("PME activePencils list inconsistent");

    // Number of occupied z planes (shared by all columns in the message).
    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    int hd = ( fcount? 1 : 0 );  // has data?
    // if ( ! hd ) ++savedMessages;

    
    // Variable-size message: arrays are allocated empty when hd == 0.
    PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
        hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
    msg->sourceNode = sourcepe;
    msg->hasData = hd;
    msg->lattice = lattice;
   if ( hd ) {
#if 0
    msg->start = fstart;
    msg->len = flen;
#else
    msg->start = -1;   // obsolete?
    msg->len = -1;   // obsolete?
#endif
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // Pack per-column flags, then the occupied z values of each flagged
    // column.  The order-1 wrap-around planes past K3 are folded down
    // into the primary planes before packing.
    char *fmsg = msg->fgrid;
    float *qmsg = msg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
      float **q = q_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        *(fmsg++) = f[i*dim2+j];
        if( f[i*dim2+j] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i*dim2+j][zlist[k]];
          }
        }
       }
      }
    }
   }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // Node-parallel receive: route directly to the destination node's
    // NodePmeMgr using the pencil map.
    msg->destElem=CkArrayIndex3D(ib,jb,0);
    CProxy_PmePencilMap lzm = npMgr->zm;
    int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
    int destnode = CmiNodeOf(destproc);
    
#if  0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    pmeNodeProxy[destnode].recvZGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if 0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    zPencil(ib,jb,0).recvGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }


  // if ( savedMessages ) {
  //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
  // }

}
03727 
03728 
// Entry method wrapper: forward a single-pencil send to the node-level
// manager (see NodePmeMgr::sendPencilsHelper).
void ComputePmeMgr::sendPencilsHelper(int iter) {
  nodePmeMgr->sendPencilsHelper(iter);
}
03732 
// Send pencil number `iter` on behalf of the node's master manager, using
// the lattice/sequence/sourcepe stashed in the sendDataHelper_* fields by
// ComputePmeMgr::sendPencils().  CUDA (offload) builds only.
void NodePmeMgr::sendPencilsHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
#else
  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif
}
03741 
// Send the charge grid to all active pencils.  In offload mode the sends
// are spread one pencil at a time through helper entry methods (so other
// work can interleave); otherwise everything is sent inline.  Afterwards,
// report any stray-charge errors: a flag value of 3 marks a column whose
// charge had nowhere to go.
void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {

  // Stash arguments for the helper entry methods used in offload mode.
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int ap=0; ap < numPencilsActive; ++ap ) {
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      int ib = activePencils[ap].i;
      int jb = activePencils[ap].j;
      int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
      pmeProxy[destproc].sendPencilsHelper(ap);
#else
      pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
#endif
    }
  } else
#endif
  {
    sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
  }

  if ( strayChargeErrors ) {
   strayChargeErrors = 0;
   iout << iERROR << "Stray PME grid charges detected: "
        << CkMyPe() << " sending to (x,y)";
   int K1 = myGrid.K1;
   int K2 = myGrid.K2;
   int dim2 = myGrid.dim2;
   int block1 = myGrid.block1;
   int block2 = myGrid.block2;
   // Scan inactive pencils for columns flagged 3 (stray charge), demote
   // them to 2 so they are reported only once, and list their (x,y).
   for (int ib=0; ib<xBlocks; ++ib) {
    for (int jb=0; jb<yBlocks; ++jb) {
     int ibegin = ib*block1;
     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
     int jbegin = jb*block2;
     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
     int flen = numGrids * (iend - ibegin) * (jend - jbegin);

     for ( int g=0; g<numGrids; ++g ) {
       char *f = f_arr + g*fsize;
       if ( ! pencilActive[ib*yBlocks+jb] ) {
           for ( int i=ibegin; i<iend; ++i ) {
            for ( int j=jbegin; j<jend; ++j ) {
             if ( f[i*dim2+j] == 3 ) {
               f[i*dim2+j] = 2;
               iout << " (" << i << "," << j << ")";
             }
            }
           }
       }
     }
    }
   }
   iout << "\n" << endi;
  }
 
}
03803 
03804 
03805 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
03806 
03807   int K1 = myGrid.K1;
03808   int K2 = myGrid.K2;
03809   int dim2 = myGrid.dim2;
03810   int dim3 = myGrid.dim3;
03811   int block1 = myGrid.block1;
03812   int block2 = myGrid.block2;
03813 
03814   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03815   int ib = msg->sourceNode / yBlocks;
03816   int jb = msg->sourceNode % yBlocks;
03817 
03818   int ibegin = ib*block1;
03819   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03820   int jbegin = jb*block2;
03821   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03822 
03823   int zlistlen = msg->zlistlen;
03824   int *zlist = msg->zlist;
03825   float *qmsg = msg->qgrid;
03826   int g;
03827   for ( g=0; g<numGrids; ++g ) {
03828     char *f = f_arr + g*fsize;
03829     float **q = q_arr + g*fsize;
03830     for ( int i=ibegin; i<iend; ++i ) {
03831      for ( int j=jbegin; j<jend; ++j ) {
03832       if( f[i*dim2+j] ) {
03833         f[i*dim2+j] = 0;
03834         for ( int k=0; k<zlistlen; ++k ) {
03835           q[i*dim2+j][zlist[k]] = *(qmsg++);
03836         }
03837         for (int h=0; h<myGrid.order-1; ++h) {
03838           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03839         }
03840       }
03841      }
03842     }
03843   }
03844 }
03845 
03846 
// Slab decomposition counterpart of sendPencilsPart: pack and send the
// local charge-grid contribution for grid PEs gridPeOrder[first..last].
// For each destination slab: count occupied columns, report stray charges
// (flag value 3) headed to inactive destinations, then pack flags and the
// occupied z values of each flagged column into a PmeGridMsg.
void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;

  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
  for (int j=first; j<=last; j++) {
    int pe = gridPeOrder[j];  // different order
    if ( ! recipPeDest[pe] && ! errors ) continue;
    // Clamp this slab's [start,start+len) range to the charge array size.
    int start = pe * bsize;
    int len = bsize;
    if ( start >= qsize ) { start = 0; len = 0; }
    if ( start + len > qsize ) { len = qsize - start; }
    int zdim = myGrid.dim3;
    int fstart = start / zdim;
    int flen = len / zdim;
    int fcount = 0;
    int i;

    int g;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      // Copy device flags into f_arr; flags must be only 0 or 1.
      int errcount = 0;
      for ( i=0; i<flen; ++i ) {
        f[i] = ffz_host[fstart+i];
        fcount += f[i];
        if ( ffz_host[fstart+i] & ~1 ) ++errcount;
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
     } else
#endif
      for ( i=0; i<flen; ++i ) {
        fcount += f[i];
      }
      if ( ! recipPeDest[pe] ) {
        // Destination is inactive: flag value 3 marks stray charge.
        // Demote to 2 so each stray column is reported only once.
        int errfound = 0;
        for ( i=0; i<flen; ++i ) {
          if ( f[i] == 3 ) {
            errfound = 1;
            break;
          }
        }
        if ( errfound ) {
          iout << iERROR << "Stray PME grid charges detected: "
                << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
          int iz = -1;
          for ( i=0; i<flen; ++i ) {
            if ( f[i] == 3 ) {
              f[i] = 2;
              int jz = (i+fstart)/myGrid.K2;
              if ( iz != jz ) { iout << " " << jz;  iz = jz; }
            }
          }
          iout << "\n" << endi;
        }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! recipPeDest[pe] ) continue;

    // Number of occupied z planes (shared by all columns in the message).
    int zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    PmeGridMsg *msg = new (zlistlen, flen*numGrids,
                                fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;

    msg->sourceNode = sourcepe;
    msg->lattice = lattice;
    msg->start = fstart;
    msg->len = flen;
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // Pack flags verbatim, then for each flagged column fold the order-1
    // wrap-around planes past K3 down into the primary planes and copy
    // the occupied plane values.
    float *qmsg = msg->qgrid;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
      CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
      float **q = q_arr + fstart + g*fsize;
      for ( i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i][h] += q[i][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i][zlist[k]];
          }
        }
      }
    }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    pmeProxy[gridPeMap[pe]].recvGrid(msg);
  }

}
03955 
// Entry method wrapper: forward a single-slab send to the node-level
// manager (see NodePmeMgr::sendDataHelper).
void ComputePmeMgr::sendDataHelper(int iter) {
  nodePmeMgr->sendDataHelper(iter);
}
03959 
// Send slab number `iter` on behalf of the node's master manager, using the
// arguments stashed in the sendDataHelper_* fields by
// ComputePmeMgr::sendData().  CUDA (offload) builds only.
void NodePmeMgr::sendDataHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
#else
  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif
}
03968 
// Send the charge grid to the slab-decomposition grid PEs.  In offload
// mode the per-slab sends go through helper entry methods (one per slab);
// otherwise everything is sent inline via sendDataPart().  The current
// strayChargeErrors flag is captured for the helpers and then cleared.
void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {

  // Stash arguments for the helper entry methods used in offload mode.
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();
  sendDataHelper_errors = strayChargeErrors;
  strayChargeErrors = 0;

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int i=0; i < numGridPes; ++i ) {
      int pe = gridPeOrder[i];  // different order
      if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      pmeProxy[gridPeMap[pe]].sendDataHelper(i);
#else
      pmeNodeProxy[CkMyNode()].sendDataHelper(i);
#endif
    }
  } else
#endif
  {
    sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
  }
 
}
03996 
03997 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
03998 
03999   int zdim = myGrid.dim3;
04000   int flen = msg->len;
04001   int fstart = msg->start;
04002   int zlistlen = msg->zlistlen;
04003   int *zlist = msg->zlist;
04004   float *qmsg = msg->qgrid;
04005   int g;
04006   for ( g=0; g<numGrids; ++g ) {
04007     char *f = msg->fgrid + g*flen;
04008     float **q = q_arr + fstart + g*fsize;
04009     for ( int i=0; i<flen; ++i ) {
04010       if ( f[i] ) {
04011         f[i] = 0;
04012         for ( int k=0; k<zlistlen; ++k ) {
04013           q[i][zlist[k]] = *(qmsg++);
04014         }
04015         for (int h=0; h<myGrid.order-1; ++h) {
04016           q[i][myGrid.K3+h] = q[i][h];
04017         }
04018       }
04019     }
04020   }
04021 }
04022 
04023 void ComputePme::ungridForces() {
04024 
04025     if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
04026  
04027     SimParameters *simParams = Node::Object()->simParameters;
04028 
04029     localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
04030     Vector *localResults = localResults_alloc.begin();
04031     Vector *gridResults;
04032     if ( alchOn || lesOn || selfOn || pairOn ) {
04033       for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
04034       gridResults = localResults + numLocalAtoms;
04035     } else {
04036       gridResults = localResults;
04037     }
04038 
04039     Vector pairForce = 0.;
04040     Lattice &lattice = patch->flags.lattice;
04041     int g = 0;
04042     if(!simParams->commOnly) {
04043     for ( g=0; g<numGrids; ++g ) {
04044 #ifdef NETWORK_PROGRESS
04045       CmiNetworkProgress();
04046 #endif
04047 
04048 #ifdef NAMD_CUDA
04049       if ( offload ) {
04050         int errfound = 0;
04051         for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
04052           // Neither isnan() nor x != x worked when testing on Cray; this does.
04053           if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }  // CUDA NaN
04054           gridResults[i].x = f_data_host[3*i];
04055           gridResults[i].y = f_data_host[3*i+1];
04056           gridResults[i].z = f_data_host[3*i+2];
04057         }
04058         if ( errfound ) {
04059           int errcount = 0;
04060           for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
04061             float f = f_data_host[3*i];
04062             if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {  // CUDA NaN
04063               ++errcount;
04064               gridResults[i] = 0.;
04065             }
04066           }
04067           iout << iERROR << "Stray PME grid charges detected: "
04068                 << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
04069         }
04070       } else
04071 #endif // NAMD_CUDA
04072         {
04073           myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
04074         }
04075       scale_forces(gridResults, numGridAtoms[g], lattice);
04076       
04077       if (alchOn) {
04078         float scale = 1.;
04079         BigReal elecLambdaUp, elecLambdaDown;
04080         if ( simParams->alchFepWhamOn ) {
04081           if ( simParams->alchFepElecOn ) {
04082             elecLambdaUp = simParams->alchElecLambda;
04083             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04084           }
04085           else {
04086             elecLambdaUp = 0.0;
04087             elecLambdaDown = 1.0;
04088           }
04089         }
04090         else {
04091           BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
04092           myMgr->alchLambda = alchLambda;
04093           elecLambdaUp = simParams->getElecLambda(alchLambda);
04094           elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
04095         }
04096         
04097         if ( g == 0 ) scale = elecLambdaUp;
04098         else if ( g == 1 ) scale = elecLambdaDown;
04099         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04100 
04101         if (alchDecouple) {
04102           if ( g == 2 ) scale = 1 - elecLambdaUp;
04103           else if ( g == 3 ) scale = 1 - elecLambdaDown;
04104           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04105         }
04106         int nga = 0;
04107         if (!alchDecouple) {
04108           for(int i=0; i<numLocalAtoms; ++i) {
04109             if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
04110               // (g=2: only partition 0)
04111               localResults[i] += gridResults[nga++] * scale;
04112             }
04113           }
04114         }
04115         else {  // alchDecouple
04116           if ( g < 2 ) {
04117             for(int i=0; i<numLocalAtoms; ++i) {
04118               if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
04119                 // g = 0: partition 0 or partition 1
04120                 // g = 1: partition 0 or partition 2
04121                 localResults[i] += gridResults[nga++] * scale;
04122               }
04123             }
04124           }
04125           else {
04126             for(int i=0; i<numLocalAtoms; ++i) {
04127               if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
04128                 // g = 2: partition 1 only
04129                 // g = 3: partition 2 only
04130                 // g = 4: partition 0 only
04131                 localResults[i] += gridResults[nga++] * scale;
04132               }
04133             }
04134           }
04135         }
04136       } else if ( lesOn ) {
04137         float scale = 1.;
04138         if ( alchFepOn ) {
04139           if(simParams->alchFepWhamOn) {
04140             if(simParams->alchFepElecOn) {
04141               if ( g == 0 ) scale = simParams->alchElecLambda;
04142               else if ( g == 1 ) scale = 1. - simParams->alchElecLambda;
04143             }
04144             else {
04145               if ( g == 0 ) scale = 0.0;
04146               else if ( g == 1 ) scale = 1.0;
04147             }
04148           }
04149           else {
04150             BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
04151             myMgr->alchLambda = alchLambda;
04152             if ( g == 0 ) scale = alchLambda;
04153             else if ( g == 1 ) scale = 1. - alchLambda;
04154           }
04155         } else if ( lesOn ) {
04156           scale = 1.0 / (float)lesFactor;
04157         }
04158         int nga = 0;
04159         for(int i=0; i<numLocalAtoms; ++i) {
04160           if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
04161             localResults[i] += gridResults[nga++] * scale;
04162           }
04163         }
04164       } else if ( selfOn ) {
04165         PmeParticle *lgd = localGridData[g];
04166         int nga = 0;
04167         for(int i=0; i<numLocalAtoms; ++i) {
04168           if ( localPartition[i] == 1 ) {
04169             pairForce += gridResults[nga];  // should add up to almost zero
04170             localResults[i] += gridResults[nga++];
04171           }
04172         }
04173       } else if ( pairOn ) {
04174         if ( g == 0 ) {
04175           int nga = 0;
04176           for(int i=0; i<numLocalAtoms; ++i) {
04177             if ( localPartition[i] == 1 ) {
04178               pairForce += gridResults[nga];
04179             }
04180             if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
04181               localResults[i] += gridResults[nga++];
04182             }
04183           }
04184         } else if ( g == 1 ) {
04185           int nga = 0;
04186           for(int i=0; i<numLocalAtoms; ++i) {
04187             if ( localPartition[i] == g ) {
04188               pairForce -= gridResults[nga];  // should add up to almost zero
04189               localResults[i] -= gridResults[nga++];
04190             }
04191           }
04192         } else {
04193           int nga = 0;
04194           for(int i=0; i<numLocalAtoms; ++i) {
04195             if ( localPartition[i] == g ) {
04196               localResults[i] -= gridResults[nga++];
04197             }
04198          }
04199         }
04200       }
04201     }
04202     }
04203 
04204     Vector *results_ptr = localResults;
04205     
04206     // add in forces
04207     {
04208       Results *r = forceBox->open();
04209       Force *f = r->f[Results::slow];
04210       int numAtoms = patch->getNumAtoms();
04211 
04212       if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
04213         for(int i=0; i<numAtoms; ++i) {
04214           f[i].x += results_ptr->x;
04215           f[i].y += results_ptr->y;
04216           f[i].z += results_ptr->z;
04217           ++results_ptr;
04218         }
04219       }
04220       forceBox->close(&r);
04221     }
04222 
04223     if ( pairOn || selfOn ) {
04224         ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
04225     }
04226 
04227 }
04228 
04229 void ComputePmeMgr::submitReductions() {
04230 
04231     SimParameters *simParams = Node::Object()->simParameters;
04232 
04233     for ( int g=0; g<numGrids; ++g ) {
04234       float scale = 1.;
04235       if (alchOn) {
04236         BigReal elecLambdaUp, elecLambdaDown;
04237         if( simParams->alchFepWhamOn ) {
04238           if( simParams->alchFepElecOn ) {
04239             elecLambdaUp = simParams->alchElecLambda;
04240             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04241           }
04242           else {
04243             elecLambdaUp = 0.0;
04244             elecLambdaDown = 1.0;
04245           }
04246         }
04247         else {
04248           // alchLambda set on each step in ComputePme::ungridForces()
04249           if ( alchLambda < 0 || alchLambda > 1 ) {
04250             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04251           }
04252           elecLambdaUp = simParams->getElecLambda(alchLambda);
04253           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04254         }
04255         if ( g == 0 ) scale = elecLambdaUp;
04256         else if ( g == 1 ) scale = elecLambdaDown;
04257         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04258         if (alchDecouple) {
04259           if ( g == 2 ) scale = 1-elecLambdaUp;
04260           else if ( g == 3 ) scale = 1-elecLambdaDown;
04261           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04262         }
04263       } else if ( lesOn ) {
04264         scale = 1.0 / lesFactor;
04265       } else if ( pairOn ) {
04266         scale = ( g == 0 ? 1. : -1. );
04267       }
04268       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04269       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04270       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04271       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04272       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04273       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04274       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04275       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04276       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04277       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04278 
04279       float scale2 = 0.;
04280 
04281       // why is this declared/defined again here?
04282       SimParameters *simParams = Node::Object()->simParameters;
04283 
04284       if (alchFepOn) {
04285         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04286         if(simParams->alchFepWhamOn) {
04287           if(simParams->alchFepElecOn) {
04288             elecLambda2Up = simParams->alchElecLambda;
04289             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04290           }
04291           else {
04292             elecLambda2Up = 0.0;
04293             elecLambda2Down =  1.0;
04294           }
04295         }
04296         else {
04297           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04298           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04299         }
04300         
04301         if ( g == 0 ) scale2 = elecLambda2Up;
04302         else if ( g == 1 ) scale2 = elecLambda2Down;
04303         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04304         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04305         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04306         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04307       }
04308       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04309         if( g==0 )      scale2 = scale + 1.0;
04310         else if( g==1 ) scale2 = scale - 1.0;
04311         else if( g==2 ) scale2 = scale - 1.0;
04312         else if( g==3 ) scale2 = scale + 1.0;
04313       }
04314       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04315       
04316       if (alchThermIntOn) {
04317         
04318         // no decoupling:
04319         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04320         // (interactions between all atoms [partition 0 OR partition 1], 
04321         // minus all [within partition 0])
04322         // U = elecLambdaUp * (U[0] - U[2])
04323         // dU/dl = U[0] - U[2];
04324         
04325         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04326         // (interactions between all atoms [partition 0 OR partition 2], 
04327         // minus all [within partition 0])
04328         // U = elecLambdaDown * (U[1] - U[2])
04329         // dU/dl = U[1] - U[2];
04330 
04331         // alchDecouple:
04332         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04333         // (interactions between all atoms [partition 0 OR partition 1]
04334         // minus all [within partition 1] minus all [within partition 0]
04335         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04336         // dU/dl = U[0] - U[2] - U[4];
04337 
04338         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04339         // (interactions between all atoms [partition 0 OR partition 2]
04340         // minus all [within partition 2] minus all [within partition 0]
04341         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04342         // dU/dl = U[1] - U[3] - U[4];
04343         
04344         
04345         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04346         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04347         if (!alchDecouple) {
04348           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04349           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04350         }
04351         else {  // alchDecouple
04352           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04353           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04354           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04355           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04356         }
04357       }
04358     }
04359 
04360     alchLambda = -1.;  // illegal value to catch if not updated
04361 
04362     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04363     reduction->submit();
04364 
04365   for ( int i=0; i<heldComputes.size(); ++i ) {
04366     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04367   }
04368   heldComputes.resize(0);
04369 }
04370 
04371 #if USE_TOPOMAP 
04372 
#define NPRIMES 8
// Candidate prime strides used to scatter PME pes across a YZ plane.
// NOTE: the selection loop in generateBGLORBPmePeList only scans the
// first NPRIMES (8) entries; the remaining values take effect only if
// NPRIMES is raised.  (Also note 93 = 3*31 is not actually prime,
// though it lies beyond the scanned range.)
const static unsigned int NAMDPrimes[] = {
  3,
  5,
  7,
  11,
  13,
  17,
  19,
  23,  
  29,
  31,
  37,
  59,
  73,
  93,
  113,
  157,
  307,
  617,
  1217                  //This should be enough for 64K nodes of BGL. 
};
04395 
04396 #include "RecBisection.h"
04397 
04398 /***-----------------------------------------------------**********
04399     The Orthogonal Recursive Bisection strategy, which allocates PME
04400     objects close to the patches they communicate, and at the same
04401     time spreads them around the grid 
04402 ****----------------------------------------------------------****/
04403 
04404 bool generateBGLORBPmePeList(int *pemap, int numPes, 
04405                              int *block_pes, int nbpes) {
04406 
04407   PatchMap *pmap = PatchMap::Object();
04408   int *pmemap = new int [CkNumPes()];
04409 
04410   if (pemap == NULL)
04411     return false;
04412 
04413   TopoManager tmgr;
04414 
04415   memset(pmemap, 0, sizeof(int) * CkNumPes());
04416 
04417   for(int count = 0; count < CkNumPes(); count++) {
04418     if(count < nbpes)
04419       pmemap[block_pes[count]] = 1;
04420     
04421     if(pmap->numPatchesOnNode(count)) {
04422       pmemap[count] = 1;
04423       
04424       //Assumes an XYZT mapping !!
04425       if(tmgr.hasMultipleProcsPerNode()) {
04426         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
04427       }
04428     }
04429   }
04430 
04431   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
04432     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
04433     return false;
04434 
04435   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
04436   Node *node = nd.ckLocalBranch();
04437   SimParameters *simParams = node->simParameters;
04438 
04439   //first split PME processors into patch groups
04440 
04441   int xsize = 0, ysize = 0, zsize = 0;
04442 
04443   xsize = tmgr.getDimNX();
04444   ysize = tmgr.getDimNY();
04445   zsize = tmgr.getDimNZ();
04446   
04447   int nx = xsize, ny = ysize, nz = zsize;
04448   DimensionMap dm;
04449   
04450   dm.x = 0;
04451   dm.y = 1;
04452   dm.z = 2;
04453   
04454   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
04455 
04456   //group size processors have to be allocated to each YZ plane
04457   int group_size = numPes/nx;
04458   if(numPes % nx)
04459     group_size ++;
04460 
04461   int my_prime = NAMDPrimes[0];
04462   int density = (ny * nz)/group_size + 1;
04463   int count = 0;
04464   
04465   //Choose a suitable prime Number
04466   for(count = 0; count < NPRIMES; count ++) {
04467     //Find a prime just greater than the density
04468     if(density < NAMDPrimes[count]) {
04469       my_prime = NAMDPrimes[count];
04470       break;
04471     }      
04472   }
04473   
04474   if(count == NPRIMES)
04475     my_prime = NAMDPrimes[NPRIMES-1];
04476 
04477   //int gcount = numPes/2;
04478   int gcount = 0;
04479   int npme_pes = 0;
04480   
04481   int coord[3];
04482 
04483   for(int x = 0; x < nx; x++) {
04484     coord[0] = (x + nx/2)%nx;
04485     
04486     for(count=0; count < group_size && npme_pes < numPes; count++) {
04487       int dest = (count + 1) * my_prime;      
04488       dest = dest % (ny * nz);      
04489       
04490       coord[2] = dest / ny;
04491       coord[1] = dest - coord[2] * ny;
04492       
04493       //Locate where in the actual grid the processor is
04494       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
04495         coord[dm.z] * xsize* ysize;
04496       
04497       if(pmemap[destPe] == 0) {
04498         pemap[gcount++] = destPe;
04499         pmemap[destPe] = 1;
04500         
04501         if(tmgr.hasMultipleProcsPerNode())
04502           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
04503 
04504         npme_pes ++;
04505       }
04506       else {
04507         for(int pos = 1; pos < ny * nz; pos++) {
04508           
04509           coord[2] += pos / ny;
04510           coord[1] += pos % ny;
04511           
04512           coord[2] = coord[2] % nz;
04513           coord[1] = coord[1] % ny;       
04514           
04515           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
04516             coord[dm.z] * xsize * ysize;
04517           
04518           if(pmemap[newdest] == 0) {
04519             pemap[gcount++] = newdest;
04520             pmemap[newdest] = 1;
04521             
04522             if(tmgr.hasMultipleProcsPerNode())
04523               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
04524             
04525             npme_pes ++;
04526             break;
04527           }
04528         }
04529       }      
04530     }   
04531     
04532     if(gcount == numPes)
04533       gcount = 0;    
04534     
04535     if(npme_pes >= numPes)
04536       break;
04537   }
04538   
04539   delete [] pmemap;
04540   
04541   if(npme_pes != numPes)
04542     //NAMD_bug("ORB PME allocator failed\n");
04543     return false;
04544 
04545   return true;
04546 }
04547 
04548 #endif
04549 
04550 template <class T> class PmePencil : public T {
04551 public:
04552   PmePencil() {
04553     data = 0;
04554     work = 0;
04555     send_order = 0;
04556     needs_reply = 0;
04557 #if USE_PERSISTENT
04558     trans_handle = untrans_handle = ungrid_handle = NULL;
04559 #endif
04560   }
04561   ~PmePencil() {
04562 #ifdef NAMD_FFTW
04563     fftwf_free(data);
04564 #endif
04565     delete [] work;
04566     delete [] send_order;
04567     delete [] needs_reply;
04568   }
04569   void base_init(PmePencilInitMsg *msg) {
04570     imsg=0;
04571     imsgb=0;
04572     hasData=0;
04573     initdata = msg->data;
04574   }
04575   void order_init(int nBlocks) {
04576     send_order = new int[nBlocks];
04577     for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
04578     if ( Node::Object()->simParameters->PMESendOrder ) {
04579       std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
04580     } else {
04581       Random rand(CkMyPe());
04582       rand.reorder(send_order,nBlocks);
04583     }
04584     needs_reply = new int[nBlocks];
04585     offload = Node::Object()->simParameters->PMEOffload;
04586   }
04587   PmePencilInitMsgData initdata;
04588   Lattice lattice;
04589   PmeReduction evir;
04590   int sequence;  // used for priorities
04591   int imsg;  // used in sdag code
04592   int imsgb;  // Node par uses distinct counter for back path
04593   int hasData;  // used in message elimination
04594   int offload;
04595   float *data;
04596   float *work;
04597   int *send_order;
04598   int *needs_reply;
04599 #if USE_PERSISTENT
04600   PersistentHandle *trans_handle;
04601   PersistentHandle *untrans_handle;
04602   PersistentHandle *ungrid_handle;
04603 #endif
04604 };
04605 
// Z-direction pencil: receives charge grid data from patches, performs
// the real-to-complex FFT along z, and exchanges transposed data with
// the Y pencils.
class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
public:
    PmeZPencil_SDAG_CODE
    PmeZPencil() { __sdag_init(); setMigratable(false); }
    // Migration constructor; also resets the sdag message counters.
    PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); imsg=imsgb=0;}
        ~PmeZPencil() {
        // Only the per-CkLoop plan arrays are freed here; the plans
        // themselves and the data buffer are handled elsewhere
        // (PmePencil's destructor frees data/work).
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();
    void recv_grid(const PmeGridMsg *);
    void forward_fft();
    void send_trans();
        void send_subset_trans(int fromIdx, int toIdx);
    void recv_untrans(const PmeUntransMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void node_process_grid(PmeGridMsg *);
    void backward_fft();
        void send_ungrid(PmeGridMsg *);
        void send_all_ungrid();
        void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
private:
    ResizeArray<PmeGridMsg *> grid_msgs;  // grid messages held for the ungrid reply path
    ResizeArray<int> work_zlist;
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        //for ckloop usage
        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    rfftwnd_plan forward_plan, backward_plan;
#endif
#endif

    int nx, ny;  // extent of this pencil's slab in x and y
#if USE_PERSISTENT
    // Pre-create compressed persistent channels to the Y-pencil peers
    // for the forward transpose (one per z block, in send order).
    void setup_persistent() {
      int hd = 1;// ( hasData ? 1 : 0 );
      int zBlocks = initdata.zBlocks;
      int block3 = initdata.grid.block3;
      int dim3 = initdata.grid.dim3;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      CmiAssert(yPencil_local);
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
      for ( int isend=0; isend<zBlocks; ++isend ) {
          int kb = send_order[isend];
          int nz1 = block3;
          // last block may be short of a full block3 of complex values
          if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
          int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
          int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
    
    // NOTE(review): handle creation below is commented out, so
    // ungrid_handle is allocated but its entries are never initialized.
    void setup_ungrid_persistent() 
    {
       ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
       for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
           int peer = grid_msgs[imsg]->sourceNode;
           //ungrid_handle[imsg] = CmiCreatePersistent(peer, 0); 
       }
    }
#endif
};
04678 
// Y-direction pencil: middle stage of the 3D FFT.  Receives transposed
// data from Z pencils, transforms along y, forwards to X pencils, and
// relays the backward pass in the opposite direction.
class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
public:
    PmeYPencil_SDAG_CODE
    PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
    PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
    void fft_init();
    void recv_trans(const PmeTransMsg *);
    void forward_fft();
        void forward_subset_fft(int fromIdx, int toIdx);
    void send_trans();
        void send_subset_trans(int fromIdx, int toIdx);
    void recv_untrans(const PmeUntransMsg *);    
    void node_process_trans(PmeTransMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void backward_fft();
        void backward_subset_fft(int fromIdx, int toIdx);
    void send_untrans();
    void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
private:
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif

    int nx, nz;  // extent of this pencil's slab in x and z
#if USE_PERSISTENT
    // Pre-create compressed persistent channels: forward transposes to
    // the X-pencil peers and backward untransposes to the Z-pencil
    // peers, one handle per y block in send order.
    void setup_persistent() {
      int yBlocks = initdata.yBlocks;
      int block2 = initdata.grid.block2;
      int K2 = initdata.grid.K2;
      int hd = 1;
      CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          // last block may be short of a full block2
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
          int compress_size = sizeof(float)*hd*nx*ny1*nz*2; 
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }

      CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
          int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx*ny1*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,  compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif
};
04741 
// X-direction pencil: final FFT stage.  Transforms along x, performs
// the reciprocal-space (k-space) computation, then starts the backward
// pass toward the Y pencils.
class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
public:
    PmeXPencil_SDAG_CODE
    PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
    PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
        ~PmeXPencil() {
        // Frees only the per-CkLoop plan arrays (see PmeZPencil).
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();
    void recv_trans(const PmeTransMsg *);
    void forward_fft();
    void pme_kspace();
    void backward_fft();
    void send_untrans();
        void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
    void node_process_trans(PmeTransMsg *);
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif
    int ny, nz;  // extent of this pencil's slab in y and z
    int recipEvirPe;  // pe that accumulates reciprocal energy/virial (-999 until evir_init)
    void evir_init();
    PmeKSpace *myKSpace;
#if USE_PERSISTENT
    // Pre-create compressed persistent channels to the Y-pencil peers
    // for the backward untranspose, one per x block in send order.
    // NOTE(review): uses procNum() here where the Y/Z pencil versions
    // use homePe() -- confirm this is intentional.
    void  setup_persistent() {
      int xBlocks = initdata.xBlocks;
      int block1 = initdata.grid.block1;
      int K1 = initdata.grid.K1;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
      for ( int isend=0; isend<xBlocks; ++isend ) {
          int ib = send_order[isend];
          int nx1 = block1;
          // last block may be short of a full block1
          if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
          int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
          int size = sizeof(PmeUntransMsg) +
              sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24; 
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx1*ny*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif

};
04799 
04800 void PmeXPencil::evir_init() {
04801   recipEvirPe = findRecipEvirPe();
04802   initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
04803 }
04804 
// Allocate this Z pencil's data/work buffers and build the FFTW
// real<->complex plans along z.  Plan creation is serialized with a
// node-level lock because FFTW planning is not thread-safe.
void PmeZPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // register so the node-level manager can route messages to this object
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block2 = initdata.grid.block2;

  // slab extents; edge pencils may get a partial block
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
  work = new float[dim3];

  order_init(initdata.zBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the how many */

  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  int sizeLines=nx*ny;
  int planLineSizes[1];
  planLineSizes[0]=K3;
  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
  int ndimHalf=ndim/2;
  // in-place r2c/c2r transforms over all nx*ny lines of length K3
  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
                                         (float *) data, NULL, 1, 
                                         ndim,
                                         (fftwf_complex *) data, NULL, 1,
                                         ndimHalf,
                                         fftwFlags);

  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
                                          (fftwf_complex *) data, NULL, 1, 
                                          ndimHalf,
                                          (float *) data, NULL, 1, 
                                          ndim,
                                          fftwFlags);
#if     CMK_SMP && USE_CKLOOP
  if(simParams->useCkLoop) {
          //How many FFT plans to be created? The grain-size issue!!.
          //Currently, I am choosing the min(nx, ny) to be coarse-grain
          numPlans = (nx<=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
          int howmany = sizeLines/numPlans;
          // one sub-plan per CkLoop chunk, each covering `howmany` lines
          forward_plans = new fftwf_plan[numPlans];
          backward_plans = new fftwf_plan[numPlans];
          for(int i=0; i<numPlans; i++) {
                  int dimStride = i*ndim*howmany;
                  int dimHalfStride = i*ndimHalf*howmany;
                  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
                                                             ((float *)data)+dimStride, NULL, 1,
                                                             ndim,
                                                             ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                             ndimHalf,
                                                             fftwFlags);

                  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
                                                             ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                             ndimHalf,
                                                             ((float *)data)+dimStride, NULL, 1,
                                                             ndim,
                                                             fftwFlags);
          }
  }else 
#endif 
  {
          forward_plans = NULL;
          backward_plans = NULL;
  }
#else
  // FFTW 2 path: one-dimensional real FFT plans of length K3
  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
    // node-par mode accumulates directly into data, so it must start zeroed
    evir = 0.;
    memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
}
04907 
// Allocate this Y pencil's data/work buffers and build the FFTW
// complex-to-complex plans along y.  Plan creation is serialized with
// the shared node-level lock.
void PmeYPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // register so the node-level manager can route messages to this object
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim2 = initdata.grid.dim2;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block3 = initdata.grid.block3;

  // slab extents; edge pencils may get a partial block
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
  work = new float[2*K2];

  order_init(initdata.yBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the dimensions */
  /* ideally this should be implementable as a single multidimensional
   *  plan, but that has proven tricky to implement, so we maintain the
   *  loop of 1d plan executions. */
  int sizeLines=nz;
  int planLineSizes[1];
  planLineSizes[0]=K2;
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  // in-place strided c2c transforms; each x slice reuses these plans
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_FORWARD, 
                                     fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_BACKWARD, 
                                      fftwFlags);
  CkAssert(forward_plan != NULL);
  CkAssert(backward_plan != NULL);
#else
  // FFTW 2 path: 1D complex plans of length K2 with stride nz
  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  evir = 0;
  CmiMemoryWriteFence();
#endif
}
04978 
// Node-parallel receive path for forward (trans) messages arriving at
// this Y pencil.  May be invoked concurrently from several PEs of the
// node; an atomic counter determines which caller observed the final
// of the initdata.yBlocks messages and therefore runs the FFT/forward.
void PmeYPencil::node_process_trans(PmeTransMsg *msg)
{
  if ( msg->hasData ) hasData = 1;
  needs_reply[msg->sourceNode] = msg->hasData;
  recv_trans(msg);
  int limsg;
  // atomically claim a slot; limsg receives the pre-increment value
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      // this caller saw the last message: do the forward work
      if ( hasData ) {
        forward_fft();
      }
      send_trans();
      if( ! hasData)
        {
          send_untrans(); //todo, what is up with the recvAck in SDAG version?
        }
      imsg=0;  // reset counter for the next round
      CmiMemoryWriteFence();
    }
}
05000 
// Node-parallel receive path for backward (untrans) messages.  Uses the
// separate imsgb counter; the caller that observes the final message
// runs the backward FFT and forwards toward the Z pencils.
void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
{
  recv_untrans(msg);
  int limsg;
  // atomically claim a slot; limsg receives the pre-increment value
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      backward_fft();
      send_untrans();
      imsgb=0;  // reset counter for the next round
      CmiMemoryWriteFence();
    }
}
05014 
05015 #define DEBUG_NODE_PAR_RECV 0
05016 
05017 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
05018   //  CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05019   PmeXPencil *target=xPencilObj.get(msg->destElem);
05020 #if DEBUG_NODE_PAR_RECV
05021   if(target == NULL)
05022     CkAbort("xpencil in recvXTrans not found, debug registeration");
05023 #endif  
05024     target->node_process_trans(msg);
05025   delete msg;
05026 }
05027 
05028 
05029 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
05030   //  CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05031   PmeYPencil *target=yPencilObj.get(msg->destElem);
05032 #if DEBUG_NODE_PAR_RECV
05033   if(target == NULL)
05034     CkAbort("ypencil in recvYTrans not found, debug registeration");
05035 #endif  
05036     target->node_process_trans(msg);
05037   delete msg;
05038  }
05039 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
05040   //  CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05041   PmeYPencil *target=yPencilObj.get(msg->destElem);
05042 #if DEBUG_NODE_PAR_RECV  
05043   if(target == NULL)
05044     CkAbort("ypencil in recvYUntrans not found, debug registeration");
05045 #endif  
05046     target->node_process_untrans(msg);
05047   delete msg;
05048  }
05049 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
05050   //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05051   PmeZPencil *target=zPencilObj.get(msg->destElem);
05052 #if DEBUG_NODE_PAR_RECV
05053   if(target == NULL)
05054     CkAbort("zpencil in recvZUntrans not found, debug registeration");
05055 #endif
05056   target->node_process_untrans(msg);
05057   delete msg;
05058 }
05059 
05060 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
05061   //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05062   PmeZPencil *target=zPencilObj.get(msg->destElem);
05063 #if DEBUG_NODE_PAR_RECV
05064   if(target == NULL){
05065     CkAbort("zpencil in recvZGrid not found, debug registeration");
05066   }
05067 #endif
05068   target->node_process_grid(msg); //msg is stored inside node_proces_grid
05069 }
05070 
// One-time FFT setup for this X pencil: registers with the node-level PME
// manager (node-parallel-receive builds), computes this pencil's share of
// the grid along y and z (ny, nz), allocates the data/work buffers, and
// creates FFTW plans for in-place 1-D complex transforms of length K1.
void PmeXPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;
#if USE_NODE_PAR_RECEIVE
  // let the node-level manager route incoming messages to this pencil
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim3 = initdata.grid.dim3;
  int block2 = initdata.grid.block2;
  int block3 = initdata.grid.block3;

  // trailing pencils may own less than a full block
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
  nz = block3;
  // z holds dim3/2 complex values (data is complex along z at this stage)
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

#ifdef NAMD_FFTW
  // FFTW plan creation is not thread-safe; serialize it node-wide
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);  // *2: complex
  work = new float[2*K1];

  order_init(initdata.xBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the how many */
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  int sizeLines=ny*nz;              // number of length-K1 lines, strided by sizeLines
  int planLineSizes[1];
  planLineSizes[0]=K1;
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                   FFTW_FORWARD,
                                     fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                          FFTW_BACKWARD,
                                      fftwFlags);

#if     CMK_SMP && USE_CKLOOP
  if(simParams->useCkLoop) {
          //How many FFT plans to be created? The grain-size issue!!.
          //Currently, I am choosing the min(ny, nz) to be coarse-grain
          numPlans = (ny<=nz?ny:nz);
          // limit attempted parallelism due to false sharing
          //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
          //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
          if ( sizeLines/numPlans < 4 ) numPlans = 1;
          int howmany = sizeLines/numPlans;
          // one sub-plan per CkLoop task, each covering 'howmany' lines
          forward_plans = new fftwf_plan[numPlans];
          backward_plans = new fftwf_plan[numPlans];
          for(int i=0; i<numPlans; i++) {
                  int curStride = i*howmany;              
                  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                        FFTW_FORWARD,
                                                                                                         fftwFlags);

                  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                         ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
                                                                                                          FFTW_BACKWARD,
                                                                                                         fftwFlags);
          }
  }else
#endif
  {
          forward_plans = NULL;
          backward_plans = NULL;
  }
#else
  // FFTW-2 path: specific plans using the separate 'work' scratch buffer
  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        ny*nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        ny*nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  // k-space evaluator for this pencil's [y,z) sub-range of the grid
  myKSpace = new PmeKSpace(initdata.grid,
                thisIndex.y*block2, thisIndex.y*block2 + ny,
                thisIndex.z*block3, thisIndex.z*block3 + nz);

}
05167 
05168 // #define FFTCHECK   // run a grid of integers through the fft
05169 // #define ZEROCHECK  // check for suspicious zeros in fft
05170 
// Accumulate one sender's charge-grid contribution into this Z pencil.
// The first message of a round (imsg == 0) latches lattice/sequence and,
// when not using node-parallel receive, clears the local grid.  Messages
// with hasData carry, for each flagged (x,y) column, qgrid values at the
// z indices listed in zlist.
void PmeZPencil::recv_grid(const PmeGridMsg *msg) {

  int dim3 = initdata.grid.dim3;
  if ( imsg == 0 ) {
    lattice = msg->lattice;
    sequence = msg->sequence;
#if ! USE_NODE_PAR_RECEIVE
    memset(data, 0, sizeof(float)*nx*ny*dim3);
#endif
  }

  if ( ! msg->hasData ) return;

  int zlistlen = msg->zlistlen;
#ifdef NAMD_KNL
  // copy z-index list into 64-byte-aligned scratch to help vectorization
  int * __restrict msg_zlist = msg->zlist;
  int * __restrict zlist = work_zlist.begin();
  __assume_aligned(zlist,64);
  for ( int k=0; k<zlistlen; ++k ) {
    zlist[k] = msg_zlist[k];
  }
#else
  int * __restrict zlist = msg->zlist;
#endif
  char * __restrict fmsg = msg->fgrid;   // per-column flag: column present in qgrid?
  float * __restrict qmsg = msg->qgrid;  // packed values for flagged columns only
  float * __restrict d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      if( *(fmsg++) ) {
        #pragma ivdep
        for ( int k=0; k<zlistlen; ++k ) {
          d[zlist[k]] += *(qmsg++);   // scatter-add along z
        }
      }
     }
    }
  }
}
05212 
// CkLoop worker shared by the X and Z pencils: executes the FFTW-3 plans
// with indices in [first,last] from the plan array passed through 'param'.
// 'result' and 'paraNum' are part of the CkLoop signature but unused here.
static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan *planArray = (fftwf_plan *)param;
    int idx = first;
    while ( idx <= last ) {
        fftwf_execute(planArray[idx]);
        ++idx;
    }
#endif
#endif
}
05221 
05222 void PmeZPencil::forward_fft() {
05223   evir = 0.;
05224 #ifdef FFTCHECK
05225   int dim3 = initdata.grid.dim3;
05226   int K3 = initdata.grid.K3;
05227   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05228   float *d = data;
05229   for ( int i=0; i<nx; ++i ) {
05230    for ( int j=0; j<ny; ++j, d += dim3 ) {
05231     for ( int k=0; k<dim3; ++k ) {
05232       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
05233     }
05234    }
05235   }
05236 #endif
05237 #ifdef NAMD_FFTW
05238 #ifdef MANUAL_DEBUG_FFTW3
05239   dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05240 #endif
05241 #ifdef NAMD_FFTW_3
05242 #if     CMK_SMP && USE_CKLOOP
05243   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05244   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
05245      && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
05246           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
05247           //transform the above loop
05248           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
05249           return;
05250   }
05251 #endif
05252   fftwf_execute(forward_plan);
05253 #else
05254   rfftwnd_real_to_complex(forward_plan, nx*ny,
05255         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
05256 #endif
05257 #ifdef MANUAL_DEBUG_FFTW3
05258   dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05259 #endif
05260 
05261 #endif
05262 #ifdef ZEROCHECK
05263   int dim3 = initdata.grid.dim3;
05264   int K3 = initdata.grid.K3;
05265   float *d = data;
05266   for ( int i=0; i<nx; ++i ) {
05267    for ( int j=0; j<ny; ++j, d += dim3 ) {
05268     for ( int k=0; k<dim3; ++k ) {
05269       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
05270         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
05271     }
05272    }
05273   }
05274 #endif
05275 }
05276 
05277 /* A single task for partitioned PmeZPencil::send_trans work */
05278 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05279         PmeZPencil *zpencil = (PmeZPencil *)param;
05280         zpencil->send_subset_trans(first, last);        
05281 }
05282 
// Pack and send PmeTransMsg transpose messages for z-block send slots
// [fromIdx,toIdx] (a subset of what send_trans covers, used by CkLoop
// partitioning).  Each message carries this pencil's (x,y) planes for one
// destination z block; data is copied only when hasData is set.
void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
        int zBlocks = initdata.zBlocks;
        int block3 = initdata.grid.block3;
        int dim3 = initdata.grid.dim3;
        for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
          int kb = send_order[isend];     // destination z-block index
          int nz = block3;
          if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;  // last block may be short
          int hd = ( hasData ? 1 : 0 );   // payload only when we have data
          PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
          msg->lattice = lattice;
          msg->sourceNode = thisIndex.y;  // receiver identifies us by our y index
          msg->hasData = hasData;
          msg->nx = ny;                   // receiver's plane count is our ny
         if ( hasData ) {
          // pack complex pairs for z range [kb*block3, kb*block3+nz)
          float *md = msg->qgrid;
          const float *d = data;
          for ( int i=0; i<nx; ++i ) {
           for ( int j=0; j<ny; ++j, d += dim3 ) {
                for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
                  *(md++) = d[2*k];
                  *(md++) = d[2*k+1];
                }
           }
          }
         }
          msg->sequence = sequence;
          SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      // deliver via the node-level manager on the destination node
      msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
#if Y_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if Y_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif    
#else
#if Y_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if Y_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif    
#endif
    CmiEnableUrgentSend(0);
    }
}
05334 
// Send this Z pencil's transformed data to the Y pencils, one PmeTransMsg
// per destination z block in send_order.  When CkLoop is enabled with
// enough PEs, the send loop is partitioned across PmeZPencilSendTrans
// tasks; otherwise the same packing/sending happens inline below.
void PmeZPencil::send_trans() {
#if USE_PERSISTENT
    if (trans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
        int useCkLoop = Node::Object()->simParameters->useCkLoop;
        if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
           && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
                //send_subset_trans(0, initdata.zBlocks-1);
                CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
                return;
        }
#endif
  int zBlocks = initdata.zBlocks;
  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  for ( int isend=0; isend<zBlocks; ++isend ) {
    int kb = send_order[isend];     // destination z-block index
    int nz = block3;
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;  // last block may be short
    int hd = ( hasData ? 1 : 0 );   // payload only when we have data
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.y;  // receiver identifies us by our y index
    msg->hasData = hasData;
    msg->nx = ny;                   // receiver's plane count is our ny
   if ( hasData ) {
    // pack complex pairs for z range [kb*block3, kb*block3+nz)
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
        *(md++) = d[2*k];
        *(md++) = d[2*k+1];
      }
     }
    }
   }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // deliver via the node-level manager on the destination node
    msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
#if Y_PERSIST 
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif    
#else
#if Y_PERSIST 
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif    
#endif
    CmiEnableUrgentSend(0);
  }
}
05404 
// Receive one sender's transposed block into this Y pencil.  The sender's
// y-block index arrives in msg->sourceNode and its plane count in msg->nx
// (which is the sender's ny; it shadows nothing here but names differ).
// Data is stored complex-interleaved at d[2*(j*nz+k)]; if the message has
// no payload the corresponding region is zeroed instead.
void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {   // first message of the round latches metadata
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;   // sender's y-block index
  int ny = msg->nx;           // number of y planes in this message
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      d[2*(j*nz+k)] = *(md++);     // real part
      d[2*(j*nz+k)+1] = *(md++);   // imaginary part
    }
   }
  }
 } else {
  // no payload: clear the sender's region of the grid
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*(j*nz+k)] = 0;
      d[2*(j*nz+k)+1] = 0;
    }
   }
  }
 }
}
05441 
05442 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
05443         PmeYPencil *ypencil = (PmeYPencil *)param;
05444         ypencil->forward_subset_fft(first, last);
05445 }
05446 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
05447 #ifdef NAMD_FFTW
05448 #ifdef NAMD_FFTW_3
05449         for(int i=fromIdx; i<=toIdx; i++){
05450                 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
05451                       * nz * initdata.grid.K2,  
05452                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05453         }
05454 #endif
05455 #endif
05456 }
05457 
// In-place forward FFT along y for this Y pencil, one transform per x
// plane.  With CkLoop enabled and enough PEs the per-plane loop runs as
// parallel PmeYPencilForwardFFT tasks; otherwise it runs inline.  The
// FFTW-2 path uses the separate 'work' scratch buffer.
void PmeYPencil::forward_fft() {
    evir = 0.;
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
  
#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
          CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
          return;
  }
#endif
  //the above is a transformation of the following loop using CkLoop
  for ( int i=0; i<nx; ++i ) {
    fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
                      * nz * initdata.grid.K2,  
                      ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
  for ( int i=0; i<nx; ++i ) {
    fftw(forward_plan, nz,
        ((fftw_complex *) data) + i * nz * initdata.grid.K2,
        nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05493 
05494 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05495         PmeYPencil *ypencil = (PmeYPencil *)param;
05496         ypencil->send_subset_trans(first, last);
05497 }
05498 
// Pack and send PmeTransMsg transpose messages for y-block send slots
// [fromIdx,toIdx] (a subset of what send_trans covers, used by CkLoop
// partitioning).  Each message carries this pencil's data for one
// destination y block; payload is included only when hasData is set.
void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
        int yBlocks = initdata.yBlocks;
        int block2 = initdata.grid.block2;
        int K2 = initdata.grid.K2;
    for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
          int jb = send_order[isend];   // destination y-block index
          int ny = block2;
          if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // last block may be short
          int hd = ( hasData ? 1 : 0 ); // payload only when we have data
          PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
          msg->lattice = lattice;
          msg->sourceNode = thisIndex.x;  // receiver identifies us by our x index
          msg->hasData = hasData;
          msg->nx = nx;
         if ( hasData ) {
          // pack complex pairs for y range [jb*block2, jb*block2+ny)
          float *md = msg->qgrid;
          const float *d = data;
          for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
           for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
                for ( int k=0; k<nz; ++k ) {
                  *(md++) = d[2*(j*nz+k)];
                  *(md++) = d[2*(j*nz+k)+1];
  #ifdef ZEROCHECK
                  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
          thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
  #endif
                }
           }
          }
          // sanity check: packed exactly the expected number of floats
          if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
          thisIndex.x, jb, thisIndex.z);
         }
          msg->sequence = sequence;
          SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
      CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      // deliver via the node-level manager on the destination node
      msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
#if X_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
#if X_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif
#else      
#if X_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if X_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif
#endif
      CmiEnableUrgentSend(0);
        }
}
05555 
// Send this Y pencil's transformed data to the X pencils, one PmeTransMsg
// per destination y block in send_order.  When CkLoop is enabled with
// enough PEs, the send loop is partitioned across PmeYPencilSendTrans
// tasks; otherwise the same packing/sending happens inline below.
void PmeYPencil::send_trans() {
#if USE_PERSISTENT
    if (trans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
        int useCkLoop = Node::Object()->simParameters->useCkLoop;
        if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
           && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
                //send_subset_trans(0, initdata.yBlocks-1);
                CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
                return;
        }
#endif
  int yBlocks = initdata.yBlocks;
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];   // destination y-block index
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;  // last block may be short
    int hd = ( hasData ? 1 : 0 ); // payload only when we have data
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.x;  // receiver identifies us by our x index
    msg->hasData = hasData;
    msg->nx = nx;
   if ( hasData ) {
    // pack complex pairs for y range [jb*block2, jb*block2+ny)
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*(j*nz+k)];
        *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
        if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      }
     }
    }
    // sanity check: packed exactly the expected number of floats
    if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
        thisIndex.x, jb, thisIndex.z);
   }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // deliver via the node-level manager on the destination node
    msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
#if X_PERSIST 
        CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
#if X_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if X_PERSIST 
        CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if X_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
    
#endif
    CmiEnableUrgentSend(0);
  }
}
05631 
// Node-level entry point for trans messages arriving at this X pencil.
// Records whether the sender expects a reply, copies the data in, and
// counts arrivals atomically (multiple node threads may deliver here).
// When all xBlocks messages are in, runs the x FFT / k-space / inverse x
// FFT (only if some sender had data) and sends the results back.
void PmeXPencil::node_process_trans(PmeTransMsg *msg)
{
  if(msg->hasData) hasData=1;
  needs_reply[msg->sourceNode] = msg->hasData;
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);   // limsg = imsg before increment
  if(limsg+1 == initdata.xBlocks)           // this was the last expected message
    {
      if(hasData){
        forward_fft();
        pme_kspace();
        backward_fft();
      }
      send_untrans();
      imsg=0;
      CmiMemoryWriteFence();   // make the reset visible to other node threads
    }
}
05651 
// Receive one sender's transposed block into this X pencil.  The sender's
// x-block index arrives in msg->sourceNode and its plane count in msg->nx
// (a local that shadows the member nx for the duration of this call).
// Data lands complex-interleaved in x-major layout; if the message has no
// payload the corresponding region is zeroed instead.
void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {   // first message of the round latches metadata
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block1 = initdata.grid.block1;
  int K1 = initdata.grid.K1;
  int ib = msg->sourceNode;   // sender's x-block index
  int nx = msg->nx;           // number of x planes in this message (shadows member)
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
        ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      d[2*k] = *(md++);     // real part
      d[2*k+1] = *(md++);   // imaginary part
    }
   }
  }
 } else {
  // no payload: clear the sender's region of the grid
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*k] = 0;
      d[2*k+1] = 0;
    }
   }
  }
 }
}
05688 
// In-place forward FFT along x (length K1) for this X pencil.  With CkLoop
// enabled and enough PEs the transform runs as the partitioned sub-plans
// built in fft_init; otherwise as the single FFTW plan (FFTW-3) or the
// FFTW-2 batch call.
void PmeXPencil::forward_fft() {
#ifdef NAMD_FFTW

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
          //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
          //transform the above loop
          CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
          return;
  }
#endif
  fftwf_execute(forward_plan);
#else
  fftw(forward_plan, ny*nz,
        ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05718 
// Reciprocal-space evaluation for this pencil's fully-transformed data:
// scales the grid and accumulates energy into evir[0] and virial terms
// starting at evir[1] via PmeKSpace::compute_energy.
void PmeXPencil::pme_kspace() {

  evir = 0.;

#ifdef FFTCHECK
  return;   // FFT test mode: grid holds test values, skip the physics
#endif

  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;

  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    useCkLoop = 1;
  }
#endif

  int numGrids = 1;
  for ( int g=0; g<numGrids; ++g ) {
    // NOTE(review): data+0*g is just 'data' while numGrids==1; the zero
    // stride presumably becomes a real per-grid offset if more grids are
    // ever supported here -- confirm before changing.
    evir[0] = myKSpace->compute_energy(data+0*g,
                lattice, ewaldcof, &(evir[1]), useCkLoop);
  }
  
#if USE_NODE_PAR_RECEIVE
    CmiMemoryWriteFence();   // make evir visible to other node threads
#endif
}
05747 
// In-place backward FFT along x for this X pencil, mirroring forward_fft:
// CkLoop-partitioned sub-plans when enabled, otherwise the single FFTW-3
// plan or the FFTW-2 batch call.
void PmeXPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
          //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
          //transform the above loop
          CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
          return;
  }
#endif
  fftwf_execute(backward_plan);
#else
  fftw(backward_plan, ny*nz,
        ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#endif
}
05775 
05776 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
05777         int evirIdx = paraNum;
05778         PmeXPencil *xpencil = (PmeXPencil *)param;
05779         xpencil->send_subset_untrans(first, last, evirIdx);
05780 }
05781 
// Send replies for send-order slots [fromIdx,toIdx] after the backward x
// FFT.  The slot order encodes the reply type: slots before evirIdx get a
// plain PmeAckMsg, slot evirIdx gets the PmeUntransMsg in the has_evir
// position, and slots after it get ordinary PmeUntransMsg data.  This
// range may cover any mix of the three cases.
void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
        int xBlocks = initdata.xBlocks;
        int block1 = initdata.grid.block1;      
        int K1 = initdata.grid.K1;

        // partition [fromIdx,toIdx] into ack range, evir slot, untrans range
        int ackL=0, ackH=-1;
        int unL=0, unH=-1;
        int send_evir=0;
        if(fromIdx >= evirIdx+1) {
                //send PmeUntransMsg with has_evir=0
                unL = fromIdx;
                unH = toIdx;            
        } else if(toIdx <= evirIdx-1) {
                //send PmeAckMsg
                ackL=fromIdx;
                ackH=toIdx;             
        } else {
                //partially send PmeAckMsg and partially send PmeUntransMsg
                ackL=fromIdx;
                ackH=evirIdx-1;
                send_evir=1;
                unL=evirIdx+1;
                unH=toIdx;
        }

        for(int isend=ackL; isend<=ackH; isend++) {
                //send PmeAckMsg
        CmiEnableUrgentSend(1);
                int ib = send_order[isend];
                PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
                SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
                initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
        CmiEnableUrgentSend(0);
    }

    CmiEnableUrgentSend(1);
        //send PmeUntransMsg with has_evir=1
        if(send_evir) {
                int ib = send_order[evirIdx];
                int nx = block1;
                if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;  // last block may be short
                PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;              
                msg->sourceNode = thisIndex.y;
                msg->ny = ny;
                // pack the destination block's complex data in x-major order
                float *md = msg->qgrid;
                for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
                        float *d = data + i*ny*nz*2;
                        for ( int j=0; j<ny; ++j, d += nz*2 ) {
                                for ( int k=0; k<nz; ++k ) {
                                        *(md++) = d[2*k];
                                        *(md++) = d[2*k+1];
                                }
                        }
                }
                SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
#if USE_NODE_PAR_RECEIVE
        msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
        initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#else
        initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#endif
         }
    CmiEnableUrgentSend(0);
        
        //send PmeUntransMsg with has_evir=0
        for(int isend=unL; isend<=unH; isend++) {
                int ib = send_order[isend];
                int nx = block1;
                if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;  // last block may be short
                PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
                msg->sourceNode = thisIndex.y;
                msg->ny = ny;
                // pack the destination block's complex data in x-major order
                float *md = msg->qgrid;
                for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
                        float *d = data + i*ny*nz*2;
                        for ( int j=0; j<ny; ++j, d += nz*2 ) {
                                for ( int k=0; k<nz; ++k ) {
                                        *(md++) = d[2*k];
                                        *(md++) = d[2*k+1];
                                }
                        }
                }
                SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
        CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
        msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
#if Y_PERSIST 
        CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#if Y_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if Y_PERSIST 
  //      CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#if Y_PERSIST 
   //     CmiUsePersistentHandle(NULL, 0);
#endif
#endif
        CmiEnableUrgentSend(0);
        }
}
05887 
05888 void PmeXPencil::send_untrans() {
05889 
05890   { // send energy and virial
05891     int numGrids = 1;
05892     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
05893     newmsg->evir[0] = evir;
05894     SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
05895     CmiEnableUrgentSend(1);
05896     initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
05897     CmiEnableUrgentSend(0);
05898   }
05899 
05900 #if USE_PERSISTENT
05901   if (untrans_handle == NULL) setup_persistent();
05902 #endif
05903 #if     CMK_SMP && USE_CKLOOP
05904   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05905   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
05906      && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
05907                 int xBlocks = initdata.xBlocks;
05908                 int evirIdx = 0;
05909                 for ( int isend=0; isend<xBlocks; ++isend ) {
05910                         int ib = send_order[isend];
05911                         if (needs_reply[ib]) {
05912                                 evirIdx = isend;
05913                                 break;
05914                         }
05915                 }
05916 
05917                 //basically: 
05918                 //[0,evirIdx-1]->send PmeAckMsg
05919                 //evirIdx->send PmeUntransMsg with has_evir=1
05920                 //[evirIdx+1, xBlocks-1]->send PmeUntransMsg with has_evir=0
05921                 //send_subset_untrans(0, xBlocks-1, evirIdx);
05922 #if USE_NODE_PAR_RECEIVE
05923                 //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
05924                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
05925 #else
05926         //CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
05927                 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
05928 #endif        
05929                 return;
05930   }
05931 #endif
05932   int xBlocks = initdata.xBlocks;
05933   int block1 = initdata.grid.block1;
05934   int K1 = initdata.grid.K1;
05935   int send_evir = 1;
05936   for ( int isend=0; isend<xBlocks; ++isend ) {
05937     int ib = send_order[isend];
05938     if ( ! needs_reply[ib] ) {
05939       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
05940       CmiEnableUrgentSend(1);
05941       SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05942       initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
05943       CmiEnableUrgentSend(0);
05944       continue;
05945     }
05946     int nx = block1;
05947     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
05948     PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
05949     if ( send_evir ) {
05950       send_evir = 0;
05951     }
05952     msg->sourceNode = thisIndex.y;
05953     msg->ny = ny;
05954     float *md = msg->qgrid;
05955     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
05956      float *d = data + i*ny*nz*2;
05957      for ( int j=0; j<ny; ++j, d += nz*2 ) {
05958       for ( int k=0; k<nz; ++k ) {
05959         *(md++) = d[2*k];
05960         *(md++) = d[2*k+1];
05961       }
05962      }
05963     }
05964     SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
05965 
05966     CmiEnableUrgentSend(1);
05967 #if USE_NODE_PAR_RECEIVE
05968     msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
05969 #if Y_PERSIST 
05970     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05971 #endif
05972     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
05973 #if Y_PERSIST 
05974     CmiUsePersistentHandle(NULL, 0);
05975 #endif
05976 #else
05977 #if Y_PERSIST 
05978     CmiUsePersistentHandle(&untrans_handle[isend], 1);
05979 #endif
05980     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
05981 #if Y_PERSIST 
05982     CmiUsePersistentHandle(NULL, 0);
05983 #endif
05984 #endif
05985     CmiEnableUrgentSend(0);
05986   }
05987 }
05988 
// Receive one block of back-transposed data from an X pencil and scatter it
// into this Y pencil's local slab.  msg->sourceNode is the sender's
// y-block index (jb) and msg->ny the number of K2 rows carried; data is
// interleaved complex (re,im), hence the factor-of-2 strides.
05989 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
05990   int block2 = initdata.grid.block2;
05991   int K2 = initdata.grid.K2;
05992   int jb = msg->sourceNode;
05993   int ny = msg->ny;
05994   const float *md = msg->qgrid;
05995   float *d = data;
      // d advances one full (K2 x nz) complex plane per x index.
05996   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
05997 #if CMK_BLUEGENEL
05998     CmiNetworkProgress();
05999 #endif   
      // Only the [jb*block2, jb*block2+ny) row window belongs to this sender.
06000     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06001       for ( int k=0; k<nz; ++k ) {
06002 #ifdef ZEROCHECK
        // Debug build: a zero coefficient here usually indicates a lost
        // or misrouted message; print its full coordinates.
06003         if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
06004                                     thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
06005 #endif
06006         d[2*(j*nz+k)] = *(md++);
06007         d[2*(j*nz+k)+1] = *(md++);
06008       }
06009     }
06010   }
06011 }
06012 
06013 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
06014         PmeYPencil *ypencil = (PmeYPencil *)param;
06015         ypencil->backward_subset_fft(first, last);
06016 }
06017 
// Execute the backward FFT in place for x planes [fromIdx,toIdx] of this
// Y pencil.  Each plane is nz*K2 complex values; the same pointer is
// passed as FFTW input and output (in-place transform).  Only compiled
// for FFTW3 builds; a no-op otherwise.  Called directly and as the
// CkLoop chunk body via PmeYPencilBackwardFFT.
06018 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
06019 #ifdef NAMD_FFTW
06020 #ifdef NAMD_FFTW_3
06021         for(int i=fromIdx; i<=toIdx; i++){
          // fftwf_execute_dft allows reusing one plan on different
          // addresses as long as alignment/layout match the plan.
06022                 fftwf_execute_dft(backward_plan,        
06023                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2,         
06024                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
06025         }
06026 #endif
06027 #endif
06028 }
06029 
// Backward FFT over the full Y pencil (all nx planes).  Three compiled
// variants: (1) FFTW3 + CkLoop: parallelize the per-plane loop across
// the SMP node when enabled and enough PEs exist; (2) plain FFTW3:
// sequential per-plane in-place transforms; (3) FFTW2 fallback using
// the strided fftw() interface with a scratch 'work' buffer.
06030 void PmeYPencil::backward_fft() {
06031 #ifdef NAMD_FFTW
06032 #ifdef MANUAL_DEBUG_FFTW3
06033   dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
06034 #endif
06035 
06036 #ifdef NAMD_FFTW_3
06037 #if     CMK_SMP && USE_CKLOOP
06038   int useCkLoop = Node::Object()->simParameters->useCkLoop;
06039   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
06040      && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
          // Synchronous CkLoop: all chunks complete before returning.
06041           CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
06042           return;
06043   }
06044 #endif
06045   //the above is a transformation of the following loop using CkLoop
06046   for ( int i=0; i<nx; ++i ) {
06047 #if CMK_BLUEGENEL
06048         CmiNetworkProgress();
06049 #endif
      // In-place transform of plane i (nz * K2 complex values).
06050     fftwf_execute_dft(backward_plan,    
06051                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
06052                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
06053   }
06054 #else
      // FFTW2 path: strided transform with separate work buffer.
06055         for ( int i=0; i<nx; ++i ) {
06056 #if CMK_BLUEGENEL
06057           CmiNetworkProgress();
06058 #endif
06059                 fftw(backward_plan, nz,
06060                 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
06061                 nz, 1, (fftw_complex *) work, 1, 0);
06062         }
06063 #endif
06064 
06065 #ifdef MANUAL_DEBUG_FFTW3
06066   dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
06067 #endif
06068 
06069 #endif
06070 }
06071 
06072 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
06073         int evirIdx = paraNum;
06074         PmeYPencil *ypencil = (PmeYPencil *)param;
06075         ypencil->send_subset_untrans(first, last, evirIdx);
06076 }
06077 
// Send this Y pencil's back-transposed data to Z pencils for the
// destination slots [fromIdx,toIdx] of send_order[].  The slots are
// pre-sorted so that [0,evirIdx-1] need only a PmeAckMsg, slot evirIdx
// sends the PmeUntransMsg that carries the energy/virial flag, and
// [evirIdx+1,...] send plain PmeUntransMsg.  Called directly from
// send_untrans() and as the CkLoop chunk body via PmeYPencilSendUntrans,
// so it must be safe to run on a sub-range.
06078 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
06079         int yBlocks = initdata.yBlocks;
06080         int block2 = initdata.grid.block2;      
06081         int K2 = initdata.grid.K2;
06082 
        // Partition [fromIdx,toIdx] into an ack range and an untrans
        // range relative to evirIdx; empty ranges stay [0,-1].
06083         int ackL=0, ackH=-1;
06084         int unL=0, unH=-1;
06085         int send_evir=0;
06086         if(fromIdx >= evirIdx+1) {
06087                 //send PmeUntransMsg with has_evir=0
06088                 unL = fromIdx;
06089                 unH = toIdx;            
06090         } else if(toIdx <= evirIdx-1) {
06091                 //send PmeAckMsg
06092                 ackL=fromIdx;
06093                 ackH=toIdx;             
06094         } else {
06095                 //partially send PmeAckMsg and partially send PmeUntransMsg
06096                 ackL=fromIdx;
06097                 ackH=evirIdx-1;
06098                 send_evir=1;
06099                 unL=evirIdx+1;
06100                 unH=toIdx;
06101         }
06102 
06103         for(int isend=ackL; isend<=ackH; isend++) {
06104                 //send PmeAckMsg
06105         CmiEnableUrgentSend(1);
06106                 int jb = send_order[isend];
06107                 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
06108                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06109                 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
06110         CmiEnableUrgentSend(0);
06111         }
06112 
06113     CmiEnableUrgentSend(1);
06114         //send PmeUntransMsg with has_evir=1
06115         if(send_evir) {
06116                 int jb = send_order[evirIdx];
06117                 int ny = block2;
              // Last block may be short when K2 is not a multiple of block2.
06118                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06119                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;              
06120                 msg->sourceNode = thisIndex.z;
              // NOTE: the 'ny' message field carries this pencil's nz;
              // PmeZPencil::recv_untrans reads it back as its nz count.
06121                 msg->ny = nz;
06122                 float *md = msg->qgrid;
06123                 const float *d = data;
06124                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06125                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06126                                 for ( int k=0; k<nz; ++k ) {
06127                                         *(md++) = d[2*(j*nz+k)];
06128                                         *(md++) = d[2*(j*nz+k)+1];
06129                                 }
06130                         }
06131                 }
06132                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06133 #if USE_NODE_PAR_RECEIVE
        // Node-level delivery: route to the node that owns the target
        // pencil element instead of the chare array element directly.
06134         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06135     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06136         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06137 #else
06138         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06139 #endif
06140         }
06141 
06142     CmiEnableUrgentSend(0);
06143         //send PmeUntransMsg with has_evir=0
06144         for(int isend=unL; isend<=unH; isend++) {
06145                 int jb = send_order[isend];
06146                 int ny = block2;
06147                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06148                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
06149                 msg->sourceNode = thisIndex.z;
06150                 msg->ny = nz;
06151                 float *md = msg->qgrid;
06152                 const float *d = data;
06153                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06154                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06155                                 for ( int k=0; k<nz; ++k ) {
06156                                         *(md++) = d[2*(j*nz+k)];
06157                                         *(md++) = d[2*(j*nz+k)+1];
06158                                 }
06159                         }
06160                 }
06161                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06162             CmiEnableUrgentSend(1);
06163 #if USE_NODE_PAR_RECEIVE
06164         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06165         //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06166 #if Z_PERSIST 
        // Persistent-channel send: reuse the pre-set-up handle for this slot.
06167         CmiUsePersistentHandle(&untrans_handle[isend], 1);
06168 #endif
06169         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06170 #if Z_PERSIST 
06171         CmiUsePersistentHandle(NULL, 0);
06172 #endif
06173 #else
06174 #if Z_PERSIST 
06175         CmiUsePersistentHandle(&untrans_handle[isend], 1);
06176 #endif
06177         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06178 #if Z_PERSIST 
06179         CmiUsePersistentHandle(NULL, 0);
06180 #endif
06181 #endif
06182     CmiEnableUrgentSend(0);
06183         }
06184 }
06185 
// Send this Y pencil's back-transposed data to all Z pencils.  With
// CkLoop enabled (SMP build, enough PEs) the per-destination loop is
// delegated to send_subset_untrans via PmeYPencilSendUntrans after
// locating evirIdx, the first send_order slot whose destination needs
// data.  Otherwise the loop below sends a PmeAckMsg to destinations
// with no data pending and a PmeUntransMsg (copied row window of the
// local slab) to the rest.
06186 void PmeYPencil::send_untrans() {
06187 #if USE_PERSISTENT
  // Lazily create persistent comm channels on first use.
06188   if (untrans_handle == NULL) setup_persistent();
06189 #endif
06190 #if     CMK_SMP && USE_CKLOOP
06191   int useCkLoop = Node::Object()->simParameters->useCkLoop;
06192   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
06193      && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
06194           int yBlocks = initdata.yBlocks;
06195           int evirIdx = 0;
06196           for ( int isend=0; isend<yBlocks; ++isend ) {
06197                   int jb = send_order[isend];
06198                   if (needs_reply[jb]) {
06199                           evirIdx = isend;
06200                           break;
06201                   }
06202           }
06203 
06204           //basically: 
06205           //[0,evirIdx-1]->send PmeAckMsg
06206           //evirIdx->send PmeUntransMsg with has_evir=1
06207           //[evirIdx+1, yBlocks-1]->send PmeUntransMsg with has_evir=0
06208           //send_subset_untrans(0, yBlocks-1, evirIdx);
06209 #if USE_NODE_PAR_RECEIVE      
06210           //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
06211           CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
      // Safe to clear only after the synchronous CkLoop completes.
06212       evir = 0.;
06213       CmiMemoryWriteFence();
06214 #else
06215       //CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
06216           CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
06217 #endif
06218           return;
06219   }
06220 #endif
06221   int yBlocks = initdata.yBlocks;
06222   int block2 = initdata.grid.block2;
06223   int K2 = initdata.grid.K2;
06224   int send_evir = 1;
06225   for ( int isend=0; isend<yBlocks; ++isend ) {
06226     int jb = send_order[isend];
06227     if ( ! needs_reply[jb] ) {
06228       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
06229       CmiEnableUrgentSend(1);
06230       SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06231       initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
06232       CmiEnableUrgentSend(0);
06233       continue;
06234     }
06235     int ny = block2;
    // Last block may be short when K2 is not a multiple of block2.
06236     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06237     PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
    // NOTE(review): this flag flip is all that remains of the old
    // evir-attachment logic; it appears vestigial here — confirm before
    // removing.
06238     if ( send_evir ) {
06239       send_evir = 0;
06240     }
06241     msg->sourceNode = thisIndex.z;
    // NOTE: the 'ny' field carries this pencil's nz; the Z pencil's
    // recv_untrans reads it back as its nz count.
06242     msg->ny = nz;
06243     float *md = msg->qgrid;
06244     const float *d = data;
06245     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06246      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06247       for ( int k=0; k<nz; ++k ) {
06248         *(md++) = d[2*(j*nz+k)];
06249         *(md++) = d[2*(j*nz+k)+1];
06250       }
06251      }
06252     }
06253     SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06254 
06255     CmiEnableUrgentSend(1);
06256 #if USE_NODE_PAR_RECEIVE
06257     msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06258     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06259 #if Z_PERSIST 
06260     CmiUsePersistentHandle(&untrans_handle[isend], 1);
06261 #endif
06262     initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06263 #if Z_PERSIST
06264     CmiUsePersistentHandle(NULL, 0);
06265 #endif
06266 #else
06267 #if Z_PERSIST 
06268     CmiUsePersistentHandle(&untrans_handle[isend], 1);
06269 #endif
06270     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06271 #if Z_PERSIST 
06272     CmiUsePersistentHandle(NULL, 0);
06273 #endif
06274 #endif    
06275     CmiEnableUrgentSend(0);
06276   }
06277   
06278 #if USE_NODE_PAR_RECEIVE
06279   evir = 0.;
06280   CmiMemoryWriteFence();
06281 #endif
06282 }
06283 
// Receive one block of back-transposed data from a Y pencil and scatter
// it into this Z pencil's local slab.  msg->sourceNode is the sender's
// z-block index (kb); msg->ny carries the sender's nz count (see the
// matching msg->ny = nz in PmeYPencil::send_untrans).  Data is
// interleaved complex, written into the [kb*block3, kb*block3+nz)
// column window of each dim3-strided row.
06284 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
06285 #if ! USE_NODE_PAR_RECEIVE
    // Without node-level receives, reset the accumulated energy/virial
    // at the start of each round (first message of the set).
06286     if(imsg==0) evir=0.;
06287 #endif
06288 
06289   int block3 = initdata.grid.block3;
06290   int dim3 = initdata.grid.dim3;
06291   int kb = msg->sourceNode;
06292   int nz = msg->ny;
06293   const float *md = msg->qgrid;
06294   float *d = data;
06295   for ( int i=0; i<nx; ++i ) {
06296 #if CMK_BLUEGENEL
06297     CmiNetworkProgress();
06298 #endif   
06299     for ( int j=0; j<ny; ++j, d += dim3 ) {
06300       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
06301 #ifdef ZEROCHECK
        // Debug build: flag zero coefficients with full coordinates.
06302         if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
06303                                     thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
06304 #endif
06305         d[2*k] = *(md++);
06306         d[2*k+1] = *(md++);
06307       }
06308     }
06309   }
06310 }
06311 
// Final backward (complex-to-real) FFT along z for this pencil.  FFTW3
// builds either run the pre-built per-plan list in parallel via CkLoop
// or execute the single batched plan; FFTW2 builds use
// rfftwnd_complex_to_real over all nx*ny rows.  FFTCHECK builds then
// verify the round-tripped grid against the synthetic pattern seeded by
// the forward path and report the largest deviation.
06312 void PmeZPencil::backward_fft() {
06313 #ifdef NAMD_FFTW
06314 #ifdef MANUAL_DEBUG_FFTW3
06315   dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
06316 #endif
06317 #ifdef NAMD_FFTW_3
06318 #if     CMK_SMP && USE_CKLOOP
06319   int useCkLoop = Node::Object()->simParameters->useCkLoop;
06320   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
06321      && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
06322           //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
06323           //transform the above loop
06324           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
06325           return;
06326   }
06327 #endif
06328   fftwf_execute(backward_plan);
06329 #else
  // FFTW2 path: batched c2r transform, one row per nx*ny pair.
06330   rfftwnd_complex_to_real(backward_plan, nx*ny,
06331             (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
06332 #endif
06333 #ifdef MANUAL_DEBUG_FFTW3
06334   dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
06335 #endif
06336 
06337 #endif
06338   
06339 #if CMK_BLUEGENEL
06340   CmiNetworkProgress();
06341 #endif
06342 
06343 #ifdef FFTCHECK
  // Self-test: unnormalized FFTW round trip scales by K1*K2*K3, so
  // divide it back out before comparing against the expected pattern.
06344   int dim3 = initdata.grid.dim3;
06345   int K1 = initdata.grid.K1;
06346   int K2 = initdata.grid.K2;
06347   int K3 = initdata.grid.K3;
06348   float scale = 1. / (1. * K1 * K2 * K3);
06349   float maxerr = 0.;
06350   float maxstd = 0.;
06351   int mi, mj, mk;  mi = mj = mk = -1;
06352   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
06353   const float *d = data;
06354   for ( int i=0; i<nx; ++i ) {
06355    for ( int j=0; j<ny; ++j, d += dim3 ) {
06356     for ( int k=0; k<K3; ++k ) {
06357       float std = 10. * (10. * (10. * std_base + i) + j) + k;
06358       float err = scale * d[k] - std;
06359       if ( fabsf(err) > fabsf(maxerr) ) {
06360         maxerr = err;
06361         maxstd = std;
06362         mi = i;  mj = j;  mk = k;
06363       }
06364     }
06365    }
06366   }
06367   CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
06368                 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
06369 #endif
06370 
06371 }
06372 
06373 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
06374         //to take advantage of the interface which allows 3 user params at most.
06375         //under such situtation, no new parameter list needs to be created!! -Chao Mei
06376         int specialIdx = paraNum;
06377         PmeZPencil *zpencil = (PmeZPencil *)param;
06378         zpencil->send_subset_ungrid(first, last, specialIdx);
06379 }
06380 
// Return every buffered PmeGridMsg to its source (the "ungrid" step).
// First locates evirIdx, the index of the first message that actually
// carries data (the historical slot for attaching the energy/virial —
// see the retained original code in the comment below); then either
// fans the sends out across the SMP node with CkLoop or sends them all
// inline via send_subset_ungrid.
06381 void PmeZPencil::send_all_ungrid() {
06382 /* 
06383 //Original code: the transformation is to first extract the msg 
06384 //idx that will has evir value set. -Chao Mei  
06385         int send_evir = 1;
06386         for (int imsg=0; imsg < grid_msgs.size(); ++imsg ) {
06387                 PmeGridMsg *msg = grid_msgs[imsg];
06388                 if ( msg->hasData ) {
06389                         if ( send_evir ) {
06390                                 msg->evir[0] = evir;
06391                                 send_evir = 0;
06392                         } else {
06393                                 msg->evir[0] = 0.;
06394                         }
06395                 }
06396                 send_ungrid(msg);
06397         }
06398 */
06399         int evirIdx = 0;
06400         for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
06401                 if(grid_msgs[imsg]->hasData) {
06402                         evirIdx = imsg;
06403                         break;
06404                 }
06405         }
06406 
06407 #if     CMK_SMP && USE_CKLOOP
06408         int useCkLoop = Node::Object()->simParameters->useCkLoop;
06409         if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
06410            && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
06411                 //????What's the best value for numChunks?????
06412 #if USE_NODE_PAR_RECEIVE        
06413                 //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1); //has to sync
06414                 CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
06415 #else
06416         //CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 0); //not sync
06417                 CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0); //not sync
06418 #endif        
06419                 return;
06420         }
06421 #endif
        // Serial fallback: send the entire range in this PE.
06422         send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
06423 }
06424 
06425 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
06426         for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
06427                 PmeGridMsg *msg = grid_msgs[imsg];
06428                 send_ungrid(msg);
06429         }
06430 }
06431 
// Return one grid message to its source PE with the real-space charge
// grid values filled in.  The incoming msg->sourceNode holds the
// destination PE; on messages that carry data it is overwritten below
// with this pencil's flattened (x,y) index before the send.  Messages
// with no data are deleted and answered with a bare PmeAckMsg instead.
06432 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
06433 
06434 #ifdef NAMD_CUDA
  // Offloaded runs use a distinct (lower-urgency) ungrid priority.
06435   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
06436 #else
06437   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
06438 #endif
06439 
  // Capture the destination PE before the message is deleted or its
  // sourceNode field is repurposed.
06440   int pe = msg->sourceNode;
06441   if ( ! msg->hasData ) {
06442     delete msg;
06443     PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
06444     SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
06445     CmiEnableUrgentSend(1);
06446     initdata.pmeProxy[pe].recvAck(ackmsg);
06447     CmiEnableUrgentSend(0);
06448     return;
06449   }
06450   msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
06451   int dim3 = initdata.grid.dim3;
06452   int zlistlen = msg->zlistlen;
06453   int *zlist = msg->zlist;
06454   char *fmsg = msg->fgrid;
06455   float *qmsg = msg->qgrid;
06456   float *d = data;
06457   int numGrids = 1;  // pencil FFT doesn't support multiple grids
06458   for ( int g=0; g<numGrids; ++g ) {
06459 #if CMK_BLUEGENEL
06460     CmiNetworkProgress();
06461 #endif    
06462     for ( int i=0; i<nx; ++i ) {
      // fgrid flags which (i,j) columns the requester touched; only
      // those get the zlist-selected values copied back.
06463       for ( int j=0; j<ny; ++j, d += dim3 ) {
06464         if( *(fmsg++) ) {
06465           for ( int k=0; k<zlistlen; ++k ) {
06466             *(qmsg++) = d[zlist[k]];
06467           }
06468         }
06469       }
06470     }
06471   }
06472   SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
06473     CmiEnableUrgentSend(1);
06474 #ifdef NAMD_CUDA
06475     if ( offload ) {
06476       initdata.pmeNodeProxy[CkNodeOf(pe)].recvUngrid(msg);
06477     } else
06478 #endif
06479   initdata.pmeProxy[pe].recvUngrid(msg);
06480     CmiEnableUrgentSend(0);
06481 }
06482 
// Node-level entry point for an arriving grid message.  Deposits the
// message's charges (recv_grid), buffers the message for the later
// ungrid reply, and — when this was the last expected message — runs
// the forward FFT (if any message carried data) and kicks off the
// transpose.  The atomic fetch-and-increment plus the node lock/fences
// make this safe when multiple PEs of an SMP node deliver concurrently.
06483 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
06484 {
06485 #if USE_NODE_PAR_RECEIVE
06486   CmiLock(ComputePmeMgr::fftw_plan_lock);
06487   CmiMemoryReadFence();
06488 #endif
06489   recv_grid(msg);
  // Latch hasData if any contributor supplied actual charges.
06490   if(msg->hasData) hasData=msg->hasData;
06491   int limsg;
  // limsg receives this message's slot; imsg counts arrivals.
06492   CmiMemoryAtomicFetchAndInc(imsg,limsg);
06493   grid_msgs[limsg] = msg;
06494   //  CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);      
06495   if(limsg+1 == grid_msgs.size())
06496     {
06497 
06498       if (hasData)
06499         {
06500           forward_fft();
06501         }
06502       send_trans();
      // Reset the arrival counter for the next timestep's round.
06503       imsg=0;
06504       CmiMemoryWriteFence();
06505       //      CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
06506     }
06507 #if USE_NODE_PAR_RECEIVE
06508   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
06509   CmiMemoryWriteFence();
06510 #endif
06511 }
06512 
// Node-level entry point for an arriving untranspose message.  Scatters
// the payload into the local slab (recv_untrans) and counts arrivals in
// imsgb; when all zBlocks contributions are in, runs the backward FFT
// and sends the buffered ungrid replies, then resets per-step state
// (counter, evir accumulator, grid data) for the next timestep.
06513 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
06514 {
06515   recv_untrans(msg);
06516 #if USE_NODE_PAR_RECEIVE
06517   CmiMemoryWriteFence();
06518   CmiLock(ComputePmeMgr::fftw_plan_lock);
06519 #endif    
06520   int limsg;
  // imsgb counts untrans arrivals (distinct from imsg for grid msgs).
06521   CmiMemoryAtomicFetchAndInc(imsgb,limsg);
06522   if(limsg+1 == initdata.zBlocks)
06523     {
06524 #if USE_NODE_PAR_RECEIVE
06525       CmiMemoryReadFence();
06526 #endif    
06527       if(hasData) // maybe this should be an assert
06528         {
06529           backward_fft();
06530         }
06531         
06532         send_all_ungrid();
06533     /*  int send_evir = 1;
06534       // TODO: this part should use Chao's output parallelization
06535       for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
06536         PmeGridMsg *omsg = grid_msgs[limsg];
06537         if ( omsg->hasData ) {
06538           if ( send_evir ) {
06539             omsg->evir[0] = evir;
06540             send_evir = 0;
06541           } else {
06542             omsg->evir[0] = 0.;
06543           }
06544         }
06545         send_ungrid(omsg);
06546       } */
06547       imsgb=0;
06548       evir = 0;
      // Zero the slab so next step's charge deposition starts clean.
06549       memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3); 
06550       CmiMemoryWriteFence();
06551       //      CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
06552     }
06553 #if USE_NODE_PAR_RECEIVE
06554   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
06555 #endif
06556 }
06557 
06558 
06559 #include "ComputePmeMgr.def.h"
06560 

Generated on Mon Nov 20 01:17:11 2017 for NAMD by  doxygen 1.4.7