ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 //#define MANUAL_DEBUG_FFTW3 1
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012 // fftw2 doesn't have these defined
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024 
00025 #include <vector>
00026 #include <algorithm>
00027 #include <deque>
00028 using namespace std;
00029 
00030 #include "InfoStream.h"
00031 #include "Node.h"
00032 #include "PatchMap.h"
00033 #include "PatchMap.inl"
00034 #include "AtomMap.h"
00035 #include "ComputePme.h"
00036 #include "ComputePmeMgr.decl.h"
00037 #include "PmeBase.inl"
00038 #include "PmeRealSpace.h"
00039 #include "PmeKSpace.h"
00040 #include "ComputeNonbondedUtil.h"
00041 #include "PatchMgr.h"
00042 #include "Molecule.h"
00043 #include "ReductionMgr.h"
00044 #include "ComputeMgr.h"
00045 #include "ComputeMgr.decl.h"
00046 // #define DEBUGM
00047 #define MIN_DEBUG_LEVEL 3
00048 #include "Debug.h"
00049 #include "SimParameters.h"
00050 #include "WorkDistrib.h"
00051 #include "varsizemsg.h"
00052 #include "Random.h"
00053 #include "ckhashtable.h"
00054 #include "Priorities.h"
00055 
00056 #include "ComputeMoa.h"
00057 #include "ComputeMoaMgr.decl.h" 
00058 
00059 //#define     USE_RANDOM_TOPO         1
00060 
00061 //#define USE_TOPO_SFC                    1
00062 //#define     USE_CKLOOP                1
00063 //#include "TopoManager.h"
00064 
00065 #include "DeviceCUDA.h"
00066 #ifdef NAMD_CUDA
00067 #include <cuda_runtime.h>
00068 #include <cuda.h>
00069 void cuda_errcheck(const char *msg);
00070 #ifdef WIN32
00071 #define __thread __declspec(thread)
00072 #endif
00073 extern __thread DeviceCUDA *deviceCUDA;
00074 #endif
00075 
00076 #include "ComputePmeCUDAKernel.h"
00077 
00078 #ifndef SQRT_PI
00079 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00080 #endif
00081 
00082 #if CMK_PERSISTENT_COMM 
00083 #define USE_PERSISTENT      1
00084 #endif
00085 
00086 #if USE_PERSISTENT
00087 #define Z_PERSIST 1
00088 #define Y_PERSIST 1
00089 #define X_PERSIST 1
00090 #endif
00091 
00092 #if defined(NAMD_CUDA) && defined(MEM_OPT_VERSION)
00093 #define USE_NODE_PAR_RECEIVE    1
00094 #endif
00095 
// Definitions of ComputePmeUtil's static configuration flags (alchemical
// FEP/TI, locally-enhanced sampling, pair/self interactions).  Values are
// assigned elsewhere during simulation setup; only storage is defined here.
int ComputePmeUtil::numGrids;
Bool ComputePmeUtil::alchOn;
Bool ComputePmeUtil::alchFepOn;
Bool ComputePmeUtil::alchThermIntOn;
Bool ComputePmeUtil::alchDecouple;
BigReal ComputePmeUtil::alchElecLambdaStart;
Bool ComputePmeUtil::lesOn;
int ComputePmeUtil::lesFactor;
Bool ComputePmeUtil::pairOn;
Bool ComputePmeUtil::selfOn;

// Per-PE flag array: nonzero entries mark PEs that host PME pencils.
// Allocated and zeroed by rank 0 in ComputePmeMgr::initialize().
char *pencilPMEProcessors;
00108 
// Zero-payload acknowledgment message used to complete a PME exchange.
class PmeAckMsg : public CMessage_PmeAckMsg {
};

// Charge-grid contribution from one source PE.  zlist, fgrid, and qgrid
// are pointer fields sized per message (Charm++ varsize idiom — confirm
// against the .ci declaration).
class PmeGridMsg : public CMessage_PmeGridMsg {
public:

  int sourceNode;
  int sequence;           // timestep sequence, used for message priorities
  int hasData;
  Lattice lattice;
  int start;
  int len;
  int zlistlen;
  int *zlist;
  char *fgrid;
  float *qgrid;
  CkArrayIndex3D destElem;  // destination pencil index (pencil mode)
};

// Slab data exchanged during the transpose phase (see sendTrans/recvTrans).
class PmeTransMsg : public CMessage_PmeTransMsg {
public:

  int sourceNode;
  int sequence;
  int hasData;
  Lattice lattice;
  int x_start;            // first x plane carried
  int nx;                 // number of x planes carried
  float *qgrid;
  CkArrayIndex3D destElem;
};

// Node-level wrapper around a single PmeTransMsg: count and lock let the
// ranks of one SMP node share the underlying message (see fwdSharedTrans).
class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
public:
  PmeTransMsg *msg;
  int *count;
  CmiNodeLock lock;
};

// Data returned from trans PEs back toward grid PEs (untranspose phase).
class PmeUntransMsg : public CMessage_PmeUntransMsg {
public:

  int sourceNode;
  int y_start;            // first y plane carried
  int ny;                 // number of y planes carried
  float *qgrid;
  CkArrayIndex3D destElem;
};

// Node-level wrapper around a single PmeUntransMsg (see PmeSharedTransMsg).
class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
public:
  PmeUntransMsg *msg;
  int *count;
  CmiNodeLock lock;
};

// Carries the reciprocal-space energy/virial reductions.
class PmeEvirMsg : public CMessage_PmeEvirMsg {
public:
  PmeReduction *evir;
};
00169 
00170 class PmePencilMap : public CBase_PmePencilMap {
00171 public:
00172   PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00173     : ia(i_a), ib(i_b), nb(n_b),
00174       size(n), data(newcopyint(n,d)) {
00175   }
00176   virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00177     //Return an ``arrayHdl'', given some information about the array
00178     return 0;
00179   }
00180   virtual int procNum(int, const CkArrayIndex &i) {
00181     //Return the home processor number for this element of this array
00182     return data[ i.data()[ia] * nb + i.data()[ib] ];
00183   }
00184   virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00185     int mype = CkMyPe();
00186     for ( int i=0; i < size; ++i ) {
00187       if ( data[i] == mype ) {
00188         CkArrayIndex3D ai(0,0,0);
00189         ai.data()[ia] = i / nb;
00190         ai.data()[ib] = i % nb;
00191         if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00192         if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00193         mgr->insertInitial(ai,msg);
00194         msg = 0;
00195       }
00196     }
00197     mgr->doneInserting();
00198     if ( msg ) CkFreeMsg(msg);
00199   }
00200 private:
00201   const int ia, ib, nb, size;
00202   const int* const data;
00203   static int* newcopyint(int n, int *d) {
00204     int *newd = new int[n];
00205     memcpy(newd, d, n*sizeof(int));
00206     return newd;
00207   }
00208 };
00209 
00210 // use this idiom since messages don't have copy constructors
// use this idiom since messages don't have copy constructors
struct PmePencilInitMsgData {
  PmeGrid grid;
  int xBlocks, yBlocks, zBlocks;    // pencil counts in each dimension
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_NodePmeMgr pmeNodeProxy;
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
};

// Message that carries the pencil configuration by copying the data
// struct above (see the comment on PmePencilInitMsgData).
class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
public:
   PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
   PmePencilInitMsgData data;
};


// Per-PE slab extents before (nx/x_start) and after the transpose
// (ny_after_transpose/y_start_after_transpose).
struct LocalPmeInfo {
  int nx, x_start;
  int ny_after_transpose, y_start_after_transpose;
};

// PE range belonging to one grid/trans node: count, first PE, and the
// owning real node id.
struct NodePmeInfo {
  int npe, pe_start, real_node;
};
00239 
00240 
00241 static int findRecipEvirPe() {
00242     PatchMap *patchMap = PatchMap::Object();
00243     {
00244       int mype = CkMyPe();
00245       if ( patchMap->numPatchesOnNode(mype) ) {
00246         return mype; 
00247       }
00248     }
00249     {
00250       int node = CmiMyNode();
00251       int firstpe = CmiNodeFirst(node);
00252       int nodeSize = CmiNodeSize(node);
00253       int myrank = CkMyRank();
00254       for ( int i=0; i<nodeSize; ++i ) {
00255         int pe = firstpe + (myrank+i)%nodeSize;
00256         if ( patchMap->numPatchesOnNode(pe) ) {
00257           return pe;
00258         }
00259       }
00260     }
00261     {
00262       int *pelist;
00263       int nodeSize;
00264       CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
00265       int myrank;
00266       for ( int i=0; i<nodeSize; ++i ) {
00267         if ( pelist[i] == CkMyPe() ) myrank = i;
00268       }
00269       for ( int i=0; i<nodeSize; ++i ) {
00270         int pe = pelist[(myrank+i)%nodeSize];
00271         if ( patchMap->numPatchesOnNode(pe) ) {
00272           return pe;
00273         }
00274       }
00275     }
00276     {
00277       int mype = CkMyPe();
00278       int npes = CkNumPes();
00279       for ( int i=0; i<npes; ++i ) {
00280         int pe = (mype+i)%npes;
00281         if ( patchMap->numPatchesOnNode(pe) ) {
00282           return pe;
00283         }
00284       }
00285     }
00286     NAMD_bug("findRecipEvirPe() failed!");
00287     return -999;  // should never happen
00288 }
00289 
00290 
00291 //Assigns gridPeMap and transPeMap to different set of processors.
//Assigns gridPeMap and transPeMap to different set of processors.
// Grid PEs come from the tail of WorkDistrib's diffuse ordering and trans
// PEs from the slots immediately before them, so the two sets are disjoint
// whenever enough PEs exist.  Both output lists are sorted ascending.
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
  int ncpus = CkNumPes();
  
  for ( int i=0; i<numGridPes; ++i ) {
    gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
  }
  std::sort(gridPeMap,gridPeMap+numGridPes);
  int firstTransPe = ncpus - numGridPes - numTransPes;
  if ( firstTransPe < 0 ) {
    // not enough PEs for disjoint sets: start trans PEs at the front
    firstTransPe = 0;
    // 0 should be first in list, skip if possible
    if ( ncpus > numTransPes ) firstTransPe = 1;
  }
  for ( int i=0; i<numTransPes; ++i ) {
    transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
  }
  std::sort(transPeMap,transPeMap+numTransPes);
}
00310 
00311 #if USE_TOPOMAP 
00312 //Topology aware PME allocation
00313 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00314                              int nbpes=0);
00315 #endif
00316 
00317 
// Three-way comparison of a and b in bit-reversed order: the lowest-order
// bit in which they differ decides which value is larger.  Returns
// negative/zero/positive like strcmp.
int compare_bit_reversed(int a, int b) {
  int diff = a ^ b;
  if ( ! diff ) return 0;          // identical values compare equal
  int mask = 1;
  while ( ( diff & mask ) == 0 ) mask <<= 1;  // find lowest differing bit
  return (a & mask) - (b & mask);
}
00326 
// True iff a precedes b in bit-reversed order, i.e. the lowest-order bit
// in which they differ is set in b.  Equal values are not less-than.
inline bool less_than_bit_reversed(int a, int b) {
  int diff = a ^ b;
  if ( ! diff ) return false;
  int mask = 1;
  while ( ( diff & mask ) == 0 ) mask <<= 1;  // lowest differing bit
  return ( b & mask ) != 0;
}
00335 
// Comparison functor for sorting ints into bit-reversed order.
struct sortop_bit_reversed {
  inline bool operator() (int a, int b) const {
    return less_than_bit_reversed(a,b);
  }
};

// Simple (i,j) index pair identifying one pencil in a 2D pencil grid.
struct ijpair {
  int i,j;
  ijpair() {;}
  ijpair(int I, int J) : i(I), j(J) {;}
};

// Lexicographic bit-reversed ordering on (i,j): compare i first, break
// ties on j.
struct ijpair_sortop_bit_reversed {
  inline bool operator() (const ijpair &a, const ijpair &b) const {
    return ( less_than_bit_reversed(a.i,b.i)
             || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
  }
};
00354 
// Per-PE Charm++ group managing every phase of PME on this processor:
// charge-grid assembly, slab/pencil FFT transposes, the reciprocal-space
// sum, and force interpolation, with an optional CUDA offload path.
class ComputePmeMgr : public CBase_ComputePmeMgr, public ComputePmeUtil {
public:
  friend class ComputePme;
  friend class NodePmeMgr;
  ComputePmeMgr();
  ~ComputePmeMgr();

  // Startup entry points (driven by quiescence-detection messages).
  void initialize(CkQdMsg*);
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();

  // Grid scatter, transpose, untranspose, and gather handlers.
  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  // Scratch state consulted by sendDataHelper()/sendPencilsHelper().
  Lattice *sendDataHelper_lattice;
  int sendDataHelper_sequence;
  int sendDataHelper_sourcepe;
  int sendDataHelper_errors;
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  void recvGrid(PmeGridMsg *);
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  void fwdSharedTrans(PmeTransMsg *);
  void recvSharedTrans(PmeSharedTransMsg *);
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void procTrans(PmeTransMsg *);
  void gridCalc2(void);
  #ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
  #endif // OPENATOM_VERSION
  void gridCalc2R(void);
  void fwdSharedUntrans(PmeUntransMsg *);
  void recvSharedUntrans(PmeSharedUntransMsg *);
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  void recvUntrans(PmeUntransMsg *);
  void procUntrans(PmeUntransMsg *);
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  void recvUngrid(PmeGridMsg *);
  void recvAck(PmeAckMsg *);
  void copyResults(PmeGridMsg *);
  void copyPencils(PmeGridMsg *);
  void ungridCalc(void);
  void recvRecipEvir(PmeEvirMsg *);
  void addRecipEvirClient(void);
  void submitReductions();

#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#endif

  // Node-wide lock; also reused by NodePmeMgr to guard its pencil tables.
  static CmiNodeLock fftw_plan_lock;
  CmiNodeLock pmemgr_lock;  // for accessing this object from other threads

#ifdef NAMD_CUDA
  float *a_data_host;
  float *a_data_dev;
  float *f_data_host;
  float *f_data_dev;
  int cuda_atoms_count;
  int cuda_atoms_alloc;
  static CmiNodeLock cuda_lock;
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  cudaEvent_t end_charges;     // created with timing disabled in the ctor
  cudaEvent_t *end_forces;
  int forces_count;
  int forces_done_count;
  double charges_time;
  double forces_time;
  int check_charges_count;
  int check_forces_count;
  int master_pe;
  int this_pe;

  void cuda_submit_charges(Lattice &lattice, int sequence);
  struct cuda_submit_charges_args {
    ComputePmeMgr *mgr; Lattice *lattice; int sequence;
  };
  // Queue of pending submissions shared across the node (guarded by
  // cuda_lock -- confirm at use sites, not visible in this chunk).
  static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
  static bool cuda_busy;

  int chargeGridSubmittedCount;
  void sendChargeGridReady();
#endif
  Lattice *saved_lattice;  // saved by chargeGridSubmitted
  int saved_sequence;      // saved by chargeGridSubmitted
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);

  ResizeArray<ComputePme*> pmeComputes;

private:

#if 0 && USE_PERSISTENT
  PersistentHandle   *recvGrid_handle;
#endif

  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
  NodePmeMgr *nodePmeMgr;
  ComputePmeMgr *masterPmeMgr;
  
  // Register a client compute; the first registration triggers full setup.
  void addCompute(ComputePme *c) {
    if ( ! pmeComputes.size() ) initialize_computes();
    pmeComputes.add(c);
    c->setMgr(this);
  }

  ResizeArray<ComputePme*> heldComputes;
  PmeGrid myGrid;
  Lattice lattice;
  PmeKSpace *myKSpace;
  float *qgrid;   // real-space charge grid
  float *kgrid;   // reciprocal-space grid

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  fftwf_complex *work;
#else
  // FFTW 2.x uses distinct plan types for 1D and real-to-complex nd FFTs.
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
  fftw_complex *work;
#endif
#else
  float *work;
#endif

  int qsize, fsize, bsize;
  int offload;   // nonzero when PME runs on the CUDA device (PMEOffload)
  BigReal alchLambda;  // set on each step in ComputePme::ungridForces()
  BigReal alchLambda2; // set on each step in ComputePme::ungridForces()

  float **q_arr;
  // q_list and q_count not used for offload
  float **q_list;
  int q_count;
  char *f_arr;
  char *fz_arr;
  PmeReduction evir[PME_MAX_EVALS];
  SubmitReduction *reduction;

  int noWorkCount;
  int doWorkCount;
  int ungridForcesCount;

#ifdef NAMD_CUDA
#define NUM_STREAMS 1
  cudaStream_t streams[NUM_STREAMS];
  int stream;

  // Device/host mirrors of the charge and potential grids and flags.
  float **q_arr_dev;
  float **v_arr_dev;
  float *q_data_host;
  float *q_data_dev;
  float *v_data_dev;
  int *ffz_host;
  int *ffz_dev;
  int q_data_size;
  int ffz_size;

  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  float **afn_host;
  float **afn_dev;

  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
#endif
  int recipEvirCount;   // used in compute only
  int recipEvirClients; // used in compute only
  int recipEvirPe;      // used in trans only
  
  LocalPmeInfo *localInfo;       // per-PE slab extents
  NodePmeInfo *gridNodeInfo;
  NodePmeInfo *transNodeInfo;
  int qgrid_size;
  int qgrid_start;
  int qgrid_len;
  int fgrid_start;
  int fgrid_len;

  int numSources;
  int numGridPes;
  int numTransPes;
  int numGridNodes;
  int numTransNodes;
  int numDestRecipPes;
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  int *gridPeMap;
  int *transPeMap;
  int *recipPeDest;
  int *gridPeOrder;
  int *gridNodeOrder;
  int *transNodeOrder;
  // Countdowns for the scatter/transpose/untranspose/gather phases.
  int grid_count;
  int trans_count;
  int untrans_count;
  int ungrid_count;
  PmeGridMsg **gridmsg_reuse;    // recycled grid messages, one per source
  PmeReduction recip_evir2[PME_MAX_EVALS];

  int compute_sequence;  // set from patch computes, used for priorities
  int grid_sequence;  // set from grid messages, used for priorities
  int useBarrier;     // SimParameters::PMEBarrier
  int sendTransBarrier_received;

  int usePencils;     // pencil decomposition (1) vs slab decomposition (0)
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  char *pencilActive;
  ijpair *activePencils;
  int numPencilsActive;
  int strayChargeErrors;
};
00586 
// Free-function accessor to the manager's list of client ComputePme objects.
ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
    return mgr->pmeComputes ;
}

// Definitions of ComputePmeMgr's static members; the locks themselves are
// created by rank 0 in the ComputePmeMgr constructor.
  CmiNodeLock ComputePmeMgr::fftw_plan_lock;
#ifdef NAMD_CUDA
  CmiNodeLock ComputePmeMgr::cuda_lock;
  std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
  bool ComputePmeMgr::cuda_busy;
#endif
00597 
00598 int isPmeProcessor(int p){ 
00599   SimParameters *simParams = Node::Object()->simParameters;
00600   if (simParams->usePMECUDA) {
00601     return 0;
00602   } else {
00603     return pencilPMEProcessors[p];
00604   }
00605 }
00606 
// Node-level manager (one per SMP process): routes PME transpose/ungrid
// messages to the per-PE ComputePmeMgr branches and holds node-wide
// lookup tables for pencil objects.
class NodePmeMgr : public CBase_NodePmeMgr {
public:
  friend class ComputePmeMgr;
  friend class ComputePme;
  NodePmeMgr();
  ~NodePmeMgr();
  void initialize();
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void recvUntrans(PmeUntransMsg *);
  // Record node-local pencil objects so they can be found by array index.
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
  void recvXTrans(PmeTransMsg *);
  void recvYTrans(PmeTransMsg *);
  void recvYUntrans(PmeUntransMsg *);
  void recvZGrid(PmeGridMsg *);
  void recvZUntrans(PmeUntransMsg *);

  void recvUngrid(PmeGridMsg *);

  // Cache the pencil placement maps for later use on this node.
  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
      xm=_xm; ym=_ym; zm=_zm;
  }
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;

private:
  CProxy_ComputePmeMgr mgrProxy;
  ComputePmeMgr *mgrObject;     // rank 0's local branch (set in initialize)
  ComputePmeMgr **mgrObjects;   // local branch for every rank on this node
#ifdef NAMD_CUDA
  ComputePmeMgr *masterPmeMgr;
  int master_pe;
#endif
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  // Node-shared tables mapping pencil array indices to local objects;
  // writes are serialized via ComputePmeMgr::fftw_plan_lock.
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;  

#ifdef NAMD_CUDA
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
#endif
};
00657 
NodePmeMgr::NodePmeMgr() {
  // One slot per rank (PE) within this SMP node.
  mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
}

NodePmeMgr::~NodePmeMgr() {
  delete [] mgrObjects;
}

// Called from each rank's ComputePmeMgr constructor: records that rank's
// local branch; rank 0 also caches the group proxy and its own branch.
void NodePmeMgr::initialize() {
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    mgrProxy = proxy;
    mgrObject = proxy.ckLocalBranch();
  }
}

// Node-addressed transpose data: hand off to the local manager branch,
// which forwards it to the ranks that share it.
void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
  mgrObject->fwdSharedTrans(msg);
}

void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  mgrObject->fwdSharedUntrans(msg);
}

// Ungrid results are delivered to the master manager; this entry point is
// only reachable in CUDA builds.
void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
#ifdef NAMD_CUDA
  masterPmeMgr->recvUngrid(msg);
#else
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
#endif
}

// Register a node-local pencil object under its array index.  The FFTW
// plan lock is reused here to serialize hashtable updates from multiple
// ranks.
void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  xPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  yPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  zPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
00709 
// Per-PE construction: register with the node-level manager, create
// locks (node-shared ones on rank 0 only), zero all bookkeeping, and in
// CUDA builds create the high-priority streams and timing events.
ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
                                 pmeProxyDir(thisgroup) {

  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  // record this PE's branch in the node-level manager
  pmeNodeProxy.ckLocalBranch()->initialize();

  // node-shared lock: created exactly once, by rank 0
  if ( CmiMyRank() == 0 ) {
    fftw_plan_lock = CmiCreateLock();
  }
  pmemgr_lock = CmiCreateLock();

  myKSpace = 0;
  kgrid = 0;
  work = 0;
  grid_count = 0;
  trans_count = 0;
  untrans_count = 0;
  ungrid_count = 0;
  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
  useBarrier = 0;
  sendTransBarrier_received = 0;
  usePencils = 0;

#ifdef NAMD_CUDA
 // offload has not been set so this happens on every run
  if ( CmiMyRank() == 0 ) {
    cuda_lock = CmiCreateLock();
  }

#if CUDA_VERSION >= 5050
  // create PME streams at the greatest priority the device supports
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
  //if ( CkMyNode() == 0 ) {
  //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
  //}
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#else
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
#endif

  stream = 0;
  for ( int i=0; i<NUM_STREAMS; ++i ) {
#if 1
    CUDA_STREAM_CREATE(&streams[i]);
    cuda_errcheck("cudaStreamCreate");
#else
  streams[i] = 0;  // XXXX Testing!!!
#endif
  }

  this_pe = CkMyPe();
 
  // end_charges signals charge-spreading completion; timing disabled
  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  end_forces = 0;
  check_charges_count = 0;
  check_forces_count = 0;
  chargeGridSubmittedCount = 0;

  cuda_atoms_count = 0;
  cuda_atoms_alloc = 0;

  f_data_mgr_alloc = 0;
  f_data_mgr_host = 0;
  f_data_mgr_dev = 0;
  afn_host = 0;
  afn_dev = 0;

// user event IDs for Charm++ tracing, registered once on PE 0
#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
    traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
    traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
    traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
    traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
    traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
  }
#endif
  recipEvirCount = 0;
  recipEvirClients = 0;
  recipEvirPe = -999;  // sentinel: not yet assigned
}
00798 
00799 
00800 void ComputePmeMgr::recvArrays(
00801         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00802   xPencil = x;  yPencil = y;  zPencil = z;
00803   
00804     if(CmiMyRank()==0)
00805     {
00806       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00807       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00808       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00809     }
00810 }
00811 
#if USE_TOPO_SFC
 // Integer 3D coordinate as reported by TopoManager::rankToCoordinates;
 // used to order PEs along a space-filling curve.
 struct Coord
  {
    int x, y, z;
    Coord(): x(0), y(0), z(0) {}
    Coord(int a, int b, int c): x(a), y(b), z(c) {}
  };
  extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);

  // Reorder procs in place to follow the SFC traversal in result: for each
  // curve coordinate, append the PEs whose machine coordinates match it.
  // O(|result| * |procs|) scan; every proc must appear exactly once.
  void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
  {
     SortableResizeArray<int> newprocs(procs.size());
     int num = 0;
     for (int i=0; i<result.size(); i++) {
       Coord &c = result[i];
       for (int j=0; j<procs.size(); j++) {
         int pe = procs[j];
         int x,y,z,t;
         tmgr.rankToCoordinates(pe, x, y, z, t);    
         if (x==c.x && y==c.y && z==c.z)
           newprocs[num++] = pe;
       }
     } 
     CmiAssert(newprocs.size() == procs.size());
     procs = newprocs;
  }

  // Factor x as a*b where a is the largest divisor <= sqrt(x), and return
  // the cofactor b (>= a).  For prime x the loop only finds a==1, so a is
  // reset to x and the function returns 1.
  int find_level_grid(int x) 
  {
     int a = sqrt(x);
     int b;
     for (; a>0; a--) {
       if (x%a == 0) break;
     }
     if (a==1) a = x;
     b = x/a;
     //return a>b?a:b;
     return b;
  }
  CmiNodeLock tmgr_lock;  // created in Pme_init(); guards topology queries
#endif
00853 
// Module init hook: rank 0 creates the node-wide lock used by the
// USE_TOPO_SFC topology code; a no-op otherwise.
void Pme_init()
{
#if USE_TOPO_SFC
  if (CkMyRank() == 0) 
    tmgr_lock = CmiCreateLock();
#endif
}
00861 
00862 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00863   delete msg;
00864 
00865   localInfo = new LocalPmeInfo[CkNumPes()];
00866   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00867   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00868   gridPeMap = new int[CkNumPes()];
00869   transPeMap = new int[CkNumPes()];
00870   recipPeDest = new int[CkNumPes()];
00871   gridPeOrder = new int[CkNumPes()];
00872   gridNodeOrder = new int[CkNumNodes()];
00873   transNodeOrder = new int[CkNumNodes()];
00874 
00875   if (CkMyRank() == 0) {
00876     pencilPMEProcessors = new char [CkNumPes()];
00877     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00878   }
00879 
00880   SimParameters *simParams = Node::Object()->simParameters;
00881   PatchMap *patchMap = PatchMap::Object();
00882 
00883   offload = simParams->PMEOffload;
00884 #ifdef NAMD_CUDA
00885   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00886     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00887   }
00888   if ( offload ) {
00889     int dev;
00890     cudaGetDevice(&dev);
00891     cuda_errcheck("in cudaGetDevice");
00892     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00893     cudaDeviceProp deviceProp;
00894     cudaGetDeviceProperties(&deviceProp, dev);
00895     cuda_errcheck("in cudaGetDeviceProperties");
00896     if ( deviceProp.major < 2 )
00897       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00898   }
00899 #endif
00900 
00901   alchLambda = -1.;  // illegal value to catch if not updated
00902   alchLambda2 = -1.;
00903   useBarrier = simParams->PMEBarrier;
00904 
00905   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00906   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00907   else {
00908     int nrps = simParams->PMEProcessors;
00909     if ( nrps <= 0 ) nrps = CkNumPes();
00910     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00911     int dimx = simParams->PMEGridSizeX;
00912     int dimy = simParams->PMEGridSizeY;
00913     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00914     if ( maxslabs > nrps ) maxslabs = nrps;
00915     int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00916                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00917     if ( maxpencils > nrps ) maxpencils = nrps;
00918     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00919     else usePencils = 0;
00920   }
00921 
00922   if ( usePencils ) {
00923     int nrps = simParams->PMEProcessors;
00924     if ( nrps <= 0 ) nrps = CkNumPes();
00925     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00926     if ( simParams->PMEPencils > 1 &&
00927          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00928       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00929     } else {
00930       int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00931                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00932       if ( nb2 > nrps ) nb2 = nrps;
00933       if ( nb2 < 1 ) nb2 = 1;
00934       int nb = (int) sqrt((float)nb2);
00935       if ( nb < 1 ) nb = 1;
00936       xBlocks = zBlocks = nb;
00937       yBlocks = nb2 / nb;
00938     }
00939 
00940     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00941     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00942     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00943 
00944     int dimx = simParams->PMEGridSizeX;
00945     int bx = 1 + ( dimx - 1 ) / xBlocks;
00946     xBlocks = 1 + ( dimx - 1 ) / bx;
00947 
00948     int dimy = simParams->PMEGridSizeY;
00949     int by = 1 + ( dimy - 1 ) / yBlocks;
00950     yBlocks = 1 + ( dimy - 1 ) / by;
00951 
00952     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00953     int bz = 1 + ( dimz - 1 ) / zBlocks;
00954     zBlocks = 1 + ( dimz - 1 ) / bz;
00955 
00956     if ( xBlocks * yBlocks > CkNumPes() ) {
00957       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00958     }
00959     if ( xBlocks * zBlocks > CkNumPes() ) {
00960       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00961     }
00962     if ( yBlocks * zBlocks > CkNumPes() ) {
00963       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00964     }
00965 
00966     if ( ! CkMyPe() ) {
00967       iout << iINFO << "PME using " << xBlocks << " x " <<
00968         yBlocks << " x " << zBlocks <<
00969         " pencil grid for FFT and reciprocal sum.\n" << endi;
00970     }
00971   } else { // usePencils
00972 
00973   {  // decide how many pes to use for reciprocal sum
00974 
00975     // rules based on work available
00976     int minslices = simParams->PMEMinSlices;
00977     int dimx = simParams->PMEGridSizeX;
00978     int nrpx = ( dimx + minslices - 1 ) / minslices;
00979     int dimy = simParams->PMEGridSizeY;
00980     int nrpy = ( dimy + minslices - 1 ) / minslices;
00981 
00982     // rules based on processors available
00983     int nrpp = CkNumPes();
00984     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00985     if ( nrpp < nrpx ) nrpx = nrpp;
00986     if ( nrpp < nrpy ) nrpy = nrpp;
00987 
00988     // user override
00989     int nrps = simParams->PMEProcessors;
00990     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00991     if ( nrps > 0 ) nrpx = nrps;
00992     if ( nrps > 0 ) nrpy = nrps;
00993 
00994     // make sure there aren't any totally empty processors
00995     int bx = ( dimx + nrpx - 1 ) / nrpx;
00996     nrpx = ( dimx + bx - 1 ) / bx;
00997     int by = ( dimy + nrpy - 1 ) / nrpy;
00998     nrpy = ( dimy + by - 1 ) / by;
00999     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01000       NAMD_bug("Error in selecting number of PME processors.");
01001     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01002       NAMD_bug("Error in selecting number of PME processors.");
01003 
01004     numGridPes = nrpx;
01005     numTransPes = nrpy;
01006   }
01007   if ( ! CkMyPe() ) {
01008     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01009       " processors for FFT and reciprocal sum.\n" << endi;
01010   }
01011 
01012   int sum_npes = numTransPes + numGridPes;
01013   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01014 
01015 #if 0 // USE_TOPOMAP
01016   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01017   PatchMap * pmap = PatchMap::Object();
01018   
01019   int patch_pes = pmap->numNodesWithPatches();
01020   TopoManager tmgr;
01021   if(tmgr.hasMultipleProcsPerNode())
01022     patch_pes *= 2;
01023 
01024   bool done = false;
01025   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01026     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01027     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01028   }
01029   else 
01030     if(CkNumPes() > 2 *max_npes + patch_pes) {
01031       done = generateBGLORBPmePeList(transPeMap, max_npes);
01032       gridPeMap = transPeMap;
01033     }
01034 
01035   if (!done)
01036 #endif
01037     {
01038       //generatePmePeList(transPeMap, max_npes);
01039       //gridPeMap = transPeMap;
01040       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01041     }
01042   
01043   if ( ! CkMyPe() ) {
01044     iout << iINFO << "PME GRID LOCATIONS:";
01045     int i;
01046     for ( i=0; i<numGridPes && i<10; ++i ) {
01047       iout << " " << gridPeMap[i];
01048     }
01049     if ( i < numGridPes ) iout << " ...";
01050     iout << "\n" << endi;
01051     iout << iINFO << "PME TRANS LOCATIONS:";
01052     for ( i=0; i<numTransPes && i<10; ++i ) {
01053       iout << " " << transPeMap[i];
01054     }
01055     if ( i < numTransPes ) iout << " ...";
01056     iout << "\n" << endi;
01057   }
01058 
01059   // sort based on nodes and physical nodes
01060   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01061 
01062   myGridPe = -1;
01063   myGridNode = -1;
01064   int i = 0;
01065   int node = -1;
01066   int real_node = -1;
01067   for ( i=0; i<numGridPes; ++i ) {
01068     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01069     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01070     int real_node_i = CkNodeOf(gridPeMap[i]);
01071     if ( real_node_i == real_node ) {
01072       gridNodeInfo[node].npe += 1;
01073     } else {
01074       real_node = real_node_i;
01075       ++node;
01076       gridNodeInfo[node].real_node = real_node;
01077       gridNodeInfo[node].pe_start = i;
01078       gridNodeInfo[node].npe = 1;
01079     }
01080     if ( CkMyNode() == real_node_i ) myGridNode = node;
01081   }
01082   numGridNodes = node + 1;
01083   myTransPe = -1;
01084   myTransNode = -1;
01085   node = -1;
01086   real_node = -1;
01087   for ( i=0; i<numTransPes; ++i ) {
01088     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01089     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01090     int real_node_i = CkNodeOf(transPeMap[i]);
01091     if ( real_node_i == real_node ) {
01092       transNodeInfo[node].npe += 1;
01093     } else {
01094       real_node = real_node_i;
01095       ++node;
01096       transNodeInfo[node].real_node = real_node;
01097       transNodeInfo[node].pe_start = i;
01098       transNodeInfo[node].npe = 1;
01099     }
01100     if ( CkMyNode() == real_node_i ) myTransNode = node;
01101   }
01102   numTransNodes = node + 1;
01103 
01104   if ( ! CkMyPe() ) {
01105     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01106          << numTransNodes << " TRANS NODES\n" << endi;
01107   }
01108 
01109   { // generate random orderings for grid and trans messages
01110     int i;
01111     for ( i = 0; i < numGridPes; ++i ) {
01112       gridPeOrder[i] = i;
01113     }
01114     Random rand(CkMyPe());
01115     if ( myGridPe < 0 ) {
01116       rand.reorder(gridPeOrder,numGridPes);
01117     } else {  // self last
01118       gridPeOrder[myGridPe] = numGridPes-1;
01119       gridPeOrder[numGridPes-1] = myGridPe;
01120       rand.reorder(gridPeOrder,numGridPes-1);
01121     } 
01122     for ( i = 0; i < numGridNodes; ++i ) {
01123       gridNodeOrder[i] = i;
01124     }
01125     if ( myGridNode < 0 ) {
01126       rand.reorder(gridNodeOrder,numGridNodes);
01127     } else {  // self last
01128       gridNodeOrder[myGridNode] = numGridNodes-1;
01129       gridNodeOrder[numGridNodes-1] = myGridNode;
01130       rand.reorder(gridNodeOrder,numGridNodes-1);
01131     }
01132     for ( i = 0; i < numTransNodes; ++i ) {
01133       transNodeOrder[i] = i;
01134     }
01135     if ( myTransNode < 0 ) {
01136       rand.reorder(transNodeOrder,numTransNodes);
01137     } else {  // self last
01138       transNodeOrder[myTransNode] = numTransNodes-1;
01139       transNodeOrder[numTransNodes-1] = myTransNode;
01140       rand.reorder(transNodeOrder,numTransNodes-1);
01141     }
01142   }
01143   
01144   } // ! usePencils
01145 
01146   myGrid.K1 = simParams->PMEGridSizeX;
01147   myGrid.K2 = simParams->PMEGridSizeY;
01148   myGrid.K3 = simParams->PMEGridSizeZ;
01149   myGrid.order = simParams->PMEInterpOrder;
01150   myGrid.dim2 = myGrid.K2;
01151   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01152 
01153   if ( ! usePencils ) {
01154     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01155     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01156     myGrid.block3 = myGrid.dim3 / 2;  // complex
01157   }
01158 
01159   if ( usePencils ) {
01160     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01161     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01162     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01163 
01164 
01165       int pe = 0;
01166       int x,y,z;
01167 
01168                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01169                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01170                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01171       
01172                 // decide which pes to use by bit reversal and patch use
01173                 int i;
01174                 int ncpus = CkNumPes();
01175                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01176                 PatchMap *pmap = PatchMap::Object();
01177                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01178                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01179                         if ( ri ) { // keep 0 for special case
01180                                 // pretend pe 1 has patches to avoid placing extra PME load on node
01181                                 if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
01182                                 else nopatches.add(ri);
01183                         }
01184                 }
01185 
01186 #if USE_RANDOM_TOPO
01187             Random rand(CkMyPe());
01188             int *tmp = new int[patches.size()];
01189             int nn = patches.size();
01190             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01191             rand.reorder(tmp, nn);
01192             patches.resize(0);
01193             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01194             delete [] tmp;
01195             tmp = new int[nopatches.size()];
01196             nn = nopatches.size();
01197             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01198             rand.reorder(tmp, nn);
01199             nopatches.resize(0);
01200             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01201             delete [] tmp;
01202 #endif
01203 
01204                 // only use zero if it eliminates overloading or has patches
01205                 int useZero = 0;
01206                 int npens = xBlocks*yBlocks;
01207                 if ( npens % ncpus == 0 ) useZero = 1;
01208                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01209                 npens += xBlocks*zBlocks;
01210                 if ( npens % ncpus == 0 ) useZero = 1;
01211                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01212                 npens += yBlocks*zBlocks;
01213                 if ( npens % ncpus == 0 ) useZero = 1;
01214                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01215 
01216                 // add nopatches then patches in reversed order
01217                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01218                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01219                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01220                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01221   
01222                 int npes = pmeprocs.size();
01223                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01224                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01225 #if !USE_RANDOM_TOPO
01226                 zprocs.sort();
01227 #endif
01228                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01229                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01230 #if !USE_RANDOM_TOPO
01231                 yprocs.sort();
01232 #endif
01233       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01234       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01235 #if !USE_RANDOM_TOPO
01236       xprocs.sort();
01237 #endif
01238 
01239 #if USE_TOPO_SFC
01240   CmiLock(tmgr_lock);
01241   //{
01242   TopoManager tmgr;
01243   int xdim = tmgr.getDimNX();
01244   int ydim = tmgr.getDimNY();
01245   int zdim = tmgr.getDimNZ();
01246   int xdim1 = find_level_grid(xdim);
01247   int ydim1 = find_level_grid(ydim);
01248   int zdim1 = find_level_grid(zdim);
01249   if(CkMyPe() == 0)
01250       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01251 
01252   vector<Coord> result;
01253   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01254   sort_sfc(xprocs, tmgr, result);
01255   sort_sfc(yprocs, tmgr, result);
01256   sort_sfc(zprocs, tmgr, result);
01257   //}
01258   CmiUnlock(tmgr_lock);
01259 #endif
01260 
01261 
01262                 if(CkMyPe() == 0){  
01263               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01264           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01265 #if USE_TOPO_SFC
01266               int x,y,z,t;
01267               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01268               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01269 #else
01270               iout << " " << zprocs[i];
01271 #endif
01272           }
01273           if ( i < zprocs.size() ) iout << " ...";
01274               iout << "\n" << endi;
01275                 }
01276 
01277     if (CkMyRank() == 0) {
01278       for (pe=0, x = 0; x < xBlocks; ++x)
01279         for (y = 0; y < yBlocks; ++y, ++pe ) {
01280           pencilPMEProcessors[zprocs[pe]] = 1;
01281         }
01282     }
01283      
01284                 if(CkMyPe() == 0){  
01285               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01286           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01287 #if USE_TOPO_SFC
01288               int x,y,z,t;
01289               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01290               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01291 #else
01292               iout << " " << yprocs[i];
01293 #endif
01294           }
01295           if ( i < yprocs.size() ) iout << " ...";
01296               iout << "\n" << endi;
01297                 }
01298 
01299     if (CkMyRank() == 0) {
01300       for (pe=0, z = 0; z < zBlocks; ++z )
01301         for (x = 0; x < xBlocks; ++x, ++pe ) {
01302           pencilPMEProcessors[yprocs[pe]] = 1;
01303         }
01304     }
01305     
01306                 if(CkMyPe() == 0){  
01307                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01308                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01309 #if USE_TOPO_SFC
01310                 int x,y,z,t;
01311                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01312                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01313 #else
01314                 iout << " " << xprocs[i];
01315 #endif
01316             }
01317                 if ( i < xprocs.size() ) iout << " ...";
01318                 iout << "\n" << endi;
01319                 }
01320 
01321     if (CkMyRank() == 0) {
01322       for (pe=0, y = 0; y < yBlocks; ++y )      
01323         for (z = 0; z < zBlocks; ++z, ++pe ) {
01324           pencilPMEProcessors[xprocs[pe]] = 1;
01325         }
01326     }
01327         
01328 
01329         // creating the pencil arrays
01330         if ( CkMyPe() == 0 ){
01331 #if !USE_RANDOM_TOPO
01332         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01333         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01334         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01335         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01336 #endif
01337 #if 1
01338         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01339         CProxy_PmePencilMap ym;
01340         if ( simParams->PMEPencilsYLayout )
01341           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01342         else
01343           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01344         CProxy_PmePencilMap xm;
01345         if ( simParams->PMEPencilsXLayout )
01346           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01347         else
01348           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01349         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01350         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01351         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01352         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01353         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01354         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01355         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01356         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01357         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01358         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01359 #else
01360         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01361         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01362         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01363 
01364                 for (pe=0, x = 0; x < xBlocks; ++x)
01365                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01366                                 zPencil(x,y,0).insert(zprocs[pe]);
01367                         }
01368         zPencil.doneInserting();
01369 
01370                 for (pe=0, x = 0; x < xBlocks; ++x)
01371                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01372                                 yPencil(x,0,z).insert(yprocs[pe]);
01373                         }
01374         yPencil.doneInserting();
01375 
01376 
01377                 for (pe=0, y = 0; y < yBlocks; ++y )    
01378                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01379                                 xPencil(0,y,z).insert(xprocs[pe]);
01380                         }
01381                 xPencil.doneInserting();     
01382 #endif
01383 
01384                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01385                 PmePencilInitMsgData msgdata;
01386                 msgdata.grid = myGrid;
01387                 msgdata.xBlocks = xBlocks;
01388                 msgdata.yBlocks = yBlocks;
01389                 msgdata.zBlocks = zBlocks;
01390                 msgdata.xPencil = xPencil;
01391                 msgdata.yPencil = yPencil;
01392                 msgdata.zPencil = zPencil;
01393                 msgdata.pmeProxy = pmeProxyDir;
01394         msgdata.pmeNodeProxy = pmeNodeProxy;
01395         msgdata.xm = xm;
01396         msgdata.ym = ym;
01397         msgdata.zm = zm;
01398                 xPencil.init(new PmePencilInitMsg(msgdata));
01399                 yPencil.init(new PmePencilInitMsg(msgdata));
01400                 zPencil.init(new PmePencilInitMsg(msgdata));
01401         }
01402 
01403     return;  // continue in initialize_pencils() at next startup stage
01404   }
01405 
01406 
01407   int pe;
01408   int nx = 0;
01409   for ( pe = 0; pe < numGridPes; ++pe ) {
01410     localInfo[pe].x_start = nx;
01411     nx += myGrid.block1;
01412     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01413     localInfo[pe].nx = nx - localInfo[pe].x_start;
01414   }
01415   int ny = 0;
01416   for ( pe = 0; pe < numTransPes; ++pe ) {
01417     localInfo[pe].y_start_after_transpose = ny;
01418     ny += myGrid.block2;
01419     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01420     localInfo[pe].ny_after_transpose =
01421                         ny - localInfo[pe].y_start_after_transpose;
01422   }
01423 
01424   {  // decide how many pes this node exchanges charges with
01425 
01426   PatchMap *patchMap = PatchMap::Object();
01427   Lattice lattice = simParams->lattice;
01428   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01429   BigReal cutoff = simParams->cutoff;
01430   BigReal patchdim = simParams->patchDimension;
01431   int numPatches = patchMap->numPatches();
01432   int numNodes = CkNumPes();
01433   int *source_flags = new int[numNodes];
01434   int node;
01435   for ( node=0; node<numNodes; ++node ) {
01436     source_flags[node] = 0;
01437     recipPeDest[node] = 0;
01438   }
01439 
01440   // // make sure that we don't get ahead of ourselves on this node
01441   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01442   //   source_flags[CkMyPe()] = 1;
01443   //   recipPeDest[myRecipPe] = 1;
01444   // }
01445 
01446   for ( int pid=0; pid < numPatches; ++pid ) {
01447     int pnode = patchMap->node(pid);
01448 #ifdef NAMD_CUDA
01449     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01450 #endif
01451     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01452     BigReal minx = patchMap->min_a(pid);
01453     BigReal maxx = patchMap->max_a(pid);
01454     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01455     // min1 (max1) is smallest (largest) grid line for this patch
01456     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01457     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01458     for ( int i=min1; i<=max1; ++i ) {
01459       int ix = i;
01460       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01461       while ( ix < 0 ) ix += myGrid.K1;
01462       // set source_flags[pnode] if this patch sends to our node
01463       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01464            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01465         source_flags[pnode] = 1;
01466       }
01467       // set dest_flags[] for node that our patch sends to
01468 #ifdef NAMD_CUDA
01469       if ( offload ) {
01470         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01471           recipPeDest[ix / myGrid.block1] = 1;
01472         }
01473       } else
01474 #endif
01475       if ( pnode == CkMyPe() ) {
01476         recipPeDest[ix / myGrid.block1] = 1;
01477       }
01478     }
01479   }
01480 
01481   int numSourcesSamePhysicalNode = 0;
01482   numSources = 0;
01483   numDestRecipPes = 0;
01484   for ( node=0; node<numNodes; ++node ) {
01485     if ( source_flags[node] ) ++numSources;
01486     if ( recipPeDest[node] ) ++numDestRecipPes;
01487     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01488   }
01489 
01490 #if 0
01491   if ( numSources ) {
01492     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01493             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01494     iout << iINFO << "PME " << CkMyPe() << " sources:";
01495     for ( node=0; node<numNodes; ++node ) {
01496       if ( source_flags[node] ) iout << " " << node;
01497     }
01498     iout << "\n" << endi;
01499   }
01500 #endif
01501 
01502   delete [] source_flags;
01503 
01504   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01505   //           CkMyPe(), numSources, numDestRecipPes);
01506 
01507   }  // decide how many pes this node exchanges charges with (end)
01508 
01509   ungrid_count = numDestRecipPes;
01510 
01511   sendTransBarrier_received = 0;
01512 
01513   if ( myGridPe < 0 && myTransPe < 0 ) return;
01514   // the following only for nodes doing reciprocal sum
01515 
01516   if ( myTransPe >= 0 ) {
01517     recipEvirPe = findRecipEvirPe();
01518     pmeProxy[recipEvirPe].addRecipEvirClient();
01519   }
01520 
01521   if ( myTransPe >= 0 ) {
01522       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01523       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01524       #ifdef OPENATOM_VERSION
01525       if ( simParams->openatomOn ) { 
01526         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01527         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01528       } else {
01529         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01530       }
01531       #else  // OPENATOM_VERSION
01532       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01533       #endif // OPENATOM_VERSION
01534   }
01535 
01536   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01537   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01538   if ( local_size < local_size_2 ) local_size = local_size_2;
01539   qgrid = new float[local_size*numGrids];
01540   if ( numGridPes > 1 || numTransPes > 1 ) {
01541     kgrid = new float[local_size*numGrids];
01542   } else {
01543     kgrid = qgrid;
01544   }
01545   qgrid_size = local_size;
01546 
01547   if ( myGridPe >= 0 ) {
01548   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01549   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01550   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01551   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01552   }
01553 
01554   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01555 #ifdef NAMD_FFTW
01556   CmiLock(fftw_plan_lock);
01557 #ifdef NAMD_FFTW_3
01558   work = new fftwf_complex[n[0]];
01559   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01560   if ( myGridPe >= 0 ) {
01561     forward_plan_yz=new fftwf_plan[numGrids];
01562     backward_plan_yz=new fftwf_plan[numGrids];
01563   }
01564   if ( myTransPe >= 0 ) {
01565     forward_plan_x=new fftwf_plan[numGrids];
01566     backward_plan_x=new fftwf_plan[numGrids];
01567   }
01568   /* need one plan per grid */
01569   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01570   if ( myGridPe >= 0 ) {
01571     for( int g=0; g<numGrids; g++)
01572       {
01573         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01574                                                      localInfo[myGridPe].nx,
01575                                                      qgrid + qgrid_size * g,
01576                                                      NULL,
01577                                                      1,
01578                                                      myGrid.dim2 * myGrid.dim3,
01579                                                      (fftwf_complex *) 
01580                                                      (qgrid + qgrid_size * g),
01581                                                      NULL,
01582                                                      1,
01583                                                      myGrid.dim2 * (myGrid.dim3/2),
01584                                                      fftwFlags);
01585       }
01586   }
01587   int zdim = myGrid.dim3;
01588   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01589   if ( ! CkMyPe() ) iout << " 2..." << endi;
01590   if ( myTransPe >= 0 ) {
01591     for( int g=0; g<numGrids; g++)
01592       {
01593 
01594         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01595                                                 (fftwf_complex *)
01596                                                 (kgrid+qgrid_size*g),
01597                                                 NULL,
01598                                                 xStride,
01599                                                 1,
01600                                                 (fftwf_complex *)
01601                                                 (kgrid+qgrid_size*g),
01602                                                 NULL,
01603                                                 xStride,
01604                                                 1,
01605                                                 FFTW_FORWARD,fftwFlags);
01606         
01607       }
01608   }
01609   if ( ! CkMyPe() ) iout << " 3..." << endi;
01610   if ( myTransPe >= 0 ) {
01611     for( int g=0; g<numGrids; g++)
01612       {
01613         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01614                                                  (fftwf_complex *)
01615                                                  (kgrid+qgrid_size*g),
01616                                                  NULL,
01617                                                  xStride,
01618                                                  1,
01619                                                  (fftwf_complex *)
01620                                                  (kgrid+qgrid_size*g),
01621                                                  NULL,
01622                                                  xStride,
01623                                                  1,
01624                                                  FFTW_BACKWARD, fftwFlags);
01625 
01626       }
01627   }
01628   if ( ! CkMyPe() ) iout << " 4..." << endi;
01629   if ( myGridPe >= 0 ) {
01630     for( int g=0; g<numGrids; g++)
01631       {
01632         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01633                                                       localInfo[myGridPe].nx,
01634                                                       (fftwf_complex *)
01635                                                       (qgrid + qgrid_size * g),
01636                                                       NULL,
01637                                                       1,
01638                                                       myGrid.dim2*(myGrid.dim3/2),
01639                                                       qgrid + qgrid_size * g,
01640                                                       NULL,
01641                                                       1,
01642                                                       myGrid.dim2 * myGrid.dim3,
01643                                                       fftwFlags);
01644       }
01645   }
01646   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01647 
01648 #else
01649   work = new fftw_complex[n[0]];
01650 
01651   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01652   if ( myGridPe >= 0 ) {
01653   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01654         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01655         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01656   }
01657   if ( ! CkMyPe() ) iout << " 2..." << endi;
01658   if ( myTransPe >= 0 ) {
01659       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01660         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01661         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01662         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01663   }
01664   if ( ! CkMyPe() ) iout << " 3..." << endi;
01665   if ( myTransPe >= 0 ) {
01666   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01667         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01668         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01669         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01670   }
01671   if ( ! CkMyPe() ) iout << " 4..." << endi;
01672   if ( myGridPe >= 0 ) {
01673   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01674         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01675         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01676   }
01677   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01678 #endif
01679   CmiUnlock(fftw_plan_lock);
01680 #else
01681   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01682 #endif
01683 
01684   if ( myGridPe >= 0 && numSources == 0 )
01685                 NAMD_bug("PME grid elements exist without sources.");
01686   grid_count = numSources;
01687   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01688   trans_count = numGridPes;
01689 }
01690 
01691 
01692 
// Second startup stage for pencil-mode PME (reached via quiescence
// detection): determine which z-pencils this PE (or, with CUDA offload,
// this node) will send charge-grid data to, register with those pencils,
// and build the ordered list used when sending grids each step.
void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  delete msg;  // entry method owns the quiescence message; free it immediately
  if ( ! usePencils ) return;  // slab decomposition handled elsewhere

  SimParameters *simParams = Node::Object()->simParameters;

  PatchMap *patchMap = PatchMap::Object();
  Lattice lattice = simParams->lattice;
  // Lengths of the periodic cell along the a and b directions; used to
  // convert the patch margin from distance to fractional coordinates.
  BigReal sysdima = lattice.a_r().unit() * lattice.a();
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  int numPatches = patchMap->numPatches();

  // pencilActive[i*yBlocks+j] flags whether z-pencil (i,j) receives any
  // charges from a patch owned here.  NOTE(review): allocated with new[]
  // each call; presumably initialize_pencils runs once per startup.
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }

  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#ifdef NAMD_CUDA
    // With offload the whole node's patches funnel through one manager,
    // so consider any patch on this node; otherwise only patches on this PE.
    if ( offload ) {
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;

    // Shift to center the grid on the cell origin (order-dependent offset).
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;

    BigReal minx = patchMap->min_a(pid);
    BigReal maxx = patchMap->max_a(pid);
    // Atoms may drift up to half the patch-dimension slack beyond the
    // patch bounds before migration; widen the extent accordingly.
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // min1 (max1) is smallest (largest) grid line for this patch
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;

    BigReal miny = patchMap->min_b(pid);
    BigReal maxy = patchMap->max_b(pid);
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // min2 (max2) is smallest (largest) grid line for this patch
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    // Mark every pencil covered by the (periodically wrapped) grid-line
    // range of this patch in both x and y.
    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }

  // Count active pencils and announce ourselves to each so it knows how
  // many grid messages to expect.
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#ifdef NAMD_CUDA
        // Under offload only the device master PE registers with pencils.
        if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  // Collect the active (i,j) pairs into a dense send list.
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  // Order sends either deterministically (bit-reversed, to spread load)
  // or in a per-PE random order to avoid synchronized hotspots.
  if ( simParams->PMESendOrder ) {
    std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
  } else {
    Random rand(CkMyPe());
    rand.reorder(activePencils,numPencilsActive);
  }
  //if ( numPencilsActive ) {
  //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
  //}

  // Each active pencil later returns an ungridded-forces message.
  ungrid_count = numPencilsActive;
}
01786 
01787 
// Entry method (triggered via quiescence detection) that starts the pencil
// pipeline after all PEs have registered; only PE 0 issues the broadcast.
// NOTE(review): msg is never deleted here, unlike initialize_pencils above
// — presumably a harmless one-time leak, but worth confirming.
void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
  if ( ! usePencils ) return;
  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
}
01792 
01793 
// Release all manager-owned resources.  fftw_plan_lock is shared across
// the process, so only rank 0 destroys it.  q_list/fz_arr are only freed
// on the non-offload path (the offload path manages them differently).
ComputePmeMgr::~ComputePmeMgr() {

  if ( CmiMyRank() == 0 ) {
    CmiDestroyLock(fftw_plan_lock);
  }
  CmiDestroyLock(pmemgr_lock);

  delete myKSpace;
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  delete [] qgrid;
  // kgrid may alias qgrid (no separate transpose buffer); free only once.
  if ( kgrid != qgrid ) delete [] kgrid;
  delete [] work;
  delete [] gridmsg_reuse;

 if ( ! offload ) {
  for (int i=0; i<q_count; ++i) {
    delete [] q_list[i];
  }
  delete [] q_list;
  delete [] fz_arr;
 }
  delete [] f_arr;
  delete [] q_arr;
}
01826 
// Accumulate one source's charge contribution into this PE's slab of
// qgrid.  The message is kept in gridmsg_reuse so the same buffer can
// carry potentials back in sendUngridSubset().  When the last expected
// message arrives, start the first FFT stage (and the optional barrier).
void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  }
  if ( grid_count == numSources ) {
    // First message of the step carries the current cell and sequence.
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

  int zdim = myGrid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    // fgrid flags which (x,y) columns the sender touched; qmsg holds only
    // the z entries listed in zlist for each flagged column, in order.
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
      if ( f[i] ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
        }
      }
      q += zdim;
    }
  }

  gridmsg_reuse[numSources-grid_count] = msg;
  --grid_count;

  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
  }
}
01862 #ifdef MANUAL_DEBUG_FFTW3
01863 
01864 /* utility functions for manual debugging */
/**
 * Debug helper: dump a 3-D float matrix (x-major, then y, then z) to
 * "<infilename>_<pe>.out" as a "xdim ydim zdim" header followed by one
 * "i j k value" line per element.
 *
 * Fixed: the previous strncpy/strncat/sprintf sequence could leave the
 * format buffer unterminated (strncpy with a 999-char source) and could
 * overflow both buffers for long input names; snprintf guarantees bounds
 * and NUL-termination.  Also guards against fopen failure instead of
 * passing NULL to fprintf.
 */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
{
  char filename[1024];
  snprintf(filename, sizeof(filename), "%s_%d.out", infilename, pe);
  FILE *loutfile = fopen(filename, "w");
  if ( loutfile == NULL ) return;  // nothing we can do; debug-only path
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
  fclose(loutfile);
}
01885 
01886 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01887 {
01888   char fmt[1000];
01889   char filename[1000];
01890   strncpy(fmt,infilename,999);
01891   strncat(fmt,"_%d_%d_%d.out",999);
01892   sprintf(filename,fmt, x,y,z);
01893   FILE *loutfile = fopen(filename, "w");
01894   CkAssert(loutfile!=NULL);
01895   CkPrintf("opened %s for dump\n",filename);
01896   fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01897   for(int i=0;i<xdim;i++)
01898     for(int j=0;j<ydim;j++)
01899       for(int k=0;k<zdim;k++)
01900         fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01901   fclose(loutfile);
01902 }
01903 
01904 #endif
01905 
// First FFT stage: in-place real-to-complex transform over the y,z
// dimensions of this PE's x-slab, for each grid.  When no send barrier is
// used, proceed directly to the transpose sends.
void ComputePmeMgr::gridCalc1(void) {
  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());

#ifdef NAMD_FFTW
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_yz[g]);
#else
    // FFTW2: nx independent 2-D (dim2 x dim3) real-to-complex transforms.
    rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
        qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
#endif

  }
#endif

  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
}
01923 
01924 void ComputePmeMgr::sendTransBarrier(void) {
01925   sendTransBarrier_received += 1;
01926   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01927   if ( sendTransBarrier_received < numGridPes ) return;
01928   sendTransBarrier_received = 0;
01929   for ( int i=0; i<numGridPes; ++i ) {
01930     pmeProxyDir[gridPeMap[i]].sendTrans();
01931   }
01932 }
01933 
01934 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
01935   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
01936   mgr->sendTransSubset(first, last);
01937 }
01938 
// Kick off the slab transpose: expect one untranspose reply from every
// trans PE, then send this slab's data to all trans nodes, optionally
// parallelized across the SMP node with CkLoop.
void ComputePmeMgr::sendTrans(void) {

  untrans_count = numTransPes;

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
    CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
  } else
#endif
  {
    sendTransSubset(0, numTransNodes-1);
  }

}
01954 
// Package this PE's x-slab for the transpose and deliver it to trans nodes
// [first,last] (indices into the shuffled transNodeOrder).  Data destined
// for the local node is copied directly into each co-resident manager's
// kgrid, and the message is forwarded with nx=0 purely for bookkeeping.
void ComputePmeMgr::sendTransSubset(int first, int last) {
  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());

  // send data for transpose
  int zdim = myGrid.dim3;
  int nx = localInfo[myGridPe].nx;
  int x_start = localInfo[myGridPe].x_start;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  for (int j=first; j<=last; j++) {
    int node = transNodeOrder[j];  // different order on each node
    int pe = transNodeInfo[node].pe_start;
    int npe = transNodeInfo[node].npe;
    int totlen = 0;
    // Payload size is only needed for remote nodes; the local node's data
    // bypasses the message entirely (direct copies below).
    if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.ny_after_transpose * zdim;
      totlen += cpylen;
    }
    PmeTransMsg *newmsg = new (nx * totlen * numGrids,
                                PRIORITY_SIZE) PmeTransMsg;
    newmsg->sourceNode = myGridPe;
    newmsg->lattice = lattice;
    newmsg->x_start = x_start;
    newmsg->nx = nx;
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + nx * totlen * g;
      pe = transNodeInfo[node].pe_start;
      for (int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        int cpylen = li.ny_after_transpose * zdim;
        if ( node == myTransNode ) {
          // Local destination: write straight into the peer's kgrid.
          ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
          qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
        }
        // Gather the destination PE's y-range from each x-slice of qgrid.
        float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
        for ( int x = 0; x < nx; ++x ) {
          CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
          q += slicelen;
          qmsg += cpylen;
        }
      }
    }
    newmsg->sequence = grid_sequence;
    SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
    // nx=0 marks a bookkeeping-only message (data already copied locally).
    if ( node == myTransNode ) newmsg->nx = 0;
    if ( npe > 1 ) {
      if ( node == myTransNode ) fwdSharedTrans(newmsg);
      else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
    } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
  }
}
02013 
// Fan a single PmeTransMsg out to every trans PE on this node.  The shared
// message is reference-counted via a heap counter guarded by a freshly
// created lock; the last receiver frees everything (see recvSharedTrans).
void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
  int pe = transNodeInfo[myTransNode].pe_start;
  int npe = transNodeInfo[myTransNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for (int i=0; i<npe; ++i, ++pe) {
    PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
    SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
    shmsg->msg = msg;
    shmsg->count = count;
    shmsg->lock = lock;
    pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
  }
}
02029 
// Process a node-shared transpose message.  The shared counter is
// decremented under the shared lock; the last of the npe receivers tears
// down the lock, the counter, and the wrapped message.
void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
  procTrans(msg->msg);
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
02042 
// Unshared (single-PE destination) delivery path for transpose data.
void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
  procTrans(msg);
  delete msg;
}
02047 
// Accumulate one grid PE's transposed slab into kgrid.  Messages with
// nx == 0 are bookkeeping-only (their data was copied locally by the
// sender).  When all grid PEs have reported, continue with gridCalc2.
void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
  if ( trans_count == numGridPes ) {
    // First message of the step carries the current cell and sequence.
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

 if ( msg->nx ) {
  int zdim = myGrid.dim3;
  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
  int first_pe = nodeInfo.pe_start;
  int last_pe = first_pe+nodeInfo.npe-1;
  // Offset of this PE's y-range within the node-wide message payload.
  int y_skip = localInfo[myTransPe].y_start_after_transpose
             - localInfo[first_pe].y_start_after_transpose;
  // Total y extent carried in the message (covers the whole node).
  int ny_msg = localInfo[last_pe].y_start_after_transpose
             + localInfo[last_pe].ny_after_transpose
             - localInfo[first_pe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;
  int x_start = msg->x_start;
  int nx = msg->nx;
  for ( int g=0; g<numGrids; ++g ) {
    CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
        (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
        nx*ny*zdim*sizeof(float));
  }
 }

  --trans_count;

  if ( trans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc2();
  }
}
02081 
// Second FFT stage: finish the forward transform along x for each grid,
// then hand off to the reciprocal-space solver (or to the OpenAtom/MOA
// path when that coupling is compiled in and enabled).
void ComputePmeMgr::gridCalc2(void) {
  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  for ( int g=0; g<numGrids; ++g ) {
    // finish forward FFT (x dimension)
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_x[g]);
#else
    // FFTW2: ny*zdim/2 interleaved 1-D complex transforms along x.
    fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#endif
#endif
  }

#ifdef OPENATOM_VERSION
    if ( ! simParams -> openatomOn ) { 
#endif // OPENATOM_VERSION
      gridCalc2R();
#ifdef OPENATOM_VERSION
    } else {
      gridCalc2Moa();
    }
#endif // OPENATOM_VERSION
}
02115 
#ifdef OPENATOM_VERSION
// OpenAtom coupling path: ship each grid's k-space data to the MOA
// manager; gridCalc2R resumes via the registered callback.
void ComputePmeMgr::gridCalc2Moa(void) {

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  SimParameters *simParams = Node::Object()->simParameters;

  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);

  for ( int g=0; g<numGrids; ++g ) {
    #ifdef OPENATOM_VERSION_DEBUG 
    CkPrintf("Sending recQ on processor %d \n", CkMyPe());
    for ( int i=0; i<=(ny * zdim / 2); ++i) 
    {
      CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
    }
    #endif // OPENATOM_VERSION_DEBUG
//     mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
    CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
    moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
  }
}
#endif // OPENATOM_VERSION
02141 
// Reciprocal-space stage: for each grid, compute the Ewald reciprocal
// energy and virial from the transformed data via PmeKSpace (optionally
// CkLoop-threaded inside compute_energy), then start the backward FFT
// along x and move on to sendUntrans.
void ComputePmeMgr::gridCalc2R(void) {

  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * numTransPes ) {
    useCkLoop = 1;
  }
#endif

  int zdim = myGrid.dim3;
  // int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;

  for ( int g=0; g<numGrids; ++g ) {
    // reciprocal space portion of PME
    BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
    // evir[0] is the energy; virial components land in evir[1..].
    recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
                        lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
    // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);

    // start backward FFT (x dimension)

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_x[g]);
#else
    fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#endif
#endif
  }
  
  pmeProxyDir[CkMyPe()].sendUntrans();
}
02177 
02178 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
02179   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02180   mgr->sendUntransSubset(first, last);
02181 }
02182 
// After the k-space work: reset trans_count for the next step, send this
// PE's energy/virial to the reduction PE (urgent send, so it is not stuck
// behind bulk data), then scatter potential slabs back to the grid nodes,
// optionally CkLoop-parallelized.
void ComputePmeMgr::sendUntrans(void) {

  trans_count = numGridPes;

  { // send energy and virial
    PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
    for ( int g=0; g<numGrids; ++g ) {
      newmsg->evir[g] = recip_evir2[g];
    }
    SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
    CmiEnableUrgentSend(0);
  }

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
    CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
  } else
#endif
  {
    sendUntransSubset(0, numGridNodes-1);
  }

}
02209 
// Send this trans PE's y-slab of potentials back to grid nodes
// [first,last] (indices into gridNodeOrder).  As in sendTransSubset, data
// bound for co-resident managers is copied directly into their qgrid and
// the message is forwarded with ny=0 purely for bookkeeping.
void ComputePmeMgr::sendUntransSubset(int first, int last) {

  int zdim = myGrid.dim3;
  int y_start = localInfo[myTransPe].y_start_after_transpose;
  int ny = localInfo[myTransPe].ny_after_transpose;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  // send data for reverse transpose
  for (int j=first; j<=last; j++) {
    int node = gridNodeOrder[j];  // different order on each node
    int pe = gridNodeInfo[node].pe_start;
    int npe = gridNodeInfo[node].npe;
    int totlen = 0;
    // Payload size is only needed for remote nodes; the local node's data
    // bypasses the message entirely (direct copies below).
    if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.nx * zdim;
      totlen += cpylen;
    }
    PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
    newmsg->sourceNode = myTransPe;
    newmsg->y_start = y_start;
    newmsg->ny = ny;
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + ny * totlen * g;
      pe = gridNodeInfo[node].pe_start;
      for (int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        if ( node == myGridNode ) {
          // Local destination: strided scatter straight into the peer's
          // qgrid (one y-range per x-slice).
          ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
          qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
          float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
          int cpylen = ny * zdim;
          for ( int x = 0; x < li.nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
            q += cpylen;
            qmsg += slicelen;
          }
        } else {
          // Remote destination: contiguous block copy into the message.
          CmiMemcpy((void*)qmsg,
                (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
                li.nx*ny*zdim*sizeof(float));
          qmsg += li.nx*ny*zdim;
        }
      }
    }
    SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
    // ny=0 marks a bookkeeping-only message (data already copied locally).
    if ( node == myGridNode ) newmsg->ny = 0;
    if ( npe > 1 ) {
      if ( node == myGridNode ) fwdSharedUntrans(newmsg);
      else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
    } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
  }
}
02269 
// Fan a single PmeUntransMsg out to every grid PE on this node, using the
// same shared-count/lock scheme as fwdSharedTrans (no message priority is
// set on these wrappers).
void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
  int pe = gridNodeInfo[myGridNode].pe_start;
  int npe = gridNodeInfo[myGridNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for (int i=0; i<npe; ++i, ++pe) {
    PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
    shmsg->msg = msg;
    shmsg->count = count;
    shmsg->lock = lock;
    pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
  }
}
02283 
// Process a node-shared reverse-transpose message; the last of the npe
// receivers tears down the shared lock, counter, and wrapped message.
void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
  procUntrans(msg->msg);
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
02296 
// Unshared (single-PE destination) delivery path for reverse-transpose data.
void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  procUntrans(msg);
  delete msg;
}
02301 
// Copy one trans PE's y-slab of potentials back into this grid PE's qgrid
// (ny == 0 messages are bookkeeping-only).  When every trans PE has
// reported, proceed to the final backward FFT in gridCalc3.
void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
  int first_pe = nodeInfo.pe_start;
  int g;

 if ( msg->ny ) {
  int zdim = myGrid.dim3;
  int last_pe = first_pe+nodeInfo.npe-1;
  // Offset of this PE's x-range within the node-wide message payload.
  int x_skip = localInfo[myGridPe].x_start
             - localInfo[first_pe].x_start;
  // Total x extent carried in the message (covers the whole node).
  int nx_msg = localInfo[last_pe].x_start
             + localInfo[last_pe].nx
             - localInfo[first_pe].x_start;
  int nx = localInfo[myGridPe].nx;
  int y_start = msg->y_start;
  int ny = msg->ny;
  int slicelen = myGrid.K2 * zdim;
  int cpylen = ny * zdim;
  for ( g=0; g<numGrids; ++g ) {
    // Scatter the contiguous message rows into the strided qgrid slices.
    float *q = qgrid + qgrid_size * g + y_start * zdim;
    float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
    for ( int x = 0; x < nx; ++x ) {
      CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
      q += slicelen;
      qmsg += cpylen;
    }
  }
 }

  --untrans_count;

  if ( untrans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc3();
  }
}
02343 
// Final FFT stage: backward complex-to-real transform over y,z for each
// grid, after which qgrid again holds real-space values ready to be sent
// back to the charge sources.
void ComputePmeMgr::gridCalc3(void) {
  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());

  // finish backward FFT
#ifdef NAMD_FFTW
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_yz[g]);
#else
    // FFTW2: nx independent 2-D complex-to-real transforms.
    rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
        (fftw_complex *) (qgrid + qgrid_size * g),
        1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
#endif
  }

#endif

  pmeProxyDir[CkMyPe()].sendUngrid();
}
02363 
02364 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
02365   ComputePmeMgr *mgr = (ComputePmeMgr *)param;
02366   mgr->sendUngridSubset(first, last);
02367 }
02368 
// Return potentials to all charge sources (optionally CkLoop-parallel;
// this one is synchronized because qgrid is reset immediately afterward),
// then restore grid_count and zero qgrid for the next step's charges.
void ComputePmeMgr::sendUngrid(void) {

#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
    CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
  } else
#endif
  {
    sendUngridSubset(0, numSources-1);
  }

  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
}
02384 
// Refill the messages saved by recvGrid() with potential values for
// exactly the grid points each source originally contributed, then send
// them back with elevated priority.  With offload, delivery routes
// through the node group so the destination node's manager handles it.
void ComputePmeMgr::sendUngridSubset(int first, int last) {

#ifdef NAMD_CUDA
  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
#else
  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
#endif

  for ( int j=first; j<=last; ++j ) {
    // int msglen = qgrid_len;
    PmeGridMsg *newmsg = gridmsg_reuse[j];
    int pe = newmsg->sourceNode;
    int zdim = myGrid.dim3;
    int flen = newmsg->len;
    int fstart = newmsg->start;
    int zlistlen = newmsg->zlistlen;
    int *zlist = newmsg->zlist;
    float *qmsg = newmsg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      // Mirror of the gather in recvGrid: for each column flagged in
      // fgrid, pack the z entries listed in zlist.
      char *f = newmsg->fgrid + fgrid_len * g;
      float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
      for ( int i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[zlist[k]];
          }
        }
        q += zdim;
      }
    }
    newmsg->sourceNode = myGridPe;

    SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
#ifdef NAMD_CUDA
    if ( offload ) {
      pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
    } else
#endif
    pmeProxyDir[pe].recvUngrid(newmsg);
    CmiEnableUrgentSend(0);
  }
}
02428 
// Receive potentials back from a grid PE or pencil, scatter them into the
// per-patch arrays, then count the arrival via recvAck.  The ordering
// check is skipped when offloading because ungrid_count is then shared
// and guarded by cuda_lock (checked inside recvAck instead).
void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
#ifdef NAMD_CUDA
  if ( ! offload )  // would need lock
#endif
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  }

  if ( usePencils ) copyPencils(msg);
  else copyResults(msg);
  delete msg;
  recvAck(0);
}
02443 
// Count one returned ungrid contribution (msg is null when called
// directly from recvUngrid).  On the offload path the shared counter is
// updated under cuda_lock and the master PE is told to run ungridCalc;
// otherwise each manager tracks its own count and triggers locally.
void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
  if ( msg ) delete msg;
#ifdef NAMD_CUDA
  if ( offload ) {
    CmiLock(cuda_lock);
    if ( ungrid_count == 0 ) {
      NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
    }
    int uc = --ungrid_count;
    CmiUnlock(cuda_lock);

    if ( uc == 0 ) {
      pmeProxyDir[master_pe].ungridCalc();
    }
    return;
  }
#endif
  --ungrid_count;

  if ( ungrid_count == 0 ) {
    pmeProxyDir[CkMyPe()].ungridCalc();
  }
}
02467 
02468 #ifdef NAMD_CUDA
02469 #define count_limit 1000000
02470 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
02471 #define EVENT_STRIDE 10
02472 
02473 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
02474 
// Polled callback that drains completed PME force batches from the GPU.
// Each end_forces event covers up to EVENT_STRIDE computes; for every
// completed event, the corresponding computes are enqueued for host-side
// work.  Re-arms itself until all forces are done; dies after count_limit
// consecutive unproductive polls.
void cuda_check_pme_forces(void *arg, double walltime) {
  ComputePmeMgr *argp = (ComputePmeMgr *) arg;

 while ( 1 ) { // process multiple events per call
  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
  if ( err == cudaSuccess ) {
    // Progress was made; reset the stall counter.
    argp->check_forces_count = 0;
    for ( int i=0; i<EVENT_STRIDE; ++i ) {
      WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
      if ( ++(argp->forces_done_count) == argp->forces_count ) break;
    }
    if ( argp->forces_done_count == argp->forces_count ) { // last event
      traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
      // Convert the stored start timestamp into an elapsed time.
      argp->forces_time = walltime - argp->forces_time;
      //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
      return;
    } else { // more events
      continue; // check next event
    }
  } else if ( err != cudaErrorNotReady ) {
    cuda_errcheck("in cuda_check_pme_forces");
    NAMD_bug("cuda_errcheck missed error in cuda_check_pme_forces");
  } else if ( ++(argp->check_forces_count) >= count_limit ) {
    char errmsg[256];
    sprintf(errmsg,"cuda_check_pme_forces polled %d times over %f s on seq %d",
            argp->check_forces_count, walltime - argp->forces_time,
            argp->saved_sequence);
    cuda_errcheck(errmsg);
    NAMD_die(errmsg);
  } else {
    break; // call again
  }
 } // while ( 1 )
 CcdCallBacksReset(0,walltime);  // fix Charm++
 CUDA_POLL(cuda_check_pme_forces, arg);
}
02511 #endif // NAMD_CUDA
02512 
// Turn returned potentials into per-atom forces.  Host path: enqueue each
// compute's ungridForces work.  Offload path: the master manager uploads
// the potential grid once, fans out to the node's other managers, batches
// the node's computes into EVENT_STRIDE-sized kernel launches on the CUDA
// stream, and arms polling for completion.  Finally resets ungrid_count
// for the next step.
void ComputePmeMgr::ungridCalc(void) {
  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());

  ungridForcesCount = pmeComputes.size();

#ifdef NAMD_CUDA
 if ( offload ) {
  //CmiLock(cuda_lock);
  cudaSetDevice(deviceCUDA->getDeviceID());

  if ( this == masterPmeMgr ) {
    // Master uploads the potential grid once for the whole node and
    // records an event the per-rank streams wait on below.
    double before = CmiWallTimer();
    cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
    cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
    traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());

    const int myrank = CkMyRank();
    for ( int i=0; i<CkMyNodeSize(); ++i ) {
      if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
        nodePmeMgr->mgrObjects[i]->ungridCalc();
      }
    }
    if ( ! pmeComputes.size() ) return;
  }

  if ( ! end_forces ) {
    // One completion event per EVENT_STRIDE computes (lazy, first step).
    int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
    end_forces = new cudaEvent_t[n];
    for ( int i=0; i<n; ++i ) {
      cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
    }
  }

  const int pcsz = pmeComputes.size();
  if ( ! afn_host ) {
    // Pinned host + device arrays of (atom, forceX, forceYZ) pointers.
    cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
    cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
    cuda_errcheck("malloc params for pme");
  }
  int totn = 0;
  for ( int i=0; i<pcsz; ++i ) {
    int n = pmeComputes[i]->numGridAtoms[0];
    totn += n;
  }
  if ( totn > f_data_mgr_alloc ) {
    // Grow the shared force buffers with 20% headroom.
    if ( f_data_mgr_alloc ) {
      CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
      cudaFree(f_data_mgr_dev);
      cudaFreeHost(f_data_mgr_host);
    }
    f_data_mgr_alloc = 1.2 * (totn + 100);
    cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
    cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
    cuda_errcheck("malloc forces for pme");
  }
  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
  // Carve the shared buffers into per-compute slices and build the
  // pointer table the kernel consumes.
  float *f_dev = f_data_mgr_dev;
  float *f_host = f_data_mgr_host;
  for ( int i=0; i<pcsz; ++i ) {
    int n = pmeComputes[i]->numGridAtoms[0];
    pmeComputes[i]->f_data_dev = f_dev;
    pmeComputes[i]->f_data_host = f_host;
    afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
    afn_host[3*i+1] = f_dev;
    afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
    f_dev += 3*n;
    f_host += 3*n;
  }
  //CmiLock(cuda_lock);
  double before = CmiWallTimer();
  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  // Do not start kernels until the master's potential upload completes.
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
  traceUserEvent(CUDA_EVENT_ID_PME_TICK);

  for ( int i=0; i<pcsz; ++i ) {
    // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
    if ( i%EVENT_STRIDE == 0 ) {
      // Launch one kernel + copy-back + event per batch of EVENT_STRIDE
      // computes (the tail batch may be smaller).
      int dimy = pcsz - i;
      if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
      int maxn = 0;
      int subtotn = 0;
      for ( int j=0; j<dimy; ++j ) {
        int n = pmeComputes[i+j]->numGridAtoms[0];
        subtotn += n;
        if ( n > maxn ) maxn = n;
      }
      // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
      before = CmiWallTimer();
      cuda_pme_forces(
        bspline_coeffs_dev,
        v_arr_dev, afn_dev+3*i, dimy, maxn, /*
        pmeComputes[i]->a_data_dev,
        pmeComputes[i]->f_data_dev,
        n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
        streams[stream]);
      traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
      before = CmiWallTimer();
      cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
        cudaMemcpyDeviceToHost, streams[stream]);
      traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
      cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
      traceUserEvent(CUDA_EVENT_ID_PME_TICK);
    }
    // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
  }
  //CmiUnlock(cuda_lock);
 } else
#endif // NAMD_CUDA
 {
  for ( int i=0; i<pmeComputes.size(); ++i ) {
    WorkDistrib::messageEnqueueWork(pmeComputes[i]);
    // pmeComputes[i]->ungridForces();
  }
 }
  // submitReductions();  // must follow all ungridForces()

#ifdef NAMD_CUDA
 if ( offload ) {
  // Record the poll start time and arm the completion polling loop.
  forces_time = CmiWallTimer();
  forces_count = ungridForcesCount;
  forces_done_count = 0;
  pmeProxy[this_pe].pollForcesReady();
 }
#endif

  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
}
02641 
// Begin polling (via the Charm++ conditional-callback mechanism) for
// completion of the asynchronous CUDA force kernels and device-to-host
// copies launched by ungridCalc().  Only meaningful in CUDA builds with
// PME offload; calling it otherwise is a programming error.
void ComputePmeMgr::pollForcesReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
  CUDA_POLL(cuda_check_pme_forces,this);
#else
  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
#endif
}
02650 
// Flag that the patch's atom set changed; per-atom data is rebuilt lazily.
void ComputePme::atomUpdate() { atomsChanged = 1; }
02652 
// Per-patch PME compute object.  Registers itself with the processor's
// ComputePmeMgr and caches the PME grid parameters from SimParameters.
ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
{
  DebugM(4,"ComputePme created.\n");
  basePriority = PME_PRIORITY;
  setNumPatches(1);

  // Register with the local (same-PE) branch of the PME manager group.
  CProxy_ComputePmeMgr::ckLocalBranch(
        CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);

  SimParameters *simParams = Node::Object()->simParameters;

  qmForcesOn =  simParams->qmForcesOn;
  offload = simParams->PMEOffload;

  // Remember how many grids to clean up in the destructor, in case
  // numGrids changes later.
  numGridsMax = numGrids;

  // Cache grid dimensions; dim3 is padded for the in-place real-to-complex FFT.
  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

#ifdef NAMD_CUDA
  cuda_atoms_offset = 0;
  f_data_host = 0;
  f_data_dev = 0;
 if ( ! offload )
#endif
 {
  // CPU path only: per-grid real-space charge-spreading helpers.
  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
 }

  atomsChanged = 0;

  // QM index/charge buffers are allocated on demand in doQMWork().
  qmLoclIndx = 0;
  qmLocalCharges = 0;
}
02691 
// Bind this compute to its patch and register the position/force
// communication boxes.  For CUDA offload, also contribute this patch's
// atom count to the manager's running total used to size device buffers.
void ComputePme::initialize() {
  if (!(patch = PatchMap::Object()->patch(patchID))) {
    NAMD_bug("ComputePme used with unknown patch.");
  }
  positionBox = patch->registerPositionPickup(this);
  avgPositionBox = patch->registerAvgPositionPickup(this);
  forceBox = patch->registerForceDeposit(this);
#ifdef NAMD_CUDA
 if ( offload ) {
  myMgr->cuda_atoms_count += patch->getNumAtoms();
 }
#endif
}
02705 
// One-time setup performed after all ComputePme objects have registered.
// Chooses a per-node master PE (CUDA builds), sizes the charge-grid
// bookkeeping arrays, and — on the first rank of a node doing CUDA
// offload — allocates and wires up all shared host/device grid buffers,
// which later ranks then copy pointers to.
void ComputePmeMgr::initialize_computes() {

  noWorkCount = 0;
  doWorkCount = 0;
  ungridForcesCount = 0;

  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);

  SimParameters *simParams = Node::Object()->simParameters;

  strayChargeErrors = 0;

#ifdef NAMD_CUDA
 // Pick a master PE on this node: prefer a PE that has patches and is
 // not the CUDA nonbonded master PE, so PME work does not pile onto the
 // PE that already prepares nonbonded data.
 PatchMap *patchMap = PatchMap::Object();
 int pe = master_pe = CkNodeFirst(CkMyNode());
 for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
    if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
    if ( ! patchMap->numPatchesOnNode(pe) ) continue;
    if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
    if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
    if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
        && pe != deviceCUDA->getMasterPe() ) {
      master_pe = pe;
    }
 }
 if ( ! patchMap->numPatchesOnNode(master_pe) ) {
   NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
 }

 masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
 bool cudaFirst = 1;
 if ( offload ) {
  // The lock is held from here until the end of the offload branch below;
  // the first rank through does the shared allocations.
  CmiLock(cuda_lock);
  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
 }

 if ( cudaFirst ) {
  nodePmeMgr->master_pe = master_pe;
  nodePmeMgr->masterPmeMgr = masterPmeMgr;
 }
#endif

  // qsize: full padded grid; fsize: one x-y plane's worth of column flags.
  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
  fsize = myGrid.K1 * myGrid.dim2;
  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
#ifdef NAMD_CUDA
 if ( ! offload )
#endif
 {
  // CPU path: per-column charge pointers (q_arr) plus a compact list of
  // the columns actually in use (q_list / q_count).
  q_arr = new float*[fsize*numGrids];
  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
  q_list = new float*[fsize*numGrids];
  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
  q_count = 0;
 }

#ifdef NAMD_CUDA
 if ( cudaFirst || ! offload ) {
#endif
  // f_arr marks which grid columns this PE contributes to: 2 = unused,
  // 0 = active (cleared below for the destinations we actually send to).
  f_arr = new char[fsize*numGrids];
  // memset to non-zero value has race condition on BlueGene/Q
  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;

  for ( int g=0; g<numGrids; ++g ) {
    char *f = f_arr + g*fsize;
    if ( usePencils ) {
      // Pencil decomposition: enable the (i,j) rectangles owned by each
      // active z-pencil.
      int K1 = myGrid.K1;
      int K2 = myGrid.K2;
      int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
      int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
      int dim2 = myGrid.dim2;
      for (int ap=0; ap<numPencilsActive; ++ap) {
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
            f[i*dim2+j] = 0;
          }
        }
      }
    } else {
      // Slab decomposition: enable contiguous slabs for each grid PE we
      // actually send to, clamped to the real grid size.
      int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
      bsize = block1 * myGrid.dim2 * myGrid.dim3;
      for (int pe=0; pe<numGridPes; pe++) {
        if ( ! recipPeDest[pe] ) continue;
        int start = pe * bsize;
        int len = bsize;
        if ( start >= qsize ) { start = 0; len = 0; }
        if ( start + len > qsize ) { len = qsize - start; }
        int zdim = myGrid.dim3;
        int fstart = start / zdim;
        int flen = len / zdim;
        memset(f + fstart, 0, flen*sizeof(char));
        // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
      }
    }
  }
#ifdef NAMD_CUDA
 }
 if ( offload ) {
 cudaSetDevice(deviceCUDA->getDeviceID());
 if ( cudaFirst ) {

  // Count active columns so only those get device/host charge storage.
  int f_alloc_count = 0;
  for ( int n=fsize, i=0; i<n; ++i ) {
    if ( f_arr[i] == 0 ) {
      ++f_alloc_count;
    }
  }
  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);

  q_arr = new float*[fsize*numGrids];
  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );

  float **q_arr_dev_host = new float*[fsize];
  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));

  float **v_arr_dev_host = new float*[fsize];
  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));

  // Each column is K3 plus (order-1) wrap-around padding.
  int q_stride = myGrid.K3+myGrid.order-1;
  q_data_size = f_alloc_count * q_stride * sizeof(float);
  ffz_size = (fsize + q_stride) * sizeof(int);

  // tack ffz onto end of q_data to allow merged transfer
  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
  cudaMalloc((void**) &v_data_dev, q_data_size);
  cuda_errcheck("malloc grid data for pme");
  cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);

  // Second pass: hand out column pointers within the packed buffers for
  // active columns, null for inactive ones.
  f_alloc_count = 0;
  for ( int n=fsize, i=0; i<n; ++i ) {
    if ( f_arr[i] == 0 ) {
      q_arr[i] = q_data_host + f_alloc_count * q_stride;
      q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
      v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
      ++f_alloc_count;
    } else {
      q_arr[i] = 0;
      q_arr_dev_host[i] = 0;
      v_arr_dev_host[i] = 0;
    }
  }

  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
  delete [] q_arr_dev_host;
  delete [] v_arr_dev_host;
  // Replace the multi-grid flag array with the single-grid flags plus
  // per-z flags (fz_arr) used by the offload path.
  delete [] f_arr;
  f_arr = new char[fsize + q_stride];
  fz_arr = f_arr + fsize;
  memset(f_arr, 0, fsize + q_stride);
  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));

  cuda_errcheck("initialize grid data for pme");

  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
  cuda_errcheck("initialize bspline coefficients for pme");

  // Publish the shared pointers/sizes on the node's master manager so
  // later ranks (the else branch below) can copy them.
#define XCOPY(X) masterPmeMgr->X = X;
  XCOPY(bspline_coeffs_dev)
  XCOPY(bspline_dcoeffs_dev)
  XCOPY(q_arr)
  XCOPY(q_arr_dev)
  XCOPY(v_arr_dev)
  XCOPY(q_data_size)
  XCOPY(q_data_host)
  XCOPY(q_data_dev)
  XCOPY(v_data_dev)
  XCOPY(ffz_size)
  XCOPY(ffz_host)
  XCOPY(ffz_dev)
  XCOPY(f_arr)
  XCOPY(fz_arr)
#undef XCOPY
  //CkPrintf("pe %d init first\n", CkMyPe());
 } else { // cudaFirst
  //CkPrintf("pe %d init later\n", CkMyPe());
  // Not the first rank on the node: adopt the shared buffers allocated above.
#define XCOPY(X) X = masterPmeMgr->X;
  XCOPY(bspline_coeffs_dev)
  XCOPY(bspline_dcoeffs_dev)
  XCOPY(q_arr)
  XCOPY(q_arr_dev)
  XCOPY(v_arr_dev)
  XCOPY(q_data_size)
  XCOPY(q_data_host)
  XCOPY(q_data_dev)
  XCOPY(v_data_dev)
  XCOPY(ffz_size)
  XCOPY(ffz_host)
  XCOPY(ffz_dev)
  XCOPY(f_arr)
  XCOPY(fz_arr)
#undef XCOPY
 } // cudaFirst
  CmiUnlock(cuda_lock);
 } else // offload
#endif // NAMD_CUDA
 {
  // CPU path: per-z flags for the padded column length.
  fz_arr = new char[myGrid.K3+myGrid.order-1];
 }

#if 0 && USE_PERSISTENT
  recvGrid_handle = NULL;
#endif
}
02925 
02926 ComputePme::~ComputePme()
02927 {
02928 #ifdef NAMD_CUDA
02929   if ( ! offload )
02930 #endif
02931   {
02932     for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
02933   }
02934 }
02935 
#if 0 && USE_PERSISTENT 
// Disabled experimental path: pre-create compressed persistent-message
// handles for sending grid data to each active z-pencil, sizing each
// message from the current per-pencil flag counts and z-list length.
void ComputePmeMgr::setup_recvgrid_persistent() 
{
    int K1 = myGrid.K1;
    int K2 = myGrid.K2;
    int dim2 = myGrid.dim2;
    int dim3 = myGrid.dim3;
    int block1 = myGrid.block1;
    int block2 = myGrid.block2;

    CkArray *zPencil_local = zPencil.ckLocalBranch();
    recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
    for (int ap=0; ap<numPencilsActive; ++ap) {
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);
        // f is changing
        int fcount = 0;
        for ( int g=0; g<numGrids; ++g ) {
            char *f = f_arr + g*fsize;
            for ( int i=ibegin; i<iend; ++i ) {
                for ( int j=jbegin; j<jend; ++j ) {
                    fcount += f[i*dim2+j];
                }
            }
        }
        // Count z-planes with any charge contribution.
        int zlistlen = 0;
        for ( int i=0; i<myGrid.K3; ++i ) {
            if ( fz_arr[i] ) ++zlistlen;
        }
        int hd = ( fcount? 1 : 0 );  // has data?
        int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
        // Only the float payload after the headers is compressed.
        int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
        int compress_size = sizeof(float)*hd*fcount*zlistlen;
        int size = compress_start +  compress_size  + PRIORITY_SIZE/8+6;
        recvGrid_handle[ap] =  CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }
}
#endif
02979 
// Decide whether this compute should be enqueued this step.
// Returns 0 to enqueue normally; 1 when there is no PME work, when QM
// charges must be gathered first, or when the compute must be held
// until the previous step's force ungridding has finished.
int ComputePme::noWork() {

  if ( patch->flags.doFullElectrostatics ) {
    // In QM/MM simulations, atom charges from QM regions need special treatment.
    if ( qmForcesOn ) {
        return 1;
    }
    if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;  // work to do, enqueue as usual
    myMgr->heldComputes.add(this);
    return 1;  // don't enqueue yet
  }

  // No full electrostatics this step: release the data boxes untouched.
  positionBox->skip();
  forceBox->skip();

  // Once every local compute has skipped, submit the (empty) reduction
  // so the global reduction tree is not left waiting.
  if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
    myMgr->noWorkCount = 0;
    myMgr->reduction->submit();
  }

  atomsChanged = 0;

  return 1;  // no work for this step
}
03004 
// Register one more consumer of reciprocal-space energy/virial results.
void ComputePmeMgr::addRecipEvirClient() {
  ++recipEvirClients;
}
03008 
03009 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
03010   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03011   for ( int g=0; g<numGrids; ++g ) {
03012     evir[g] += msg->evir[g];
03013   }
03014   delete msg;
03015   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03016   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03017 }
03018 
03019 void ComputePme::doQMWork() {
03020     
03021 //     iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
03022     
03023     
03024     int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
03025     const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
03026     const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
03027     const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
03028     
03029     const CompAtomExt *xExt = patch->getCompAtomExtInfo();
03030     
03031     // Determine number of qm atoms in this patch for the current step.
03032     numLocalQMAtoms = 0;
03033     for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
03034         if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
03035             numLocalQMAtoms++;
03036         }
03037     }
03038     
03039     // We prepare a charge vector with QM charges for use in the PME calculation.
03040     
03041     // Clears data from last step, if there is any.
03042     if (qmLoclIndx != 0)
03043         delete [] qmLoclIndx;
03044     if (qmLocalCharges != 0)
03045         delete [] qmLocalCharges;
03046     
03047     qmLoclIndx = new int[numLocalQMAtoms] ;
03048     qmLocalCharges = new Real[numLocalQMAtoms] ;
03049     
03050     // I am assuming there will be (in general) more QM atoms among all QM groups
03051     // than MM atoms in a patch.
03052     int procAtms = 0;
03053     
03054     for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
03055         
03056         for (int i=0; i<numQMAtms; i++) {
03057             
03058             if (qmAtmIndx[i] == xExt[paIter].id) {
03059                 
03060                 qmLoclIndx[procAtms] = paIter ;
03061                 qmLocalCharges[procAtms] = qmAtmChrg[i];
03062                 
03063                 procAtms++;
03064                 break;
03065             }
03066             
03067         }
03068         
03069         if (procAtms == numLocalQMAtoms)
03070             break;
03071     }
03072     
03073     doWork();
03074     return ;
03075 }
03076 
// Main per-step entry point for this patch's PME contribution.
//
// Called twice per step: the first call (low priority) gathers atom
// positions/charges, partitions them onto the PME grids, accumulates the
// self-energy, and spreads charges (CPU) or packs atom data for the GPU
// (offload); the second call (home priority, re-enqueued later) runs
// ungridForces() and submits reductions when everything is done.
void ComputePme::doWork()
{
  DebugM(4,"Entering ComputePme::doWork().\n");

  // Second invocation: interpolate forces back from the grid.
  if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
#ifdef NAMD_CUDA
    basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
#else
    basePriority = PME_PRIORITY;
#endif
    ungridForces();
    // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
    if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
    return;
  }
  // First invocation: raise priority so the follow-up runs as home work.
  basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);

#ifdef TRACE_COMPUTE_OBJECTS
    double traceObjStartTime = CmiWallTimer();
#endif

#ifdef NAMD_CUDA
  if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
#endif

  // allocate storage
  numLocalAtoms = patch->getNumAtoms();

  Lattice &lattice = patch->flags.lattice;

  // One extra block per additional grid (plus one when a copy is needed
  // even for a single grid, e.g. selfOn).
  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
  localData = localData_alloc.begin();
  localPartition_alloc.resize(numLocalAtoms);
  localPartition = localPartition_alloc.begin();

  int g;
  for ( g=0; g<numGrids; ++g ) {
    localGridData[g] = localData + numLocalAtoms*(g+1);
  }

  // get positions and charges
  PmeParticle * data_ptr = localData;
  unsigned char * part_ptr = localPartition;
  // Fold the Coulomb constant, scaling, and dielectric into the charges
  // so the grid sums need no further prefactor.
  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
                                * ComputeNonbondedUtil::dielectric_1 );

  {
    CompAtom *x = positionBox->open();
    // CompAtomExt *xExt = patch->getCompAtomExtInfo();
    if ( patch->flags.doMolly ) {
      // MOLLY: use averaged positions instead of instantaneous ones.
      positionBox->close(&x);
      x = avgPositionBox->open();
    }
    int numAtoms = patch->getNumAtoms();

    for(int i=0; i<numAtoms; ++i)
    {
      data_ptr->x = x[i].position.x;
      data_ptr->y = x[i].position.y;
      data_ptr->z = x[i].position.z;
      data_ptr->cg = coulomb_sqrt * x[i].charge;
      ++data_ptr;
      *part_ptr = x[i].partition;
      ++part_ptr;
    }

    // QM loop to overwrite charges of QM atoms.
    // They are zero for NAMD, but are updated in ComputeQM.
    if ( qmForcesOn ) {
        
        for(int i=0; i<numLocalQMAtoms; ++i)
        {
          localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
        }
        
    }
    
    if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
    else { positionBox->close(&x); }
  }

  // copy to other grids if needed
  if ( (alchOn && (!alchDecouple)) || lesOn ) {
    for ( g=0; g<numGrids; ++g ) {
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          // for FEP/TI: grid 0 gets non-alch + partition 1;
          // grid 1 gets non-alch + partition 2;
          // grid 2 (only if called for with numGrids=3) gets only non-alch
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else if ( alchOn && alchDecouple) {
    // alchemical decoupling: four grids
    // g=0: partition 0 and partition 1
    // g=1: partition 0 and partition 2
    // g=2: only partition 1 atoms
    // g=3: only partition 2 atoms
    // plus one grid g=4, only partition 0, if numGrids=5
    for ( g=0; g<2; ++g ) {  // same as before for first 2
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
    for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == (g-1) ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
    for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
      // numGrids=5 only if alchElecLambdaStart > 0
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else if ( selfOn ) {
    // Pair-interaction self mode: one grid, partition-1 atoms only.
    if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
    g = 0;
    PmeParticle *lgd = localGridData[g];
    int nga = 0;
    for(int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  } else if ( pairOn ) {
    // Pair-interaction mode: grid 0 = both partitions, grids 1-2 = each alone.
    if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
    g = 0;
    PmeParticle *lgd = localGridData[g];
    int nga = 0;
    for(int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
    for ( g=1; g<3; ++g ) {
      PmeParticle *lgd = localGridData[g];
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == g ) {
          lgd[nga++] = localData[i];
        }
      }
      numGridAtoms[g] = nga;
    }
  } else {
    // Ordinary case: single grid, all atoms, no copy needed.
    if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
    localGridData[0] = localData;
    numGridAtoms[0] = numLocalAtoms;
  }

 // First compute of the step on this PE resets shared per-step state.
 if ( ! myMgr->doWorkCount ) {
  myMgr->doWorkCount = myMgr->pmeComputes.size();

#ifdef NAMD_CUDA
 if ( !  offload )
#endif // NAMD_CUDA
 {
  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );

  for (int i=0; i<myMgr->q_count; ++i) {
    memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
  }
 }

  for ( g=0; g<numGrids; ++g ) {
    myMgr->evir[g] = 0;
  }

  myMgr->strayChargeErrors = 0;

  myMgr->compute_sequence = sequence();
 }

  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");

  int strayChargeErrors = 0;

  // calculate self energy
  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
  for ( g=0; g<numGrids; ++g ) {
    BigReal selfEnergy = 0;
    data_ptr = localGridData[g];
    int i;
    for(i=0; i<numGridAtoms[g]; ++i)
    {
      selfEnergy += data_ptr->cg * data_ptr->cg;
      ++data_ptr;
    }
    selfEnergy *= -1. * ewaldcof / SQRT_PI;
    myMgr->evir[g][0] += selfEnergy;

    float **q = myMgr->q_arr + g*myMgr->fsize;
    char *f = myMgr->f_arr + g*myMgr->fsize;

    // Map coordinates into fractional grid units before spreading.
    scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
#ifdef NAMD_CUDA
   if ( offload ) {
    if ( myMgr->cuda_atoms_alloc == 0 ) {  // first call
      // 20% headroom plus slack so modest atom-count growth avoids realloc.
      int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
      cuda_errcheck("before malloc atom data for pme");
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      cuda_errcheck("malloc atom data for pme");
      myMgr->cuda_atoms_count = 0;
    }
    cuda_atoms_offset = myMgr->cuda_atoms_count;
    int n = numGridAtoms[g];
    myMgr->cuda_atoms_count += n;
    if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
      CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
                        CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
      cuda_errcheck("before malloc expanded atom data for pme");
      int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
      const float *a_data_host_old = myMgr->a_data_host;
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cuda_errcheck("malloc expanded host atom data for pme");
      // Preserve atoms already packed by earlier computes this step.
      memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
      cudaFreeHost((void*) a_data_host_old);
      cuda_errcheck("free expanded host atom data for pme");
      cudaFree(myMgr->a_data_dev);
      cuda_errcheck("free expanded dev atom data for pme");
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      cuda_errcheck("malloc expanded dev atom data for pme");
    }
    // Pack 7 floats per atom: fractional offsets (x,y,z), charge, and
    // the wrapped integer grid cell indices.
    float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
    data_ptr = localGridData[g];
    double order_1 = myGrid.order - 1;
    double K1 = myGrid.K1;
    double K2 = myGrid.K2;
    double K3 = myGrid.K3;
    int found_negative = 0;
    for ( int i=0; i<n; ++i ) {
      if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
        found_negative = 1;
        // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
      }
      double x_int = (int) data_ptr[i].x;
      double y_int = (int) data_ptr[i].y;
      double z_int = (int) data_ptr[i].z;
      a_data_host[7*i  ] = data_ptr[i].x - x_int;  // subtract in double precision
      a_data_host[7*i+1] = data_ptr[i].y - y_int;
      a_data_host[7*i+2] = data_ptr[i].z - z_int;
      a_data_host[7*i+3] = data_ptr[i].cg;
      x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
      y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
      z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
      a_data_host[7*i+4] = x_int;
      a_data_host[7*i+5] = y_int;
      a_data_host[7*i+6] = z_int;
    }
    if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
   } else
#endif // NAMD_CUDA
   {
    // CPU path: spread charges onto the grid columns directly.
    myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
    myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
   }
  }
  myMgr->strayChargeErrors += strayChargeErrors;

#ifdef TRACE_COMPUTE_OBJECTS
    traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
#endif

 // Last compute of the step on this PE submits the charge grid.
 if ( --(myMgr->doWorkCount) == 0 ) {
// cudaDeviceSynchronize();  // XXXX
#ifdef NAMD_CUDA
  if ( offload ) {
    ComputePmeMgr::cuda_submit_charges_args args;
    args.mgr = myMgr;
    args.lattice = &lattice;
    args.sequence = sequence();
    CmiLock(ComputePmeMgr::cuda_lock);
    if ( ComputePmeMgr::cuda_busy ) {
      // Another rank is draining the queue; it will pick this up.
      ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
    } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
      // avoid adding work to nonbonded data preparation pe
      args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
    } else {
      // Become the draining rank: submit, then service anything queued
      // while the lock was released during submission.
      ComputePmeMgr::cuda_busy = true;
      while ( 1 ) {
        CmiUnlock(ComputePmeMgr::cuda_lock);
        args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
        CmiLock(ComputePmeMgr::cuda_lock);
        if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
          args = ComputePmeMgr::cuda_submit_charges_deque.front();
          ComputePmeMgr::cuda_submit_charges_deque.pop_front();
        } else {
          ComputePmeMgr::cuda_busy = false;
          break;
        }
      }
    }
    CmiUnlock(ComputePmeMgr::cuda_lock);
  } else
#endif // NAMD_CUDA
  {
    myMgr->chargeGridReady(lattice,sequence());
  }
 }
 atomsChanged = 0;
}
03402 
03403 #ifdef NAMD_CUDA
03404 
// Upload the packed atom data for this rank to the GPU and launch the
// charge-spreading kernel.  Must be called while holding cuda_lock (see
// doWork()); the kernel is ordered after the previous step's grid memset
// via the end_charge_memset event.
void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {

    int n = cuda_atoms_count;
    //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
    cuda_atoms_count = 0;

    const double before = CmiWallTimer();
    cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
                          cudaMemcpyHostToDevice, streams[stream]);
    const double after = CmiWallTimer();

    // Do not spread charges until the grid has been zeroed for this step.
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);

    cuda_pme_charges(
      bspline_coeffs_dev,
      q_arr_dev, ffz_dev, ffz_dev + fsize,
      a_data_dev, n,
      myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
      streams[stream]);
    const double after2 = CmiWallTimer();

    chargeGridSubmitted(lattice,sequence);  // must be inside lock

    masterPmeMgr->charges_time = before;
    traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
    traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
}
03432 
03433 void cuda_check_pme_charges(void *arg, double walltime) {
03434   ComputePmeMgr *argp = (ComputePmeMgr *) arg;
03435 
03436   cudaError_t err = cudaEventQuery(argp->end_charges);
03437   if ( err == cudaSuccess ) {
03438     traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
03439     argp->charges_time = walltime - argp->charges_time;
03440     argp->sendChargeGridReady();
03441     argp->check_charges_count = 0;
03442   } else if ( err != cudaErrorNotReady ) {
03443     cuda_errcheck("in cuda_check_pme_charges");
03444     NAMD_bug("cuda_errcheck missed error in cuda_check_pme_charges");
03445   } else if ( ++(argp->check_charges_count) >= count_limit ) {
03446     char errmsg[256];
03447     sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
03448             argp->check_charges_count, walltime - argp->charges_time,
03449             argp->saved_sequence);
03450     cuda_errcheck(errmsg);
03451     NAMD_die(errmsg);
03452   } else {
03453     CcdCallBacksReset(0,walltime);  // fix Charm++
03454     CUDA_POLL(cuda_check_pme_charges, arg);
03455   }
03456 }
03457 
// Called (inside the lock -- see cuda_submit_charges) after each
// manager submits its charges.  The last submitter on the node starts
// the device-to-host grid copy and resets the device grid for the
// next step.  Lattice and sequence are saved so recvChargeGridReady()
// can forward them to chargeGridReady() later.
void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
  saved_lattice = &lattice;
  saved_sequence = sequence;

  // cudaDeviceSynchronize();  //  XXXX TESTING
  //int q_stride = myGrid.K3+myGrid.order-1;
  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
  //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
  //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
  //  }
  //}
  //CmiLock(cuda_lock);

 // Only the final submitter (counter reaches zero) performs the copy.
 if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
  double before = CmiWallTimer();
  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
  // One transfer brings back both the charge grid and the flag array.
  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
                        cudaMemcpyDeviceToHost, streams[stream]);
  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
  // Zero the device grid now so the next step's spreading can begin
  // as soon as end_charge_memset is satisfied.
  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
  //CmiUnlock(cuda_lock);
  // cudaDeviceSynchronize();  //  XXXX TESTING
  // cuda_errcheck("after memcpy grid to host");

  SimParameters *simParams = Node::Object()->simParameters;
  if ( ! simParams->useCUDA2 ) {
    // Yield the device to the nonbonded computes while PME polls.
    CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
    cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
  }

  // Begin polling for end_charges completion on the master PE.
  pmeProxy[master_pe].pollChargeGridReady();
 }
}
03494 
03495 void ComputePmeMgr::sendChargeGridReady() {
03496   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03497     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03498     int cs = mgr->pmeComputes.size();
03499     if ( cs ) {
03500       mgr->ungridForcesCount = cs;
03501       mgr->recipEvirCount = mgr->recipEvirClients;
03502       masterPmeMgr->chargeGridSubmittedCount++;
03503     }
03504   }
03505   pmeProxy[master_pe].recvChargeGridReady();
03506 }
03507 #endif // NAMD_CUDA
03508 
// Arm the Ccd-based poll that watches for the GPU end_charges event.
// Only meaningful in CUDA builds; calling it otherwise is a bug.
void ComputePmeMgr::pollChargeGridReady() {
#ifdef NAMD_CUDA
  CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
  CUDA_POLL(cuda_check_pme_charges,this);
#else
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#endif
}
03517 
// Entry method: resume the PME pipeline using the lattice/sequence
// that chargeGridSubmitted() saved.
void ComputePmeMgr::recvChargeGridReady() {
  chargeGridReady(*saved_lattice,saved_sequence);
}
03521 
// The spread charge grid is complete on the host: validate offloaded
// flags, reset per-step counters, fold periodic wrap flags, and send
// the grid on to the FFT stage (pencil or slab decomposition).
void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {

#ifdef NAMD_CUDA
 if ( offload ) {
  // Copy the GPU flag tail into f_arr; every flag must be 0 or 1.
  int errcount = 0;
  int q_stride = myGrid.K3+myGrid.order-1;
  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
    f_arr[j] = ffz_host[j];
    if ( ffz_host[j] & ~1 ) ++errcount;
  }
  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
 }
#endif
  recipEvirCount = recipEvirClients;
  ungridForcesCount = pmeComputes.size();

  // Fold the wrap flags (indices K3..K3+order-2) back onto the first
  // order-1 z entries so they are counted once.
  for (int j=0; j<myGrid.order-1; ++j) {
    fz_arr[j] |= fz_arr[myGrid.K3+j];
  }

  if ( usePencils ) {
    sendPencils(lattice,sequence);
  } else {
    sendData(lattice,sequence);
  }
}
03548 
03549 
// Pack and send the charge grid for active pencils [first..last] to
// their z-pencil owners.  For each pencil: count occupied columns,
// build the compressed z-plane list, pack flags and charges into a
// varsize PmeGridMsg, and route it (node-level or directly).
void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

#if 0 && USE_PERSISTENT
    if (recvGrid_handle== NULL) setup_recvgrid_persistent();
#endif
  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;

  // int savedMessages = 0;
  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  for (int ap=first; ap<=last; ++ap) {
    // (i,j) block indices of this pencil and its grid extents.
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);

    // Count occupied columns; with offload also copy GPU flags into
    // f_arr and verify each is 0 or 1.
    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        int k = i*dim2+j;
        f[k] = ffz_host[k];
        fcount += f[k];
        if ( ffz_host[k] & ~1 ) ++errcount;
       }
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
     } else
#endif
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        fcount += f[i*dim2+j];
       }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! pencilActive[ib*yBlocks+jb] )
      NAMD_bug("PME activePencils list inconsistent");

    // Compressed z list: number of z planes holding any data.
    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    int hd = ( fcount? 1 : 0 );  // has data?
    // if ( ! hd ) ++savedMessages;

    // Varsize message; array lengths collapse to zero when empty.
    PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
        hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
    msg->sourceNode = sourcepe;
    msg->hasData = hd;
    msg->lattice = lattice;
   if ( hd ) {
#if 0
    msg->start = fstart;
    msg->len = flen;
#else
    msg->start = -1;   // obsolete?
    msg->len = -1;   // obsolete?
#endif
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // Pack flags, then for each occupied column fold the periodic
    // wrap planes (K3..K3+order-2) into the front and pack the
    // charges at the listed z planes.
    char *fmsg = msg->fgrid;
    float *qmsg = msg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
      float **q = q_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
       for ( int j=jbegin; j<jend; ++j ) {
        *(fmsg++) = f[i*dim2+j];
        if( f[i*dim2+j] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i*dim2+j][zlist[k]];
          }
        }
       }
      }
    }
   }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // Node-parallel receive: route via the destination node's manager.
    msg->destElem=CkArrayIndex3D(ib,jb,0);
    CProxy_PmePencilMap lzm = npMgr->zm;
    int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
    int destnode = CmiNodeOf(destproc);
    
#if  0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    pmeNodeProxy[destnode].recvZGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if 0 
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    zPencil(ib,jb,0).recvGrid(msg);
#if 0 
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }


  // if ( savedMessages ) {
  //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
  // }

}
03690 
03691 
// Trampoline: forward a single-pencil send to the node-level manager.
void ComputePmeMgr::sendPencilsHelper(int iter) {
  nodePmeMgr->sendPencilsHelper(iter);
}
03695 
// Send one pencil (index iter) on behalf of the master manager, using
// the arguments it stashed in sendDataHelper_* before dispatching.
void NodePmeMgr::sendPencilsHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
#else
  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif
}
03704 
// Send the charge grid to all active pencils, then report any stray
// charges that landed outside the active pencil set.
void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {

  // Stash arguments so per-pencil helper invocations can read them.
  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();

#ifdef NAMD_CUDA
  if ( offload ) {
    // Offload path: send each pencil through a helper entry method.
    for ( int ap=0; ap < numPencilsActive; ++ap ) {
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      int ib = activePencils[ap].i;
      int jb = activePencils[ap].j;
      int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
      pmeProxy[destproc].sendPencilsHelper(ap);
#else
      pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
#endif
    }
  } else
#endif
  {
    // Non-offload: pack and send every active pencil directly.
    sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
  }

  // Stray charge report: flag value 3 marks a stray column in an
  // inactive pencil; downgrade it to 2 once reported.
  if ( strayChargeErrors ) {
   strayChargeErrors = 0;
   iout << iERROR << "Stray PME grid charges detected: "
        << CkMyPe() << " sending to (x,y)";
   int K1 = myGrid.K1;
   int K2 = myGrid.K2;
   int dim2 = myGrid.dim2;
   int block1 = myGrid.block1;
   int block2 = myGrid.block2;
   for (int ib=0; ib<xBlocks; ++ib) {
    for (int jb=0; jb<yBlocks; ++jb) {
     int ibegin = ib*block1;
     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
     int jbegin = jb*block2;
     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
     int flen = numGrids * (iend - ibegin) * (jend - jbegin);

     for ( int g=0; g<numGrids; ++g ) {
       char *f = f_arr + g*fsize;
       if ( ! pencilActive[ib*yBlocks+jb] ) {
           for ( int i=ibegin; i<iend; ++i ) {
            for ( int j=jbegin; j<jend; ++j ) {
             if ( f[i*dim2+j] == 3 ) {
               f[i*dim2+j] = 2;
               iout << " (" << i << "," << j << ")";
             }
            }
           }
       }
     }
    }
   }
   iout << "\n" << endi;
  }
 
}
03766 
03767 
03768 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
03769 
03770   int K1 = myGrid.K1;
03771   int K2 = myGrid.K2;
03772   int dim2 = myGrid.dim2;
03773   int dim3 = myGrid.dim3;
03774   int block1 = myGrid.block1;
03775   int block2 = myGrid.block2;
03776 
03777   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03778   int ib = msg->sourceNode / yBlocks;
03779   int jb = msg->sourceNode % yBlocks;
03780 
03781   int ibegin = ib*block1;
03782   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03783   int jbegin = jb*block2;
03784   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03785 
03786   int zlistlen = msg->zlistlen;
03787   int *zlist = msg->zlist;
03788   float *qmsg = msg->qgrid;
03789   int g;
03790   for ( g=0; g<numGrids; ++g ) {
03791     char *f = f_arr + g*fsize;
03792     float **q = q_arr + g*fsize;
03793     for ( int i=ibegin; i<iend; ++i ) {
03794      for ( int j=jbegin; j<jend; ++j ) {
03795       if( f[i*dim2+j] ) {
03796         f[i*dim2+j] = 0;
03797         for ( int k=0; k<zlistlen; ++k ) {
03798           q[i*dim2+j][zlist[k]] = *(qmsg++);
03799         }
03800         for (int h=0; h<myGrid.order-1; ++h) {
03801           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03802         }
03803       }
03804      }
03805     }
03806   }
03807 }
03808 
03809 
// Slab decomposition: pack and send the charge grid slabs for grid
// PEs gridPeOrder[first..last].  When `errors` is set, inactive
// destinations are still scanned so stray charges can be reported.
void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {

  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;

  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;

  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
  for (int j=first; j<=last; j++) {
    int pe = gridPeOrder[j];  // different order
    if ( ! recipPeDest[pe] && ! errors ) continue;
    // Clamp this PE's slab [start, start+len) to the grid size.
    int start = pe * bsize;
    int len = bsize;
    if ( start >= qsize ) { start = 0; len = 0; }
    if ( start + len > qsize ) { len = qsize - start; }
    int zdim = myGrid.dim3;
    int fstart = start / zdim;
    int flen = len / zdim;
    int fcount = 0;
    int i;

    // Count occupied columns; with offload also copy GPU flags into
    // f_arr and verify each is 0 or 1.
    int g;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
#ifdef NAMD_CUDA
     if ( offload ) {
      int errcount = 0;
      for ( i=0; i<flen; ++i ) {
        f[i] = ffz_host[fstart+i];
        fcount += f[i];
        if ( ffz_host[fstart+i] & ~1 ) ++errcount;
      }
      if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
     } else
#endif
      for ( i=0; i<flen; ++i ) {
        fcount += f[i];
      }
      // Stray charge report for inactive destinations: flag value 3
      // marks a stray; downgrade to 2 once reported.
      if ( ! recipPeDest[pe] ) {
        int errfound = 0;
        for ( i=0; i<flen; ++i ) {
          if ( f[i] == 3 ) {
            errfound = 1;
            break;
          }
        }
        if ( errfound ) {
          iout << iERROR << "Stray PME grid charges detected: "
                << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
          int iz = -1;
          for ( i=0; i<flen; ++i ) {
            if ( f[i] == 3 ) {
              f[i] = 2;
              int jz = (i+fstart)/myGrid.K2;
              if ( iz != jz ) { iout << " " << jz;  iz = jz; }
            }
          }
          iout << "\n" << endi;
        }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! recipPeDest[pe] ) continue;

    // Compressed list of z planes holding any data.
    int zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    PmeGridMsg *msg = new (zlistlen, flen*numGrids,
                                fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;

    msg->sourceNode = sourcepe;
    msg->lattice = lattice;
    msg->start = fstart;
    msg->len = flen;
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    // Pack flags and charges; for each occupied column fold the
    // periodic wrap planes in first, then pack listed z planes.
    float *qmsg = msg->qgrid;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
      CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
      float **q = q_arr + fstart + g*fsize;
      for ( i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i][h] += q[i][myGrid.K3+h];
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i][zlist[k]];
          }
        }
      }
    }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    pmeProxy[gridPeMap[pe]].recvGrid(msg);
  }

}
03918 
// Trampoline: forward a single slab send to the node-level manager.
void ComputePmeMgr::sendDataHelper(int iter) {
  nodePmeMgr->sendDataHelper(iter);
}
03922 
// Send one slab (index iter) on behalf of the master manager, using
// the arguments it stashed in sendDataHelper_* before dispatching.
void NodePmeMgr::sendDataHelper(int iter) {
#ifdef NAMD_CUDA
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
#else
  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif
}
03931 
// Slab-decomposition dispatch: stash arguments for helper calls and
// send every grid PE's slab, either directly or (when offloading) via
// per-slab helper entry methods.
void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {

  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();
  // Capture and clear the stray-charge flag; sendDataPart uses the
  // captured value to decide whether to scan inactive destinations.
  sendDataHelper_errors = strayChargeErrors;
  strayChargeErrors = 0;

#ifdef NAMD_CUDA
  if ( offload ) {
    for ( int i=0; i < numGridPes; ++i ) {
      int pe = gridPeOrder[i];  // different order
      if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
      pmeProxy[gridPeMap[pe]].sendDataHelper(i);
#else
      pmeNodeProxy[CkMyNode()].sendDataHelper(i);
#endif
    }
  } else
#endif
  {
    sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
  }
 
}
03959 
03960 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
03961 
03962   int zdim = myGrid.dim3;
03963   int flen = msg->len;
03964   int fstart = msg->start;
03965   int zlistlen = msg->zlistlen;
03966   int *zlist = msg->zlist;
03967   float *qmsg = msg->qgrid;
03968   int g;
03969   for ( g=0; g<numGrids; ++g ) {
03970     char *f = msg->fgrid + g*flen;
03971     float **q = q_arr + fstart + g*fsize;
03972     for ( int i=0; i<flen; ++i ) {
03973       if ( f[i] ) {
03974         f[i] = 0;
03975         for ( int k=0; k<zlistlen; ++k ) {
03976           q[i][zlist[k]] = *(qmsg++);
03977         }
03978         for (int h=0; h<myGrid.order-1; ++h) {
03979           q[i][myGrid.K3+h] = q[i][h];
03980         }
03981       }
03982     }
03983   }
03984 }
03985 
// Turn the per-grid K-space results into per-atom forces, apply the
// alchemical / LES / self / pair scaling for each grid, and add the
// merged result into the patch's slow-force array.
void ComputePme::ungridForces() {

    if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
 
    SimParameters *simParams = Node::Object()->simParameters;

    // Double-size scratch when per-grid forces must be accumulated
    // separately (multiple grids or selfOn) before merging.
    localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
    Vector *localResults = localResults_alloc.begin();
    Vector *gridResults;

    if ( alchOn || lesOn || selfOn || pairOn ) {
      for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
      gridResults = localResults + numLocalAtoms;
    } else {
      gridResults = localResults;
    }

    Vector pairForce = 0.;
    Lattice &lattice = patch->flags.lattice;
    int g = 0;
    if(!simParams->commOnly) {
    for ( g=0; g<numGrids; ++g ) {
#ifdef NETWORK_PROGRESS
      CmiNetworkProgress();
#endif

#ifdef NAMD_CUDA
      if ( offload ) {
        // Forces came from the GPU; copy them out while screening for
        // the kernel's NaN marker bit pattern.
        int errfound = 0;
        for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
          // Neither isnan() nor x != x worked when testing on Cray; this does.
          if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }  // CUDA NaN
          gridResults[i].x = f_data_host[3*i];
          gridResults[i].y = f_data_host[3*i+1];
          gridResults[i].z = f_data_host[3*i+2];
        }
        if ( errfound ) {
          // Zero out the bad forces and report how many atoms hit.
          int errcount = 0;
          for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
            float f = f_data_host[3*i];  // NOTE(review): f is unused here
            if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {  // CUDA NaN
              ++errcount;
              gridResults[i] = 0.;
            }
          }
          iout << iERROR << "Stray PME grid charges detected: "
                << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
        }
      } else
#endif // NAMD_CUDA
        {
          myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
        }
      scale_forces(gridResults, numGridAtoms[g], lattice);
      
      if (alchOn) {
        // Per-grid lambda scale factor; see submitReductions for the
        // matching energy/virial scaling.
        float scale = 1.;
        BigReal elecLambdaUp, elecLambdaDown;
        if ( simParams->alchFepWhamOn ) {
          if ( simParams->alchFepElecOn ) {
            elecLambdaUp = simParams->alchElecLambda;
            elecLambdaDown = 1.0 - simParams->alchElecLambda;
          }
          else {
            elecLambdaUp = 0.0;
            elecLambdaDown = 1.0;
          }
        }
        else {
          BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
          myMgr->alchLambda = alchLambda;
          BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
          myMgr->alchLambda2 = alchLambda2;
          elecLambdaUp = simParams->getElecLambda(alchLambda);
          elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
        }
        
        if ( g == 0 ) scale = elecLambdaUp;
        else if ( g == 1 ) scale = elecLambdaDown;
        else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);

        if (alchDecouple) {
          if ( g == 2 ) scale = 1 - elecLambdaUp;
          else if ( g == 3 ) scale = 1 - elecLambdaDown;
          else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
        }
        // Merge this grid's forces into the atoms that belong to it;
        // nga walks the compacted per-grid atom list.
        int nga = 0;
        if (!alchDecouple) {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
              // (g=2: only partition 0)
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
        else {  // alchDecouple
          if ( g < 2 ) {
            for(int i=0; i<numLocalAtoms; ++i) {
              if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
                // g = 0: partition 0 or partition 1
                // g = 1: partition 0 or partition 2
                localResults[i] += gridResults[nga++] * scale;
              }
            }
          }
          else {
            for(int i=0; i<numLocalAtoms; ++i) {
              if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
                // g = 2: partition 1 only
                // g = 3: partition 2 only
                // g = 4: partition 0 only
                localResults[i] += gridResults[nga++] * scale;
              }
            }
          }
        }
      } else if ( lesOn ) {
        float scale = 1.;
        if ( alchFepOn ) {
          if(simParams->alchFepWhamOn) {
            if(simParams->alchFepElecOn) {
              if ( g == 0 ) scale = simParams->alchElecLambda;
              else if ( g == 1 ) scale = 1. - simParams->alchElecLambda;
            }
            else {
              if ( g == 0 ) scale = 0.0;
              else if ( g == 1 ) scale = 1.0;
            }
          }
          else {
            BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
            myMgr->alchLambda = alchLambda;
            BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
            myMgr->alchLambda2 = alchLambda2;
            if ( g == 0 ) scale = alchLambda;
            else if ( g == 1 ) scale = 1. - alchLambda;
          }
        } else if ( lesOn ) {
          scale = 1.0 / (float)lesFactor;
        }
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
            localResults[i] += gridResults[nga++] * scale;
          }
        }
      } else if ( selfOn ) {
        PmeParticle *lgd = localGridData[g];  // NOTE(review): lgd is unused here
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 1 ) {
            pairForce += gridResults[nga];  // should add up to almost zero
            localResults[i] += gridResults[nga++];
          }
        }
      } else if ( pairOn ) {
        if ( g == 0 ) {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 1 ) {
              pairForce += gridResults[nga];
            }
            if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
              localResults[i] += gridResults[nga++];
            }
          }
        } else if ( g == 1 ) {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == g ) {
              pairForce -= gridResults[nga];  // should add up to almost zero
              localResults[i] -= gridResults[nga++];
            }
          }
        } else {
          int nga = 0;
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == g ) {
              localResults[i] -= gridResults[nga++];
            }
         }
        }
      }
    }
    }

    Vector *results_ptr = localResults;
    
    // add in forces
    {
      Results *r = forceBox->open();
      Force *f = r->f[Results::slow];
      int numAtoms = patch->getNumAtoms();

      // Skip the addition entirely on stray-charge errors or when
      // only communication was requested.
      if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
        for(int i=0; i<numAtoms; ++i) {
          f[i].x += results_ptr->x;
          f[i].y += results_ptr->y;
          f[i].z += results_ptr->z;
          ++results_ptr;
        }
      }
      forceBox->close(&r);
    }

    if ( pairOn || selfOn ) {
        ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
    }

}
04196 
04197 void ComputePmeMgr::submitReductions() {
04198 
04199     SimParameters *simParams = Node::Object()->simParameters;
04200 
04201     for ( int g=0; g<numGrids; ++g ) {
04202       float scale = 1.;
04203       if (alchOn) {
04204         BigReal elecLambdaUp, elecLambdaDown;
04205         if( simParams->alchFepWhamOn ) {
04206           if( simParams->alchFepElecOn ) {
04207             elecLambdaUp = simParams->alchElecLambda;
04208             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04209           }
04210           else {
04211             elecLambdaUp = 0.0;
04212             elecLambdaDown = 1.0;
04213           }
04214         }
04215         else {
04216           // alchLambda set on each step in ComputePme::ungridForces()
04217           if ( alchLambda < 0 || alchLambda > 1 ) {
04218             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04219           }
04220           elecLambdaUp = simParams->getElecLambda(alchLambda);
04221           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04222         }
04223         if ( g == 0 ) scale = elecLambdaUp;
04224         else if ( g == 1 ) scale = elecLambdaDown;
04225         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04226         if (alchDecouple) {
04227           if ( g == 2 ) scale = 1-elecLambdaUp;
04228           else if ( g == 3 ) scale = 1-elecLambdaDown;
04229           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04230         }
04231       } else if ( lesOn ) {
04232         scale = 1.0 / lesFactor;
04233       } else if ( pairOn ) {
04234         scale = ( g == 0 ? 1. : -1. );
04235       }
04236       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04237       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04238       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04239       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04240       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04241       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04242       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04243       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04244       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04245       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04246 
04247       float scale2 = 0.;
04248 
04249       // why is this declared/defined again here?
04250       SimParameters *simParams = Node::Object()->simParameters;
04251 
04252       if (alchFepOn) {
04253         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04254         if(simParams->alchFepWhamOn) {
04255           if(simParams->alchFepElecOn) {
04256             elecLambda2Up = simParams->alchElecLambda;
04257             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04258           }
04259           else {
04260             elecLambda2Up = 0.0;
04261             elecLambda2Down =  1.0;
04262           }
04263         }
04264         else {
04265           elecLambda2Up = simParams->getElecLambda(alchLambda2);
04266           elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
04267         }
04268         
04269         if ( g == 0 ) scale2 = elecLambda2Up;
04270         else if ( g == 1 ) scale2 = elecLambda2Down;
04271         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04272         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04273         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04274         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04275       }
04276       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04277         if( g==0 )      scale2 = scale + 1.0;
04278         else if( g==1 ) scale2 = scale - 1.0;
04279         else if( g==2 ) scale2 = scale - 1.0;
04280         else if( g==3 ) scale2 = scale + 1.0;
04281       }
04282       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04283       
04284       if (alchThermIntOn) {
04285         
04286         // no decoupling:
04287         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04288         // (interactions between all atoms [partition 0 OR partition 1], 
04289         // minus all [within partition 0])
04290         // U = elecLambdaUp * (U[0] - U[2])
04291         // dU/dl = U[0] - U[2];
04292         
04293         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04294         // (interactions between all atoms [partition 0 OR partition 2], 
04295         // minus all [within partition 0])
04296         // U = elecLambdaDown * (U[1] - U[2])
04297         // dU/dl = U[1] - U[2];
04298 
04299         // alchDecouple:
04300         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04301         // (interactions between all atoms [partition 0 OR partition 1]
04302         // minus all [within partition 1] minus all [within partition 0]
04303         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04304         // dU/dl = U[0] - U[2] - U[4];
04305 
04306         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04307         // (interactions between all atoms [partition 0 OR partition 2]
04308         // minus all [within partition 2] minus all [within partition 0]
04309         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04310         // dU/dl = U[1] - U[3] - U[4];
04311         
04312         
04313         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04314         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04315         if (!alchDecouple) {
04316           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04317           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04318         }
04319         else {  // alchDecouple
04320           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04321           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04322           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04323           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04324         }
04325       }
04326     }
04327 
04328     alchLambda = -1.;  // illegal value to catch if not updated
04329 
04330     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04331     reduction->submit();
04332 
04333   for ( int i=0; i<heldComputes.size(); ++i ) {
04334     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04335   }
04336   heldComputes.resize(0);
04337 }
04338 
04339 #if USE_TOPOMAP 
04340 
// Pool of prime strides used when scattering PME pencils across a torus
// plane; a stride co-prime with the plane size visits its cells evenly.
#define NPRIMES 8
// NOTE(review): the table below holds 19 primes, but the selection loop in
// generateBGLORBPmePeList only scans the first NPRIMES (8) entries, so
// primes beyond 23 are never chosen -- confirm whether NPRIMES should
// equal the table length.
const static unsigned int NAMDPrimes[] = {
  3,
  5,
  7,
  11,
  13,
  17,
  19,
  23,  
  29,
  31,
  37,
  59,
  73,
  93,
  113,
  157,
  307,
  617,
  1217                  //This should be enough for 64K nodes of BGL. 
};
04363 
04364 #include "RecBisection.h"
04365 
04366 /***-----------------------------------------------------**********
04367     The Orthogonal Recursive Bisection strategy, which allocates PME
04368     objects close to the patches they communicate, and at the same
04369     time spreads them around the grid 
04370 ****----------------------------------------------------------****/
04371 
04372 bool generateBGLORBPmePeList(int *pemap, int numPes, 
04373                              int *block_pes, int nbpes) {
04374 
04375   PatchMap *pmap = PatchMap::Object();
04376   int *pmemap = new int [CkNumPes()];
04377 
04378   if (pemap == NULL)
04379     return false;
04380 
04381   TopoManager tmgr;
04382 
04383   memset(pmemap, 0, sizeof(int) * CkNumPes());
04384 
04385   for(int count = 0; count < CkNumPes(); count++) {
04386     if(count < nbpes)
04387       pmemap[block_pes[count]] = 1;
04388     
04389     if(pmap->numPatchesOnNode(count)) {
04390       pmemap[count] = 1;
04391       
04392       //Assumes an XYZT mapping !!
04393       if(tmgr.hasMultipleProcsPerNode()) {
04394         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
04395       }
04396     }
04397   }
04398 
04399   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
04400     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
04401     return false;
04402 
04403   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
04404   Node *node = nd.ckLocalBranch();
04405   SimParameters *simParams = node->simParameters;
04406 
04407   //first split PME processors into patch groups
04408 
04409   int xsize = 0, ysize = 0, zsize = 0;
04410 
04411   xsize = tmgr.getDimNX();
04412   ysize = tmgr.getDimNY();
04413   zsize = tmgr.getDimNZ();
04414   
04415   int nx = xsize, ny = ysize, nz = zsize;
04416   DimensionMap dm;
04417   
04418   dm.x = 0;
04419   dm.y = 1;
04420   dm.z = 2;
04421   
04422   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
04423 
04424   //group size processors have to be allocated to each YZ plane
04425   int group_size = numPes/nx;
04426   if(numPes % nx)
04427     group_size ++;
04428 
04429   int my_prime = NAMDPrimes[0];
04430   int density = (ny * nz)/group_size + 1;
04431   int count = 0;
04432   
04433   //Choose a suitable prime Number
04434   for(count = 0; count < NPRIMES; count ++) {
04435     //Find a prime just greater than the density
04436     if(density < NAMDPrimes[count]) {
04437       my_prime = NAMDPrimes[count];
04438       break;
04439     }      
04440   }
04441   
04442   if(count == NPRIMES)
04443     my_prime = NAMDPrimes[NPRIMES-1];
04444 
04445   //int gcount = numPes/2;
04446   int gcount = 0;
04447   int npme_pes = 0;
04448   
04449   int coord[3];
04450 
04451   for(int x = 0; x < nx; x++) {
04452     coord[0] = (x + nx/2)%nx;
04453     
04454     for(count=0; count < group_size && npme_pes < numPes; count++) {
04455       int dest = (count + 1) * my_prime;      
04456       dest = dest % (ny * nz);      
04457       
04458       coord[2] = dest / ny;
04459       coord[1] = dest - coord[2] * ny;
04460       
04461       //Locate where in the actual grid the processor is
04462       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
04463         coord[dm.z] * xsize* ysize;
04464       
04465       if(pmemap[destPe] == 0) {
04466         pemap[gcount++] = destPe;
04467         pmemap[destPe] = 1;
04468         
04469         if(tmgr.hasMultipleProcsPerNode())
04470           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
04471 
04472         npme_pes ++;
04473       }
04474       else {
04475         for(int pos = 1; pos < ny * nz; pos++) {
04476           
04477           coord[2] += pos / ny;
04478           coord[1] += pos % ny;
04479           
04480           coord[2] = coord[2] % nz;
04481           coord[1] = coord[1] % ny;       
04482           
04483           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
04484             coord[dm.z] * xsize * ysize;
04485           
04486           if(pmemap[newdest] == 0) {
04487             pemap[gcount++] = newdest;
04488             pmemap[newdest] = 1;
04489             
04490             if(tmgr.hasMultipleProcsPerNode())
04491               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
04492             
04493             npme_pes ++;
04494             break;
04495           }
04496         }
04497       }      
04498     }   
04499     
04500     if(gcount == numPes)
04501       gcount = 0;    
04502     
04503     if(npme_pes >= numPes)
04504       break;
04505   }
04506   
04507   delete [] pmemap;
04508   
04509   if(npme_pes != numPes)
04510     //NAMD_bug("ORB PME allocator failed\n");
04511     return false;
04512 
04513   return true;
04514 }
04515 
04516 #endif
04517 
// Common base for the three pencil chare types (Z, Y, X).  T is the
// Charm++-generated CBase_* class, so this mixes shared buffers and
// per-step bookkeeping into each pencil type.
template <class T> class PmePencil : public T {
public:
  PmePencil() {
    data = 0;
    work = 0;
    send_order = 0;
    needs_reply = 0;
#if USE_PERSISTENT
    trans_handle = untrans_handle = ungrid_handle = NULL;
#endif
  }
  ~PmePencil() {
#ifdef NAMD_FFTW
    // data is obtained via fftwf_malloc in the subclasses' fft_init
    fftwf_free(data);
#endif
    delete [] work;
    delete [] send_order;
    delete [] needs_reply;
  }
  // Copy initialization parameters out of the init message and reset
  // the per-step message counters and data flag.
  void base_init(PmePencilInitMsg *msg) {
    imsg=0;
    imsgb=0;
    hasData=0;
    initdata = msg->data;
  }
  // Build the block send order: bit-reversed when PMESendOrder is set
  // (deterministic stagger), otherwise a per-PE random shuffle.
  void order_init(int nBlocks) {
    send_order = new int[nBlocks];
    for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
    if ( Node::Object()->simParameters->PMESendOrder ) {
      std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
    } else {
      Random rand(CkMyPe());
      rand.reorder(send_order,nBlocks);
    }
    needs_reply = new int[nBlocks];
    offload = Node::Object()->simParameters->PMEOffload;
  }
  PmePencilInitMsgData initdata;
  Lattice lattice;
  PmeReduction evir;
  int sequence;  // used for priorities
  int imsg;  // used in sdag code
  int imsgb;  // Node par uses distinct counter for back path
  int hasData;  // used in message elimination
  int offload;  // copy of simParams->PMEOffload (set in order_init)
  float *data;  // FFT grid buffer (fftwf_malloc'd by subclass fft_init)
  float *work;  // scratch line used by the FFT routines
  int *send_order;   // permutation of block indices for staggered sends
  int *needs_reply;  // per-source flag: whether that node sent data
#if USE_PERSISTENT
  PersistentHandle *trans_handle;
  PersistentHandle *untrans_handle;
  PersistentHandle *ungrid_handle;
#endif
};
04573 
// Z-direction pencil chare: receives charge-grid pieces (recv_grid),
// performs the real<->complex FFT along z (forward_fft/backward_fft),
// exchanges transposed data with the Y pencils (send_trans/recv_untrans),
// and returns grid pieces to the senders (send_ungrid).
class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
public:
    PmeZPencil_SDAG_CODE
    PmeZPencil() { __sdag_init(); setMigratable(false); }
    PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); imsg=imsgb=0;}
        ~PmeZPencil() {
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                // per-CkLoop-chunk plan arrays allocated in fft_init
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();
    void recv_grid(const PmeGridMsg *);
    void forward_fft();
    void send_trans();
        void send_subset_trans(int fromIdx, int toIdx);
    void recv_untrans(const PmeUntransMsg *);
    void recvNodeAck(PmeAckMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void node_process_grid(PmeGridMsg *);
    void backward_fft();
        void send_ungrid(PmeGridMsg *);
        void send_all_ungrid();
        void send_subset_ungrid(int fromIdx, int toIdx);
private:
    ResizeArray<PmeGridMsg *> grid_msgs;  // held grid messages for ungrid replies
    ResizeArray<int> work_zlist;
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        //for ckloop usage
        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    rfftwnd_plan forward_plan, backward_plan;
#endif
#endif

    int nx, ny;  // extents of this pencil's slab in x and y
#if USE_PERSISTENT
    // Pre-create compressed persistent channels to the Y pencils this
    // pencil will send transpose messages to.
    void setup_persistent() {
      int hd = 1;// ( hasData ? 1 : 0 );
      int zBlocks = initdata.zBlocks;
      int block3 = initdata.grid.block3;
      int dim3 = initdata.grid.dim3;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      CmiAssert(yPencil_local);
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
      for ( int isend=0; isend<zBlocks; ++isend ) {
          int kb = send_order[isend];
          int nz1 = block3;
          if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
          int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
          int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
    
    void setup_ungrid_persistent() 
    {
       ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
       for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
           int peer = grid_msgs[imsg]->sourceNode;
           //ungrid_handle[imsg] = CmiCreatePersistent(peer, 0); 
       }
    }
#endif
};
04647 
// Y-direction pencil chare: middle stage of the 3D FFT.  Receives
// transposed data from Z pencils, does the complex FFT along y, forwards
// to the X pencils, and reverses the process on the backward path.
// NOTE(review): only the default constructor resets imsg/imsgb here; the
// migration constructor does not -- confirm migration is disabled
// (setMigratable(false)) so this never matters.
class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
public:
    PmeYPencil_SDAG_CODE
    PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
    PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
    void fft_init();
    void recv_trans(const PmeTransMsg *);
    void forward_fft();
        void forward_subset_fft(int fromIdx, int toIdx);
    void send_trans();
        void send_subset_trans(int fromIdx, int toIdx);
    void recv_untrans(const PmeUntransMsg *);    
    void node_process_trans(PmeTransMsg *);
    void recvNodeAck(PmeAckMsg *);
    void node_process_untrans(PmeUntransMsg *);
    void backward_fft();
        void backward_subset_fft(int fromIdx, int toIdx);
    void send_untrans();
    void send_subset_untrans(int fromIdx, int toIdx);
private:
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif

    int nx, nz;  // extents of this pencil's slab in x and z
#if USE_PERSISTENT
    // Pre-create compressed persistent channels: forward to X pencils
    // (trans_handle) and backward to Z pencils (untrans_handle).
    void setup_persistent() {
      int yBlocks = initdata.yBlocks;
      int block2 = initdata.grid.block2;
      int K2 = initdata.grid.K2;
      int hd = 1;
      CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
      trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
          int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
          int compress_size = sizeof(float)*hd*nx*ny1*nz*2; 
          trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }

      CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
      for ( int isend=0; isend<yBlocks; ++isend ) {
          int jb = send_order[isend];
          int ny1 = block2;
          if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
          int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
          int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx*ny1*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,  compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif
};
04711 
// X-direction pencil chare: final forward stage.  Receives data from the
// Y pencils, does the complex FFT along x, evaluates reciprocal space
// (pme_kspace via myKSpace), then runs the backward FFT and returns data.
class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
public:
    PmeXPencil_SDAG_CODE
    PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
    PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
        ~PmeXPencil() {
        #ifdef NAMD_FFTW
        #ifdef NAMD_FFTW_3
                // per-CkLoop-chunk plan arrays allocated in fft_init
                delete [] forward_plans;
                delete [] backward_plans;
        #endif
        #endif
        }
    void fft_init();
    void recv_trans(const PmeTransMsg *);
    void forward_fft();
    void pme_kspace();
    void backward_fft();
    void send_untrans();
        void send_subset_untrans(int fromIdx, int toIdx);
    void node_process_trans(PmeTransMsg *);
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan forward_plan, backward_plan;

        int numPlans;
        fftwf_plan *forward_plans, *backward_plans;
#else
    fftw_plan forward_plan, backward_plan;
#endif
#endif
    int ny, nz;  // extents of this pencil's slab in y and z
    int recipEvirPe;  // PE that accumulates reciprocal energy/virial (-999 until evir_init)
    void evir_init();
    PmeKSpace *myKSpace;
#if USE_PERSISTENT
    // Pre-create compressed persistent channels back to the Y pencils.
    void  setup_persistent() {
      int xBlocks = initdata.xBlocks;
      int block1 = initdata.grid.block1;
      int K1 = initdata.grid.K1;
      CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
      untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
      for ( int isend=0; isend<xBlocks; ++isend ) {
          int ib = send_order[isend];
          int nx1 = block1;
          if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
          int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
          int size = sizeof(PmeUntransMsg) +
              sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24; 
          int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope); 
          int compress_size = sizeof(float)*nx1*ny*nz*2;
          untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
      }
    }
#endif

};
04769 
// Choose the PE that accumulates reciprocal-space energy/virial and
// register this pencil as a client with the ComputePmeMgr there.
void PmeXPencil::evir_init() {
  recipEvirPe = findRecipEvirPe();
  initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
}
04774 
// One-time FFT setup for a Z pencil: compute this pencil's slab extents,
// allocate the grid buffer, and create forward (r2c) and backward (c2r)
// FFTW plans for the z dimension.  With CkLoop enabled, additional
// per-chunk plans are created so the transform can run on several PEs of
// the node in parallel.
void PmeZPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // register with the node-level manager so node_process_grid can find us
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block2 = initdata.grid.block2;

  // slab extents in x and y; the last block may be smaller than block1/2
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;

#ifdef NAMD_FFTW
  // FFTW plan creation is not thread-safe; serialize it across the node
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
  work = new float[dim3];

  order_init(initdata.zBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the how many */

  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  int sizeLines=nx*ny;
  int planLineSizes[1];
  planLineSizes[0]=K3;
  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
  int ndimHalf=ndim/2;
  // batched in-place plans over all nx*ny lines of length K3
  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
                                         (float *) data, NULL, 1, 
                                         ndim,
                                         (fftwf_complex *) data, NULL, 1,
                                         ndimHalf,
                                         fftwFlags);

  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
                                          (fftwf_complex *) data, NULL, 1, 
                                          ndimHalf,
                                          (float *) data, NULL, 1, 
                                          ndim,
                                          fftwFlags);
#if     CMK_SMP && USE_CKLOOP
  if(simParams->useCkLoop) {
          //How many FFT plans to be created? The grain-size issue!!.
          //Currently, I am choosing the min(nx, ny) to be coarse-grain
          numPlans = (nx<=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
          if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
          // every candidate above divides sizeLines = nx*ny evenly
          int howmany = sizeLines/numPlans;
          forward_plans = new fftwf_plan[numPlans];
          backward_plans = new fftwf_plan[numPlans];
          for(int i=0; i<numPlans; i++) {
                  int dimStride = i*ndim*howmany;
                  int dimHalfStride = i*ndimHalf*howmany;
                  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
                                                                                                         ((float *)data)+dimStride, NULL, 1,
                                                                                                         ndim,
                                                                                                         ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                                                                         ndimHalf,
                                                                                                         fftwFlags);

                  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
                                                                                                         ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
                                                                                                         ndimHalf,
                                                                                                         ((float *)data)+dimStride, NULL, 1,
                                                                                                         ndim,
                                                                                                         fftwFlags);
          }
  }else 
#endif 
  {
          forward_plans = NULL;
          backward_plans = NULL;
  }
#else
  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
    // node-par receivers deposit directly into data, so it must start zeroed
    evir = 0.;
    memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
}
04877 
// One-time FFT setup for a Y pencil: compute this pencil's slab extents,
// allocate the (complex) buffer, and create forward/backward complex FFT
// plans along the K2 (y) dimension.
void PmeYPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // register with the node-level manager so node_process_trans/untrans
  // can route messages to this object
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim2 = initdata.grid.dim2;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block3 = initdata.grid.block3;

  // slab extents in x and z; the last block may be smaller than block1/3
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

#ifdef NAMD_FFTW
  // FFTW plan creation is not thread-safe; serialize it across the node
  CmiLock(ComputePmeMgr::fftw_plan_lock);

  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
  work = new float[2*K2];

  order_init(initdata.yBlocks);

#ifdef NAMD_FFTW_3
  /* need array of sizes for the dimensions */
  /* ideally this should be implementable as a single multidimensional
   *  plan, but that has proven tricky to implement, so we maintain the
   *  loop of 1d plan executions. */
  int sizeLines=nz;
  int planLineSizes[1];
  planLineSizes[0]=K2;
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_FORWARD, 
                                     fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines, 
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     (fftwf_complex *) data, NULL, sizeLines, 1,
                                     FFTW_BACKWARD, 
                                      fftwFlags);
  CkAssert(forward_plan != NULL);
  CkAssert(backward_plan != NULL);
#else
  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  evir = 0;
  CmiMemoryWriteFence();  // publish initialized state to other PEs on node
#endif
}
04948 
// Node-level handler for one incoming transpose message on the forward
// path.  Any PE of this node may deliver messages, so the received-message
// counter imsg is advanced with an atomic fetch-and-increment; whichever
// delivery completes the set of yBlocks messages runs the FFT (if any
// sender had data) and forwards the result to the X pencils.
void PmeYPencil::node_process_trans(PmeTransMsg *msg)
{
  if ( msg->hasData ) hasData = 1;
  needs_reply[msg->sourceNode] = msg->hasData;
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
      // this was the last message of the set
      if ( hasData ) {
        forward_fft();
      }
      send_trans();
      imsg=0;
      CmiMemoryWriteFence();  // publish the counter reset to other PEs
    }
}
04966 
04967 void PmeYPencil::recvNodeAck(PmeAckMsg *msg) {
04968   delete msg;
04969   node_process_untrans(0);
04970 }
04971 
// Node-level handler for one step of the backward (untranspose) path.
// msg is null when delivered via recvNodeAck (the peer had no data); the
// NAMD_bug checks enforce consistency between msg presence and hasData.
// The back-path counter imsgb is advanced atomically since any PE of the
// node may deliver; the delivery completing the set runs the backward FFT
// and sends data on toward the Z pencils.
void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
{
  if ( msg ) {
    if ( ! hasData ) NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
    recv_untrans(msg);
  } else if ( hasData ) NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  if(limsg+1 == initdata.yBlocks)
    {
     if ( hasData ) {
      backward_fft();
     }
      hasData=0;
      imsgb=0;
      CmiMemoryWriteFence();  // publish reset state before forwarding
      send_untrans();
    }
}
04991 
04992 #define DEBUG_NODE_PAR_RECV 0
04993 
04994 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
04995   //  CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
04996   PmeXPencil *target=xPencilObj.get(msg->destElem);
04997 #if DEBUG_NODE_PAR_RECV
04998   if(target == NULL)
04999     CkAbort("xpencil in recvXTrans not found, debug registeration");
05000 #endif  
05001     target->node_process_trans(msg);
05002   delete msg;
05003 }
05004 
05005 
05006 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
05007   //  CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05008   PmeYPencil *target=yPencilObj.get(msg->destElem);
05009 #if DEBUG_NODE_PAR_RECV
05010   if(target == NULL)
05011     CkAbort("ypencil in recvYTrans not found, debug registeration");
05012 #endif  
05013     target->node_process_trans(msg);
05014   delete msg;
05015  }
05016 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
05017   //  CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05018   PmeYPencil *target=yPencilObj.get(msg->destElem);
05019 #if DEBUG_NODE_PAR_RECV  
05020   if(target == NULL)
05021     CkAbort("ypencil in recvYUntrans not found, debug registeration");
05022 #endif  
05023     target->node_process_untrans(msg);
05024   delete msg;
05025  }
05026 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
05027   //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05028   PmeZPencil *target=zPencilObj.get(msg->destElem);
05029 #if DEBUG_NODE_PAR_RECV
05030   if(target == NULL)
05031     CkAbort("zpencil in recvZUntrans not found, debug registeration");
05032 #endif
05033   target->node_process_untrans(msg);
05034   delete msg;
05035 }
05036 
05037 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
05038   //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
05039   PmeZPencil *target=zPencilObj.get(msg->destElem);
05040 #if DEBUG_NODE_PAR_RECV
05041   if(target == NULL){
05042     CkAbort("zpencil in recvZGrid not found, debug registeration");
05043   }
05044 #endif
05045   target->node_process_grid(msg); //msg is stored inside node_proces_grid
05046 }
05047 
05048 void PmeXPencil::fft_init() {
05049   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
05050   Node *node = nd.ckLocalBranch();
05051   SimParameters *simParams = node->simParameters;
05052 #if USE_NODE_PAR_RECEIVE
05053   ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
05054 #endif
05055 
05056   int K1 = initdata.grid.K1;
05057   int K2 = initdata.grid.K2;
05058   int dim3 = initdata.grid.dim3;
05059   int block2 = initdata.grid.block2;
05060   int block3 = initdata.grid.block3;
05061 
05062   ny = block2;
05063   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
05064   nz = block3;
05065   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
05066 
05067 #ifdef NAMD_FFTW
05068   CmiLock(ComputePmeMgr::fftw_plan_lock);
05069 
05070   data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
05071   work = new float[2*K1];
05072 
05073   order_init(initdata.xBlocks);
05074 
05075 #ifdef NAMD_FFTW_3
05076   /* need array of sizes for the how many */
05077   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
05078   int sizeLines=ny*nz;
05079   int planLineSizes[1];
05080   planLineSizes[0]=K1;
05081   forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
05082                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05083                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05084                                    FFTW_FORWARD,
05085                                      fftwFlags);
05086   backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
05087                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05088                                      (fftwf_complex *) data, NULL, sizeLines, 1,
05089                                           FFTW_BACKWARD,
05090                                       fftwFlags);
05091 
05092 #if     CMK_SMP && USE_CKLOOP
05093   if(simParams->useCkLoop) {
05094           //How many FFT plans to be created? The grain-size issue!!.
05095           //Currently, I am choosing the min(nx, ny) to be coarse-grain
05096           numPlans = (ny<=nz?ny:nz);
05097           // limit attempted parallelism due to false sharing
05098           //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
05099           //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
05100           if ( sizeLines/numPlans < 4 ) numPlans = 1;
05101           int howmany = sizeLines/numPlans;
05102           forward_plans = new fftwf_plan[numPlans];
05103           backward_plans = new fftwf_plan[numPlans];
05104           for(int i=0; i<numPlans; i++) {
05105                   int curStride = i*howmany;              
05106                   forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
05107                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05108                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05109                                                                                                         FFTW_FORWARD,
05110                                                                                                          fftwFlags);
05111 
05112                   backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
05113                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05114                                                                                                          ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
05115                                                                                                           FFTW_BACKWARD,
05116                                                                                                          fftwFlags);
05117           }
05118   }else
05119 #endif
05120   {
05121           forward_plans = NULL;
05122           backward_plans = NULL;
05123   }
05124 #else
05125   forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
05126         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
05127         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
05128         ny*nz, (fftw_complex *) work, 1);
05129   backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
05130         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
05131         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
05132         ny*nz, (fftw_complex *) work, 1);
05133 #endif
05134   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05135 #else
05136   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
05137 #endif
05138 
05139   myKSpace = new PmeKSpace(initdata.grid,
05140                 thisIndex.y*block2, thisIndex.y*block2 + ny,
05141                 thisIndex.z*block3, thisIndex.z*block3 + nz);
05142 
05143 }
05144 
05145 // #define FFTCHECK   // run a grid of integers through the fft
05146 // #define ZEROCHECK  // check for suspicious zeros in fft
05147 
// Receive one source's portion of the charge grid and accumulate it into
// this Z pencil's local data array.  imsg==0 marks the first message of the
// current sequence (the counter itself is managed by the caller).
void PmeZPencil::recv_grid(const PmeGridMsg *msg) {

  int dim3 = initdata.grid.dim3;
  if ( imsg == 0 ) {
    // First message: latch lattice and sequence number for this step and
    // clear the accumulation buffer.  (With USE_NODE_PAR_RECEIVE the buffer
    // is presumably zeroed elsewhere -- TODO confirm against that path.)
    lattice = msg->lattice;
    sequence = msg->sequence;
#if ! USE_NODE_PAR_RECEIVE
    memset(data, 0, sizeof(float)*nx*ny*dim3);
#endif
  }

  // Dataless messages exist only so the receiver's message count completes;
  // nothing to accumulate.
  if ( ! msg->hasData ) return;

  int zlistlen = msg->zlistlen;
#ifdef NAMD_KNL
  // Copy the z-index list into an aligned local buffer so the scatter loop
  // below can be vectorized on KNL.
  int * __restrict msg_zlist = msg->zlist;
  int * __restrict zlist = work_zlist.begin();
  __assume_aligned(zlist,64);
  for ( int k=0; k<zlistlen; ++k ) {
    zlist[k] = msg_zlist[k];
  }
#else
  int * __restrict zlist = msg->zlist;
#endif
  char * __restrict fmsg = msg->fgrid;
  float * __restrict qmsg = msg->qgrid;
  float * __restrict d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  // For each (i,j) column, a nonzero flag byte in fgrid means qgrid holds
  // zlistlen packed values for that column; scatter-add them at the z
  // offsets listed in zlist.
  for ( int g=0; g<numGrids; ++g ) {
    for ( int i=0; i<nx; ++i ) {
     for ( int j=0; j<ny; ++j, d += dim3 ) {
      if( *(fmsg++) ) {
        #pragma ivdep
        for ( int k=0; k<zlistlen; ++k ) {
          d[zlist[k]] += *(qmsg++);
        }
      }
     }
    }
  }
}
05189 
// CkLoop task body shared by the X and Z pencils: execute the pre-built
// FFTW plans whose indices fall in [first, last].  param is the plan array;
// result/paraNum are unused (required by the CkLoop callback signature).
static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    fftwf_plan *planList = (fftwf_plan *)param;
    for ( int idx = first; idx <= last; ++idx ) {
        fftwf_execute(planList[idx]);
    }
#endif
#endif        
}
05198 
05199 void PmeZPencil::forward_fft() {
05200   evir = 0.;
05201 #ifdef FFTCHECK
05202   int dim3 = initdata.grid.dim3;
05203   int K3 = initdata.grid.K3;
05204   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05205   float *d = data;
05206   for ( int i=0; i<nx; ++i ) {
05207    for ( int j=0; j<ny; ++j, d += dim3 ) {
05208     for ( int k=0; k<dim3; ++k ) {
05209       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
05210     }
05211    }
05212   }
05213 #endif
05214 #ifdef NAMD_FFTW
05215 #ifdef MANUAL_DEBUG_FFTW3
05216   dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05217 #endif
05218 #ifdef NAMD_FFTW_3
05219 #if     CMK_SMP && USE_CKLOOP
05220   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05221   if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
05222      && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
05223           //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
05224           //transform the above loop
05225           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
05226           return;
05227   }
05228 #endif
05229   fftwf_execute(forward_plan);
05230 #else
05231   rfftwnd_real_to_complex(forward_plan, nx*ny,
05232         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
05233 #endif
05234 #ifdef MANUAL_DEBUG_FFTW3
05235   dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05236 #endif
05237 
05238 #endif
05239 #ifdef ZEROCHECK
05240   int dim3 = initdata.grid.dim3;
05241   int K3 = initdata.grid.K3;
05242   float *d = data;
05243   for ( int i=0; i<nx; ++i ) {
05244    for ( int j=0; j<ny; ++j, d += dim3 ) {
05245     for ( int k=0; k<dim3; ++k ) {
05246       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
05247         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
05248     }
05249    }
05250   }
05251 #endif
05252 }
05253 
05254 /* A single task for partitioned PmeZPencil::send_trans work */
05255 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05256         PmeZPencil *zpencil = (PmeZPencil *)param;
05257         zpencil->send_subset_trans(first, last);        
05258 }
05259 
// Transpose-send helper: for z-block indices send_order[fromIdx..toIdx],
// pack this Z pencil's complex slab and deliver it to the corresponding
// Y pencil.  Used both serially (send_trans) and as the CkLoop task body
// (PmeZPencilSendTrans).
void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
        int zBlocks = initdata.zBlocks;
        int block3 = initdata.grid.block3;
        int dim3 = initdata.grid.dim3;
        for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
          int kb = send_order[isend];
          int nz = block3;
          // Trailing block may be short: dim3/2 complex values along z in total.
          if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
          int hd = ( hasData ? 1 : 0 );  // payload omitted for dataless sends
          PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
          msg->lattice = lattice;
          msg->sourceNode = thisIndex.y;  // receiver uses this as its y-block index
          msg->hasData = hasData;
          msg->nx = ny;  // NOTE: field is named nx but carries this pencil's ny extent
         if ( hasData ) {
          // Pack interleaved real/imag pairs for the z indices of this block.
          float *md = msg->qgrid;
          const float *d = data;
          for ( int i=0; i<nx; ++i ) {
           for ( int j=0; j<ny; ++j, d += dim3 ) {
                for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
                  *(md++) = d[2*k];
                  *(md++) = d[2*k+1];
                }
           }
          }
         }
          msg->sequence = sequence;
          SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      // Node-level delivery: route through the node group of the target element.
      msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
#if Y_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if Y_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif    
#else
#if Y_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if Y_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif    
#endif
    CmiEnableUrgentSend(0);
    }
}
05311 
05312 void PmeZPencil::send_trans() {
05313 #if USE_PERSISTENT
05314     if (trans_handle == NULL) setup_persistent();
05315 #endif
05316 #if     CMK_SMP && USE_CKLOOP
05317         int useCkLoop = Node::Object()->simParameters->useCkLoop;
05318         if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
05319            && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
05326                 //send_subset_trans(0, initdata.zBlocks-1);
05327                 CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
05328                 return;
05329         }
05330 #endif
05331   int zBlocks = initdata.zBlocks;
05332   int block3 = initdata.grid.block3;
05333   int dim3 = initdata.grid.dim3;
05334   for ( int isend=0; isend<zBlocks; ++isend ) {
05335     int kb = send_order[isend];
05336     int nz = block3;
05337     if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
05338     int hd = ( hasData ? 1 : 0 );
05339     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
05340     msg->lattice = lattice;
05341     msg->sourceNode = thisIndex.y;
05342     msg->hasData = hasData;
05343     msg->nx = ny;
05344    if ( hasData ) {
05345     float *md = msg->qgrid;
05346     const float *d = data;
05347     for ( int i=0; i<nx; ++i ) {
05348      for ( int j=0; j<ny; ++j, d += dim3 ) {
05349       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
05350         *(md++) = d[2*k];
05351         *(md++) = d[2*k+1];
05352       }
05353      }
05354     }
05355    }
05356     msg->sequence = sequence;
05357     SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
05358 
05359     CmiEnableUrgentSend(1);
05360 #if USE_NODE_PAR_RECEIVE
05361     msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
05362 #if Y_PERSIST 
05363     CmiUsePersistentHandle(&trans_handle[isend], 1);
05364 #endif
05365     initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
05366 #if Y_PERSIST 
05367     CmiUsePersistentHandle(NULL, 0);
05368 #endif    
05369 #else
05370 #if Y_PERSIST 
05371     CmiUsePersistentHandle(&trans_handle[isend], 1);
05372 #endif
05373     initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
05374 #if Y_PERSIST 
05375     CmiUsePersistentHandle(NULL, 0);
05376 #endif    
05377 #endif
05378     CmiEnableUrgentSend(0);
05379   }
05380 }
05381 
// Receive one y-block slab from a Z pencil and copy it into this Y pencil's
// data array (layout: one K2*nz*2-float plane per x index).  A dataless
// message zero-fills the corresponding region instead.
void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {
    // First message of the sequence: latch lattice and sequence number.
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;  // sender's y-block index
  int ny = msg->nx;          // y extent of the incoming slab (field named nx)
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
        thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      // Unpack interleaved real/imag pairs into place.
      d[2*(j*nz+k)] = *(md++);
      d[2*(j*nz+k)+1] = *(md++);
    }
   }
  }
 } else {
  // No data from this source: clear its slab so the FFT sees zeros.
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
   for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*(j*nz+k)] = 0;
      d[2*(j*nz+k)+1] = 0;
    }
   }
  }
 }
}
05418 
05419 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
05420         PmeYPencil *ypencil = (PmeYPencil *)param;
05421         ypencil->forward_subset_fft(first, last);
05422 }
05423 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
05424 #ifdef NAMD_FFTW
05425 #ifdef NAMD_FFTW_3
05426         for(int i=fromIdx; i<=toIdx; i++){
05427                 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
05428                       * nz * initdata.grid.K2,  
05429                       ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05430         }
05431 #endif
05432 #endif
05433 }
05434 
// Forward FFT of this Y pencil: one in-place transform per x plane.  With
// FFTW3 and CkLoop the planes are farmed out to worker threads when enough
// PEs are available; otherwise each plane is transformed serially.
void PmeYPencil::forward_fft() {
    evir = 0.;
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
  
#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
          // Partition the nx planes across CkLoop worker threads.
          CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
          return;
  }
#endif
  //the above is a transformation of the following loop using CkLoop
  for ( int i=0; i<nx; ++i ) {
    // In-place new-array execute of the shared plan on plane i.
    fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i 
                      * nz * initdata.grid.K2,  
                      ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
  // FFTW2 path: nz interleaved transforms per x plane.
  for ( int i=0; i<nx; ++i ) {
    fftw(forward_plan, nz,
        ((fftw_complex *) data) + i * nz * initdata.grid.K2,
        nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05470 
05471 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
05472         PmeYPencil *ypencil = (PmeYPencil *)param;
05473         ypencil->send_subset_trans(first, last);
05474 }
05475 
// Transpose-send helper: for y-block indices send_order[fromIdx..toIdx],
// pack this Y pencil's complex slab and deliver it to the corresponding
// X pencil.  Used both serially (send_trans) and as the CkLoop task body
// (PmeYPencilSendTrans).
void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
        int yBlocks = initdata.yBlocks;
        int block2 = initdata.grid.block2;
        int K2 = initdata.grid.K2;
    for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
          int jb = send_order[isend];
          // Trailing block may be short.
          int ny = block2;
          if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
          int hd = ( hasData ? 1 : 0 );  // payload omitted for dataless sends
          PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
          msg->lattice = lattice;
          msg->sourceNode = thisIndex.x;  // receiver uses this as its x-block index
          msg->hasData = hasData;
          msg->nx = nx;
         if ( hasData ) {
          // Pack interleaved real/imag pairs for the y indices of this block.
          float *md = msg->qgrid;
          const float *d = data;
          for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
           for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
                for ( int k=0; k<nz; ++k ) {
                  *(md++) = d[2*(j*nz+k)];
                  *(md++) = d[2*(j*nz+k)+1];
  #ifdef ZEROCHECK
                  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
          thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
  #endif
                }
           }
          }
          // Sanity check: packed exactly the expected number of floats.
          if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
          thisIndex.x, jb, thisIndex.z);
         }
          msg->sequence = sequence;
          SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
      CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      // Node-level delivery: route through the node group of the target element.
      msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
#if X_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
#if X_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif
#else      
#if X_PERSIST 
      CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
      initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if X_PERSIST 
      CmiUsePersistentHandle(NULL, 0);
#endif
#endif
      CmiEnableUrgentSend(0);
        }
}
05532 
05533 void PmeYPencil::send_trans() {
05534 #if USE_PERSISTENT
05535     if (trans_handle == NULL) setup_persistent();
05536 #endif
05537 #if     CMK_SMP && USE_CKLOOP
05538         int useCkLoop = Node::Object()->simParameters->useCkLoop;
05539         if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
05540            && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
05547                 //send_subset_trans(0, initdata.yBlocks-1);
05548                 CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
05549                 return;
05550         }
05551 #endif
05552   int yBlocks = initdata.yBlocks;
05553   int block2 = initdata.grid.block2;
05554   int K2 = initdata.grid.K2;
05555   for ( int isend=0; isend<yBlocks; ++isend ) {
05556     int jb = send_order[isend];
05557     int ny = block2;
05558     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
05559     int hd = ( hasData ? 1 : 0 );
05560     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
05561     msg->lattice = lattice;
05562     msg->sourceNode = thisIndex.x;
05563     msg->hasData = hasData;
05564     msg->nx = nx;
05565    if ( hasData ) {
05566     float *md = msg->qgrid;
05567     const float *d = data;
05568     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
05569      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
05570       for ( int k=0; k<nz; ++k ) {
05571         *(md++) = d[2*(j*nz+k)];
05572         *(md++) = d[2*(j*nz+k)+1];
05573 #ifdef ZEROCHECK
05574         if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
05575         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
05576 #endif
05577       }
05578      }
05579     }
05580     if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
05581         thisIndex.x, jb, thisIndex.z);
05582    }
05583     msg->sequence = sequence;
05584     SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
05585     CmiEnableUrgentSend(1);
05586 #if USE_NODE_PAR_RECEIVE
05587     msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
05588 #if X_PERSIST 
05589         CmiUsePersistentHandle(&trans_handle[isend], 1);
05590 #endif
05591     initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);   
05592 #if X_PERSIST 
05593         CmiUsePersistentHandle(NULL, 0);
05594 #endif
05595 #else
05596 #if X_PERSIST 
05597         CmiUsePersistentHandle(&trans_handle[isend], 1);
05598 #endif
05599     initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
05600 #if X_PERSIST 
05601         CmiUsePersistentHandle(NULL, 0);
05602 #endif
05603     
05604 #endif
05605     CmiEnableUrgentSend(0);
05606   }
05607 }
05608 
// Node-level entry for trans messages arriving at this X pencil.  The message
// counter is incremented atomically since deliveries may race; whichever
// caller observes the final message runs the forward x FFT, the k-space
// energy/virial evaluation, the backward x FFT, and starts the untranspose.
void PmeXPencil::node_process_trans(PmeTransMsg *msg)
{
  if(msg->hasData) hasData=1;
  needs_reply[msg->sourceNode] = msg->hasData;  // remember which sources expect data back
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if(limsg+1 == initdata.xBlocks)
    {
      if(hasData){
        forward_fft();
        pme_kspace();
        backward_fft();
      }
      // Runs even with no data: send_untrans also acks dataless sources.
      send_untrans();
      imsg=0;
      CmiMemoryWriteFence();  // publish the reset counter before returning
    }
}
05628 
// Receive one x-block slab from a Y pencil and copy it into this X pencil's
// data array (layout: ny*nz complex values per x index).  A dataless message
// zero-fills that slab instead.
void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {
    // First message of the sequence: latch lattice and sequence number.
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block1 = initdata.grid.block1;
  int K1 = initdata.grid.K1;
  int ib = msg->sourceNode;  // sender's x-block index
  int nx = msg->nx;          // x extent of the incoming slab
 if ( msg->hasData ) {
  const float *md = msg->qgrid;
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
      if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
        ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
#endif
      // Unpack interleaved real/imag pairs into place.
      d[2*k] = *(md++);
      d[2*k+1] = *(md++);
    }
   }
  }
 } else {
  // No data from this source: clear its slab so the FFT sees zeros.
  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
   float *d = data + i*ny*nz*2;
   for ( int j=0; j<ny; ++j, d += nz*2 ) {
    for ( int k=0; k<nz; ++k ) {
      d[2*k] = 0;
      d[2*k+1] = 0;
    }
   }
  }
 }
}
05665 
// Forward FFT of this X pencil's data.  The FFTW3 batched plan transforms
// all ny*nz lines at once; with CkLoop the pre-split plans in forward_plans
// cover disjoint subsets of lines and run as parallel tasks.
void PmeXPencil::forward_fft() {
#ifdef NAMD_FFTW

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
          // Run the numPlans pre-split plans as parallel CkLoop tasks
          // instead of executing the single batched forward_plan.
          CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
          return;
  }
#endif
  fftwf_execute(forward_plan);
#else
  // FFTW2 path: ny*nz strided transforms through the scratch buffer.
  fftw(forward_plan, ny*nz,
        ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#endif
}
05695 
// Reciprocal-space stage: evaluate PME energy and virial on this pencil's
// fully transformed data.  evir[0] receives the energy; the virial components
// are written through &evir[1] by PmeKSpace::compute_energy.
void PmeXPencil::pme_kspace() {

  evir = 0.;

#ifdef FFTCHECK
  // FFTCHECK runs integer test data through the FFTs; skip the physics.
  return;
#endif

  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;

  // Enable CkLoop-parallel energy evaluation only when there are spare PEs.
  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    useCkLoop = 1;
  }
#endif

  int numGrids = 1;  // pencil PME is single-grid; loop kept for symmetry
  for ( int g=0; g<numGrids; ++g ) {
    evir[0] = myKSpace->compute_energy(data+0*g,
                lattice, ewaldcof, &(evir[1]), useCkLoop);
  }
  
#if USE_NODE_PAR_RECEIVE
    CmiMemoryWriteFence();  // make evir visible to other threads on this node
#endif
}
05724 
// Backward FFT of this X pencil's data after the k-space stage; mirror of
// forward_fft using the backward plan(s).
void PmeXPencil::backward_fft() {
#ifdef NAMD_FFTW
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif

#ifdef NAMD_FFTW_3
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
          // Run the numPlans pre-split plans as parallel CkLoop tasks
          // instead of executing the single batched backward_plan.
          CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
          return;
  }
#endif
  fftwf_execute(backward_plan);
#else
  // FFTW2 path: ny*nz strided transforms through the scratch buffer.
  fftw(backward_plan, ny*nz,
        ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#endif
}
05752 
05753 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
05754         PmeXPencil *xpencil = (PmeXPencil *)param;
05755         xpencil->send_subset_untrans(first, last);
05756 }
05757 
// Untranspose helper: for x-block indices send_order[fromIdx..toIdx], send
// each Y pencil either its x slab of transformed data or, if its original
// message carried no data (needs_reply[ib] false), just an acknowledgment so
// its message count still completes.
void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx){
        int xBlocks = initdata.xBlocks;
        int block1 = initdata.grid.block1;      
        int K1 = initdata.grid.K1;

        for(int isend=fromIdx; isend<=toIdx; isend++) {
                int ib = send_order[isend];
                if ( ! needs_reply[ib] ) {
                        // Dataless source: ack only, no payload.
                        PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
                        CmiEnableUrgentSend(1);
                        SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
#if USE_NODE_PAR_RECEIVE
                        initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
#else
                        initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
#endif
                        CmiEnableUrgentSend(0);
                        continue;
                }
                // Trailing block may be short.
                int nx = block1;
                if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
                PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
                msg->sourceNode = thisIndex.y;  // receiver uses this as its y-block index
                msg->ny = ny;
                // Pack interleaved real/imag pairs for the x indices of this block.
                float *md = msg->qgrid;
                for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
                        float *d = data + i*ny*nz*2;
                        for ( int j=0; j<ny; ++j, d += nz*2 ) {
                                for ( int k=0; k<nz; ++k ) {
                                        *(md++) = d[2*k];
                                        *(md++) = d[2*k+1];
                                }
                        }
                }
                SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
        CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
        // Node-level delivery: route through the node group of the target element.
        msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
#if Y_PERSIST 
        CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#if Y_PERSIST 
        CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if Y_PERSIST 
  //      CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
        initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#if Y_PERSIST 
   //     CmiUsePersistentHandle(NULL, 0);
#endif
#endif
        CmiEnableUrgentSend(0);
        }
}
05815 
// After the k-space stage: report energy/virial to the reduction PE, then
// untranspose this X pencil's data back to the Y pencils (or ack dataless
// sources).  With CkLoop enabled the per-destination sends run as tasks;
// otherwise the serial loop below does the same work.
// NOTE(review): the serial loop duplicates send_subset_untrans except that
// the Y_PERSIST CmiUsePersistentHandle calls are active here but commented
// out there -- confirm which variant is intended before unifying them.
void PmeXPencil::send_untrans() {

  { // send energy and virial
    int numGrids = 1;
    PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
    newmsg->evir[0] = evir;
    SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
    CmiEnableUrgentSend(0);
  }

#if USE_PERSISTENT
  if (untrans_handle == NULL) setup_persistent();
#endif
#if     CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
     && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
                int xBlocks = initdata.xBlocks;

#if USE_NODE_PAR_RECEIVE
                //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
                CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
#else
        //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
                CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
#endif        
                return;
  }
#endif
  int xBlocks = initdata.xBlocks;
  int block1 = initdata.grid.block1;
  int K1 = initdata.grid.K1;
  for ( int isend=0; isend<xBlocks; ++isend ) {
    int ib = send_order[isend];
    if ( ! needs_reply[ib] ) {
      // Dataless source: ack only, no payload.
      PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
      CmiEnableUrgentSend(1);
      SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
#if USE_NODE_PAR_RECEIVE
      initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
#else
      initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
#endif
      CmiEnableUrgentSend(0);
      continue;
    }
    // Trailing block may be short.
    int nx = block1;
    if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
    PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
    msg->sourceNode = thisIndex.y;  // receiver uses this as its y-block index
    msg->ny = ny;
    // Pack interleaved real/imag pairs for the x indices of this block.
    float *md = msg->qgrid;
    for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
     float *d = data + i*ny*nz*2;
     for ( int j=0; j<ny; ++j, d += nz*2 ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*k];
        *(md++) = d[2*k+1];
      }
     }
    }
    SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)

    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    // Node-level delivery: route through the node group of the target element.
    msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
#if Y_PERSIST 
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if Y_PERSIST 
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#if Y_PERSIST 
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
05903 
// Receive an untransposed y-block slab back from an X pencil and store it
// into this Y pencil's data array (one K2*nz*2-float plane per x index).
void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;  // sender's y-block index
  int ny = msg->ny;          // y extent of the incoming slab
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();  // keep the network serviced during the long copy
#endif   
    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
        if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
                                    thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
        // Unpack interleaved real/imag pairs into place.
        d[2*(j*nz+k)] = *(md++);
        d[2*(j*nz+k)+1] = *(md++);
      }
    }
  }
}
05927 
05928 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
05929         PmeYPencil *ypencil = (PmeYPencil *)param;
05930         ypencil->backward_subset_fft(first, last);
05931 }
05932 
05933 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
05934 #ifdef NAMD_FFTW
05935 #ifdef NAMD_FFTW_3
05936         for(int i=fromIdx; i<=toIdx; i++){
05937                 fftwf_execute_dft(backward_plan,        
05938                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2,         
05939                                                   ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05940         }
05941 #endif
05942 #endif
05943 }
05944 
// Backward FFT along the y dimension: transforms each of the nx local
// x-planes of 'data' in place using the precomputed backward_plan.
05945 void PmeYPencil::backward_fft() {
05946 #ifdef NAMD_FFTW
05947 #ifdef MANUAL_DEBUG_FFTW3
05948   dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
05949 #endif
05950 
05951 #ifdef NAMD_FFTW_3
05952 #if     CMK_SMP && USE_CKLOOP
// If CkLoop is enabled and there are enough PEs relative to the pencil
// decomposition, split the nx per-plane transforms across the node
// (PmeYPencilBackwardFFT -> backward_subset_fft) and return.
05953   int useCkLoop = Node::Object()->simParameters->useCkLoop;
05954   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
05955      && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
05956           CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
05957           return;
05958   }
05959 #endif
05960   //the above is a transformation of the following loop using CkLoop
// Serial FFTW3 path: one in-place complex transform per x-plane.
05961   for ( int i=0; i<nx; ++i ) {
05962 #if CMK_BLUEGENEL
05963         CmiNetworkProgress();
05964 #endif
05965     fftwf_execute_dft(backward_plan,    
05966                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
05967                                           ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
05968   }
05969 #else
// FFTW2 fallback: strided multi-transform per plane, using the scratch
// 'work' buffer.
05970         for ( int i=0; i<nx; ++i ) {
05971 #if CMK_BLUEGENEL
05972           CmiNetworkProgress();
05973 #endif
05974                 fftw(backward_plan, nz,
05975                 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
05976                 nz, 1, (fftw_complex *) work, 1, 0);
05977         }
05978 #endif
05979 
05980 #ifdef MANUAL_DEBUG_FFTW3
05981   dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
05982 #endif
05983 
05984 #endif
05985 }
05986 
05987 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
05988         PmeYPencil *ypencil = (PmeYPencil *)param;
05989         ypencil->send_subset_untrans(first, last);
05990 }
05991 
// Send this Y pencil's untransposed data on to the Z pencils for
// send-order slots fromIdx..toIdx (inclusive).  Destinations that do
// not need a reply get only a prioritized ack.
05992 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx){
05993         int yBlocks = initdata.yBlocks;
05994         int block2 = initdata.grid.block2;      
05995         int K2 = initdata.grid.K2;
05996 
05997         for(int isend=fromIdx; isend<=toIdx; isend++) {
05998                 int jb = send_order[isend];
// Destination holds no data for this step: send a bare ack instead of
// a payload message, then move on to the next slot.
05999                 if ( ! needs_reply[jb] ) {
06000                         PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
06001                         CmiEnableUrgentSend(1);
06002                         SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06003 #if USE_NODE_PAR_RECEIVE
06004                         initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(msg);
06005 #else
06006                         initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
06007 #endif
06008                         CmiEnableUrgentSend(0);
06009                         continue;
06010                 }
// Slab width for destination jb; the last block along K2 may be short.
06011                 int ny = block2;
06012                 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06013                 PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
06014                 msg->sourceNode = thisIndex.z;
06015                 msg->ny = nz;
// Pack the (i, j in jb's slab, k) complex values contiguously into the
// message's qgrid buffer.
06016                 float *md = msg->qgrid;
06017                 const float *d = data;
06018                 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06019                         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06020                                 for ( int k=0; k<nz; ++k ) {
06021                                         *(md++) = d[2*(j*nz+k)];
06022                                         *(md++) = d[2*(j*nz+k)+1];
06023                                 }
06024                         }
06025                 }
06026                 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06027             CmiEnableUrgentSend(1);
// Node-parallel receive goes through the node group proxy; otherwise
// send straight to the destination pencil element.  Z_PERSIST brackets
// the send with a persistent-communication handle when enabled.
06028 #if USE_NODE_PAR_RECEIVE
06029         msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06030         //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06031 #if Z_PERSIST 
06032         CmiUsePersistentHandle(&untrans_handle[isend], 1);
06033 #endif
06034         initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06035 #if Z_PERSIST 
06036         CmiUsePersistentHandle(NULL, 0);
06037 #endif
06038 #else
06039 #if Z_PERSIST 
06040         CmiUsePersistentHandle(&untrans_handle[isend], 1);
06041 #endif
06042         initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06043 #if Z_PERSIST 
06044         CmiUsePersistentHandle(NULL, 0);
06045 #endif
06046 #endif
06047     CmiEnableUrgentSend(0);
06048         }
06049 }
06050 
// Send all untransposed slabs from this Y pencil to the Z pencils.
// Either parallelizes across send slots via CkLoop (delegating to
// send_subset_untrans) or runs the equivalent serial loop below.
06051 void PmeYPencil::send_untrans() {
06052 #if USE_PERSISTENT
// Lazily create persistent-communication handles on first use.
06053   if (untrans_handle == NULL) setup_persistent();
06054 #endif
06055 #if     CMK_SMP && USE_CKLOOP
06056   int useCkLoop = Node::Object()->simParameters->useCkLoop;
06057   if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
06058      && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
06059           int yBlocks = initdata.yBlocks;
06060 
06061 #if USE_NODE_PAR_RECEIVE      
06062           //CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
06063           CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, yBlocks, 0, yBlocks-1, 1);
06064 #else
06065       //CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
06066           CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
06067 #endif
06068           return;
06069   }
06070 #endif
// Serial path: same per-destination logic as send_subset_untrans over
// the full send_order list.
06071   int yBlocks = initdata.yBlocks;
06072   int block2 = initdata.grid.block2;
06073   int K2 = initdata.grid.K2;
06074   for ( int isend=0; isend<yBlocks; ++isend ) {
06075     int jb = send_order[isend];
// Destination has no data pending: send only a prioritized ack.
06076     if ( ! needs_reply[jb] ) {
06077       PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
06078       CmiEnableUrgentSend(1);
06079       SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06080 #if USE_NODE_PAR_RECEIVE
06081       initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(msg);
06082 #else
06083       initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
06084 #endif
06085       CmiEnableUrgentSend(0);
06086       continue;
06087     }
// Slab width for destination jb; the last block along K2 may be short.
06088     int ny = block2;
06089     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
06090     PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
06091     msg->sourceNode = thisIndex.z;
06092     msg->ny = nz;
// Pack destination jb's slab contiguously into the message buffer.
06093     float *md = msg->qgrid;
06094     const float *d = data;
06095     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
06096      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
06097       for ( int k=0; k<nz; ++k ) {
06098         *(md++) = d[2*(j*nz+k)];
06099         *(md++) = d[2*(j*nz+k)+1];
06100       }
06101      }
06102     }
06103     SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
06104 
06105     CmiEnableUrgentSend(1);
// Node-parallel receive goes via the node group proxy; otherwise send
// directly to the pencil element.  Z_PERSIST brackets the send with a
// persistent-communication handle when enabled.
06106 #if USE_NODE_PAR_RECEIVE
06107     msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
06108     //    CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
06109 #if Z_PERSIST 
06110     CmiUsePersistentHandle(&untrans_handle[isend], 1);
06111 #endif
06112     initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
06113 #if Z_PERSIST
06114     CmiUsePersistentHandle(NULL, 0);
06115 #endif
06116 #else
06117 #if Z_PERSIST 
06118     CmiUsePersistentHandle(&untrans_handle[isend], 1);
06119 #endif
06120     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
06121 #if Z_PERSIST 
06122     CmiUsePersistentHandle(NULL, 0);
06123 #endif
06124 #endif    
06125     CmiEnableUrgentSend(0);
06126   }
06127   
06128 #if USE_NODE_PAR_RECEIVE
// Reset the energy/virial accumulator for the next step; the write
// fence publishes it before other threads on the node read it.
06129   evir = 0.;
06130   CmiMemoryWriteFence();
06131 #endif
06132 }
06133 
06134 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
06135 #if ! USE_NODE_PAR_RECEIVE
06136     if(imsg==0) evir=0.;
06137 #endif
06138 
06139   int block3 = initdata.grid.block3;
06140   int dim3 = initdata.grid.dim3;
06141   int kb = msg->sourceNode;
06142   int nz = msg->ny;
06143   const float *md = msg->qgrid;
06144   float *d = data;
06145   for ( int i=0; i<nx; ++i ) {
06146 #if CMK_BLUEGENEL
06147     CmiNetworkProgress();
06148 #endif   
06149     for ( int j=0; j<ny; ++j, d += dim3 ) {
06150       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
06151 #ifdef ZEROCHECK
06152         if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
06153                                     thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
06154 #endif
06155         d[2*k] = *(md++);
06156         d[2*k+1] = *(md++);
06157       }
06158     }
06159   }
06160 }
06161 
// Backward FFT along z: complex-to-real transform of all nx*ny local
// columns, completing the inverse 3D FFT for this pencil's data.
06162 void PmeZPencil::backward_fft() {
06163 #ifdef NAMD_FFTW
06164 #ifdef MANUAL_DEBUG_FFTW3
06165   dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
06166 #endif
06167 #ifdef NAMD_FFTW_3
06168 #if     CMK_SMP && USE_CKLOOP
06169   int useCkLoop = Node::Object()->simParameters->useCkLoop;
06170   if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
06171      && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
06172           //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
06173           //transform the above loop
// Execute the numPlans pre-split backward plans in parallel via CkLoop.
06174           CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
06175           return;
06176   }
06177 #endif
06178   fftwf_execute(backward_plan);
06179 #else
// FFTW2 fallback: batched complex-to-real transform over all columns.
06180   rfftwnd_complex_to_real(backward_plan, nx*ny,
06181             (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
06182 #endif
06183 #ifdef MANUAL_DEBUG_FFTW3
06184   dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
06185 #endif
06186 
06187 #endif
06188   
06189 #if CMK_BLUEGENEL
06190   CmiNetworkProgress();
06191 #endif
06192 
06193 #ifdef FFTCHECK
// Debug check: compare the round-tripped grid (scaled by 1/(K1*K2*K3))
// against the synthetic pattern seeded at forward-transform time and
// report the largest deviation and its location.
06194   int dim3 = initdata.grid.dim3;
06195   int K1 = initdata.grid.K1;
06196   int K2 = initdata.grid.K2;
06197   int K3 = initdata.grid.K3;
06198   float scale = 1. / (1. * K1 * K2 * K3);
06199   float maxerr = 0.;
06200   float maxstd = 0.;
06201   int mi, mj, mk;  mi = mj = mk = -1;
06202   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
06203   const float *d = data;
06204   for ( int i=0; i<nx; ++i ) {
06205    for ( int j=0; j<ny; ++j, d += dim3 ) {
06206     for ( int k=0; k<K3; ++k ) {
06207       float std = 10. * (10. * (10. * std_base + i) + j) + k;
06208       float err = scale * d[k] - std;
06209       if ( fabsf(err) > fabsf(maxerr) ) {
06210         maxerr = err;
06211         maxstd = std;
06212         mi = i;  mj = j;  mk = k;
06213       }
06214     }
06215    }
06216   }
06217   CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
06218                 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
06219 #endif
06220 
06221 }
06222 
06223 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
06224         //to take advantage of the interface which allows 3 user params at most.
06225         //under such situtation, no new parameter list needs to be created!! -Chao Mei
06226         PmeZPencil *zpencil = (PmeZPencil *)param;
06227         zpencil->send_subset_ungrid(first, last);
06228 }
06229 
// Return grid data to all originating compute PEs: either parallelize
// across the stored grid_msgs via CkLoop, or send them all serially
// through send_subset_ungrid.
06230 void PmeZPencil::send_all_ungrid() {
06231 
06232 #if     CMK_SMP && USE_CKLOOP
06233         int useCkLoop = Node::Object()->simParameters->useCkLoop;
06234         if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
06235            && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
06236                 //????What's the best value for numChunks?????
// Node-parallel receive requires a synchronizing parallel loop; the
// plain path can fire-and-forget.
06237 #if USE_NODE_PAR_RECEIVE        
06238                 //CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1); //has to sync
06239                 CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
06240 #else
06241         //CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 0); //not sync
06242                 CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0); //not sync
06243 #endif        
06244                 return;
06245         }
06246 #endif
// Serial fallback: send every stored grid message.
06247         send_subset_ungrid(0, grid_msgs.size()-1);
06248 }
06249 
06250 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx){
06251         for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
06252                 PmeGridMsg *msg = grid_msgs[imsg];
06253                 send_ungrid(msg);
06254         }
06255 }
06256 
// Fill one grid-return message with this pencil's transformed data and
// send it back to the PE recorded in msg->sourceNode.  Messages whose
// sender contributed no data get only a prioritized ack.
06257 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
06258 
06259 #ifdef NAMD_CUDA
// Offloaded runs use a distinct priority so ungrid traffic interleaves
// correctly with GPU work.
06260   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
06261 #else
06262   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
06263 #endif
06264 
06265   int pe = msg->sourceNode;
// Sender contributed no charges: free its message and reply with an ack.
06266   if ( ! msg->hasData ) {
06267     delete msg;
06268     PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
06269     SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
06270     CmiEnableUrgentSend(1);
06271     initdata.pmeProxy[pe].recvAck(ackmsg);
06272     CmiEnableUrgentSend(0);
06273     return;
06274   }
06275   if ( ! hasData ) NAMD_bug("PmeZPencil::send_ungrid msg->hasData but not pencil->hasData");
// Rewrite sourceNode to identify this pencil (flattened x,y index) for
// the receiver.
06276   msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
06277   int dim3 = initdata.grid.dim3;
06278   int zlistlen = msg->zlistlen;
06279   int *zlist = msg->zlist;
06280   char *fmsg = msg->fgrid;
06281   float *qmsg = msg->qgrid;
06282   float *d = data;
06283   int numGrids = 1;  // pencil FFT doesn't support multiple grids
// Copy back only the z entries listed in zlist, and only for (i,j)
// columns flagged in the sender's fgrid occupancy map.
06284   for ( int g=0; g<numGrids; ++g ) {
06285 #if CMK_BLUEGENEL
06286     CmiNetworkProgress();
06287 #endif    
06288     for ( int i=0; i<nx; ++i ) {
06289       for ( int j=0; j<ny; ++j, d += dim3 ) {
06290         if( *(fmsg++) ) {
06291           for ( int k=0; k<zlistlen; ++k ) {
06292             *(qmsg++) = d[zlist[k]];
06293           }
06294         }
06295       }
06296     }
06297   }
06298   SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
06299     CmiEnableUrgentSend(1);
06300 #ifdef NAMD_CUDA
// Offloaded runs deliver via the node group so the GPU-owning node
// handles the message.
06301     if ( offload ) {
06302       initdata.pmeNodeProxy[CkNodeOf(pe)].recvUngrid(msg);
06303     } else
06304 #endif
06305   initdata.pmeProxy[pe].recvUngrid(msg);
06306     CmiEnableUrgentSend(0);
06307 }
06308 
// Node-level entry point for an incoming grid message: deposit it into
// local data, stash it for the later ungrid reply, and once all expected
// messages have arrived run the forward FFT and forward the transpose.
// Under USE_NODE_PAR_RECEIVE the whole body is serialized with
// fftw_plan_lock and bracketed by memory fences because multiple PEs on
// the node may call in concurrently.
06309 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
06310 {
06311 #if USE_NODE_PAR_RECEIVE
06312   CmiLock(ComputePmeMgr::fftw_plan_lock);
06313   CmiMemoryReadFence();
06314 #endif
06315   recv_grid(msg);
// Remember whether any arriving message carried charge data.
06316   if(msg->hasData) hasData=msg->hasData;
06317   int limsg;
// Atomically claim a slot index for this message.
06318   CmiMemoryAtomicFetchAndInc(imsg,limsg);
06319   grid_msgs[limsg] = msg;
06320   //  CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);      
// Last expected message: transform (if any data) and send the
// transpose onward, then reset the message counter for the next step.
06321   if(limsg+1 == grid_msgs.size())
06322     {
06323 
06324       if (hasData)
06325         {
06326           forward_fft();
06327         }
06328       send_trans();
06329       imsg=0;
06330       CmiMemoryWriteFence();
06331       //      CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
06332     }
06333 #if USE_NODE_PAR_RECEIVE
06334   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
06335   CmiMemoryWriteFence();
06336 #endif
06337 }
06338 
// Node-level ack entry point: an ack carries no payload, so free the
// message and advance the untranspose bookkeeping with a null pointer.
06339 void PmeZPencil::recvNodeAck(PmeAckMsg *msg) {
06340   delete msg;
06341   node_process_untrans(0);
06342 }
06343 
// Node-level handler for one untranspose step: a non-null msg carries
// data to scatter into the pencil; a null msg is an ack from a sender
// with nothing to contribute.  When the count of arrivals reaches
// zBlocks, run the backward FFT (if there is data), send the grid data
// back to the computes, and reset all per-step state.  Under
// USE_NODE_PAR_RECEIVE the counter update and completion block are
// protected by fftw_plan_lock and memory fences.
06344 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
06345 {
06346   if ( msg ) {
06347     if ( ! hasData ) NAMD_bug("PmeZPencil::node_process_untrans non-null msg but not hasData");
06348     recv_untrans(msg);
06349   } else if ( hasData ) NAMD_bug("PmeZPencil::node_process_untrans hasData but null msg");
06350 #if USE_NODE_PAR_RECEIVE
06351   CmiMemoryWriteFence();
06352   CmiLock(ComputePmeMgr::fftw_plan_lock);
06353 #endif    
06354   int limsg;
// Atomically count this arrival (data or ack) toward the zBlocks total.
06355   CmiMemoryAtomicFetchAndInc(imsgb,limsg);
06356   if(limsg+1 == initdata.zBlocks)
06357     {
06358 #if USE_NODE_PAR_RECEIVE
06359       CmiMemoryReadFence();
06360 #endif    
06361       if(hasData) {
06362         backward_fft();
06363       }
06364       send_all_ungrid();
// Reset per-step state and zero the grid for the next accumulation.
06365       hasData=0;
06366       imsgb=0;
06367       evir = 0;
06368       memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3); 
06369       CmiMemoryWriteFence();
06370       //      CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
06371     }
06372 #if USE_NODE_PAR_RECEIVE
06373   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
06374 #endif
06375 }
06376 
06377 void ComputePmeUtil::select(void)
06378 {
06379   if ( CkMyRank() ) return;
06380   
06381   SimParameters *simParams = Node::Object()->simParameters;
06382 
06383   alchOn = simParams->alchOn;
06384   alchFepOn = simParams->alchFepOn;
06385   alchThermIntOn = simParams->alchThermIntOn;
06386   alchDecouple = alchOn && simParams->alchDecouple;
06387   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0; 
06388   lesOn = simParams->lesOn;
06389   lesFactor = simParams->lesFactor;
06390   pairOn = simParams->pairInteractionOn;
06391   selfOn = simParams->pairInteractionSelf;
06392 
06393   if ( alchOn ) {
06394     numGrids = 2;
06395     if (alchDecouple) numGrids += 2;
06396     if (alchElecLambdaStart || alchThermIntOn) numGrids++;
06397   } else if ( lesOn ) {
06398     numGrids = lesFactor;
06399   } else if ( pairOn ) {
06400     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
06401     numGrids = (selfOn ? 1 : 3);
06402   } else {
06403     numGrids = 1;
06404   }
06405 
06406 }
06407 
06408 #include "ComputePmeMgr.def.h"
06409 

Generated on Sat Sep 22 01:17:12 2018 for NAMD by  doxygen 1.4.7